From 65b448ea75a69b62967d540f826bffc9bea5bc11 Mon Sep 17 00:00:00 2001 From: Daigo Tanaka Date: Tue, 25 May 2021 11:29:22 -0700 Subject: [PATCH] fix null record issue (#7) (#8) --- .travis.yml | 26 +++++++++++++++++++ HISTORY.md | 4 +++ getschema/impl.py | 63 +++++++++++++++++++++++++++++++-------------- tests/test_types.py | 39 ++++++++++++++++++++++++++++ 4 files changed, 113 insertions(+), 19 deletions(-) create mode 100644 .travis.yml create mode 100644 tests/test_types.py diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..996124a --- /dev/null +++ b/.travis.yml @@ -0,0 +1,26 @@ +sudo: false +dist: trusty + +language: python + +python: + - "3.6" + +before_install: + - export + - ls /opt/python/ + - /opt/python/3.6/bin/python --version + +install: + - /opt/python/3.6/bin/python -m venv ./venv + - source ./venv/bin/activate + - which python + - pip install --upgrade pip + - pip install wheel + - pip install --no-cache -e . + - pip install -r requirements_dev.txt + +script: + - which python + - pytest -s tests + # - tests/install_test.sh diff --git a/HISTORY.md b/HISTORY.md index 5cc2efa..56b38a7 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,5 +1,9 @@ ## History +### 0.2.3 (2021-05-25) + +- fix: Default type to be string instead of null when no non-null values are found (#7) + ### 0.2.2 (2021-05-12) - fix: Fix JSON file recoginition (#4) diff --git a/getschema/impl.py b/getschema/impl.py index f1ffe4f..11a6d31 100644 --- a/getschema/impl.py +++ b/getschema/impl.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -import argparse, csv, datetime, dateutil, os, re, sys +import argparse, csv, datetime, dateutil, logging, os, re, sys from dateutil.tz import tzoffset import jsonpath_ng as jsonpath import simplejson as json @@ -9,6 +9,9 @@ # https://json-schema.org/ COMMAND = "json2schema" +LOGGER = logging.getLogger(__name__) +DEFAULT_TYPE = ["null", "string"] + def _convert_key(old_key, lower=False, replace_special=False, snake_case=False): new_key = 
old_key @@ -35,14 +38,9 @@ def _do_infer_schema(obj, record_level=None, lower=False, if record_level: obj = _get_jsonpath(obj, record_level)[0] - default_type = { - "type": ["null"] - } - if obj is None: - return default_type - - if type(obj) is dict and obj.keys(): + schema["type"] = ["null"] + elif type(obj) is dict and obj.keys(): schema["type"] = ["null", "object"] schema["properties"] = dict() for key in obj.keys(): @@ -54,19 +52,19 @@ def _do_infer_schema(obj, record_level=None, lower=False, schema["properties"][new_key] = ret elif type(obj) is list: + schema["type"] = ["null", "array"] if not obj: - return default_type + schema["items"] = None # TODO: Check more than the first record - ret = _do_infer_schema( - obj[0], lower=lower, replace_special=replace_special, - snake_case=snake_case) - if ret: - schema["type"] = ["null", "array"] + else: + ret = _do_infer_schema( + obj[0], lower=lower, replace_special=replace_special, + snake_case=snake_case) schema["items"] = ret else: try: float(obj) - except: + except ValueError: schema["type"] = ["null", "string"] # TODO: This is a very loose regex for date-time. if (type(obj) is datetime.datetime or @@ -145,6 +143,28 @@ def _infer_from_two(schema1, schema2): return schema +def _replace_null_type(schema, path=""): + new_schema = {} + new_schema.update(schema) + if schema["type"] in ("object", ["object"], ["null", "object"]): + new_schema ["properties"] = {} + for key in schema.get("properties", {}).keys(): + new_path = path + "." + key + new_schema["properties"][key] = _replace_null_type(schema["properties"][key], new_path) + elif schema["type"] == ["null", "array"]: + if not schema.get("items"): + new_schema["items"] = {} + if new_schema["items"].get("type") in (None, "null", ["null"]): + LOGGER.warning( + f"{path} is an array without non-null values." 
+ f" Replacing with the default {DEFAULT_TYPE}") + new_schema["items"]["type"] = DEFAULT_TYPE + elif schema["type"] == ["null"]: + LOGGER.warning(f"{path} contained null values only. Replacing with the default {DEFAULT_TYPE}") + new_schema["type"] = DEFAULT_TYPE + return new_schema + + def _nested_get(input_dict, nested_key): internal_dict_value = input_dict for k in nested_key: @@ -187,7 +207,7 @@ def infer_schema(obj, record_level=None, if type(obj[0]) is not dict: raise ValueError("Input must be a dict object.") schema = None - # Go through the entire list of objects and find the most safe type assumption + # Go through the list of objects and find the most safe type assumption for o in obj: cur_schema = _do_infer_schema( o, record_level, lower, replace_special, snake_case) @@ -196,6 +216,9 @@ def infer_schema(obj, record_level=None, schema = _infer_from_two(schema, cur_schema) schema["type"] = "object" + + schema = _replace_null_type(schema) + return schema @@ -249,7 +272,8 @@ def infer_from_file(filename, fmt="json", skip=0, lower=False, schema = infer_from_yaml_file( filename, skip, lower, replace_special, snake_case) elif fmt == "csv": - schema = infer_from_csv_file(filename, skip, lower, replace_special, snake_case) + schema = infer_from_csv_file( + filename, skip, lower, replace_special, snake_case) else: raise KeyError("Unsupported format : " + fmt) return schema @@ -267,8 +291,9 @@ def fix_type(obj, schema, dict_path=[], on_invalid_property="raise", - force: Keep it as is (string) """ invalid_actions = ["raise", "null", "force"] - if not on_invalid_property in invalid_actions: - raise ValueError("on_invalid_property is not one of %s" % invalid_actions) + if on_invalid_property not in invalid_actions: + raise ValueError( + "on_invalid_property is not one of %s" % invalid_actions) obj_type = _nested_get(schema, dict_path + ["type"]) obj_format = _nested_get(schema, dict_path + ["format"]) diff --git a/tests/test_types.py b/tests/test_types.py new file 
mode 100644 index 0000000..d1ac1a2 --- /dev/null +++ b/tests/test_types.py @@ -0,0 +1,39 @@ +import getschema + + +def test_null_records(): + records = [ + { + "field": "1", + "null_field": None, + "array": [ + ], + "null_array": [ + ], + "nested_field": { + "some_date": "2021-05-25", + "null_subfield": None, + }, + }, + { + "field": "10.0", + "null_field": None, + "array": [ + "1", + "a", + ], + "null_array": [ + ], + "nested_field": { + "some_date": "2021-05-25", + "null_subfield": None, + }, + }, + ] + schema = getschema.infer_schema(records) + assert(schema["properties"]["field"]["type"] == ["null", "number"]) + assert(schema["properties"]["null_field"]["type"] == ["null", "string"]) + assert(schema["properties"]["nested_field"]["properties"]["some_date"]["type"] == ["null", "string"]) + assert(schema["properties"]["nested_field"]["properties"]["some_date"]["format"] == "date-time") + assert(schema["properties"]["nested_field"]["properties"]["null_subfield"]["type"] == ["null", "string"]) +