diff --git a/koza/__init__.py b/koza/__init__.py index 3b5dbfe..5f418d9 100644 --- a/koza/__init__.py +++ b/koza/__init__.py @@ -1,2 +1,2 @@ """Koza, an ETL framework for LinkML data models""" -__version__ = '0.1.13' +__version__ = '0.1.14' diff --git a/koza/io/reader/json_reader.py b/koza/io/reader/json_reader.py index 1f9c681..7f7432b 100644 --- a/koza/io/reader/json_reader.py +++ b/koza/io/reader/json_reader.py @@ -1,12 +1,12 @@ -import json +import json, yaml import logging from typing import IO, Any, Dict, Iterator, List, Union +#from xmlrpc.client import Boolean -import yaml +from koza.io.utils import check_data LOG = logging.getLogger(__name__) - class JSONReader: """ A JSON reader that optionally iterates over a json list @@ -34,6 +34,8 @@ def __init__( self.json_path = json_path self.name = name + + if self.json_path: if is_yaml: self.json_obj = yaml.safe_load(self.io_str) @@ -73,18 +75,18 @@ def __next__(self) -> Dict[str, Any]: self._line_num += 1 + # Check that required properties exist in row if self.required_properties: - if not set(next_obj.keys()) >= set(self.required_properties): - # TODO - have koza runner handle this exception - # based on some configuration? 
similar to - # on_map_error + properties = [] + for prop in self.required_properties: + new_prop = check_data(next_obj, prop) + properties.append(new_prop) + + if False in properties: raise ValueError( f"Required properties defined for {self.name} are missing from {self.io_str.name}\n" f"Missing properties: {set(self.required_properties) - set(next_obj.keys())}\n" f"Row: {next_obj}" ) - # If we want to subset - # next_obj = {key: next_obj[key] for key in next_obj.keys() if key in self.required_properties} - return next_obj diff --git a/koza/io/reader/jsonl_reader.py b/koza/io/reader/jsonl_reader.py index a610901..1ee7a31 100644 --- a/koza/io/reader/jsonl_reader.py +++ b/koza/io/reader/jsonl_reader.py @@ -1,6 +1,7 @@ import json import logging from typing import IO, Any, Dict, Iterator, List +from koza.io.utils import check_data LOG = logging.getLogger(__name__) @@ -45,16 +46,18 @@ def __next__(self) -> Dict[str, Any]: json_obj = json.loads(next_line) + # Check that required properties exist in row if self.required_properties: - if not set(json_obj.keys()) >= set(self.required_properties): - # TODO - have koza runner handle this exception - # based on some configuration? 
def check_data(entry, path) -> bool:
    """
    Report whether a dot-delimited path resolves inside a nested mapping.

    Each segment of ``path`` is looked up as a key in the mapping reached
    by the previous segment. The walk stops at the first segment that
    cannot be resolved, so a missing parent can never be masked by a
    same-named key that happens to exist at an outer level.

    :param entry: the (possibly nested) mapping to inspect, e.g. one parsed
        JSON/YAML row
    :param path: dot-delimited key path such as ``"evidence.publicationId"``
    :return: True if every segment of the path exists, False otherwise.
        A key whose value is ``None`` still counts as existing.
    """
    from collections.abc import Mapping

    current = entry
    for tag in path.split("."):
        # A scalar, string or list cannot hold a named child: treat the
        # path as missing instead of raising or doing a substring match.
        if not isinstance(current, Mapping) or tag not in current:
            return False
        current = current[tag]
    return True
# Gzipped JSON fixture shared by every test in this module.
test_zfin_data = Path(__file__).parents[1].joinpath(
    'resources', 'source-files', 'test_BGI_ZFIN.json.gz'
)

# Value handed to JSONReader's ``json_path`` argument in each test.
json_path = ['data', 0]


def test_normal_case():
    """The first row produced by the reader has the expected ``symbol``."""
    with gzip.open(test_zfin_data, 'rt') as handle:
        first_row = next(JSONReader(handle, json_path=json_path))
        assert first_row['symbol'] == 'gdnfa'


def test_required_properties():
    """Every row exposes a top-level and a nested (dot-path) property."""
    # NOTE(review): the second positional argument is presumably the
    # required-properties list -- confirm against JSONReader's signature.
    wanted = ['name', 'basicGeneticEntity.primaryId']
    with gzip.open(test_zfin_data, 'rt') as handle:
        reader = JSONReader(handle, wanted, json_path=json_path)
        for record in reader:
            print(record)
            assert record['name']
            assert record['basicGeneticEntity']['primaryId']


def test_missing_req_property_raises_exception():
    """Asking for a property the data lacks raises ``ValueError``."""
    with gzip.open(test_zfin_data, 'rt') as handle:
        reader = JSONReader(handle, ['fake_prop'], json_path=json_path)
        with pytest.raises(ValueError):
            next(reader)