diff --git a/.gitignore b/.gitignore
index a0f7880..3332b42 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,5 @@
+koza-env/
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
diff --git a/README.md b/README.md
index ca58670..0f4f2fe 100644
--- a/README.md
+++ b/README.md
@@ -6,10 +6,10 @@
 ![pupa](docs/img/pupa.png)
 
 Data transformation framework
-*Disclaimer*: Koza is in beta; we are looking for beta testers
+_Disclaimer_: Koza is in beta; we are looking for beta testers
 
 Transform csv, json, yaml, jsonl, and xml and converting them to a target
-csv, json, or jsonl format based on your dataclass model. Koza also can output 
+csv, json, or jsonl format based on your dataclass model. Koza also can output
 data in the [KGX format](https://github.com/biolink/kgx/blob/master/specification/kgx-format.md#kgx-format-as-tsv)
 
 **Documentation**: https://koza.monarchinitiative.org/
@@ -21,7 +21,6 @@ data in the [KGX format](https://github.com/biolink/kgx/blob/master/specificatio
 - Create or import mapping files to be used in ingests (eg id mapping, type mappings)
 - Create and use translation tables to map between source and target vocabularies
 
-
 #### Installation
 
 ```
@@ -30,7 +29,7 @@ pip install koza
 
 #### Getting Started
 
-Send a local or remove csv file through Koza to get some basic information (headers, number of rows)
+Send a local or remote csv file through Koza to get some basic information (headers, number of rows)
 
 ```bash
 koza validate \
@@ -39,6 +38,7 @@ koza validate \
 ```
 
 Sending a json or jsonl formatted file will confirm if the file is valid json or jsonl
+
 ```bash
 koza validate \
   --file ./examples/data/ZFIN_PHENOTYPE_0.jsonl.gz \
@@ -55,7 +55,7 @@ koza validate \
 ###### Example: transforming StringDB
 
 ```bash
-koza transform --source examples/string/protein-links-detailed.yaml --global-table examples/translation_table.yaml 
+koza transform --source examples/string/protein-links-detailed.yaml --global-table examples/translation_table.yaml
 koza transform --source examples/string-declarative/protein-links-detailed.yaml --global-table examples/translation_table.yaml
 ```
 
diff --git a/koza/__init__.py b/koza/__init__.py
index 04f67b9..51349fd 100644
--- a/koza/__init__.py
+++ b/koza/__init__.py
@@ -1,2 +1,2 @@
 """Koza, an ETL framework for LinkML data models"""
-__version__ = '0.1.6'
+__version__ = '0.1.7'
diff --git a/koza/cli_runner.py b/koza/cli_runner.py
index 26a3770..3a040e3 100644
--- a/koza/cli_runner.py
+++ b/koza/cli_runner.py
@@ -133,6 +133,7 @@ def transform_source(
     output_format: OutputFormat = OutputFormat('tsv'),
     global_table: str = None,
     local_table: str = None,
+    row_limit: int = None
 ):
 
     with open(source, 'r') as source_fh:
@@ -144,7 +145,7 @@ def transform_source(
         # look for it alongside the source conf as a .py file
         source_config.transform_code = str(Path(source).parent / Path(source).stem) + '.py'
 
-    koza_source = Source(source_config)
+    koza_source = Source(source_config, row_limit)
 
     translation_table = get_translation_table(global_table if global_table else source_config.global_table,
                                               local_table if local_table else source_config.local_table)
diff --git a/koza/io/reader/csv_reader.py b/koza/io/reader/csv_reader.py
index 238dedb..0707ed6 100644
--- a/koza/io/reader/csv_reader.py
+++ b/koza/io/reader/csv_reader.py
@@ -51,6 +51,7 @@ def __init__(
         skip_blank_lines: bool = True,
         name: str = "csv file",
         comment_char: str = "#",
+        row_limit: int = None,
         *args,
         **kwargs,
     ):
@@ -70,6 +71,7 @@ def __init__(
         :param skip_blank_lines: true to skip blank lines, false to insert NaN for blank lines,
         :param name: filename or alias
         :param comment_char: string representing a commented line, eg # or !!
+        :param row_limit: int number of lines to process
         :param args: additional args to pass to csv.reader
         :param kwargs: additional kwargs to pass to csv.reader
         """
@@ -81,9 +83,14 @@ def __init__(
         self.skip_blank_lines = skip_blank_lines
         self.name = name
         self.comment_char = comment_char
+        self.row_limit = row_limit
+
         # used by _set_header
         self.line_num = 0
 
+        # used for row_limit
+        self.line_count = 0
+
         self._header = None
 
         if delimiter == '\\s':
@@ -100,9 +107,13 @@ def __next__(self) -> Dict[str, Any]:
 
         if not self._header:
             self._set_header()
-
+
         try:
-            row = next(self.reader)
+            if self.line_count == self.row_limit:
+                raise StopIteration
+            else:
+                row = next(self.reader)
+                self.line_count += 1
         except StopIteration:
             LOG.info(f"Finished processing {self.line_num} rows for {self.name}")
             raise StopIteration
diff --git a/koza/io/reader/json_reader.py b/koza/io/reader/json_reader.py
index c1116e9..70f5d48 100644
--- a/koza/io/reader/json_reader.py
+++ b/koza/io/reader/json_reader.py
@@ -19,11 +19,13 @@ def __init__(
         json_path: List[Union[str, int]] = None,
         name: str = 'json file',
         is_yaml: bool = False,
+        row_limit: int = None,
     ):
         """
         :param io_str: Any IO stream that yields a string
                        See https://docs.python.org/3/library/io.html#io.IOBase
         :param required_properties: required top level properties
+        :param row_limit: integer number of non-header rows to process
         :param iterate_over: todo
         :param name: todo
         """
@@ -31,7 +33,7 @@ def __init__(
         self.required_properties = required_properties
         self.json_path = json_path
         self.name = name
-
+
         if self.json_path:
             if is_yaml:
                 self.json_obj = yaml.safe_load(self.io_str)
@@ -53,18 +55,23 @@ def __init__(
             self._len = 0
 
         self._line_num = 0
 
+        if row_limit:
+            self._line_limit = row_limit
+        else:
+            self._line_limit = self._len
+
     def __iter__(self) -> Iterator:
         return self
 
     def __next__(self) -> Dict[str, Any]:
 
-        if self._line_num + 1 > self._len:
+        if self._line_num == self._line_limit:
             LOG.info(f"Finished processing {self.name}")
             raise StopIteration
 
         next_obj = self.json_obj[self._line_num]
-        self._line_num = self._line_num + 1
+        self._line_num += 1
 
         if self.required_properties:
             if not set(next_obj.keys()) >= set(self.required_properties):
diff --git a/koza/io/reader/jsonl_reader.py b/koza/io/reader/jsonl_reader.py
index 126e53e..02cab73 100644
--- a/koza/io/reader/jsonl_reader.py
+++ b/koza/io/reader/jsonl_reader.py
@@ -12,7 +12,10 @@ class JSONLReader:
     """
 
     def __init__(
-        self, io_str: IO[str], required_properties: List[str] = None, name: str = 'jsonl file'
+        self, io_str: IO[str],
+        required_properties: List[str] = None,
+        name: str = 'jsonl file',
+        row_limit: int = None,
     ):
         """
         :param io_str: Any IO stream that yields a string
@@ -24,6 +27,7 @@ def __init__(
         self.required_properties = required_properties
         self.line_num = 0
         self.name = name
+        self.line_limit = row_limit
 
     def __iter__(self) -> Iterator:
         return self
@@ -34,6 +38,10 @@ def __next__(self) -> Dict[str, Any]:
             LOG.info(f"Finished processing {self.line_num} lines")
             raise StopIteration
         self.line_num += 1
+        if self.line_limit:
+            if self.line_num == self.line_limit:
+                raise StopIteration
+
         json_obj = json.loads(next_line)
 
         if self.required_properties:
diff --git a/koza/main.py b/koza/main.py
index b7a63b9..30435fa 100755
--- a/koza/main.py
+++ b/koza/main.py
@@ -27,6 +27,7 @@ def transform(
     local_table: str = typer.Option(None, help="Path to local translation table"),
     quiet: bool = False,
     debug: bool = False,
+    row_limit: int = typer.Option(None, help="Number of rows to process. If skipped, processes entire source file."),
 ):
     """
     Transform a source file
@@ -40,7 +41,7 @@ def transform(
     elif not output_path.exists():
         output_path.mkdir(parents=True)
 
-    transform_source(source, output_dir, output_format, global_table, local_table)
+    transform_source(source, output_dir, output_format, global_table, local_table, row_limit)
 
 
 @typer_app.command()
diff --git a/koza/model/source.py b/koza/model/source.py
index b3167cb..c18b521 100644
--- a/koza/model/source.py
+++ b/koza/model/source.py
@@ -26,14 +26,17 @@ class Source:
 
     def __init__(
         self,
        config: Union[PrimaryFileConfig, MapFileConfig, str],
+        row_limit: Optional[int] = None
     ):
         self.config = config
+        self.row_limit = row_limit
         self._filter = RowFilter(config.filters)
         self._reader = None
         self._readers: List = []
         self.last_row: Optional[Dict] = None
 
+
         if not isinstance(config, SourceConfig):
             # Check to see if it's a file path
             with open(config, 'r') as source_file_fh:
@@ -56,6 +59,7 @@ def __init__(
                     header_delimiter=config.header_delimiter,
                     header=config.header,
                     comment_char=self.config.comment_char,
+                    row_limit = self.row_limit
                 )
             )
         elif self.config.format == 'jsonl':
@@ -64,6 +68,7 @@ def __init__(
                     resource_io,
                     name=config.name,
                     required_properties=config.required_properties,
+                    row_limit = self.row_limit
                 )
             )
         elif self.config.format == 'json' or self.config.format == 'yaml':
@@ -74,6 +79,7 @@ def __init__(
                     json_path=config.json_path,
                     required_properties=config.required_properties,
                     is_yaml=(self.config.format == 'yaml'),
+                    row_limit = self.row_limit
                 )
             )
         else:
diff --git a/tests/integration/test_examples.py b/tests/integration/test_examples.py
index a52a2e3..feb44be 100644
--- a/tests/integration/test_examples.py
+++ b/tests/integration/test_examples.py
@@ -1,6 +1,5 @@
 """
 End to end test of String load from examples/string
-
 """
 
 from pathlib import Path
@@ -70,4 +69,3 @@ def test_examples(ingest, output_names, output_format):
         assert Path(file).stat().st_size > 0
 
     # TODO: at some point, these assertions could get more rigorous, but knowing if we have errors/exceptions is a start
-    # TODO: kgx validation could also be added back in, especially if something programatic is done with the output
diff --git a/tests/integration/test_row_limit.py b/tests/integration/test_row_limit.py
new file mode 100644
index 0000000..aa88a9c
--- /dev/null
+++ b/tests/integration/test_row_limit.py
@@ -0,0 +1,51 @@
+"""
+Test the row_limit argument for transforms
+Assert correct number of rows has been processed
+"""
+#TODO: Parameterize row_limit, and test reading from JSON and JSONL
+#TODO: Address filter in examples/string-declarative/protein-links-detailed.yaml
+
+from pathlib import Path
+
+import pytest
+
+from koza.cli_runner import transform_source
+from koza.model.config.source_config import OutputFormat
+
+@pytest.mark.parametrize(
+    "ingest, output_names, output_format, row_limit, header_len, expected_node_len, expected_edge_len",
+    [
+        (
+            "string-declarative",  # ingest
+            ["protein-links-detailed"],  # output_names
+            OutputFormat.tsv,  # output_format
+            3,  # row_limit
+            1,  # header_len
+            11,  # expected_node_leng
+            6  # expected_edge_leng
+        )
+    ]
+)
+def test_examples(ingest, output_names, output_format, row_limit, header_len, expected_node_len, expected_edge_len):
+
+    source = f"examples/{ingest}/protein-links-detailed.yaml"
+    output_suffix = ".tsv"
+    output_dir = f"./test-output/{ingest}-row-limit"
+
+    transform_source(source=source,
+                     output_dir=output_dir,
+                     output_format=output_format,
+                     global_table="examples/translation_table.yaml",
+                     local_table=None,
+                     row_limit=row_limit)
+
+    # hacky check that correct number of rows was processed
+    node_file = f"{output_dir}/protein-links-detailed_nodes{output_suffix}"
+    edge_file = f"{output_dir}/protein-links-detailed_edges{output_suffix}"
+
+    node_lines = sum(1 for line in open(node_file))
+    edge_lines = sum(1 for line in open(edge_file))
+
+    assert node_lines == expected_node_len
+    assert edge_lines == expected_edge_len
+    
\ No newline at end of file
diff --git a/tests/unit/test_jsonlreader_row_limit.py b/tests/unit/test_jsonlreader_row_limit.py
new file mode 100644
index 0000000..c72b61e
--- /dev/null
+++ b/tests/unit/test_jsonlreader_row_limit.py
@@ -0,0 +1,38 @@
+import gzip
+from pathlib import Path
+
+import pytest
+
+from koza.io.reader.jsonl_reader import JSONLReader
+
+test_zfin = (
+    Path(__file__).parent.parent / 'resources' / 'source-files' / 'ZFIN_PHENOTYPE_0.jsonl.gz'
+)
+
+
+def test_normal_case():
+    with gzip.open(test_zfin, 'rt') as zfin:
+        row_limit = 3
+        jsonl_reader = JSONLReader(zfin, row_limit=row_limit)
+        row = next(jsonl_reader)
+        assert len(row) == 6
+        assert row['objectId'] == 'ZFIN:ZDB-GENE-011026-1'
+
+
+def test_required_property():
+    with gzip.open(test_zfin, 'rt') as zfin:
+        row_limit = 3
+        row_count = 1
+        jsonl_reader = JSONLReader(zfin, ['objectId'], row_limit=row_limit)
+        for row in jsonl_reader:
+            row_count += 1
+            assert 'objectId' in row
+        assert row_count == row_limit
+
+
+def test_missing_req_property_raises_exception():
+    with gzip.open(test_zfin, 'rt') as zfin:
+        row_limit = 3
+        jsonl_reader = JSONLReader(zfin, ['objectId', 'foobar'], row_limit=row_limit)
+        with pytest.raises(ValueError):
+            next(jsonl_reader)
diff --git a/tests/unit/test_jsonreader_row_limit.py b/tests/unit/test_jsonreader_row_limit.py
new file mode 100644
index 0000000..3992055
--- /dev/null
+++ b/tests/unit/test_jsonreader_row_limit.py
@@ -0,0 +1,35 @@
+import gzip
+from pathlib import Path
+
+import pytest
+
+from koza.io.reader.json_reader import JSONReader
+
+test_ddpheno = Path(__file__).parents[1] / 'resources' / 'source-files' / 'ddpheno.json.gz'
+
+json_path = ['graphs', 0, 'nodes']
+
+
+def test_normal_case():
+    with gzip.open(test_ddpheno, 'rt') as ddpheno:
+        json_reader = JSONReader(ddpheno, json_path=json_path, row_limit=3)
+        row = next(json_reader)
+        assert row['id'] == 'http://purl.obolibrary.org/obo/DDPHENO_0001198'
+
+
+def test_required_properties():
+    with gzip.open(test_ddpheno, 'rt') as ddpheno:
+        row_limit=3
+        row_count = 0
+        json_reader = JSONReader(ddpheno, ['id'], json_path=json_path, row_limit=row_limit)
+        for row in json_reader:
+            row_count += 1
+            assert 'id' in row
+        assert row_count == row_limit
+
+
+def test_missing_req_property_raises_exception():
+    with gzip.open(test_ddpheno, 'rt') as ddpheno:
+        json_reader = JSONReader(ddpheno, ['fake_prop'], json_path=json_path, row_limit=3)
+        with pytest.raises(ValueError):
+            next(json_reader)
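
Below, outside the diff itself, is a minimal sketch of how the new row_limit argument could be exercised from Python once this change is installed. It mirrors the call made in tests/integration/test_row_limit.py; the output directory name is only an illustrative placeholder.

```python
from koza.cli_runner import transform_source
from koza.model.config.source_config import OutputFormat

# Transform only the first 3 rows of the declarative StringDB example source;
# leaving row_limit as None processes the entire file, as before.
transform_source(
    source="examples/string-declarative/protein-links-detailed.yaml",
    output_dir="./test-output/row-limit-demo",  # illustrative output location
    output_format=OutputFormat.tsv,
    global_table="examples/translation_table.yaml",
    local_table=None,
    row_limit=3,
)
```

The same limit should also be reachable from the CLI through the new typer option on `koza transform` (surfaced as `--row-limit` under typer's default underscore-to-dash conversion).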