Merge pull request #73 from monarch-initiative/feature/line_limit

Feature/line limit
monarch-initiative · Feb 15, 2022 · 61289d3 · 61289d3
2 parents 27c2cc1 + d57f430
commit 61289d3
Show file tree

Hide file tree

Showing 13 changed files with 174 additions and 16 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,5 @@
+koza-env/
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]

diff --git a/README.md b/README.md
@@ -6,10 +6,10 @@
 
 ![pupa](docs/img/pupa.png) Data transformation framework
 
-*Disclaimer*: Koza is in beta; we are looking for beta testers
+_Disclaimer_: Koza is in beta; we are looking for beta testers
 
 Transform csv, json, yaml, jsonl, and xml and converting them to a target
-csv, json, or jsonl format based on your dataclass model.  Koza also can output
+csv, json, or jsonl format based on your dataclass model. Koza also can output
 data in the [KGX format](https://github.com/biolink/kgx/blob/master/specification/kgx-format.md#kgx-format-as-tsv)
 
 **Documentation**: https://koza.monarchinitiative.org/
@@ -21,7 +21,6 @@ data in the [KGX format](https://github.com/biolink/kgx/blob/master/specificatio
 - Create or import mapping files to be used in ingests (eg id mapping, type mappings)
 - Create and use translation tables to map between source and target vocabularies
 
-
 #### Installation
 
 ```
@@ -30,7 +29,7 @@ pip install koza
 
 #### Getting Started
 
-Send a local or remove csv file through Koza to get some basic information (headers, number of rows)
+Send a local or remote csv file through Koza to get some basic information (headers, number of rows)
 
 ```bash
 koza validate \
@@ -39,6 +38,7 @@ koza validate \
 ```
 
 Sending a json or jsonl formatted file will confirm if the file is valid json or jsonl
+
 ```bash
 koza validate \
   --file ./examples/data/ZFIN_PHENOTYPE_0.jsonl.gz \
@@ -55,7 +55,7 @@ koza validate \
 ###### Example: transforming StringDB
 
 ```bash
-koza transform --source examples/string/protein-links-detailed.yaml --global-table examples/translation_table.yaml 
+koza transform --source examples/string/protein-links-detailed.yaml --global-table examples/translation_table.yaml
 
 koza transform --source examples/string-declarative/protein-links-detailed.yaml --global-table examples/translation_table.yaml
 ```
diff --git a/koza/__init__.py b/koza/__init__.py
@@ -1,2 +1,2 @@
 """Koza, an ETL framework for LinkML data models"""
-__version__ = '0.1.6'
+__version__ = '0.1.7'
diff --git a/koza/cli_runner.py b/koza/cli_runner.py
@@ -133,6 +133,7 @@ def transform_source(
     output_format: OutputFormat = OutputFormat('tsv'),
     global_table: str = None,
     local_table: str = None,
+    row_limit: int = None
 ):
 
     with open(source, 'r') as source_fh:
@@ -144,7 +145,7 @@ def transform_source(
             # look for it alongside the source conf as a .py file
             source_config.transform_code = str(Path(source).parent / Path(source).stem) + '.py'
 
-        koza_source = Source(source_config)
+        koza_source = Source(source_config, row_limit)
 
         translation_table = get_translation_table(global_table if global_table else source_config.global_table,
                                                   local_table if local_table else source_config.local_table)

diff --git a/koza/io/reader/csv_reader.py b/koza/io/reader/csv_reader.py
@@ -51,6 +51,7 @@ def __init__(
         skip_blank_lines: bool = True,
         name: str = "csv file",
         comment_char: str = "#",
+        row_limit: int = None,
         *args,
         **kwargs,
     ):
@@ -70,6 +71,7 @@ def __init__(
         :param skip_blank_lines: true to skip blank lines, false to insert NaN for blank lines,
         :param name: filename or alias
         :param comment_char: string representing a commented line, eg # or !!
+        :param row_limit: int number of lines to process
         :param args: additional args to pass to csv.reader
         :param kwargs: additional kwargs to pass to csv.reader
         """
@@ -81,9 +83,14 @@ def __init__(
         self.skip_blank_lines = skip_blank_lines
         self.name = name
         self.comment_char = comment_char
+        self.row_limit = row_limit
 
+        # used by _set_header
         self.line_num = 0
 
+        # used for row_limit
+        self.line_count = 0
+
         self._header = None
 
         if delimiter == '\\s':
@@ -100,9 +107,13 @@ def __next__(self) -> Dict[str, Any]:
 
         if not self._header:
             self._set_header()
-
+                
         try:
-            row = next(self.reader)
+            if self.line_count == self.row_limit:
+                raise StopIteration
+            else:
+                row = next(self.reader)
+                self.line_count += 1
         except StopIteration:
             LOG.info(f"Finished processing {self.line_num} rows for {self.name}")
             raise StopIteration

diff --git a/koza/io/reader/json_reader.py b/koza/io/reader/json_reader.py
@@ -19,19 +19,21 @@ def __init__(
         json_path: List[Union[str, int]] = None,
         name: str = 'json file',
         is_yaml: bool = False,
+        row_limit: int = None,
     ):
         """
         :param io_str: Any IO stream that yields a string
                        See https://docs.python.org/3/library/io.html#io.IOBase
         :param required_properties: required top level properties
+        :param row_limit: integer number of non-header rows to process
         :param iterate_over: todo
         :param name: todo
         """
         self.io_str = io_str
         self.required_properties = required_properties
         self.json_path = json_path
         self.name = name
-
+        
         if self.json_path:
             if is_yaml:
                 self.json_obj = yaml.safe_load(self.io_str)
@@ -53,18 +55,23 @@ def __init__(
             self._len = 0
             self._line_num = 0
 
+        if row_limit:
+            self._line_limit = row_limit 
+        else:
+            self._line_limit = self._len
+
     def __iter__(self) -> Iterator:
         return self
 
     def __next__(self) -> Dict[str, Any]:
 
-        if self._line_num + 1 > self._len:
+        if self._line_num == self._line_limit:
             LOG.info(f"Finished processing {self.name}")
             raise StopIteration
 
         next_obj = self.json_obj[self._line_num]
 
-        self._line_num = self._line_num + 1
+        self._line_num += 1
 
         if self.required_properties:
             if not set(next_obj.keys()) >= set(self.required_properties):

diff --git a/koza/io/reader/jsonl_reader.py b/koza/io/reader/jsonl_reader.py
@@ -12,7 +12,10 @@ class JSONLReader:
     """
 
     def __init__(
-        self, io_str: IO[str], required_properties: List[str] = None, name: str = 'jsonl file'
+        self, io_str: IO[str], 
+        required_properties: List[str] = None, 
+        name: str = 'jsonl file',
+        row_limit: int = None,
     ):
         """
         :param io_str: Any IO stream that yields a string
@@ -24,6 +27,7 @@ def __init__(
         self.required_properties = required_properties
         self.line_num = 0
         self.name = name
+        self.line_limit = row_limit
 
     def __iter__(self) -> Iterator:
         return self
@@ -34,6 +38,10 @@ def __next__(self) -> Dict[str, Any]:
             LOG.info(f"Finished processing {self.line_num} lines")
             raise StopIteration
         self.line_num += 1
+        if self.line_limit:
+            if self.line_num == self.line_limit:
+                raise StopIteration
+
         json_obj = json.loads(next_line)
 
         if self.required_properties:

diff --git a/koza/main.py b/koza/main.py
@@ -27,6 +27,7 @@ def transform(
     local_table: str = typer.Option(None, help="Path to local translation table"),
     quiet: bool = False,
     debug: bool = False,
+    row_limit: int = typer.Option(None, help="Number of rows to process. If skipped, processes entire source file."),
 ):
     """
     Transform a source file
@@ -40,7 +41,7 @@ def transform(
     elif not output_path.exists():
         output_path.mkdir(parents=True)
 
-    transform_source(source, output_dir, output_format, global_table, local_table)
+    transform_source(source, output_dir, output_format, global_table, local_table, row_limit)
 
 
 @typer_app.command()

diff --git a/koza/model/source.py b/koza/model/source.py
@@ -26,14 +26,17 @@ class Source:
     def __init__(
         self,
         config: Union[PrimaryFileConfig, MapFileConfig, str],
+        row_limit: Optional[int] = None
     ):
 
         self.config = config
+        self.row_limit = row_limit
         self._filter = RowFilter(config.filters)
         self._reader = None
         self._readers: List = []
         self.last_row: Optional[Dict] = None
 
+
         if not isinstance(config, SourceConfig):
             # Check to see if it's a file path
             with open(config, 'r') as source_file_fh:
@@ -56,6 +59,7 @@ def __init__(
                         header_delimiter=config.header_delimiter,
                         header=config.header,
                         comment_char=self.config.comment_char,
+                        row_limit = self.row_limit
                     )
                 )
             elif self.config.format == 'jsonl':
@@ -64,6 +68,7 @@ def __init__(
                         resource_io,
                         name=config.name,
                         required_properties=config.required_properties,
+                        row_limit = self.row_limit
                     )
                 )
             elif self.config.format == 'json' or self.config.format == 'yaml':
@@ -74,6 +79,7 @@ def __init__(
                         json_path=config.json_path,
                         required_properties=config.required_properties,
                         is_yaml=(self.config.format == 'yaml'),
+                        row_limit = self.row_limit
                     )
                 )
             else:

diff --git a/tests/integration/test_examples.py b/tests/integration/test_examples.py
@@ -1,6 +1,5 @@
 """
 End to end test of String load from examples/string
-
 """
 
 from pathlib import Path
@@ -70,4 +69,3 @@ def test_examples(ingest, output_names, output_format):
         assert Path(file).stat().st_size > 0
 
     # TODO: at some point, these assertions could get more rigorous, but knowing if we have errors/exceptions is a start
-    # TODO: kgx validation could also be added back in, especially if something programatic is done with the output
diff --git a/tests/integration/test_row_limit.py b/tests/integration/test_row_limit.py
@@ -0,0 +1,51 @@
+"""
+Test the row_limit argument for transforms
+Assert correct number of rows has been processed
+"""
+#TODO: Parameterize row_limit, and test reading from JSON and JSONL 
+#TODO: Address filter in examples/string-declarative/protein-links-detailed.yaml
+
+from pathlib import Path
+
+import pytest
+
+from koza.cli_runner import transform_source
+from koza.model.config.source_config import OutputFormat
+
+@pytest.mark.parametrize(
+    "ingest, output_names, output_format, row_limit, header_len, expected_node_len, expected_edge_len",
+    [  
+        (
+            "string-declarative",       # ingest
+            ["protein-links-detailed"], # output_names
+            OutputFormat.tsv,           # output_format
+            3,                          # row_limit
+            1,                          # header_len
+            11,                         # expected_node_leng
+            6                           # expected_edge_leng
+        )
+    ]
+)
+def test_examples(ingest, output_names, output_format, row_limit, header_len, expected_node_len, expected_edge_len):
+
+    source = f"examples/{ingest}/protein-links-detailed.yaml"
+    output_suffix = ".tsv"
+    output_dir = f"./test-output/{ingest}-row-limit"
+
+    transform_source(source=source, 
+                    output_dir=output_dir, 
+                    output_format=output_format, 
+                    global_table="examples/translation_table.yaml", 
+                    local_table=None,
+                    row_limit=row_limit)
+
+    # hacky check that correct number of rows was processed
+    node_file = f"{output_dir}/protein-links-detailed_nodes{output_suffix}"
+    edge_file = f"{output_dir}/protein-links-detailed_edges{output_suffix}"
+
+    node_lines = sum(1 for line in open(node_file))
+    edge_lines = sum(1 for line in open(edge_file))
+
+    assert node_lines == expected_node_len
+    assert edge_lines == expected_edge_len
+
diff --git a/tests/unit/test_jsonlreader_row_limit.py b/tests/unit/test_jsonlreader_row_limit.py
@@ -0,0 +1,38 @@
+import gzip
+from pathlib import Path
+
+import pytest
+
+from koza.io.reader.jsonl_reader import JSONLReader
+
+test_zfin = (
+    Path(__file__).parent.parent / 'resources' / 'source-files' / 'ZFIN_PHENOTYPE_0.jsonl.gz'
+)
+
+
+def test_normal_case():
+    with gzip.open(test_zfin, 'rt') as zfin:
+        row_limit = 3
+        jsonl_reader = JSONLReader(zfin, row_limit=row_limit)
+        row = next(jsonl_reader)
+        assert len(row) == 6
+        assert row['objectId'] == 'ZFIN:ZDB-GENE-011026-1'
+
+
+def test_required_property():
+    with gzip.open(test_zfin, 'rt') as zfin:
+        row_limit = 3
+        row_count = 1
+        jsonl_reader = JSONLReader(zfin, ['objectId'], row_limit=row_limit)
+        for row in jsonl_reader:
+            row_count += 1
+            assert 'objectId' in row
+        assert row_count == row_limit
+
+
+def test_missing_req_property_raises_exception():
+    with gzip.open(test_zfin, 'rt') as zfin:
+        row_limit = 3
+        jsonl_reader = JSONLReader(zfin, ['objectId', 'foobar'], row_limit=row_limit)
+        with pytest.raises(ValueError):
+            next(jsonl_reader)
diff --git a/tests/unit/test_jsonreader_row_limit.py b/tests/unit/test_jsonreader_row_limit.py
@@ -0,0 +1,35 @@
+import gzip
+from pathlib import Path
+
+import pytest
+
+from koza.io.reader.json_reader import JSONReader
+
+test_ddpheno = Path(__file__).parents[1] / 'resources' / 'source-files' / 'ddpheno.json.gz'
+
+json_path = ['graphs', 0, 'nodes']
+
+
+def test_normal_case():
+    with gzip.open(test_ddpheno, 'rt') as ddpheno:
+        json_reader = JSONReader(ddpheno, json_path=json_path, row_limit=3)
+        row = next(json_reader)
+        assert row['id'] == 'http://purl.obolibrary.org/obo/DDPHENO_0001198'
+
+
+def test_required_properties():
+    with gzip.open(test_ddpheno, 'rt') as ddpheno:
+        row_limit=3
+        row_count = 0
+        json_reader = JSONReader(ddpheno, ['id'], json_path=json_path, row_limit=row_limit)
+        for row in json_reader:
+            row_count += 1
+            assert 'id' in row
+        assert row_count == row_limit
+
+
+def test_missing_req_property_raises_exception():
+    with gzip.open(test_ddpheno, 'rt') as ddpheno:
+        json_reader = JSONReader(ddpheno, ['fake_prop'], json_path=json_path, row_limit=3)
+        with pytest.raises(ValueError):
+            next(json_reader)