Skip to content

Commit

Permalink
Merge pull request #73 from monarch-initiative/feature/line_limit
Browse files Browse the repository at this point in the history
Feature/line limit
  • Loading branch information
kevinschaper authored Feb 15, 2022
2 parents 27c2cc1 + d57f430 commit 61289d3
Show file tree
Hide file tree
Showing 13 changed files with 174 additions and 16 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
koza-env/

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
Expand Down
10 changes: 5 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,10 @@

![pupa](docs/img/pupa.png) Data transformation framework

*Disclaimer*: Koza is in beta; we are looking for beta testers
_Disclaimer_: Koza is in beta; we are looking for beta testers

Transform csv, json, yaml, jsonl, and xml and converting them to a target
csv, json, or jsonl format based on your dataclass model. Koza also can output
csv, json, or jsonl format based on your dataclass model. Koza also can output
data in the [KGX format](https://github.com/biolink/kgx/blob/master/specification/kgx-format.md#kgx-format-as-tsv)

**Documentation**: https://koza.monarchinitiative.org/
Expand All @@ -21,7 +21,6 @@ data in the [KGX format](https://github.com/biolink/kgx/blob/master/specificatio
- Create or import mapping files to be used in ingests (eg id mapping, type mappings)
- Create and use translation tables to map between source and target vocabularies


#### Installation

```
Expand All @@ -30,7 +29,7 @@ pip install koza

#### Getting Started

Send a local or remove csv file through Koza to get some basic information (headers, number of rows)
Send a local or remote csv file through Koza to get some basic information (headers, number of rows)

```bash
koza validate \
Expand All @@ -39,6 +38,7 @@ koza validate \
```

Sending a json or jsonl formatted file will confirm if the file is valid json or jsonl

```bash
koza validate \
--file ./examples/data/ZFIN_PHENOTYPE_0.jsonl.gz \
Expand All @@ -55,7 +55,7 @@ koza validate \
###### Example: transforming StringDB

```bash
koza transform --source examples/string/protein-links-detailed.yaml --global-table examples/translation_table.yaml
koza transform --source examples/string/protein-links-detailed.yaml --global-table examples/translation_table.yaml

koza transform --source examples/string-declarative/protein-links-detailed.yaml --global-table examples/translation_table.yaml
```
2 changes: 1 addition & 1 deletion koza/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
"""Koza, an ETL framework for LinkML data models"""
__version__ = '0.1.6'
__version__ = '0.1.7'
3 changes: 2 additions & 1 deletion koza/cli_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,7 @@ def transform_source(
output_format: OutputFormat = OutputFormat('tsv'),
global_table: str = None,
local_table: str = None,
row_limit: int = None
):

with open(source, 'r') as source_fh:
Expand All @@ -144,7 +145,7 @@ def transform_source(
# look for it alongside the source conf as a .py file
source_config.transform_code = str(Path(source).parent / Path(source).stem) + '.py'

koza_source = Source(source_config)
koza_source = Source(source_config, row_limit)

translation_table = get_translation_table(global_table if global_table else source_config.global_table,
local_table if local_table else source_config.local_table)
Expand Down
15 changes: 13 additions & 2 deletions koza/io/reader/csv_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ def __init__(
skip_blank_lines: bool = True,
name: str = "csv file",
comment_char: str = "#",
row_limit: int = None,
*args,
**kwargs,
):
Expand All @@ -70,6 +71,7 @@ def __init__(
:param skip_blank_lines: true to skip blank lines, false to insert NaN for blank lines,
:param name: filename or alias
:param comment_char: string representing a commented line, eg # or !!
:param row_limit: int number of lines to process
:param args: additional args to pass to csv.reader
:param kwargs: additional kwargs to pass to csv.reader
"""
Expand All @@ -81,9 +83,14 @@ def __init__(
self.skip_blank_lines = skip_blank_lines
self.name = name
self.comment_char = comment_char
self.row_limit = row_limit

# used by _set_header
self.line_num = 0

# used for row_limit
self.line_count = 0

self._header = None

if delimiter == '\\s':
Expand All @@ -100,9 +107,13 @@ def __next__(self) -> Dict[str, Any]:

if not self._header:
self._set_header()

try:
row = next(self.reader)
if self.line_count == self.row_limit:
raise StopIteration
else:
row = next(self.reader)
self.line_count += 1
except StopIteration:
LOG.info(f"Finished processing {self.line_num} rows for {self.name}")
raise StopIteration
Expand Down
13 changes: 10 additions & 3 deletions koza/io/reader/json_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,19 +19,21 @@ def __init__(
json_path: List[Union[str, int]] = None,
name: str = 'json file',
is_yaml: bool = False,
row_limit: int = None,
):
"""
:param io_str: Any IO stream that yields a string
See https://docs.python.org/3/library/io.html#io.IOBase
:param required_properties: required top level properties
:param row_limit: integer number of non-header rows to process
:param iterate_over: todo
:param name: todo
"""
self.io_str = io_str
self.required_properties = required_properties
self.json_path = json_path
self.name = name

if self.json_path:
if is_yaml:
self.json_obj = yaml.safe_load(self.io_str)
Expand All @@ -53,18 +55,23 @@ def __init__(
self._len = 0
self._line_num = 0

if row_limit:
self._line_limit = row_limit
else:
self._line_limit = self._len

def __iter__(self) -> Iterator:
return self

def __next__(self) -> Dict[str, Any]:

if self._line_num + 1 > self._len:
if self._line_num == self._line_limit:
LOG.info(f"Finished processing {self.name}")
raise StopIteration

next_obj = self.json_obj[self._line_num]

self._line_num = self._line_num + 1
self._line_num += 1

if self.required_properties:
if not set(next_obj.keys()) >= set(self.required_properties):
Expand Down
10 changes: 9 additions & 1 deletion koza/io/reader/jsonl_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,10 @@ class JSONLReader:
"""

def __init__(
self, io_str: IO[str], required_properties: List[str] = None, name: str = 'jsonl file'
self, io_str: IO[str],
required_properties: List[str] = None,
name: str = 'jsonl file',
row_limit: int = None,
):
"""
:param io_str: Any IO stream that yields a string
Expand All @@ -24,6 +27,7 @@ def __init__(
self.required_properties = required_properties
self.line_num = 0
self.name = name
self.line_limit = row_limit

def __iter__(self) -> Iterator:
return self
Expand All @@ -34,6 +38,10 @@ def __next__(self) -> Dict[str, Any]:
LOG.info(f"Finished processing {self.line_num} lines")
raise StopIteration
self.line_num += 1
if self.line_limit:
if self.line_num == self.line_limit:
raise StopIteration

json_obj = json.loads(next_line)

if self.required_properties:
Expand Down
3 changes: 2 additions & 1 deletion koza/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ def transform(
local_table: str = typer.Option(None, help="Path to local translation table"),
quiet: bool = False,
debug: bool = False,
row_limit: int = typer.Option(None, help="Number of rows to process. If skipped, processes entire source file."),
):
"""
Transform a source file
Expand All @@ -40,7 +41,7 @@ def transform(
elif not output_path.exists():
output_path.mkdir(parents=True)

transform_source(source, output_dir, output_format, global_table, local_table)
transform_source(source, output_dir, output_format, global_table, local_table, row_limit)


@typer_app.command()
Expand Down
6 changes: 6 additions & 0 deletions koza/model/source.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,14 +26,17 @@ class Source:
def __init__(
self,
config: Union[PrimaryFileConfig, MapFileConfig, str],
row_limit: Optional[int] = None
):

self.config = config
self.row_limit = row_limit
self._filter = RowFilter(config.filters)
self._reader = None
self._readers: List = []
self.last_row: Optional[Dict] = None


if not isinstance(config, SourceConfig):
# Check to see if it's a file path
with open(config, 'r') as source_file_fh:
Expand All @@ -56,6 +59,7 @@ def __init__(
header_delimiter=config.header_delimiter,
header=config.header,
comment_char=self.config.comment_char,
row_limit = self.row_limit
)
)
elif self.config.format == 'jsonl':
Expand All @@ -64,6 +68,7 @@ def __init__(
resource_io,
name=config.name,
required_properties=config.required_properties,
row_limit = self.row_limit
)
)
elif self.config.format == 'json' or self.config.format == 'yaml':
Expand All @@ -74,6 +79,7 @@ def __init__(
json_path=config.json_path,
required_properties=config.required_properties,
is_yaml=(self.config.format == 'yaml'),
row_limit = self.row_limit
)
)
else:
Expand Down
2 changes: 0 additions & 2 deletions tests/integration/test_examples.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
"""
End to end test of String load from examples/string
"""

from pathlib import Path
Expand Down Expand Up @@ -70,4 +69,3 @@ def test_examples(ingest, output_names, output_format):
assert Path(file).stat().st_size > 0

# TODO: at some point, these assertions could get more rigorous, but knowing if we have errors/exceptions is a start
# TODO: kgx validation could also be added back in, especially if something programatic is done with the output
51 changes: 51 additions & 0 deletions tests/integration/test_row_limit.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
"""
Test the row_limit argument for transforms
Assert correct number of rows has been processed
"""
#TODO: Parameterize row_limit, and test reading from JSON and JSONL
#TODO: Address filter in examples/string-declarative/protein-links-detailed.yaml

from pathlib import Path

import pytest

from koza.cli_runner import transform_source
from koza.model.config.source_config import OutputFormat

@pytest.mark.parametrize(
"ingest, output_names, output_format, row_limit, header_len, expected_node_len, expected_edge_len",
[
(
"string-declarative", # ingest
["protein-links-detailed"], # output_names
OutputFormat.tsv, # output_format
3, # row_limit
1, # header_len
11, # expected_node_leng
6 # expected_edge_leng
)
]
)
def test_examples(ingest, output_names, output_format, row_limit, header_len, expected_node_len, expected_edge_len):

source = f"examples/{ingest}/protein-links-detailed.yaml"
output_suffix = ".tsv"
output_dir = f"./test-output/{ingest}-row-limit"

transform_source(source=source,
output_dir=output_dir,
output_format=output_format,
global_table="examples/translation_table.yaml",
local_table=None,
row_limit=row_limit)

# hacky check that correct number of rows was processed
node_file = f"{output_dir}/protein-links-detailed_nodes{output_suffix}"
edge_file = f"{output_dir}/protein-links-detailed_edges{output_suffix}"

node_lines = sum(1 for line in open(node_file))
edge_lines = sum(1 for line in open(edge_file))

assert node_lines == expected_node_len
assert edge_lines == expected_edge_len

38 changes: 38 additions & 0 deletions tests/unit/test_jsonlreader_row_limit.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
import gzip
from pathlib import Path

import pytest

from koza.io.reader.jsonl_reader import JSONLReader

test_zfin = (
Path(__file__).parent.parent / 'resources' / 'source-files' / 'ZFIN_PHENOTYPE_0.jsonl.gz'
)


def test_normal_case():
with gzip.open(test_zfin, 'rt') as zfin:
row_limit = 3
jsonl_reader = JSONLReader(zfin, row_limit=row_limit)
row = next(jsonl_reader)
assert len(row) == 6
assert row['objectId'] == 'ZFIN:ZDB-GENE-011026-1'


def test_required_property():
with gzip.open(test_zfin, 'rt') as zfin:
row_limit = 3
row_count = 1
jsonl_reader = JSONLReader(zfin, ['objectId'], row_limit=row_limit)
for row in jsonl_reader:
row_count += 1
assert 'objectId' in row
assert row_count == row_limit


def test_missing_req_property_raises_exception():
with gzip.open(test_zfin, 'rt') as zfin:
row_limit = 3
jsonl_reader = JSONLReader(zfin, ['objectId', 'foobar'], row_limit=row_limit)
with pytest.raises(ValueError):
next(jsonl_reader)
35 changes: 35 additions & 0 deletions tests/unit/test_jsonreader_row_limit.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
import gzip
from pathlib import Path

import pytest

from koza.io.reader.json_reader import JSONReader

test_ddpheno = Path(__file__).parents[1] / 'resources' / 'source-files' / 'ddpheno.json.gz'

json_path = ['graphs', 0, 'nodes']


def test_normal_case():
with gzip.open(test_ddpheno, 'rt') as ddpheno:
json_reader = JSONReader(ddpheno, json_path=json_path, row_limit=3)
row = next(json_reader)
assert row['id'] == 'http://purl.obolibrary.org/obo/DDPHENO_0001198'


def test_required_properties():
with gzip.open(test_ddpheno, 'rt') as ddpheno:
row_limit=3
row_count = 0
json_reader = JSONReader(ddpheno, ['id'], json_path=json_path, row_limit=row_limit)
for row in json_reader:
row_count += 1
assert 'id' in row
assert row_count == row_limit


def test_missing_req_property_raises_exception():
with gzip.open(test_ddpheno, 'rt') as ddpheno:
json_reader = JSONReader(ddpheno, ['fake_prop'], json_path=json_path, row_limit=3)
with pytest.raises(ValueError):
next(json_reader)

0 comments on commit 61289d3

Please sign in to comment.