Skip to content

Commit

Permalink
Merge pull request #81 from monarch-initiative/extract-targz
Browse files Browse the repository at this point in the history
Add "file_archive" attribute for SourceConfig
  • Loading branch information
kevinschaper authored Apr 8, 2022
2 parents 5e05923 + 3990b18 commit dbebf8d
Show file tree
Hide file tree
Showing 14 changed files with 103 additions and 52 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
koza-env/
tests/resources/source-files/string.tsv*

# Byte-compiled / optimized / DLL files
__pycache__/
Expand Down
Binary file removed examples/data/ZFIN_PHENOTYPE_0.jsonl.gz
Binary file not shown.
Binary file removed examples/data/ddpheno.json.gz
Binary file not shown.
2 changes: 1 addition & 1 deletion koza/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
"""Koza, an ETL framework for LinkML data models"""
__version__ = '0.1.10'
__version__ = '0.1.11'
4 changes: 1 addition & 3 deletions koza/cli_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
from koza.io.utils import open_resource
from koza.io.yaml_loader import UniqueIncludeLoader
from koza.model.config.source_config import (
CompressionType,
FormatType,
OutputFormat,
PrimaryFileConfig,
Expand Down Expand Up @@ -61,7 +60,6 @@ def validate_file(
format: FormatType = FormatType.csv,
delimiter: str = ',',
header_delimiter: str = None,
compression: CompressionType = None,
skip_blank_lines: bool = True,
):
"""
Expand All @@ -71,7 +69,7 @@ def validate_file(
For json and jsonl just validates them
"""

with open_resource(file, compression) as resource_io:
with open_resource(file) as resource_io:

if format == FormatType.csv:
reader = CSVReader(
Expand Down
24 changes: 7 additions & 17 deletions koza/io/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,7 @@

import requests

from koza.model.config.source_config import CompressionType

def open_resource(resource: Union[str, PathLike], compression: CompressionType = None) -> IO[str]:
def open_resource(resource: Union[str, PathLike]) -> IO[str]:
"""
A generic function for opening a local or remote file
Expand All @@ -27,25 +25,17 @@ def open_resource(resource: Union[str, PathLike], compression: CompressionType =
that requests does not support FTP (consider ftplib or urllib.request)
:param resource: str or PathLike - local filepath or remote resource
:param compression: str or PathLike - compression type
:return: str, next line in resource
"""
if Path(resource).exists():
if compression is None:
# Try gzip first
try:
file = gzip.open(resource, 'rt')
file.read(1)
file.seek(0)

except OSError:
file = open(resource, 'r')
elif compression == CompressionType.gzip:
# Try gzip first
try:
file = gzip.open(resource, 'rt')
else:
file.read(1)
file.seek(0)
except OSError:
file = open(resource, 'r')

return file

elif isinstance(resource, str) and resource.startswith('http'):
Expand All @@ -56,7 +46,7 @@ def open_resource(resource: Union[str, PathLike], compression: CompressionType =
tmp_file.write(request.content)
request.close() # not sure this is needed
tmp_file.seek(0)
if resource.endswith('gz') or compression == CompressionType.gzip:
if resource.endswith('gz'):
# This should be more robust, either check headers
# or use https://github.com/ahupp/python-magic
remote_file = gzip.open(tmp_file, 'rt')
Expand Down
5 changes: 2 additions & 3 deletions koza/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
import typer

from koza.cli_runner import transform_source, validate_file
from koza.model.config.source_config import CompressionType, FormatType, OutputFormat
from koza.model.config.source_config import FormatType, OutputFormat

typer_app = typer.Typer()

Expand Down Expand Up @@ -51,7 +51,6 @@ def validate(
format: FormatType = FormatType.csv,
delimiter: str = ',',
header_delimiter: str = None,
compression: CompressionType = None,
skip_blank_lines: bool = True,
):
"""
Expand All @@ -62,7 +61,7 @@ def validate(
"""
_set_log_level(debug=True)
validate_file(
file, format, delimiter, header_delimiter, compression, skip_blank_lines
file, format, delimiter, header_delimiter, skip_blank_lines
)


Expand Down
47 changes: 22 additions & 25 deletions koza/model/config/source_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,14 @@
source config data class
map config data class
"""
import logging
import os, logging
import tarfile, zipfile
import yaml
from dataclasses import field
from enum import Enum
from pathlib import Path
from typing import Dict, List, Union

import yaml
from pydantic import StrictFloat, StrictInt, StrictStr
from pydantic.dataclasses import dataclass

Expand All @@ -25,7 +26,6 @@ class MapErrorEnum(str, Enum):
warning = 'warning'
error = 'error'


class FormatType(str, Enum):
"""
Enum for supported file types
Expand All @@ -37,21 +37,11 @@ class FormatType(str, Enum):
yaml = 'yaml'
xml = 'xml' # TODO


class StandardFormat(str, Enum):
    """
    Enum of named standard formats (gpi, bgi, oban) that can be
    referenced by a source config instead of a raw FormatType
    """
    gpi = 'gpi'
    bgi = 'bgi'
    oban = 'oban'


class CompressionType(str, Enum):
"""
Enum for supported compression
"""

gzip = 'gzip'


class FilterCode(str, Enum):
"""
Enum for filter codes
Expand All @@ -68,7 +58,6 @@ class FilterCode(str, Enum):
ne = 'ne'
inlist = 'in'


class FilterInclusion(str, Enum):
"""
Enum for filter inclusion/exclusion
Expand All @@ -77,7 +66,6 @@ class FilterInclusion(str, Enum):
include = 'include'
exclude = 'exclude'


class FieldType(str, Enum):
"""
Enum for filter codes
Expand All @@ -88,7 +76,6 @@ class FieldType(str, Enum):
int = 'int'
float = 'float'


class OutputFormat(str, Enum):
"""
Output formats
Expand All @@ -98,7 +85,6 @@ class OutputFormat(str, Enum):
jsonl = 'jsonl'
kgx = 'kgx'


class TransformMode(str, Enum):
"""
Configures how an external transform file is processed
Expand All @@ -110,7 +96,6 @@ class TransformMode(str, Enum):
flat = 'flat'
loop = 'loop'


class HeaderMode(str, Enum):
"""
Enum for supported header modes in addition to an index based lookup
Expand All @@ -119,15 +104,13 @@ class HeaderMode(str, Enum):
infer = 'infer'
none = 'none'


@dataclass(frozen=True)
class ColumnFilter:
    """
    A single row filter on one column: compare the column's value to
    `value` with the operator named by `filter_code`, and either
    include or exclude matching rows per `inclusion`
    """
    column: str  # name of the column the filter applies to
    inclusion: FilterInclusion  # include or exclude matching rows
    filter_code: FilterCode  # comparison operator (lt, gt, eq, in, ...)
    value: Union[StrictInt, StrictFloat, StrictStr, List[Union[StrictInt, StrictFloat, StrictStr]]]


@dataclass(frozen=True)
class DatasetDescription:
"""
Expand All @@ -148,7 +131,6 @@ class DatasetDescription:
license: str = None
rights: str = None


@dataclass(config=PydanticConfig)
class SourceConfig:
"""
Expand All @@ -172,6 +154,7 @@ class SourceConfig:

name: str
files: List[Union[str, Path]]
file_archive: Union[str, Path] = None
format: FormatType = FormatType.csv
metadata: Union[DatasetDescription, str] = None
columns: List[Union[str, Dict[str, FieldType]]] = None
Expand All @@ -181,22 +164,38 @@ class SourceConfig:
header: Union[int, HeaderMode] = HeaderMode.infer
comment_char: str = '#'
skip_blank_lines: bool = True
compression: CompressionType = None
filters: List[ColumnFilter] = field(default_factory=list)
json_path: List[Union[StrictStr, StrictInt]] = None
transform_code: str = None
transform_mode: TransformMode = TransformMode.flat
global_table: Union[str, Dict] = None
local_table: Union[str, Dict] = None

def extract_archive(self):
    """
    Extract self.file_archive into the archive's parent directory and
    return the entries of self.files resolved against that directory.

    Supported archive types: .tar, .tar.gz, .zip

    :return: List[str] - paths to the (now extracted) source files
    :raises ValueError: if the archive extension is not a supported type
    """
    # Coerce to str: file_archive is declared Union[str, Path], and Path
    # has no .endswith(); the original code crashed on Path input.
    archive_name = str(self.file_archive)
    archive_path = Path(archive_name).parent
    # NOTE(review): extractall() does not guard against path traversal
    # ("tar slip") from a malicious archive — archives are assumed trusted.
    if archive_name.endswith(('.tar.gz', '.tar')):
        with tarfile.open(archive_name) as archive:
            archive.extractall(archive_path)
    elif archive_name.endswith('.zip'):
        with zipfile.ZipFile(archive_name, 'r') as archive:
            archive.extractall(archive_path)
    else:
        # Message now matches the branches above (.tar was accepted but
        # missing from the original error text)
        raise ValueError(
            "Error extracting archive. Supported archive types: .tar, .tar.gz, .zip"
        )
    return [os.path.join(archive_path, file) for file in self.files]

def __post_init_post_parse__(self):
"""
TO DO figure out why we're using object.__setattr__(self, ...
here and document it
"""
if self.file_archive:
files = self.extract_archive()
else:
files = self.files

files_as_paths: List[Path] = []
for file in self.files:
for file in files:
if isinstance(file, str):
files_as_paths.append(Path(file))
else:
Expand Down Expand Up @@ -294,7 +293,6 @@ def __post_init_post_parse__(self):
def field_type_map(self):
return self._field_type_map


@dataclass(config=PydanticConfig)
class PrimaryFileConfig(SourceConfig):
"""
Expand All @@ -307,7 +305,6 @@ class PrimaryFileConfig(SourceConfig):
depends_on: List[str] = field(default_factory=list)
on_map_failure: MapErrorEnum = MapErrorEnum.warning


@dataclass(config=PydanticConfig)
class MapFileConfig(SourceConfig):
key: str = None
Expand Down
2 changes: 1 addition & 1 deletion koza/model/source.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ def __init__(
self.config = config

for file in config.files:
resource_io = open_resource(file, config.compression)
resource_io = open_resource(file)
if self.config.format == 'csv':
self._readers.append(
CSVReader(
Expand Down
23 changes: 23 additions & 0 deletions tests/integration/test_archives.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
import os, yaml
from pathlib import Path

from koza.model.config.source_config import PrimaryFileConfig
from koza.io.yaml_loader import UniqueIncludeLoader

def test_archive_targz():
    """Loading a source config with a file_archive extracts the archived data file."""
    config_path = Path('tests/resources/string.yaml')
    extracted = Path('tests/resources/source-files/string.tsv.gz')

    # Start from a clean slate: remove any previously extracted copy
    if os.path.exists(extracted.absolute()):
        os.remove(extracted.absolute())

    # Build the config object from the test yaml
    with open(config_path.absolute(), 'r') as config_file:
        config = PrimaryFileConfig(**yaml.load(config_file, Loader=UniqueIncludeLoader))

    # Extraction runs in __post_init_post_parse__ (normally invoked by
    # pydantic after validation) - call it explicitly here
    config.__post_init_post_parse__()

    assert os.path.exists(extracted)
Binary file modified tests/resources/source-files/ddpheno.json.gz
Binary file not shown.
Binary file added tests/resources/source-files/string.tar.gz
Binary file not shown.
45 changes: 45 additions & 0 deletions tests/resources/string.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
name: 'zfin_phenotype_0'

format: 'csv'

delimiter: '\t'

#header_delimiter: '\t'

file_archive: 'tests/resources/source-files/string.tar.gz'

files:
- 'string.tsv'

global_table: 'tests/resources/translation_table.yaml'

columns:
- 'protein1'
- 'protein2'
- 'neighborhood'
- 'fusion'
- 'cooccurence'
- 'coexpression'
- 'experimental'
- 'database'
- 'textmining'
- 'combined_score' : 'int'

filters:
- column: 'combined_score'
inclusion: 'include'
filter_code: 'lt'
value: 700

#transform_code: './examples/string/protein-links-detailed.py'

#transform_mode: 'loop'

edge_properties:
- 'id'
- 'subject'
- 'predicate'
- 'object'
- 'category'
- 'relation'
- 'provided_by'
2 changes: 0 additions & 2 deletions tests/unit/resources/primary-source.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,6 @@ on_map_failure: 'warning'
delimiter: '\t'
header_delimiter: '\t'

compression: 'gzip'

files:
- '9606.protein.links.detailed.v11.0.txt.gz'
- '10090.protein.links.detailed.v11.0.txt.gz'
Expand Down

0 comments on commit dbebf8d

Please sign in to comment.