Skip to content

Commit

Permalink
Merge pull request #81 from monarch-initiative/extract-targz
Browse files Browse the repository at this point in the history
Add "file_archive" attribute for SourceConfig
  • Loading branch information
kevinschaper authored Apr 8, 2022
2 parents 5e05923 + 3990b18 commit dbebf8d
Show file tree
Hide file tree
Showing 14 changed files with 103 additions and 52 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
koza-env/
tests/resources/source-files/string.tsv*

# Byte-compiled / optimized / DLL files
__pycache__/
Expand Down
Binary file removed examples/data/ZFIN_PHENOTYPE_0.jsonl.gz
Binary file not shown.
Binary file removed examples/data/ddpheno.json.gz
Binary file not shown.
2 changes: 1 addition & 1 deletion koza/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
"""Koza, an ETL framework for LinkML data models"""
__version__ = '0.1.10'
__version__ = '0.1.11'
4 changes: 1 addition & 3 deletions koza/cli_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
from koza.io.utils import open_resource
from koza.io.yaml_loader import UniqueIncludeLoader
from koza.model.config.source_config import (
CompressionType,
FormatType,
OutputFormat,
PrimaryFileConfig,
Expand Down Expand Up @@ -61,7 +60,6 @@ def validate_file(
format: FormatType = FormatType.csv,
delimiter: str = ',',
header_delimiter: str = None,
compression: CompressionType = None,
skip_blank_lines: bool = True,
):
"""
Expand All @@ -71,7 +69,7 @@ def validate_file(
For json and jsonl just validates them
"""

with open_resource(file, compression) as resource_io:
with open_resource(file) as resource_io:

if format == FormatType.csv:
reader = CSVReader(
Expand Down
24 changes: 7 additions & 17 deletions koza/io/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,7 @@

import requests

from koza.model.config.source_config import CompressionType

def open_resource(resource: Union[str, PathLike], compression: CompressionType = None) -> IO[str]:
def open_resource(resource: Union[str, PathLike]) -> IO[str]:
"""
A generic function for opening a local or remote file
Expand All @@ -27,25 +25,17 @@ def open_resource(resource: Union[str, PathLike], compression: CompressionType =
that requests does not support FTP (consider ftplib or urllib.request)
:param resource: str or PathLike - local filepath or remote resource
:param compression: str or PathLike - compression type
:return: str, next line in resource
"""
if Path(resource).exists():
if compression is None:
# Try gzip first
try:
file = gzip.open(resource, 'rt')
file.read(1)
file.seek(0)

except OSError:
file = open(resource, 'r')
elif compression == CompressionType.gzip:
# Try gzip first
try:
file = gzip.open(resource, 'rt')
else:
file.read(1)
file.seek(0)
except OSError:
file = open(resource, 'r')

return file

elif isinstance(resource, str) and resource.startswith('http'):
Expand All @@ -56,7 +46,7 @@ def open_resource(resource: Union[str, PathLike], compression: CompressionType =
tmp_file.write(request.content)
request.close() # not sure this is needed
tmp_file.seek(0)
if resource.endswith('gz') or compression == CompressionType.gzip:
if resource.endswith('gz'):
# This should be more robust, either check headers
# or use https://github.com/ahupp/python-magic
remote_file = gzip.open(tmp_file, 'rt')
Expand Down
5 changes: 2 additions & 3 deletions koza/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
import typer

from koza.cli_runner import transform_source, validate_file
from koza.model.config.source_config import CompressionType, FormatType, OutputFormat
from koza.model.config.source_config import FormatType, OutputFormat

typer_app = typer.Typer()

Expand Down Expand Up @@ -51,7 +51,6 @@ def validate(
format: FormatType = FormatType.csv,
delimiter: str = ',',
header_delimiter: str = None,
compression: CompressionType = None,
skip_blank_lines: bool = True,
):
"""
Expand All @@ -62,7 +61,7 @@ def validate(
"""
_set_log_level(debug=True)
validate_file(
file, format, delimiter, header_delimiter, compression, skip_blank_lines
file, format, delimiter, header_delimiter, skip_blank_lines
)


Expand Down
47 changes: 22 additions & 25 deletions koza/model/config/source_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,14 @@
source config data class
map config data class
"""
import logging
import os, logging
import tarfile, zipfile
import yaml
from dataclasses import field
from enum import Enum
from pathlib import Path
from typing import Dict, List, Union

import yaml
from pydantic import StrictFloat, StrictInt, StrictStr
from pydantic.dataclasses import dataclass

Expand All @@ -25,7 +26,6 @@ class MapErrorEnum(str, Enum):
warning = 'warning'
error = 'error'


class FormatType(str, Enum):
"""
Enum for supported file types
Expand All @@ -37,21 +37,11 @@ class FormatType(str, Enum):
yaml = 'yaml'
xml = 'xml' # TODO


class StandardFormat(str, Enum):
    """
    Enum of named standard formats (gpi, bgi, oban) that can be
    referenced by a source config instead of a raw FormatType
    """
    gpi = 'gpi'
    bgi = 'bgi'
    oban = 'oban'


class CompressionType(str, Enum):
"""
Enum for supported compression
"""

gzip = 'gzip'


class FilterCode(str, Enum):
"""
Enum for filter codes
Expand All @@ -68,7 +58,6 @@ class FilterCode(str, Enum):
ne = 'ne'
inlist = 'in'


class FilterInclusion(str, Enum):
"""
Enum for filter inclusion/exclusion
Expand All @@ -77,7 +66,6 @@ class FilterInclusion(str, Enum):
include = 'include'
exclude = 'exclude'


class FieldType(str, Enum):
"""
Enum for filter codes
Expand All @@ -88,7 +76,6 @@ class FieldType(str, Enum):
int = 'int'
float = 'float'


class OutputFormat(str, Enum):
"""
Output formats
Expand All @@ -98,7 +85,6 @@ class OutputFormat(str, Enum):
jsonl = 'jsonl'
kgx = 'kgx'


class TransformMode(str, Enum):
"""
Configures how an external transform file is processed
Expand All @@ -110,7 +96,6 @@ class TransformMode(str, Enum):
flat = 'flat'
loop = 'loop'


class HeaderMode(str, Enum):
"""
Enum for supported header modes in addition to an index based lookup
Expand All @@ -119,15 +104,13 @@ class HeaderMode(str, Enum):
infer = 'infer'
none = 'none'


@dataclass(frozen=True)
class ColumnFilter:
    """
    A single row filter on one column: compare the column's value to
    `value` with the operator named by `filter_code`, and either
    include or exclude matching rows per `inclusion`
    """
    column: str  # name of the column the filter applies to
    inclusion: FilterInclusion  # include or exclude matching rows
    filter_code: FilterCode  # comparison operator (lt, gt, eq, in, ...)
    value: Union[StrictInt, StrictFloat, StrictStr, List[Union[StrictInt, StrictFloat, StrictStr]]]


@dataclass(frozen=True)
class DatasetDescription:
"""
Expand All @@ -148,7 +131,6 @@ class DatasetDescription:
license: str = None
rights: str = None


@dataclass(config=PydanticConfig)
class SourceConfig:
"""
Expand All @@ -172,6 +154,7 @@ class SourceConfig:

name: str
files: List[Union[str, Path]]
file_archive: Union[str, Path] = None
format: FormatType = FormatType.csv
metadata: Union[DatasetDescription, str] = None
columns: List[Union[str, Dict[str, FieldType]]] = None
Expand All @@ -181,22 +164,38 @@ class SourceConfig:
header: Union[int, HeaderMode] = HeaderMode.infer
comment_char: str = '#'
skip_blank_lines: bool = True
compression: CompressionType = None
filters: List[ColumnFilter] = field(default_factory=list)
json_path: List[Union[StrictStr, StrictInt]] = None
transform_code: str = None
transform_mode: TransformMode = TransformMode.flat
global_table: Union[str, Dict] = None
local_table: Union[str, Dict] = None

def extract_archive(self):
    """
    Extract self.file_archive into the archive's parent directory and
    return the entries of self.files resolved against that directory.

    Supported archive types: .tar, .tar.gz, .zip

    :return: List[str] - paths to the (now extracted) source files
    :raises ValueError: if the archive extension is not a supported type
    """
    # Coerce to str: file_archive is declared Union[str, Path], and Path
    # has no .endswith(); the original code crashed on Path input.
    archive_name = str(self.file_archive)
    archive_path = Path(archive_name).parent
    # NOTE(review): extractall() does not guard against path traversal
    # ("tar slip") from a malicious archive — archives are assumed trusted.
    if archive_name.endswith(('.tar.gz', '.tar')):
        with tarfile.open(archive_name) as archive:
            archive.extractall(archive_path)
    elif archive_name.endswith('.zip'):
        with zipfile.ZipFile(archive_name, 'r') as archive:
            archive.extractall(archive_path)
    else:
        # Message now matches the branches above (.tar was accepted but
        # missing from the original error text)
        raise ValueError(
            "Error extracting archive. Supported archive types: .tar, .tar.gz, .zip"
        )
    return [os.path.join(archive_path, file) for file in self.files]

def __post_init_post_parse__(self):
"""
TO DO figure out why we're using object.__setattr__(self, ...
here and document it
"""
if self.file_archive:
files = self.extract_archive()
else:
files = self.files

files_as_paths: List[Path] = []
for file in self.files:
for file in files:
if isinstance(file, str):
files_as_paths.append(Path(file))
else:
Expand Down Expand Up @@ -294,7 +293,6 @@ def __post_init_post_parse__(self):
def field_type_map(self):
return self._field_type_map


@dataclass(config=PydanticConfig)
class PrimaryFileConfig(SourceConfig):
"""
Expand All @@ -307,7 +305,6 @@ class PrimaryFileConfig(SourceConfig):
depends_on: List[str] = field(default_factory=list)
on_map_failure: MapErrorEnum = MapErrorEnum.warning


@dataclass(config=PydanticConfig)
class MapFileConfig(SourceConfig):
key: str = None
Expand Down
2 changes: 1 addition & 1 deletion koza/model/source.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ def __init__(
self.config = config

for file in config.files:
resource_io = open_resource(file, config.compression)
resource_io = open_resource(file)
if self.config.format == 'csv':
self._readers.append(
CSVReader(
Expand Down
23 changes: 23 additions & 0 deletions tests/integration/test_archives.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
import os, yaml
from pathlib import Path

from koza.model.config.source_config import PrimaryFileConfig
from koza.io.yaml_loader import UniqueIncludeLoader

def test_archive_targz():
    """Loading a source config with a file_archive extracts the archived data file."""
    config_path = Path('tests/resources/string.yaml')
    extracted = Path('tests/resources/source-files/string.tsv.gz')

    # Start from a clean slate: remove any previously extracted copy
    if os.path.exists(extracted.absolute()):
        os.remove(extracted.absolute())

    # Build the config object from the test yaml
    with open(config_path.absolute(), 'r') as config_file:
        config = PrimaryFileConfig(**yaml.load(config_file, Loader=UniqueIncludeLoader))

    # Extraction runs in __post_init_post_parse__ (normally invoked by
    # pydantic after validation) - call it explicitly here
    config.__post_init_post_parse__()

    assert os.path.exists(extracted)
Binary file modified tests/resources/source-files/ddpheno.json.gz
Binary file not shown.
Binary file added tests/resources/source-files/string.tar.gz
Binary file not shown.
45 changes: 45 additions & 0 deletions tests/resources/string.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
name: 'zfin_phenotype_0'

format: 'csv'

delimiter: '\t'

#header_delimiter: '\t'

file_archive: 'tests/resources/source-files/string.tar.gz'

files:
- 'string.tsv'

global_table: 'tests/resources/translation_table.yaml'

columns:
- 'protein1'
- 'protein2'
- 'neighborhood'
- 'fusion'
- 'cooccurence'
- 'coexpression'
- 'experimental'
- 'database'
- 'textmining'
- 'combined_score' : 'int'

filters:
- column: 'combined_score'
inclusion: 'include'
filter_code: 'lt'
value: 700

#transform_code: './examples/string/protein-links-detailed.py'

#transform_mode: 'loop'

edge_properties:
- 'id'
- 'subject'
- 'predicate'
- 'object'
- 'category'
- 'relation'
- 'provided_by'
2 changes: 0 additions & 2 deletions tests/unit/resources/primary-source.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,6 @@ on_map_failure: 'warning'
delimiter: '\t'
header_delimiter: '\t'

compression: 'gzip'

files:
- '9606.protein.links.detailed.v11.0.txt.gz'
- '10090.protein.links.detailed.v11.0.txt.gz'
Expand Down

0 comments on commit dbebf8d

Please sign in to comment.