diff --git a/metadata-ingestion/src/datahub/ingestion/source/data_lake_common/path_spec.py b/metadata-ingestion/src/datahub/ingestion/source/data_lake_common/path_spec.py
index d1c949f48e2cd5..a35fb94614f722 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/data_lake_common/path_spec.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/data_lake_common/path_spec.py
@@ -18,7 +18,14 @@
 logger: logging.Logger = logging.getLogger(__name__)
 
 SUPPORTED_FILE_TYPES: List[str] = ["csv", "tsv", "json", "parquet", "avro"]
-SUPPORTED_COMPRESSIONS: List[str] = ["gz", "bz2"]
+
+# These come from the smart_open library.
+SUPPORTED_COMPRESSIONS: List[str] = [
+    "gz",
+    "bz2",
+    # We have a monkeypatch on smart_open that aliases .gzip to .gz.
+    "gzip",
+]
 
 
 class PathSpec(ConfigModel):
diff --git a/metadata-ingestion/src/datahub/ingestion/source/s3/source.py b/metadata-ingestion/src/datahub/ingestion/source/s3/source.py
index ac4433b7eb1f0c..eb49fcbb268c0a 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/s3/source.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/s3/source.py
@@ -10,6 +10,7 @@
 from pathlib import PurePath
 from typing import Any, Dict, Iterable, List, Optional, Tuple
 
+import smart_open.compression as so_compression
 from more_itertools import peekable
 from pyspark.conf import SparkConf
 from pyspark.sql import SparkSession
@@ -120,6 +121,9 @@
 }
 
 PAGE_SIZE = 1000
 
+# Hack to support the .gzip extension with smart_open.
+so_compression.register_compressor(".gzip", so_compression._COMPRESSOR_REGISTRY[".gz"])
+
 
 def get_column_type(
     report: SourceReport, dataset_name: str, column_type: str
@@ -407,7 +411,9 @@ def get_fields(self, table_data: TableData, path_spec: PathSpec) -> List:
                 table_data.full_path, "rb", transport_params={"client": s3_client}
             )
         else:
-            file = open(table_data.full_path, "rb")
+            # We still use smart_open here to take advantage of the compression
+            # capabilities of smart_open.
+            file = smart_open(table_data.full_path, "rb")
 
         fields = []
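
For context, a minimal sketch (not part of the patch) of how the `.gzip` alias behaves once the compressor is registered. The `register_compressor` call and `_COMPRESSOR_REGISTRY` usage are taken directly from the diff above; the file name is hypothetical.

```python
import smart_open
import smart_open.compression as so_compression

# Reuse smart_open's built-in ".gz" handler for the ".gzip" extension,
# mirroring the monkeypatch in source.py above.
so_compression.register_compressor(".gzip", so_compression._COMPRESSOR_REGISTRY[".gz"])

# After registration, smart_open decompresses ".gzip" files transparently,
# for local paths as well as s3:// URIs.
with smart_open.open("events.csv.gzip", "r") as f:  # hypothetical file
    print(f.readline())
```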