From b74ba11d937ef4b902f320e1bdb39a7ece35ffc4 Mon Sep 17 00:00:00 2001 From: Mayuri Nehate <33225191+mayurinehate@users.noreply.github.com> Date: Mon, 14 Oct 2024 18:21:05 +0530 Subject: [PATCH] fix(ingest/delta-lake): skip file count if require_files is false (#11611) --- .../src/datahub/ingestion/source/delta_lake/source.py | 5 ++--- metadata-ingestion/tests/unit/test_mlflow_source.py | 6 ++---- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/delta_lake/source.py b/metadata-ingestion/src/datahub/ingestion/source/delta_lake/source.py index 6a52d8fdd89057..98133ca69011e7 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/delta_lake/source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/delta_lake/source.py @@ -223,15 +223,14 @@ def ingest_table( ) customProperties = { - "number_of_files": str(get_file_count(delta_table)), "partition_columns": str(delta_table.metadata().partition_columns), "table_creation_time": str(delta_table.metadata().created_time), "id": str(delta_table.metadata().id), "version": str(delta_table.version()), "location": self.source_config.complete_path, } - if not self.source_config.require_files: - del customProperties["number_of_files"] # always 0 + if self.source_config.require_files: + customProperties["number_of_files"] = str(get_file_count(delta_table)) dataset_properties = DatasetPropertiesClass( description=delta_table.metadata().description, diff --git a/metadata-ingestion/tests/unit/test_mlflow_source.py b/metadata-ingestion/tests/unit/test_mlflow_source.py index ae5a42bad229d2..d213dd92352e62 100644 --- a/metadata-ingestion/tests/unit/test_mlflow_source.py +++ b/metadata-ingestion/tests/unit/test_mlflow_source.py @@ -1,6 +1,6 @@ import datetime from pathlib import Path -from typing import Any, TypeVar, Union +from typing import Any, Union import pytest from mlflow import MlflowClient @@ -11,8 +11,6 @@ from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.source.mlflow import MLflowConfig, MLflowSource -T = TypeVar("T") - @pytest.fixture def tracking_uri(tmp_path: Path) -> str: @@ -46,7 +44,7 @@ def model_version( ) -def dummy_search_func(page_token: Union[None, str], **kwargs: Any) -> PagedList[T]: +def dummy_search_func(page_token: Union[None, str], **kwargs: Any) -> PagedList[str]: dummy_pages = dict( page_1=PagedList(items=["a", "b"], token="page_2"), page_2=PagedList(items=["c", "d"], token="page_3"),