From 1a3fde9562c4b4dcb79a0757aa265d1076ad4447 Mon Sep 17 00:00:00 2001 From: Siddique Bagwan Date: Fri, 13 Dec 2024 18:54:07 +0530 Subject: [PATCH 1/4] remove the fetch_size --- .../datahub/ingestion/source/tableau/tableau.py | 16 +++++++++++++--- .../integration/tableau/test_tableau_ingest.py | 1 + 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py index 6844b8a425a7b6..872272ff8bfe72 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py +++ b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py @@ -380,7 +380,8 @@ class TableauConfig( fetch_size: int = Field( default=250, - description="Specifies the number of records to retrieve in each batch during a query execution.", + description="[deprecated] Use page_size instead. Specifies the number of records to retrieve in each batch during a query execution.", + exclude=True, ) # We've found that even with a small workbook page size (e.g. 10), the Tableau API often @@ -497,6 +498,11 @@ class TableauConfig( "This can only be used with ingest_tags enabled as it will overwrite tags entered from the UI.", ) + _fetch_size = pydantic_field_deprecated( + "fetch_size", + message="fetch_size is deprecated, use page_size instead", + ) + # pre = True because we want to take some decision before pydantic initialize the configuration to default values @root_validator(pre=True) def projects_backward_compatibility(cls, values: Dict) -> Dict: @@ -1108,7 +1114,7 @@ def get_connection_object_page( connection_type: str, query_filter: str, current_cursor: Optional[str], - fetch_size: int = 250, + fetch_size: int, retry_on_auth_error: bool = True, retries_remaining: Optional[int] = None, ) -> Tuple[dict, Optional[str], int]: @@ -1306,7 +1312,11 @@ def get_connection_objects( connection_type=connection_type, query_filter=filter_, current_cursor=current_cursor, - fetch_size=self.config.fetch_size, + # `filter_page` contains metadata object IDs (e.g., Project IDs, Field IDs, Sheet IDs, etc.). + # The number of IDs is always less than or equal to page_size. + # If the IDs are primary keys, the number of metadata objects to load matches the number of records to return. + # In our case, mostly, the IDs are primary key, therefore, fetch_size is set equal to page_size. + fetch_size=page_size, ) yield from connection_objects.get(c.NODES) or [] diff --git a/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py b/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py index 5b557efdab0bb0..2afe72397e50a0 100644 --- a/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py +++ b/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py @@ -1353,6 +1353,7 @@ def test_permission_mode_switched_error(pytestconfig, tmp_path, mock_datahub_gra query_filter=mock.MagicMock(), current_cursor=None, retries_remaining=1, + fetch_size=10, ) warnings = list(reporter.warnings) From 7e525bcdba4cc2d948c90bd648b17dee533680b5 Mon Sep 17 00:00:00 2001 From: Siddique Bagwan Date: Fri, 13 Dec 2024 19:02:37 +0530 Subject: [PATCH 2/4] hide from doc --- .../src/datahub/ingestion/source/tableau/tableau.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py index 872272ff8bfe72..e5ab1137cbd1db 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py +++ b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py @@ -381,7 +381,7 @@ class TableauConfig( fetch_size: int = Field( default=250, description="[deprecated] Use page_size instead. Specifies the number of records to retrieve in each batch during a query execution.", - exclude=True, + hidden_from_docs=True, ) # We've found that even with a small workbook page size (e.g. 10), the Tableau API often From c409a83ebe3a4f5c251eaae1fa6952b27081b7a2 Mon Sep 17 00:00:00 2001 From: Siddique Bagwan Date: Thu, 19 Dec 2024 15:20:49 +0530 Subject: [PATCH 3/4] address review comment --- .../src/datahub/ingestion/source/tableau/tableau.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py index 5cd12bd8aa9b78..f00629436a8338 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py +++ b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py @@ -49,6 +49,7 @@ DatasetSourceConfigMixin, ) from datahub.configuration.validate_field_deprecation import pydantic_field_deprecated +from datahub.configuration.validate_field_removal import pydantic_removed_field from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.emitter.mcp_builder import ( ContainerKey, @@ -382,7 +383,7 @@ class TableauConfig( fetch_size: int = Field( default=250, - description="[deprecated] Use page_size instead. Specifies the number of records to retrieve in each batch during a query execution.", + description="[removed] The 'fetch_size' field is no longer used. Please use 'page_size' instead.", hidden_from_docs=True, ) @@ -500,9 +501,8 @@ class TableauConfig( "This can only be used with ingest_tags enabled as it will overwrite tags entered from the UI.", ) - _fetch_size = pydantic_field_deprecated( + _fetch_size = pydantic_removed_field( "fetch_size", - message="fetch_size is deprecated, use page_size instead", ) # pre = True because we want to take some decision before pydantic initialize the configuration to default values From 9821635ba67502f3cc18f6cd7fd02611f70b2c62 Mon Sep 17 00:00:00 2001 From: Siddique Bagwan Date: Fri, 20 Dec 2024 15:16:54 +0530 Subject: [PATCH 4/4] address review comment --- .../src/datahub/ingestion/source/tableau/tableau.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py index 26e8612459e5ab..93b9ecacbcc48d 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py +++ b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py @@ -381,12 +381,6 @@ class TableauConfig( description="[advanced] Number of metadata objects (e.g. CustomSQLTable, PublishedDatasource, etc) to query at a time using the Tableau API.", ) - fetch_size: int = Field( - default=250, - description="[removed] The 'fetch_size' field is no longer used. Please use 'page_size' instead.", - hidden_from_docs=True, - ) - # We've found that even with a small workbook page size (e.g. 10), the Tableau API often # returns warnings like this: # {