From d06980f6f3421ac5d3a3fc21d5c15f3e3057338f Mon Sep 17 00:00:00 2001 From: Mayuri Nehate <33225191+mayurinehate@users.noreply.github.com> Date: Mon, 23 Dec 2024 19:11:40 +0530 Subject: [PATCH 01/28] fix(ingest/snowflake): handle empty snowflake column upstreams (#12207) --- .../source/snowflake/snowflake_lineage_v2.py | 6 ++--- .../unit/snowflake/test_snowflake_source.py | 24 +++++++++++++++++++ 2 files changed, 27 insertions(+), 3 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py index 69f28a0e6e595a..b815a6584379ac 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py @@ -4,7 +4,7 @@ from datetime import datetime from typing import Any, Collection, Iterable, List, Optional, Set, Tuple, Type -from pydantic import BaseModel, validator +from pydantic import BaseModel, Field, validator from datahub.configuration.datetimes import parse_absolute_time from datahub.ingestion.api.closeable import Closeable @@ -72,8 +72,8 @@ class ColumnUpstreamJob(BaseModel): class ColumnUpstreamLineage(BaseModel): - column_name: str - upstreams: List[ColumnUpstreamJob] + column_name: Optional[str] + upstreams: List[ColumnUpstreamJob] = Field(default_factory=list) class UpstreamTableNode(BaseModel): diff --git a/metadata-ingestion/tests/unit/snowflake/test_snowflake_source.py b/metadata-ingestion/tests/unit/snowflake/test_snowflake_source.py index c735feb5396086..2ff85a08f052f9 100644 --- a/metadata-ingestion/tests/unit/snowflake/test_snowflake_source.py +++ b/metadata-ingestion/tests/unit/snowflake/test_snowflake_source.py @@ -18,6 +18,7 @@ DEFAULT_TEMP_TABLES_PATTERNS, SnowflakeV2Config, ) +from datahub.ingestion.source.snowflake.snowflake_lineage_v2 import UpstreamLineageEdge from datahub.ingestion.source.snowflake.snowflake_query import ( SnowflakeQuery, create_deny_regex_sql_filter, @@ -664,3 +665,26 @@ def test_create_snowsight_base_url_ap_northeast_1(): def test_snowflake_utils() -> None: assert_doctest(datahub.ingestion.source.snowflake.snowflake_utils) + + +def test_snowflake_query_result_parsing(): + db_row = { + "DOWNSTREAM_TABLE_NAME": "db.schema.downstream_table", + "DOWNSTREAM_TABLE_DOMAIN": "Table", + "UPSTREAM_TABLES": [ + { + "query_id": "01b92f61-0611-c826-000d-0103cf9b5db7", + "upstream_object_domain": "Table", + "upstream_object_name": "db.schema.upstream_table", + } + ], + "UPSTREAM_COLUMNS": [{}], + "QUERIES": [ + { + "query_id": "01b92f61-0611-c826-000d-0103cf9b5db7", + "query_text": "Query test", + "start_time": "2022-12-01 19:56:34", + } + ], + } + assert UpstreamLineageEdge.parse_obj(db_row) From dd23f9e294a72076e2cbe241cd6ce18f205bac68 Mon Sep 17 00:00:00 2001 From: Aseem Bansal Date: Mon, 23 Dec 2024 21:28:18 +0530 Subject: [PATCH 02/28] fix(ui): null dereference (#12193) --- .../styled/ERModelRelationship/ERModelRelationUtils.tsx | 2 +- .../shared/tabs/Dataset/Queries/utils/filterQueries.ts | 6 +++--- .../shared/tabs/Dataset/Schema/utils/filterSchemaRows.ts | 2 +- .../source/executions/ExecutionRequestDetailsModal.tsx | 4 ++-- datahub-web-react/src/app/lineage/utils/titleUtils.ts | 4 ++-- .../src/app/search/context/SearchResultContext.tsx | 2 +- .../src/app/search/matches/MatchedFieldList.tsx | 2 +- .../src/app/search/matches/SearchTextHighlighter.tsx | 2 +- 8 files changed, 12 insertions(+), 12 deletions(-) diff 
--git a/datahub-web-react/src/app/entity/shared/components/styled/ERModelRelationship/ERModelRelationUtils.tsx b/datahub-web-react/src/app/entity/shared/components/styled/ERModelRelationship/ERModelRelationUtils.tsx index 0eb198aec48033..811ebf99b123a2 100644 --- a/datahub-web-react/src/app/entity/shared/components/styled/ERModelRelationship/ERModelRelationUtils.tsx +++ b/datahub-web-react/src/app/entity/shared/components/styled/ERModelRelationship/ERModelRelationUtils.tsx @@ -68,6 +68,6 @@ export function getDatasetName(datainput: any): string { datainput?.editableProperties?.name || datainput?.properties?.name || datainput?.name || - datainput?.urn?.split(',').at(1) + datainput?.urn?.split(',')?.at(1) ); } diff --git a/datahub-web-react/src/app/entity/shared/tabs/Dataset/Queries/utils/filterQueries.ts b/datahub-web-react/src/app/entity/shared/tabs/Dataset/Queries/utils/filterQueries.ts index a8ec960ea2e081..fb97c8235cbe60 100644 --- a/datahub-web-react/src/app/entity/shared/tabs/Dataset/Queries/utils/filterQueries.ts +++ b/datahub-web-react/src/app/entity/shared/tabs/Dataset/Queries/utils/filterQueries.ts @@ -10,9 +10,9 @@ export const filterQueries = (filterText, queries: Query[]) => { const lowerFilterText = filterText.toLowerCase(); return queries.filter((query) => { return ( - query.title?.toLowerCase().includes(lowerFilterText) || - query.description?.toLowerCase().includes(lowerFilterText) || - query.query?.toLowerCase().includes(lowerFilterText) + query.title?.toLowerCase()?.includes(lowerFilterText) || + query.description?.toLowerCase()?.includes(lowerFilterText) || + query.query?.toLowerCase()?.includes(lowerFilterText) ); }); }; diff --git a/datahub-web-react/src/app/entity/shared/tabs/Dataset/Schema/utils/filterSchemaRows.ts b/datahub-web-react/src/app/entity/shared/tabs/Dataset/Schema/utils/filterSchemaRows.ts index 53b76d53f886af..9c0813fc2b85ab 100644 --- a/datahub-web-react/src/app/entity/shared/tabs/Dataset/Schema/utils/filterSchemaRows.ts +++ b/datahub-web-react/src/app/entity/shared/tabs/Dataset/Schema/utils/filterSchemaRows.ts @@ -12,7 +12,7 @@ function matchesTagsOrTermsOrDescription(field: SchemaField, filterText: string, .toLocaleLowerCase() .includes(filterText), ) || - field.description?.toLocaleLowerCase().includes(filterText) + field.description?.toLocaleLowerCase()?.includes(filterText) ); } diff --git a/datahub-web-react/src/app/ingest/source/executions/ExecutionRequestDetailsModal.tsx b/datahub-web-react/src/app/ingest/source/executions/ExecutionRequestDetailsModal.tsx index a7e6f516bb7943..f56eb06b6af14e 100644 --- a/datahub-web-react/src/app/ingest/source/executions/ExecutionRequestDetailsModal.tsx +++ b/datahub-web-react/src/app/ingest/source/executions/ExecutionRequestDetailsModal.tsx @@ -129,7 +129,7 @@ export const ExecutionDetailsModal = ({ urn, open, onClose }: Props) => { downloadFile(output, `exec-${urn}.log`); }; - const logs = (showExpandedLogs && output) || output?.split('\n').slice(0, 5).join('\n'); + const logs = (showExpandedLogs && output) || output?.split('\n')?.slice(0, 5)?.join('\n'); const result = data?.executionRequest?.result as Partial; const status = getIngestionSourceStatus(result); @@ -163,7 +163,7 @@ export const ExecutionDetailsModal = ({ urn, open, onClose }: Props) => { } catch (e) { recipeYaml = ''; } - const recipe = showExpandedRecipe ? recipeYaml : recipeYaml?.split('\n').slice(0, 5).join('\n'); + const recipe = showExpandedRecipe ? 
recipeYaml : recipeYaml?.split('\n')?.slice(0, 5)?.join('\n'); const areLogsExpandable = output?.split(/\r\n|\r|\n/)?.length > 5; const isRecipeExpandable = recipeYaml?.split(/\r\n|\r|\n/)?.length > 5; diff --git a/datahub-web-react/src/app/lineage/utils/titleUtils.ts b/datahub-web-react/src/app/lineage/utils/titleUtils.ts index 6bd4cfea0f09a7..8bd0cbda55b33e 100644 --- a/datahub-web-react/src/app/lineage/utils/titleUtils.ts +++ b/datahub-web-react/src/app/lineage/utils/titleUtils.ts @@ -124,10 +124,10 @@ function truncate(input, length) { function getLastTokenOfTitle(title?: string): string { if (!title) return ''; - const lastToken = title?.split('.').slice(-1)[0]; + const lastToken = title?.split('.')?.slice(-1)?.[0]; // if the last token does not contain any content, the string should not be tokenized on `.` - if (lastToken.replace(/\s/g, '').length === 0) { + if (lastToken?.replace(/\s/g, '')?.length === 0) { return title; } diff --git a/datahub-web-react/src/app/search/context/SearchResultContext.tsx b/datahub-web-react/src/app/search/context/SearchResultContext.tsx index 68adead0051492..961a50c1d4bfe9 100644 --- a/datahub-web-react/src/app/search/context/SearchResultContext.tsx +++ b/datahub-web-react/src/app/search/context/SearchResultContext.tsx @@ -40,7 +40,7 @@ export const useSearchResult = () => { }; export const useEntityType = () => { - return useSearchResultContext()?.searchResult.entity.type; + return useSearchResultContext()?.searchResult?.entity?.type; }; export const useMatchedFields = () => { diff --git a/datahub-web-react/src/app/search/matches/MatchedFieldList.tsx b/datahub-web-react/src/app/search/matches/MatchedFieldList.tsx index 0bfe000dea3663..9d77d446ff3b82 100644 --- a/datahub-web-react/src/app/search/matches/MatchedFieldList.tsx +++ b/datahub-web-react/src/app/search/matches/MatchedFieldList.tsx @@ -42,7 +42,7 @@ const RenderedField = ({ field: MatchedField; }) => { const entityRegistry = useEntityRegistry(); - const query = useSearchQuery()?.trim().toLowerCase(); + const query = useSearchQuery()?.trim()?.toLowerCase(); const customRenderedField = customFieldRenderer?.(field); if (customRenderedField) return {customRenderedField}; if (isHighlightableEntityField(field)) { diff --git a/datahub-web-react/src/app/search/matches/SearchTextHighlighter.tsx b/datahub-web-react/src/app/search/matches/SearchTextHighlighter.tsx index d8da1088ea89d1..7a0a0e1e41a4b9 100644 --- a/datahub-web-react/src/app/search/matches/SearchTextHighlighter.tsx +++ b/datahub-web-react/src/app/search/matches/SearchTextHighlighter.tsx @@ -23,7 +23,7 @@ const SearchTextHighlighter = ({ field, text, enableFullHighlight = false }: Pro const enableNameHighlight = appConfig.config.visualConfig.searchResult?.enableNameHighlight; const matchedFields = useMatchedFieldsByGroup(field); const hasMatchedField = !!matchedFields?.length; - const normalizedSearchQuery = useSearchQuery()?.trim().toLowerCase(); + const normalizedSearchQuery = useSearchQuery()?.trim()?.toLowerCase(); const normalizedText = text.trim().toLowerCase(); const hasSubstring = hasMatchedField && !!normalizedSearchQuery && normalizedText.includes(normalizedSearchQuery); const pattern = enableFullHighlight ? 
HIGHLIGHT_ALL_PATTERN : undefined; From dc82251afed92ed605ce6dcc7c956396c494ca29 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Mon, 23 Dec 2024 13:03:52 -0500 Subject: [PATCH 03/28] fix(ingest): quote asset urns in patch path (#12212) --- metadata-ingestion/src/datahub/specific/dataproduct.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/metadata-ingestion/src/datahub/specific/dataproduct.py b/metadata-ingestion/src/datahub/specific/dataproduct.py index 6b7e695b4d57e7..f9830a4b23df05 100644 --- a/metadata-ingestion/src/datahub/specific/dataproduct.py +++ b/metadata-ingestion/src/datahub/specific/dataproduct.py @@ -131,7 +131,7 @@ def add_asset(self, asset_urn: str) -> "DataProductPatchBuilder": self._add_patch( DataProductProperties.ASPECT_NAME, "add", - path=f"/assets/{asset_urn}", + path=f"/assets/{self.quote(asset_urn)}", value=DataProductAssociation(destinationUrn=asset_urn), ) return self @@ -140,7 +140,7 @@ def remove_asset(self, asset_urn: str) -> "DataProductPatchBuilder": self._add_patch( DataProductProperties.ASPECT_NAME, "remove", - path=f"/assets/{asset_urn}", + path=f"/assets/{self.quote(asset_urn)}", value={}, ) return self From 4c0b568887c7a3c2aa8a1e1b888ce362ce768485 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Mon, 23 Dec 2024 13:04:06 -0500 Subject: [PATCH 04/28] feat(ingest): add sql parser trace mode (#12210) --- .../datahub/sql_parsing/sqlglot_lineage.py | 20 ++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/metadata-ingestion/src/datahub/sql_parsing/sqlglot_lineage.py b/metadata-ingestion/src/datahub/sql_parsing/sqlglot_lineage.py index f387618bfaec12..bf28ab0e7b229b 100644 --- a/metadata-ingestion/src/datahub/sql_parsing/sqlglot_lineage.py +++ b/metadata-ingestion/src/datahub/sql_parsing/sqlglot_lineage.py @@ -66,6 +66,7 @@ "SQL_LINEAGE_TIMEOUT_ENABLED", True ) SQL_LINEAGE_TIMEOUT_SECONDS = 10 +SQL_PARSER_TRACE = get_boolean_env_variable("DATAHUB_SQL_PARSER_TRACE", False) # These rules are a subset of the rules in sqlglot.optimizer.optimizer.RULES. 
@@ -365,10 +366,11 @@ def _sqlglot_force_column_normalizer( return node - # logger.debug( - # "Prior to case normalization sql %s", - # statement.sql(pretty=True, dialect=dialect), - # ) + if SQL_PARSER_TRACE: + logger.debug( + "Prior to case normalization sql %s", + statement.sql(pretty=True, dialect=dialect), + ) statement = statement.transform(_sqlglot_force_column_normalizer, copy=False) # logger.debug( # "Sql after casing normalization %s", @@ -562,7 +564,7 @@ def _select_statement_cll( # noqa: C901 ) ) - # TODO: Also extract referenced columns (aka auxillary / non-SELECT lineage) + # TODO: Also extract referenced columns (aka auxiliary / non-SELECT lineage) except (sqlglot.errors.OptimizeError, ValueError, IndexError) as e: raise SqlUnderstandingError( f"sqlglot failed to compute some lineage: {e}" @@ -1022,6 +1024,14 @@ def _sqlglot_lineage_inner( logger.debug( f"Resolved {total_schemas_resolved} of {total_tables_discovered} table schemas" ) + if SQL_PARSER_TRACE: + for qualified_table, schema_info in table_name_schema_mapping.items(): + logger.debug( + "Table name %s resolved to %s with schema %s", + qualified_table, + table_name_urn_mapping[qualified_table], + schema_info, + ) column_lineage: Optional[List[_ColumnLineageInfo]] = None try: From b6ea974630d68c61eb7c5cd624ee013817de7bd6 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Mon, 23 Dec 2024 13:04:15 -0500 Subject: [PATCH 05/28] fix(ingest): preserve certs when converting emitter to graph (#12211) --- metadata-ingestion/src/datahub/ingestion/graph/client.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/graph/client.py b/metadata-ingestion/src/datahub/ingestion/graph/client.py index 4aa937639e9590..ca9a41172e5b6e 100644 --- a/metadata-ingestion/src/datahub/ingestion/graph/client.py +++ b/metadata-ingestion/src/datahub/ingestion/graph/client.py @@ -188,9 +188,12 @@ def from_emitter(cls, emitter: DatahubRestEmitter) -> "DataHubGraph": retry_max_times=emitter._retry_max_times, extra_headers=emitter._session.headers, disable_ssl_verification=emitter._session.verify is False, - # TODO: Support these headers. 
- # ca_certificate_path=emitter._ca_certificate_path, - # client_certificate_path=emitter._client_certificate_path, + ca_certificate_path=( + emitter._session.verify + if isinstance(emitter._session.verify, str) + else None + ), + client_certificate_path=emitter._session.cert, ) ) From 21ddb5538d08b64279f3526aa250ec489f5497ed Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Mon, 23 Dec 2024 16:32:49 -0500 Subject: [PATCH 06/28] fix(ingest/mode): move sql logic to view properties aspect (#12196) --- .../src/datahub/ingestion/source/mode.py | 21 ++++++--- .../integration/mode/mode_mces_golden.json | 43 ++++++++++++++++++- 2 files changed, 55 insertions(+), 9 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/mode.py b/metadata-ingestion/src/datahub/ingestion/source/mode.py index c1ab9271ce13ae..ef0b499129f97b 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/mode.py +++ b/metadata-ingestion/src/datahub/ingestion/source/mode.py @@ -98,6 +98,7 @@ TagPropertiesClass, UpstreamClass, UpstreamLineageClass, + ViewPropertiesClass, ) from datahub.metadata.urns import QueryUrn from datahub.sql_parsing.sqlglot_lineage import ( @@ -930,16 +931,13 @@ def construct_query_or_dataset( dataset_props = DatasetPropertiesClass( name=report_info.get("name") if is_mode_dataset else query_data.get("name"), - description=f"""### Source Code -``` sql -{query_data.get("raw_query")} -``` - """, + description=None, externalUrl=externalUrl, customProperties=self.get_custom_props_from_dict( query_data, [ - "id" "created_at", + "id", + "created_at", "updated_at", "last_run_id", "data_source_id", @@ -949,7 +947,6 @@ def construct_query_or_dataset( ], ), ) - yield ( MetadataChangeProposalWrapper( entityUrn=query_urn, @@ -957,6 +954,16 @@ def construct_query_or_dataset( ).as_workunit() ) + if raw_query := query_data.get("raw_query"): + yield MetadataChangeProposalWrapper( + entityUrn=query_urn, + aspect=ViewPropertiesClass( + viewLogic=raw_query, + viewLanguage=QueryLanguageClass.SQL, + materialized=False, + ), + ).as_workunit() + if is_mode_dataset: space_container_key = self.gen_space_key(space_token) yield from add_dataset_to_container( diff --git a/metadata-ingestion/tests/integration/mode/mode_mces_golden.json b/metadata-ingestion/tests/integration/mode/mode_mces_golden.json index ed00dc5734680d..84dbdbe89f7b50 100644 --- a/metadata-ingestion/tests/integration/mode/mode_mces_golden.json +++ b/metadata-ingestion/tests/integration/mode/mode_mces_golden.json @@ -176,6 +176,7 @@ "datasets": [ "urn:li:dataset:(urn:li:dataPlatform:mode,5450544,PROD)" ], + "dashboards": [], "lastModified": { "created": { "time": 1639169724316, @@ -253,6 +254,8 @@ "aspect": { "json": { "customProperties": { + "id": "19780522", + "created_at": "2024-09-02T07:38:43.755Z", "updated_at": "2024-09-02T07:40:44.046Z", "last_run_id": "3535709679", "data_source_id": "44763", @@ -260,7 +263,6 @@ }, "externalUrl": "https://app.mode.com/acryl/datasets/24f66e1701b6", "name": "Dataset 1", - "description": "### Source Code\n``` sql\n-- Returns first 100 rows from DATAHUB_COMMUNITY.POSTGRES_PUBLIC.COMPANY\n SELECT \n\t\tAGE,\n\t\tID,\n\t\tNAME,\n\t\t_FIVETRAN_DELETED,\n\t\t_FIVETRAN_SYNCED\n FROM DATAHUB_COMMUNITY.POSTGRES_PUBLIC.COMPANY LIMIT 100;\n\n-- Returns first 100 rows from ETHAN_TEST_DB.PUBLIC.ACCOUNT_PHONE_NUMBER\n SELECT \n\t\tCOMMUNICATION_ACCOUNT_ID,\n\t\tID,\n\t\tMMS_CAPABLE,\n\t\tPHONE_NUMBER,\n\t\tSMS_CAPABLE,\n\t\tSTATUS,\n\t\tSTATUS_TLM,\n\t\tTLM,\n\t\tVOICE_CAPABLE,\n\t\tWHEN_CREATED\n FROM 
ETHAN_TEST_DB.PUBLIC.ACCOUNT_PHONE_NUMBER LIMIT 100;\n \n \n```\n ", "tags": [] } }, @@ -270,6 +272,24 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mode,5450544,PROD)", + "changeType": "UPSERT", + "aspectName": "viewProperties", + "aspect": { + "json": { + "materialized": false, + "viewLogic": "-- Returns first 100 rows from DATAHUB_COMMUNITY.POSTGRES_PUBLIC.COMPANY\n SELECT \n\t\tAGE,\n\t\tID,\n\t\tNAME,\n\t\t_FIVETRAN_DELETED,\n\t\t_FIVETRAN_SYNCED\n FROM DATAHUB_COMMUNITY.POSTGRES_PUBLIC.COMPANY LIMIT 100;\n\n-- Returns first 100 rows from ETHAN_TEST_DB.PUBLIC.ACCOUNT_PHONE_NUMBER\n SELECT \n\t\tCOMMUNICATION_ACCOUNT_ID,\n\t\tID,\n\t\tMMS_CAPABLE,\n\t\tPHONE_NUMBER,\n\t\tSMS_CAPABLE,\n\t\tSTATUS,\n\t\tSTATUS_TLM,\n\t\tTLM,\n\t\tVOICE_CAPABLE,\n\t\tWHEN_CREATED\n FROM ETHAN_TEST_DB.PUBLIC.ACCOUNT_PHONE_NUMBER LIMIT 100;\n \n ", + "viewLanguage": "SQL" + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "mode-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mode,5450544,PROD)", @@ -336,13 +356,14 @@ "aspect": { "json": { "customProperties": { + "id": "10149707", + "created_at": "2021-12-10T20:55:24.361Z", "updated_at": "2021-12-10T23:12:53.273Z", "last_run_id": "1897576958", "data_source_id": "34499" }, "externalUrl": "https://app.mode.com/acryl/reports/9d2da37fa91e/details/queries/6e26a9f3d4e2", "name": "Customer and staff", - "description": "### Source Code\n``` sql\nSELECT rental.*, staff.first_name \"Staff First Name\", staff.last_name \"Staff Last Name\" FROM {{ @join_on_definition as rental }} join staff on staff.staff_id = rental.staff_id where selected_id = {{ selected_id }} \n{% form %}\nselected_id:\n type: text\n default: my_id\n{% endform %}\n```\n ", "tags": [] } }, @@ -352,6 +373,24 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mode,10149707,PROD)", + "changeType": "UPSERT", + "aspectName": "viewProperties", + "aspect": { + "json": { + "materialized": false, + "viewLogic": "SELECT rental.*, staff.first_name \"Staff First Name\", staff.last_name \"Staff Last Name\" FROM {{ @join_on_definition as rental }} join staff on staff.staff_id = rental.staff_id where selected_id = {{ selected_id }} \n{% form %}\nselected_id:\n type: text\n default: my_id\n{% endform %}", + "viewLanguage": "SQL" + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "mode-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mode,10149707,PROD)", From 047644b888b121fa3feb10a5f33bdef60b1072ce Mon Sep 17 00:00:00 2001 From: Hyejin Yoon <0327jane@gmail.com> Date: Tue, 24 Dec 2024 10:06:35 +0900 Subject: [PATCH 07/28] feat: update mlflow-related metadata models (#12174) Co-authored-by: John Joyce Co-authored-by: John Joyce --- .../src/main/resources/entity.graphql | 196 +++++++++++++++++- .../dataprocess/DataProcessInstanceOutput.pdl | 2 +- .../DataProcessInstanceProperties.pdl | 2 +- .../ml/metadata/MLModelGroupProperties.pdl | 35 ++++ .../ml/metadata/MLModelProperties.pdl | 28 ++- .../ml/metadata/MLTrainingRunProperties.pdl | 36 ++++ .../src/main/resources/entity-registry.yml | 4 + .../com.linkedin.entity.aspects.snapshot.json | 54 +++-- ...com.linkedin.entity.entities.snapshot.json | 99 +++++++-- .../com.linkedin.entity.runs.snapshot.json | 54 +++-- 
...nkedin.operations.operations.snapshot.json | 54 +++-- ...m.linkedin.platform.platform.snapshot.json | 99 +++++++-- 12 files changed, 568 insertions(+), 95 deletions(-) create mode 100644 metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLTrainingRunProperties.pdl diff --git a/datahub-graphql-core/src/main/resources/entity.graphql b/datahub-graphql-core/src/main/resources/entity.graphql index e086273068ee53..9abf4e16f12dd7 100644 --- a/datahub-graphql-core/src/main/resources/entity.graphql +++ b/datahub-graphql-core/src/main/resources/entity.graphql @@ -262,8 +262,16 @@ type Query { Fetch all Business Attributes """ listBusinessAttributes(input: ListBusinessAttributesInput!): ListBusinessAttributesResult + + """ + Fetch a Data Process Instance by primary key (urn) + """ + dataProcessInstance(urn: String!): DataProcessInstance + + } + """ An ERModelRelationship is a high-level abstraction that dictates what datasets fields are erModelRelationshiped. """ @@ -9832,15 +9840,45 @@ type MLModelGroup implements EntityWithRelationships & Entity & BrowsableEntity privileges: EntityPrivileges } +""" +Properties describing a group of related ML models +""" type MLModelGroupProperties { + """ + Display name of the model group + """ + name: String + """ + Detailed description of the model group's purpose and contents + """ description: String - createdAt: Long + """ + When this model group was created + """ + created: AuditStamp + """ + When this model group was last modified + """ + lastModified: AuditStamp + + """ + Version identifier for this model group + """ version: VersionTag + """ + Custom key-value properties for the model group + """ customProperties: [CustomPropertiesEntry!] + + """ + Deprecated creation timestamp + @deprecated Use the 'created' field instead + """ + createdAt: Long @deprecated(reason: "Use `created` instead") } """ @@ -9990,40 +10028,103 @@ description: String } type MLMetric { + """ + Name of the metric (e.g. accuracy, precision, recall) + """ name: String + """ + Description of what this metric measures + """ description: String + """ + The computed value of the metric + """ value: String + """ + Timestamp when this metric was recorded + """ createdAt: Long } type MLModelProperties { + """ + The display name of the model used in the UI + """ + name: String! + """ + Detailed description of the model's purpose and characteristics + """ description: String - date: Long + """ + When the model was last modified + """ + lastModified: AuditStamp + """ + Version identifier for this model + """ version: String + """ + The type/category of ML model (e.g. classification, regression) + """ type: String + """ + Mapping of hyperparameter configurations + """ hyperParameters: HyperParameterMap - hyperParams: [MLHyperParam] + """ + List of hyperparameter settings used to train this model + """ + hyperParams: [MLHyperParam] + """ + Performance metrics from model training + """ trainingMetrics: [MLMetric] + """ + Names of ML features used by this model + """ mlFeatures: [String!] + """ + Tags for categorizing and searching models + """ tags: [String!] + """ + Model groups this model belongs to + """ groups: [MLModelGroup] + """ + Additional custom properties specific to this model + """ customProperties: [CustomPropertiesEntry!] 
+ """ + URL to view this model in external system + """ externalUrl: String + + """ + When this model was created + """ + created: AuditStamp + + """ + Deprecated timestamp for model creation + @deprecated Use 'created' field instead + """ + date: Long @deprecated(reason: "Use `created` instead") } type MLFeatureProperties { @@ -12804,3 +12905,92 @@ type CronSchedule { """ timezone: String! } + + +""" +Properties describing a data process instance's execution metadata +""" +type DataProcessInstanceProperties { + """ + The display name of this process instance + """ + name: String! + + """ + URL to view this process instance in the external system + """ + externalUrl: String + + """ + When this process instance was created + """ + created: AuditStamp + + """ + Additional custom properties specific to this process instance + """ + customProperties: [CustomPropertiesEntry!] +} + +""" +Properties specific to an ML model training run instance +""" +type MLTrainingRunProperties { + """ + Unique identifier for this training run + """ + id: String + + """ + List of URLs to access training run outputs (e.g. model artifacts, logs) + """ + outputUrls: [String] + + """ + Hyperparameters used in this training run + """ + hyperParams: [MLHyperParam] + + """ + Performance metrics recorded during this training run + """ + trainingMetrics: [MLMetric] +} + +extend type DataProcessInstance { + + """ + Additional read only properties associated with the Data Job + """ + properties: DataProcessInstanceProperties + + """ + The specific instance of the data platform that this entity belongs to + """ + dataPlatformInstance: DataPlatformInstance + + """ + Sub Types that this entity implements + """ + subTypes: SubTypes + + """ + The parent container in which the entity resides + """ + container: Container + + """ + Standardized platform urn where the data process instance is defined + """ + platform: DataPlatform! 
+ + """ + Recursively get the lineage of containers for this entity + """ + parentContainers: ParentContainersResult + + """ + Additional properties when subtype is Training Run + """ + mlTrainingRunProperties: MLTrainingRunProperties +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/dataprocess/DataProcessInstanceOutput.pdl b/metadata-models/src/main/pegasus/com/linkedin/dataprocess/DataProcessInstanceOutput.pdl index f33c41e63efed6..fe782dbe01ca9b 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/dataprocess/DataProcessInstanceOutput.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/dataprocess/DataProcessInstanceOutput.pdl @@ -15,7 +15,7 @@ record DataProcessInstanceOutput { @Relationship = { "/*": { "name": "Produces", - "entityTypes": [ "dataset" ] + "entityTypes": [ "dataset", "mlModel" ] } } @Searchable = { diff --git a/metadata-models/src/main/pegasus/com/linkedin/dataprocess/DataProcessInstanceProperties.pdl b/metadata-models/src/main/pegasus/com/linkedin/dataprocess/DataProcessInstanceProperties.pdl index c63cb1a97c017d..5c6bfaecf1ef4d 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/dataprocess/DataProcessInstanceProperties.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/dataprocess/DataProcessInstanceProperties.pdl @@ -52,4 +52,4 @@ record DataProcessInstanceProperties includes CustomProperties, ExternalReferenc } created: AuditStamp -} +} \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLModelGroupProperties.pdl b/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLModelGroupProperties.pdl index b54e430038082d..81c5e7a240f618 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLModelGroupProperties.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLModelGroupProperties.pdl @@ -4,6 +4,7 @@ import com.linkedin.common.Urn import com.linkedin.common.Time import com.linkedin.common.VersionTag import com.linkedin.common.CustomProperties +import com.linkedin.common.TimeStamp /** * Properties associated with an ML Model Group @@ -13,6 +14,17 @@ import com.linkedin.common.CustomProperties } record MLModelGroupProperties includes CustomProperties { + /** + * Display name of the MLModelGroup + */ + @Searchable = { + "fieldType": "WORD_GRAM", + "enableAutocomplete": true, + "boostScore": 10.0, + "queryByDefault": true, + } + name: optional string + /** * Documentation of the MLModelGroup */ @@ -25,8 +37,31 @@ record MLModelGroupProperties includes CustomProperties { /** * Date when the MLModelGroup was developed */ + @deprecated createdAt: optional Time + /** + * Time and Actor who created the MLModelGroup + */ + created: optional TimeStamp + + /** + * Date when the MLModelGroup was last modified + */ + lastModified: optional TimeStamp + + /** + * List of jobs (if any) used to train the model group. Visible in Lineage. 
+ */ + @Relationship = { + "/*": { + "name": "TrainedBy", + "entityTypes": [ "dataJob" ], + "isLineage": true + } + } + trainingJobs: optional array[Urn] + /** * Version of the MLModelGroup */ diff --git a/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLModelProperties.pdl b/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLModelProperties.pdl index 621a3e1747b504..d89d07384bba1d 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLModelProperties.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLModelProperties.pdl @@ -6,6 +6,7 @@ import com.linkedin.common.Time import com.linkedin.common.VersionTag import com.linkedin.common.CustomProperties import com.linkedin.common.ExternalReference +import com.linkedin.common.TimeStamp /** * Properties associated with a ML Model @@ -15,6 +16,18 @@ import com.linkedin.common.ExternalReference } record MLModelProperties includes CustomProperties, ExternalReference { + /** + * Display name of the MLModel + */ + @Searchable = { + "fieldType": "WORD_GRAM", + "enableAutocomplete": true, + "boostScore": 10.0, + "queryByDefault": true, + } + name: optional string + + /** * Documentation of the MLModel */ @@ -27,8 +40,19 @@ record MLModelProperties includes CustomProperties, ExternalReference { /** * Date when the MLModel was developed */ + @deprecated date: optional Time + /** + * Audit stamp containing who created this and when + */ + created: optional TimeStamp + + /** + * Date when the MLModel was last modified + */ + lastModified: optional TimeStamp + /** * Version of the MLModel */ @@ -93,12 +117,12 @@ record MLModelProperties includes CustomProperties, ExternalReference { deployments: optional array[Urn] /** - * List of jobs (if any) used to train the model + * List of jobs (if any) used to train the model. Visible in Lineage. Note that ML Models can also be specified as the output of a specific Data Process Instances (runs) via the DataProcessInstanceOutputs aspect. 
*/ @Relationship = { "/*": { "name": "TrainedBy", - "entityTypes": [ "dataJob" ], + "entityTypes": [ "dataJob", "dataProcessInstance" ], "isLineage": true } } diff --git a/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLTrainingRunProperties.pdl b/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLTrainingRunProperties.pdl new file mode 100644 index 00000000000000..f8b8eeafe908b7 --- /dev/null +++ b/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLTrainingRunProperties.pdl @@ -0,0 +1,36 @@ +namespace com.linkedin.ml.metadata + +import com.linkedin.common.AuditStamp +import com.linkedin.common.CustomProperties +import com.linkedin.common.ExternalReference +import com.linkedin.common.Urn +import com.linkedin.common.JobFlowUrn +import com.linkedin.common.DataJobUrn +/** + * The inputs and outputs of this training run + */ +@Aspect = { + "name": "mlTrainingRunProperties", +} +record MLTrainingRunProperties includes CustomProperties, ExternalReference { + + /** + * Run Id of the ML Training Run + */ + id: optional string + + /** + * List of URLs for the Outputs of the ML Training Run + */ + outputUrls: optional array[string] + + /** + * Hyperparameters of the ML Training Run + */ + hyperParams: optional array[MLHyperParam] + + /** + * Metrics of the ML Training Run + */ + trainingMetrics: optional array[MLMetric] +} \ No newline at end of file diff --git a/metadata-models/src/main/resources/entity-registry.yml b/metadata-models/src/main/resources/entity-registry.yml index 1c3eb5b574e204..4fe170ced69f33 100644 --- a/metadata-models/src/main/resources/entity-registry.yml +++ b/metadata-models/src/main/resources/entity-registry.yml @@ -116,6 +116,10 @@ entities: - dataProcessInstanceRunEvent - status - testResults + - dataPlatformInstance + - subTypes + - container + - mlTrainingRunProperties - name: chart category: core keyAspect: chartKey diff --git a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.aspects.snapshot.json b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.aspects.snapshot.json index 827789130d8bbb..1c713fd33884b5 100644 --- a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.aspects.snapshot.json +++ b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.aspects.snapshot.json @@ -3826,12 +3826,23 @@ "type" : "record", "name" : "MLModelProperties", "namespace" : "com.linkedin.ml.metadata", - "doc" : "Properties associated with a ML Model", + "doc" : "Properties associated with a ML Model\r", "include" : [ "com.linkedin.common.CustomProperties", "com.linkedin.common.ExternalReference" ], "fields" : [ { + "name" : "name", + "type" : "string", + "doc" : "Display name of the MLModel\r", + "optional" : true, + "Searchable" : { + "boostScore" : 10.0, + "enableAutocomplete" : true, + "fieldType" : "WORD_GRAM", + "queryByDefault" : true + } + }, { "name" : "description", "type" : "string", - "doc" : "Documentation of the MLModel", + "doc" : "Documentation of the MLModel\r", "optional" : true, "Searchable" : { "fieldType" : "TEXT", @@ -3840,17 +3851,28 @@ }, { "name" : "date", "type" : "com.linkedin.common.Time", - "doc" : "Date when the MLModel was developed", + "doc" : "Date when the MLModel was developed\r", + "optional" : true, + "deprecated" : true + }, { + "name" : "created", + "type" : "com.linkedin.common.TimeStamp", + "doc" : "Audit stamp containing who created this and when\r", + "optional" : true + }, { + "name" : "lastModified", + "type" : "com.linkedin.common.TimeStamp", + "doc" : 
"Date when the MLModel was last modified\r", "optional" : true }, { "name" : "version", "type" : "com.linkedin.common.VersionTag", - "doc" : "Version of the MLModel", + "doc" : "Version of the MLModel\r", "optional" : true }, { "name" : "type", "type" : "string", - "doc" : "Type of Algorithm or MLModel such as whether it is a Naive Bayes classifier, Convolutional Neural Network, etc", + "doc" : "Type of Algorithm or MLModel such as whether it is a Naive Bayes classifier, Convolutional Neural Network, etc\r", "optional" : true, "Searchable" : { "fieldType" : "TEXT_PARTIAL" @@ -3866,7 +3888,7 @@ "ref" : [ "string", "int", "float", "double", "boolean" ] } }, - "doc" : "Hyper Parameters of the MLModel\n\nNOTE: these are deprecated in favor of hyperParams", + "doc" : "Hyper Parameters of the MLModel\r\n\r\nNOTE: these are deprecated in favor of hyperParams\r", "optional" : true }, { "name" : "hyperParams", @@ -3901,7 +3923,7 @@ } } }, - "doc" : "Hyperparameters of the MLModel", + "doc" : "Hyperparameters of the MLModel\r", "optional" : true }, { "name" : "trainingMetrics", @@ -3936,7 +3958,7 @@ } } }, - "doc" : "Metrics of the MLModel used in training", + "doc" : "Metrics of the MLModel used in training\r", "optional" : true }, { "name" : "onlineMetrics", @@ -3944,7 +3966,7 @@ "type" : "array", "items" : "MLMetric" }, - "doc" : "Metrics of the MLModel used in production", + "doc" : "Metrics of the MLModel used in production\r", "optional" : true }, { "name" : "mlFeatures", @@ -3952,7 +3974,7 @@ "type" : "array", "items" : "com.linkedin.common.MLFeatureUrn" }, - "doc" : "List of features used for MLModel training", + "doc" : "List of features used for MLModel training\r", "optional" : true, "Relationship" : { "/*" : { @@ -3967,7 +3989,7 @@ "type" : "array", "items" : "string" }, - "doc" : "Tags for the MLModel", + "doc" : "Tags for the MLModel\r", "default" : [ ] }, { "name" : "deployments", @@ -3975,7 +3997,7 @@ "type" : "array", "items" : "com.linkedin.common.Urn" }, - "doc" : "Deployments for the MLModel", + "doc" : "Deployments for the MLModel\r", "optional" : true, "Relationship" : { "/*" : { @@ -3989,11 +4011,11 @@ "type" : "array", "items" : "com.linkedin.common.Urn" }, - "doc" : "List of jobs (if any) used to train the model", + "doc" : "List of jobs (if any) used to train the model. Visible in Lineage. 
Note that ML Models can also be specified as the output of a specific Data Process Instances (runs) via the DataProcessInstanceOutputs aspect.\r", "optional" : true, "Relationship" : { "/*" : { - "entityTypes" : [ "dataJob" ], + "entityTypes" : [ "dataJob", "dataProcessInstance" ], "isLineage" : true, "name" : "TrainedBy" } @@ -4004,7 +4026,7 @@ "type" : "array", "items" : "com.linkedin.common.Urn" }, - "doc" : "List of jobs (if any) that use the model", + "doc" : "List of jobs (if any) that use the model\r", "optional" : true, "Relationship" : { "/*" : { @@ -4020,7 +4042,7 @@ "type" : "array", "items" : "com.linkedin.common.Urn" }, - "doc" : "Groups the model belongs to", + "doc" : "Groups the model belongs to\r", "optional" : true, "Relationship" : { "/*" : { diff --git a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entities.snapshot.json b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entities.snapshot.json index b549cef0af84b2..77d4644f3c121a 100644 --- a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entities.snapshot.json +++ b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entities.snapshot.json @@ -3984,12 +3984,23 @@ "type" : "record", "name" : "MLModelProperties", "namespace" : "com.linkedin.ml.metadata", - "doc" : "Properties associated with a ML Model", + "doc" : "Properties associated with a ML Model\r", "include" : [ "com.linkedin.common.CustomProperties", "com.linkedin.common.ExternalReference" ], "fields" : [ { + "name" : "name", + "type" : "string", + "doc" : "Display name of the MLModel\r", + "optional" : true, + "Searchable" : { + "boostScore" : 10.0, + "enableAutocomplete" : true, + "fieldType" : "WORD_GRAM", + "queryByDefault" : true + } + }, { "name" : "description", "type" : "string", - "doc" : "Documentation of the MLModel", + "doc" : "Documentation of the MLModel\r", "optional" : true, "Searchable" : { "fieldType" : "TEXT", @@ -3998,17 +4009,28 @@ }, { "name" : "date", "type" : "com.linkedin.common.Time", - "doc" : "Date when the MLModel was developed", + "doc" : "Date when the MLModel was developed\r", + "optional" : true, + "deprecated" : true + }, { + "name" : "created", + "type" : "com.linkedin.common.TimeStamp", + "doc" : "Audit stamp containing who created this and when\r", + "optional" : true + }, { + "name" : "lastModified", + "type" : "com.linkedin.common.TimeStamp", + "doc" : "Date when the MLModel was last modified\r", "optional" : true }, { "name" : "version", "type" : "com.linkedin.common.VersionTag", - "doc" : "Version of the MLModel", + "doc" : "Version of the MLModel\r", "optional" : true }, { "name" : "type", "type" : "string", - "doc" : "Type of Algorithm or MLModel such as whether it is a Naive Bayes classifier, Convolutional Neural Network, etc", + "doc" : "Type of Algorithm or MLModel such as whether it is a Naive Bayes classifier, Convolutional Neural Network, etc\r", "optional" : true, "Searchable" : { "fieldType" : "TEXT_PARTIAL" @@ -4024,7 +4046,7 @@ "ref" : [ "string", "int", "float", "double", "boolean" ] } }, - "doc" : "Hyper Parameters of the MLModel\n\nNOTE: these are deprecated in favor of hyperParams", + "doc" : "Hyper Parameters of the MLModel\r\n\r\nNOTE: these are deprecated in favor of hyperParams\r", "optional" : true }, { "name" : "hyperParams", @@ -4059,7 +4081,7 @@ } } }, - "doc" : "Hyperparameters of the MLModel", + "doc" : "Hyperparameters of the MLModel\r", "optional" : true }, { "name" : "trainingMetrics", @@ -4094,7 +4116,7 @@ } } }, - "doc" : 
"Metrics of the MLModel used in training", + "doc" : "Metrics of the MLModel used in training\r", "optional" : true }, { "name" : "onlineMetrics", @@ -4102,7 +4124,7 @@ "type" : "array", "items" : "MLMetric" }, - "doc" : "Metrics of the MLModel used in production", + "doc" : "Metrics of the MLModel used in production\r", "optional" : true }, { "name" : "mlFeatures", @@ -4110,7 +4132,7 @@ "type" : "array", "items" : "com.linkedin.common.MLFeatureUrn" }, - "doc" : "List of features used for MLModel training", + "doc" : "List of features used for MLModel training\r", "optional" : true, "Relationship" : { "/*" : { @@ -4125,7 +4147,7 @@ "type" : "array", "items" : "string" }, - "doc" : "Tags for the MLModel", + "doc" : "Tags for the MLModel\r", "default" : [ ] }, { "name" : "deployments", @@ -4133,7 +4155,7 @@ "type" : "array", "items" : "com.linkedin.common.Urn" }, - "doc" : "Deployments for the MLModel", + "doc" : "Deployments for the MLModel\r", "optional" : true, "Relationship" : { "/*" : { @@ -4147,11 +4169,11 @@ "type" : "array", "items" : "com.linkedin.common.Urn" }, - "doc" : "List of jobs (if any) used to train the model", + "doc" : "List of jobs (if any) used to train the model. Visible in Lineage. Note that ML Models can also be specified as the output of a specific Data Process Instances (runs) via the DataProcessInstanceOutputs aspect.\r", "optional" : true, "Relationship" : { "/*" : { - "entityTypes" : [ "dataJob" ], + "entityTypes" : [ "dataJob", "dataProcessInstance" ], "isLineage" : true, "name" : "TrainedBy" } @@ -4162,7 +4184,7 @@ "type" : "array", "items" : "com.linkedin.common.Urn" }, - "doc" : "List of jobs (if any) that use the model", + "doc" : "List of jobs (if any) that use the model\r", "optional" : true, "Relationship" : { "/*" : { @@ -4178,7 +4200,7 @@ "type" : "array", "items" : "com.linkedin.common.Urn" }, - "doc" : "Groups the model belongs to", + "doc" : "Groups the model belongs to\r", "optional" : true, "Relationship" : { "/*" : { @@ -4981,12 +5003,23 @@ "type" : "record", "name" : "MLModelGroupProperties", "namespace" : "com.linkedin.ml.metadata", - "doc" : "Properties associated with an ML Model Group", + "doc" : "Properties associated with an ML Model Group\r", "include" : [ "com.linkedin.common.CustomProperties" ], "fields" : [ { + "name" : "name", + "type" : "string", + "doc" : "Display name of the MLModelGroup\r", + "optional" : true, + "Searchable" : { + "boostScore" : 10.0, + "enableAutocomplete" : true, + "fieldType" : "WORD_GRAM", + "queryByDefault" : true + } + }, { "name" : "description", "type" : "string", - "doc" : "Documentation of the MLModelGroup", + "doc" : "Documentation of the MLModelGroup\r", "optional" : true, "Searchable" : { "fieldType" : "TEXT", @@ -4995,12 +5028,38 @@ }, { "name" : "createdAt", "type" : "com.linkedin.common.Time", - "doc" : "Date when the MLModelGroup was developed", + "doc" : "Date when the MLModelGroup was developed\r", + "optional" : true, + "deprecated" : true + }, { + "name" : "created", + "type" : "com.linkedin.common.TimeStamp", + "doc" : "Time and Actor who created the MLModelGroup\r", + "optional" : true + }, { + "name" : "lastModified", + "type" : "com.linkedin.common.TimeStamp", + "doc" : "Date when the MLModelGroup was last modified\r", "optional" : true + }, { + "name" : "trainingJobs", + "type" : { + "type" : "array", + "items" : "com.linkedin.common.Urn" + }, + "doc" : "List of jobs (if any) used to train the model group. 
Visible in Lineage.\r", + "optional" : true, + "Relationship" : { + "/*" : { + "entityTypes" : [ "dataJob" ], + "isLineage" : true, + "name" : "TrainedBy" + } + } }, { "name" : "version", "type" : "com.linkedin.common.VersionTag", - "doc" : "Version of the MLModelGroup", + "doc" : "Version of the MLModelGroup\r", "optional" : true } ], "Aspect" : { diff --git a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.runs.snapshot.json b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.runs.snapshot.json index c8be9d063eaea9..8b6def75f7a665 100644 --- a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.runs.snapshot.json +++ b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.runs.snapshot.json @@ -3550,12 +3550,23 @@ "type" : "record", "name" : "MLModelProperties", "namespace" : "com.linkedin.ml.metadata", - "doc" : "Properties associated with a ML Model", + "doc" : "Properties associated with a ML Model\r", "include" : [ "com.linkedin.common.CustomProperties", "com.linkedin.common.ExternalReference" ], "fields" : [ { + "name" : "name", + "type" : "string", + "doc" : "Display name of the MLModel\r", + "optional" : true, + "Searchable" : { + "boostScore" : 10.0, + "enableAutocomplete" : true, + "fieldType" : "WORD_GRAM", + "queryByDefault" : true + } + }, { "name" : "description", "type" : "string", - "doc" : "Documentation of the MLModel", + "doc" : "Documentation of the MLModel\r", "optional" : true, "Searchable" : { "fieldType" : "TEXT", @@ -3564,17 +3575,28 @@ }, { "name" : "date", "type" : "com.linkedin.common.Time", - "doc" : "Date when the MLModel was developed", + "doc" : "Date when the MLModel was developed\r", + "optional" : true, + "deprecated" : true + }, { + "name" : "created", + "type" : "com.linkedin.common.TimeStamp", + "doc" : "Audit stamp containing who created this and when\r", + "optional" : true + }, { + "name" : "lastModified", + "type" : "com.linkedin.common.TimeStamp", + "doc" : "Date when the MLModel was last modified\r", "optional" : true }, { "name" : "version", "type" : "com.linkedin.common.VersionTag", - "doc" : "Version of the MLModel", + "doc" : "Version of the MLModel\r", "optional" : true }, { "name" : "type", "type" : "string", - "doc" : "Type of Algorithm or MLModel such as whether it is a Naive Bayes classifier, Convolutional Neural Network, etc", + "doc" : "Type of Algorithm or MLModel such as whether it is a Naive Bayes classifier, Convolutional Neural Network, etc\r", "optional" : true, "Searchable" : { "fieldType" : "TEXT_PARTIAL" @@ -3590,7 +3612,7 @@ "ref" : [ "string", "int", "float", "double", "boolean" ] } }, - "doc" : "Hyper Parameters of the MLModel\n\nNOTE: these are deprecated in favor of hyperParams", + "doc" : "Hyper Parameters of the MLModel\r\n\r\nNOTE: these are deprecated in favor of hyperParams\r", "optional" : true }, { "name" : "hyperParams", @@ -3625,7 +3647,7 @@ } } }, - "doc" : "Hyperparameters of the MLModel", + "doc" : "Hyperparameters of the MLModel\r", "optional" : true }, { "name" : "trainingMetrics", @@ -3660,7 +3682,7 @@ } } }, - "doc" : "Metrics of the MLModel used in training", + "doc" : "Metrics of the MLModel used in training\r", "optional" : true }, { "name" : "onlineMetrics", @@ -3668,7 +3690,7 @@ "type" : "array", "items" : "MLMetric" }, - "doc" : "Metrics of the MLModel used in production", + "doc" : "Metrics of the MLModel used in production\r", "optional" : true }, { "name" : "mlFeatures", @@ -3676,7 +3698,7 @@ "type" : "array", "items" : 
"com.linkedin.common.MLFeatureUrn" }, - "doc" : "List of features used for MLModel training", + "doc" : "List of features used for MLModel training\r", "optional" : true, "Relationship" : { "/*" : { @@ -3691,7 +3713,7 @@ "type" : "array", "items" : "string" }, - "doc" : "Tags for the MLModel", + "doc" : "Tags for the MLModel\r", "default" : [ ] }, { "name" : "deployments", @@ -3699,7 +3721,7 @@ "type" : "array", "items" : "com.linkedin.common.Urn" }, - "doc" : "Deployments for the MLModel", + "doc" : "Deployments for the MLModel\r", "optional" : true, "Relationship" : { "/*" : { @@ -3713,11 +3735,11 @@ "type" : "array", "items" : "com.linkedin.common.Urn" }, - "doc" : "List of jobs (if any) used to train the model", + "doc" : "List of jobs (if any) used to train the model. Visible in Lineage. Note that ML Models can also be specified as the output of a specific Data Process Instances (runs) via the DataProcessInstanceOutputs aspect.\r", "optional" : true, "Relationship" : { "/*" : { - "entityTypes" : [ "dataJob" ], + "entityTypes" : [ "dataJob", "dataProcessInstance" ], "isLineage" : true, "name" : "TrainedBy" } @@ -3728,7 +3750,7 @@ "type" : "array", "items" : "com.linkedin.common.Urn" }, - "doc" : "List of jobs (if any) that use the model", + "doc" : "List of jobs (if any) that use the model\r", "optional" : true, "Relationship" : { "/*" : { @@ -3744,7 +3766,7 @@ "type" : "array", "items" : "com.linkedin.common.Urn" }, - "doc" : "Groups the model belongs to", + "doc" : "Groups the model belongs to\r", "optional" : true, "Relationship" : { "/*" : { diff --git a/metadata-service/restli-api/src/main/snapshot/com.linkedin.operations.operations.snapshot.json b/metadata-service/restli-api/src/main/snapshot/com.linkedin.operations.operations.snapshot.json index 8c7595c5e505d8..e4cc5c42303ee2 100644 --- a/metadata-service/restli-api/src/main/snapshot/com.linkedin.operations.operations.snapshot.json +++ b/metadata-service/restli-api/src/main/snapshot/com.linkedin.operations.operations.snapshot.json @@ -3544,12 +3544,23 @@ "type" : "record", "name" : "MLModelProperties", "namespace" : "com.linkedin.ml.metadata", - "doc" : "Properties associated with a ML Model", + "doc" : "Properties associated with a ML Model\r", "include" : [ "com.linkedin.common.CustomProperties", "com.linkedin.common.ExternalReference" ], "fields" : [ { + "name" : "name", + "type" : "string", + "doc" : "Display name of the MLModel\r", + "optional" : true, + "Searchable" : { + "boostScore" : 10.0, + "enableAutocomplete" : true, + "fieldType" : "WORD_GRAM", + "queryByDefault" : true + } + }, { "name" : "description", "type" : "string", - "doc" : "Documentation of the MLModel", + "doc" : "Documentation of the MLModel\r", "optional" : true, "Searchable" : { "fieldType" : "TEXT", @@ -3558,17 +3569,28 @@ }, { "name" : "date", "type" : "com.linkedin.common.Time", - "doc" : "Date when the MLModel was developed", + "doc" : "Date when the MLModel was developed\r", + "optional" : true, + "deprecated" : true + }, { + "name" : "created", + "type" : "com.linkedin.common.TimeStamp", + "doc" : "Audit stamp containing who created this and when\r", + "optional" : true + }, { + "name" : "lastModified", + "type" : "com.linkedin.common.TimeStamp", + "doc" : "Date when the MLModel was last modified\r", "optional" : true }, { "name" : "version", "type" : "com.linkedin.common.VersionTag", - "doc" : "Version of the MLModel", + "doc" : "Version of the MLModel\r", "optional" : true }, { "name" : "type", "type" : "string", - "doc" : "Type of Algorithm or 
MLModel such as whether it is a Naive Bayes classifier, Convolutional Neural Network, etc", + "doc" : "Type of Algorithm or MLModel such as whether it is a Naive Bayes classifier, Convolutional Neural Network, etc\r", "optional" : true, "Searchable" : { "fieldType" : "TEXT_PARTIAL" @@ -3584,7 +3606,7 @@ "ref" : [ "string", "int", "float", "double", "boolean" ] } }, - "doc" : "Hyper Parameters of the MLModel\n\nNOTE: these are deprecated in favor of hyperParams", + "doc" : "Hyper Parameters of the MLModel\r\n\r\nNOTE: these are deprecated in favor of hyperParams\r", "optional" : true }, { "name" : "hyperParams", @@ -3619,7 +3641,7 @@ } } }, - "doc" : "Hyperparameters of the MLModel", + "doc" : "Hyperparameters of the MLModel\r", "optional" : true }, { "name" : "trainingMetrics", @@ -3654,7 +3676,7 @@ } } }, - "doc" : "Metrics of the MLModel used in training", + "doc" : "Metrics of the MLModel used in training\r", "optional" : true }, { "name" : "onlineMetrics", @@ -3662,7 +3684,7 @@ "type" : "array", "items" : "MLMetric" }, - "doc" : "Metrics of the MLModel used in production", + "doc" : "Metrics of the MLModel used in production\r", "optional" : true }, { "name" : "mlFeatures", @@ -3670,7 +3692,7 @@ "type" : "array", "items" : "com.linkedin.common.MLFeatureUrn" }, - "doc" : "List of features used for MLModel training", + "doc" : "List of features used for MLModel training\r", "optional" : true, "Relationship" : { "/*" : { @@ -3685,7 +3707,7 @@ "type" : "array", "items" : "string" }, - "doc" : "Tags for the MLModel", + "doc" : "Tags for the MLModel\r", "default" : [ ] }, { "name" : "deployments", @@ -3693,7 +3715,7 @@ "type" : "array", "items" : "com.linkedin.common.Urn" }, - "doc" : "Deployments for the MLModel", + "doc" : "Deployments for the MLModel\r", "optional" : true, "Relationship" : { "/*" : { @@ -3707,11 +3729,11 @@ "type" : "array", "items" : "com.linkedin.common.Urn" }, - "doc" : "List of jobs (if any) used to train the model", + "doc" : "List of jobs (if any) used to train the model. Visible in Lineage. 
Note that ML Models can also be specified as the output of a specific Data Process Instances (runs) via the DataProcessInstanceOutputs aspect.\r", "optional" : true, "Relationship" : { "/*" : { - "entityTypes" : [ "dataJob" ], + "entityTypes" : [ "dataJob", "dataProcessInstance" ], "isLineage" : true, "name" : "TrainedBy" } @@ -3722,7 +3744,7 @@ "type" : "array", "items" : "com.linkedin.common.Urn" }, - "doc" : "List of jobs (if any) that use the model", + "doc" : "List of jobs (if any) that use the model\r", "optional" : true, "Relationship" : { "/*" : { @@ -3738,7 +3760,7 @@ "type" : "array", "items" : "com.linkedin.common.Urn" }, - "doc" : "Groups the model belongs to", + "doc" : "Groups the model belongs to\r", "optional" : true, "Relationship" : { "/*" : { diff --git a/metadata-service/restli-api/src/main/snapshot/com.linkedin.platform.platform.snapshot.json b/metadata-service/restli-api/src/main/snapshot/com.linkedin.platform.platform.snapshot.json index 75e5c9a559076b..e375ac698ab516 100644 --- a/metadata-service/restli-api/src/main/snapshot/com.linkedin.platform.platform.snapshot.json +++ b/metadata-service/restli-api/src/main/snapshot/com.linkedin.platform.platform.snapshot.json @@ -3978,12 +3978,23 @@ "type" : "record", "name" : "MLModelProperties", "namespace" : "com.linkedin.ml.metadata", - "doc" : "Properties associated with a ML Model", + "doc" : "Properties associated with a ML Model\r", "include" : [ "com.linkedin.common.CustomProperties", "com.linkedin.common.ExternalReference" ], "fields" : [ { + "name" : "name", + "type" : "string", + "doc" : "Display name of the MLModel\r", + "optional" : true, + "Searchable" : { + "boostScore" : 10.0, + "enableAutocomplete" : true, + "fieldType" : "WORD_GRAM", + "queryByDefault" : true + } + }, { "name" : "description", "type" : "string", - "doc" : "Documentation of the MLModel", + "doc" : "Documentation of the MLModel\r", "optional" : true, "Searchable" : { "fieldType" : "TEXT", @@ -3992,17 +4003,28 @@ }, { "name" : "date", "type" : "com.linkedin.common.Time", - "doc" : "Date when the MLModel was developed", + "doc" : "Date when the MLModel was developed\r", + "optional" : true, + "deprecated" : true + }, { + "name" : "created", + "type" : "com.linkedin.common.TimeStamp", + "doc" : "Audit stamp containing who created this and when\r", + "optional" : true + }, { + "name" : "lastModified", + "type" : "com.linkedin.common.TimeStamp", + "doc" : "Date when the MLModel was last modified\r", "optional" : true }, { "name" : "version", "type" : "com.linkedin.common.VersionTag", - "doc" : "Version of the MLModel", + "doc" : "Version of the MLModel\r", "optional" : true }, { "name" : "type", "type" : "string", - "doc" : "Type of Algorithm or MLModel such as whether it is a Naive Bayes classifier, Convolutional Neural Network, etc", + "doc" : "Type of Algorithm or MLModel such as whether it is a Naive Bayes classifier, Convolutional Neural Network, etc\r", "optional" : true, "Searchable" : { "fieldType" : "TEXT_PARTIAL" @@ -4018,7 +4040,7 @@ "ref" : [ "string", "int", "float", "double", "boolean" ] } }, - "doc" : "Hyper Parameters of the MLModel\n\nNOTE: these are deprecated in favor of hyperParams", + "doc" : "Hyper Parameters of the MLModel\r\n\r\nNOTE: these are deprecated in favor of hyperParams\r", "optional" : true }, { "name" : "hyperParams", @@ -4053,7 +4075,7 @@ } } }, - "doc" : "Hyperparameters of the MLModel", + "doc" : "Hyperparameters of the MLModel\r", "optional" : true }, { "name" : "trainingMetrics", @@ -4088,7 +4110,7 @@ } } }, - 
"doc" : "Metrics of the MLModel used in training", + "doc" : "Metrics of the MLModel used in training\r", "optional" : true }, { "name" : "onlineMetrics", @@ -4096,7 +4118,7 @@ "type" : "array", "items" : "MLMetric" }, - "doc" : "Metrics of the MLModel used in production", + "doc" : "Metrics of the MLModel used in production\r", "optional" : true }, { "name" : "mlFeatures", @@ -4104,7 +4126,7 @@ "type" : "array", "items" : "com.linkedin.common.MLFeatureUrn" }, - "doc" : "List of features used for MLModel training", + "doc" : "List of features used for MLModel training\r", "optional" : true, "Relationship" : { "/*" : { @@ -4119,7 +4141,7 @@ "type" : "array", "items" : "string" }, - "doc" : "Tags for the MLModel", + "doc" : "Tags for the MLModel\r", "default" : [ ] }, { "name" : "deployments", @@ -4127,7 +4149,7 @@ "type" : "array", "items" : "com.linkedin.common.Urn" }, - "doc" : "Deployments for the MLModel", + "doc" : "Deployments for the MLModel\r", "optional" : true, "Relationship" : { "/*" : { @@ -4141,11 +4163,11 @@ "type" : "array", "items" : "com.linkedin.common.Urn" }, - "doc" : "List of jobs (if any) used to train the model", + "doc" : "List of jobs (if any) used to train the model. Visible in Lineage. Note that ML Models can also be specified as the output of a specific Data Process Instances (runs) via the DataProcessInstanceOutputs aspect.\r", "optional" : true, "Relationship" : { "/*" : { - "entityTypes" : [ "dataJob" ], + "entityTypes" : [ "dataJob", "dataProcessInstance" ], "isLineage" : true, "name" : "TrainedBy" } @@ -4156,7 +4178,7 @@ "type" : "array", "items" : "com.linkedin.common.Urn" }, - "doc" : "List of jobs (if any) that use the model", + "doc" : "List of jobs (if any) that use the model\r", "optional" : true, "Relationship" : { "/*" : { @@ -4172,7 +4194,7 @@ "type" : "array", "items" : "com.linkedin.common.Urn" }, - "doc" : "Groups the model belongs to", + "doc" : "Groups the model belongs to\r", "optional" : true, "Relationship" : { "/*" : { @@ -4975,12 +4997,23 @@ "type" : "record", "name" : "MLModelGroupProperties", "namespace" : "com.linkedin.ml.metadata", - "doc" : "Properties associated with an ML Model Group", + "doc" : "Properties associated with an ML Model Group\r", "include" : [ "com.linkedin.common.CustomProperties" ], "fields" : [ { + "name" : "name", + "type" : "string", + "doc" : "Display name of the MLModelGroup\r", + "optional" : true, + "Searchable" : { + "boostScore" : 10.0, + "enableAutocomplete" : true, + "fieldType" : "WORD_GRAM", + "queryByDefault" : true + } + }, { "name" : "description", "type" : "string", - "doc" : "Documentation of the MLModelGroup", + "doc" : "Documentation of the MLModelGroup\r", "optional" : true, "Searchable" : { "fieldType" : "TEXT", @@ -4989,12 +5022,38 @@ }, { "name" : "createdAt", "type" : "com.linkedin.common.Time", - "doc" : "Date when the MLModelGroup was developed", + "doc" : "Date when the MLModelGroup was developed\r", + "optional" : true, + "deprecated" : true + }, { + "name" : "created", + "type" : "com.linkedin.common.TimeStamp", + "doc" : "Time and Actor who created the MLModelGroup\r", + "optional" : true + }, { + "name" : "lastModified", + "type" : "com.linkedin.common.TimeStamp", + "doc" : "Date when the MLModelGroup was last modified\r", "optional" : true + }, { + "name" : "trainingJobs", + "type" : { + "type" : "array", + "items" : "com.linkedin.common.Urn" + }, + "doc" : "List of jobs (if any) used to train the model group. 
Visible in Lineage.\r", + "optional" : true, + "Relationship" : { + "/*" : { + "entityTypes" : [ "dataJob" ], + "isLineage" : true, + "name" : "TrainedBy" + } + } }, { "name" : "version", "type" : "com.linkedin.common.VersionTag", - "doc" : "Version of the MLModelGroup", + "doc" : "Version of the MLModelGroup\r", "optional" : true } ], "Aspect" : { From 09a9b6eef912d8f855a2cc6fdc03032f5ec7a652 Mon Sep 17 00:00:00 2001 From: Andrew Sikowitz Date: Mon, 23 Dec 2024 22:39:57 -0800 Subject: [PATCH 08/28] feat(ingest/looker): Do not emit usage for non-ingested dashboards and charts (#11647) --- .../ingestion/source/looker/looker_common.py | 9 + .../ingestion/source/looker/looker_source.py | 22 +- .../ingestion/source/looker/looker_usage.py | 40 +- .../looker/looker_mces_usage_history.json | 364 +++++++++++++++++- .../tests/integration/looker/test_looker.py | 87 ++++- 5 files changed, 482 insertions(+), 40 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py index a66962f962255f..1183916e9b3fef 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py @@ -1408,6 +1408,15 @@ class LookerDashboardSourceReport(StaleEntityRemovalSourceReport): dashboards_with_activity: LossySet[str] = dataclasses_field( default_factory=LossySet ) + + # Entities that don't seem to exist, so we don't emit usage aspects for them despite having usage data + dashboards_skipped_for_usage: LossySet[str] = dataclasses_field( + default_factory=LossySet + ) + charts_skipped_for_usage: LossySet[str] = dataclasses_field( + default_factory=LossySet + ) + stage_latency: List[StageLatency] = dataclasses_field(default_factory=list) _looker_explore_registry: Optional[LookerExploreRegistry] = None total_explores: int = 0 diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py index 815c5dfb1c0147..8487d5113bc1d3 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py @@ -68,6 +68,7 @@ ViewField, ViewFieldType, gen_model_key, + get_urn_looker_element_id, ) from datahub.ingestion.source.looker.looker_config import LookerDashboardSourceConfig from datahub.ingestion.source.looker.looker_lib_wrapper import LookerAPI @@ -165,6 +166,9 @@ def __init__(self, config: LookerDashboardSourceConfig, ctx: PipelineContext): # Required, as we do not ingest all folders but only those that have dashboards/looks self.processed_folders: List[str] = [] + # Keep track of ingested chart urns, to omit usage for non-ingested entities + self.chart_urns: Set[str] = set() + @staticmethod def test_connection(config_dict: dict) -> TestConnectionReport: test_report = TestConnectionReport() @@ -642,6 +646,7 @@ def _make_chart_metadata_events( chart_urn = self._make_chart_urn( element_id=dashboard_element.get_urn_element_id() ) + self.chart_urns.add(chart_urn) chart_snapshot = ChartSnapshot( urn=chart_urn, aspects=[Status(removed=False)], @@ -1380,7 +1385,9 @@ def _get_folder_and_ancestors_workunits( yield from self._emit_folder_as_container(folder) def extract_usage_stat( - self, looker_dashboards: List[looker_usage.LookerDashboardForUsage] + self, + looker_dashboards: List[looker_usage.LookerDashboardForUsage], + ingested_chart_urns: 
Set[str], ) -> List[MetadataChangeProposalWrapper]: looks: List[looker_usage.LookerChartForUsage] = [] # filter out look from all dashboard @@ -1391,6 +1398,15 @@ def extract_usage_stat( # dedup looks looks = list({str(look.id): look for look in looks}.values()) + filtered_looks = [] + for look in looks: + if not look.id: + continue + chart_urn = self._make_chart_urn(get_urn_looker_element_id(look.id)) + if chart_urn in ingested_chart_urns: + filtered_looks.append(look) + else: + self.reporter.charts_skipped_for_usage.add(look.id) # Keep stat generators to generate entity stat aspect later stat_generator_config: looker_usage.StatGeneratorConfig = ( @@ -1414,7 +1430,7 @@ def extract_usage_stat( stat_generator_config, self.reporter, self._make_chart_urn, - looks, + filtered_looks, ) mcps: List[MetadataChangeProposalWrapper] = [] @@ -1669,7 +1685,7 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: if self.source_config.extract_usage_history: self.reporter.report_stage_start("usage_extraction") usage_mcps: List[MetadataChangeProposalWrapper] = self.extract_usage_stat( - looker_dashboards_for_usage + looker_dashboards_for_usage, self.chart_urns ) for usage_mcp in usage_mcps: yield usage_mcp.as_workunit() diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_usage.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_usage.py index ef7d64e4f42d43..098d7d73a3da84 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_usage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_usage.py @@ -42,6 +42,7 @@ TimeWindowSizeClass, _Aspect as AspectAbstract, ) +from datahub.utilities.lossy_collections import LossySet logger = logging.getLogger(__name__) @@ -170,7 +171,7 @@ def __init__( self.config = config self.looker_models = looker_models # Later it will help to find out for what are the looker entities from query result - self.id_vs_model: Dict[str, ModelForUsage] = { + self.id_to_model: Dict[str, ModelForUsage] = { self.get_id(looker_object): looker_object for looker_object in looker_models } self.post_filter = len(self.looker_models) > 100 @@ -225,6 +226,10 @@ def get_id(self, looker_object: ModelForUsage) -> str: def get_id_from_row(self, row: dict) -> str: pass + @abstractmethod + def report_skip_set(self) -> LossySet[str]: + pass + def create_mcp( self, model: ModelForUsage, aspect: Aspect ) -> MetadataChangeProposalWrapper: @@ -258,20 +263,11 @@ def _process_entity_timeseries_rows( return entity_stat_aspect - def _process_absolute_aspect(self) -> List[Tuple[ModelForUsage, AspectAbstract]]: - aspects: List[Tuple[ModelForUsage, AspectAbstract]] = [] - for looker_object in self.looker_models: - aspects.append( - (looker_object, self.to_entity_absolute_stat_aspect(looker_object)) - ) - - return aspects - def _fill_user_stat_aspect( self, entity_usage_stat: Dict[Tuple[str, str], Aspect], user_wise_rows: List[Dict], - ) -> Iterable[Tuple[ModelForUsage, Aspect]]: + ) -> Iterable[Tuple[str, Aspect]]: logger.debug("Entering fill user stat aspect") # We first resolve all the users using a threadpool to warm up the cache @@ -300,7 +296,7 @@ def _fill_user_stat_aspect( for row in user_wise_rows: # Confirm looker object was given for stat generation - looker_object = self.id_vs_model.get(self.get_id_from_row(row)) + looker_object = self.id_to_model.get(self.get_id_from_row(row)) if looker_object is None: logger.warning( "Looker object with id({}) was not register with stat generator".format( @@ -338,7 +334,7 @@ def 
_fill_user_stat_aspect( logger.debug("Starting to yield answers for user-wise counts") for (id, _), aspect in entity_usage_stat.items(): - yield self.id_vs_model[id], aspect + yield id, aspect def _execute_query(self, query: LookerQuery, query_name: str) -> List[Dict]: rows = [] @@ -357,7 +353,7 @@ def _execute_query(self, query: LookerQuery, query_name: str) -> List[Dict]: ) if self.post_filter: logger.debug("post filtering") - rows = [r for r in rows if self.get_id_from_row(r) in self.id_vs_model] + rows = [r for r in rows if self.get_id_from_row(r) in self.id_to_model] logger.debug("Filtered down to %d rows", len(rows)) except Exception as e: logger.warning(f"Failed to execute {query_name} query: {e}") @@ -378,7 +374,8 @@ def generate_usage_stat_mcps(self) -> Iterable[MetadataChangeProposalWrapper]: return # yield absolute stat for looker entities - for looker_object, aspect in self._process_absolute_aspect(): # type: ignore + for looker_object in self.looker_models: + aspect = self.to_entity_absolute_stat_aspect(looker_object) yield self.create_mcp(looker_object, aspect) # Execute query and process the raw json which contains stat information @@ -399,10 +396,13 @@ def generate_usage_stat_mcps(self) -> Iterable[MetadataChangeProposalWrapper]: ) user_wise_rows = self._execute_query(user_wise_query_with_filters, "user_query") # yield absolute stat for entity - for looker_object, aspect in self._fill_user_stat_aspect( + for object_id, aspect in self._fill_user_stat_aspect( entity_usage_stat, user_wise_rows ): - yield self.create_mcp(looker_object, aspect) + if object_id in self.id_to_model: + yield self.create_mcp(self.id_to_model[object_id], aspect) + else: + self.report_skip_set().add(object_id) class DashboardStatGenerator(BaseStatGenerator): @@ -425,6 +425,9 @@ def __init__( def get_stats_generator_name(self) -> str: return "DashboardStats" + def report_skip_set(self) -> LossySet[str]: + return self.report.dashboards_skipped_for_usage + def get_filter(self) -> Dict[ViewField, str]: return { HistoryViewField.HISTORY_DASHBOARD_ID: ",".join( @@ -541,6 +544,9 @@ def __init__( def get_stats_generator_name(self) -> str: return "ChartStats" + def report_skip_set(self) -> LossySet[str]: + return self.report.charts_skipped_for_usage + def get_filter(self) -> Dict[ViewField, str]: return { LookViewField.LOOK_ID: ",".join( diff --git a/metadata-ingestion/tests/integration/looker/looker_mces_usage_history.json b/metadata-ingestion/tests/integration/looker/looker_mces_usage_history.json index 594983c8fb0f2a..ed0c5401c9029f 100644 --- a/metadata-ingestion/tests/integration/looker/looker_mces_usage_history.json +++ b/metadata-ingestion/tests/integration/looker/looker_mces_usage_history.json @@ -1,4 +1,66 @@ [ +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.ChartSnapshot": { + "urn": "urn:li:chart:(looker,dashboard_elements.3)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.chart.ChartInfo": { + "customProperties": { + "upstream_fields": "" + }, + "title": "", + "description": "", + "lastModified": { + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + }, + "chartUrl": "https://looker.company.com/x/", + "inputs": [ + { + "string": "urn:li:dataset:(urn:li:dataPlatform:looker,look_data.explore.look_view,PROD)" + } + ] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": 
"looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "chart", + "entityUrn": "urn:li:chart:(looker,dashboard_elements.3)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Look" + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DashboardSnapshot": { @@ -9,7 +71,9 @@ "customProperties": {}, "title": "foo", "description": "lorem ipsum", - "charts": [], + "charts": [ + "urn:li:chart:(looker,dashboard_elements.3)" + ], "datasets": [], "dashboards": [], "lastModified": { @@ -89,6 +153,22 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "chart", + "entityUrn": "urn:li:chart:(looker,dashboard_elements.3)", + "changeType": "UPSERT", + "aspectName": "inputFields", + "aspect": { + "json": { + "fields": [] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dashboard", "entityUrn": "urn:li:dashboard:(looker,dashboards.1)", @@ -215,6 +295,98 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "container", + "entityUrn": "urn:li:container:a2a7aa63752695f9a1705faed9d03ffb", + "changeType": "UPSERT", + "aspectName": "containerProperties", + "aspect": { + "json": { + "customProperties": { + "platform": "looker", + "env": "PROD", + "model_name": "look_data" + }, + "name": "look_data", + "env": "PROD" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:a2a7aa63752695f9a1705faed9d03ffb", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:a2a7aa63752695f9a1705faed9d03ffb", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:looker" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:a2a7aa63752695f9a1705faed9d03ffb", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "LookML Model" + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:a2a7aa63752695f9a1705faed9d03ffb", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "Explore" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { @@ -389,6 +561,180 @@ "lastRunId": "no-run-id-provided" } }, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,look_data.explore.look_view,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.BrowsePaths": { + "paths": [ + "/Explore/look_data" + ] + } + 
}, + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "project": "lkml_samples", + "model": "look_data", + "looker.explore.label": "My Explore View", + "looker.explore.name": "look_view", + "looker.explore.file": "test_source_file.lkml" + }, + "externalUrl": "https://looker.company.com/explore/look_data/look_view", + "name": "My Explore View", + "description": "lorem ipsum", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.dataset.UpstreamLineage": { + "upstreams": [ + { + "auditStamp": { + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.underlying_view,PROD)", + "type": "VIEW" + } + ] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "look_view", + "platform": "urn:li:dataPlatform:looker", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.OtherSchema": { + "rawSchema": "" + } + }, + "fields": [ + { + "fieldPath": "dim1", + "nullable": false, + "description": "dimension one description", + "label": "Dimensions One Label", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "globalTags": { + "tags": [ + { + "tag": "urn:li:tag:Dimension" + } + ] + }, + "isPartOfKey": false + } + ], + "primaryKeys": [] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,look_data.explore.look_view,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Explore" + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,look_data.explore.look_view,PROD)", + "changeType": "UPSERT", + "aspectName": "embed", + "aspect": { + "json": { + "renderUrl": "https://looker.company.com/embed/explore/look_data/look_view" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,look_data.explore.look_view,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:a2a7aa63752695f9a1705faed9d03ffb" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,look_data.explore.look_view,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "Explore" + }, + { + "id": "urn:li:container:a2a7aa63752695f9a1705faed9d03ffb", + "urn": "urn:li:container:a2a7aa63752695f9a1705faed9d03ffb" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, { "proposedSnapshot": { 
"com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { @@ -747,22 +1093,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "chart", - "entityUrn": "urn:li:chart:(looker,dashboard_elements.3)", - "changeType": "UPSERT", - "aspectName": "status", - "aspect": { - "json": { - "removed": false - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "looker-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "tag", "entityUrn": "urn:li:tag:Dimension", diff --git a/metadata-ingestion/tests/integration/looker/test_looker.py b/metadata-ingestion/tests/integration/looker/test_looker.py index a39de8384efb23..c96bcc729a95da 100644 --- a/metadata-ingestion/tests/integration/looker/test_looker.py +++ b/metadata-ingestion/tests/integration/looker/test_looker.py @@ -31,7 +31,10 @@ from datahub.ingestion.api.source import SourceReport from datahub.ingestion.run.pipeline import Pipeline, PipelineInitError from datahub.ingestion.source.looker import looker_common, looker_usage -from datahub.ingestion.source.looker.looker_common import LookerExplore +from datahub.ingestion.source.looker.looker_common import ( + LookerDashboardSourceReport, + LookerExplore, +) from datahub.ingestion.source.looker.looker_config import LookerCommonConfig from datahub.ingestion.source.looker.looker_lib_wrapper import ( LookerAPI, @@ -414,7 +417,9 @@ def setup_mock_dashboard_multiple_charts(mocked_client): ) -def setup_mock_dashboard_with_usage(mocked_client): +def setup_mock_dashboard_with_usage( + mocked_client: mock.MagicMock, skip_look: bool = False +) -> None: mocked_client.all_dashboards.return_value = [Dashboard(id="1")] mocked_client.dashboard.return_value = Dashboard( id="1", @@ -437,7 +442,13 @@ def setup_mock_dashboard_with_usage(mocked_client): ), ), DashboardElement( - id="3", type="", look=LookWithQuery(id="3", view_count=30) + id="3", + type="" if skip_look else "vis", # Looks only ingested if type == `vis` + look=LookWithQuery( + id="3", + view_count=30, + query=Query(model="look_data", view="look_view"), + ), ), ], ) @@ -611,6 +622,12 @@ def side_effect_query_inline( HistoryViewField.HISTORY_DASHBOARD_USER: 1, HistoryViewField.HISTORY_DASHBOARD_RUN_COUNT: 5, }, + { + HistoryViewField.HISTORY_DASHBOARD_ID: "5", + HistoryViewField.HISTORY_CREATED_DATE: "2022-07-07", + HistoryViewField.HISTORY_DASHBOARD_USER: 1, + HistoryViewField.HISTORY_DASHBOARD_RUN_COUNT: 5, + }, ] ), looker_usage.QueryId.DASHBOARD_PER_USER_PER_DAY_USAGE_STAT: json.dumps( @@ -790,6 +807,70 @@ def test_looker_ingest_usage_history(pytestconfig, tmp_path, mock_time): ) +@freeze_time(FROZEN_TIME) +def test_looker_filter_usage_history(pytestconfig, tmp_path, mock_time): + mocked_client = mock.MagicMock() + with mock.patch("looker_sdk.init40") as mock_sdk: + mock_sdk.return_value = mocked_client + setup_mock_dashboard_with_usage(mocked_client, skip_look=True) + mocked_client.run_inline_query.side_effect = side_effect_query_inline + setup_mock_explore(mocked_client) + setup_mock_user(mocked_client) + + temp_output_file = f"{tmp_path}/looker_mces.json" + pipeline = Pipeline.create( + { + "run_id": "looker-test", + "source": { + "type": "looker", + "config": { + "base_url": "https://looker.company.com", + "client_id": "foo", + "client_secret": "bar", + "extract_usage_history": True, + "max_threads": 1, + }, + }, + "sink": { + "type": "file", + "config": { + "filename": temp_output_file, + }, + }, + } + ) + pipeline.run() + pipeline.pretty_print_summary() + pipeline.raise_from_status() + + # There should 
be 4 dashboardUsageStatistics aspects (one absolute and 3 timeseries) + dashboard_usage_aspect_count = 0 + # There should be 0 chartUsageStatistics -- filtered by set of ingested charts + chart_usage_aspect_count = 0 + with open(temp_output_file) as f: + temp_output_dict = json.load(f) + for element in temp_output_dict: + if ( + element.get("entityType") == "dashboard" + and element.get("aspectName") == "dashboardUsageStatistics" + ): + dashboard_usage_aspect_count = dashboard_usage_aspect_count + 1 + if ( + element.get("entityType") == "chart" + and element.get("aspectName") == "chartUsageStatistics" + ): + chart_usage_aspect_count = chart_usage_aspect_count + 1 + + assert dashboard_usage_aspect_count == 4 + assert chart_usage_aspect_count == 0 + + source_report = cast(LookerDashboardSourceReport, pipeline.source.get_report()) + # From timeseries query + assert str(source_report.dashboards_skipped_for_usage) == str(["5"]) + # From dashboard element + assert str(source_report.charts_skipped_for_usage) == str(["3"]) + + @freeze_time(FROZEN_TIME) def test_looker_ingest_stateful(pytestconfig, tmp_path, mock_time, mock_datahub_graph): output_file_name: str = "looker_mces.json" From 87e7b58ac699005ca5757e6ef47fb853d89a6583 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergio=20G=C3=B3mez=20Villamor?= Date: Tue, 24 Dec 2024 10:46:19 +0100 Subject: [PATCH 09/28] fix(tableau): retry on InternalServerError 504 (#12213) --- .../ingestion/source/tableau/tableau.py | 23 ++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py index 984cf9357199d6..2b7aac2bea1d05 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py +++ b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py @@ -35,7 +35,10 @@ SiteItem, TableauAuth, ) -from tableauserverclient.server.endpoint.exceptions import NonXMLResponseError +from tableauserverclient.server.endpoint.exceptions import ( + InternalServerError, + NonXMLResponseError, +) from urllib3 import Retry import datahub.emitter.mce_builder as builder @@ -1196,6 +1199,24 @@ def get_connection_object_page( retry_on_auth_error=False, retries_remaining=retries_remaining - 1, ) + + except InternalServerError as ise: + # In some cases Tableau Server returns 504 error, which is a timeout error, so it worths to retry. + if ise.code == 504: + if retries_remaining <= 0: + raise ise + return self.get_connection_object_page( + query=query, + connection_type=connection_type, + query_filter=query_filter, + fetch_size=fetch_size, + current_cursor=current_cursor, + retry_on_auth_error=False, + retries_remaining=retries_remaining - 1, + ) + else: + raise ise + except OSError: # In tableauseverclient 0.26 (which was yanked and released in 0.28 on 2023-10-04), # the request logic was changed to use threads. 
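
The change above retries get_connection_object_page only when tableauserverclient raises InternalServerError with HTTP code 504, decrementing retries_remaining on each attempt and re-raising anything else. A minimal standalone sketch of that bounded-retry shape, using hypothetical stand-ins (fetch_page, GatewayTimeoutError) rather than the real Tableau client classes:

# Sketch only: GatewayTimeoutError and fetch_page are assumptions, not tableauserverclient APIs.
class GatewayTimeoutError(Exception):
    """Stand-in for a server-side error carrying an HTTP status code."""

    def __init__(self, code: int) -> None:
        super().__init__(f"server error {code}")
        self.code = code


def fetch_with_retry(fetch_page, retries_remaining: int = 3):
    """Call fetch_page(), retrying only while the server reports a 504 timeout."""
    try:
        return fetch_page()
    except GatewayTimeoutError as e:
        if e.code != 504 or retries_remaining <= 0:
            raise
        return fetch_with_retry(fetch_page, retries_remaining - 1)
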
From 4d990b06bd0df4f51443893e2efb39e09d9818b6 Mon Sep 17 00:00:00 2001 From: Mayuri Nehate <33225191+mayurinehate@users.noreply.github.com> Date: Tue, 24 Dec 2024 18:14:51 +0530 Subject: [PATCH 10/28] fix(ingest/snowflake): always ingest view and external table ddl lineage (#12191) --- docs/how/updating-datahub.md | 2 +- .../source/snowflake/snowflake_config.py | 28 ++----------------- .../source/snowflake/snowflake_lineage_v2.py | 13 ++------- .../source/snowflake/snowflake_query.py | 9 ------ .../source/snowflake/snowflake_schema_gen.py | 6 +--- .../source/snowflake/snowflake_shares.py | 2 +- .../source/snowflake/snowflake_v2.py | 20 +++++++++---- .../source_report/ingestion_stage.py | 1 + .../tests/integration/snowflake/common.py | 2 -- .../integration/snowflake/test_snowflake.py | 2 -- .../test_snowflake_classification.py | 1 - .../snowflake/test_snowflake_failures.py | 2 -- .../snowflake/test_snowflake_tag.py | 2 -- .../performance/snowflake/test_snowflake.py | 1 - .../unit/snowflake/test_snowflake_source.py | 23 +++++++-------- 15 files changed, 36 insertions(+), 78 deletions(-) diff --git a/docs/how/updating-datahub.md b/docs/how/updating-datahub.md index 5bc0e66fa2ff1d..a742ebe0cd8968 100644 --- a/docs/how/updating-datahub.md +++ b/docs/how/updating-datahub.md @@ -17,7 +17,7 @@ This file documents any backwards-incompatible changes in DataHub and assists people when migrating to a new version. ## Next - +- #12191 - Configs `include_view_lineage` and `include_view_column_lineage` are removed from snowflake ingestion source. View and External Table DDL lineage will always be ingested when definitions are available. - #11560 - The PowerBI ingestion source configuration option include_workspace_name_in_dataset_urn determines whether the workspace name is included in the PowerBI dataset's URN.
PowerBI allows to have identical name of semantic model and their tables across the workspace, It will overwrite the semantic model in-case of multi-workspace ingestion.
Entity urn with `include_workspace_name_in_dataset_urn: false` diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py index 1d1cc3c2af4f08..2b2dcf860cdb07 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py @@ -163,26 +163,13 @@ class SnowflakeConfig( default=True, description="If enabled, populates the snowflake table-to-table and s3-to-snowflake table lineage. Requires appropriate grants given to the role and Snowflake Enterprise Edition or above.", ) - include_view_lineage: bool = pydantic.Field( - default=True, - description="If enabled, populates the snowflake view->table and table->view lineages. Requires appropriate grants given to the role, and include_table_lineage to be True. view->table lineage requires Snowflake Enterprise Edition or above.", - ) + + _include_view_lineage = pydantic_removed_field("include_view_lineage") + _include_view_column_lineage = pydantic_removed_field("include_view_column_lineage") ignore_start_time_lineage: bool = False upstream_lineage_in_report: bool = False - @pydantic.root_validator(skip_on_failure=True) - def validate_include_view_lineage(cls, values): - if ( - "include_table_lineage" in values - and not values.get("include_table_lineage") - and values.get("include_view_lineage") - ): - raise ValueError( - "include_table_lineage must be True for include_view_lineage to be set." - ) - return values - class SnowflakeV2Config( SnowflakeConfig, @@ -222,11 +209,6 @@ class SnowflakeV2Config( description="Populates table->table and view->table column lineage. Requires appropriate grants given to the role and the Snowflake Enterprise Edition or above.", ) - include_view_column_lineage: bool = Field( - default=True, - description="Populates view->view and table->view column lineage using DataHub's sql parser.", - ) - use_queries_v2: bool = Field( default=False, description="If enabled, uses the new queries extractor to extract queries from snowflake.", @@ -355,10 +337,6 @@ def get_sql_alchemy_url( self, database=database, username=username, password=password, role=role ) - @property - def parse_view_ddl(self) -> bool: - return self.include_view_column_lineage - @validator("shares") def validate_shares( cls, shares: Optional[Dict[str, SnowflakeShareConfig]], values: Dict diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py index b815a6584379ac..6b200590d7ab63 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py @@ -8,7 +8,6 @@ from datahub.configuration.datetimes import parse_absolute_time from datahub.ingestion.api.closeable import Closeable -from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.source.aws.s3_util import make_s3_urn_for_lineage from datahub.ingestion.source.snowflake.constants import ( LINEAGE_PERMISSION_ERROR, @@ -163,11 +162,11 @@ def get_time_window(self) -> Tuple[datetime, datetime]: self.config.end_time, ) - def get_workunits( + def add_time_based_lineage_to_aggregator( self, discovered_tables: List[str], discovered_views: List[str], - ) -> Iterable[MetadataWorkUnit]: + ) -> None: if not self._should_ingest_lineage(): 
return @@ -177,9 +176,7 @@ def get_workunits( # snowflake view/table -> snowflake table self.populate_table_upstreams(discovered_tables) - for mcp in self.sql_aggregator.gen_metadata(): - yield mcp.as_workunit() - + def update_state(self): if self.redundant_run_skip_handler: # Update the checkpoint state for this run. self.redundant_run_skip_handler.update_state( @@ -337,10 +334,6 @@ def _fetch_upstream_lineages_for_tables(self) -> Iterable[UpstreamLineageEdge]: start_time_millis=int(self.start_time.timestamp() * 1000), end_time_millis=int(self.end_time.timestamp() * 1000), upstreams_deny_pattern=self.config.temporary_tables_pattern, - # The self.config.include_view_lineage setting is about fetching upstreams of views. - # We always generate lineage pointing at views from tables, even if self.config.include_view_lineage is False. - # TODO: Remove this `include_view_lineage` flag, since it's effectively dead code. - include_view_lineage=True, include_column_lineage=self.config.include_column_lineage, ) try: diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_query.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_query.py index 97c398c1962d6b..a94b39476b2c22 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_query.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_query.py @@ -376,7 +376,6 @@ def view_dependencies() -> str: def table_to_table_lineage_history_v2( start_time_millis: int, end_time_millis: int, - include_view_lineage: bool = True, include_column_lineage: bool = True, upstreams_deny_pattern: List[str] = DEFAULT_TEMP_TABLES_PATTERNS, ) -> str: @@ -385,14 +384,12 @@ def table_to_table_lineage_history_v2( start_time_millis, end_time_millis, upstreams_deny_pattern, - include_view_lineage, ) else: return SnowflakeQuery.table_upstreams_only( start_time_millis, end_time_millis, upstreams_deny_pattern, - include_view_lineage, ) @staticmethod @@ -677,12 +674,9 @@ def table_upstreams_with_column_lineage( start_time_millis: int, end_time_millis: int, upstreams_deny_pattern: List[str], - include_view_lineage: bool = True, ) -> str: allowed_upstream_table_domains = ( SnowflakeQuery.ACCESS_HISTORY_TABLE_VIEW_DOMAINS_FILTER - if include_view_lineage - else SnowflakeQuery.ACCESS_HISTORY_TABLE_DOMAINS_FILTER ) upstream_sql_filter = create_deny_regex_sql_filter( @@ -847,12 +841,9 @@ def table_upstreams_only( start_time_millis: int, end_time_millis: int, upstreams_deny_pattern: List[str], - include_view_lineage: bool = True, ) -> str: allowed_upstream_table_domains = ( SnowflakeQuery.ACCESS_HISTORY_TABLE_VIEW_DOMAINS_FILTER - if include_view_lineage - else SnowflakeQuery.ACCESS_HISTORY_TABLE_DOMAINS_FILTER ) upstream_sql_filter = create_deny_regex_sql_filter( diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py index 4b72b09fafe2dd..8a1bf15b7a7bc4 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py @@ -435,11 +435,7 @@ def _process_schema( ) if self.config.include_views: - if ( - self.aggregator - and self.config.include_view_lineage - and self.config.parse_view_ddl - ): + if self.aggregator: for view in views: view_identifier = self.identifiers.get_dataset_identifier( view.name, schema_name, db_name diff --git 
a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_shares.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_shares.py index 794a6f4a59f46f..606acd53dc3324 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_shares.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_shares.py @@ -72,7 +72,7 @@ def get_shares_workunits( assert len(sibling_dbs) == 1 # SnowflakeLineageExtractor is unaware of database->schema->table hierarchy # hence this lineage code is not written in SnowflakeLineageExtractor - # also this is not governed by configs include_table_lineage and include_view_lineage + # also this is not governed by configs include_table_lineage yield self.get_upstream_lineage_with_primary_sibling( db.name, schema.name, table_name, sibling_dbs[0] ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py index 884e6c49f5b62a..954e8a29c1a1bd 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py @@ -82,6 +82,7 @@ LINEAGE_EXTRACTION, METADATA_EXTRACTION, QUERIES_EXTRACTION, + VIEW_PARSING, ) from datahub.sql_parsing.sql_parsing_aggregator import SqlParsingAggregator from datahub.utilities.registries.domain_registry import DomainRegistry @@ -103,7 +104,7 @@ @capability(SourceCapability.DESCRIPTIONS, "Enabled by default") @capability( SourceCapability.LINEAGE_COARSE, - "Enabled by default, can be disabled via configuration `include_table_lineage` and `include_view_lineage`", + "Enabled by default, can be disabled via configuration `include_table_lineage`", ) @capability( SourceCapability.LINEAGE_FINE, @@ -512,15 +513,14 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: discovered_datasets = discovered_tables + discovered_views if self.config.use_queries_v2: - self.report.set_ingestion_stage("*", "View Parsing") - assert self.aggregator is not None + self.report.set_ingestion_stage("*", VIEW_PARSING) yield from auto_workunit(self.aggregator.gen_metadata()) self.report.set_ingestion_stage("*", QUERIES_EXTRACTION) schema_resolver = self.aggregator._schema_resolver - queries_extractor: SnowflakeQueriesExtractor = SnowflakeQueriesExtractor( + queries_extractor = SnowflakeQueriesExtractor( connection=self.connection, config=SnowflakeQueriesExtractorConfig( window=self.config, @@ -546,13 +546,21 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: queries_extractor.close() else: - if self.config.include_table_lineage and self.lineage_extractor: + if self.lineage_extractor: self.report.set_ingestion_stage("*", LINEAGE_EXTRACTION) - yield from self.lineage_extractor.get_workunits( + self.lineage_extractor.add_time_based_lineage_to_aggregator( discovered_tables=discovered_tables, discovered_views=discovered_views, ) + # This would emit view and external table ddl lineage + # as well as query lineage via lineage_extractor + for mcp in self.aggregator.gen_metadata(): + yield mcp.as_workunit() + + if self.lineage_extractor: + self.lineage_extractor.update_state() + if ( self.config.include_usage_stats or self.config.include_operational_stats ) and self.usage_extractor: diff --git a/metadata-ingestion/src/datahub/ingestion/source_report/ingestion_stage.py b/metadata-ingestion/src/datahub/ingestion/source_report/ingestion_stage.py index 92407eaae6e901..42b3b648bd298d 100644 --- 
a/metadata-ingestion/src/datahub/ingestion/source_report/ingestion_stage.py +++ b/metadata-ingestion/src/datahub/ingestion/source_report/ingestion_stage.py @@ -15,6 +15,7 @@ USAGE_EXTRACTION_OPERATIONAL_STATS = "Usage Extraction Operational Stats" USAGE_EXTRACTION_USAGE_AGGREGATION = "Usage Extraction Usage Aggregation" EXTERNAL_TABLE_DDL_LINEAGE = "External table DDL Lineage" +VIEW_PARSING = "View Parsing" QUERIES_EXTRACTION = "Queries Extraction" PROFILING = "Profiling" diff --git a/metadata-ingestion/tests/integration/snowflake/common.py b/metadata-ingestion/tests/integration/snowflake/common.py index 862d27186703a8..7b4f5abe1cd462 100644 --- a/metadata-ingestion/tests/integration/snowflake/common.py +++ b/metadata-ingestion/tests/integration/snowflake/common.py @@ -458,7 +458,6 @@ def default_query_results( # noqa: C901 snowflake_query.SnowflakeQuery.table_to_table_lineage_history_v2( start_time_millis=1654473600000, end_time_millis=1654586220000, - include_view_lineage=True, include_column_lineage=True, ), ): @@ -548,7 +547,6 @@ def default_query_results( # noqa: C901 snowflake_query.SnowflakeQuery.table_to_table_lineage_history_v2( start_time_millis=1654473600000, end_time_millis=1654586220000, - include_view_lineage=True, include_column_lineage=False, ), ): diff --git a/metadata-ingestion/tests/integration/snowflake/test_snowflake.py b/metadata-ingestion/tests/integration/snowflake/test_snowflake.py index 1d7470d24f7689..ef4918a20e640c 100644 --- a/metadata-ingestion/tests/integration/snowflake/test_snowflake.py +++ b/metadata-ingestion/tests/integration/snowflake/test_snowflake.py @@ -117,7 +117,6 @@ def test_snowflake_basic(pytestconfig, tmp_path, mock_time, mock_datahub_graph): schema_pattern=AllowDenyPattern(allow=["test_db.test_schema"]), include_technical_schema=True, include_table_lineage=True, - include_view_lineage=True, include_usage_stats=True, format_sql_queries=True, validate_upstreams_against_patterns=False, @@ -216,7 +215,6 @@ def test_snowflake_private_link_and_incremental_mcps( include_table_lineage=True, include_column_lineage=False, include_views=True, - include_view_lineage=True, include_usage_stats=False, format_sql_queries=True, incremental_lineage=False, diff --git a/metadata-ingestion/tests/integration/snowflake/test_snowflake_classification.py b/metadata-ingestion/tests/integration/snowflake/test_snowflake_classification.py index 75a9df4f280512..52453b30f740ab 100644 --- a/metadata-ingestion/tests/integration/snowflake/test_snowflake_classification.py +++ b/metadata-ingestion/tests/integration/snowflake/test_snowflake_classification.py @@ -66,7 +66,6 @@ def test_snowflake_classification_perf(num_workers, num_cols_per_table, num_tabl schema_pattern=AllowDenyPattern(allow=["test_db.test_schema"]), include_technical_schema=True, include_table_lineage=False, - include_view_lineage=False, include_column_lineage=False, include_usage_stats=False, include_operational_stats=False, diff --git a/metadata-ingestion/tests/integration/snowflake/test_snowflake_failures.py b/metadata-ingestion/tests/integration/snowflake/test_snowflake_failures.py index 0b838b0bb59c3a..de6e996a52642b 100644 --- a/metadata-ingestion/tests/integration/snowflake/test_snowflake_failures.py +++ b/metadata-ingestion/tests/integration/snowflake/test_snowflake_failures.py @@ -49,7 +49,6 @@ def snowflake_pipeline_config(tmp_path): include_technical_schema=True, match_fully_qualified_names=True, schema_pattern=AllowDenyPattern(allow=["test_db.test_schema"]), - include_view_lineage=False, 
include_usage_stats=False, start_time=datetime(2022, 6, 6, 0, 0, 0, 0).replace( tzinfo=timezone.utc @@ -227,7 +226,6 @@ def test_snowflake_missing_snowflake_lineage_permission_causes_pipeline_failure( snowflake_query.SnowflakeQuery.table_to_table_lineage_history_v2( start_time_millis=1654473600000, end_time_millis=1654586220000, - include_view_lineage=True, include_column_lineage=True, ) ], diff --git a/metadata-ingestion/tests/integration/snowflake/test_snowflake_tag.py b/metadata-ingestion/tests/integration/snowflake/test_snowflake_tag.py index d5e265e7838825..9bb598cb0c1c7f 100644 --- a/metadata-ingestion/tests/integration/snowflake/test_snowflake_tag.py +++ b/metadata-ingestion/tests/integration/snowflake/test_snowflake_tag.py @@ -30,7 +30,6 @@ def test_snowflake_tag_pattern(): ), include_technical_schema=True, include_table_lineage=False, - include_view_lineage=False, include_column_lineage=False, include_usage_stats=False, include_operational_stats=False, @@ -74,7 +73,6 @@ def test_snowflake_tag_pattern_deny(): ), include_technical_schema=True, include_table_lineage=False, - include_view_lineage=False, include_column_lineage=False, include_usage_stats=False, include_operational_stats=False, diff --git a/metadata-ingestion/tests/performance/snowflake/test_snowflake.py b/metadata-ingestion/tests/performance/snowflake/test_snowflake.py index 5042c78c2e7b91..984d9e42957452 100644 --- a/metadata-ingestion/tests/performance/snowflake/test_snowflake.py +++ b/metadata-ingestion/tests/performance/snowflake/test_snowflake.py @@ -37,7 +37,6 @@ def run_test(): password="TST_PWD", include_technical_schema=False, include_table_lineage=True, - include_view_lineage=True, include_usage_stats=True, include_operational_stats=True, start_time=datetime(2022, 6, 6, 0, 0, 0, 0).replace(tzinfo=timezone.utc), diff --git a/metadata-ingestion/tests/unit/snowflake/test_snowflake_source.py b/metadata-ingestion/tests/unit/snowflake/test_snowflake_source.py index 2ff85a08f052f9..75f32b535eb2e8 100644 --- a/metadata-ingestion/tests/unit/snowflake/test_snowflake_source.py +++ b/metadata-ingestion/tests/unit/snowflake/test_snowflake_source.py @@ -257,17 +257,6 @@ def test_options_contain_connect_args(): assert connect_args is not None -def test_snowflake_config_with_view_lineage_no_table_lineage_throws_error(): - config_dict = default_config_dict.copy() - config_dict["include_view_lineage"] = True - config_dict["include_table_lineage"] = False - with pytest.raises( - ValidationError, - match="include_table_lineage must be True for include_view_lineage to be set", - ): - SnowflakeV2Config.parse_obj(config_dict) - - def test_snowflake_config_with_column_lineage_no_table_lineage_throws_error(): config_dict = default_config_dict.copy() config_dict["include_column_lineage"] = True @@ -667,6 +656,18 @@ def test_snowflake_utils() -> None: assert_doctest(datahub.ingestion.source.snowflake.snowflake_utils) +def test_using_removed_fields_causes_no_error() -> None: + assert SnowflakeV2Config.parse_obj( + { + "account_id": "test", + "username": "snowflake", + "password": "snowflake", + "include_view_lineage": "true", + "include_view_column_lineage": "true", + } + ) + + def test_snowflake_query_result_parsing(): db_row = { "DOWNSTREAM_TABLE_NAME": "db.schema.downstream_table", From d88e6c997713509d8ecdb463c42d072c5c857853 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergio=20G=C3=B3mez=20Villamor?= Date: Tue, 24 Dec 2024 16:03:36 +0100 Subject: [PATCH 11/28] fix(tableau): fixes wrong argument when reauthenticating (#12216) --- 
.../ingestion/source/tableau/tableau.py | 48 +++++++++++-------- .../tableau/test_tableau_ingest.py | 10 ++-- 2 files changed, 33 insertions(+), 25 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py index 2b7aac2bea1d05..508500ffe489b9 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py +++ b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py @@ -621,6 +621,12 @@ def update_table( self.parsed_columns = parsed_columns +@dataclass +class SiteIdContentUrl: + site_id: str + site_content_url: str + + class TableauSourceReport(StaleEntityRemovalSourceReport): get_all_datasources_query_failed: bool = False num_get_datasource_query_failures: int = 0 @@ -773,7 +779,6 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: config=self.config, ctx=self.ctx, site=site, - site_id=site.id, report=self.report, server=self.server, platform=self.platform, @@ -792,8 +797,11 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: site_source = TableauSiteSource( config=self.config, ctx=self.ctx, - site=site, - site_id=self.server.site_id, + site=site + if site + else SiteIdContentUrl( + site_id=self.server.site_id, site_content_url=self.config.site + ), report=self.report, server=self.server, platform=self.platform, @@ -826,8 +834,7 @@ def __init__( self, config: TableauConfig, ctx: PipelineContext, - site: Optional[SiteItem], - site_id: Optional[str], + site: Union[SiteItem, SiteIdContentUrl], report: TableauSourceReport, server: Server, platform: str, @@ -838,13 +845,18 @@ def __init__( self.ctx: PipelineContext = ctx self.platform = platform - self.site: Optional[SiteItem] = site - if site_id is not None: - self.site_id: str = site_id + self.site: Optional[SiteItem] = None + if isinstance(site, SiteItem): + self.site = site + assert site.id is not None, "Site ID is required" + self.site_id = site.id + self.site_content_url = site.content_url + elif isinstance(site, SiteIdContentUrl): + self.site = None + self.site_id = site.site_id + self.site_content_url = site.site_content_url else: - assert self.site is not None, "site or site_id is required" - assert self.site.id is not None, "site_id is required when site is provided" - self.site_id = self.site.id + raise AssertionError("site or site id+content_url pair is required") self.database_tables: Dict[str, DatabaseTable] = {} self.tableau_stat_registry: Dict[str, UsageStat] = {} @@ -898,16 +910,14 @@ def dataset_browse_prefix(self) -> str: # datasets also have the env in the browse path return f"/{self.config.env.lower()}{self.no_env_browse_prefix}" - def _re_authenticate(self): + def _re_authenticate(self) -> None: + self.report.info( + message="Re-authenticating to Tableau", + context=f"site='{self.site_content_url}'", + ) # Sign-in again may not be enough because Tableau sometimes caches invalid sessions # so we need to recreate the Tableau Server object - self.server = self.config.make_tableau_client(self.site_id) - - @property - def site_content_url(self) -> Optional[str]: - if self.site and self.site.content_url: - return self.site.content_url - return None + self.server = self.config.make_tableau_client(self.site_content_url) def _populate_usage_stat_registry(self) -> None: if self.server is None: diff --git a/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py b/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py index 
c3a8880bf20a09..902ff243c802a8 100644 --- a/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py +++ b/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py @@ -26,6 +26,7 @@ from datahub.ingestion.run.pipeline import Pipeline, PipelineContext from datahub.ingestion.source.tableau import tableau_constant as c from datahub.ingestion.source.tableau.tableau import ( + SiteIdContentUrl, TableauConfig, TableauProject, TableauSiteSource, @@ -1008,8 +1009,7 @@ def check_lineage_metadata( config=config, ctx=context, platform="tableau", - site=SiteItem(name="Site 1", content_url="site1"), - site_id="site1", + site=SiteIdContentUrl(site_id="id1", site_content_url="site1"), report=TableauSourceReport(), server=Server("https://test-tableau-server.com"), ) @@ -1313,8 +1313,7 @@ def test_permission_warning(pytestconfig, tmp_path, mock_datahub_graph): platform="tableau", config=mock.MagicMock(), ctx=mock.MagicMock(), - site=mock.MagicMock(), - site_id=None, + site=mock.MagicMock(spec=SiteItem, id="Site1", content_url="site1"), server=mock_sdk.return_value, report=reporter, ) @@ -1371,8 +1370,7 @@ def test_extract_project_hierarchy(extract_project_hierarchy, allowed_projects): config=config, ctx=context, platform="tableau", - site=SiteItem(name="Site 1", content_url="site1"), - site_id="site1", + site=mock.MagicMock(spec=SiteItem, id="Site1", content_url="site1"), report=TableauSourceReport(), server=Server("https://test-tableau-server.com"), ) From 48736a03dd56c70a3894efcbef6e95a23d8cbfdd Mon Sep 17 00:00:00 2001 From: sagar-salvi-apptware <159135491+sagar-salvi-apptware@users.noreply.github.com> Date: Wed, 25 Dec 2024 00:57:27 +0530 Subject: [PATCH 12/28] fix(ingest/looker): Add flag for Looker metadata extraction (#12205) Co-authored-by: Mayuri Nehate <33225191+mayurinehate@users.noreply.github.com> --- .../src/datahub/sql_parsing/tool_meta_extractor.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/metadata-ingestion/src/datahub/sql_parsing/tool_meta_extractor.py b/metadata-ingestion/src/datahub/sql_parsing/tool_meta_extractor.py index 5af9d9d4f0fffc..d2682252e0fbf5 100644 --- a/metadata-ingestion/src/datahub/sql_parsing/tool_meta_extractor.py +++ b/metadata-ingestion/src/datahub/sql_parsing/tool_meta_extractor.py @@ -40,6 +40,7 @@ def _get_last_line(query: str) -> str: class ToolMetaExtractorReport(Report): num_queries_meta_extracted: Dict[str, int] = field(default_factory=int_top_k_dict) failures: List[str] = field(default_factory=list) + looker_user_mapping_missing: Optional[bool] = None class ToolMetaExtractor: @@ -108,7 +109,9 @@ def extract_looker_user_mapping_from_graph( PlatformResource.search_by_filters(query=query, graph_client=graph) ) - if len(platform_resources) > 1: + if len(platform_resources) == 0: + report.looker_user_mapping_missing = True + elif len(platform_resources) > 1: report.failures.append( "Looker user metadata extraction failed. Found more than one looker user id mappings." 
) From f4b33b59d1726dd962db4d3300f085cc60626a81 Mon Sep 17 00:00:00 2001 From: Andrew Sikowitz Date: Tue, 24 Dec 2024 11:33:06 -0800 Subject: [PATCH 13/28] fix(ingest/mode): Handle 204 response and invalid json (#12156) Co-authored-by: Aseem Bansal --- .../src/datahub/ingestion/source/mode.py | 46 +++--- .../tests/integration/mode/test_mode.py | 141 ++++++++++++++++-- 2 files changed, 151 insertions(+), 36 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/mode.py b/metadata-ingestion/src/datahub/ingestion/source/mode.py index ef0b499129f97b..68ecc5d8694ac5 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/mode.py +++ b/metadata-ingestion/src/datahub/ingestion/source/mode.py @@ -5,6 +5,7 @@ from dataclasses import dataclass from datetime import datetime, timezone from functools import lru_cache +from json import JSONDecodeError from typing import Dict, Iterable, List, Optional, Set, Tuple, Union import dateutil.parser as dp @@ -193,6 +194,9 @@ class HTTPError429(HTTPError): pass +ModeRequestError = (HTTPError, JSONDecodeError) + + @dataclass class ModeSourceReport(StaleEntityRemovalSourceReport): filtered_spaces: LossyList[str] = dataclasses.field(default_factory=LossyList) @@ -328,11 +332,11 @@ def __init__(self, ctx: PipelineContext, config: ModeConfig): # Test the connection try: self._get_request_json(f"{self.config.connect_uri}/api/verify") - except HTTPError as http_error: + except ModeRequestError as e: self.report.report_failure( title="Failed to Connect", message="Unable to verify connection to mode.", - context=f"Error: {str(http_error)}", + context=f"Error: {str(e)}", ) self.workspace_uri = f"{self.config.connect_uri}/api/{self.config.workspace}" @@ -521,11 +525,11 @@ def _get_creator(self, href: str) -> Optional[str]: if self.config.owner_username_instead_of_email else user_json.get("email") ) - except HTTPError as http_error: + except ModeRequestError as e: self.report.report_warning( title="Failed to retrieve Mode creator", message=f"Unable to retrieve user for {href}", - context=f"Reason: {str(http_error)}", + context=f"Reason: {str(e)}", ) return user @@ -571,11 +575,11 @@ def _get_space_name_and_tokens(self) -> dict: logging.debug(f"Skipping space {space_name} due to space pattern") continue space_info[s.get("token", "")] = s.get("name", "") - except HTTPError as http_error: + except ModeRequestError as e: self.report.report_failure( title="Failed to Retrieve Spaces", message="Unable to retrieve spaces / collections for workspace.", - context=f"Workspace: {self.workspace_uri}, Error: {str(http_error)}", + context=f"Workspace: {self.workspace_uri}, Error: {str(e)}", ) return space_info @@ -721,11 +725,11 @@ def _get_data_sources(self) -> List[dict]: try: ds_json = self._get_request_json(f"{self.workspace_uri}/data_sources") data_sources = ds_json.get("_embedded", {}).get("data_sources", []) - except HTTPError as http_error: + except ModeRequestError as e: self.report.report_failure( title="Failed to retrieve Data Sources", message="Unable to retrieve data sources from Mode.", - context=f"Error: {str(http_error)}", + context=f"Error: {str(e)}", ) return data_sources @@ -812,11 +816,11 @@ def _get_definition(self, definition_name): if definition.get("name", "") == definition_name: return definition.get("source", "") - except HTTPError as http_error: + except ModeRequestError as e: self.report.report_failure( title="Failed to Retrieve Definition", message="Unable to retrieve definition from Mode.", - context=f"Definition Name: 
{definition_name}, Error: {str(http_error)}", + context=f"Definition Name: {definition_name}, Error: {str(e)}", ) return None @@ -1382,11 +1386,11 @@ def _get_reports(self, space_token: str) -> List[dict]: f"{self.workspace_uri}/spaces/{space_token}/reports" ) reports = reports_json.get("_embedded", {}).get("reports", {}) - except HTTPError as http_error: + except ModeRequestError as e: self.report.report_failure( title="Failed to Retrieve Reports for Space", message="Unable to retrieve reports for space token.", - context=f"Space Token: {space_token}, Error: {str(http_error)}", + context=f"Space Token: {space_token}, Error: {str(e)}", ) return reports @@ -1400,11 +1404,11 @@ def _get_datasets(self, space_token: str) -> List[dict]: url = f"{self.workspace_uri}/spaces/{space_token}/datasets" datasets_json = self._get_request_json(url) datasets = datasets_json.get("_embedded", {}).get("reports", []) - except HTTPError as http_error: + except ModeRequestError as e: self.report.report_failure( title="Failed to Retrieve Datasets for Space", message=f"Unable to retrieve datasets for space token {space_token}.", - context=f"Error: {str(http_error)}", + context=f"Error: {str(e)}", ) return datasets @@ -1416,11 +1420,11 @@ def _get_queries(self, report_token: str) -> list: f"{self.workspace_uri}/reports/{report_token}/queries" ) queries = queries_json.get("_embedded", {}).get("queries", {}) - except HTTPError as http_error: + except ModeRequestError as e: self.report.report_failure( title="Failed to Retrieve Queries", message="Unable to retrieve queries for report token.", - context=f"Report Token: {report_token}, Error: {str(http_error)}", + context=f"Report Token: {report_token}, Error: {str(e)}", ) return queries @@ -1433,11 +1437,11 @@ def _get_last_query_run( f"{self.workspace_uri}/reports/{report_token}/runs/{report_run_id}/query_runs{query_run_id}" ) queries = queries_json.get("_embedded", {}).get("queries", {}) - except HTTPError as http_error: + except ModeRequestError as e: self.report.report_failure( title="Failed to Retrieve Queries for Report", message="Unable to retrieve queries for report token.", - context=f"Report Token:{report_token}, Error: {str(http_error)}", + context=f"Report Token:{report_token}, Error: {str(e)}", ) return {} return queries @@ -1451,13 +1455,13 @@ def _get_charts(self, report_token: str, query_token: str) -> list: f"/queries/{query_token}/charts" ) charts = charts_json.get("_embedded", {}).get("charts", {}) - except HTTPError as http_error: + except ModeRequestError as e: self.report.report_failure( title="Failed to Retrieve Charts", message="Unable to retrieve charts from Mode.", context=f"Report Token: {report_token}, " f"Query token: {query_token}, " - f"Error: {str(http_error)}", + f"Error: {str(e)}", ) return charts @@ -1477,6 +1481,8 @@ def get_request(): response = self.session.get( url, timeout=self.config.api_options.timeout ) + if response.status_code == 204: # No content, don't parse json + return {} return response.json() except HTTPError as http_error: error_response = http_error.response diff --git a/metadata-ingestion/tests/integration/mode/test_mode.py b/metadata-ingestion/tests/integration/mode/test_mode.py index ce7533d5611e49..7f1e3935aa0fa1 100644 --- a/metadata-ingestion/tests/integration/mode/test_mode.py +++ b/metadata-ingestion/tests/integration/mode/test_mode.py @@ -1,11 +1,14 @@ import json import pathlib +from typing import Sequence from unittest.mock import patch +import pytest from freezegun import freeze_time from requests.models 
import HTTPError from datahub.configuration.common import PipelineExecutionError +from datahub.ingestion.api.source import StructuredLogEntry from datahub.ingestion.run.pipeline import Pipeline from tests.test_helpers import mce_helpers @@ -28,7 +31,7 @@ "https://app.mode.com/api/acryl/reports/24f66e1701b6/queries": "dataset_queries_24f66e1701b6.json", } -RESPONSE_ERROR_LIST = ["https://app.mode.com/api/acryl/spaces/75737b70402e/reports"] +ERROR_URL = "https://app.mode.com/api/acryl/spaces/75737b70402e/reports" test_resources_dir = pathlib.Path(__file__).parent @@ -49,6 +52,14 @@ def mount(self, prefix, adaptor): return self def get(self, url, timeout=40): + if self.error_list is not None and self.url in self.error_list: + http_error_msg = "{} Client Error: {} for url: {}".format( + 400, + "Simulate error", + self.url, + ) + raise HTTPError(http_error_msg, response=self) + self.url = url self.timeout = timeout response_json_path = f"{test_resources_dir}/setup/{JSON_RESPONSE_MAP.get(url)}" @@ -57,29 +68,46 @@ def get(self, url, timeout=40): self.json_data = data return self - def raise_for_status(self): - if self.error_list is not None and self.url in self.error_list: - http_error_msg = "{} Client Error: {} for url: {}".format( - 400, - "Simulate error", - self.url, - ) - raise HTTPError(http_error_msg, response=self) + +class MockResponseJson(MockResponse): + def __init__( + self, + status_code: int = 200, + *, + json_empty_list: Sequence[str] = (), + json_error_list: Sequence[str] = (), + ): + super().__init__(None, status_code) + self.json_empty_list = json_empty_list + self.json_error_list = json_error_list + + def json(self): + if self.url in self.json_empty_list: + return json.loads("") # Shouldn't be called + if self.url in self.json_error_list: + return json.loads("{") + return super().json() + + def get(self, url, timeout=40): + response = super().get(url, timeout) + if self.url in self.json_empty_list: + response.status_code = 204 + return response -def mocked_requests_sucess(*args, **kwargs): +def mocked_requests_success(*args, **kwargs): return MockResponse(None, 200) def mocked_requests_failure(*args, **kwargs): - return MockResponse(RESPONSE_ERROR_LIST, 200) + return MockResponse([ERROR_URL], 200) @freeze_time(FROZEN_TIME) def test_mode_ingest_success(pytestconfig, tmp_path): with patch( "datahub.ingestion.source.mode.requests.Session", - side_effect=mocked_requests_sucess, + side_effect=mocked_requests_success, ): pipeline = Pipeline.create( { @@ -142,8 +170,89 @@ def test_mode_ingest_failure(pytestconfig, tmp_path): } ) pipeline.run() - try: + with pytest.raises(PipelineExecutionError) as exec_error: pipeline.raise_from_status() - except PipelineExecutionError as exec_error: - assert exec_error.args[0] == "Source reported errors" - assert len(exec_error.args[1].failures) == 1 + assert exec_error.value.args[0] == "Source reported errors" + assert len(exec_error.value.args[1].failures) == 1 + error_dict: StructuredLogEntry + _level, error_dict = exec_error.value.args[1].failures[0] + error = next(iter(error_dict.context)) + assert "Simulate error" in error + assert ERROR_URL in error + + +@freeze_time(FROZEN_TIME) +def test_mode_ingest_json_empty(pytestconfig, tmp_path): + with patch( + "datahub.ingestion.source.mode.requests.Session", + side_effect=lambda *args, **kwargs: MockResponseJson( + json_empty_list=["https://app.mode.com/api/modeuser"] + ), + ): + global test_resources_dir + test_resources_dir = pytestconfig.rootpath / "tests/integration/mode" + + pipeline = 
Pipeline.create( + { + "run_id": "mode-test", + "source": { + "type": "mode", + "config": { + "token": "xxxx", + "password": "xxxx", + "connect_uri": "https://app.mode.com/", + "workspace": "acryl", + }, + }, + "sink": { + "type": "file", + "config": { + "filename": f"{tmp_path}/mode_mces.json", + }, + }, + } + ) + pipeline.run() + pipeline.raise_from_status(raise_warnings=True) + + +@freeze_time(FROZEN_TIME) +def test_mode_ingest_json_failure(pytestconfig, tmp_path): + with patch( + "datahub.ingestion.source.mode.requests.Session", + side_effect=lambda *args, **kwargs: MockResponseJson( + json_error_list=["https://app.mode.com/api/modeuser"] + ), + ): + global test_resources_dir + test_resources_dir = pytestconfig.rootpath / "tests/integration/mode" + + pipeline = Pipeline.create( + { + "run_id": "mode-test", + "source": { + "type": "mode", + "config": { + "token": "xxxx", + "password": "xxxx", + "connect_uri": "https://app.mode.com/", + "workspace": "acryl", + }, + }, + "sink": { + "type": "file", + "config": { + "filename": f"{tmp_path}/mode_mces.json", + }, + }, + } + ) + pipeline.run() + pipeline.raise_from_status(raise_warnings=False) + with pytest.raises(PipelineExecutionError) as exec_error: + pipeline.raise_from_status(raise_warnings=True) + assert len(exec_error.value.args[1].warnings) > 0 + error_dict: StructuredLogEntry + _level, error_dict = exec_error.value.args[1].warnings[0] + error = next(iter(error_dict.context)) + assert "Expecting property name enclosed in double quotes" in error From 756b199506d57449c60a3a28901f7d22fe89f9f1 Mon Sep 17 00:00:00 2001 From: Andrew Sikowitz Date: Tue, 24 Dec 2024 14:56:35 -0800 Subject: [PATCH 14/28] fix(ingest/glue): Add additional checks and logging when specifying catalog_id (#12168) --- .../src/datahub/ingestion/source/aws/glue.py | 14 +++++- .../tests/unit/glue/glue_mces_golden.json | 2 +- .../glue/glue_mces_golden_table_lineage.json | 2 +- .../glue_mces_platform_instance_golden.json | 2 +- .../tests/unit/glue/test_glue_source.py | 43 +++++++++++++++++-- .../tests/unit/glue/test_glue_source_stubs.py | 8 ++-- 6 files changed, 59 insertions(+), 12 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py b/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py index 37c146218e2633..7a5ed154d40bc7 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py +++ b/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py @@ -52,6 +52,7 @@ platform_name, support_status, ) +from datahub.ingestion.api.report import EntityFilterReport from datahub.ingestion.api.source import MetadataWorkUnitProcessor from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.source.aws import s3_util @@ -115,7 +116,6 @@ logger = logging.getLogger(__name__) - DEFAULT_PLATFORM = "glue" VALID_PLATFORMS = [DEFAULT_PLATFORM, "athena"] @@ -220,6 +220,7 @@ def platform_validator(cls, v: str) -> str: class GlueSourceReport(StaleEntityRemovalSourceReport): tables_scanned = 0 filtered: List[str] = dataclass_field(default_factory=list) + databases: EntityFilterReport = EntityFilterReport.field(type="database") num_job_script_location_missing: int = 0 num_job_script_location_invalid: int = 0 @@ -668,6 +669,7 @@ def get_datajob_wu(self, node: Dict[str, Any], job_name: str) -> MetadataWorkUni return MetadataWorkUnit(id=f'{job_name}-{node["Id"]}', mce=mce) def get_all_databases(self) -> Iterable[Mapping[str, Any]]: + logger.debug("Getting all databases") # see 
https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/glue/paginator/GetDatabases.html paginator = self.glue_client.get_paginator("get_databases") @@ -684,10 +686,18 @@ def get_all_databases(self) -> Iterable[Mapping[str, Any]]: pattern += "[?!TargetDatabase]" for database in paginator_response.search(pattern): - if self.source_config.database_pattern.allowed(database["Name"]): + if (not self.source_config.database_pattern.allowed(database["Name"])) or ( + self.source_config.catalog_id + and database.get("CatalogId") + and database.get("CatalogId") != self.source_config.catalog_id + ): + self.report.databases.dropped(database["Name"]) + else: + self.report.databases.processed(database["Name"]) yield database def get_tables_from_database(self, database: Mapping[str, Any]) -> Iterable[Dict]: + logger.debug(f"Getting tables from database {database['Name']}") # see https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/glue/paginator/GetTables.html paginator = self.glue_client.get_paginator("get_tables") database_name = database["Name"] diff --git a/metadata-ingestion/tests/unit/glue/glue_mces_golden.json b/metadata-ingestion/tests/unit/glue/glue_mces_golden.json index 87971de12fbb39..71d7c31b222bde 100644 --- a/metadata-ingestion/tests/unit/glue/glue_mces_golden.json +++ b/metadata-ingestion/tests/unit/glue/glue_mces_golden.json @@ -124,7 +124,7 @@ "CreateTime": "June 01, 2021 at 14:55:13" }, "name": "empty-database", - "qualifiedName": "arn:aws:glue:us-west-2:123412341234:database/empty-database", + "qualifiedName": "arn:aws:glue:us-west-2:000000000000:database/empty-database", "env": "PROD" } } diff --git a/metadata-ingestion/tests/unit/glue/glue_mces_golden_table_lineage.json b/metadata-ingestion/tests/unit/glue/glue_mces_golden_table_lineage.json index e2dd4cec97c2ec..22bb4b53b91efd 100644 --- a/metadata-ingestion/tests/unit/glue/glue_mces_golden_table_lineage.json +++ b/metadata-ingestion/tests/unit/glue/glue_mces_golden_table_lineage.json @@ -124,7 +124,7 @@ "CreateTime": "June 01, 2021 at 14:55:13" }, "name": "empty-database", - "qualifiedName": "arn:aws:glue:us-west-2:123412341234:database/empty-database", + "qualifiedName": "arn:aws:glue:us-west-2:000000000000:database/empty-database", "env": "PROD" } } diff --git a/metadata-ingestion/tests/unit/glue/glue_mces_platform_instance_golden.json b/metadata-ingestion/tests/unit/glue/glue_mces_platform_instance_golden.json index 0b883062763f41..b700335c26e5aa 100644 --- a/metadata-ingestion/tests/unit/glue/glue_mces_platform_instance_golden.json +++ b/metadata-ingestion/tests/unit/glue/glue_mces_platform_instance_golden.json @@ -129,7 +129,7 @@ "CreateTime": "June 01, 2021 at 14:55:13" }, "name": "empty-database", - "qualifiedName": "arn:aws:glue:us-west-2:123412341234:database/empty-database", + "qualifiedName": "arn:aws:glue:us-west-2:000000000000:database/empty-database", "env": "PROD" } } diff --git a/metadata-ingestion/tests/unit/glue/test_glue_source.py b/metadata-ingestion/tests/unit/glue/test_glue_source.py index 693fd6bc336fd3..9e3f260a23f1c8 100644 --- a/metadata-ingestion/tests/unit/glue/test_glue_source.py +++ b/metadata-ingestion/tests/unit/glue/test_glue_source.py @@ -35,8 +35,8 @@ validate_all_providers_have_committed_successfully, ) from tests.unit.glue.test_glue_source_stubs import ( - databases_1, - databases_2, + empty_database, + flights_database, get_bucket_tagging, get_databases_delta_response, get_databases_response, @@ -64,6 +64,7 @@ tables_2, tables_profiling_1, 
target_database_tables, + test_database, ) FROZEN_TIME = "2020-04-14 07:00:00" @@ -310,6 +311,40 @@ def test_config_without_platform(): assert source.platform == "glue" +def test_get_databases_filters_by_catalog(): + def format_databases(databases): + return set(d["Name"] for d in databases) + + all_catalogs_source: GlueSource = GlueSource( + config=GlueSourceConfig(aws_region="us-west-2"), + ctx=PipelineContext(run_id="glue-source-test"), + ) + with Stubber(all_catalogs_source.glue_client) as glue_stubber: + glue_stubber.add_response("get_databases", get_databases_response, {}) + + expected = [flights_database, test_database, empty_database] + actual = all_catalogs_source.get_all_databases() + assert format_databases(actual) == format_databases(expected) + assert all_catalogs_source.report.databases.dropped_entities.as_obj() == [] + + catalog_id = "123412341234" + single_catalog_source: GlueSource = GlueSource( + config=GlueSourceConfig(catalog_id=catalog_id, aws_region="us-west-2"), + ctx=PipelineContext(run_id="glue-source-test"), + ) + with Stubber(single_catalog_source.glue_client) as glue_stubber: + glue_stubber.add_response( + "get_databases", get_databases_response, {"CatalogId": catalog_id} + ) + + expected = [flights_database, test_database] + actual = single_catalog_source.get_all_databases() + assert format_databases(actual) == format_databases(expected) + assert single_catalog_source.report.databases.dropped_entities.as_obj() == [ + "empty-database" + ] + + @freeze_time(FROZEN_TIME) def test_glue_stateful(pytestconfig, tmp_path, mock_time, mock_datahub_graph): deleted_actor_golden_mcs = "{}/glue_deleted_actor_mces_golden.json".format( @@ -357,8 +392,8 @@ def test_glue_stateful(pytestconfig, tmp_path, mock_time, mock_datahub_graph): tables_on_first_call = tables_1 tables_on_second_call = tables_2 mock_get_all_databases_and_tables.side_effect = [ - (databases_1, tables_on_first_call), - (databases_2, tables_on_second_call), + ([flights_database], tables_on_first_call), + ([test_database], tables_on_second_call), ] pipeline_run1 = run_and_get_pipeline(pipeline_config_dict) diff --git a/metadata-ingestion/tests/unit/glue/test_glue_source_stubs.py b/metadata-ingestion/tests/unit/glue/test_glue_source_stubs.py index dba1eea3010c2f..43bf62fd4e3b8a 100644 --- a/metadata-ingestion/tests/unit/glue/test_glue_source_stubs.py +++ b/metadata-ingestion/tests/unit/glue/test_glue_source_stubs.py @@ -88,12 +88,14 @@ "Permissions": ["ALL"], } ], - "CatalogId": "123412341234", + "CatalogId": "000000000000", }, ] } -databases_1 = [{"Name": "flights-database", "CatalogId": "123412341234"}] -databases_2 = [{"Name": "test-database", "CatalogId": "123412341234"}] +flights_database = {"Name": "flights-database", "CatalogId": "123412341234"} +test_database = {"Name": "test-database", "CatalogId": "123412341234"} +empty_database = {"Name": "empty-database", "CatalogId": "000000000000"} + tables_1 = [ { "Name": "avro", From 16698da509ec5c9f86188db7a16c38cea19d61bf Mon Sep 17 00:00:00 2001 From: Aseem Bansal Date: Thu, 26 Dec 2024 18:10:09 +0530 Subject: [PATCH 15/28] fix(ingest/gc): misc fixes in gc source (#12226) --- .../datahub/ingestion/source/gc/datahub_gc.py | 23 +++++-- .../source/gc/execution_request_cleanup.py | 61 +++++++++++++++---- .../source_report/ingestion_stage.py | 1 + 3 files changed, 68 insertions(+), 17 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/gc/datahub_gc.py b/metadata-ingestion/src/datahub/ingestion/source/gc/datahub_gc.py index 
4eecbb4d9d7177..168b787b85e8be 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/gc/datahub_gc.py +++ b/metadata-ingestion/src/datahub/ingestion/source/gc/datahub_gc.py @@ -34,6 +34,7 @@ SoftDeletedEntitiesCleanupConfig, SoftDeletedEntitiesReport, ) +from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport logger = logging.getLogger(__name__) @@ -86,6 +87,7 @@ class DataHubGcSourceReport( DataProcessCleanupReport, SoftDeletedEntitiesReport, DatahubExecutionRequestCleanupReport, + IngestionStageReport, ): expired_tokens_revoked: int = 0 @@ -139,31 +141,40 @@ def get_workunits_internal( ) -> Iterable[MetadataWorkUnit]: if self.config.cleanup_expired_tokens: try: + self.report.report_ingestion_stage_start("Expired Token Cleanup") self.revoke_expired_tokens() except Exception as e: self.report.failure("While trying to cleanup expired token ", exc=e) if self.config.truncate_indices: try: + self.report.report_ingestion_stage_start("Truncate Indices") self.truncate_indices() except Exception as e: self.report.failure("While trying to truncate indices ", exc=e) if self.config.soft_deleted_entities_cleanup.enabled: try: + self.report.report_ingestion_stage_start( + "Soft Deleted Entities Cleanup" + ) self.soft_deleted_entities_cleanup.cleanup_soft_deleted_entities() except Exception as e: self.report.failure( "While trying to cleanup soft deleted entities ", exc=e ) - if self.config.execution_request_cleanup.enabled: - try: - self.execution_request_cleanup.run() - except Exception as e: - self.report.failure("While trying to cleanup execution request ", exc=e) if self.config.dataprocess_cleanup.enabled: try: + self.report.report_ingestion_stage_start("Data Process Cleanup") yield from self.dataprocess_cleanup.get_workunits_internal() except Exception as e: self.report.failure("While trying to cleanup data process ", exc=e) + if self.config.execution_request_cleanup.enabled: + try: + self.report.report_ingestion_stage_start("Execution request Cleanup") + self.execution_request_cleanup.run() + except Exception as e: + self.report.failure("While trying to cleanup execution request ", exc=e) + # Otherwise last stage's duration does not get calculated. 
+ self.report.report_ingestion_stage_start("End") yield from [] def truncate_indices(self) -> None: @@ -281,6 +292,8 @@ def revoke_expired_tokens(self) -> None: list_access_tokens = expired_tokens_res.get("listAccessTokens", {}) tokens = list_access_tokens.get("tokens", []) total = list_access_tokens.get("total", 0) + if tokens == []: + break for token in tokens: self.report.expired_tokens_revoked += 1 token_id = token["id"] diff --git a/metadata-ingestion/src/datahub/ingestion/source/gc/execution_request_cleanup.py b/metadata-ingestion/src/datahub/ingestion/source/gc/execution_request_cleanup.py index 3baf858e44cdc8..170a6ada3e336f 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/gc/execution_request_cleanup.py +++ b/metadata-ingestion/src/datahub/ingestion/source/gc/execution_request_cleanup.py @@ -1,3 +1,4 @@ +import datetime import logging import time from typing import Any, Dict, Iterator, Optional @@ -42,16 +43,28 @@ class DatahubExecutionRequestCleanupConfig(ConfigModel): description="Global switch for this cleanup task", ) + runtime_limit_seconds: int = Field( + default=3600, + description="Maximum runtime in seconds for the cleanup task", + ) + + max_read_errors: int = Field( + default=10, + description="Maximum number of read errors before aborting", + ) + def keep_history_max_milliseconds(self): return self.keep_history_max_days * 24 * 3600 * 1000 class DatahubExecutionRequestCleanupReport(SourceReport): - execution_request_cleanup_records_read: int = 0 - execution_request_cleanup_records_preserved: int = 0 - execution_request_cleanup_records_deleted: int = 0 - execution_request_cleanup_read_errors: int = 0 - execution_request_cleanup_delete_errors: int = 0 + ergc_records_read: int = 0 + ergc_records_preserved: int = 0 + ergc_records_deleted: int = 0 + ergc_read_errors: int = 0 + ergc_delete_errors: int = 0 + ergc_start_time: Optional[datetime.datetime] = None + ergc_end_time: Optional[datetime.datetime] = None class CleanupRecord(BaseModel): @@ -124,6 +137,13 @@ def _scroll_execution_requests( params.update(overrides) while True: + if self._reached_runtime_limit(): + break + if self.report.ergc_read_errors >= self.config.max_read_errors: + self.report.failure( + f"ergc({self.instance_id}): too many read errors, aborting." 
+ ) + break try: url = f"{self.graph.config.server}/openapi/v2/entity/{DATAHUB_EXECUTION_REQUEST_ENTITY_NAME}" response = self.graph._session.get(url, headers=headers, params=params) @@ -141,7 +161,7 @@ def _scroll_execution_requests( logger.error( f"ergc({self.instance_id}): failed to fetch next batch of execution requests: {e}" ) - self.report.execution_request_cleanup_read_errors += 1 + self.report.ergc_read_errors += 1 def _scroll_garbage_records(self): state: Dict[str, Dict] = {} @@ -150,7 +170,7 @@ def _scroll_garbage_records(self): running_guard_timeout = now_ms - 30 * 24 * 3600 * 1000 for entry in self._scroll_execution_requests(): - self.report.execution_request_cleanup_records_read += 1 + self.report.ergc_records_read += 1 key = entry.ingestion_source # Always delete corrupted records @@ -171,7 +191,7 @@ def _scroll_garbage_records(self): # Do not delete if number of requests is below minimum if state[key]["count"] < self.config.keep_history_min_count: - self.report.execution_request_cleanup_records_preserved += 1 + self.report.ergc_records_preserved += 1 continue # Do not delete if number of requests do not exceed allowed maximum, @@ -179,7 +199,7 @@ def _scroll_garbage_records(self): if (state[key]["count"] < self.config.keep_history_max_count) and ( entry.requested_at > state[key]["cutoffTimestamp"] ): - self.report.execution_request_cleanup_records_preserved += 1 + self.report.ergc_records_preserved += 1 continue # Do not delete if status is RUNNING or PENDING and created within last month. If the record is >month old and it did not @@ -188,7 +208,7 @@ def _scroll_garbage_records(self): "RUNNING", "PENDING", ]: - self.report.execution_request_cleanup_records_preserved += 1 + self.report.ergc_records_preserved += 1 continue # Otherwise delete current record @@ -200,7 +220,7 @@ def _scroll_garbage_records(self): f"record timestamp: {entry.requested_at}." ) ) - self.report.execution_request_cleanup_records_deleted += 1 + self.report.ergc_records_deleted += 1 yield entry def _delete_entry(self, entry: CleanupRecord) -> None: @@ -210,17 +230,31 @@ def _delete_entry(self, entry: CleanupRecord) -> None: ) self.graph.delete_entity(entry.urn, True) except Exception as e: - self.report.execution_request_cleanup_delete_errors += 1 + self.report.ergc_delete_errors += 1 logger.error( f"ergc({self.instance_id}): failed to delete ExecutionRequest {entry.request_id}: {e}" ) + def _reached_runtime_limit(self) -> bool: + if ( + self.config.runtime_limit_seconds + and self.report.ergc_start_time + and ( + datetime.datetime.now() - self.report.ergc_start_time + >= datetime.timedelta(seconds=self.config.runtime_limit_seconds) + ) + ): + logger.info(f"ergc({self.instance_id}): max runtime reached.") + return True + return False + def run(self) -> None: if not self.config.enabled: logger.info( f"ergc({self.instance_id}): ExecutionRequest cleaner is disabled." ) return + self.report.ergc_start_time = datetime.datetime.now() logger.info( ( @@ -232,8 +266,11 @@ def run(self) -> None: ) for entry in self._scroll_garbage_records(): + if self._reached_runtime_limit(): + break self._delete_entry(entry) + self.report.ergc_end_time = datetime.datetime.now() logger.info( f"ergc({self.instance_id}): Finished cleanup of ExecutionRequest records." 
) diff --git a/metadata-ingestion/src/datahub/ingestion/source_report/ingestion_stage.py b/metadata-ingestion/src/datahub/ingestion/source_report/ingestion_stage.py index 42b3b648bd298d..ce683e64b3f468 100644 --- a/metadata-ingestion/src/datahub/ingestion/source_report/ingestion_stage.py +++ b/metadata-ingestion/src/datahub/ingestion/source_report/ingestion_stage.py @@ -42,4 +42,5 @@ def report_ingestion_stage_start(self, stage: str) -> None: self._timer = PerfTimer() self.ingestion_stage = f"{stage} at {datetime.now(timezone.utc)}" + logger.info(f"Stage started: {self.ingestion_stage}") self._timer.start() From fe43f076dadc754313864f7a9ca286223ebb0275 Mon Sep 17 00:00:00 2001 From: Chakru <161002324+chakru-r@users.noreply.github.com> Date: Thu, 26 Dec 2024 21:22:16 +0530 Subject: [PATCH 16/28] Parallelize smoke test (#12225) --- .github/workflows/docker-unified.yml | 50 +++++++++++++------ smoke-test/.gitignore | 4 +- smoke-test/build.gradle | 38 ++++++-------- smoke-test/conftest.py | 52 ++++++++++++++++++++ smoke-test/smoke.sh | 25 ++++++---- smoke-test/tests/cypress/integration_test.py | 49 ++++++++++++------ 6 files changed, 155 insertions(+), 63 deletions(-) diff --git a/.github/workflows/docker-unified.yml b/.github/workflows/docker-unified.yml index 03a9b3afc3bc58..47c26068347c07 100644 --- a/.github/workflows/docker-unified.yml +++ b/.github/workflows/docker-unified.yml @@ -1011,18 +1011,39 @@ jobs: needs: setup outputs: matrix: ${{ steps.set-matrix.outputs.matrix }} + cypress_batch_count: ${{ steps.set-batch-count.outputs.cypress_batch_count }} + python_batch_count: ${{ steps.set-batch-count.outputs.python_batch_count }} steps: + - id: set-batch-count + # Tests are split simply to ensure the configured number of batches for parallelization. This may need some + # increase as a new tests added increase the duration where an additional parallel batch helps. + # python_batch_count is used to split pytests in the smoke-test (batches of actual test functions) + # cypress_batch_count is used to split the collection of cypress test specs into batches. + run: | + echo "cypress_batch_count=11" >> "$GITHUB_OUTPUT" + echo "python_batch_count=5" >> "$GITHUB_OUTPUT" + - id: set-matrix + # For m batches for python and n batches for cypress, we need a test matrix of python x m + cypress x n. + # while the github action matrix generation can handle these two parts individually, there isnt a way to use the + # two generated matrices for the same job. So, produce that matrix with scripting and use the include directive + # to add it to the test matrix. 
run: | - if [ '${{ needs.setup.outputs.frontend_only }}' == 'true' ]; then - echo 'matrix=["cypress_suite1","cypress_rest"]' >> "$GITHUB_OUTPUT" - elif [ '${{ needs.setup.outputs.ingestion_only }}' == 'true' ]; then - echo 'matrix=["no_cypress_suite0","no_cypress_suite1"]' >> "$GITHUB_OUTPUT" - elif [[ '${{ needs.setup.outputs.backend_change }}' == 'true' || '${{ needs.setup.outputs.smoke_test_change }}' == 'true' ]]; then - echo 'matrix=["no_cypress_suite0","no_cypress_suite1","cypress_suite1","cypress_rest"]' >> "$GITHUB_OUTPUT" - else - echo 'matrix=[]' >> "$GITHUB_OUTPUT" + python_batch_count=${{ steps.set-batch-count.outputs.python_batch_count }} + python_matrix=$(printf "{\"test_strategy\":\"pytests\",\"batch\":\"0\",\"batch_count\":\"$python_batch_count\"}"; for ((i=1;i> "$GITHUB_OUTPUT" smoke_test: name: Run Smoke Tests @@ -1043,8 +1064,7 @@ jobs: ] strategy: fail-fast: false - matrix: - test_strategy: ${{ fromJson(needs.smoke_test_matrix.outputs.matrix) }} + matrix: ${{ fromJson(needs.smoke_test_matrix.outputs.matrix) }} if: ${{ always() && !failure() && !cancelled() && needs.smoke_test_matrix.outputs.matrix != '[]' }} steps: - name: Free up disk space @@ -1220,6 +1240,8 @@ jobs: CYPRESS_RECORD_KEY: ${{ secrets.CYPRESS_RECORD_KEY }} CLEANUP_DATA: "false" TEST_STRATEGY: ${{ matrix.test_strategy }} + BATCH_COUNT: ${{ matrix.batch_count }} + BATCH_NUMBER: ${{ matrix.batch }} run: | echo "$DATAHUB_VERSION" ./gradlew --stop @@ -1230,25 +1252,25 @@ jobs: if: failure() run: | docker ps -a - TEST_STRATEGY="-${{ matrix.test_strategy }}" + TEST_STRATEGY="-${{ matrix.test_strategy }}-${{ matrix.batch }}" source .github/scripts/docker_logs.sh - name: Upload logs uses: actions/upload-artifact@v3 if: failure() with: - name: docker-logs-${{ matrix.test_strategy }} + name: docker-logs-${{ matrix.test_strategy }}-${{ matrix.batch }} path: "docker_logs/*.log" retention-days: 5 - name: Upload screenshots uses: actions/upload-artifact@v3 if: failure() with: - name: cypress-snapshots-${{ matrix.test_strategy }} + name: cypress-snapshots-${{ matrix.test_strategy }}-${{ matrix.batch }} path: smoke-test/tests/cypress/cypress/screenshots/ - uses: actions/upload-artifact@v3 if: always() with: - name: Test Results (smoke tests) ${{ matrix.test_strategy }} + name: Test Results (smoke tests) ${{ matrix.test_strategy }} ${{ matrix.batch }} path: | **/build/reports/tests/test/** **/build/test-results/test/** diff --git a/smoke-test/.gitignore b/smoke-test/.gitignore index b8af2eef535a0b..d8cfd65ff81b9c 100644 --- a/smoke-test/.gitignore +++ b/smoke-test/.gitignore @@ -29,6 +29,8 @@ share/python-wheels/ .installed.cfg *.egg MANIFEST +**/cypress/node_modules + # PyInstaller # Usually these files are written by a python script from a template @@ -132,4 +134,4 @@ dmypy.json # Pyre type checker .pyre/ junit* -tests/cypress/onboarding.json \ No newline at end of file +tests/cypress/onboarding.json diff --git a/smoke-test/build.gradle b/smoke-test/build.gradle index 73ecdcb08ea149..60d08e0206cdab 100644 --- a/smoke-test/build.gradle +++ b/smoke-test/build.gradle @@ -91,39 +91,31 @@ task pythonLintFix(type: Exec, dependsOn: installDev) { * The following tasks assume an already running quickstart. 
* ./gradlew quickstart (or another variation `quickstartDebug`) */ -task noCypressSuite0(type: Exec, dependsOn: [installDev, ':metadata-ingestion:installDev']) { - environment 'RUN_QUICKSTART', 'false' - environment 'TEST_STRATEGY', 'no_cypress_suite0' - - workingDir = project.projectDir - commandLine 'bash', '-c', - "source ${venv_name}/bin/activate && set -x && " + - "./smoke.sh" -} +// ./gradlew :smoke-test:pytest -PbatchNumber=2 (default 0) +task pytest(type: Exec, dependsOn: [installDev, ':metadata-ingestion:installDev']) { + // Get BATCH_NUMBER from command line argument with default value of 0 + def batchNumber = project.hasProperty('batchNumber') ? project.property('batchNumber') : '0' -task noCypressSuite1(type: Exec, dependsOn: [installDev, ':metadata-ingestion:installDev']) { environment 'RUN_QUICKSTART', 'false' - environment 'TEST_STRATEGY', 'no_cypress_suite1' + environment 'TEST_STRATEGY', 'pytests' + environment 'BATCH_COUNT', 5 + environment 'BATCH_NUMBER', batchNumber workingDir = project.projectDir commandLine 'bash', '-c', "source ${venv_name}/bin/activate && set -x && " + - "./smoke.sh" + "./smoke.sh" } -task cypressSuite1(type: Exec, dependsOn: [installDev, ':metadata-ingestion:installDev']) { - environment 'RUN_QUICKSTART', 'false' - environment 'TEST_STRATEGY', 'cypress_suite1' - - workingDir = project.projectDir - commandLine 'bash', '-c', - "source ${venv_name}/bin/activate && set -x && " + - "./smoke.sh" -} +// ./gradlew :smoke-test:cypressTest -PbatchNumber=2 (default 0) +task cypressTest(type: Exec, dependsOn: [installDev, ':metadata-ingestion:installDev']) { + // Get BATCH_NUMBER from command line argument with default value of 0 + def batchNumber = project.hasProperty('batchNumber') ? project.property('batchNumber') : '0' -task cypressRest(type: Exec, dependsOn: [installDev, ':metadata-ingestion:installDev']) { environment 'RUN_QUICKSTART', 'false' - environment 'TEST_STRATEGY', 'cypress_rest' + environment 'TEST_STRATEGY', 'cypress' + environment 'BATCH_COUNT', 11 + environment 'BATCH_NUMBER', batchNumber workingDir = project.projectDir commandLine 'bash', '-c', diff --git a/smoke-test/conftest.py b/smoke-test/conftest.py index 6d148db9886a48..d48a92b22ab48f 100644 --- a/smoke-test/conftest.py +++ b/smoke-test/conftest.py @@ -1,6 +1,8 @@ import os import pytest +from typing import List, Tuple +from _pytest.nodes import Item import requests from datahub.ingestion.graph.client import DatahubClientConfig, DataHubGraph @@ -45,3 +47,53 @@ def graph_client(auth_session) -> DataHubGraph: def pytest_sessionfinish(session, exitstatus): """whole test run finishes.""" send_message(exitstatus) + + +def get_batch_start_end(num_tests: int) -> Tuple[int, int]: + batch_count_env = os.getenv("BATCH_COUNT", 1) + batch_count = int(batch_count_env) + + batch_number_env = os.getenv("BATCH_NUMBER", 0) + batch_number = int(batch_number_env) + + if batch_count == 0 or batch_count > num_tests: + raise ValueError( + f"Invalid batch count {batch_count}: must be >0 and <= {num_tests} (num_tests)" + ) + if batch_number >= batch_count: + raise ValueError( + f"Invalid batch number: {batch_number}, must be less than {batch_count} (zer0 based index)" + ) + + batch_size = round(num_tests / batch_count) + + batch_start = batch_size * batch_number + batch_end = batch_start + batch_size + # We must have exactly as many batches as specified by BATCH_COUNT. 
+ if ( + num_tests - batch_end < batch_size + ): # We must have exactly as many batches as specified by BATCH_COUNT, put the remaining in the last batch. + batch_end = num_tests + + if batch_count > 0: + print(f"Running tests for batch {batch_number} of {batch_count}") + + return batch_start, batch_end + + +def pytest_collection_modifyitems( + session: pytest.Session, config: pytest.Config, items: List[Item] +) -> None: + if os.getenv("TEST_STRATEGY") == "cypress": + return # We launch cypress via pytests, but needs a different batching mechanism at cypress level. + + # If BATCH_COUNT and BATCH_ENV vars are set, splits the pytests to batches and runs filters only the BATCH_NUMBER + # batch for execution. Enables multiple parallel launches. Current implementation assumes all test are of equal + # weight for batching. TODO. A weighted batching method can help make batches more equal sized by cost. + # this effectively is a no-op if BATCH_COUNT=1 + start_index, end_index = get_batch_start_end(num_tests=len(items)) + + items.sort(key=lambda x: x.nodeid) # we want the order to be stable across batches + # replace items with the filtered list + print(f"Running tests for batch {start_index}-{end_index}") + items[:] = items[start_index:end_index] diff --git a/smoke-test/smoke.sh b/smoke-test/smoke.sh index 888a60f488e1fc..ec8188ebf5f4db 100755 --- a/smoke-test/smoke.sh +++ b/smoke-test/smoke.sh @@ -34,15 +34,20 @@ source ./set-cypress-creds.sh # set environment variables for the test source ./set-test-env-vars.sh -# no_cypress_suite0, no_cypress_suite1, cypress_suite1, cypress_rest -if [[ -z "${TEST_STRATEGY}" ]]; then - pytest -rP --durations=20 -vv --continue-on-collection-errors --junit-xml=junit.smoke.xml +# TEST_STRATEGY: +# if set to pytests, runs all pytests, skips cypress tests(though cypress test launch is via a pytest). +# if set tp cypress, runs all cypress tests +# if blank, runs all. +# When invoked via the github action, BATCH_COUNT and BATCH_NUM env vars are set to run a slice of those tests per +# worker for parallelism. docker-unified.yml generates a test matrix of pytests/cypress in batches. As number of tests +# increase, the batch_count config (in docker-unified.yml) may need adjustment. +if [[ "${TEST_STRATEGY}" == "pytests" ]]; then + #pytests only - github test matrix runs pytests in one of the runners when applicable. + pytest -rP --durations=20 -vv --continue-on-collection-errors --junit-xml=junit.smoke-pytests.xml -k 'not test_run_cypress' +elif [[ "${TEST_STRATEGY}" == "cypress" ]]; then + # run only cypress tests. The test inspects BATCH_COUNT and BATCH_NUMBER and runs only a subset of tests in that batch. + # github workflow test matrix will invoke this in multiple runners for each batch. 
+ pytest -rP --durations=20 -vv --continue-on-collection-errors --junit-xml=junit.smoke-cypress${BATCH_NUMBER}.xml tests/cypress/integration_test.py else - if [ "$TEST_STRATEGY" == "no_cypress_suite0" ]; then - pytest -rP --durations=20 -vv --continue-on-collection-errors --junit-xml=junit.smoke_non_cypress.xml -k 'not test_run_cypress' -m 'not no_cypress_suite1' - elif [ "$TEST_STRATEGY" == "no_cypress_suite1" ]; then - pytest -rP --durations=20 -vv --continue-on-collection-errors --junit-xml=junit.smoke_non_cypress.xml -m 'no_cypress_suite1' - else - pytest -rP --durations=20 -vv --continue-on-collection-errors --junit-xml=junit.smoke_cypress_${TEST_STRATEGY}.xml tests/cypress/integration_test.py - fi + pytest -rP --durations=20 -vv --continue-on-collection-errors --junit-xml=junit.smoke-all.xml fi diff --git a/smoke-test/tests/cypress/integration_test.py b/smoke-test/tests/cypress/integration_test.py index 0d824a96810d05..33c67a923c278d 100644 --- a/smoke-test/tests/cypress/integration_test.py +++ b/smoke-test/tests/cypress/integration_test.py @@ -1,10 +1,11 @@ import datetime import os import subprocess -from typing import List, Set +from typing import List import pytest +from conftest import get_batch_start_end from tests.setup.lineage.ingest_time_lineage import ( get_time_lineage_urns, ingest_time_lineage, @@ -169,10 +170,29 @@ def ingest_cleanup_data(auth_session, graph_client): print("deleted onboarding data") -def _get_spec_map(items: Set[str]) -> str: - if len(items) == 0: - return "" - return ",".join([f"**/{item}/*.js" for item in items]) +def _get_js_files(base_path: str): + file_paths = [] + for root, dirs, files in os.walk(base_path): + for file in files: + if file.endswith(".js"): + file_paths.append(os.path.relpath(os.path.join(root, file), base_path)) + return sorted(file_paths) # sort to make the order stable across batch runs + + +def _get_cypress_tests_batch(): + """ + Batching is configured via env vars BATCH_COUNT and BATCH_NUMBER. All cypress tests are split into exactly + BATCH_COUNT batches. When BATCH_NUMBER env var is set (zero based index), that batch alone is run. + Github workflow via test_matrix, runs all batches in parallel to speed up the test elapsed time. + If either of these vars are not set, all tests are run sequentially. + :return: + """ + all_tests = _get_js_files("tests/cypress/cypress/e2e") + + batch_start, batch_end = get_batch_start_end(num_tests=len(all_tests)) + + return all_tests[batch_start:batch_end] + # return test_batches[int(batch_number)] #if BATCH_NUMBER was set, we this test just runs that one batch. 
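A minimal stand-alone sketch of the batch slicing that get_batch_start_end (conftest.py) and _get_cypress_tests_batch above perform; the numbers used here (22 tests, BATCH_COUNT=5) are hypothetical and only show how the start/end indices fall out, this is not part of the patch itself:

    def batch_bounds(num_tests: int, batch_count: int, batch_number: int) -> tuple:
        # Mirrors get_batch_start_end: equal-sized contiguous batches,
        # with the final batch absorbing any leftover tests.
        batch_size = round(num_tests / batch_count)
        start = batch_size * batch_number
        end = start + batch_size
        if num_tests - end < batch_size:  # remainder folds into the last batch
            end = num_tests
        return start, end

    # 22 tests across BATCH_COUNT=5 batches:
    # [(0, 4), (4, 8), (8, 12), (12, 16), (16, 22)]
    print([batch_bounds(22, 5, i) for i in range(5)])

Folding the remainder into the last batch keeps the number of batches exactly equal to BATCH_COUNT, which matches the test matrix generated in the GitHub workflow above.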
def test_run_cypress(auth_session): @@ -182,24 +202,23 @@ def test_run_cypress(auth_session): test_strategy = os.getenv("TEST_STRATEGY", None) if record_key: record_arg = " --record " - tag_arg = f" --tag {test_strategy} " + batch_number = os.getenv("BATCH_NUMBER") + batch_count = os.getenv("BATCH_COUNT") + if batch_number and batch_count: + batch_suffix = f"-{batch_number}{batch_count}" + else: + batch_suffix = "" + tag_arg = f" --tag {test_strategy}{batch_suffix}" else: record_arg = " " rest_specs = set(os.listdir("tests/cypress/cypress/e2e")) cypress_suite1_specs = {"mutations", "search", "views"} rest_specs.difference_update(set(cypress_suite1_specs)) - strategy_spec_map = { - "cypress_suite1": cypress_suite1_specs, - "cypress_rest": rest_specs, - } print(f"test strategy is {test_strategy}") test_spec_arg = "" - if test_strategy is not None: - specs = strategy_spec_map.get(test_strategy) - assert specs is not None - specs_str = _get_spec_map(specs) - test_spec_arg = f" --spec '{specs_str}' " + specs_str = ",".join([f"**/{f}" for f in _get_cypress_tests_batch()]) + test_spec_arg = f" --spec '{specs_str}' " print("Running Cypress tests with command") command = f"NO_COLOR=1 npx cypress run {record_arg} {test_spec_arg} {tag_arg}" From 5708bd9beba6ea08d66a37506867380a45718df3 Mon Sep 17 00:00:00 2001 From: david-leifker <114954101+david-leifker@users.noreply.github.com> Date: Thu, 26 Dec 2024 11:14:15 -0600 Subject: [PATCH 17/28] chore(bump): spring minor version bump 6.1.14 (#12228) --- build.gradle | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build.gradle b/build.gradle index a3d807a7333494..8929b4e644972c 100644 --- a/build.gradle +++ b/build.gradle @@ -35,7 +35,7 @@ buildscript { ext.pegasusVersion = '29.57.0' ext.mavenVersion = '3.6.3' ext.versionGradle = '8.11.1' - ext.springVersion = '6.1.13' + ext.springVersion = '6.1.14' ext.springBootVersion = '3.2.9' ext.springKafkaVersion = '3.1.6' ext.openTelemetryVersion = '1.18.0' From a920e9bec8ed8f0aa6ecfd482c8659afa4f3e034 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Thu, 26 Dec 2024 15:33:39 -0500 Subject: [PATCH 18/28] fix(ingest/lookml): emit warnings for resolution failures (#12215) --- .../source/looker/looker_dataclasses.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_dataclasses.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_dataclasses.py index 327c9ebf99bd20..d771821a14d88d 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_dataclasses.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_dataclasses.py @@ -186,16 +186,16 @@ def resolve_includes( f"traversal_path={traversal_path}, included_files = {included_files}, seen_so_far: {seen_so_far}" ) if "*" not in inc and not included_files: - reporter.report_failure( + reporter.warning( title="Error Resolving Include", - message=f"Cannot resolve include {inc}", - context=f"Path: {path}", + message="Cannot resolve included file", + context=f"Include: {inc}, path: {path}, traversal_path: {traversal_path}", ) elif not included_files: - reporter.report_failure( + reporter.warning( title="Error Resolving Include", - message=f"Did not resolve anything for wildcard include {inc}", - context=f"Path: {path}", + message="Did not find anything matching the wildcard include", + context=f"Include: {inc}, path: {path}, traversal_path: {traversal_path}", ) # only load files that we haven't seen so far included_files 
= [x for x in included_files if x not in seen_so_far] @@ -231,9 +231,7 @@ def resolve_includes( source_config, reporter, seen_so_far, - traversal_path=traversal_path - + "." - + pathlib.Path(included_file).stem, + traversal_path=f"{traversal_path} -> {pathlib.Path(included_file).stem}", ) ) except Exception as e: From e1998dd371b1002ae8893d7a63d84e5bace079c7 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Thu, 26 Dec 2024 15:34:00 -0500 Subject: [PATCH 19/28] chore(ingest): remove `enable_logging` helper (#12222) --- .../powerbi/rest_api_wrapper/data_resolver.py | 31 ++-- .../powerbi/test_admin_only_api.py | 12 -- .../tests/integration/powerbi/test_powerbi.py | 141 +++++++----------- .../integration/powerbi/test_profiling.py | 10 -- .../tableau/test_tableau_ingest.py | 29 ---- 5 files changed, 70 insertions(+), 153 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py index e1301edef10b84..161975fa635fdb 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py @@ -84,13 +84,14 @@ def __init__( tenant_id: str, metadata_api_timeout: int, ): - self.__access_token: Optional[str] = None - self.__access_token_expiry_time: Optional[datetime] = None - self.__tenant_id = tenant_id + self._access_token: Optional[str] = None + self._access_token_expiry_time: Optional[datetime] = None + + self._tenant_id = tenant_id # Test connection by generating access token logger.info(f"Trying to connect to {self._get_authority_url()}") # Power-Bi Auth (Service Principal Auth) - self.__msal_client = msal.ConfidentialClientApplication( + self._msal_client = msal.ConfidentialClientApplication( client_id, client_credential=client_secret, authority=DataResolverBase.AUTHORITY + tenant_id, @@ -168,18 +169,18 @@ def _get_app( pass def _get_authority_url(self): - return f"{DataResolverBase.AUTHORITY}{self.__tenant_id}" + return f"{DataResolverBase.AUTHORITY}{self._tenant_id}" def get_authorization_header(self): return {Constant.Authorization: self.get_access_token()} - def get_access_token(self): - if self.__access_token is not None and not self._is_access_token_expired(): - return self.__access_token + def get_access_token(self) -> str: + if self._access_token is not None and not self._is_access_token_expired(): + return self._access_token logger.info("Generating PowerBi access token") - auth_response = self.__msal_client.acquire_token_for_client( + auth_response = self._msal_client.acquire_token_for_client( scopes=[DataResolverBase.SCOPE] ) @@ -193,24 +194,24 @@ def get_access_token(self): logger.info("Generated PowerBi access token") - self.__access_token = "Bearer {}".format( + self._access_token = "Bearer {}".format( auth_response.get(Constant.ACCESS_TOKEN) ) safety_gap = 300 - self.__access_token_expiry_time = datetime.now() + timedelta( + self._access_token_expiry_time = datetime.now() + timedelta( seconds=( max(auth_response.get(Constant.ACCESS_TOKEN_EXPIRY, 0) - safety_gap, 0) ) ) - logger.debug(f"{Constant.PBIAccessToken}={self.__access_token}") + logger.debug(f"{Constant.PBIAccessToken}={self._access_token}") - return self.__access_token + return self._access_token def _is_access_token_expired(self) -> bool: - if not self.__access_token_expiry_time: + if not self._access_token_expiry_time: return True - return 
self.__access_token_expiry_time < datetime.now() + return self._access_token_expiry_time < datetime.now() def get_dashboards(self, workspace: Workspace) -> List[Dashboard]: """ diff --git a/metadata-ingestion/tests/integration/powerbi/test_admin_only_api.py b/metadata-ingestion/tests/integration/powerbi/test_admin_only_api.py index b636c12cfda064..00dc79ed38cfba 100644 --- a/metadata-ingestion/tests/integration/powerbi/test_admin_only_api.py +++ b/metadata-ingestion/tests/integration/powerbi/test_admin_only_api.py @@ -1,5 +1,3 @@ -import logging -import sys from typing import Any, Dict from unittest import mock @@ -483,12 +481,6 @@ def register_mock_admin_api(request_mock: Any, override_data: dict = {}) -> None ) -def enable_logging(): - # set logging to console - logging.getLogger().addHandler(logging.StreamHandler(sys.stdout)) - logging.getLogger().setLevel(logging.DEBUG) - - def mock_msal_cca(*args, **kwargs): class MsalClient: def acquire_token_for_client(self, *args, **kwargs): @@ -527,8 +519,6 @@ def default_source_config(): @freeze_time(FROZEN_TIME) @mock.patch("msal.ConfidentialClientApplication", side_effect=mock_msal_cca) def test_admin_only_apis(mock_msal, pytestconfig, tmp_path, mock_time, requests_mock): - enable_logging() - test_resources_dir = pytestconfig.rootpath / "tests/integration/powerbi" register_mock_admin_api(request_mock=requests_mock) @@ -567,8 +557,6 @@ def test_admin_only_apis(mock_msal, pytestconfig, tmp_path, mock_time, requests_ def test_most_config_and_modified_since( mock_msal, pytestconfig, tmp_path, mock_time, requests_mock ): - enable_logging() - test_resources_dir = pytestconfig.rootpath / "tests/integration/powerbi" register_mock_admin_api(request_mock=requests_mock) diff --git a/metadata-ingestion/tests/integration/powerbi/test_powerbi.py b/metadata-ingestion/tests/integration/powerbi/test_powerbi.py index edde11ff87d293..739be7cc8408dd 100644 --- a/metadata-ingestion/tests/integration/powerbi/test_powerbi.py +++ b/metadata-ingestion/tests/integration/powerbi/test_powerbi.py @@ -1,8 +1,6 @@ import datetime import json -import logging import re -import sys from pathlib import Path from typing import Any, Dict, List, Optional, Union, cast from unittest import mock @@ -31,29 +29,21 @@ FROZEN_TIME = "2022-02-03 07:00:00" -def enable_logging(): - # set logging to console - logging.getLogger().addHandler(logging.StreamHandler(sys.stdout)) - logging.getLogger().setLevel(logging.DEBUG) - - -class MsalClient: - call_num = 0 - token: Dict[str, Any] = { - "access_token": "dummy", - } - - @staticmethod - def acquire_token_for_client(*args, **kwargs): - MsalClient.call_num += 1 - return MsalClient.token +def mock_msal_cca(*args, **kwargs): + class MsalClient: + def __init__(self): + self.call_num = 0 + self.token: Dict[str, Any] = { + "access_token": "dummy", + } - @staticmethod - def reset(): - MsalClient.call_num = 0 + def acquire_token_for_client(self, *args, **kwargs): + self.call_num += 1 + return self.token + def reset(self): + self.call_num = 0 -def mock_msal_cca(*args, **kwargs): return MsalClient() @@ -154,8 +144,6 @@ def test_powerbi_ingest( mock_time: datetime.datetime, requests_mock: Any, ) -> None: - enable_logging() - test_resources_dir = pytestconfig.rootpath / "tests/integration/powerbi" register_mock_api(pytestconfig=pytestconfig, request_mock=requests_mock) @@ -199,8 +187,6 @@ def test_powerbi_workspace_type_filter( mock_time: datetime.datetime, requests_mock: Any, ) -> None: - enable_logging() - test_resources_dir = pytestconfig.rootpath / 
"tests/integration/powerbi" register_mock_api( @@ -260,8 +246,6 @@ def test_powerbi_ingest_patch_disabled( mock_time: datetime.datetime, requests_mock: Any, ) -> None: - enable_logging() - test_resources_dir = pytestconfig.rootpath / "tests/integration/powerbi" register_mock_api(pytestconfig=pytestconfig, request_mock=requests_mock) @@ -327,8 +311,6 @@ def test_powerbi_platform_instance_ingest( mock_time: datetime.datetime, requests_mock: Any, ) -> None: - enable_logging() - test_resources_dir = pytestconfig.rootpath / "tests/integration/powerbi" register_mock_api(pytestconfig=pytestconfig, request_mock=requests_mock) @@ -515,8 +497,6 @@ def test_extract_reports( mock_time: datetime.datetime, requests_mock: Any, ) -> None: - enable_logging() - test_resources_dir = pytestconfig.rootpath / "tests/integration/powerbi" register_mock_api(pytestconfig=pytestconfig, request_mock=requests_mock) @@ -561,8 +541,6 @@ def test_extract_lineage( mock_time: datetime.datetime, requests_mock: Any, ) -> None: - enable_logging() - test_resources_dir = pytestconfig.rootpath / "tests/integration/powerbi" register_mock_api(pytestconfig=pytestconfig, request_mock=requests_mock) @@ -660,8 +638,6 @@ def test_admin_access_is_not_allowed( mock_time: datetime.datetime, requests_mock: Any, ) -> None: - enable_logging() - test_resources_dir = pytestconfig.rootpath / "tests/integration/powerbi" register_mock_api( @@ -723,8 +699,6 @@ def test_workspace_container( mock_time: datetime.datetime, requests_mock: Any, ) -> None: - enable_logging() - test_resources_dir = pytestconfig.rootpath / "tests/integration/powerbi" register_mock_api(pytestconfig=pytestconfig, request_mock=requests_mock) @@ -764,85 +738,84 @@ def test_workspace_container( ) -@mock.patch("msal.ConfidentialClientApplication", side_effect=mock_msal_cca) def test_access_token_expiry_with_long_expiry( - mock_msal: MagicMock, pytestconfig: pytest.Config, tmp_path: str, mock_time: datetime.datetime, requests_mock: Any, ) -> None: - enable_logging() - register_mock_api(pytestconfig=pytestconfig, request_mock=requests_mock) - pipeline = Pipeline.create( - { - "run_id": "powerbi-test", - "source": { - "type": "powerbi", - "config": { - **default_source_config(), + mock_msal = mock_msal_cca() + + with mock.patch("msal.ConfidentialClientApplication", return_value=mock_msal): + pipeline = Pipeline.create( + { + "run_id": "powerbi-test", + "source": { + "type": "powerbi", + "config": { + **default_source_config(), + }, }, - }, - "sink": { - "type": "file", - "config": { - "filename": f"{tmp_path}/powerbi_access_token_mces.json", + "sink": { + "type": "file", + "config": { + "filename": f"{tmp_path}/powerbi_access_token_mces.json", + }, }, - }, - } - ) + } + ) # for long expiry, the token should only be requested once. 
- MsalClient.token = { + mock_msal.token = { "access_token": "dummy2", "expires_in": 3600, } + mock_msal.reset() - MsalClient.reset() pipeline.run() # We expect the token to be requested twice (once for AdminApiResolver and one for RegularApiResolver) - assert MsalClient.call_num == 2 + assert mock_msal.call_num == 2 -@mock.patch("msal.ConfidentialClientApplication", side_effect=mock_msal_cca) def test_access_token_expiry_with_short_expiry( - mock_msal: MagicMock, pytestconfig: pytest.Config, tmp_path: str, mock_time: datetime.datetime, requests_mock: Any, ) -> None: - enable_logging() - register_mock_api(pytestconfig=pytestconfig, request_mock=requests_mock) - pipeline = Pipeline.create( - { - "run_id": "powerbi-test", - "source": { - "type": "powerbi", - "config": { - **default_source_config(), + mock_msal = mock_msal_cca() + with mock.patch("msal.ConfidentialClientApplication", return_value=mock_msal): + pipeline = Pipeline.create( + { + "run_id": "powerbi-test", + "source": { + "type": "powerbi", + "config": { + **default_source_config(), + }, }, - }, - "sink": { - "type": "file", - "config": { - "filename": f"{tmp_path}/powerbi_access_token_mces.json", + "sink": { + "type": "file", + "config": { + "filename": f"{tmp_path}/powerbi_access_token_mces.json", + }, }, - }, - } - ) + } + ) # for short expiry, the token should be requested when expires. - MsalClient.token = { + mock_msal.token = { "access_token": "dummy", "expires_in": 0, } + mock_msal.reset() + pipeline.run() - assert MsalClient.call_num > 2 + assert mock_msal.call_num > 2 def dataset_type_mapping_set_to_all_platform(pipeline: Pipeline) -> None: @@ -940,8 +913,6 @@ def test_dataset_type_mapping_error( def test_server_to_platform_map( mock_msal, pytestconfig, tmp_path, mock_time, requests_mock ): - enable_logging() - test_resources_dir = pytestconfig.rootpath / "tests/integration/powerbi" new_config: dict = { **default_source_config(), @@ -1416,8 +1387,6 @@ def test_powerbi_cross_workspace_reference_info_message( mock_time: datetime.datetime, requests_mock: Any, ) -> None: - enable_logging() - register_mock_api( pytestconfig=pytestconfig, request_mock=requests_mock, @@ -1495,8 +1464,6 @@ def common_app_ingest( output_mcp_path: str, override_config: dict = {}, ) -> Pipeline: - enable_logging() - register_mock_api( pytestconfig=pytestconfig, request_mock=requests_mock, diff --git a/metadata-ingestion/tests/integration/powerbi/test_profiling.py b/metadata-ingestion/tests/integration/powerbi/test_profiling.py index 4b48bed003b1e8..78d35cf31a26d9 100644 --- a/metadata-ingestion/tests/integration/powerbi/test_profiling.py +++ b/metadata-ingestion/tests/integration/powerbi/test_profiling.py @@ -1,5 +1,3 @@ -import logging -import sys from typing import Any, Dict from unittest import mock @@ -271,12 +269,6 @@ def register_mock_admin_api(request_mock: Any, override_data: dict = {}) -> None ) -def enable_logging(): - # set logging to console - logging.getLogger().addHandler(logging.StreamHandler(sys.stdout)) - logging.getLogger().setLevel(logging.DEBUG) - - def mock_msal_cca(*args, **kwargs): class MsalClient: def acquire_token_for_client(self, *args, **kwargs): @@ -311,8 +303,6 @@ def default_source_config(): @freeze_time(FROZEN_TIME) @mock.patch("msal.ConfidentialClientApplication", side_effect=mock_msal_cca) def test_profiling(mock_msal, pytestconfig, tmp_path, mock_time, requests_mock): - enable_logging() - test_resources_dir = pytestconfig.rootpath / "tests/integration/powerbi" register_mock_admin_api(request_mock=requests_mock) 
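The two token-expiry tests above exercise the caching inside DataResolverBase.get_access_token: the bearer token is reused until _access_token_expiry_time, which the resolver sets to expires_in minus a 300-second safety gap. A stripped-down sketch of that decision logic (class and names here are simplified for illustration, not the actual resolver):

    from datetime import datetime, timedelta

    SAFETY_GAP = 300  # seconds, mirrors the resolver's safety margin

    class TokenCache:
        def __init__(self, fetch):
            self._fetch = fetch  # callable returning {"access_token": ..., "expires_in": ...}
            self._token = None
            self._expires_at = None

        def get(self) -> str:
            # Reuse the cached bearer while it is still comfortably valid.
            if self._token and self._expires_at and datetime.now() < self._expires_at:
                return self._token
            response = self._fetch()
            self._token = f"Bearer {response['access_token']}"
            lifetime = max(response.get("expires_in", 0) - SAFETY_GAP, 0)
            self._expires_at = datetime.now() + timedelta(seconds=lifetime)
            return self._token

With expires_in=3600 the first call per resolver caches a token that stays valid for the whole test, so the long-expiry test expects exactly two MSAL calls (one for the AdminApiResolver and one for the RegularApiResolver); with expires_in=0 the computed lifetime is zero and every call refreshes, which is why the short-expiry test asserts more than two calls.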
diff --git a/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py b/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py index 902ff243c802a8..71e5ad10c2fc5e 100644 --- a/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py +++ b/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py @@ -1,7 +1,5 @@ import json -import logging import pathlib -import sys from typing import Any, Dict, List, cast from unittest import mock @@ -88,12 +86,6 @@ } -def enable_logging(): - # set logging to console - logging.getLogger().addHandler(logging.StreamHandler(sys.stdout)) - logging.getLogger().setLevel(logging.DEBUG) - - def read_response(file_name): response_json_path = f"{test_resources_dir}/setup/{file_name}" with open(response_json_path) as file: @@ -376,7 +368,6 @@ def tableau_ingest_common( @freeze_time(FROZEN_TIME) @pytest.mark.integration def test_tableau_ingest(pytestconfig, tmp_path, mock_datahub_graph): - enable_logging() output_file_name: str = "tableau_mces.json" golden_file_name: str = "tableau_mces_golden.json" tableau_ingest_common( @@ -454,7 +445,6 @@ def mock_data() -> List[dict]: @freeze_time(FROZEN_TIME) @pytest.mark.integration def test_tableau_cll_ingest(pytestconfig, tmp_path, mock_datahub_graph): - enable_logging() output_file_name: str = "tableau_mces_cll.json" golden_file_name: str = "tableau_cll_mces_golden.json" @@ -481,7 +471,6 @@ def test_tableau_cll_ingest(pytestconfig, tmp_path, mock_datahub_graph): @freeze_time(FROZEN_TIME) @pytest.mark.integration def test_project_pattern(pytestconfig, tmp_path, mock_datahub_graph): - enable_logging() output_file_name: str = "tableau_project_pattern_mces.json" golden_file_name: str = "tableau_mces_golden.json" @@ -505,7 +494,6 @@ def test_project_pattern(pytestconfig, tmp_path, mock_datahub_graph): @freeze_time(FROZEN_TIME) @pytest.mark.integration def test_project_path_pattern(pytestconfig, tmp_path, mock_datahub_graph): - enable_logging() output_file_name: str = "tableau_project_path_mces.json" golden_file_name: str = "tableau_project_path_mces_golden.json" @@ -529,8 +517,6 @@ def test_project_path_pattern(pytestconfig, tmp_path, mock_datahub_graph): @freeze_time(FROZEN_TIME) @pytest.mark.integration def test_project_hierarchy(pytestconfig, tmp_path, mock_datahub_graph): - enable_logging() - output_file_name: str = "tableau_nested_project_mces.json" golden_file_name: str = "tableau_nested_project_mces_golden.json" @@ -554,7 +540,6 @@ def test_project_hierarchy(pytestconfig, tmp_path, mock_datahub_graph): @freeze_time(FROZEN_TIME) @pytest.mark.integration def test_extract_all_project(pytestconfig, tmp_path, mock_datahub_graph): - enable_logging() output_file_name: str = "tableau_extract_all_project_mces.json" golden_file_name: str = "tableau_extract_all_project_mces_golden.json" @@ -644,7 +629,6 @@ def test_project_path_pattern_deny(pytestconfig, tmp_path, mock_datahub_graph): def test_tableau_ingest_with_platform_instance( pytestconfig, tmp_path, mock_datahub_graph ): - enable_logging() output_file_name: str = "tableau_with_platform_instance_mces.json" golden_file_name: str = "tableau_with_platform_instance_mces_golden.json" @@ -691,7 +675,6 @@ def test_tableau_ingest_with_platform_instance( def test_lineage_overrides(): - enable_logging() # Simple - specify platform instance to presto table assert ( TableauUpstreamReference( @@ -745,7 +728,6 @@ def test_lineage_overrides(): def test_database_hostname_to_platform_instance_map(): - enable_logging() # Simple - snowflake table assert 
( TableauUpstreamReference( @@ -916,7 +898,6 @@ def test_tableau_stateful(pytestconfig, tmp_path, mock_time, mock_datahub_graph) def test_tableau_no_verify(): - enable_logging() # This test ensures that we can connect to a self-signed certificate # when ssl_verify is set to False. @@ -941,7 +922,6 @@ def test_tableau_no_verify(): @freeze_time(FROZEN_TIME) @pytest.mark.integration_batch_2 def test_tableau_signout_timeout(pytestconfig, tmp_path, mock_datahub_graph): - enable_logging() output_file_name: str = "tableau_signout_timeout_mces.json" golden_file_name: str = "tableau_signout_timeout_mces_golden.json" tableau_ingest_common( @@ -1073,7 +1053,6 @@ def test_get_all_datasources_failure(pytestconfig, tmp_path, mock_datahub_graph) @freeze_time(FROZEN_TIME) @pytest.mark.integration def test_tableau_ingest_multiple_sites(pytestconfig, tmp_path, mock_datahub_graph): - enable_logging() output_file_name: str = "tableau_mces_multiple_sites.json" golden_file_name: str = "tableau_multiple_sites_mces_golden.json" @@ -1135,7 +1114,6 @@ def test_tableau_ingest_multiple_sites(pytestconfig, tmp_path, mock_datahub_grap @freeze_time(FROZEN_TIME) @pytest.mark.integration def test_tableau_ingest_sites_as_container(pytestconfig, tmp_path, mock_datahub_graph): - enable_logging() output_file_name: str = "tableau_mces_ingest_sites_as_container.json" golden_file_name: str = "tableau_sites_as_container_mces_golden.json" @@ -1159,7 +1137,6 @@ def test_tableau_ingest_sites_as_container(pytestconfig, tmp_path, mock_datahub_ @freeze_time(FROZEN_TIME) @pytest.mark.integration def test_site_name_pattern(pytestconfig, tmp_path, mock_datahub_graph): - enable_logging() output_file_name: str = "tableau_site_name_pattern_mces.json" golden_file_name: str = "tableau_site_name_pattern_mces_golden.json" @@ -1183,7 +1160,6 @@ def test_site_name_pattern(pytestconfig, tmp_path, mock_datahub_graph): @freeze_time(FROZEN_TIME) @pytest.mark.integration def test_permission_ingestion(pytestconfig, tmp_path, mock_datahub_graph): - enable_logging() output_file_name: str = "tableau_permission_ingestion_mces.json" golden_file_name: str = "tableau_permission_ingestion_mces_golden.json" @@ -1209,7 +1185,6 @@ def test_permission_ingestion(pytestconfig, tmp_path, mock_datahub_graph): @freeze_time(FROZEN_TIME) @pytest.mark.integration def test_no_hidden_assets(pytestconfig, tmp_path, mock_datahub_graph): - enable_logging() output_file_name: str = "tableau_no_hidden_assets_mces.json" golden_file_name: str = "tableau_no_hidden_assets_mces_golden.json" @@ -1232,7 +1207,6 @@ def test_no_hidden_assets(pytestconfig, tmp_path, mock_datahub_graph): @freeze_time(FROZEN_TIME) @pytest.mark.integration def test_ingest_tags_disabled(pytestconfig, tmp_path, mock_datahub_graph): - enable_logging() output_file_name: str = "tableau_ingest_tags_disabled_mces.json" golden_file_name: str = "tableau_ingest_tags_disabled_mces_golden.json" @@ -1254,7 +1228,6 @@ def test_ingest_tags_disabled(pytestconfig, tmp_path, mock_datahub_graph): @freeze_time(FROZEN_TIME) @pytest.mark.integration def test_hidden_asset_tags(pytestconfig, tmp_path, mock_datahub_graph): - enable_logging() output_file_name: str = "tableau_hidden_asset_tags_mces.json" golden_file_name: str = "tableau_hidden_asset_tags_mces_golden.json" @@ -1277,8 +1250,6 @@ def test_hidden_asset_tags(pytestconfig, tmp_path, mock_datahub_graph): @freeze_time(FROZEN_TIME) @pytest.mark.integration def test_hidden_assets_without_ingest_tags(pytestconfig, tmp_path, mock_datahub_graph): - enable_logging() - new_config = 
config_source_default.copy()
     new_config["tags_for_hidden_assets"] = ["hidden", "private"]
     new_config["ingest_tags"] = False
From 172736a9b3d291d6cb8fcaf775114abdf8853e1f Mon Sep 17 00:00:00 2001
From: Harshal Sheth
Date: Thu, 26 Dec 2024 21:34:31 -0500
Subject: [PATCH 20/28] feat(ingest/dbt): support "Explore" page in dbt cloud (#12223)

---
 docs/how/updating-datahub.md | 1 +
 .../src/datahub/ingestion/source/dbt/dbt_cloud.py | 13 ++++++++++---
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/docs/how/updating-datahub.md b/docs/how/updating-datahub.md
index a742ebe0cd8968..d6620fde0bf794 100644
--- a/docs/how/updating-datahub.md
+++ b/docs/how/updating-datahub.md
@@ -42,6 +42,7 @@ This file documents any backwards-incompatible changes in DataHub and assists pe
 - #12077: `Kafka` source no longer ingests schemas from schema registry as separate entities by default, set `ingest_schemas_as_entities` to `true` to ingest them
 - OpenAPI Update: PIT Keep Alive parameter added to scroll. NOTE: This parameter requires the `pointInTimeCreationEnabled` feature flag to be enabled and the `elasticSearch.implementation` configuration to be `elasticsearch`. This feature is not supported for OpenSearch at this time and the parameter will not be respected without both of these set.
 - OpenAPI Update 2: Previously there was an incorrectly marked parameter named `sort` on the generic list entities endpoint for v3. This parameter is deprecated and only supports a single string value while the documentation indicates it supports a list of strings. This documentation error has been fixed and the correct field, `sortCriteria`, is now documented which supports a list of strings.
+- #12223: For dbt Cloud ingestion, the "View in dbt" link will point at the "Explore" page in the dbt Cloud UI. You can revert to the old behavior of linking to the dbt Cloud IDE by setting `external_url_mode: ide`.
 
 ### Breaking Changes
 
diff --git a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_cloud.py b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_cloud.py
index 66c5ef7179af41..5042f6d69b261a 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_cloud.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_cloud.py
@@ -1,7 +1,7 @@
 import logging
 from datetime import datetime
 from json import JSONDecodeError
-from typing import Dict, List, Optional, Tuple
+from typing import Dict, List, Literal, Optional, Tuple
 from urllib.parse import urlparse
 
 import dateutil.parser
@@ -62,6 +62,11 @@ class DBTCloudConfig(DBTCommonConfig):
         description="The ID of the run to ingest metadata from. If not specified, we'll default to the latest run.",
     )
 
+    external_url_mode: Literal["explore", "ide"] = Field(
+        default="explore",
+        description='Where should the "View in dbt" link point to - either the "Explore" UI or the dbt Cloud IDE',
+    )
+
     @root_validator(pre=True)
     def set_metadata_endpoint(cls, values: dict) -> dict:
         if values.get("access_url") and not values.get("metadata_endpoint"):
@@ -527,5 +532,7 @@ def _parse_into_dbt_column(
         )
 
     def get_external_url(self, node: DBTNode) -> Optional[str]:
-        # TODO: Once dbt Cloud supports deep linking to specific files, we can use that.
- return f"{self.config.access_url}/develop/{self.config.account_id}/projects/{self.config.project_id}" + if self.config.external_url_mode == "explore": + return f"{self.config.access_url}/explore/{self.config.account_id}/projects/{self.config.project_id}/environments/production/details/{node.dbt_name}" + else: + return f"{self.config.access_url}/develop/{self.config.account_id}/projects/{self.config.project_id}" From 3ca8d09100eb649a5f191a32f2af8d300424818b Mon Sep 17 00:00:00 2001 From: Mayuri Nehate <33225191+mayurinehate@users.noreply.github.com> Date: Fri, 27 Dec 2024 11:40:00 +0530 Subject: [PATCH 21/28] feat(ingest/snowflake): support email_as_user_identifier for queries v2 (#12219) --- .../source/snowflake/snowflake_config.py | 19 ++++--- .../source/snowflake/snowflake_queries.py | 45 ++++++++++++--- .../source/snowflake/snowflake_query.py | 6 +- .../source/snowflake/snowflake_usage_v2.py | 7 +-- .../source/snowflake/snowflake_utils.py | 40 ++++++++------ .../snowflake/test_snowflake_queries.py | 55 +++++++++++++++++++ 6 files changed, 132 insertions(+), 40 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py index 2b2dcf860cdb07..12e5fb72b00de8 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py @@ -138,12 +138,20 @@ class SnowflakeIdentifierConfig( description="Whether to convert dataset urns to lowercase.", ) - -class SnowflakeUsageConfig(BaseUsageConfig): email_domain: Optional[str] = pydantic.Field( default=None, description="Email domain of your organization so users can be displayed on UI appropriately.", ) + + email_as_user_identifier: bool = Field( + default=True, + description="Format user urns as an email, if the snowflake user's email is set. If `email_domain` is " + "provided, generates email addresses for snowflake users with unset emails, based on their " + "username.", + ) + + +class SnowflakeUsageConfig(BaseUsageConfig): apply_view_usage_to_tables: bool = pydantic.Field( default=False, description="Whether to apply view's usage to its base tables. If set to True, usage is applied to base tables only.", @@ -267,13 +275,6 @@ class SnowflakeV2Config( " Map of share name -> details of share.", ) - email_as_user_identifier: bool = Field( - default=True, - description="Format user urns as an email, if the snowflake user's email is set. If `email_domain` is " - "provided, generates email addresses for snowflake users with unset emails, based on their " - "username.", - ) - include_assertion_results: bool = Field( default=False, description="Whether to ingest assertion run results for assertions created using Datahub" diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py index 174aad0bddd4a8..36825dc33fe7dc 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py @@ -66,6 +66,11 @@ logger = logging.getLogger(__name__) +# Define a type alias +UserName = str +UserEmail = str +UsersMapping = Dict[UserName, UserEmail] + class SnowflakeQueriesExtractorConfig(ConfigModel): # TODO: Support stateful ingestion for the time windows. 
@@ -114,11 +119,13 @@ class SnowflakeQueriesSourceConfig( class SnowflakeQueriesExtractorReport(Report): copy_history_fetch_timer: PerfTimer = dataclasses.field(default_factory=PerfTimer) query_log_fetch_timer: PerfTimer = dataclasses.field(default_factory=PerfTimer) + users_fetch_timer: PerfTimer = dataclasses.field(default_factory=PerfTimer) audit_log_load_timer: PerfTimer = dataclasses.field(default_factory=PerfTimer) sql_aggregator: Optional[SqlAggregatorReport] = None num_ddl_queries_dropped: int = 0 + num_users: int = 0 @dataclass @@ -225,6 +232,9 @@ def is_allowed_table(self, name: str) -> bool: def get_workunits_internal( self, ) -> Iterable[MetadataWorkUnit]: + with self.report.users_fetch_timer: + users = self.fetch_users() + # TODO: Add some logic to check if the cached audit log is stale or not. audit_log_file = self.local_temp_path / "audit_log.sqlite" use_cached_audit_log = audit_log_file.exists() @@ -248,7 +258,7 @@ def get_workunits_internal( queries.append(entry) with self.report.query_log_fetch_timer: - for entry in self.fetch_query_log(): + for entry in self.fetch_query_log(users): queries.append(entry) with self.report.audit_log_load_timer: @@ -263,6 +273,25 @@ def get_workunits_internal( shared_connection.close() audit_log_file.unlink(missing_ok=True) + def fetch_users(self) -> UsersMapping: + users: UsersMapping = dict() + with self.structured_reporter.report_exc("Error fetching users from Snowflake"): + logger.info("Fetching users from Snowflake") + query = SnowflakeQuery.get_all_users() + resp = self.connection.query(query) + + for row in resp: + try: + users[row["NAME"]] = row["EMAIL"] + self.report.num_users += 1 + except Exception as e: + self.structured_reporter.warning( + "Error parsing user row", + context=f"{row}", + exc=e, + ) + return users + def fetch_copy_history(self) -> Iterable[KnownLineageMapping]: # Derived from _populate_external_lineage_from_copy_history. @@ -298,7 +327,7 @@ def fetch_copy_history(self) -> Iterable[KnownLineageMapping]: yield result def fetch_query_log( - self, + self, users: UsersMapping ) -> Iterable[Union[PreparsedQuery, TableRename, TableSwap]]: query_log_query = _build_enriched_query_log_query( start_time=self.config.window.start_time, @@ -319,7 +348,7 @@ def fetch_query_log( assert isinstance(row, dict) try: - entry = self._parse_audit_log_row(row) + entry = self._parse_audit_log_row(row, users) except Exception as e: self.structured_reporter.warning( "Error parsing query log row", @@ -331,7 +360,7 @@ def fetch_query_log( yield entry def _parse_audit_log_row( - self, row: Dict[str, Any] + self, row: Dict[str, Any], users: UsersMapping ) -> Optional[Union[TableRename, TableSwap, PreparsedQuery]]: json_fields = { "DIRECT_OBJECTS_ACCESSED", @@ -430,9 +459,11 @@ def _parse_audit_log_row( ) ) - # TODO: Fetch email addresses from Snowflake to map user -> email - # TODO: Support email_domain fallback for generating user urns. 
- user = CorpUserUrn(self.identifiers.snowflake_identifier(res["user_name"])) + user = CorpUserUrn( + self.identifiers.get_user_identifier( + res["user_name"], users.get(res["user_name"]) + ) + ) timestamp: datetime = res["query_start_time"] timestamp = timestamp.astimezone(timezone.utc) diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_query.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_query.py index a94b39476b2c22..40bcfb514efd23 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_query.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_query.py @@ -947,4 +947,8 @@ def dmf_assertion_results(start_time_millis: int, end_time_millis: int) -> str: AND METRIC_NAME ilike '{pattern}' escape '{escape_pattern}' ORDER BY MEASUREMENT_TIME ASC; -""" + """ + + @staticmethod + def get_all_users() -> str: + return """SELECT name as "NAME", email as "EMAIL" FROM SNOWFLAKE.ACCOUNT_USAGE.USERS""" diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_usage_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_usage_v2.py index aff15386c50833..4bdf559f293b51 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_usage_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_usage_v2.py @@ -342,10 +342,9 @@ def _map_user_counts( filtered_user_counts.append( DatasetUserUsageCounts( user=make_user_urn( - self.get_user_identifier( + self.identifiers.get_user_identifier( user_count["user_name"], user_email, - self.config.email_as_user_identifier, ) ), count=user_count["total"], @@ -453,9 +452,7 @@ def _get_operation_aspect_work_unit( reported_time: int = int(time.time() * 1000) last_updated_timestamp: int = int(start_time.timestamp() * 1000) user_urn = make_user_urn( - self.get_user_identifier( - user_name, user_email, self.config.email_as_user_identifier - ) + self.identifiers.get_user_identifier(user_name, user_email) ) # NOTE: In earlier `snowflake-usage` connector this was base_objects_accessed, which is incorrect diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_utils.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_utils.py index 8e0c97aa135e84..885bee1ccdb908 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_utils.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_utils.py @@ -300,6 +300,28 @@ def get_quoted_identifier_for_schema(db_name, schema_name): def get_quoted_identifier_for_table(db_name, schema_name, table_name): return f'"{db_name}"."{schema_name}"."{table_name}"' + # Note - decide how to construct user urns. + # Historically urns were created using part before @ from user's email. + # Users without email were skipped from both user entries as well as aggregates. + # However email is not mandatory field in snowflake user, user_name is always present. 
+ def get_user_identifier( + self, + user_name: str, + user_email: Optional[str], + ) -> str: + if user_email: + return self.snowflake_identifier( + user_email + if self.identifier_config.email_as_user_identifier is True + else user_email.split("@")[0] + ) + return self.snowflake_identifier( + f"{user_name}@{self.identifier_config.email_domain}" + if self.identifier_config.email_as_user_identifier is True + and self.identifier_config.email_domain is not None + else user_name + ) + class SnowflakeCommonMixin(SnowflakeStructuredReportMixin): platform = "snowflake" @@ -315,24 +337,6 @@ def structured_reporter(self) -> SourceReport: def identifiers(self) -> SnowflakeIdentifierBuilder: return SnowflakeIdentifierBuilder(self.config, self.report) - # Note - decide how to construct user urns. - # Historically urns were created using part before @ from user's email. - # Users without email were skipped from both user entries as well as aggregates. - # However email is not mandatory field in snowflake user, user_name is always present. - def get_user_identifier( - self, - user_name: str, - user_email: Optional[str], - email_as_user_identifier: bool, - ) -> str: - if user_email: - return self.identifiers.snowflake_identifier( - user_email - if email_as_user_identifier is True - else user_email.split("@")[0] - ) - return self.identifiers.snowflake_identifier(user_name) - # TODO: Revisit this after stateful ingestion can commit checkpoint # for failures that do not affect the checkpoint # TODO: Add additional parameters to match the signature of the .warning and .failure methods diff --git a/metadata-ingestion/tests/integration/snowflake/test_snowflake_queries.py b/metadata-ingestion/tests/integration/snowflake/test_snowflake_queries.py index 82f5691bcee3de..ae0f23d93215d4 100644 --- a/metadata-ingestion/tests/integration/snowflake/test_snowflake_queries.py +++ b/metadata-ingestion/tests/integration/snowflake/test_snowflake_queries.py @@ -22,3 +22,58 @@ def test_source_close_cleans_tmp(snowflake_connect, tmp_path): # This closes QueriesExtractor which in turn closes SqlParsingAggregator source.close() assert len(os.listdir(tmp_path)) == 0 + + +@patch("snowflake.connector.connect") +def test_user_identifiers_email_as_identifier(snowflake_connect, tmp_path): + source = SnowflakeQueriesSource.create( + { + "connection": { + "account_id": "ABC12345.ap-south-1.aws", + "username": "TST_USR", + "password": "TST_PWD", + }, + "email_as_user_identifier": True, + "email_domain": "example.com", + }, + PipelineContext("run-id"), + ) + assert ( + source.identifiers.get_user_identifier("username", "username@example.com") + == "username@example.com" + ) + assert ( + source.identifiers.get_user_identifier("username", None) + == "username@example.com" + ) + + # We'd do best effort to use email as identifier, but would keep username as is, + # if email can't be formed. 
+ source.identifiers.identifier_config.email_domain = None + + assert ( + source.identifiers.get_user_identifier("username", "username@example.com") + == "username@example.com" + ) + + assert source.identifiers.get_user_identifier("username", None) == "username" + + +@patch("snowflake.connector.connect") +def test_user_identifiers_username_as_identifier(snowflake_connect, tmp_path): + source = SnowflakeQueriesSource.create( + { + "connection": { + "account_id": "ABC12345.ap-south-1.aws", + "username": "TST_USR", + "password": "TST_PWD", + }, + "email_as_user_identifier": False, + }, + PipelineContext("run-id"), + ) + assert ( + source.identifiers.get_user_identifier("username", "username@example.com") + == "username" + ) + assert source.identifiers.get_user_identifier("username", None) == "username" From 29e4528ae5117ccdb6f0685b8571c2afdcc19f57 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergio=20G=C3=B3mez=20Villamor?= Date: Fri, 27 Dec 2024 11:12:40 +0100 Subject: [PATCH 22/28] fix(tableau): retry if 502 error code (#12233) --- .../datahub/ingestion/source/tableau/tableau.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py index 508500ffe489b9..df59cae3fad232 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py +++ b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py @@ -186,6 +186,15 @@ except ImportError: REAUTHENTICATE_ERRORS = (NonXMLResponseError,) +RETRIABLE_ERROR_CODES = [ + 408, # Request Timeout + 429, # Too Many Requests + 500, # Internal Server Error + 502, # Bad Gateway + 503, # Service Unavailable + 504, # Gateway Timeout +] + logger: logging.Logger = logging.getLogger(__name__) # Replace / with | @@ -287,7 +296,7 @@ def make_tableau_client(self, site: str) -> Server: max_retries=Retry( total=self.max_retries, backoff_factor=1, - status_forcelist=[429, 500, 502, 503, 504], + status_forcelist=RETRIABLE_ERROR_CODES, ) ) server._session.mount("http://", adapter) @@ -1212,9 +1221,11 @@ def get_connection_object_page( except InternalServerError as ise: # In some cases Tableau Server returns 504 error, which is a timeout error, so it worths to retry. - if ise.code == 504: + # Extended with other retryable errors. 
+ if ise.code in RETRIABLE_ERROR_CODES: if retries_remaining <= 0: raise ise + logger.info(f"Retrying query due to error {ise.code}") return self.get_connection_object_page( query=query, connection_type=connection_type, From d7de7eb2a65385b0a4458f9c26cc8b1a42158cc1 Mon Sep 17 00:00:00 2001 From: Aseem Bansal Date: Fri, 27 Dec 2024 17:51:43 +0530 Subject: [PATCH 23/28] ci: remove qodana (#12227) --- .github/workflows/qodana-scan.yml | 23 ----------------------- 1 file changed, 23 deletions(-) delete mode 100644 .github/workflows/qodana-scan.yml diff --git a/.github/workflows/qodana-scan.yml b/.github/workflows/qodana-scan.yml deleted file mode 100644 index 750cf24ad38e57..00000000000000 --- a/.github/workflows/qodana-scan.yml +++ /dev/null @@ -1,23 +0,0 @@ -name: Qodana -on: - workflow_dispatch: - pull_request: - push: - branches: - - master - -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: true - -jobs: - qodana: - runs-on: ubuntu-latest - steps: - - uses: acryldata/sane-checkout-action@v3 - - name: "Qodana Scan" - uses: JetBrains/qodana-action@v2022.3.4 - - uses: github/codeql-action/upload-sarif@v2 - with: - sarif_file: ${{ runner.temp }}/qodana/results/qodana.sarif.json - cache-default-branch-only: true From ac8e539457ef984cb61329a449585fa86fc5d3c1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergio=20G=C3=B3mez=20Villamor?= Date: Fri, 27 Dec 2024 16:14:32 +0100 Subject: [PATCH 24/28] chore(tableau): adjust visibility of info message (#12235) --- .../src/datahub/ingestion/source/tableau/tableau.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py index df59cae3fad232..d47e10c9eb5c62 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py +++ b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py @@ -920,10 +920,7 @@ def dataset_browse_prefix(self) -> str: return f"/{self.config.env.lower()}{self.no_env_browse_prefix}" def _re_authenticate(self) -> None: - self.report.info( - message="Re-authenticating to Tableau", - context=f"site='{self.site_content_url}'", - ) + logger.info(f"Re-authenticating to Tableau site '{self.site_content_url}'") # Sign-in again may not be enough because Tableau sometimes caches invalid sessions # so we need to recreate the Tableau Server object self.server = self.config.make_tableau_client(self.site_content_url) From ed8639e401d30b842fac66b52636f5c1ab0c71b7 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Fri, 27 Dec 2024 13:46:49 -0500 Subject: [PATCH 25/28] chore(python): test with python 3.11 (#11280) Co-authored-by: Tamas Nemeth Co-authored-by: Mayuri Nehate <33225191+mayurinehate@users.noreply.github.com> --- .github/workflows/dagster-plugin.yml | 6 +++--- .github/workflows/metadata-ingestion.yml | 4 ++-- .github/workflows/prefect-plugin.yml | 4 ++-- metadata-ingestion-modules/airflow-plugin/setup.py | 4 ---- metadata-ingestion-modules/dagster-plugin/README.md | 3 +-- metadata-ingestion-modules/dagster-plugin/setup.py | 3 --- metadata-ingestion-modules/gx-plugin/README.md | 3 +-- metadata-ingestion-modules/gx-plugin/setup.py | 3 --- metadata-ingestion-modules/prefect-plugin/README.md | 2 +- metadata-ingestion-modules/prefect-plugin/setup.py | 6 +----- metadata-ingestion/setup.py | 10 ++++------ .../src/datahub/ingestion/source/s3/source.py | 2 +- 
.../tests/integration/feast/test_feast_repository.py | 8 ++++++++ 13 files changed, 24 insertions(+), 34 deletions(-) diff --git a/.github/workflows/dagster-plugin.yml b/.github/workflows/dagster-plugin.yml index d8a9cd7bfd6a35..ae9a0b1605cdf3 100644 --- a/.github/workflows/dagster-plugin.yml +++ b/.github/workflows/dagster-plugin.yml @@ -30,11 +30,11 @@ jobs: DATAHUB_TELEMETRY_ENABLED: false strategy: matrix: - python-version: ["3.9", "3.10"] + python-version: ["3.9", "3.11"] include: - python-version: "3.9" extraPythonRequirement: "dagster>=1.3.3" - - python-version: "3.10" + - python-version: "3.11" extraPythonRequirement: "dagster>=1.3.3" fail-fast: false steps: @@ -57,7 +57,7 @@ jobs: if: always() run: source metadata-ingestion-modules/dagster-plugin/venv/bin/activate && uv pip freeze - uses: actions/upload-artifact@v4 - if: ${{ always() && matrix.python-version == '3.10' && matrix.extraPythonRequirement == 'dagster>=1.3.3' }} + if: ${{ always() && matrix.python-version == '3.11' && matrix.extraPythonRequirement == 'dagster>=1.3.3' }} with: name: Test Results (dagster Plugin ${{ matrix.python-version}}) path: | diff --git a/.github/workflows/metadata-ingestion.yml b/.github/workflows/metadata-ingestion.yml index ad00c6d1551d1d..106cba1473982e 100644 --- a/.github/workflows/metadata-ingestion.yml +++ b/.github/workflows/metadata-ingestion.yml @@ -33,7 +33,7 @@ jobs: # DATAHUB_LOOKML_GIT_TEST_SSH_KEY: ${{ secrets.DATAHUB_LOOKML_GIT_TEST_SSH_KEY }} strategy: matrix: - python-version: ["3.8", "3.10"] + python-version: ["3.8", "3.11"] command: [ "testQuick", @@ -43,7 +43,7 @@ jobs: ] include: - python-version: "3.8" - - python-version: "3.10" + - python-version: "3.11" fail-fast: false steps: - name: Free up disk space diff --git a/.github/workflows/prefect-plugin.yml b/.github/workflows/prefect-plugin.yml index e4a70426f3a618..d77142a1f00ded 100644 --- a/.github/workflows/prefect-plugin.yml +++ b/.github/workflows/prefect-plugin.yml @@ -30,7 +30,7 @@ jobs: DATAHUB_TELEMETRY_ENABLED: false strategy: matrix: - python-version: ["3.8", "3.9", "3.10"] + python-version: ["3.8", "3.9", "3.10", "3.11"] fail-fast: false steps: - name: Set up JDK 17 @@ -52,7 +52,7 @@ jobs: if: always() run: source metadata-ingestion-modules/prefect-plugin/venv/bin/activate && uv pip freeze - uses: actions/upload-artifact@v4 - if: ${{ always() && matrix.python-version == '3.10'}} + if: ${{ always() && matrix.python-version == '3.11'}} with: name: Test Results (Prefect Plugin ${{ matrix.python-version}}) path: | diff --git a/metadata-ingestion-modules/airflow-plugin/setup.py b/metadata-ingestion-modules/airflow-plugin/setup.py index 3209233184d55a..2693aab0700da3 100644 --- a/metadata-ingestion-modules/airflow-plugin/setup.py +++ b/metadata-ingestion-modules/airflow-plugin/setup.py @@ -148,10 +148,6 @@ def get_long_description(): "Programming Language :: Python", "Programming Language :: Python :: 3", "Programming Language :: Python :: 3 :: Only", - "Programming Language :: Python :: 3.8", - "Programming Language :: Python :: 3.9", - "Programming Language :: Python :: 3.10", - "Programming Language :: Python :: 3.11", "Intended Audience :: Developers", "Intended Audience :: Information Technology", "Intended Audience :: System Administrators", diff --git a/metadata-ingestion-modules/dagster-plugin/README.md b/metadata-ingestion-modules/dagster-plugin/README.md index 8e1460957ed9ff..5113fc37dcc222 100644 --- a/metadata-ingestion-modules/dagster-plugin/README.md +++ b/metadata-ingestion-modules/dagster-plugin/README.md 
@@ -1,4 +1,3 @@ # Datahub Dagster Plugin -See the DataHub Dagster docs for details. - +See the [DataHub Dagster docs](https://datahubproject.io/docs/lineage/dagster/) for details. diff --git a/metadata-ingestion-modules/dagster-plugin/setup.py b/metadata-ingestion-modules/dagster-plugin/setup.py index 0e0685cb378c1b..22c15497bd8070 100644 --- a/metadata-ingestion-modules/dagster-plugin/setup.py +++ b/metadata-ingestion-modules/dagster-plugin/setup.py @@ -107,9 +107,6 @@ def get_long_description(): "Programming Language :: Python", "Programming Language :: Python :: 3", "Programming Language :: Python :: 3 :: Only", - "Programming Language :: Python :: 3.8", - "Programming Language :: Python :: 3.9", - "Programming Language :: Python :: 3.10", "Intended Audience :: Developers", "Intended Audience :: Information Technology", "Intended Audience :: System Administrators", diff --git a/metadata-ingestion-modules/gx-plugin/README.md b/metadata-ingestion-modules/gx-plugin/README.md index 1ffd87a955432d..9d50235a093d63 100644 --- a/metadata-ingestion-modules/gx-plugin/README.md +++ b/metadata-ingestion-modules/gx-plugin/README.md @@ -1,4 +1,3 @@ # Datahub GX Plugin -See the DataHub GX docs for details. - +See the [DataHub GX docs](https://datahubproject.io/docs/metadata-ingestion/integration_docs/great-expectations) for details. diff --git a/metadata-ingestion-modules/gx-plugin/setup.py b/metadata-ingestion-modules/gx-plugin/setup.py index 73d5d1a9a02f18..40afc81a98f9c8 100644 --- a/metadata-ingestion-modules/gx-plugin/setup.py +++ b/metadata-ingestion-modules/gx-plugin/setup.py @@ -118,9 +118,6 @@ def get_long_description(): "Programming Language :: Python", "Programming Language :: Python :: 3", "Programming Language :: Python :: 3 :: Only", - "Programming Language :: Python :: 3.8", - "Programming Language :: Python :: 3.9", - "Programming Language :: Python :: 3.10", "Intended Audience :: Developers", "Intended Audience :: Information Technology", "Intended Audience :: System Administrators", diff --git a/metadata-ingestion-modules/prefect-plugin/README.md b/metadata-ingestion-modules/prefect-plugin/README.md index 0896942e78ef61..f21e00b4945135 100644 --- a/metadata-ingestion-modules/prefect-plugin/README.md +++ b/metadata-ingestion-modules/prefect-plugin/README.md @@ -28,7 +28,7 @@ The `prefect-datahub` collection allows you to easily integrate DataHub's metada ## Prerequisites -- Python 3.7+ +- Python 3.8+ - Prefect 2.0.0+ and < 3.0.0+ - A running instance of DataHub diff --git a/metadata-ingestion-modules/prefect-plugin/setup.py b/metadata-ingestion-modules/prefect-plugin/setup.py index 7e56fe8b6ad114..70b0e958195645 100644 --- a/metadata-ingestion-modules/prefect-plugin/setup.py +++ b/metadata-ingestion-modules/prefect-plugin/setup.py @@ -103,10 +103,6 @@ def get_long_description(): "Programming Language :: Python", "Programming Language :: Python :: 3", "Programming Language :: Python :: 3 :: Only", - "Programming Language :: Python :: 3.7", - "Programming Language :: Python :: 3.8", - "Programming Language :: Python :: 3.9", - "Programming Language :: Python :: 3.10", "Intended Audience :: Developers", "Intended Audience :: Information Technology", "Intended Audience :: System Administrators", @@ -120,7 +116,7 @@ def get_long_description(): ], # Package info. 
zip_safe=False, - python_requires=">=3.7", + python_requires=">=3.8", package_dir={"": "src"}, packages=setuptools.find_namespace_packages(where="./src"), entry_points=entry_points, diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index c6994dd6d5aa65..986dc189cb29ba 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -298,8 +298,8 @@ } data_lake_profiling = { - "pydeequ~=1.1.0", - "pyspark~=3.3.0", + "pydeequ>=1.1.0", + "pyspark~=3.5.0", } delta_lake = { @@ -318,7 +318,7 @@ # 0.1.11 appears to have authentication issues with azure databricks # 0.22.0 has support for `include_browse` in metadata list apis "databricks-sdk>=0.30.0", - "pyspark~=3.3.0", + "pyspark~=3.5.0", "requests", # Version 2.4.0 includes sqlalchemy dialect, 2.8.0 includes some bug fixes # Version 3.0.0 required SQLAlchemy > 2.0.21 @@ -874,9 +874,6 @@ "Programming Language :: Python", "Programming Language :: Python :: 3", "Programming Language :: Python :: 3 :: Only", - "Programming Language :: Python :: 3.8", - "Programming Language :: Python :: 3.9", - "Programming Language :: Python :: 3.10", "Intended Audience :: Developers", "Intended Audience :: Information Technology", "Intended Audience :: System Administrators", @@ -917,6 +914,7 @@ "sync-file-emitter", "sql-parser", "iceberg", + "feast", } else set() ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/s3/source.py b/metadata-ingestion/src/datahub/ingestion/source/s3/source.py index 3ddf47b70cdf80..ceac9e96d1ddd0 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/s3/source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/s3/source.py @@ -225,7 +225,7 @@ def __init__(self, config: DataLakeSourceConfig, ctx: PipelineContext): self.init_spark() def init_spark(self): - os.environ.setdefault("SPARK_VERSION", "3.3") + os.environ.setdefault("SPARK_VERSION", "3.5") spark_version = os.environ["SPARK_VERSION"] # Importing here to avoid Deequ dependency for non profiling use cases diff --git a/metadata-ingestion/tests/integration/feast/test_feast_repository.py b/metadata-ingestion/tests/integration/feast/test_feast_repository.py index 7f04337145dc36..80d7c6311a9589 100644 --- a/metadata-ingestion/tests/integration/feast/test_feast_repository.py +++ b/metadata-ingestion/tests/integration/feast/test_feast_repository.py @@ -1,3 +1,6 @@ +import sys + +import pytest from freezegun import freeze_time from datahub.ingestion.run.pipeline import Pipeline @@ -6,6 +9,11 @@ FROZEN_TIME = "2020-04-14 07:00:00" +# The test is skipped for python 3.11 due to conflicting dependencies in installDev +# setup that requires pydantic < 2 for majority plugins. 
Note that the test works with +# python 3.11 if run with standalone virtual env setup with feast plugin alone using +# `pip install acryl-datahub[feast]` since it allows pydantic > 2 +@pytest.mark.skipif(sys.version_info > (3, 11), reason="Skipped on Python 3.11+") @freeze_time(FROZEN_TIME) def test_feast_repository_ingest(pytestconfig, tmp_path, mock_time): test_resources_dir = pytestconfig.rootpath / "tests/integration/feast" From d0423547ba559c6059ffc35f9ed153036bf0e45d Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Fri, 27 Dec 2024 13:50:28 -0500 Subject: [PATCH 26/28] feat(ingest): add parse_ts_millis helper (#12231) --- .../assertion_circuit_breaker.py | 9 ++--- .../src/datahub/emitter/mce_builder.py | 18 +++++++++- .../src/datahub/emitter/mcp_builder.py | 9 ++--- .../src/datahub/emitter/mcp_patch_builder.py | 4 +-- .../src/datahub/emitter/rest_emitter.py | 4 +-- .../datahub/ingestion/api/source_helpers.py | 8 ++--- .../source/bigquery_v2/bigquery_schema.py | 25 +++---------- .../source/datahub/datahub_kafka_reader.py | 3 +- .../source/sql/sql_generic_profiler.py | 11 +++--- .../ingestion/source/state/checkpoint.py | 3 +- .../datahub/ingestion/source/unity/proxy.py | 35 +++++-------------- .../src/datahub/utilities/time.py | 11 ++++-- .../dbt_enabled_with_schemas_mces_golden.json | 10 +++--- .../dbt_test_column_meta_mapping_golden.json | 10 +++--- ...test_prefer_sql_parser_lineage_golden.json | 34 +++++++++--------- ...bt_test_test_model_performance_golden.json | 34 +++++++++--------- ...th_complex_owner_patterns_mces_golden.json | 10 +++--- ...th_data_platform_instance_mces_golden.json | 10 +++--- ...h_non_incremental_lineage_mces_golden.json | 10 +++--- ..._target_platform_instance_mces_golden.json | 10 +++--- .../tests/unit/sdk/test_mce_builder.py | 17 +++++++++ .../tests/unit/serde/test_codegen.py | 6 ++-- smoke-test/smoke.sh | 2 ++ 23 files changed, 145 insertions(+), 148 deletions(-) diff --git a/metadata-ingestion/src/datahub/api/circuit_breaker/assertion_circuit_breaker.py b/metadata-ingestion/src/datahub/api/circuit_breaker/assertion_circuit_breaker.py index 9d2a65663ba37d..283cdaa8333338 100644 --- a/metadata-ingestion/src/datahub/api/circuit_breaker/assertion_circuit_breaker.py +++ b/metadata-ingestion/src/datahub/api/circuit_breaker/assertion_circuit_breaker.py @@ -1,6 +1,6 @@ import logging from dataclasses import dataclass -from datetime import datetime, timedelta +from datetime import datetime, timedelta, timezone from typing import Any, Dict, List, Optional from pydantic import Field @@ -10,6 +10,7 @@ CircuitBreakerConfig, ) from datahub.api.graphql import Assertion, Operation +from datahub.emitter.mce_builder import parse_ts_millis logger: logging.Logger = logging.getLogger(__name__) @@ -49,7 +50,7 @@ def get_last_updated(self, urn: str) -> Optional[datetime]: if not operations: return None else: - return datetime.fromtimestamp(operations[0]["lastUpdatedTimestamp"] / 1000) + return parse_ts_millis(operations[0]["lastUpdatedTimestamp"]) def _check_if_assertion_failed( self, assertions: List[Dict[str, Any]], last_updated: Optional[datetime] = None @@ -93,7 +94,7 @@ class AssertionResult: logger.info(f"Found successful assertion: {assertion_urn}") result = False if last_updated is not None: - last_run = datetime.fromtimestamp(last_assertion.time / 1000) + last_run = parse_ts_millis(last_assertion.time) if last_updated > last_run: logger.error( f"Missing assertion run for {assertion_urn}. 
The dataset was updated on {last_updated} but the last assertion run was at {last_run}" @@ -117,7 +118,7 @@ def is_circuit_breaker_active(self, urn: str) -> bool: ) if not last_updated: - last_updated = datetime.now() - self.config.time_delta + last_updated = datetime.now(tz=timezone.utc) - self.config.time_delta logger.info( f"Dataset {urn} doesn't have last updated or check_last_assertion_time is false, using calculated min assertion date {last_updated}" ) diff --git a/metadata-ingestion/src/datahub/emitter/mce_builder.py b/metadata-ingestion/src/datahub/emitter/mce_builder.py index 69946c575908b5..110624aa61cb89 100644 --- a/metadata-ingestion/src/datahub/emitter/mce_builder.py +++ b/metadata-ingestion/src/datahub/emitter/mce_builder.py @@ -6,7 +6,7 @@ import os import re import time -from datetime import datetime +from datetime import datetime, timezone from enum import Enum from typing import ( TYPE_CHECKING, @@ -103,6 +103,22 @@ def make_ts_millis(ts: Optional[datetime]) -> Optional[int]: return int(ts.timestamp() * 1000) +@overload +def parse_ts_millis(ts: float) -> datetime: + ... + + +@overload +def parse_ts_millis(ts: None) -> None: + ... + + +def parse_ts_millis(ts: Optional[float]) -> Optional[datetime]: + if ts is None: + return None + return datetime.fromtimestamp(ts / 1000, tz=timezone.utc) + + def make_data_platform_urn(platform: str) -> str: if platform.startswith("urn:li:dataPlatform:"): return platform diff --git a/metadata-ingestion/src/datahub/emitter/mcp_builder.py b/metadata-ingestion/src/datahub/emitter/mcp_builder.py index 293157f8a1ed05..c8eb62a2e1de23 100644 --- a/metadata-ingestion/src/datahub/emitter/mcp_builder.py +++ b/metadata-ingestion/src/datahub/emitter/mcp_builder.py @@ -4,8 +4,8 @@ from pydantic.main import BaseModel from datahub.cli.env_utils import get_boolean_env_variable -from datahub.emitter.enum_helpers import get_enum_options from datahub.emitter.mce_builder import ( + ALL_ENV_TYPES, Aspect, datahub_guid, make_container_urn, @@ -25,7 +25,6 @@ ContainerClass, DomainsClass, EmbedClass, - FabricTypeClass, GlobalTagsClass, MetadataChangeEventClass, OwnerClass, @@ -206,11 +205,7 @@ def gen_containers( # Extra validation on the env field. # In certain cases (mainly for backwards compatibility), the env field will actually # have a platform instance name. 
- env = ( - container_key.env - if container_key.env in get_enum_options(FabricTypeClass) - else None - ) + env = container_key.env if container_key.env in ALL_ENV_TYPES else None container_urn = container_key.as_urn() diff --git a/metadata-ingestion/src/datahub/emitter/mcp_patch_builder.py b/metadata-ingestion/src/datahub/emitter/mcp_patch_builder.py index 779b42e1e1ee99..1ed8ce1d5a6158 100644 --- a/metadata-ingestion/src/datahub/emitter/mcp_patch_builder.py +++ b/metadata-ingestion/src/datahub/emitter/mcp_patch_builder.py @@ -2,7 +2,7 @@ import time from collections import defaultdict from dataclasses import dataclass -from typing import Any, Dict, Iterable, List, Optional, Sequence, Union +from typing import Any, Dict, List, Optional, Sequence, Union from datahub.emitter.aspect import JSON_PATCH_CONTENT_TYPE from datahub.emitter.serialization_helper import pre_json_transform @@ -75,7 +75,7 @@ def _add_patch( # TODO: Validate that aspectName is a valid aspect for this entityType self.patches[aspect_name].append(_Patch(op, path, value)) - def build(self) -> Iterable[MetadataChangeProposalClass]: + def build(self) -> List[MetadataChangeProposalClass]: return [ MetadataChangeProposalClass( entityUrn=self.urn, diff --git a/metadata-ingestion/src/datahub/emitter/rest_emitter.py b/metadata-ingestion/src/datahub/emitter/rest_emitter.py index 675717b5ec4829..04242c8bf45d2b 100644 --- a/metadata-ingestion/src/datahub/emitter/rest_emitter.py +++ b/metadata-ingestion/src/datahub/emitter/rest_emitter.py @@ -3,7 +3,7 @@ import logging import os from json.decoder import JSONDecodeError -from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union +from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Sequence, Union import requests from deprecated import deprecated @@ -288,7 +288,7 @@ def emit_mcp( def emit_mcps( self, - mcps: List[Union[MetadataChangeProposal, MetadataChangeProposalWrapper]], + mcps: Sequence[Union[MetadataChangeProposal, MetadataChangeProposalWrapper]], async_flag: Optional[bool] = None, ) -> int: logger.debug("Attempting to emit batch mcps") diff --git a/metadata-ingestion/src/datahub/ingestion/api/source_helpers.py b/metadata-ingestion/src/datahub/ingestion/api/source_helpers.py index 7791ea2797be34..f3e5b1db6a1c85 100644 --- a/metadata-ingestion/src/datahub/ingestion/api/source_helpers.py +++ b/metadata-ingestion/src/datahub/ingestion/api/source_helpers.py @@ -1,5 +1,4 @@ import logging -from datetime import datetime, timezone from typing import ( TYPE_CHECKING, Dict, @@ -14,7 +13,7 @@ ) from datahub.configuration.time_window_config import BaseTimeWindowConfig -from datahub.emitter.mce_builder import make_dataplatform_instance_urn +from datahub.emitter.mce_builder import make_dataplatform_instance_urn, parse_ts_millis from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.emitter.mcp_builder import entity_supports_aspect from datahub.ingestion.api.workunit import MetadataWorkUnit @@ -479,10 +478,7 @@ def auto_empty_dataset_usage_statistics( if invalid_timestamps: logger.warning( f"Usage statistics with unexpected timestamps, bucket_duration={config.bucket_duration}:\n" - ", ".join( - str(datetime.fromtimestamp(ts / 1000, tz=timezone.utc)) - for ts in invalid_timestamps - ) + ", ".join(str(parse_ts_millis(ts)) for ts in invalid_timestamps) ) for bucket in bucket_timestamps: diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema.py 
b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema.py index 3ce34be8dc89df..cbe1f6eb978247 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema.py @@ -1,7 +1,7 @@ import logging from collections import defaultdict from dataclasses import dataclass, field -from datetime import datetime, timezone +from datetime import datetime from functools import lru_cache from typing import Any, Dict, FrozenSet, Iterable, Iterator, List, Optional @@ -15,6 +15,7 @@ TimePartitioningType, ) +from datahub.emitter.mce_builder import parse_ts_millis from datahub.ingestion.api.source import SourceReport from datahub.ingestion.source.bigquery_v2.bigquery_audit import BigqueryTableIdentifier from datahub.ingestion.source.bigquery_v2.bigquery_helper import parse_labels @@ -393,13 +394,7 @@ def _make_bigquery_table( name=table.table_name, created=table.created, table_type=table.table_type, - last_altered=( - datetime.fromtimestamp( - table.get("last_altered") / 1000, tz=timezone.utc - ) - if table.get("last_altered") is not None - else None - ), + last_altered=parse_ts_millis(table.get("last_altered")), size_in_bytes=table.get("bytes"), rows_count=table.get("row_count"), comment=table.comment, @@ -460,11 +455,7 @@ def _make_bigquery_view(view: bigquery.Row) -> BigqueryView: return BigqueryView( name=view.table_name, created=view.created, - last_altered=( - datetime.fromtimestamp(view.get("last_altered") / 1000, tz=timezone.utc) - if view.get("last_altered") is not None - else None - ), + last_altered=(parse_ts_millis(view.get("last_altered"))), comment=view.comment, view_definition=view.view_definition, materialized=view.table_type == BigqueryTableType.MATERIALIZED_VIEW, @@ -705,13 +696,7 @@ def _make_bigquery_table_snapshot(snapshot: bigquery.Row) -> BigqueryTableSnapsh return BigqueryTableSnapshot( name=snapshot.table_name, created=snapshot.created, - last_altered=( - datetime.fromtimestamp( - snapshot.get("last_altered") / 1000, tz=timezone.utc - ) - if snapshot.get("last_altered") is not None - else None - ), + last_altered=parse_ts_millis(snapshot.get("last_altered")), comment=snapshot.comment, ddl=snapshot.ddl, snapshot_time=snapshot.snapshot_time, diff --git a/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_kafka_reader.py b/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_kafka_reader.py index 56a3d55abb184f..ba073533eccfb5 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_kafka_reader.py +++ b/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_kafka_reader.py @@ -12,6 +12,7 @@ from confluent_kafka.schema_registry.avro import AvroDeserializer from datahub.configuration.kafka import KafkaConsumerConnectionConfig +from datahub.emitter.mce_builder import parse_ts_millis from datahub.ingestion.api.closeable import Closeable from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.source.datahub.config import DataHubSourceConfig @@ -92,7 +93,7 @@ def _poll_partition( if mcl.created and mcl.created.time > stop_time.timestamp() * 1000: logger.info( f"Stopped reading from kafka, reached MCL " - f"with audit stamp {datetime.fromtimestamp(mcl.created.time / 1000)}" + f"with audit stamp {parse_ts_millis(mcl.created.time)}" ) break diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_generic_profiler.py 
b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_generic_profiler.py index bd6c23cc2d4644..c91be9b494c006 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_generic_profiler.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_generic_profiler.py @@ -7,7 +7,10 @@ from sqlalchemy import create_engine, inspect from sqlalchemy.engine.reflection import Inspector -from datahub.emitter.mce_builder import make_dataset_urn_with_platform_instance +from datahub.emitter.mce_builder import ( + make_dataset_urn_with_platform_instance, + parse_ts_millis, +) from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.source.ge_data_profiler import ( @@ -245,11 +248,7 @@ def is_dataset_eligible_for_profiling( # If profiling state exists we have to carry over to the new state self.state_handler.add_to_state(dataset_urn, last_profiled) - threshold_time: Optional[datetime] = ( - datetime.fromtimestamp(last_profiled / 1000, timezone.utc) - if last_profiled - else None - ) + threshold_time: Optional[datetime] = parse_ts_millis(last_profiled) if ( not threshold_time and self.config.profiling.profile_if_updated_since_days is not None diff --git a/metadata-ingestion/src/datahub/ingestion/source/state/checkpoint.py b/metadata-ingestion/src/datahub/ingestion/source/state/checkpoint.py index 5bfd48eb754d53..2c7a4a8b6c137d 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/state/checkpoint.py +++ b/metadata-ingestion/src/datahub/ingestion/source/state/checkpoint.py @@ -12,6 +12,7 @@ import pydantic from datahub.configuration.common import ConfigModel +from datahub.emitter.mce_builder import parse_ts_millis from datahub.metadata.schema_classes import ( DatahubIngestionCheckpointClass, IngestionCheckpointStateClass, @@ -144,7 +145,7 @@ def create_from_checkpoint_aspect( ) logger.info( f"Successfully constructed last checkpoint state for job {job_name} " - f"with timestamp {datetime.fromtimestamp(checkpoint_aspect.timestampMillis/1000, tz=timezone.utc)}" + f"with timestamp {parse_ts_millis(checkpoint_aspect.timestampMillis)}" ) return checkpoint return None diff --git a/metadata-ingestion/src/datahub/ingestion/source/unity/proxy.py b/metadata-ingestion/src/datahub/ingestion/source/unity/proxy.py index 11827bace4b5a1..9b96953794dcd5 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/unity/proxy.py +++ b/metadata-ingestion/src/datahub/ingestion/source/unity/proxy.py @@ -4,7 +4,7 @@ import dataclasses import logging -from datetime import datetime, timezone +from datetime import datetime from typing import Any, Dict, Iterable, List, Optional, Union, cast from unittest.mock import patch @@ -27,6 +27,7 @@ from databricks.sdk.service.workspace import ObjectType import datahub +from datahub.emitter.mce_builder import parse_ts_millis from datahub.ingestion.source.unity.hive_metastore_proxy import HiveMetastoreProxy from datahub.ingestion.source.unity.proxy_profiling import ( UnityCatalogProxyProfilingMixin, @@ -211,16 +212,8 @@ def workspace_notebooks(self) -> Iterable[Notebook]: id=obj.object_id, path=obj.path, language=obj.language, - created_at=( - datetime.fromtimestamp(obj.created_at / 1000, tz=timezone.utc) - if obj.created_at - else None - ), - modified_at=( - datetime.fromtimestamp(obj.modified_at / 1000, tz=timezone.utc) - if obj.modified_at - else None - ), + created_at=parse_ts_millis(obj.created_at), + modified_at=parse_ts_millis(obj.modified_at), ) def query_history( @@ 
-452,17 +445,9 @@ def _create_table( properties=obj.properties or {}, owner=obj.owner, generation=obj.generation, - created_at=( - datetime.fromtimestamp(obj.created_at / 1000, tz=timezone.utc) - if obj.created_at - else None - ), + created_at=(parse_ts_millis(obj.created_at) if obj.created_at else None), created_by=obj.created_by, - updated_at=( - datetime.fromtimestamp(obj.updated_at / 1000, tz=timezone.utc) - if obj.updated_at - else None - ), + updated_at=(parse_ts_millis(obj.updated_at) if obj.updated_at else None), updated_by=obj.updated_by, table_id=obj.table_id, comment=obj.comment, @@ -500,12 +485,8 @@ def _create_query(self, info: QueryInfo) -> Optional[Query]: query_id=info.query_id, query_text=info.query_text, statement_type=info.statement_type, - start_time=datetime.fromtimestamp( - info.query_start_time_ms / 1000, tz=timezone.utc - ), - end_time=datetime.fromtimestamp( - info.query_end_time_ms / 1000, tz=timezone.utc - ), + start_time=parse_ts_millis(info.query_start_time_ms), + end_time=parse_ts_millis(info.query_end_time_ms), user_id=info.user_id, user_name=info.user_name, executed_as_user_id=info.executed_as_user_id, diff --git a/metadata-ingestion/src/datahub/utilities/time.py b/metadata-ingestion/src/datahub/utilities/time.py index 0df7afb19935f7..e8338ce068c844 100644 --- a/metadata-ingestion/src/datahub/utilities/time.py +++ b/metadata-ingestion/src/datahub/utilities/time.py @@ -1,6 +1,8 @@ import time from dataclasses import dataclass -from datetime import datetime, timezone +from datetime import datetime + +from datahub.emitter.mce_builder import make_ts_millis, parse_ts_millis def get_current_time_in_seconds() -> int: @@ -9,12 +11,15 @@ def get_current_time_in_seconds() -> int: def ts_millis_to_datetime(ts_millis: int) -> datetime: """Converts input timestamp in milliseconds to a datetime object with UTC timezone""" - return datetime.fromtimestamp(ts_millis / 1000, tz=timezone.utc) + return parse_ts_millis(ts_millis) def datetime_to_ts_millis(dt: datetime) -> int: """Converts a datetime object to timestamp in milliseconds""" - return int(round(dt.timestamp() * 1000)) + # TODO: Deprecate these helpers in favor of make_ts_millis and parse_ts_millis. + # The other ones support None with a typing overload. + # Also possibly move those helpers to this file. 
+ return make_ts_millis(dt) @dataclass diff --git a/metadata-ingestion/tests/integration/dbt/dbt_enabled_with_schemas_mces_golden.json b/metadata-ingestion/tests/integration/dbt/dbt_enabled_with_schemas_mces_golden.json index dc8c400b291574..fb25531e685265 100644 --- a/metadata-ingestion/tests/integration/dbt/dbt_enabled_with_schemas_mces_golden.json +++ b/metadata-ingestion/tests/integration/dbt/dbt_enabled_with_schemas_mces_golden.json @@ -2658,7 +2658,7 @@ "actor": "urn:li:corpuser:unknown" }, "lastModified": { - "time": 1580505371997, + "time": 1580505371996, "actor": "urn:li:corpuser:dbt_executor" }, "hash": "", @@ -2930,7 +2930,7 @@ "actor": "urn:li:corpuser:unknown" }, "lastModified": { - "time": 1582319845997, + "time": 1582319845996, "actor": "urn:li:corpuser:dbt_executor" }, "hash": "", @@ -3180,7 +3180,7 @@ "actor": "urn:li:corpuser:unknown" }, "lastModified": { - "time": 1584998318997, + "time": 1584998318996, "actor": "urn:li:corpuser:dbt_executor" }, "hash": "", @@ -3430,7 +3430,7 @@ "actor": "urn:li:corpuser:unknown" }, "lastModified": { - "time": 1588287228997, + "time": 1588287228996, "actor": "urn:li:corpuser:dbt_executor" }, "hash": "", @@ -3680,7 +3680,7 @@ "actor": "urn:li:corpuser:unknown" }, "lastModified": { - "time": 1589460269997, + "time": 1589460269996, "actor": "urn:li:corpuser:dbt_executor" }, "hash": "", diff --git a/metadata-ingestion/tests/integration/dbt/dbt_test_column_meta_mapping_golden.json b/metadata-ingestion/tests/integration/dbt/dbt_test_column_meta_mapping_golden.json index 60f5bf4fbca9a1..69c4b9cce0b17b 100644 --- a/metadata-ingestion/tests/integration/dbt/dbt_test_column_meta_mapping_golden.json +++ b/metadata-ingestion/tests/integration/dbt/dbt_test_column_meta_mapping_golden.json @@ -3024,7 +3024,7 @@ "actor": "urn:li:corpuser:unknown" }, "lastModified": { - "time": 1580505371997, + "time": 1580505371996, "actor": "urn:li:corpuser:dbt_executor" }, "hash": "", @@ -3296,7 +3296,7 @@ "actor": "urn:li:corpuser:unknown" }, "lastModified": { - "time": 1582319845997, + "time": 1582319845996, "actor": "urn:li:corpuser:dbt_executor" }, "hash": "", @@ -3546,7 +3546,7 @@ "actor": "urn:li:corpuser:unknown" }, "lastModified": { - "time": 1584998318997, + "time": 1584998318996, "actor": "urn:li:corpuser:dbt_executor" }, "hash": "", @@ -3796,7 +3796,7 @@ "actor": "urn:li:corpuser:unknown" }, "lastModified": { - "time": 1588287228997, + "time": 1588287228996, "actor": "urn:li:corpuser:dbt_executor" }, "hash": "", @@ -4046,7 +4046,7 @@ "actor": "urn:li:corpuser:unknown" }, "lastModified": { - "time": 1589460269997, + "time": 1589460269996, "actor": "urn:li:corpuser:dbt_executor" }, "hash": "", diff --git a/metadata-ingestion/tests/integration/dbt/dbt_test_prefer_sql_parser_lineage_golden.json b/metadata-ingestion/tests/integration/dbt/dbt_test_prefer_sql_parser_lineage_golden.json index 42a416473ae243..0361e899b5b390 100644 --- a/metadata-ingestion/tests/integration/dbt/dbt_test_prefer_sql_parser_lineage_golden.json +++ b/metadata-ingestion/tests/integration/dbt/dbt_test_prefer_sql_parser_lineage_golden.json @@ -564,7 +564,7 @@ "name": "just-some-random-id_urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.an-aliased-view-for-monthly-billing,PROD)", "type": "BATCH_SCHEDULED", "created": { - "time": 1663355198240, + "time": 1663355198239, "actor": "urn:li:corpuser:datahub" } } @@ -636,7 +636,7 @@ "aspectName": "dataProcessInstanceRunEvent", "aspect": { "json": { - "timestampMillis": 1663355198240, + "timestampMillis": 1663355198239, "partitionSpec": { 
"partition": "FULL_TABLE_SNAPSHOT", "type": "FULL_TABLE" @@ -657,7 +657,7 @@ "aspectName": "dataProcessInstanceRunEvent", "aspect": { "json": { - "timestampMillis": 1663355198242, + "timestampMillis": 1663355198241, "partitionSpec": { "partition": "FULL_TABLE_SNAPSHOT", "type": "FULL_TABLE" @@ -1019,7 +1019,7 @@ "name": "just-some-random-id_urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.an_aliased_view_for_payments,PROD)", "type": "BATCH_SCHEDULED", "created": { - "time": 1663355198240, + "time": 1663355198239, "actor": "urn:li:corpuser:datahub" } } @@ -1095,7 +1095,7 @@ "aspectName": "dataProcessInstanceRunEvent", "aspect": { "json": { - "timestampMillis": 1663355198240, + "timestampMillis": 1663355198239, "partitionSpec": { "partition": "FULL_TABLE_SNAPSHOT", "type": "FULL_TABLE" @@ -1116,7 +1116,7 @@ "aspectName": "dataProcessInstanceRunEvent", "aspect": { "json": { - "timestampMillis": 1663355198242, + "timestampMillis": 1663355198241, "partitionSpec": { "partition": "FULL_TABLE_SNAPSHOT", "type": "FULL_TABLE" @@ -1347,7 +1347,7 @@ "name": "just-some-random-id_urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payments_by_customer_by_month,PROD)", "type": "BATCH_SCHEDULED", "created": { - "time": 1663355198240, + "time": 1663355198239, "actor": "urn:li:corpuser:datahub" } } @@ -1418,7 +1418,7 @@ "aspectName": "dataProcessInstanceRunEvent", "aspect": { "json": { - "timestampMillis": 1663355198240, + "timestampMillis": 1663355198239, "partitionSpec": { "partition": "FULL_TABLE_SNAPSHOT", "type": "FULL_TABLE" @@ -1439,7 +1439,7 @@ "aspectName": "dataProcessInstanceRunEvent", "aspect": { "json": { - "timestampMillis": 1663355198242, + "timestampMillis": 1663355198241, "partitionSpec": { "partition": "FULL_TABLE_SNAPSHOT", "type": "FULL_TABLE" @@ -1871,7 +1871,7 @@ "name": "just-some-random-id_urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.customer_snapshot,PROD)", "type": "BATCH_SCHEDULED", "created": { - "time": 1663355198240, + "time": 1663355198239, "actor": "urn:li:corpuser:datahub" } } @@ -1942,7 +1942,7 @@ "aspectName": "dataProcessInstanceRunEvent", "aspect": { "json": { - "timestampMillis": 1663355198240, + "timestampMillis": 1663355198239, "partitionSpec": { "partition": "FULL_TABLE_SNAPSHOT", "type": "FULL_TABLE" @@ -1963,7 +1963,7 @@ "aspectName": "dataProcessInstanceRunEvent", "aspect": { "json": { - "timestampMillis": 1663355198242, + "timestampMillis": 1663355198241, "partitionSpec": { "partition": "FULL_TABLE_SNAPSHOT", "type": "FULL_TABLE" @@ -3140,7 +3140,7 @@ "actor": "urn:li:corpuser:unknown" }, "lastModified": { - "time": 1580505371997, + "time": 1580505371996, "actor": "urn:li:corpuser:dbt_executor" }, "hash": "", @@ -3341,7 +3341,7 @@ "actor": "urn:li:corpuser:unknown" }, "lastModified": { - "time": 1582319845997, + "time": 1582319845996, "actor": "urn:li:corpuser:dbt_executor" }, "hash": "", @@ -3523,7 +3523,7 @@ "actor": "urn:li:corpuser:unknown" }, "lastModified": { - "time": 1584998318997, + "time": 1584998318996, "actor": "urn:li:corpuser:dbt_executor" }, "hash": "", @@ -3705,7 +3705,7 @@ "actor": "urn:li:corpuser:unknown" }, "lastModified": { - "time": 1588287228997, + "time": 1588287228996, "actor": "urn:li:corpuser:dbt_executor" }, "hash": "", @@ -3887,7 +3887,7 @@ "actor": "urn:li:corpuser:unknown" }, "lastModified": { - "time": 1589460269997, + "time": 1589460269996, "actor": "urn:li:corpuser:dbt_executor" }, "hash": "", diff --git a/metadata-ingestion/tests/integration/dbt/dbt_test_test_model_performance_golden.json 
b/metadata-ingestion/tests/integration/dbt/dbt_test_test_model_performance_golden.json index c281ea3eed0fa0..c59620f010343d 100644 --- a/metadata-ingestion/tests/integration/dbt/dbt_test_test_model_performance_golden.json +++ b/metadata-ingestion/tests/integration/dbt/dbt_test_test_model_performance_golden.json @@ -564,7 +564,7 @@ "name": "just-some-random-id_urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.an-aliased-view-for-monthly-billing,PROD)", "type": "BATCH_SCHEDULED", "created": { - "time": 1663355198240, + "time": 1663355198239, "actor": "urn:li:corpuser:datahub" } } @@ -636,7 +636,7 @@ "aspectName": "dataProcessInstanceRunEvent", "aspect": { "json": { - "timestampMillis": 1663355198240, + "timestampMillis": 1663355198239, "partitionSpec": { "partition": "FULL_TABLE_SNAPSHOT", "type": "FULL_TABLE" @@ -657,7 +657,7 @@ "aspectName": "dataProcessInstanceRunEvent", "aspect": { "json": { - "timestampMillis": 1663355198242, + "timestampMillis": 1663355198241, "partitionSpec": { "partition": "FULL_TABLE_SNAPSHOT", "type": "FULL_TABLE" @@ -1019,7 +1019,7 @@ "name": "just-some-random-id_urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.an_aliased_view_for_payments,PROD)", "type": "BATCH_SCHEDULED", "created": { - "time": 1663355198240, + "time": 1663355198239, "actor": "urn:li:corpuser:datahub" } } @@ -1095,7 +1095,7 @@ "aspectName": "dataProcessInstanceRunEvent", "aspect": { "json": { - "timestampMillis": 1663355198240, + "timestampMillis": 1663355198239, "partitionSpec": { "partition": "FULL_TABLE_SNAPSHOT", "type": "FULL_TABLE" @@ -1116,7 +1116,7 @@ "aspectName": "dataProcessInstanceRunEvent", "aspect": { "json": { - "timestampMillis": 1663355198242, + "timestampMillis": 1663355198241, "partitionSpec": { "partition": "FULL_TABLE_SNAPSHOT", "type": "FULL_TABLE" @@ -1347,7 +1347,7 @@ "name": "just-some-random-id_urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payments_by_customer_by_month,PROD)", "type": "BATCH_SCHEDULED", "created": { - "time": 1663355198240, + "time": 1663355198239, "actor": "urn:li:corpuser:datahub" } } @@ -1418,7 +1418,7 @@ "aspectName": "dataProcessInstanceRunEvent", "aspect": { "json": { - "timestampMillis": 1663355198240, + "timestampMillis": 1663355198239, "partitionSpec": { "partition": "FULL_TABLE_SNAPSHOT", "type": "FULL_TABLE" @@ -1439,7 +1439,7 @@ "aspectName": "dataProcessInstanceRunEvent", "aspect": { "json": { - "timestampMillis": 1663355198242, + "timestampMillis": 1663355198241, "partitionSpec": { "partition": "FULL_TABLE_SNAPSHOT", "type": "FULL_TABLE" @@ -1871,7 +1871,7 @@ "name": "just-some-random-id_urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.customer_snapshot,PROD)", "type": "BATCH_SCHEDULED", "created": { - "time": 1663355198240, + "time": 1663355198239, "actor": "urn:li:corpuser:datahub" } } @@ -1942,7 +1942,7 @@ "aspectName": "dataProcessInstanceRunEvent", "aspect": { "json": { - "timestampMillis": 1663355198240, + "timestampMillis": 1663355198239, "partitionSpec": { "partition": "FULL_TABLE_SNAPSHOT", "type": "FULL_TABLE" @@ -1963,7 +1963,7 @@ "aspectName": "dataProcessInstanceRunEvent", "aspect": { "json": { - "timestampMillis": 1663355198242, + "timestampMillis": 1663355198241, "partitionSpec": { "partition": "FULL_TABLE_SNAPSHOT", "type": "FULL_TABLE" @@ -3504,7 +3504,7 @@ "actor": "urn:li:corpuser:unknown" }, "lastModified": { - "time": 1580505371997, + "time": 1580505371996, "actor": "urn:li:corpuser:dbt_executor" }, "hash": "", @@ -3773,7 +3773,7 @@ "actor": "urn:li:corpuser:unknown" }, "lastModified": { - "time": 
1582319845997, + "time": 1582319845996, "actor": "urn:li:corpuser:dbt_executor" }, "hash": "", @@ -4023,7 +4023,7 @@ "actor": "urn:li:corpuser:unknown" }, "lastModified": { - "time": 1584998318997, + "time": 1584998318996, "actor": "urn:li:corpuser:dbt_executor" }, "hash": "", @@ -4273,7 +4273,7 @@ "actor": "urn:li:corpuser:unknown" }, "lastModified": { - "time": 1588287228997, + "time": 1588287228996, "actor": "urn:li:corpuser:dbt_executor" }, "hash": "", @@ -4523,7 +4523,7 @@ "actor": "urn:li:corpuser:unknown" }, "lastModified": { - "time": 1589460269997, + "time": 1589460269996, "actor": "urn:li:corpuser:dbt_executor" }, "hash": "", diff --git a/metadata-ingestion/tests/integration/dbt/dbt_test_with_complex_owner_patterns_mces_golden.json b/metadata-ingestion/tests/integration/dbt/dbt_test_with_complex_owner_patterns_mces_golden.json index 495fa32569f569..23b5525b712d09 100644 --- a/metadata-ingestion/tests/integration/dbt/dbt_test_with_complex_owner_patterns_mces_golden.json +++ b/metadata-ingestion/tests/integration/dbt/dbt_test_with_complex_owner_patterns_mces_golden.json @@ -2598,7 +2598,7 @@ "actor": "urn:li:corpuser:unknown" }, "lastModified": { - "time": 1580505371997, + "time": 1580505371996, "actor": "urn:li:corpuser:dbt_executor" }, "hash": "", @@ -2867,7 +2867,7 @@ "actor": "urn:li:corpuser:unknown" }, "lastModified": { - "time": 1582319845997, + "time": 1582319845996, "actor": "urn:li:corpuser:dbt_executor" }, "hash": "", @@ -3117,7 +3117,7 @@ "actor": "urn:li:corpuser:unknown" }, "lastModified": { - "time": 1584998318997, + "time": 1584998318996, "actor": "urn:li:corpuser:dbt_executor" }, "hash": "", @@ -3367,7 +3367,7 @@ "actor": "urn:li:corpuser:unknown" }, "lastModified": { - "time": 1588287228997, + "time": 1588287228996, "actor": "urn:li:corpuser:dbt_executor" }, "hash": "", @@ -3617,7 +3617,7 @@ "actor": "urn:li:corpuser:unknown" }, "lastModified": { - "time": 1589460269997, + "time": 1589460269996, "actor": "urn:li:corpuser:dbt_executor" }, "hash": "", diff --git a/metadata-ingestion/tests/integration/dbt/dbt_test_with_data_platform_instance_mces_golden.json b/metadata-ingestion/tests/integration/dbt/dbt_test_with_data_platform_instance_mces_golden.json index 20b7cf4a1c26ca..da22458f5624c1 100644 --- a/metadata-ingestion/tests/integration/dbt/dbt_test_with_data_platform_instance_mces_golden.json +++ b/metadata-ingestion/tests/integration/dbt/dbt_test_with_data_platform_instance_mces_golden.json @@ -2610,7 +2610,7 @@ "actor": "urn:li:corpuser:unknown" }, "lastModified": { - "time": 1580505371997, + "time": 1580505371996, "actor": "urn:li:corpuser:dbt_executor" }, "hash": "", @@ -2880,7 +2880,7 @@ "actor": "urn:li:corpuser:unknown" }, "lastModified": { - "time": 1582319845997, + "time": 1582319845996, "actor": "urn:li:corpuser:dbt_executor" }, "hash": "", @@ -3131,7 +3131,7 @@ "actor": "urn:li:corpuser:unknown" }, "lastModified": { - "time": 1584998318997, + "time": 1584998318996, "actor": "urn:li:corpuser:dbt_executor" }, "hash": "", @@ -3382,7 +3382,7 @@ "actor": "urn:li:corpuser:unknown" }, "lastModified": { - "time": 1588287228997, + "time": 1588287228996, "actor": "urn:li:corpuser:dbt_executor" }, "hash": "", @@ -3633,7 +3633,7 @@ "actor": "urn:li:corpuser:unknown" }, "lastModified": { - "time": 1589460269997, + "time": 1589460269996, "actor": "urn:li:corpuser:dbt_executor" }, "hash": "", diff --git a/metadata-ingestion/tests/integration/dbt/dbt_test_with_non_incremental_lineage_mces_golden.json 
b/metadata-ingestion/tests/integration/dbt/dbt_test_with_non_incremental_lineage_mces_golden.json index 80ca85a5e6c61b..0b44fe77cd62ae 100644 --- a/metadata-ingestion/tests/integration/dbt/dbt_test_with_non_incremental_lineage_mces_golden.json +++ b/metadata-ingestion/tests/integration/dbt/dbt_test_with_non_incremental_lineage_mces_golden.json @@ -2599,7 +2599,7 @@ "actor": "urn:li:corpuser:unknown" }, "lastModified": { - "time": 1580505371997, + "time": 1580505371996, "actor": "urn:li:corpuser:dbt_executor" }, "hash": "", @@ -2868,7 +2868,7 @@ "actor": "urn:li:corpuser:unknown" }, "lastModified": { - "time": 1582319845997, + "time": 1582319845996, "actor": "urn:li:corpuser:dbt_executor" }, "hash": "", @@ -3118,7 +3118,7 @@ "actor": "urn:li:corpuser:unknown" }, "lastModified": { - "time": 1584998318997, + "time": 1584998318996, "actor": "urn:li:corpuser:dbt_executor" }, "hash": "", @@ -3368,7 +3368,7 @@ "actor": "urn:li:corpuser:unknown" }, "lastModified": { - "time": 1588287228997, + "time": 1588287228996, "actor": "urn:li:corpuser:dbt_executor" }, "hash": "", @@ -3618,7 +3618,7 @@ "actor": "urn:li:corpuser:unknown" }, "lastModified": { - "time": 1589460269997, + "time": 1589460269996, "actor": "urn:li:corpuser:dbt_executor" }, "hash": "", diff --git a/metadata-ingestion/tests/integration/dbt/dbt_test_with_target_platform_instance_mces_golden.json b/metadata-ingestion/tests/integration/dbt/dbt_test_with_target_platform_instance_mces_golden.json index 1e6e4d8ba94a2e..3174847dd7e7ad 100644 --- a/metadata-ingestion/tests/integration/dbt/dbt_test_with_target_platform_instance_mces_golden.json +++ b/metadata-ingestion/tests/integration/dbt/dbt_test_with_target_platform_instance_mces_golden.json @@ -2599,7 +2599,7 @@ "actor": "urn:li:corpuser:unknown" }, "lastModified": { - "time": 1580505371997, + "time": 1580505371996, "actor": "urn:li:corpuser:dbt_executor" }, "hash": "", @@ -2868,7 +2868,7 @@ "actor": "urn:li:corpuser:unknown" }, "lastModified": { - "time": 1582319845997, + "time": 1582319845996, "actor": "urn:li:corpuser:dbt_executor" }, "hash": "", @@ -3118,7 +3118,7 @@ "actor": "urn:li:corpuser:unknown" }, "lastModified": { - "time": 1584998318997, + "time": 1584998318996, "actor": "urn:li:corpuser:dbt_executor" }, "hash": "", @@ -3368,7 +3368,7 @@ "actor": "urn:li:corpuser:unknown" }, "lastModified": { - "time": 1588287228997, + "time": 1588287228996, "actor": "urn:li:corpuser:dbt_executor" }, "hash": "", @@ -3618,7 +3618,7 @@ "actor": "urn:li:corpuser:unknown" }, "lastModified": { - "time": 1589460269997, + "time": 1589460269996, "actor": "urn:li:corpuser:dbt_executor" }, "hash": "", diff --git a/metadata-ingestion/tests/unit/sdk/test_mce_builder.py b/metadata-ingestion/tests/unit/sdk/test_mce_builder.py index d7c84f7863b407..3bdbf07bf28b7d 100644 --- a/metadata-ingestion/tests/unit/sdk/test_mce_builder.py +++ b/metadata-ingestion/tests/unit/sdk/test_mce_builder.py @@ -1,3 +1,5 @@ +from datetime import datetime, timezone + import datahub.emitter.mce_builder as builder from datahub.metadata.schema_classes import ( DataFlowInfoClass, @@ -55,3 +57,18 @@ def test_make_group_urn() -> None: assert ( builder.make_group_urn("urn:li:corpuser:someUser") == "urn:li:corpuser:someUser" ) + + +def test_ts_millis() -> None: + assert builder.make_ts_millis(None) is None + assert builder.parse_ts_millis(None) is None + + assert ( + builder.make_ts_millis(datetime(2024, 1, 1, 2, 3, 4, 5, timezone.utc)) + == 1704074584000 + ) + + # We only have millisecond precision, don't support microseconds. 
+    ts = datetime.now(timezone.utc).replace(microsecond=0)
+    ts_millis = builder.make_ts_millis(ts)
+    assert builder.parse_ts_millis(ts_millis) == ts
diff --git a/metadata-ingestion/tests/unit/serde/test_codegen.py b/metadata-ingestion/tests/unit/serde/test_codegen.py
index 98d62d5643ff2d..b49f7153129136 100644
--- a/metadata-ingestion/tests/unit/serde/test_codegen.py
+++ b/metadata-ingestion/tests/unit/serde/test_codegen.py
@@ -6,11 +6,10 @@
 import pytest
 import typing_inspect
 
-from datahub.emitter.enum_helpers import get_enum_options
+from datahub.emitter.mce_builder import ALL_ENV_TYPES
 from datahub.metadata.schema_classes import (
     ASPECT_CLASSES,
     KEY_ASPECTS,
-    FabricTypeClass,
     FineGrainedLineageClass,
     MetadataChangeEventClass,
     OwnershipClass,
@@ -164,8 +163,7 @@ def _err(msg: str) -> None:
 
 def test_enum_options():
     # This is mainly a sanity check to ensure that it doesn't do anything too crazy.
-    env_options = get_enum_options(FabricTypeClass)
-    assert "PROD" in env_options
+    assert "PROD" in ALL_ENV_TYPES
 
 
 def test_urn_types() -> None:
diff --git a/smoke-test/smoke.sh b/smoke-test/smoke.sh
index ec8188ebf5f4db..1d209b4ba82195 100755
--- a/smoke-test/smoke.sh
+++ b/smoke-test/smoke.sh
@@ -22,7 +22,9 @@ else
     echo "datahub:datahub" > ~/.datahub/plugins/frontend/auth/user.props
 
     python3 -m venv venv
+    set +x
     source venv/bin/activate
+    set -x
     python -m pip install --upgrade 'uv>=0.1.10'
     uv pip install -r requirements.txt
 fi

From 4e3103e2661f3149f823d1cdda0980fffb7010d3 Mon Sep 17 00:00:00 2001
From: Harshal Sheth
Date: Fri, 27 Dec 2024 13:50:43 -0500
Subject: [PATCH 27/28] fix(ingest): use `typing_extensions.Self` (#12230)

---
 metadata-ingestion/scripts/avro_codegen.py            |  7 +++----
 metadata-ingestion/setup.py                           |  2 +-
 .../src/datahub/configuration/common.py               |  7 ++-----
 .../src/datahub/ingestion/api/closeable.py            |  6 +++---
 .../api/ingestion_job_checkpointing_provider_base.py  | 11 ++++-------
 .../src/datahub/ingestion/api/report.py               |  5 ++++-
 metadata-ingestion/src/datahub/ingestion/api/sink.py  |  7 ++++---
 .../src/datahub/utilities/urns/_urn_base.py           | 12 +++++-------
 8 files changed, 26 insertions(+), 31 deletions(-)

diff --git a/metadata-ingestion/scripts/avro_codegen.py b/metadata-ingestion/scripts/avro_codegen.py
index e5792da32fb5d7..2841985ad07808 100644
--- a/metadata-ingestion/scripts/avro_codegen.py
+++ b/metadata-ingestion/scripts/avro_codegen.py
@@ -154,7 +154,6 @@ def merge_schemas(schemas_obj: List[dict]) -> str:
 
 # Patch add_name method to NOT complain about duplicate names.
class NamesWithDups(avro.schema.Names): def add_name(self, name_attr, space_attr, new_schema): - to_add = avro.schema.Name(name_attr, space_attr, self.default_namespace) assert to_add.name assert to_add.space @@ -626,7 +625,7 @@ def generate_urn_class(entity_type: str, key_aspect: dict) -> str: class {class_name}(_SpecificUrn): ENTITY_TYPE: ClassVar[str] = "{entity_type}" - URN_PARTS: ClassVar[int] = {arg_count} + _URN_PARTS: ClassVar[int] = {arg_count} def __init__(self, {init_args}, *, _allow_coercion: bool = True) -> None: if _allow_coercion: @@ -640,8 +639,8 @@ def __init__(self, {init_args}, *, _allow_coercion: bool = True) -> None: @classmethod def _parse_ids(cls, entity_ids: List[str]) -> "{class_name}": - if len(entity_ids) != cls.URN_PARTS: - raise InvalidUrnError(f"{class_name} should have {{cls.URN_PARTS}} parts, got {{len(entity_ids)}}: {{entity_ids}}") + if len(entity_ids) != cls._URN_PARTS: + raise InvalidUrnError(f"{class_name} should have {{cls._URN_PARTS}} parts, got {{len(entity_ids)}}: {{entity_ids}}") return cls({parse_ids_mapping}, _allow_coercion=False) @classmethod diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index 986dc189cb29ba..8357262537bcf8 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -15,7 +15,7 @@ base_requirements = { # Our min version of typing_extensions is somewhat constrained by Airflow. - "typing_extensions>=3.10.0.2", + "typing_extensions>=4.2.0", # Actual dependencies. "typing-inspect", # pydantic 1.8.2 is incompatible with mypy 0.910. diff --git a/metadata-ingestion/src/datahub/configuration/common.py b/metadata-ingestion/src/datahub/configuration/common.py index 7df007e087979c..08817d9d5fdb93 100644 --- a/metadata-ingestion/src/datahub/configuration/common.py +++ b/metadata-ingestion/src/datahub/configuration/common.py @@ -10,7 +10,6 @@ List, Optional, Type, - TypeVar, Union, runtime_checkable, ) @@ -19,14 +18,12 @@ from cached_property import cached_property from pydantic import BaseModel, Extra, ValidationError from pydantic.fields import Field -from typing_extensions import Protocol +from typing_extensions import Protocol, Self from datahub.configuration._config_enum import ConfigEnum as ConfigEnum # noqa: I250 from datahub.configuration.pydantic_migration_helpers import PYDANTIC_VERSION_2 from datahub.utilities.dedup_list import deduplicate_list -_ConfigSelf = TypeVar("_ConfigSelf", bound="ConfigModel") - REDACT_KEYS = { "password", "token", @@ -109,7 +106,7 @@ def _schema_extra(schema: Dict[str, Any], model: Type["ConfigModel"]) -> None: schema_extra = _schema_extra @classmethod - def parse_obj_allow_extras(cls: Type[_ConfigSelf], obj: Any) -> _ConfigSelf: + def parse_obj_allow_extras(cls, obj: Any) -> Self: if PYDANTIC_VERSION_2: try: with unittest.mock.patch.dict( diff --git a/metadata-ingestion/src/datahub/ingestion/api/closeable.py b/metadata-ingestion/src/datahub/ingestion/api/closeable.py index 80a5008ed63683..7b8e1a36162c92 100644 --- a/metadata-ingestion/src/datahub/ingestion/api/closeable.py +++ b/metadata-ingestion/src/datahub/ingestion/api/closeable.py @@ -1,9 +1,9 @@ from abc import abstractmethod from contextlib import AbstractContextManager from types import TracebackType -from typing import Optional, Type, TypeVar +from typing import Optional, Type -_Self = TypeVar("_Self", bound="Closeable") +from typing_extensions import Self class Closeable(AbstractContextManager): @@ -11,7 +11,7 @@ class Closeable(AbstractContextManager): def close(self) -> None: pass - def 
__enter__(self: _Self) -> _Self: + def __enter__(self) -> Self: # This method is mainly required for type checking. return self diff --git a/metadata-ingestion/src/datahub/ingestion/api/ingestion_job_checkpointing_provider_base.py b/metadata-ingestion/src/datahub/ingestion/api/ingestion_job_checkpointing_provider_base.py index 3680546d307d97..c1a49ce82e6e05 100644 --- a/metadata-ingestion/src/datahub/ingestion/api/ingestion_job_checkpointing_provider_base.py +++ b/metadata-ingestion/src/datahub/ingestion/api/ingestion_job_checkpointing_provider_base.py @@ -1,6 +1,8 @@ from abc import abstractmethod from dataclasses import dataclass -from typing import Any, Dict, NewType, Optional, Type, TypeVar +from typing import Any, Dict, NewType, Optional + +from typing_extensions import Self import datahub.emitter.mce_builder as builder from datahub.configuration.common import ConfigModel @@ -17,9 +19,6 @@ class IngestionCheckpointingProviderConfig(ConfigModel): pass -_Self = TypeVar("_Self", bound="IngestionCheckpointingProviderBase") - - @dataclass() class IngestionCheckpointingProviderBase(StatefulCommittable[CheckpointJobStatesMap]): """ @@ -32,9 +31,7 @@ def __init__(self, name: str, commit_policy: CommitPolicy = CommitPolicy.ALWAYS) @classmethod @abstractmethod - def create( - cls: Type[_Self], config_dict: Dict[str, Any], ctx: PipelineContext - ) -> "_Self": + def create(cls, config_dict: Dict[str, Any], ctx: PipelineContext) -> Self: pass @abstractmethod diff --git a/metadata-ingestion/src/datahub/ingestion/api/report.py b/metadata-ingestion/src/datahub/ingestion/api/report.py index ade2832f1b669d..32810189acd00b 100644 --- a/metadata-ingestion/src/datahub/ingestion/api/report.py +++ b/metadata-ingestion/src/datahub/ingestion/api/report.py @@ -42,7 +42,10 @@ def to_pure_python_obj(some_val: Any) -> Any: return some_val.as_obj() elif isinstance(some_val, pydantic.BaseModel): return Report.to_pure_python_obj(some_val.dict()) - elif dataclasses.is_dataclass(some_val): + elif dataclasses.is_dataclass(some_val) and not isinstance(some_val, type): + # The `is_dataclass` function returns `True` for both instances and classes. + # We need an extra check to ensure an instance was passed in. 
+ # https://docs.python.org/3/library/dataclasses.html#dataclasses.is_dataclass return dataclasses.asdict(some_val) elif isinstance(some_val, list): return [Report.to_pure_python_obj(v) for v in some_val if v is not None] diff --git a/metadata-ingestion/src/datahub/ingestion/api/sink.py b/metadata-ingestion/src/datahub/ingestion/api/sink.py index 62feb7b5a02e66..655e6bb22fa8d1 100644 --- a/metadata-ingestion/src/datahub/ingestion/api/sink.py +++ b/metadata-ingestion/src/datahub/ingestion/api/sink.py @@ -3,6 +3,8 @@ from dataclasses import dataclass, field from typing import Any, Generic, Optional, Type, TypeVar, cast +from typing_extensions import Self + from datahub.configuration.common import ConfigModel from datahub.ingestion.api.closeable import Closeable from datahub.ingestion.api.common import PipelineContext, RecordEnvelope, WorkUnit @@ -79,7 +81,6 @@ def on_failure( SinkReportType = TypeVar("SinkReportType", bound=SinkReport, covariant=True) SinkConfig = TypeVar("SinkConfig", bound=ConfigModel, covariant=True) -Self = TypeVar("Self", bound="Sink") class Sink(Generic[SinkConfig, SinkReportType], Closeable, metaclass=ABCMeta): @@ -90,7 +91,7 @@ class Sink(Generic[SinkConfig, SinkReportType], Closeable, metaclass=ABCMeta): report: SinkReportType @classmethod - def get_config_class(cls: Type[Self]) -> Type[SinkConfig]: + def get_config_class(cls) -> Type[SinkConfig]: config_class = get_class_from_annotation(cls, Sink, ConfigModel) assert config_class, "Sink subclasses must define a config class" return cast(Type[SinkConfig], config_class) @@ -112,7 +113,7 @@ def __post_init__(self) -> None: pass @classmethod - def create(cls: Type[Self], config_dict: dict, ctx: PipelineContext) -> "Self": + def create(cls, config_dict: dict, ctx: PipelineContext) -> "Self": return cls(ctx, cls.get_config_class().parse_obj(config_dict)) def handle_work_unit_start(self, workunit: WorkUnit) -> None: diff --git a/metadata-ingestion/src/datahub/utilities/urns/_urn_base.py b/metadata-ingestion/src/datahub/utilities/urns/_urn_base.py index 7dadd16fb7f1c2..7996fe0d7b89b7 100644 --- a/metadata-ingestion/src/datahub/utilities/urns/_urn_base.py +++ b/metadata-ingestion/src/datahub/utilities/urns/_urn_base.py @@ -1,9 +1,10 @@ import functools import urllib.parse from abc import abstractmethod -from typing import ClassVar, Dict, List, Optional, Type, TypeVar +from typing import ClassVar, Dict, List, Optional, Type from deprecated import deprecated +from typing_extensions import Self from datahub.utilities.urns.error import InvalidUrnError @@ -42,9 +43,6 @@ def _split_entity_id(entity_id: str) -> List[str]: return parts -_UrnSelf = TypeVar("_UrnSelf", bound="Urn") - - @functools.total_ordering class Urn: """ @@ -88,7 +86,7 @@ def entity_ids(self) -> List[str]: return self._entity_ids @classmethod - def from_string(cls: Type[_UrnSelf], urn_str: str) -> "_UrnSelf": + def from_string(cls, urn_str: str) -> Self: """ Creates an Urn from its string representation. 
@@ -174,7 +172,7 @@ def __hash__(self) -> int: @classmethod @deprecated(reason="prefer .from_string") - def create_from_string(cls: Type[_UrnSelf], urn_str: str) -> "_UrnSelf": + def create_from_string(cls, urn_str: str) -> Self: return cls.from_string(urn_str) @deprecated(reason="prefer .entity_ids") @@ -270,5 +268,5 @@ def underlying_key_aspect_type(cls) -> Type: @classmethod @abstractmethod - def _parse_ids(cls: Type[_UrnSelf], entity_ids: List[str]) -> _UrnSelf: + def _parse_ids(cls, entity_ids: List[str]) -> Self: raise NotImplementedError() From 6b6d820eea3e7c1297381b2b9ad9b37e22cd9c5d Mon Sep 17 00:00:00 2001 From: deepgarg-visa <149145061+deepgarg-visa@users.noreply.github.com> Date: Sat, 28 Dec 2024 01:49:15 +0530 Subject: [PATCH 28/28] feat(businessAttribute): generate platform events on association/removal with schemaField (#12224) --- metadata-io/build.gradle | 2 +- ...hemaFieldBusinessAttributeChangeEvent.java | 38 ++++++ ...usinessAttributesChangeEventGenerator.java | 98 ++++++++++++++ ...essAttributesChangeEventGeneratorTest.java | 124 ++++++++++++++++++ .../event/EntityChangeEventGeneratorHook.java | 22 +++- .../src/main/resources/application.yaml | 1 + ...tyChangeEventGeneratorRegistryFactory.java | 2 + 7 files changed, 279 insertions(+), 8 deletions(-) create mode 100644 metadata-io/src/main/java/com/linkedin/metadata/timeline/data/dataset/schema/SchemaFieldBusinessAttributeChangeEvent.java create mode 100644 metadata-io/src/main/java/com/linkedin/metadata/timeline/eventgenerator/BusinessAttributesChangeEventGenerator.java create mode 100644 metadata-io/src/test/java/com/linkedin/metadata/timeline/eventgenerator/BusinessAttributesChangeEventGeneratorTest.java diff --git a/metadata-io/build.gradle b/metadata-io/build.gradle index 516a77d59d50bd..88bbfa2e10c4c1 100644 --- a/metadata-io/build.gradle +++ b/metadata-io/build.gradle @@ -102,7 +102,7 @@ dependencies { testImplementation(testFixtures(project(":entity-registry"))) testAnnotationProcessor externalDependency.lombok - + testImplementation project(':mock-entity-registry') constraints { implementation(externalDependency.log4jCore) { because("previous versions are vulnerable to CVE-2021-45105") diff --git a/metadata-io/src/main/java/com/linkedin/metadata/timeline/data/dataset/schema/SchemaFieldBusinessAttributeChangeEvent.java b/metadata-io/src/main/java/com/linkedin/metadata/timeline/data/dataset/schema/SchemaFieldBusinessAttributeChangeEvent.java new file mode 100644 index 00000000000000..1f1252e2085452 --- /dev/null +++ b/metadata-io/src/main/java/com/linkedin/metadata/timeline/data/dataset/schema/SchemaFieldBusinessAttributeChangeEvent.java @@ -0,0 +1,38 @@ +package com.linkedin.metadata.timeline.data.dataset.schema; + +import com.google.common.collect.ImmutableMap; +import com.linkedin.common.AuditStamp; +import com.linkedin.common.urn.Urn; +import com.linkedin.metadata.timeline.data.ChangeCategory; +import com.linkedin.metadata.timeline.data.ChangeEvent; +import com.linkedin.metadata.timeline.data.ChangeOperation; +import com.linkedin.metadata.timeline.data.SemanticChangeType; +import lombok.Builder; + +public class SchemaFieldBusinessAttributeChangeEvent extends ChangeEvent { + @Builder(builderMethodName = "schemaFieldBusinessAttributeChangeEventBuilder") + public SchemaFieldBusinessAttributeChangeEvent( + String entityUrn, + ChangeCategory category, + ChangeOperation operation, + String modifier, + AuditStamp auditStamp, + SemanticChangeType semVerChange, + String description, + Urn parentUrn, + Urn 
businessAttributeUrn, + Urn datasetUrn) { + super( + entityUrn, + category, + operation, + modifier, + ImmutableMap.of( + "parentUrn", parentUrn.toString(), + "businessAttributeUrn", businessAttributeUrn.toString(), + "datasetUrn", datasetUrn.toString()), + auditStamp, + semVerChange, + description); + } +} diff --git a/metadata-io/src/main/java/com/linkedin/metadata/timeline/eventgenerator/BusinessAttributesChangeEventGenerator.java b/metadata-io/src/main/java/com/linkedin/metadata/timeline/eventgenerator/BusinessAttributesChangeEventGenerator.java new file mode 100644 index 00000000000000..69d20f2f41bd56 --- /dev/null +++ b/metadata-io/src/main/java/com/linkedin/metadata/timeline/eventgenerator/BusinessAttributesChangeEventGenerator.java @@ -0,0 +1,98 @@ +package com.linkedin.metadata.timeline.eventgenerator; + +import com.linkedin.businessattribute.BusinessAttributeAssociation; +import com.linkedin.businessattribute.BusinessAttributes; +import com.linkedin.common.AuditStamp; +import com.linkedin.common.urn.Urn; +import com.linkedin.metadata.timeline.data.ChangeCategory; +import com.linkedin.metadata.timeline.data.ChangeEvent; +import com.linkedin.metadata.timeline.data.ChangeOperation; +import com.linkedin.metadata.timeline.data.SemanticChangeType; +import com.linkedin.metadata.timeline.data.dataset.schema.SchemaFieldBusinessAttributeChangeEvent; +import java.util.ArrayList; +import java.util.List; +import java.util.Objects; +import javax.annotation.Nonnull; +import lombok.extern.slf4j.Slf4j; + +@Slf4j +public class BusinessAttributesChangeEventGenerator + extends EntityChangeEventGenerator { + + private static final String BUSINESS_ATTRIBUTE_ADDED_FORMAT = + "BusinessAttribute '%s' added to entity '%s'."; + private static final String BUSINESS_ATTRIBUTE_REMOVED_FORMAT = + "BusinessAttribute '%s' removed from entity '%s'."; + + @Override + public List getChangeEvents( + @Nonnull Urn urn, + @Nonnull String entityName, + @Nonnull String aspectName, + @Nonnull Aspect from, + @Nonnull Aspect to, + @Nonnull AuditStamp auditStamp) { + log.debug( + "Calling BusinessAttributesChangeEventGenerator for entity {} and aspect {}", + entityName, + aspectName); + return computeDiff(urn, entityName, aspectName, from.getValue(), to.getValue(), auditStamp); + } + + private List computeDiff( + Urn urn, + String entityName, + String aspectName, + BusinessAttributes previousValue, + BusinessAttributes newValue, + AuditStamp auditStamp) { + List changeEvents = new ArrayList<>(); + + BusinessAttributeAssociation previousAssociation = + previousValue != null ? previousValue.getBusinessAttribute() : null; + BusinessAttributeAssociation newAssociation = + newValue != null ? 
newValue.getBusinessAttribute() : null; + + if (Objects.nonNull(previousAssociation) && Objects.isNull(newAssociation)) { + changeEvents.add( + createChangeEvent( + previousAssociation, + urn, + ChangeOperation.REMOVE, + BUSINESS_ATTRIBUTE_REMOVED_FORMAT, + auditStamp)); + + } else if (Objects.isNull(previousAssociation) && Objects.nonNull(newAssociation)) { + changeEvents.add( + createChangeEvent( + newAssociation, + urn, + ChangeOperation.ADD, + BUSINESS_ATTRIBUTE_ADDED_FORMAT, + auditStamp)); + } + return changeEvents; + } + + private ChangeEvent createChangeEvent( + BusinessAttributeAssociation businessAttributeAssociation, + Urn entityUrn, + ChangeOperation changeOperation, + String format, + AuditStamp auditStamp) { + return SchemaFieldBusinessAttributeChangeEvent.schemaFieldBusinessAttributeChangeEventBuilder() + .entityUrn(entityUrn.toString()) + .category(ChangeCategory.BUSINESS_ATTRIBUTE) + .operation(changeOperation) + .modifier(businessAttributeAssociation.getBusinessAttributeUrn().toString()) + .auditStamp(auditStamp) + .semVerChange(SemanticChangeType.MINOR) + .description( + String.format( + format, businessAttributeAssociation.getBusinessAttributeUrn().getId(), entityUrn)) + .parentUrn(entityUrn) + .businessAttributeUrn(businessAttributeAssociation.getBusinessAttributeUrn()) + .datasetUrn(entityUrn.getIdAsUrn()) + .build(); + } +} diff --git a/metadata-io/src/test/java/com/linkedin/metadata/timeline/eventgenerator/BusinessAttributesChangeEventGeneratorTest.java b/metadata-io/src/test/java/com/linkedin/metadata/timeline/eventgenerator/BusinessAttributesChangeEventGeneratorTest.java new file mode 100644 index 00000000000000..fb4c5ca3f96881 --- /dev/null +++ b/metadata-io/src/test/java/com/linkedin/metadata/timeline/eventgenerator/BusinessAttributesChangeEventGeneratorTest.java @@ -0,0 +1,124 @@ +package com.linkedin.metadata.timeline.eventgenerator; + +import static org.testng.AssertJUnit.assertEquals; + +import com.linkedin.businessattribute.BusinessAttributeAssociation; +import com.linkedin.businessattribute.BusinessAttributes; +import com.linkedin.common.AuditStamp; +import com.linkedin.common.urn.BusinessAttributeUrn; +import com.linkedin.common.urn.Urn; +import com.linkedin.data.ByteString; +import com.linkedin.data.template.RecordTemplate; +import com.linkedin.metadata.Constants; +import com.linkedin.metadata.models.AspectSpec; +import com.linkedin.metadata.timeline.data.ChangeEvent; +import com.linkedin.metadata.timeline.data.ChangeOperation; +import com.linkedin.metadata.utils.GenericRecordUtils; +import com.linkedin.mxe.SystemMetadata; +import java.net.URISyntaxException; +import java.nio.charset.StandardCharsets; +import java.util.List; +import mock.MockEntitySpec; +import org.springframework.test.context.testng.AbstractTestNGSpringContextTests; +import org.testng.annotations.Test; + +public class BusinessAttributesChangeEventGeneratorTest extends AbstractTestNGSpringContextTests { + + private static Urn getSchemaFieldUrn() throws URISyntaxException { + return Urn.createFromString( + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hdfs,SampleHdfsDataset,PROD),user_id)"); + } + + private static final String BUSINESS_ATTRIBUTE_URN = + "urn:li:businessAttribute:cypressTestAttribute"; + + private static AuditStamp getTestAuditStamp() throws URISyntaxException { + return new AuditStamp() + .setActor(Urn.createFromString("urn:li:corpuser:__datahub_system")) + .setTime(1683829509553L); + } + + private static Aspect getBusinessAttributes( + 
BusinessAttributeAssociation association) { + return new Aspect<>( + new BusinessAttributes().setBusinessAttribute(association), new SystemMetadata()); + } + + private static Aspect getNullBusinessAttributes() { + MockEntitySpec mockEntitySpec = new MockEntitySpec("schemaField"); + BusinessAttributes businessAttributes = new BusinessAttributes(); + final AspectSpec aspectSpec = + mockEntitySpec.createAspectSpec(businessAttributes, Constants.BUSINESS_ATTRIBUTE_ASPECT); + final RecordTemplate nullAspect = + GenericRecordUtils.deserializeAspect( + ByteString.copyString("{}", StandardCharsets.UTF_8), "application/json", aspectSpec); + return new Aspect(nullAspect, new SystemMetadata()); + } + + @Test + public void testBusinessAttributeAddition() throws Exception { + BusinessAttributesChangeEventGenerator businessAttributesChangeEventGenerator = + new BusinessAttributesChangeEventGenerator(); + + Urn urn = getSchemaFieldUrn(); + String entity = "schemaField"; + String aspect = "businessAttributes"; + AuditStamp auditStamp = getTestAuditStamp(); + + Aspect from = getNullBusinessAttributes(); + Aspect to = + getBusinessAttributes( + new BusinessAttributeAssociation() + .setBusinessAttributeUrn(new BusinessAttributeUrn(BUSINESS_ATTRIBUTE_URN))); + + List actual = + businessAttributesChangeEventGenerator.getChangeEvents( + urn, entity, aspect, from, to, auditStamp); + assertEquals(1, actual.size()); + assertEquals(ChangeOperation.ADD.name(), actual.get(0).getOperation().name()); + assertEquals(getSchemaFieldUrn(), Urn.createFromString(actual.get(0).getEntityUrn())); + } + + @Test + public void testBusinessAttributeRemoval() throws Exception { + BusinessAttributesChangeEventGenerator test = new BusinessAttributesChangeEventGenerator(); + + Urn urn = getSchemaFieldUrn(); + String entity = "schemaField"; + String aspect = "businessAttributes"; + AuditStamp auditStamp = getTestAuditStamp(); + + Aspect from = + getBusinessAttributes( + new BusinessAttributeAssociation() + .setBusinessAttributeUrn(new BusinessAttributeUrn(BUSINESS_ATTRIBUTE_URN))); + Aspect to = getNullBusinessAttributes(); + + List actual = test.getChangeEvents(urn, entity, aspect, from, to, auditStamp); + assertEquals(1, actual.size()); + assertEquals(ChangeOperation.REMOVE.name(), actual.get(0).getOperation().name()); + assertEquals(getSchemaFieldUrn(), Urn.createFromString(actual.get(0).getEntityUrn())); + } + + @Test + public void testNoChange() throws Exception { + BusinessAttributesChangeEventGenerator test = new BusinessAttributesChangeEventGenerator(); + + Urn urn = getSchemaFieldUrn(); + String entity = "schemaField"; + String aspect = "businessAttributes"; + AuditStamp auditStamp = getTestAuditStamp(); + + Aspect from = + getBusinessAttributes( + new BusinessAttributeAssociation() + .setBusinessAttributeUrn(new BusinessAttributeUrn(BUSINESS_ATTRIBUTE_URN))); + Aspect to = + getBusinessAttributes( + new BusinessAttributeAssociation() + .setBusinessAttributeUrn(new BusinessAttributeUrn(BUSINESS_ATTRIBUTE_URN))); + + List actual = test.getChangeEvents(urn, entity, aspect, from, to, auditStamp); + assertEquals(0, actual.size()); + } +} diff --git a/metadata-jobs/mae-consumer/src/main/java/com/linkedin/metadata/kafka/hook/event/EntityChangeEventGeneratorHook.java b/metadata-jobs/mae-consumer/src/main/java/com/linkedin/metadata/kafka/hook/event/EntityChangeEventGeneratorHook.java index de570cc91b2fe7..17e34f151ae018 100644 --- 
a/metadata-jobs/mae-consumer/src/main/java/com/linkedin/metadata/kafka/hook/event/EntityChangeEventGeneratorHook.java +++ b/metadata-jobs/mae-consumer/src/main/java/com/linkedin/metadata/kafka/hook/event/EntityChangeEventGeneratorHook.java @@ -1,7 +1,5 @@ package com.linkedin.metadata.kafka.hook.event; -import static com.linkedin.metadata.Constants.SCHEMA_FIELD_ENTITY_NAME; - import com.google.common.annotations.VisibleForTesting; import com.google.common.collect.ImmutableSet; import com.linkedin.common.AuditStamp; @@ -27,6 +25,7 @@ import com.linkedin.platform.event.v1.Parameters; import io.datahubproject.metadata.context.OperationContext; import java.util.ArrayList; +import java.util.Collections; import java.util.List; import java.util.Objects; import java.util.Set; @@ -65,6 +64,7 @@ public class EntityChangeEventGeneratorHook implements MetadataChangeLogHook { Constants.ASSERTION_RUN_EVENT_ASPECT_NAME, Constants.DATA_PROCESS_INSTANCE_RUN_EVENT_ASPECT_NAME, Constants.BUSINESS_ATTRIBUTE_INFO_ASPECT_NAME, + Constants.BUSINESS_ATTRIBUTE_ASPECT, // Entity Lifecycle Event Constants.DATASET_KEY_ASPECT_NAME, @@ -83,13 +83,12 @@ public class EntityChangeEventGeneratorHook implements MetadataChangeLogHook { private static final Set SUPPORTED_OPERATIONS = ImmutableSet.of("CREATE", "UPSERT", "DELETE"); - private static final Set ENTITY_EXCLUSIONS = ImmutableSet.of(SCHEMA_FIELD_ENTITY_NAME); - private final EntityChangeEventGeneratorRegistry entityChangeEventGeneratorRegistry; private final OperationContext systemOperationContext; private final SystemEntityClient systemEntityClient; private final Boolean isEnabled; @Getter private final String consumerGroupSuffix; + private final List entityExclusions; @Autowired public EntityChangeEventGeneratorHook( @@ -98,13 +97,16 @@ public EntityChangeEventGeneratorHook( final EntityChangeEventGeneratorRegistry entityChangeEventGeneratorRegistry, @Nonnull final SystemEntityClient entityClient, @Nonnull @Value("${entityChangeEvents.enabled:true}") Boolean isEnabled, - @Nonnull @Value("${entityChangeEvents.consumerGroupSuffix}") String consumerGroupSuffix) { + @Nonnull @Value("${entityChangeEvents.consumerGroupSuffix}") String consumerGroupSuffix, + @Nonnull @Value("#{'${entityChangeEvents.entityExclusions}'.split(',')}") + List entityExclusions) { this.systemOperationContext = systemOperationContext; this.entityChangeEventGeneratorRegistry = Objects.requireNonNull(entityChangeEventGeneratorRegistry); this.systemEntityClient = Objects.requireNonNull(entityClient); this.isEnabled = isEnabled; this.consumerGroupSuffix = consumerGroupSuffix; + this.entityExclusions = entityExclusions; } @VisibleForTesting @@ -113,7 +115,13 @@ public EntityChangeEventGeneratorHook( @Nonnull final EntityChangeEventGeneratorRegistry entityChangeEventGeneratorRegistry, @Nonnull final SystemEntityClient entityClient, @Nonnull Boolean isEnabled) { - this(systemOperationContext, entityChangeEventGeneratorRegistry, entityClient, isEnabled, ""); + this( + systemOperationContext, + entityChangeEventGeneratorRegistry, + entityClient, + isEnabled, + "", + Collections.emptyList()); } @Override @@ -202,7 +210,7 @@ private List generateChangeEvents( private boolean isEligibleForProcessing(final MetadataChangeLog log) { return SUPPORTED_OPERATIONS.contains(log.getChangeType().toString()) && SUPPORTED_ASPECT_NAMES.contains(log.getAspectName()) - && !ENTITY_EXCLUSIONS.contains(log.getEntityType()); + && !entityExclusions.contains(log.getEntityType()); } private void emitPlatformEvent( diff --git 
a/metadata-service/configuration/src/main/resources/application.yaml b/metadata-service/configuration/src/main/resources/application.yaml index b997bc108e4ba1..f6fa4a37fdadbc 100644 --- a/metadata-service/configuration/src/main/resources/application.yaml +++ b/metadata-service/configuration/src/main/resources/application.yaml @@ -467,6 +467,7 @@ featureFlags: entityChangeEvents: enabled: ${ENABLE_ENTITY_CHANGE_EVENTS_HOOK:true} consumerGroupSuffix: ${ECE_CONSUMER_GROUP_SUFFIX:} + entityExclusions: ${ECE_ENTITY_EXCLUSIONS:schemaField} # provides a comma separated list of entities to exclude from the ECE hook views: enabled: ${VIEWS_ENABLED:true} diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/timeline/eventgenerator/EntityChangeEventGeneratorRegistryFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/timeline/eventgenerator/EntityChangeEventGeneratorRegistryFactory.java index cd8eb4f1218db4..10770b83ad8811 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/timeline/eventgenerator/EntityChangeEventGeneratorRegistryFactory.java +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/timeline/eventgenerator/EntityChangeEventGeneratorRegistryFactory.java @@ -6,6 +6,7 @@ import com.linkedin.metadata.timeline.eventgenerator.AssertionRunEventChangeEventGenerator; import com.linkedin.metadata.timeline.eventgenerator.BusinessAttributeAssociationChangeEventGenerator; import com.linkedin.metadata.timeline.eventgenerator.BusinessAttributeInfoChangeEventGenerator; +import com.linkedin.metadata.timeline.eventgenerator.BusinessAttributesChangeEventGenerator; import com.linkedin.metadata.timeline.eventgenerator.DataProcessInstanceRunEventChangeEventGenerator; import com.linkedin.metadata.timeline.eventgenerator.DatasetPropertiesChangeEventGenerator; import com.linkedin.metadata.timeline.eventgenerator.DeprecationChangeEventGenerator; @@ -59,6 +60,7 @@ protected EntityChangeEventGeneratorRegistry entityChangeEventGeneratorRegistry( BUSINESS_ATTRIBUTE_INFO_ASPECT_NAME, new BusinessAttributeInfoChangeEventGenerator()); registry.register( BUSINESS_ATTRIBUTE_ASSOCIATION, new BusinessAttributeAssociationChangeEventGenerator()); + registry.register(BUSINESS_ATTRIBUTE_ASPECT, new BusinessAttributesChangeEventGenerator()); // Entity Lifecycle Differs registry.register(DATASET_KEY_ASPECT_NAME, new EntityKeyChangeEventGenerator<>());