From f4f9bd3bca62beb15741493b11003642cd5a6889 Mon Sep 17 00:00:00 2001 From: Mayuri Nehate <33225191+mayurinehate@users.noreply.github.com> Date: Fri, 20 Dec 2024 17:45:43 +0530 Subject: [PATCH 01/39] =?UTF-8?q?feat(ingest/snowflake):=20include=20exter?= =?UTF-8?q?nal=20table=20ddl=20lineage=20for=20queries=E2=80=A6=20(#12179)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../source/snowflake/snowflake_lineage_v2.py | 55 ++----------------- .../source/snowflake/snowflake_queries.py | 3 - .../source/snowflake/snowflake_schema_gen.py | 54 +++++++++++++++++- .../source/snowflake/snowflake_v2.py | 51 ++++++++--------- .../source_report/ingestion_stage.py | 1 + 5 files changed, 80 insertions(+), 84 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py index c769c6705ac3f..69f28a0e6e595 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py @@ -265,64 +265,17 @@ def _populate_external_upstreams(self, discovered_tables: List[str]) -> None: with PerfTimer() as timer: self.report.num_external_table_edges_scanned = 0 - for ( - known_lineage_mapping - ) in self._populate_external_lineage_from_copy_history(discovered_tables): - self.sql_aggregator.add(known_lineage_mapping) - logger.info( - "Done populating external lineage from copy history. " - f"Found {self.report.num_external_table_edges_scanned} external lineage edges so far." - ) - - for ( - known_lineage_mapping - ) in self._populate_external_lineage_from_show_query(discovered_tables): - self.sql_aggregator.add(known_lineage_mapping) - - logger.info( - "Done populating external lineage from show external tables. " - f"Found {self.report.num_external_table_edges_scanned} external lineage edges so far." - ) + for entry in self._get_copy_history_lineage(discovered_tables): + self.sql_aggregator.add(entry) + logger.info("Done populating external lineage from copy history. ") self.report.external_lineage_queries_secs = timer.elapsed_seconds() - # Handles the case for explicitly created external tables. - # NOTE: Snowflake does not log this information to the access_history table. - def _populate_external_lineage_from_show_query( - self, discovered_tables: List[str] - ) -> Iterable[KnownLineageMapping]: - external_tables_query: str = SnowflakeQuery.show_external_tables() - try: - for db_row in self.connection.query(external_tables_query): - key = self.identifiers.get_dataset_identifier( - db_row["name"], db_row["schema_name"], db_row["database_name"] - ) - - if key not in discovered_tables: - continue - if db_row["location"].startswith("s3://"): - yield KnownLineageMapping( - upstream_urn=make_s3_urn_for_lineage( - db_row["location"], self.config.env - ), - downstream_urn=self.identifiers.gen_dataset_urn(key), - ) - self.report.num_external_table_edges_scanned += 1 - - self.report.num_external_table_edges_scanned += 1 - except Exception as e: - logger.debug(e, exc_info=e) - self.structured_reporter.warning( - "Error populating external table lineage from Snowflake", - exc=e, - ) - self.report_status(EXTERNAL_LINEAGE, False) - # Handles the case where a table is populated from an external stage/s3 location via copy. # Eg: copy into category_english from @external_s3_stage; # Eg: copy into category_english from 's3://acryl-snow-demo-olist/olist_raw_data/category_english'credentials=(aws_key_id='...' aws_secret_key='...') pattern='.*.csv'; # NOTE: Snowflake does not log this information to the access_history table. - def _populate_external_lineage_from_copy_history( + def _get_copy_history_lineage( self, discovered_tables: List[str] ) -> Iterable[KnownLineageMapping]: query: str = SnowflakeQuery.copy_lineage_history( diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py index 2d2bdc50467c6..174aad0bddd4a 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py @@ -247,9 +247,6 @@ def get_workunits_internal( for entry in self.fetch_copy_history(): queries.append(entry) - # TODO: Add "show external tables" lineage to the main schema extractor. - # Because it's not a time-based thing, it doesn't really make sense in the snowflake-queries extractor. - with self.report.query_log_fetch_timer: for entry in self.fetch_query_log(): queries.append(entry) diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py index bc64693b6a108..4b72b09fafe2d 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py @@ -16,6 +16,7 @@ ClassificationHandler, classification_workunit_processor, ) +from datahub.ingestion.source.aws.s3_util import make_s3_urn_for_lineage from datahub.ingestion.source.common.subtypes import ( DatasetContainerSubTypes, DatasetSubTypes, @@ -35,6 +36,7 @@ ) from datahub.ingestion.source.snowflake.snowflake_data_reader import SnowflakeDataReader from datahub.ingestion.source.snowflake.snowflake_profiler import SnowflakeProfiler +from datahub.ingestion.source.snowflake.snowflake_query import SnowflakeQuery from datahub.ingestion.source.snowflake.snowflake_report import SnowflakeV2Report from datahub.ingestion.source.snowflake.snowflake_schema import ( SCHEMA_PARALLELISM, @@ -65,6 +67,7 @@ get_domain_wu, ) from datahub.ingestion.source_report.ingestion_stage import ( + EXTERNAL_TABLE_DDL_LINEAGE, METADATA_EXTRACTION, PROFILING, ) @@ -96,7 +99,10 @@ TimeType, ) from datahub.metadata.com.linkedin.pegasus2avro.tag import TagProperties -from datahub.sql_parsing.sql_parsing_aggregator import SqlParsingAggregator +from datahub.sql_parsing.sql_parsing_aggregator import ( + KnownLineageMapping, + SqlParsingAggregator, +) from datahub.utilities.registries.domain_registry import DomainRegistry from datahub.utilities.threaded_iterator_executor import ThreadedIteratorExecutor @@ -180,7 +186,8 @@ def __init__( # These are populated as side-effects of get_workunits_internal. self.databases: List[SnowflakeDatabase] = [] - self.aggregator: Optional[SqlParsingAggregator] = aggregator + + self.aggregator = aggregator def get_connection(self) -> SnowflakeConnection: return self.connection @@ -212,6 +219,19 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: self.report.set_ingestion_stage(snowflake_db.name, METADATA_EXTRACTION) yield from self._process_database(snowflake_db) + self.report.set_ingestion_stage("*", EXTERNAL_TABLE_DDL_LINEAGE) + discovered_tables: List[str] = [ + self.identifiers.get_dataset_identifier( + table_name, schema.name, db.name + ) + for db in self.databases + for schema in db.schemas + for table_name in schema.tables + ] + if self.aggregator: + for entry in self._external_tables_ddl_lineage(discovered_tables): + self.aggregator.add(entry) + except SnowflakePermissionError as e: self.structured_reporter.failure( GENERIC_PERMISSION_ERROR_KEY, @@ -1082,3 +1102,33 @@ def get_fk_constraints_for_table( # Access to table but none of its constraints - is this possible ? return constraints.get(table_name, []) + + # Handles the case for explicitly created external tables. + # NOTE: Snowflake does not log this information to the access_history table. + def _external_tables_ddl_lineage( + self, discovered_tables: List[str] + ) -> Iterable[KnownLineageMapping]: + external_tables_query: str = SnowflakeQuery.show_external_tables() + try: + for db_row in self.connection.query(external_tables_query): + key = self.identifiers.get_dataset_identifier( + db_row["name"], db_row["schema_name"], db_row["database_name"] + ) + + if key not in discovered_tables: + continue + if db_row["location"].startswith("s3://"): + yield KnownLineageMapping( + upstream_urn=make_s3_urn_for_lineage( + db_row["location"], self.config.env + ), + downstream_urn=self.identifiers.gen_dataset_urn(key), + ) + self.report.num_external_table_edges_scanned += 1 + + self.report.num_external_table_edges_scanned += 1 + except Exception as e: + self.structured_reporter.warning( + "External table ddl lineage extraction failed", + exc=e, + ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py index e5883dd0349a3..884e6c49f5b62 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py @@ -161,35 +161,32 @@ def __init__(self, ctx: PipelineContext, config: SnowflakeV2Config): # For database, schema, tables, views, etc self.data_dictionary = SnowflakeDataDictionary(connection=self.connection) self.lineage_extractor: Optional[SnowflakeLineageExtractor] = None - self.aggregator: Optional[SqlParsingAggregator] = None - - if self.config.use_queries_v2 or self.config.include_table_lineage: - self.aggregator = self._exit_stack.enter_context( - SqlParsingAggregator( - platform=self.identifiers.platform, - platform_instance=self.config.platform_instance, - env=self.config.env, - graph=self.ctx.graph, - eager_graph_load=( - # If we're ingestion schema metadata for tables/views, then we will populate - # schemas into the resolver as we go. We only need to do a bulk fetch - # if we're not ingesting schema metadata as part of ingestion. - not ( - self.config.include_technical_schema - and self.config.include_tables - and self.config.include_views - ) - and not self.config.lazy_schema_resolver - ), - generate_usage_statistics=False, - generate_operations=False, - format_queries=self.config.format_sql_queries, - ) + + self.aggregator: SqlParsingAggregator = self._exit_stack.enter_context( + SqlParsingAggregator( + platform=self.identifiers.platform, + platform_instance=self.config.platform_instance, + env=self.config.env, + graph=self.ctx.graph, + eager_graph_load=( + # If we're ingestion schema metadata for tables/views, then we will populate + # schemas into the resolver as we go. We only need to do a bulk fetch + # if we're not ingesting schema metadata as part of ingestion. + not ( + self.config.include_technical_schema + and self.config.include_tables + and self.config.include_views + ) + and not self.config.lazy_schema_resolver + ), + generate_usage_statistics=False, + generate_operations=False, + format_queries=self.config.format_sql_queries, ) - self.report.sql_aggregator = self.aggregator.report + ) + self.report.sql_aggregator = self.aggregator.report if self.config.include_table_lineage: - assert self.aggregator is not None redundant_lineage_run_skip_handler: Optional[ RedundantLineageRunSkipHandler ] = None @@ -487,8 +484,6 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: databases = schema_extractor.databases - # TODO: The checkpoint state for stale entity detection can be committed here. - if self.config.shares: yield from SnowflakeSharesHandler( self.config, self.report diff --git a/metadata-ingestion/src/datahub/ingestion/source_report/ingestion_stage.py b/metadata-ingestion/src/datahub/ingestion/source_report/ingestion_stage.py index 4308b405e46e3..92407eaae6e90 100644 --- a/metadata-ingestion/src/datahub/ingestion/source_report/ingestion_stage.py +++ b/metadata-ingestion/src/datahub/ingestion/source_report/ingestion_stage.py @@ -14,6 +14,7 @@ USAGE_EXTRACTION_INGESTION = "Usage Extraction Ingestion" USAGE_EXTRACTION_OPERATIONAL_STATS = "Usage Extraction Operational Stats" USAGE_EXTRACTION_USAGE_AGGREGATION = "Usage Extraction Usage Aggregation" +EXTERNAL_TABLE_DDL_LINEAGE = "External table DDL Lineage" QUERIES_EXTRACTION = "Queries Extraction" PROFILING = "Profiling" From 157013949e32dc664eb85127ca3b3c78c936e88f Mon Sep 17 00:00:00 2001 From: deepgarg-visa <149145061+deepgarg-visa@users.noreply.github.com> Date: Fri, 20 Dec 2024 21:42:10 +0530 Subject: [PATCH 02/39] fix(gms): Change names of charts in Analytics (#12192) --- .../datahub/graphql/analytics/resolver/GetChartsResolver.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/analytics/resolver/GetChartsResolver.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/analytics/resolver/GetChartsResolver.java index 197ac87c1e22d..d9b8008d46286 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/analytics/resolver/GetChartsResolver.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/analytics/resolver/GetChartsResolver.java @@ -125,7 +125,7 @@ private AnalyticsChart getTopUsersChart(OperationContext opContext) { final DateRange trailingMonthDateRange = dateUtil.getTrailingMonthDateRange(); final List columns = ImmutableList.of("Name", "Title", "Email"); - final String topUsersTitle = "Top Users"; + final String topUsersTitle = "Top Users (Last 30 Days)"; final List topUserRows = _analyticsService.getTopNTableChart( _analyticsService.getUsageIndexName(), @@ -198,7 +198,7 @@ private Row buildNewUsersRow(@Nonnull final SearchEntity entity) { private AnalyticsChart getNewUsersChart(OperationContext opContext) { try { final List columns = ImmutableList.of("Name", "Title", "Email"); - final String newUsersTitle = "New Users"; + final String newUsersTitle = "Active Users (Last 30 Days)"; final SearchResult result = searchForNewUsers(opContext); final List newUserRows = new ArrayList<>(); for (SearchEntity entity : result.getEntities()) { From e52a4deba8a6d436093257437cb3ae5d6148e4f8 Mon Sep 17 00:00:00 2001 From: skrydal Date: Fri, 20 Dec 2024 17:41:18 +0100 Subject: [PATCH 03/39] fix(ingest/databricks): Fix profiling (#12060) --- .../src/datahub/emitter/rest_emitter.py | 17 +- .../auto_ensure_aspect_size.py | 96 +++++ .../datahub/ingestion/source/unity/source.py | 4 + .../source_helpers/test_ensure_aspect_size.py | 346 ++++++++++++++++++ 4 files changed, 462 insertions(+), 1 deletion(-) create mode 100644 metadata-ingestion/src/datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py create mode 100644 metadata-ingestion/tests/unit/api/source_helpers/test_ensure_aspect_size.py diff --git a/metadata-ingestion/src/datahub/emitter/rest_emitter.py b/metadata-ingestion/src/datahub/emitter/rest_emitter.py index e2bc14925ad38..675717b5ec482 100644 --- a/metadata-ingestion/src/datahub/emitter/rest_emitter.py +++ b/metadata-ingestion/src/datahub/emitter/rest_emitter.py @@ -291,6 +291,7 @@ def emit_mcps( mcps: List[Union[MetadataChangeProposal, MetadataChangeProposalWrapper]], async_flag: Optional[bool] = None, ) -> int: + logger.debug("Attempting to emit batch mcps") url = f"{self._gms_server}/aspects?action=ingestProposalBatch" for mcp in mcps: ensure_has_system_metadata(mcp) @@ -303,15 +304,22 @@ def emit_mcps( current_chunk_size = INGEST_MAX_PAYLOAD_BYTES for mcp_obj in mcp_objs: mcp_obj_size = len(json.dumps(mcp_obj)) + logger.debug( + f"Iterating through object with size {mcp_obj_size} (type: {mcp_obj.get('aspectName')}" + ) if ( mcp_obj_size + current_chunk_size > INGEST_MAX_PAYLOAD_BYTES or len(mcp_obj_chunks[-1]) >= BATCH_INGEST_MAX_PAYLOAD_LENGTH ): + logger.debug("Decided to create new chunk") mcp_obj_chunks.append([]) current_chunk_size = 0 mcp_obj_chunks[-1].append(mcp_obj) current_chunk_size += mcp_obj_size + logger.debug( + f"Decided to send {len(mcps)} mcps in {len(mcp_obj_chunks)} chunks" + ) for mcp_obj_chunk in mcp_obj_chunks: # TODO: We're calling json.dumps on each MCP object twice, once to estimate @@ -338,8 +346,15 @@ def emit_usage(self, usageStats: UsageAggregation) -> None: def _emit_generic(self, url: str, payload: str) -> None: curl_command = make_curl_command(self._session, "POST", url, payload) + payload_size = len(payload) + if payload_size > INGEST_MAX_PAYLOAD_BYTES: + # since we know total payload size here, we could simply avoid sending such payload at all and report a warning, with current approach we are going to cause whole ingestion to fail + logger.warning( + f"Apparent payload size exceeded {INGEST_MAX_PAYLOAD_BYTES}, might fail with an exception due to the size" + ) logger.debug( - "Attempting to emit to DataHub GMS; using curl equivalent to:\n%s", + "Attempting to emit aspect (size: %s) to DataHub GMS; using curl equivalent to:\n%s", + payload_size, curl_command, ) try: diff --git a/metadata-ingestion/src/datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py b/metadata-ingestion/src/datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py new file mode 100644 index 0000000000000..559f0b77f59df --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py @@ -0,0 +1,96 @@ +import json +import logging +from typing import Iterable, List + +from datahub.emitter.rest_emitter import INGEST_MAX_PAYLOAD_BYTES +from datahub.emitter.serialization_helper import pre_json_transform +from datahub.ingestion.api.source import SourceReport +from datahub.ingestion.api.workunit import MetadataWorkUnit +from datahub.metadata.schema_classes import ( + DatasetProfileClass, + SchemaFieldClass, + SchemaMetadataClass, +) + +logger = logging.getLogger(__name__) + + +class EnsureAspectSizeProcessor: + def __init__( + self, report: SourceReport, payload_constraint: int = INGEST_MAX_PAYLOAD_BYTES + ): + self.report = report + self.payload_constraint = payload_constraint + + def ensure_dataset_profile_size( + self, dataset_urn: str, profile: DatasetProfileClass + ) -> None: + """ + This is quite arbitrary approach to ensuring dataset profile aspect does not exceed allowed size, might be adjusted + in the future + """ + sample_fields_size = 0 + if profile.fieldProfiles: + logger.debug(f"Length of field profiles: {len(profile.fieldProfiles)}") + for field in profile.fieldProfiles: + if field.sampleValues: + values_len = 0 + for value in field.sampleValues: + if value: + values_len += len(value) + logger.debug( + f"Field {field.fieldPath} has {len(field.sampleValues)} sample values, taking total bytes {values_len}" + ) + if sample_fields_size + values_len > self.payload_constraint: + field.sampleValues = [] + self.report.warning( + title="Dataset profile truncated due to size constraint", + message="Dataset profile contained too much data and would have caused ingestion to fail", + context=f"Sample values for field {field.fieldPath} were removed from dataset profile for {dataset_urn} due to aspect size constraints", + ) + else: + sample_fields_size += values_len + else: + logger.debug(f"Field {field.fieldPath} has no sample values") + + def ensure_schema_metadata_size( + self, dataset_urn: str, schema: SchemaMetadataClass + ) -> None: + """ + This is quite arbitrary approach to ensuring schema metadata aspect does not exceed allowed size, might be adjusted + in the future + """ + total_fields_size = 0 + logger.debug(f"Amount of schema fields: {len(schema.fields)}") + accepted_fields: List[SchemaFieldClass] = [] + for field in schema.fields: + field_size = len(json.dumps(pre_json_transform(field.to_obj()))) + logger.debug(f"Field {field.fieldPath} takes total {field_size}") + if total_fields_size + field_size < self.payload_constraint: + accepted_fields.append(field) + total_fields_size += field_size + else: + self.report.warning( + title="Schema truncated due to size constraint", + message="Dataset schema contained too much data and would have caused ingestion to fail", + context=f"Field {field.fieldPath} was removed from schema for {dataset_urn} due to aspect size constraints", + ) + + schema.fields = accepted_fields + + def ensure_aspect_size( + self, + stream: Iterable[MetadataWorkUnit], + ) -> Iterable[MetadataWorkUnit]: + """ + We have hard limitation of aspect size being 16 MB. Some aspects can exceed that value causing an exception + on GMS side and failure of the entire ingestion. This processor will attempt to trim suspected aspects. + """ + for wu in stream: + logger.debug(f"Ensuring size of workunit: {wu.id}") + + if schema := wu.get_aspect_of_type(SchemaMetadataClass): + self.ensure_schema_metadata_size(wu.get_urn(), schema) + elif profile := wu.get_aspect_of_type(DatasetProfileClass): + self.ensure_dataset_profile_size(wu.get_urn(), profile) + yield wu diff --git a/metadata-ingestion/src/datahub/ingestion/source/unity/source.py b/metadata-ingestion/src/datahub/ingestion/source/unity/source.py index 9d9a746580f93..7bfa7fdb28aaf 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/unity/source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/unity/source.py @@ -26,6 +26,9 @@ gen_containers, ) from datahub.emitter.sql_parsing_builder import SqlParsingBuilder +from datahub.ingestion.api.auto_work_units.auto_ensure_aspect_size import ( + EnsureAspectSizeProcessor, +) from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.api.decorators import ( SupportStatus, @@ -260,6 +263,7 @@ def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: StaleEntityRemovalHandler.create( self, self.config, self.ctx ).workunit_processor, + EnsureAspectSizeProcessor(self.get_report()).ensure_aspect_size, ] def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: diff --git a/metadata-ingestion/tests/unit/api/source_helpers/test_ensure_aspect_size.py b/metadata-ingestion/tests/unit/api/source_helpers/test_ensure_aspect_size.py new file mode 100644 index 0000000000000..bdf1e0a2e0e86 --- /dev/null +++ b/metadata-ingestion/tests/unit/api/source_helpers/test_ensure_aspect_size.py @@ -0,0 +1,346 @@ +import json +import time +from unittest.mock import patch + +import pytest +from freezegun.api import freeze_time + +from datahub.emitter.aspect import JSON_CONTENT_TYPE +from datahub.emitter.mcp import MetadataChangeProposalWrapper +from datahub.emitter.rest_emitter import INGEST_MAX_PAYLOAD_BYTES +from datahub.ingestion.api.auto_work_units.auto_ensure_aspect_size import ( + EnsureAspectSizeProcessor, +) +from datahub.ingestion.api.source import SourceReport +from datahub.ingestion.api.workunit import MetadataWorkUnit +from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent +from datahub.metadata.schema_classes import ( + ChangeTypeClass, + DatasetFieldProfileClass, + DatasetProfileClass, + DatasetSnapshotClass, + GenericAspectClass, + MetadataChangeProposalClass, + NumberTypeClass, + OtherSchemaClass, + SchemaFieldClass, + SchemaFieldDataTypeClass, + SchemaMetadataClass, + StatusClass, + StringTypeClass, + SubTypesClass, +) + + +@pytest.fixture +def processor(): + return EnsureAspectSizeProcessor(SourceReport()) + + +def too_big_schema_metadata() -> SchemaMetadataClass: + fields = [ + SchemaFieldClass( + "aaaa", + nativeDataType="int", + type=SchemaFieldDataTypeClass(type=NumberTypeClass()), + ), + SchemaFieldClass( + "bbbb", + nativeDataType="string", + type=SchemaFieldDataTypeClass(type=StringTypeClass()), + ), + SchemaFieldClass( + "cccc", + nativeDataType="int", + type=SchemaFieldDataTypeClass(type=NumberTypeClass()), + ), + ] + # simple int type field takes ~160 bytes in JSON representation, below is to assure we exceed the threshold + for f_no in range(1000): + fields.append( + SchemaFieldClass( + fieldPath=f"t{f_no}", + nativeDataType="int", + type=SchemaFieldDataTypeClass(type=NumberTypeClass()), + description=20000 * "a", + ) + ) + + # adding small field to check whether it will still be present in the output + fields.append( + SchemaFieldClass( + "dddd", + nativeDataType="int", + type=SchemaFieldDataTypeClass(type=NumberTypeClass()), + ) + ) + return SchemaMetadataClass( + schemaName="abcdef", + version=1, + platform="s3", + hash="ABCDE1234567890", + platformSchema=OtherSchemaClass(rawSchema="aaa"), + fields=fields, + ) + + +def proper_schema_metadata() -> SchemaMetadataClass: + fields = [ + SchemaFieldClass( + "aaaa", + nativeDataType="int", + type=SchemaFieldDataTypeClass(type=NumberTypeClass()), + ), + SchemaFieldClass( + "bbbb", + nativeDataType="string", + type=SchemaFieldDataTypeClass(type=StringTypeClass()), + ), + SchemaFieldClass( + "cccc", + nativeDataType="int", + type=SchemaFieldDataTypeClass(type=NumberTypeClass()), + ), + ] + return SchemaMetadataClass( + schemaName="abcdef", + version=1, + platform="s3", + hash="ABCDE1234567890", + platformSchema=OtherSchemaClass(rawSchema="aaa"), + fields=fields, + ) + + +def proper_dataset_profile() -> DatasetProfileClass: + sample_values = [ + "23483295", + "234234", + "324234", + "12123", + "3150314", + "19231", + "211", + "93498", + "12837", + "73847", + "12434", + "33466", + "98785", + "4546", + "4547", + "342", + "11", + "34", + "444", + "38576", + ] + field_profiles = [ + DatasetFieldProfileClass(fieldPath="a", sampleValues=sample_values), + DatasetFieldProfileClass(fieldPath="b", sampleValues=sample_values), + DatasetFieldProfileClass(fieldPath="c", sampleValues=sample_values), + DatasetFieldProfileClass(fieldPath="d", sampleValues=sample_values), + DatasetFieldProfileClass(fieldPath="e", sampleValues=sample_values), + DatasetFieldProfileClass(fieldPath="f", sampleValues=sample_values), + DatasetFieldProfileClass(fieldPath="g", sampleValues=sample_values), + DatasetFieldProfileClass(fieldPath="h", sampleValues=sample_values), + DatasetFieldProfileClass(fieldPath="i", sampleValues=sample_values), + DatasetFieldProfileClass(fieldPath="j", sampleValues=sample_values), + ] + return DatasetProfileClass( + timestampMillis=int(time.time()) * 1000, fieldProfiles=field_profiles + ) + + +@freeze_time("2023-01-02 00:00:00") +def test_ensure_size_of_proper_dataset_profile(processor): + profile = proper_dataset_profile() + orig_repr = json.dumps(profile.to_obj()) + processor.ensure_dataset_profile_size( + "urn:li:dataset:(s3, dummy_dataset, DEV)", profile + ) + assert orig_repr == json.dumps( + profile.to_obj() + ), "Aspect was modified in case where workunit processor should have been no-op" + + +@freeze_time("2023-01-02 00:00:00") +def test_ensure_size_of_too_big_schema_metadata(processor): + schema = too_big_schema_metadata() + assert len(schema.fields) == 1004 + + processor.ensure_schema_metadata_size( + "urn:li:dataset:(s3, dummy_dataset, DEV)", schema + ) + assert len(schema.fields) < 1004, "Schema has not been properly truncated" + assert schema.fields[-1].fieldPath == "dddd", "Small field was not added at the end" + # +100kb is completely arbitrary, but we are truncating the aspect based on schema fields size only, not total taken + # by other parameters of the aspect - it is reasonable approach though - schema fields is the only field in schema + # metadata which can be expected to grow out of control + assert ( + len(json.dumps(schema.to_obj())) < INGEST_MAX_PAYLOAD_BYTES + 100000 + ), "Aspect exceeded acceptable size" + + +@freeze_time("2023-01-02 00:00:00") +def test_ensure_size_of_proper_schema_metadata(processor): + schema = proper_schema_metadata() + orig_repr = json.dumps(schema.to_obj()) + processor.ensure_schema_metadata_size( + "urn:li:dataset:(s3, dummy_dataset, DEV)", schema + ) + assert orig_repr == json.dumps( + schema.to_obj() + ), "Aspect was modified in case where workunit processor should have been no-op" + + +@freeze_time("2023-01-02 00:00:00") +def test_ensure_size_of_too_big_dataset_profile(processor): + profile = proper_dataset_profile() + big_field = DatasetFieldProfileClass( + fieldPath="big", + sampleValues=20 * [(int(INGEST_MAX_PAYLOAD_BYTES / 20) - 10) * "a"], + ) + assert profile.fieldProfiles + profile.fieldProfiles.insert(4, big_field) + processor.ensure_dataset_profile_size( + "urn:li:dataset:(s3, dummy_dataset, DEV)", profile + ) + + expected_profile = proper_dataset_profile() + reduced_field = DatasetFieldProfileClass( + fieldPath="big", + sampleValues=[], + ) + assert expected_profile.fieldProfiles + expected_profile.fieldProfiles.insert(4, reduced_field) + assert json.dumps(profile.to_obj()) == json.dumps( + expected_profile.to_obj() + ), "Field 'big' was not properly removed from aspect due to its size" + + +@freeze_time("2023-01-02 00:00:00") +@patch( + "datahub.ingestion.api.auto_work_units.auto_ensure_aspect_size.EnsureAspectSizeProcessor.ensure_schema_metadata_size" +) +@patch( + "datahub.ingestion.api.auto_work_units.auto_ensure_aspect_size.EnsureAspectSizeProcessor.ensure_dataset_profile_size" +) +def test_wu_processor_triggered_by_data_profile_aspect( + ensure_dataset_profile_size_mock, ensure_schema_metadata_size_mock, processor +): + ret = [ # noqa: F841 + *processor.ensure_aspect_size( + [ + MetadataChangeProposalWrapper( + entityUrn="urn:li:dataset:(urn:li:dataPlatform:s3, dummy_name, DEV)", + aspect=proper_dataset_profile(), + ).as_workunit() + ] + ) + ] + ensure_dataset_profile_size_mock.assert_called_once() + ensure_schema_metadata_size_mock.assert_not_called() + + +@freeze_time("2023-01-02 00:00:00") +@patch( + "datahub.ingestion.api.auto_work_units.auto_ensure_aspect_size.EnsureAspectSizeProcessor.ensure_schema_metadata_size" +) +@patch( + "datahub.ingestion.api.auto_work_units.auto_ensure_aspect_size.EnsureAspectSizeProcessor.ensure_dataset_profile_size" +) +def test_wu_processor_triggered_by_data_profile_aspect_mcpc( + ensure_dataset_profile_size_mock, ensure_schema_metadata_size_mock, processor +): + profile_aspect = proper_dataset_profile() + mcpc = MetadataWorkUnit( + id="test", + mcp_raw=MetadataChangeProposalClass( + entityType="dataset", + changeType=ChangeTypeClass.UPSERT, + entityUrn="urn:li:dataset:(urn:li:dataPlatform:s3, dummy_name, DEV)", + aspectName=DatasetProfileClass.ASPECT_NAME, + aspect=GenericAspectClass( + value=json.dumps(profile_aspect.to_obj()).encode(), + contentType=JSON_CONTENT_TYPE, + ), + ), + ) + ret = [*processor.ensure_aspect_size([mcpc])] # noqa: F841 + ensure_dataset_profile_size_mock.assert_called_once() + ensure_schema_metadata_size_mock.assert_not_called() + + +@freeze_time("2023-01-02 00:00:00") +@patch( + "datahub.ingestion.api.auto_work_units.auto_ensure_aspect_size.EnsureAspectSizeProcessor.ensure_schema_metadata_size" +) +@patch( + "datahub.ingestion.api.auto_work_units.auto_ensure_aspect_size.EnsureAspectSizeProcessor.ensure_dataset_profile_size" +) +def test_wu_processor_triggered_by_data_profile_aspect_mce( + ensure_dataset_profile_size_mock, ensure_schema_metadata_size_mock, processor +): + snapshot = DatasetSnapshotClass( + urn="urn:li:dataset:(urn:li:dataPlatform:s3, dummy_name, DEV)", + aspects=[proper_schema_metadata()], + ) + mce = MetadataWorkUnit( + id="test", mce=MetadataChangeEvent(proposedSnapshot=snapshot) + ) + ret = [*processor.ensure_aspect_size([mce])] # noqa: F841 + ensure_schema_metadata_size_mock.assert_called_once() + ensure_dataset_profile_size_mock.assert_not_called() + + +@freeze_time("2023-01-02 00:00:00") +@patch( + "datahub.ingestion.api.auto_work_units.auto_ensure_aspect_size.EnsureAspectSizeProcessor.ensure_schema_metadata_size" +) +@patch( + "datahub.ingestion.api.auto_work_units.auto_ensure_aspect_size.EnsureAspectSizeProcessor.ensure_dataset_profile_size" +) +def test_wu_processor_triggered_by_schema_metadata_aspect( + ensure_dataset_profile_size_mock, ensure_schema_metadata_size_mock, processor +): + ret = [ # noqa: F841 + *processor.ensure_aspect_size( + [ + MetadataChangeProposalWrapper( + entityUrn="urn:li:dataset:(urn:li:dataPlatform:s3, dummy_name, DEV)", + aspect=proper_schema_metadata(), + ).as_workunit() + ] + ) + ] + ensure_schema_metadata_size_mock.assert_called_once() + ensure_dataset_profile_size_mock.assert_not_called() + + +@freeze_time("2023-01-02 00:00:00") +@patch( + "datahub.ingestion.api.auto_work_units.auto_ensure_aspect_size.EnsureAspectSizeProcessor.ensure_schema_metadata_size" +) +@patch( + "datahub.ingestion.api.auto_work_units.auto_ensure_aspect_size.EnsureAspectSizeProcessor.ensure_dataset_profile_size" +) +def test_wu_processor_not_triggered_by_unhandled_aspects( + ensure_dataset_profile_size_mock, ensure_schema_metadata_size_mock, processor +): + ret = [ # noqa: F841 + *processor.ensure_aspect_size( + [ + MetadataChangeProposalWrapper( + entityUrn="urn:li:dataset:(urn:li:dataPlatform:s3, dummy_name, DEV)", + aspect=StatusClass(removed=False), + ).as_workunit(), + MetadataChangeProposalWrapper( + entityUrn="urn:li:dataset:(urn:li:dataPlatform:s3, dummy_name, DEV)", + aspect=SubTypesClass(typeNames=["table"]), + ).as_workunit(), + ] + ) + ] + ensure_schema_metadata_size_mock.assert_not_called() + ensure_dataset_profile_size_mock.assert_not_called() From 98c056d569d4e5f2fa031a5a3ac8f3009ee49567 Mon Sep 17 00:00:00 2001 From: sid-acryl <155424659+sid-acryl@users.noreply.github.com> Date: Sat, 21 Dec 2024 00:36:57 +0530 Subject: [PATCH 04/39] refactor(ingest/tableau): mark the `fetch_size` configuration as deprecated (#12126) --- .../ingestion/source/tableau/tableau.py | 18 +++++++++++------- .../integration/tableau/test_tableau_ingest.py | 1 + 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py index fadcb8ff8f396..984cf9357199d 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py +++ b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py @@ -49,6 +49,7 @@ DatasetSourceConfigMixin, ) from datahub.configuration.validate_field_deprecation import pydantic_field_deprecated +from datahub.configuration.validate_field_removal import pydantic_removed_field from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.emitter.mcp_builder import ( ContainerKey, @@ -380,11 +381,6 @@ class TableauConfig( description="[advanced] Number of metadata objects (e.g. CustomSQLTable, PublishedDatasource, etc) to query at a time using the Tableau API.", ) - fetch_size: int = Field( - default=250, - description="Specifies the number of records to retrieve in each batch during a query execution.", - ) - # We've found that even with a small workbook page size (e.g. 10), the Tableau API often # returns warnings like this: # { @@ -499,6 +495,10 @@ class TableauConfig( "This can only be used with ingest_tags enabled as it will overwrite tags entered from the UI.", ) + _fetch_size = pydantic_removed_field( + "fetch_size", + ) + # pre = True because we want to take some decision before pydantic initialize the configuration to default values @root_validator(pre=True) def projects_backward_compatibility(cls, values: Dict) -> Dict: @@ -1147,7 +1147,7 @@ def get_connection_object_page( connection_type: str, query_filter: str, current_cursor: Optional[str], - fetch_size: int = 250, + fetch_size: int, retry_on_auth_error: bool = True, retries_remaining: Optional[int] = None, ) -> Tuple[dict, Optional[str], int]: @@ -1344,7 +1344,11 @@ def get_connection_objects( connection_type=connection_type, query_filter=filter_, current_cursor=current_cursor, - fetch_size=self.config.fetch_size, + # `filter_page` contains metadata object IDs (e.g., Project IDs, Field IDs, Sheet IDs, etc.). + # The number of IDs is always less than or equal to page_size. + # If the IDs are primary keys, the number of metadata objects to load matches the number of records to return. + # In our case, mostly, the IDs are primary key, therefore, fetch_size is set equal to page_size. + fetch_size=page_size, ) yield from connection_objects.get(c.NODES) or [] diff --git a/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py b/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py index 4b2ac96931b95..fa00eaef9ccab 100644 --- a/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py +++ b/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py @@ -1324,6 +1324,7 @@ def test_permission_warning(pytestconfig, tmp_path, mock_datahub_graph): query_filter=mock.MagicMock(), current_cursor=None, retries_remaining=1, + fetch_size=10, ) warnings = list(reporter.warnings) From 3c3d0322fe9608ccf7cbaadfd83f6f7f0e7afeff Mon Sep 17 00:00:00 2001 From: sid-acryl <155424659+sid-acryl@users.noreply.github.com> Date: Sat, 21 Dec 2024 01:27:34 +0530 Subject: [PATCH 05/39] test(ingest/tableau): add test for extract_project_hierarchy scenario (#12079) --- .../tableau/test_tableau_ingest.py | 77 +++++++++++++++++++ 1 file changed, 77 insertions(+) diff --git a/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py b/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py index fa00eaef9ccab..c3a8880bf20a0 100644 --- a/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py +++ b/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py @@ -27,6 +27,7 @@ from datahub.ingestion.source.tableau import tableau_constant as c from datahub.ingestion.source.tableau.tableau import ( TableauConfig, + TableauProject, TableauSiteSource, TableauSource, TableauSourceReport, @@ -1342,6 +1343,82 @@ def test_permission_warning(pytestconfig, tmp_path, mock_datahub_graph): @freeze_time(FROZEN_TIME) +@pytest.mark.parametrize( + "extract_project_hierarchy, allowed_projects", + [ + (True, ["project1", "project4", "project3"]), + (False, ["project1", "project4"]), + ], +) +def test_extract_project_hierarchy(extract_project_hierarchy, allowed_projects): + context = PipelineContext(run_id="0", pipeline_name="test_tableau") + + config_dict = config_source_default.copy() + + del config_dict["stateful_ingestion"] + del config_dict["projects"] + + config_dict["project_pattern"] = { + "allow": ["project1", "project4"], + "deny": ["project2"], + } + + config_dict["extract_project_hierarchy"] = extract_project_hierarchy + + config = TableauConfig.parse_obj(config_dict) + + site_source = TableauSiteSource( + config=config, + ctx=context, + platform="tableau", + site=SiteItem(name="Site 1", content_url="site1"), + site_id="site1", + report=TableauSourceReport(), + server=Server("https://test-tableau-server.com"), + ) + + all_project_map: Dict[str, TableauProject] = { + "p1": TableauProject( + id="1", + name="project1", + path=[], + parent_id=None, + parent_name=None, + description=None, + ), + "p2": TableauProject( + id="2", + name="project2", + path=[], + parent_id="1", + parent_name="project1", + description=None, + ), + "p3": TableauProject( + id="3", + name="project3", + path=[], + parent_id="1", + parent_name="project1", + description=None, + ), + "p4": TableauProject( + id="4", + name="project4", + path=[], + parent_id=None, + parent_name=None, + description=None, + ), + } + + site_source._init_tableau_project_registry(all_project_map) + + assert allowed_projects == [ + project.name for project in site_source.tableau_project_registry.values() + ] + + @pytest.mark.integration def test_connection_report_test(requests_mock): server_info_response = """ From 667fa8fccec40037c55ec1c99a35777dbc0e5eaf Mon Sep 17 00:00:00 2001 From: "nicholas.fwang" Date: Sat, 21 Dec 2024 04:59:44 +0900 Subject: [PATCH 06/39] docs(structured properties): fix entityTypes in creating structured property (#12187) --- docs/api/tutorials/structured-properties.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/api/tutorials/structured-properties.md b/docs/api/tutorials/structured-properties.md index 95c89424e9ca7..2caa015e20659 100644 --- a/docs/api/tutorials/structured-properties.md +++ b/docs/api/tutorials/structured-properties.md @@ -73,7 +73,7 @@ mutation createStructuredProperty { {numberValue: 365, description:"Use this for non-sensitive data that can be retained for longer"} ], cardinality: SINGLE, - entityTypes: ["urn:li:entityType:dataset", "urn:li:entityType:dataFlow"], + entityTypes: ["urn:li:entityType:datahub.dataset", "urn:li:entityType:datahub.dataFlow"], } ) { urn From 327c6f911ada269d8ad9554bceed8aaf16568295 Mon Sep 17 00:00:00 2001 From: david-leifker <114954101+david-leifker@users.noreply.github.com> Date: Fri, 20 Dec 2024 15:59:07 -0600 Subject: [PATCH 07/39] chore(bump): bump alpine and dockerize (#12184) --- .../docker-custom-build-and-push/action.yml | 3 +- .github/workflows/docker-postgres-setup.yml | 2 +- .github/workflows/docker-unified.yml | 46 +++++++++---------- docker/datahub-gms/Dockerfile | 4 +- docker/datahub-mae-consumer/Dockerfile | 4 +- docker/datahub-mce-consumer/Dockerfile | 4 +- docker/datahub-upgrade/Dockerfile | 4 +- docker/elasticsearch-setup/Dockerfile | 4 +- docker/mysql-setup/Dockerfile | 4 +- docker/postgres-setup/Dockerfile | 4 +- 10 files changed, 40 insertions(+), 39 deletions(-) diff --git a/.github/actions/docker-custom-build-and-push/action.yml b/.github/actions/docker-custom-build-and-push/action.yml index ccaff510c120a..cc2c2bd86416d 100644 --- a/.github/actions/docker-custom-build-and-push/action.yml +++ b/.github/actions/docker-custom-build-and-push/action.yml @@ -97,10 +97,11 @@ runs: cache-to: | type=inline - name: Upload image locally for testing (if not publishing) - uses: ishworkh/docker-image-artifact-upload@v1 + uses: ishworkh/container-image-artifact-upload@v2.0.0 if: ${{ inputs.publish != 'true' }} with: image: ${{ steps.single_tag.outputs.SINGLE_TAG }} + retention_days: "2" # Code for building multi-platform images and pushing to Docker Hub. - name: Set up QEMU diff --git a/.github/workflows/docker-postgres-setup.yml b/.github/workflows/docker-postgres-setup.yml index 956f3f7b1c390..c028bfb55d48d 100644 --- a/.github/workflows/docker-postgres-setup.yml +++ b/.github/workflows/docker-postgres-setup.yml @@ -52,7 +52,7 @@ jobs: with: images: | acryldata/datahub-postgres-setup - tags: ${{ needs.setup.outputs.tag }} + image_tag: ${{ needs.setup.outputs.tag }} username: ${{ secrets.ACRYL_DOCKER_USERNAME }} password: ${{ secrets.ACRYL_DOCKER_PASSWORD }} publish: ${{ needs.setup.outputs.publish == 'true' }} diff --git a/.github/workflows/docker-unified.yml b/.github/workflows/docker-unified.yml index 49dd26e1cd27e..16a2d29e9fd85 100644 --- a/.github/workflows/docker-unified.yml +++ b/.github/workflows/docker-unified.yml @@ -186,7 +186,7 @@ jobs: - name: Checkout # adding checkout step just to make trivy upload happy uses: acryldata/sane-checkout-action@v3 - name: Download image - uses: ishworkh/docker-image-artifact-download@v1 + uses: ishworkh/container-image-artifact-download@v2.0.0 if: ${{ needs.setup.outputs.publish != 'true' && needs.setup.outputs.pr-publish != 'true' }} with: image: ${{ env.DATAHUB_GMS_IMAGE }}:${{ needs.setup.outputs.unique_tag }} @@ -257,7 +257,7 @@ jobs: - name: Checkout # adding checkout step just to make trivy upload happy uses: acryldata/sane-checkout-action@v3 - name: Download image - uses: ishworkh/docker-image-artifact-download@v1 + uses: ishworkh/container-image-artifact-download@v2.0.0 if: ${{ needs.setup.outputs.publish != 'true' && needs.setup.outputs.pr-publish != 'true' }} with: image: ${{ env.DATAHUB_MAE_CONSUMER_IMAGE }}:${{ needs.setup.outputs.unique_tag }} @@ -328,7 +328,7 @@ jobs: - name: Checkout # adding checkout step just to make trivy upload happy uses: acryldata/sane-checkout-action@v3 - name: Download image - uses: ishworkh/docker-image-artifact-download@v1 + uses: ishworkh/container-image-artifact-download@v2.0.0 if: ${{ needs.setup.outputs.publish != 'true' && needs.setup.outputs.pr-publish != 'true' }} with: image: ${{ env.DATAHUB_MCE_CONSUMER_IMAGE }}:${{ needs.setup.outputs.unique_tag }} @@ -399,7 +399,7 @@ jobs: - name: Checkout # adding checkout step just to make trivy upload happy uses: acryldata/sane-checkout-action@v3 - name: Download image - uses: ishworkh/docker-image-artifact-download@v1 + uses: ishworkh/container-image-artifact-download@v2.0.0 if: ${{ needs.setup.outputs.publish != 'true' && needs.setup.outputs.pr-publish != 'true' }} with: image: ${{ env.DATAHUB_UPGRADE_IMAGE }}:${{ needs.setup.outputs.unique_tag }} @@ -472,7 +472,7 @@ jobs: - name: Checkout # adding checkout step just to make trivy upload happy uses: actions/checkout@v4 - name: Download image - uses: ishworkh/docker-image-artifact-download@v1 + uses: ishworkh/container-image-artifact-download@v2.0.0 if: ${{ needs.setup.outputs.publish != 'true' && needs.setup.outputs.pr-publish != 'true' }} with: image: ${{ env.DATAHUB_FRONTEND_IMAGE }}:${{ needs.setup.outputs.unique_tag }} @@ -533,7 +533,7 @@ jobs: - name: Checkout # adding checkout step just to make trivy upload happy uses: acryldata/sane-checkout-action@v3 - name: Download image - uses: ishworkh/docker-image-artifact-download@v1 + uses: ishworkh/container-image-artifact-download@v2.0.0 if: ${{ needs.setup.outputs.publish != 'true' && needs.setup.outputs.pr-publish != 'true' }} with: image: ${{ env.DATAHUB_KAFKA_SETUP_IMAGE }}:${{ needs.setup.outputs.unique_tag }} @@ -594,7 +594,7 @@ jobs: - name: Checkout # adding checkout step just to make trivy upload happy uses: acryldata/sane-checkout-action@v3 - name: Download image - uses: ishworkh/docker-image-artifact-download@v1 + uses: ishworkh/container-image-artifact-download@v2.0.0 if: ${{ needs.setup.outputs.publish != 'true' && needs.setup.outputs.pr-publish != 'true' }} with: image: ${{ env.DATAHUB_MYSQL_SETUP_IMAGE }}:${{ needs.setup.outputs.unique_tag }} @@ -655,7 +655,7 @@ jobs: - name: Checkout # adding checkout step just to make trivy upload happy uses: acryldata/sane-checkout-action@v3 - name: Download image - uses: ishworkh/docker-image-artifact-download@v1 + uses: ishworkh/container-image-artifact-download@v2.0.0 if: ${{ needs.setup.outputs.publish != 'true' && needs.setup.outputs.pr-publish != 'true' }} with: image: ${{ env.DATAHUB_ELASTIC_SETUP_IMAGE }}:${{ needs.setup.outputs.unique_tag }} @@ -727,7 +727,7 @@ jobs: - name: Check out the repo uses: acryldata/sane-checkout-action@v3 - name: Download Base Image - uses: ishworkh/docker-image-artifact-download@v1 + uses: ishworkh/container-image-artifact-download@v2.0.0 if: ${{ needs.setup.outputs.publish != 'true' && needs.setup.outputs.pr-publish != 'true' && needs.setup.outputs.ingestion_base_change == 'true' }} with: image: ${{ env.DATAHUB_INGESTION_BASE_IMAGE }}:${{ needs.setup.outputs.ingestion_base_change == 'true' && needs.setup.outputs.unique_tag || 'head' }} @@ -775,7 +775,7 @@ jobs: - name: Check out the repo uses: acryldata/sane-checkout-action@v3 - name: Download Base Image - uses: ishworkh/docker-image-artifact-download@v1 + uses: ishworkh/container-image-artifact-download@v2.0.0 if: ${{ needs.setup.outputs.publish != 'true' && needs.setup.outputs.pr-publish != 'true' && needs.setup.outputs.ingestion_base_change == 'true' }} with: image: ${{ env.DATAHUB_INGESTION_BASE_IMAGE }}:${{ needs.setup.outputs.ingestion_base_change == 'true' && needs.setup.outputs.unique_tag || 'head' }} @@ -836,7 +836,7 @@ jobs: if: ${{ needs.setup.outputs.ingestion_change == 'true' || needs.setup.outputs.publish == 'true' || needs.setup.outputs.pr-publish =='true' }} run: ./gradlew :metadata-ingestion:codegen - name: Download Base Image - uses: ishworkh/docker-image-artifact-download@v1 + uses: ishworkh/container-image-artifact-download@v2.0.0 if: ${{ needs.setup.outputs.publish != 'true' && needs.setup.outputs.pr-publish != 'true' && needs.setup.outputs.ingestion_base_change == 'true' }} with: image: ${{ env.DATAHUB_INGESTION_BASE_IMAGE }}:${{ needs.setup.outputs.ingestion_base_change == 'true' && needs.setup.outputs.unique_slim_tag || 'head-slim' }} @@ -883,7 +883,7 @@ jobs: - name: Checkout # adding checkout step just to make trivy upload happy uses: acryldata/sane-checkout-action@v3 - name: Download image Slim Image - uses: ishworkh/docker-image-artifact-download@v1 + uses: ishworkh/container-image-artifact-download@v2.0.0 if: ${{ needs.datahub_ingestion_slim_build.outputs.needs_artifact_download == 'true' }} with: image: ${{ env.DATAHUB_INGESTION_IMAGE }}:${{ needs.datahub_ingestion_slim_build.outputs.tag }} @@ -937,7 +937,7 @@ jobs: if: ${{ needs.setup.outputs.ingestion_change == 'true' || needs.setup.outputs.publish == 'true' || needs.setup.outputs.pr-publish == 'true' }} run: ./gradlew :metadata-ingestion:codegen - name: Download Base Image - uses: ishworkh/docker-image-artifact-download@v1 + uses: ishworkh/container-image-artifact-download@v2.0.0 if: ${{ needs.setup.outputs.publish != 'true' && needs.setup.outputs.pr-publish != 'true' && needs.setup.outputs.ingestion_base_change == 'true' }} with: image: ${{ env.DATAHUB_INGESTION_BASE_IMAGE }}:${{ needs.setup.outputs.ingestion_base_change == 'true' && needs.setup.outputs.unique_tag || 'head' }} @@ -982,7 +982,7 @@ jobs: - name: Checkout # adding checkout step just to make trivy upload happy uses: acryldata/sane-checkout-action@v3 - name: Download image Full Image - uses: ishworkh/docker-image-artifact-download@v1 + uses: ishworkh/container-image-artifact-download@v2.0.0 if: ${{ needs.datahub_ingestion_full_build.outputs.needs_artifact_download == 'true' }} with: image: ${{ env.DATAHUB_INGESTION_IMAGE }}:${{ needs.datahub_ingestion_full_build.outputs.tag }} @@ -1079,47 +1079,47 @@ jobs: - name: Disk Check run: df -h . && docker images - name: Download GMS image - uses: ishworkh/docker-image-artifact-download@v1 + uses: ishworkh/container-image-artifact-download@v2.0.0 if: ${{ ( needs.setup.outputs.publish != 'true' && needs.setup.outputs.pr-publish != 'true' ) && needs.gms_build.result == 'success' }} with: image: ${{ env.DATAHUB_GMS_IMAGE }}:${{ needs.setup.outputs.unique_tag }} - name: Download Frontend image - uses: ishworkh/docker-image-artifact-download@v1 + uses: ishworkh/container-image-artifact-download@v2.0.0 if: ${{ ( needs.setup.outputs.publish != 'true' && needs.setup.outputs.pr-publish != 'true' ) && needs.frontend_build.result == 'success' }} with: image: ${{ env.DATAHUB_FRONTEND_IMAGE }}:${{ needs.setup.outputs.unique_tag }} - name: Download Kafka Setup image - uses: ishworkh/docker-image-artifact-download@v1 + uses: ishworkh/container-image-artifact-download@v2.0.0 if: ${{ ( needs.setup.outputs.publish != 'true' && needs.setup.outputs.pr-publish != 'true' ) && needs.kafka_setup_build.result == 'success' }} with: image: ${{ env.DATAHUB_KAFKA_SETUP_IMAGE }}:${{ needs.setup.outputs.unique_tag }} - name: Download Mysql Setup image - uses: ishworkh/docker-image-artifact-download@v1 + uses: ishworkh/container-image-artifact-download@v2.0.0 if: ${{ ( needs.setup.outputs.publish != 'true' && needs.setup.outputs.pr-publish != 'true' ) && needs.mysql_setup_build.result == 'success' }} with: image: ${{ env.DATAHUB_MYSQL_SETUP_IMAGE }}:${{ needs.setup.outputs.unique_tag }} - name: Download Elastic Setup image - uses: ishworkh/docker-image-artifact-download@v1 + uses: ishworkh/container-image-artifact-download@v2.0.0 if: ${{ ( needs.setup.outputs.publish != 'true' && needs.setup.outputs.pr-publish != 'true' ) && needs.elasticsearch_setup_build.result == 'success' }} with: image: ${{ env.DATAHUB_ELASTIC_SETUP_IMAGE }}:${{ needs.setup.outputs.unique_tag }} - name: Download MCE Consumer image - uses: ishworkh/docker-image-artifact-download@v1 + uses: ishworkh/container-image-artifact-download@v2.0.0 if: ${{ ( needs.setup.outputs.publish != 'true' && needs.setup.outputs.pr-publish != 'true' ) && needs.mce_consumer_build.result == 'success' }} with: image: ${{ env.DATAHUB_MCE_CONSUMER_IMAGE }}:${{ needs.setup.outputs.unique_tag }} - name: Download MAE Consumer image - uses: ishworkh/docker-image-artifact-download@v1 + uses: ishworkh/container-image-artifact-download@v2.0.0 if: ${{ ( needs.setup.outputs.publish != 'true' && needs.setup.outputs.pr-publish != 'true' ) && needs.mae_consumer_build.result == 'success' }} with: image: ${{ env.DATAHUB_MAE_CONSUMER_IMAGE }}:${{ needs.setup.outputs.unique_tag }} - name: Download upgrade image - uses: ishworkh/docker-image-artifact-download@v1 + uses: ishworkh/container-image-artifact-download@v2.0.0 if: ${{ ( needs.setup.outputs.publish != 'true' && needs.setup.outputs.pr-publish != 'true' ) && needs.datahub_upgrade_build.result == 'success' }} with: image: ${{ env.DATAHUB_UPGRADE_IMAGE }}:${{ needs.setup.outputs.unique_tag }} - name: Download datahub-ingestion-slim image - uses: ishworkh/docker-image-artifact-download@v1 + uses: ishworkh/container-image-artifact-download@v2.0.0 if: ${{ needs.datahub_ingestion_slim_build.outputs.needs_artifact_download == 'true' && needs.datahub_ingestion_slim_build.result == 'success' }} with: image: ${{ env.DATAHUB_INGESTION_IMAGE }}:${{ needs.datahub_ingestion_slim_build.outputs.tag }} diff --git a/docker/datahub-gms/Dockerfile b/docker/datahub-gms/Dockerfile index b15bf3c6f9f17..47b10535f8dee 100644 --- a/docker/datahub-gms/Dockerfile +++ b/docker/datahub-gms/Dockerfile @@ -6,12 +6,12 @@ ARG ALPINE_REPO_URL=http://dl-cdn.alpinelinux.org/alpine ARG GITHUB_REPO_URL=https://github.com ARG MAVEN_CENTRAL_REPO_URL=https://repo1.maven.org/maven2 -FROM golang:1-alpine3.20 AS binary +FROM golang:1-alpine3.21 AS binary # Re-declaring arg from above to make it available in this stage (will inherit default value) ARG ALPINE_REPO_URL -ENV DOCKERIZE_VERSION=v0.6.1 +ENV DOCKERIZE_VERSION=v0.9.1 WORKDIR /go/src/github.com/jwilder # Optionally set corporate mirror for apk diff --git a/docker/datahub-mae-consumer/Dockerfile b/docker/datahub-mae-consumer/Dockerfile index 6edaa29ee1a8b..74375072761d8 100644 --- a/docker/datahub-mae-consumer/Dockerfile +++ b/docker/datahub-mae-consumer/Dockerfile @@ -6,12 +6,12 @@ ARG ALPINE_REPO_URL=http://dl-cdn.alpinelinux.org/alpine ARG GITHUB_REPO_URL=https://github.com ARG MAVEN_CENTRAL_REPO_URL=https://repo1.maven.org/maven2 -FROM golang:1-alpine3.20 AS binary +FROM golang:1-alpine3.21 AS binary # Re-declaring arg from above to make it available in this stage (will inherit default value) ARG ALPINE_REPO_URL -ENV DOCKERIZE_VERSION=v0.6.1 +ENV DOCKERIZE_VERSION=v0.9.1 WORKDIR /go/src/github.com/jwilder # Optionally set corporate mirror for apk diff --git a/docker/datahub-mce-consumer/Dockerfile b/docker/datahub-mce-consumer/Dockerfile index 1eb56633c561e..3adef53cd0606 100644 --- a/docker/datahub-mce-consumer/Dockerfile +++ b/docker/datahub-mce-consumer/Dockerfile @@ -6,12 +6,12 @@ ARG ALPINE_REPO_URL=http://dl-cdn.alpinelinux.org/alpine ARG GITHUB_REPO_URL=https://github.com ARG MAVEN_CENTRAL_REPO_URL=https://repo1.maven.org/maven2 -FROM golang:1-alpine3.20 AS binary +FROM golang:1-alpine3.21 AS binary # Re-declaring arg from above to make it available in this stage (will inherit default value) ARG ALPINE_REPO_URL -ENV DOCKERIZE_VERSION=v0.6.1 +ENV DOCKERIZE_VERSION=v0.9.1 WORKDIR /go/src/github.com/jwilder # Optionally set corporate mirror for apk diff --git a/docker/datahub-upgrade/Dockerfile b/docker/datahub-upgrade/Dockerfile index 3d59a903414b1..a8ef4e8034fdd 100644 --- a/docker/datahub-upgrade/Dockerfile +++ b/docker/datahub-upgrade/Dockerfile @@ -6,12 +6,12 @@ ARG ALPINE_REPO_URL=http://dl-cdn.alpinelinux.org/alpine ARG GITHUB_REPO_URL=https://github.com ARG MAVEN_CENTRAL_REPO_URL=https://repo1.maven.org/maven2 -FROM golang:1-alpine3.20 AS binary +FROM golang:1-alpine3.21 AS binary # Re-declaring arg from above to make it available in this stage (will inherit default value) ARG ALPINE_REPO_URL -ENV DOCKERIZE_VERSION=v0.6.1 +ENV DOCKERIZE_VERSION=v0.9.1 WORKDIR /go/src/github.com/jwilder # Optionally set corporate mirror for apk diff --git a/docker/elasticsearch-setup/Dockerfile b/docker/elasticsearch-setup/Dockerfile index 4e64dcbc1e452..1a6fe5bee6c84 100644 --- a/docker/elasticsearch-setup/Dockerfile +++ b/docker/elasticsearch-setup/Dockerfile @@ -6,11 +6,11 @@ ARG APP_ENV=prod # Defining custom repo urls for use in enterprise environments. Re-used between stages below. ARG ALPINE_REPO_URL=http://dl-cdn.alpinelinux.org/alpine -FROM golang:1-alpine3.20 AS binary +FROM golang:1-alpine3.21 AS binary ARG ALPINE_REPO_URL -ENV DOCKERIZE_VERSION=v0.6.1 +ENV DOCKERIZE_VERSION=v0.9.1 WORKDIR /go/src/github.com/jwilder # Optionally set corporate mirror for apk diff --git a/docker/mysql-setup/Dockerfile b/docker/mysql-setup/Dockerfile index b0ca45ad8f6f2..8a2d42bc23318 100644 --- a/docker/mysql-setup/Dockerfile +++ b/docker/mysql-setup/Dockerfile @@ -1,11 +1,11 @@ # Defining custom repo urls for use in enterprise environments. Re-used between stages below. ARG ALPINE_REPO_URL=http://dl-cdn.alpinelinux.org/alpine -FROM golang:1-alpine3.20 AS binary +FROM golang:1-alpine3.21 AS binary ARG ALPINE_REPO_URL -ENV DOCKERIZE_VERSION=v0.6.1 +ENV DOCKERIZE_VERSION=v0.9.1 WORKDIR /go/src/github.com/jwilder # Optionally set corporate mirror for apk diff --git a/docker/postgres-setup/Dockerfile b/docker/postgres-setup/Dockerfile index e145456e807d4..31e9687cea15e 100644 --- a/docker/postgres-setup/Dockerfile +++ b/docker/postgres-setup/Dockerfile @@ -1,11 +1,11 @@ # Defining custom repo urls for use in enterprise environments. Re-used between stages below. ARG ALPINE_REPO_URL=http://dl-cdn.alpinelinux.org/alpine -FROM golang:1-alpine3.20 AS binary +FROM golang:1-alpine3.21 AS binary ARG ALPINE_REPO_URL -ENV DOCKERIZE_VERSION=v0.6.1 +ENV DOCKERIZE_VERSION=v0.9.1 WORKDIR /go/src/github.com/jwilder # Optionally set corporate mirror for apk From f6c0cf34c075e078fe6cf3c2e18e6a8d711cc8db Mon Sep 17 00:00:00 2001 From: david-leifker <114954101+david-leifker@users.noreply.github.com> Date: Fri, 20 Dec 2024 17:04:58 -0600 Subject: [PATCH 08/39] docs update: Update v_0_3_7.md (#12197) Co-authored-by: Chris Collins --- docs/managed-datahub/release-notes/v_0_3_7.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/docs/managed-datahub/release-notes/v_0_3_7.md b/docs/managed-datahub/release-notes/v_0_3_7.md index 75f5ac21224c2..31302403ea930 100644 --- a/docs/managed-datahub/release-notes/v_0_3_7.md +++ b/docs/managed-datahub/release-notes/v_0_3_7.md @@ -13,6 +13,12 @@ If you are using an older CLI/SDK version, then please upgrade it. This applies ## Known Issues +### v0.3.7.8 + * Notes Feature + * Adding a Note to an entity will result in that note showing up in the Settings > Home Page list of announcements as well as the profile page of the entity. + * If more than 30 Notes are added to entities, there's a risk that home page announcements will not show up on the home page properly. + * Notes are only supported for Dataset and Column entities in this release. + ### v0.3.7.7 * Postgres regression, non-functional when using postgres @@ -24,7 +30,9 @@ If you are using an older CLI/SDK version, then please upgrade it. This applies ### v0.3.7.8 +- Helm Chart Requirement: 1.4.157+ - [Postgres] Fix regression from MySQL fix in v0.3.7.7 +- [UI] Fix editing post on entity profile page becomes announcement ### v0.3.7.7 From 8e9fc20fb6ec57b547c97d433ec5f85b8a3efe9a Mon Sep 17 00:00:00 2001 From: david-leifker <114954101+david-leifker@users.noreply.github.com> Date: Fri, 20 Dec 2024 20:00:09 -0600 Subject: [PATCH 09/39] feat(gradle): add quickstartPgDebug option (#12195) --- docker/build.gradle | 262 ++++++++++++++++++++++---------------------- 1 file changed, 131 insertions(+), 131 deletions(-) diff --git a/docker/build.gradle b/docker/build.gradle index 25e3dc12036ef..7b36c0d9acdcf 100644 --- a/docker/build.gradle +++ b/docker/build.gradle @@ -18,24 +18,131 @@ ext { ':datahub-upgrade', ':metadata-service:war', ] - quickstart_modules = backend_profile_modules + [ - ':metadata-jobs:mce-consumer-job', - ':metadata-jobs:mae-consumer-job', - ':datahub-frontend' + + python_services_modules = [] + + // Common configuration for all tasks + common_config = [ + captureContainersOutput: true, + captureContainersOutputToFiles: project.file('build/container-logs') ] - debug_modules = quickstart_modules - [':metadata-jobs:mce-consumer-job', - ':metadata-jobs:mae-consumer-job'] - compose_args = ['-f', compose_base] - debug_reloadable = [ - 'datahub-gms-debug', - 'system-update-debug', - 'frontend-debug' + // declarative task configuration + quickstart_configs = [ + 'quickstart': [ + profile: 'quickstart-consumers', + modules: python_services_modules + backend_profile_modules + [ + ':datahub-frontend', + ':metadata-jobs:mce-consumer-job', + ':metadata-jobs:mae-consumer-job', + ] + ], + 'quickstartDebug': [ + profile: 'debug', + modules: python_services_modules + backend_profile_modules + [':datahub-frontend'], + isDebug: true + ], + 'quickstartPg': [ + profile: 'quickstart-postgres', + modules: (backend_profile_modules - [':docker:mysql-setup']) + [ + ':docker:postgres-setup', + ':datahub-frontend' + ] + ], + 'quickstartPgDebug': [ + profile: 'debug-postgres', + modules: python_services_modules + (backend_profile_modules - [':docker:mysql-setup']) + [ + ':docker:postgres-setup', + ':datahub-frontend' + ], + isDebug: true + ], + 'quickstartSlim': [ + profile: 'quickstart-backend', + modules: backend_profile_modules + [':docker:datahub-ingestion'], + additionalEnv: [ + 'DATAHUB_ACTIONS_IMAGE': 'acryldata/datahub-ingestion', + 'ACTIONS_VERSION': "v${version}-slim", + 'ACTIONS_EXTRA_PACKAGES': 'acryl-datahub-actions[executor] acryl-datahub-actions', + 'ACTIONS_CONFIG': 'https://raw.githubusercontent.com/acryldata/datahub-actions/main/docker/config/executor.yaml', + 'DATAHUB_LOCAL_COMMON_ENV': "${rootProject.project(':metadata-integration:java:spark-lineage-legacy').projectDir}/spark-smoke-test/smoke-gms.env" + ] + ], + 'quickstartStorage': [ + profile: 'quickstart-storage', + preserveVolumes: true + ] ] - // Postgres - pg_quickstart_modules = quickstart_modules - [':docker:mysql-setup'] + [':docker:postgres-setup'] +} + +// Register all quickstart tasks +quickstart_configs.each { taskName, config -> + tasks.register(taskName) +} + +// Dynamically create all quickstart tasks and configurations +dockerCompose { + // Configure default settings that apply to all configurations + useComposeFiles = [compose_base] + projectName = project_name + projectNamePrefix = '' + buildBeforeUp = false + buildBeforePull = false + stopContainers = false + removeVolumes = false + + quickstart_configs.each { taskName, config -> + "${taskName}" { + isRequiredBy(tasks.named(taskName)) + if (config.profile) { + composeAdditionalArgs = ['--profile', config.profile] + } + + // Common environment variables + environment.put 'DATAHUB_VERSION', config.isDebug ? + System.getenv("DATAHUB_VERSION") ?: "v${version}" : + "v${version}" + environment.put 'DATAHUB_TELEMETRY_ENABLED', 'false' + environment.put "METADATA_TESTS_ENABLED", "true" + environment.put "DATAHUB_REPO", "${docker_registry}" + + // Additional environment variables if specified + if (config.additionalEnv) { + config.additionalEnv.each { key, value -> + environment.put key, value + } + } + + useComposeFiles = [compose_base] + projectName = project_name + projectNamePrefix = '' + buildBeforeUp = false + buildBeforePull = false + stopContainers = false + removeVolumes = false + + // Apply common configuration + common_config.each { key, value -> + delegate."${key}" = value + } + + // Apply additional task-specific configuration if specified + if (config.additionalConfig) { + config.additionalConfig.each { key, value -> + delegate."${key}" = value + } + } + } + } +} - revision = 1 // increment to trigger rebuild +// Configure dependencies for ComposeUp tasks +quickstart_configs.each { taskName, config -> + if (config.modules) { + tasks.getByName("${taskName}ComposeUp").dependsOn( + config.modules.collect { it + ":${config.isDebug ? 'dockerTagDebug' : 'dockerTag'}" } + ) + } } tasks.register('minDockerCompose2.20', Exec) { @@ -43,18 +150,11 @@ tasks.register('minDockerCompose2.20', Exec) { args '-c', 'echo -e "$(docker compose version --short)\n2.20"|sort --version-sort --check=quiet --reverse' } -tasks.register('quickstart') {} -tasks.register('quickstartSlim') {} -tasks.register('quickstartDebug') {} -tasks.register('quickstartPg') {} -tasks.register('quickstartStorage') {} - tasks.register('quickstartNuke') { doFirst { - dockerCompose.quickstart.removeVolumes = true - dockerCompose.quickstartPg.removeVolumes = true - dockerCompose.quickstartSlim.removeVolumes = true - dockerCompose.quickstartDebug.removeVolumes = true + quickstart_configs.each { taskName, config -> + dockerCompose."${taskName}".removeVolumes = !config.preserveVolumes + } } finalizedBy(tasks.withType(ComposeDownForced)) } @@ -63,117 +163,17 @@ tasks.register('quickstartDown') { finalizedBy(tasks.withType(ComposeDownForced)) } -dockerCompose { - quickstart { - isRequiredBy(tasks.named('quickstart')) - composeAdditionalArgs = ['--profile', 'quickstart-consumers'] - - environment.put 'DATAHUB_VERSION', "v${version}" - environment.put 'DATAHUB_TELEMETRY_ENABLED', 'false' // disabled when built locally - - useComposeFiles = [compose_base] - projectName = project_name - projectNamePrefix = '' - buildBeforeUp = false - buildBeforePull = false - stopContainers = false - removeVolumes = false - captureContainersOutput = true - captureContainersOutputToFiles = project.file('build/container-logs') - } - - quickstartPg { - isRequiredBy(tasks.named('quickstartPg')) - composeAdditionalArgs = ['--profile', 'quickstart-postgres'] - - environment.put 'DATAHUB_VERSION', "v${version}" - environment.put 'DATAHUB_TELEMETRY_ENABLED', 'false' // disabled when built locally - - useComposeFiles = [compose_base] - projectName = project_name - projectNamePrefix = '' - buildBeforeUp = false - buildBeforePull = false - stopContainers = false - removeVolumes = false - } - - /** - * The smallest disk footprint required for Spark integration tests - * - * No frontend, mae, mce, or other services - */ - quickstartSlim { - isRequiredBy(tasks.named('quickstartSlim')) - composeAdditionalArgs = ['--profile', 'quickstart-backend'] - - environment.put 'DATAHUB_VERSION', "v${version}" - environment.put "DATAHUB_ACTIONS_IMAGE", "acryldata/datahub-ingestion" - environment.put "ACTIONS_VERSION", "v${version}-slim" - environment.put "ACTIONS_EXTRA_PACKAGES", 'acryl-datahub-actions[executor] acryl-datahub-actions' - environment.put "ACTIONS_CONFIG", 'https://raw.githubusercontent.com/acryldata/datahub-actions/main/docker/config/executor.yaml' - environment.put 'DATAHUB_TELEMETRY_ENABLED', 'false' // disabled when built locally - // disabled for spark-lineage smoke-test - environment.put 'DATAHUB_LOCAL_COMMON_ENV', "${rootProject.project(':metadata-integration:java:spark-lineage-legacy').projectDir}/spark-smoke-test/smoke-gms.env" - - useComposeFiles = [compose_base] - projectName = project_name - projectNamePrefix = '' - buildBeforeUp = false - buildBeforePull = false - stopContainers = false - removeVolumes = false - captureContainersOutput = true - captureContainersOutputToFiles = project.file('build/container-logs') - } - - quickstartDebug { - isRequiredBy(tasks.named('quickstartDebug')) - composeAdditionalArgs = ['--profile', 'debug'] - - if (System.getenv().containsKey("DATAHUB_VERSION")) { - environment.put 'DATAHUB_VERSION', System.getenv("DATAHUB_VERSION") - } - environment.put 'DATAHUB_TELEMETRY_ENABLED', 'false' // disabled when built locally - - useComposeFiles = [compose_base] - projectName = project_name - projectNamePrefix = '' - buildBeforeUp = false - buildBeforePull = false - stopContainers = false - removeVolumes = false - } - - quickstartStorage { - isRequiredBy(tasks.named('quickstartStorage')) - composeAdditionalArgs = ['--profile', 'quickstart-storage'] - - useComposeFiles = [compose_base] - projectName = project_name - projectNamePrefix = '' - buildBeforeUp = false - buildBeforePull = false - stopContainers = false - removeVolumes = false - } -} -tasks.getByName('quickstartComposeUp').dependsOn( - quickstart_modules.collect { it + ':dockerTag' }) -tasks.getByName('quickstartPgComposeUp').dependsOn( - pg_quickstart_modules.collect { it + ':dockerTag' }) -tasks.getByName('quickstartSlimComposeUp').dependsOn( - ([':docker:datahub-ingestion'] + backend_profile_modules) - .collect { it + ':dockerTag' }) -tasks.getByName('quickstartDebugComposeUp').dependsOn( - debug_modules.collect { it + ':dockerTagDebug' } -) tasks.withType(ComposeUp).configureEach { shouldRunAfter('quickstartNuke') dependsOn tasks.named("minDockerCompose2.20") } task debugReload(type: Exec) { - def cmd = ['docker compose -p datahub --profile debug'] + compose_args + ['restart'] + debug_reloadable + def cmd = ['docker compose -p datahub --profile debug'] + ['-f', compose_base] + [ + 'restart', + 'datahub-gms-debug', + 'system-update-debug', + 'frontend-debug' + ] commandLine 'bash', '-c', cmd.join(" ") -} +} \ No newline at end of file From 0b4d96e95c50c3db1fdf8cb65954e1f423c17310 Mon Sep 17 00:00:00 2001 From: sid-acryl <155424659+sid-acryl@users.noreply.github.com> Date: Sat, 21 Dec 2024 12:07:53 +0530 Subject: [PATCH 10/39] fix(ingest/powerbi): support comments in m-query grammar (#12177) --- .../powerbi/powerbi-lexical-grammar.rule | 18 ++++++++-- .../integration/powerbi/test_m_parser.py | 36 +++++++++++++++++++ 2 files changed, 52 insertions(+), 2 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi-lexical-grammar.rule b/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi-lexical-grammar.rule index 51a0dff288558..f237e2503317f 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi-lexical-grammar.rule +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi-lexical-grammar.rule @@ -21,6 +21,11 @@ // | empty_string // | empty_string "," argument_list // - Added sql_string in any_literal +// - Added WS_INLINE? in field expression +// Added to ignore any comments +// %ignore WS // Ignore whitespace +// %ignore CPP_COMMENT // Ignore single-line comments +// %ignore C_COMMENT // Ignore multi-line comments lexical_unit: lexical_elements? @@ -245,6 +250,8 @@ operator_or_punctuator: "," | "=>" | ".." | "..." + | "{{" + | "}}" document: section_document | expression_document @@ -275,6 +282,7 @@ expression: logical_or_expression | if_expression | error_raising_expression | error_handling_expression + | outer_expression logical_or_expression: logical_and_expression @@ -376,6 +384,8 @@ sql_content: /(?:[^\"\\]|\\[\"]|\"\"|\#\(lf\))+/ sql_string: "\"" sql_content "\"" +outer_expression: "{{" expression "}}" + argument_list: WS_INLINE? expression | WS_INLINE? expression WS_INLINE? "," WS_INLINE? argument_list | WS_INLINE? sql_string @@ -409,7 +419,7 @@ record_expression: "[" field_list? "]" field_list: field | field "," field_list -field: field_name WS_INLINE? "=" WS_INLINE? expression +field: WS_INLINE? field_name WS_INLINE? "=" WS_INLINE? expression field_name: generalized_identifier | quoted_identifier @@ -621,4 +631,8 @@ any_literal: record_literal %import common.DIGIT %import common.LF %import common.CR -%import common.ESCAPED_STRING \ No newline at end of file +%import common.ESCAPED_STRING + +%ignore WS // Ignore whitespace +%ignore CPP_COMMENT // Ignore single-line comments +%ignore C_COMMENT // Ignore multi-line comments \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/powerbi/test_m_parser.py b/metadata-ingestion/tests/integration/powerbi/test_m_parser.py index 63821f9038a88..832d00d9c5470 100644 --- a/metadata-ingestion/tests/integration/powerbi/test_m_parser.py +++ b/metadata-ingestion/tests/integration/powerbi/test_m_parser.py @@ -1171,3 +1171,39 @@ def test_m_query_timeout(mock_get_lark_parser): assert ( is_entry_present ), 'Warning message "M-Query Parsing Timeout" should be present in reporter' + + +def test_comments_in_m_query(): + q: str = 'let\n Source = Snowflake.Databases("xaa48144.snowflakecomputing.com", "COMPUTE_WH", [Role="ACCOUNTADMIN"]),\n SNOWFLAKE_SAMPLE_DATA_Database = Source{[Name="SNOWFLAKE_SAMPLE_DATA", Kind="Database"]}[Data],\n TPCDS_SF100TCL_Schema = SNOWFLAKE_SAMPLE_DATA_Database{[Name="TPCDS_SF100TCL", Kind="Schema"]}[Data],\n ITEM_Table = TPCDS_SF100TCL_Schema{[Name="ITEM", Kind="Table"]}[Data],\n \n // Group by I_BRAND and calculate the count\n BrandCountsTable = Table.Group(ITEM_Table, {"I_BRAND"}, {{"BrandCount", each Table.RowCount(_), Int64.Type}})\nin\n BrandCountsTable' + + table: powerbi_data_classes.Table = powerbi_data_classes.Table( + columns=[], + measures=[], + expression=q, + name="pet_price_index", + full_name="datalake.sandbox_pet.pet_price_index", + ) + + reporter = PowerBiDashboardSourceReport() + + ctx, config, platform_instance_resolver = get_default_instances() + + data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( + table, + reporter, + ctx=ctx, + config=config, + platform_instance_resolver=platform_instance_resolver, + parameters={ + "hostname": "xyz.databricks.com", + "http_path": "/sql/1.0/warehouses/abc", + "catalog": "cat", + "schema": "public", + }, + )[0].upstreams + + assert len(data_platform_tables) == 1 + assert ( + data_platform_tables[0].urn + == "urn:li:dataset:(urn:li:dataPlatform:snowflake,snowflake_sample_data.tpcds_sf100tcl.item,PROD)" + ) From 95b9d1b4c9687c3d505485aa600b5040a2549047 Mon Sep 17 00:00:00 2001 From: Jonny Dixon <45681293+acrylJonny@users.noreply.github.com> Date: Sat, 21 Dec 2024 06:38:59 +0000 Subject: [PATCH 11/39] feat(ingest/aws-common): improved instance profile support (#12139) for ec2, ecs, eks, lambda, beanstalk, app runner and cft roles --- .../ingestion/source/aws/aws_common.py | 258 ++++++++++++-- .../tests/unit/test_aws_common.py | 328 ++++++++++++++++++ 2 files changed, 559 insertions(+), 27 deletions(-) create mode 100644 metadata-ingestion/tests/unit/test_aws_common.py diff --git a/metadata-ingestion/src/datahub/ingestion/source/aws/aws_common.py b/metadata-ingestion/src/datahub/ingestion/source/aws/aws_common.py index 161aed5bb5988..b76eb95def1ed 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/aws/aws_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/aws/aws_common.py @@ -1,7 +1,12 @@ +import logging +import os from datetime import datetime, timedelta, timezone -from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Union +from enum import Enum +from http import HTTPStatus +from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Tuple, Union import boto3 +import requests from boto3.session import Session from botocore.config import DEFAULT_TIMEOUT, Config from botocore.utils import fix_s3_host @@ -14,6 +19,8 @@ ) from datahub.configuration.source_common import EnvConfigMixin +logger = logging.getLogger(__name__) + if TYPE_CHECKING: from mypy_boto3_dynamodb import DynamoDBClient from mypy_boto3_glue import GlueClient @@ -22,6 +29,26 @@ from mypy_boto3_sts import STSClient +class AwsEnvironment(Enum): + EC2 = "EC2" + ECS = "ECS" + EKS = "EKS" + LAMBDA = "LAMBDA" + APP_RUNNER = "APP_RUNNER" + BEANSTALK = "ELASTIC_BEANSTALK" + CLOUD_FORMATION = "CLOUD_FORMATION" + UNKNOWN = "UNKNOWN" + + +class AwsServicePrincipal(Enum): + LAMBDA = "lambda.amazonaws.com" + EKS = "eks.amazonaws.com" + APP_RUNNER = "apprunner.amazonaws.com" + ECS = "ecs.amazonaws.com" + ELASTIC_BEANSTALK = "elasticbeanstalk.amazonaws.com" + EC2 = "ec2.amazonaws.com" + + class AwsAssumeRoleConfig(PermissiveConfigModel): # Using the PermissiveConfigModel to allow the user to pass additional arguments. @@ -34,6 +61,163 @@ class AwsAssumeRoleConfig(PermissiveConfigModel): ) +def get_instance_metadata_token() -> Optional[str]: + """Get IMDSv2 token""" + try: + response = requests.put( + "http://169.254.169.254/latest/api/token", + headers={"X-aws-ec2-metadata-token-ttl-seconds": "21600"}, + timeout=1, + ) + if response.status_code == HTTPStatus.OK: + return response.text + except requests.exceptions.RequestException: + logger.debug("Failed to get IMDSv2 token") + return None + + +def is_running_on_ec2() -> bool: + """Check if code is running on EC2 using IMDSv2""" + token = get_instance_metadata_token() + if not token: + return False + + try: + response = requests.get( + "http://169.254.169.254/latest/meta-data/instance-id", + headers={"X-aws-ec2-metadata-token": token}, + timeout=1, + ) + return response.status_code == HTTPStatus.OK + except requests.exceptions.RequestException: + return False + + +def detect_aws_environment() -> AwsEnvironment: + """ + Detect the AWS environment we're running in. + Order matters as some environments may have multiple indicators. + """ + # Check Lambda first as it's most specific + if os.getenv("AWS_LAMBDA_FUNCTION_NAME"): + if os.getenv("AWS_EXECUTION_ENV", "").startswith("CloudFormation"): + return AwsEnvironment.CLOUD_FORMATION + return AwsEnvironment.LAMBDA + + # Check EKS (IRSA) + if os.getenv("AWS_WEB_IDENTITY_TOKEN_FILE") and os.getenv("AWS_ROLE_ARN"): + return AwsEnvironment.EKS + + # Check App Runner + if os.getenv("AWS_APP_RUNNER_SERVICE_ID"): + return AwsEnvironment.APP_RUNNER + + # Check ECS + if os.getenv("ECS_CONTAINER_METADATA_URI_V4") or os.getenv( + "ECS_CONTAINER_METADATA_URI" + ): + return AwsEnvironment.ECS + + # Check Elastic Beanstalk + if os.getenv("ELASTIC_BEANSTALK_ENVIRONMENT_NAME"): + return AwsEnvironment.BEANSTALK + + if is_running_on_ec2(): + return AwsEnvironment.EC2 + + return AwsEnvironment.UNKNOWN + + +def get_instance_role_arn() -> Optional[str]: + """Get role ARN from EC2 instance metadata using IMDSv2""" + token = get_instance_metadata_token() + if not token: + return None + + try: + response = requests.get( + "http://169.254.169.254/latest/meta-data/iam/security-credentials/", + headers={"X-aws-ec2-metadata-token": token}, + timeout=1, + ) + if response.status_code == 200: + role_name = response.text.strip() + if role_name: + sts = boto3.client("sts") + identity = sts.get_caller_identity() + return identity.get("Arn") + except Exception as e: + logger.debug(f"Failed to get instance role ARN: {e}") + return None + + +def get_lambda_role_arn() -> Optional[str]: + """Get the Lambda function's role ARN""" + try: + function_name = os.getenv("AWS_LAMBDA_FUNCTION_NAME") + if not function_name: + return None + + lambda_client = boto3.client("lambda") + function_config = lambda_client.get_function_configuration( + FunctionName=function_name + ) + return function_config.get("Role") + except Exception as e: + logger.debug(f"Failed to get Lambda role ARN: {e}") + return None + + +def get_current_identity() -> Tuple[Optional[str], Optional[str]]: + """ + Get the current role ARN and source type based on the runtime environment. + Returns (role_arn, credential_source) + """ + env = detect_aws_environment() + + if env == AwsEnvironment.LAMBDA: + role_arn = get_lambda_role_arn() + return role_arn, AwsServicePrincipal.LAMBDA.value + + elif env == AwsEnvironment.EKS: + role_arn = os.getenv("AWS_ROLE_ARN") + return role_arn, AwsServicePrincipal.EKS.value + + elif env == AwsEnvironment.APP_RUNNER: + try: + sts = boto3.client("sts") + identity = sts.get_caller_identity() + return identity.get("Arn"), AwsServicePrincipal.APP_RUNNER.value + except Exception as e: + logger.debug(f"Failed to get App Runner role: {e}") + + elif env == AwsEnvironment.ECS: + try: + metadata_uri = os.getenv("ECS_CONTAINER_METADATA_URI_V4") or os.getenv( + "ECS_CONTAINER_METADATA_URI" + ) + if metadata_uri: + response = requests.get(f"{metadata_uri}/task", timeout=1) + if response.status_code == HTTPStatus.OK: + task_metadata = response.json() + if "TaskARN" in task_metadata: + return ( + task_metadata.get("TaskARN"), + AwsServicePrincipal.ECS.value, + ) + except Exception as e: + logger.debug(f"Failed to get ECS task role: {e}") + + elif env == AwsEnvironment.BEANSTALK: + # Beanstalk uses EC2 instance metadata + return get_instance_role_arn(), AwsServicePrincipal.ELASTIC_BEANSTALK.value + + elif env == AwsEnvironment.EC2: + return get_instance_role_arn(), AwsServicePrincipal.EC2.value + + return None, None + + def assume_role( role: AwsAssumeRoleConfig, aws_region: Optional[str], @@ -95,7 +279,7 @@ class AwsConnectionConfig(ConfigModel): ) aws_profile: Optional[str] = Field( default=None, - description="Named AWS profile to use. Only used if access key / secret are unset. If not set the default will be used", + description="The [named profile](https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-profiles.html) to use from AWS credentials. Falls back to default profile if not specified and no access keys provided. Profiles are configured in ~/.aws/credentials or ~/.aws/config.", ) aws_region: Optional[str] = Field(None, description="AWS region code.") @@ -145,6 +329,7 @@ def _normalized_aws_roles(self) -> List[AwsAssumeRoleConfig]: def get_session(self) -> Session: if self.aws_access_key_id and self.aws_secret_access_key: + # Explicit credentials take precedence session = Session( aws_access_key_id=self.aws_access_key_id, aws_secret_access_key=self.aws_secret_access_key, @@ -152,38 +337,57 @@ def get_session(self) -> Session: region_name=self.aws_region, ) elif self.aws_profile: + # Named profile is second priority session = Session( region_name=self.aws_region, profile_name=self.aws_profile ) else: - # Use boto3's credential autodetection. + # Use boto3's credential autodetection session = Session(region_name=self.aws_region) - if self._normalized_aws_roles(): - # Use existing session credentials to start the chain of role assumption. - current_credentials = session.get_credentials() - credentials = { - "AccessKeyId": current_credentials.access_key, - "SecretAccessKey": current_credentials.secret_key, - "SessionToken": current_credentials.token, - } - - for role in self._normalized_aws_roles(): - if self._should_refresh_credentials(): - credentials = assume_role( - role, - self.aws_region, - credentials=credentials, + target_roles = self._normalized_aws_roles() + if target_roles: + current_role_arn, credential_source = get_current_identity() + + # Only assume role if: + # 1. We're not in a known AWS environment with a role, or + # 2. We need to assume a different role than our current one + should_assume_role = current_role_arn is None or any( + role.RoleArn != current_role_arn for role in target_roles + ) + + if should_assume_role: + env = detect_aws_environment() + logger.debug(f"Assuming role(s) from {env.value} environment") + + current_credentials = session.get_credentials() + if current_credentials is None: + raise ValueError("No credentials available for role assumption") + + credentials = { + "AccessKeyId": current_credentials.access_key, + "SecretAccessKey": current_credentials.secret_key, + "SessionToken": current_credentials.token, + } + + for role in target_roles: + if self._should_refresh_credentials(): + credentials = assume_role( + role=role, + aws_region=self.aws_region, + credentials=credentials, + ) + if isinstance(credentials["Expiration"], datetime): + self._credentials_expiration = credentials["Expiration"] + + session = Session( + aws_access_key_id=credentials["AccessKeyId"], + aws_secret_access_key=credentials["SecretAccessKey"], + aws_session_token=credentials["SessionToken"], + region_name=self.aws_region, ) - if isinstance(credentials["Expiration"], datetime): - self._credentials_expiration = credentials["Expiration"] - - session = Session( - aws_access_key_id=credentials["AccessKeyId"], - aws_secret_access_key=credentials["SecretAccessKey"], - aws_session_token=credentials["SessionToken"], - region_name=self.aws_region, - ) + else: + logger.debug(f"Using existing role from {credential_source}") return session diff --git a/metadata-ingestion/tests/unit/test_aws_common.py b/metadata-ingestion/tests/unit/test_aws_common.py new file mode 100644 index 0000000000000..9291fb91134b1 --- /dev/null +++ b/metadata-ingestion/tests/unit/test_aws_common.py @@ -0,0 +1,328 @@ +import json +import os +from unittest.mock import MagicMock, patch + +import boto3 +import pytest +from moto import mock_iam, mock_lambda, mock_sts + +from datahub.ingestion.source.aws.aws_common import ( + AwsConnectionConfig, + AwsEnvironment, + detect_aws_environment, + get_current_identity, + get_instance_metadata_token, + get_instance_role_arn, + is_running_on_ec2, +) + + +@pytest.fixture +def mock_aws_config(): + return AwsConnectionConfig( + aws_access_key_id="test-key", + aws_secret_access_key="test-secret", + aws_region="us-east-1", + ) + + +class TestAwsCommon: + def test_environment_detection_no_environment(self): + """Test environment detection when no AWS environment is present""" + with patch.dict(os.environ, {}, clear=True): + assert detect_aws_environment() == AwsEnvironment.UNKNOWN + + def test_environment_detection_lambda(self): + """Test Lambda environment detection""" + with patch.dict(os.environ, {"AWS_LAMBDA_FUNCTION_NAME": "test-function"}): + assert detect_aws_environment() == AwsEnvironment.LAMBDA + + def test_environment_detection_lambda_cloudformation(self): + """Test CloudFormation Lambda environment detection""" + with patch.dict( + os.environ, + { + "AWS_LAMBDA_FUNCTION_NAME": "test-function", + "AWS_EXECUTION_ENV": "CloudFormation.xxx", + }, + ): + assert detect_aws_environment() == AwsEnvironment.CLOUD_FORMATION + + def test_environment_detection_eks(self): + """Test EKS environment detection""" + with patch.dict( + os.environ, + { + "AWS_WEB_IDENTITY_TOKEN_FILE": "/var/run/secrets/token", + "AWS_ROLE_ARN": "arn:aws:iam::123456789012:role/test-role", + }, + ): + assert detect_aws_environment() == AwsEnvironment.EKS + + def test_environment_detection_app_runner(self): + """Test App Runner environment detection""" + with patch.dict(os.environ, {"AWS_APP_RUNNER_SERVICE_ID": "service-id"}): + assert detect_aws_environment() == AwsEnvironment.APP_RUNNER + + def test_environment_detection_ecs(self): + """Test ECS environment detection""" + with patch.dict( + os.environ, {"ECS_CONTAINER_METADATA_URI_V4": "http://169.254.170.2/v4"} + ): + assert detect_aws_environment() == AwsEnvironment.ECS + + def test_environment_detection_beanstalk(self): + """Test Elastic Beanstalk environment detection""" + with patch.dict(os.environ, {"ELASTIC_BEANSTALK_ENVIRONMENT_NAME": "my-env"}): + assert detect_aws_environment() == AwsEnvironment.BEANSTALK + + @patch("requests.put") + def test_ec2_metadata_token(self, mock_put): + """Test EC2 metadata token retrieval""" + mock_put.return_value.status_code = 200 + mock_put.return_value.text = "token123" + + token = get_instance_metadata_token() + assert token == "token123" + + mock_put.assert_called_once_with( + "http://169.254.169.254/latest/api/token", + headers={"X-aws-ec2-metadata-token-ttl-seconds": "21600"}, + timeout=1, + ) + + @patch("requests.put") + def test_ec2_metadata_token_failure(self, mock_put): + """Test EC2 metadata token failure case""" + mock_put.return_value.status_code = 404 + + token = get_instance_metadata_token() + assert token is None + + @patch("requests.get") + @patch("requests.put") + def test_is_running_on_ec2(self, mock_put, mock_get): + """Test EC2 instance detection with IMDSv2""" + mock_put.return_value.status_code = 200 + mock_put.return_value.text = "token123" + mock_get.return_value.status_code = 200 + + assert is_running_on_ec2() is True + + mock_put.assert_called_once_with( + "http://169.254.169.254/latest/api/token", + headers={"X-aws-ec2-metadata-token-ttl-seconds": "21600"}, + timeout=1, + ) + mock_get.assert_called_once_with( + "http://169.254.169.254/latest/meta-data/instance-id", + headers={"X-aws-ec2-metadata-token": "token123"}, + timeout=1, + ) + + @patch("requests.get") + @patch("requests.put") + def test_is_running_on_ec2_failure(self, mock_put, mock_get): + """Test EC2 instance detection failure""" + mock_put.return_value.status_code = 404 + assert is_running_on_ec2() is False + + mock_put.return_value.status_code = 200 + mock_put.return_value.text = "token123" + mock_get.return_value.status_code = 404 + assert is_running_on_ec2() is False + + @mock_sts + @mock_lambda + @mock_iam + def test_get_current_identity_lambda(self): + """Test getting identity in Lambda environment""" + with patch.dict( + os.environ, + { + "AWS_LAMBDA_FUNCTION_NAME": "test-function", + "AWS_DEFAULT_REGION": "us-east-1", + }, + ): + # Create IAM role first with proper trust policy + iam_client = boto3.client("iam", region_name="us-east-1") + trust_policy = { + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Principal": {"Service": "lambda.amazonaws.com"}, + "Action": "sts:AssumeRole", + } + ], + } + iam_client.create_role( + RoleName="test-role", AssumeRolePolicyDocument=json.dumps(trust_policy) + ) + + lambda_client = boto3.client("lambda", region_name="us-east-1") + lambda_client.create_function( + FunctionName="test-function", + Runtime="python3.8", + Role="arn:aws:iam::123456789012:role/test-role", + Handler="index.handler", + Code={"ZipFile": b"def handler(event, context): pass"}, + ) + + role_arn, source = get_current_identity() + assert source == "lambda.amazonaws.com" + assert role_arn == "arn:aws:iam::123456789012:role/test-role" + + @patch("requests.get") + @patch("requests.put") + @mock_sts + def test_get_instance_role_arn_success(self, mock_put, mock_get): + """Test getting EC2 instance role ARN""" + mock_put.return_value.status_code = 200 + mock_put.return_value.text = "token123" + mock_get.return_value.status_code = 200 + mock_get.return_value.text = "test-role" + + with patch("boto3.client") as mock_boto: + mock_sts = MagicMock() + mock_sts.get_caller_identity.return_value = { + "Arn": "arn:aws:sts::123456789012:assumed-role/test-role/instance" + } + mock_boto.return_value = mock_sts + + role_arn = get_instance_role_arn() + assert ( + role_arn == "arn:aws:sts::123456789012:assumed-role/test-role/instance" + ) + + @mock_sts + def test_aws_connection_config_basic(self, mock_aws_config): + """Test basic AWS connection configuration""" + session = mock_aws_config.get_session() + creds = session.get_credentials() + assert creds.access_key == "test-key" + assert creds.secret_key == "test-secret" + + @mock_sts + def test_aws_connection_config_with_session_token(self): + """Test AWS connection with session token""" + config = AwsConnectionConfig( + aws_access_key_id="test-key", + aws_secret_access_key="test-secret", + aws_session_token="test-token", + aws_region="us-east-1", + ) + + session = config.get_session() + creds = session.get_credentials() + assert creds.token == "test-token" + + @mock_sts + def test_aws_connection_config_role_assumption(self): + """Test AWS connection with role assumption""" + config = AwsConnectionConfig( + aws_access_key_id="test-key", + aws_secret_access_key="test-secret", + aws_region="us-east-1", + aws_role="arn:aws:iam::123456789012:role/test-role", + ) + + with patch( + "datahub.ingestion.source.aws.aws_common.get_current_identity" + ) as mock_identity: + mock_identity.return_value = (None, None) + session = config.get_session() + creds = session.get_credentials() + assert creds is not None + + @mock_sts + def test_aws_connection_config_skip_role_assumption(self): + """Test AWS connection skipping role assumption when already in role""" + config = AwsConnectionConfig( + aws_region="us-east-1", + aws_role="arn:aws:iam::123456789012:role/current-role", + ) + + with patch( + "datahub.ingestion.source.aws.aws_common.get_current_identity" + ) as mock_identity: + mock_identity.return_value = ( + "arn:aws:iam::123456789012:role/current-role", + "ec2.amazonaws.com", + ) + session = config.get_session() + assert session is not None + + @mock_sts + def test_aws_connection_config_multiple_roles(self): + """Test AWS connection with multiple role assumption""" + config = AwsConnectionConfig( + aws_access_key_id="test-key", + aws_secret_access_key="test-secret", + aws_region="us-east-1", + aws_role=[ + "arn:aws:iam::123456789012:role/role1", + "arn:aws:iam::123456789012:role/role2", + ], + ) + + with patch( + "datahub.ingestion.source.aws.aws_common.get_current_identity" + ) as mock_identity: + mock_identity.return_value = (None, None) + session = config.get_session() + assert session is not None + + def test_aws_connection_config_validation_error(self): + """Test AWS connection validation""" + with patch.dict( + "os.environ", + { + "AWS_ACCESS_KEY_ID": "test-key", + # Deliberately missing AWS_SECRET_ACCESS_KEY + "AWS_DEFAULT_REGION": "us-east-1", + }, + clear=True, + ): + config = AwsConnectionConfig() # Let it pick up from environment + session = config.get_session() + with pytest.raises( + Exception, + match="Partial credentials found in env, missing: AWS_SECRET_ACCESS_KEY", + ): + session.get_credentials() + + @pytest.mark.parametrize( + "env_vars,expected_environment", + [ + ({}, AwsEnvironment.UNKNOWN), + ({"AWS_LAMBDA_FUNCTION_NAME": "test"}, AwsEnvironment.LAMBDA), + ( + { + "AWS_LAMBDA_FUNCTION_NAME": "test", + "AWS_EXECUTION_ENV": "CloudFormation", + }, + AwsEnvironment.CLOUD_FORMATION, + ), + ( + { + "AWS_WEB_IDENTITY_TOKEN_FILE": "/token", + "AWS_ROLE_ARN": "arn:aws:iam::123:role/test", + }, + AwsEnvironment.EKS, + ), + ({"AWS_APP_RUNNER_SERVICE_ID": "service-123"}, AwsEnvironment.APP_RUNNER), + ( + {"ECS_CONTAINER_METADATA_URI_V4": "http://169.254.170.2"}, + AwsEnvironment.ECS, + ), + ( + {"ELASTIC_BEANSTALK_ENVIRONMENT_NAME": "my-env"}, + AwsEnvironment.BEANSTALK, + ), + ], + ) + def test_environment_detection_parametrized(self, env_vars, expected_environment): + """Parametrized test for environment detection with different configurations""" + with patch.dict(os.environ, env_vars, clear=True): + assert detect_aws_environment() == expected_environment From 8350a4e03ac9a259bb21e295c173972fd74d5f6f Mon Sep 17 00:00:00 2001 From: Jonny Dixon <45681293+acrylJonny@users.noreply.github.com> Date: Sat, 21 Dec 2024 07:52:27 +0000 Subject: [PATCH 12/39] feat(ingest/hive): lineage from/to file storage (#11841) Co-authored-by: Aseem Bansal --- .../src/datahub/ingestion/source/sql/hive.py | 614 +++++++++++++++++- 1 file changed, 606 insertions(+), 8 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/hive.py b/metadata-ingestion/src/datahub/ingestion/source/sql/hive.py index 59f301baf4016..fad54fda45378 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/hive.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/hive.py @@ -1,7 +1,10 @@ import json import logging import re -from typing import Any, Dict, Iterable, List, Optional, Union +from dataclasses import dataclass +from enum import Enum +from typing import Any, Dict, Iterable, List, Optional, Tuple, Union +from urllib.parse import urlparse from pydantic.class_validators import validator from pydantic.fields import Field @@ -11,7 +14,12 @@ from pyhive.sqlalchemy_hive import HiveDate, HiveDecimal, HiveDialect, HiveTimestamp from sqlalchemy.engine.reflection import Inspector -from datahub.emitter.mce_builder import make_dataset_urn_with_platform_instance +from datahub.emitter.mce_builder import ( + make_data_platform_urn, + make_dataplatform_instance_urn, + make_dataset_urn_with_platform_instance, + make_schema_field_urn, +) from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.ingestion.api.decorators import ( SourceCapability, @@ -29,14 +37,24 @@ TwoTierSQLAlchemyConfig, TwoTierSQLAlchemySource, ) -from datahub.metadata.com.linkedin.pegasus2avro.schema import ( +from datahub.metadata.schema_classes import ( + DataPlatformInstanceClass, + DatasetLineageTypeClass, + DatasetPropertiesClass, DateTypeClass, + FineGrainedLineageClass, + FineGrainedLineageDownstreamTypeClass, + FineGrainedLineageUpstreamTypeClass, NullTypeClass, NumberTypeClass, - SchemaField, + OtherSchemaClass, + SchemaFieldClass, + SchemaMetadataClass, TimeTypeClass, + UpstreamClass, + UpstreamLineageClass, + ViewPropertiesClass, ) -from datahub.metadata.schema_classes import ViewPropertiesClass from datahub.utilities import config_clean from datahub.utilities.hive_schema_to_avro import get_avro_schema_for_hive_column @@ -46,6 +64,511 @@ register_custom_type(HiveTimestamp, TimeTypeClass) register_custom_type(HiveDecimal, NumberTypeClass) + +class StoragePlatform(Enum): + """Enumeration of storage platforms supported for lineage""" + + S3 = "s3" + AZURE = "abs" + GCS = "gcs" + DBFS = "dbfs" + LOCAL = "file" + HDFS = "hdfs" + + +# Mapping of URL schemes to storage platforms +STORAGE_SCHEME_MAPPING = { + # S3 and derivatives + "s3": StoragePlatform.S3, + "s3a": StoragePlatform.S3, + "s3n": StoragePlatform.S3, + # Azure and derivatives + "abfs": StoragePlatform.AZURE, + "abfss": StoragePlatform.AZURE, + "adl": StoragePlatform.AZURE, + "adls": StoragePlatform.AZURE, + "wasb": StoragePlatform.AZURE, + "wasbs": StoragePlatform.AZURE, + # GCS and derivatives + "gs": StoragePlatform.GCS, + "gcs": StoragePlatform.GCS, + # DBFS + "dbfs": StoragePlatform.DBFS, + # Local filesystem + "file": StoragePlatform.LOCAL, + # HDFS + "hdfs": StoragePlatform.HDFS, +} + + +class StoragePathParser: + """Parser for storage paths with platform-specific logic""" + + @staticmethod + def parse_storage_location(location: str) -> Optional[Tuple[StoragePlatform, str]]: + """ + Parse a storage location into platform and normalized path. + + Args: + location: Storage location URI (e.g., s3://bucket/path, abfss://container@account.dfs.core.windows.net/path) + + Returns: + Tuple of (StoragePlatform, normalized_path) if valid, None if invalid + """ + + try: + # Handle special case for local files with no scheme + if location.startswith("/"): + return StoragePlatform.LOCAL, location + + # Parse the URI + parsed = urlparse(location) + scheme = parsed.scheme.lower() + + if not scheme: + return None + + # Look up the platform + platform = STORAGE_SCHEME_MAPPING.get(scheme) + if not platform: + return None + + # Get normalized path based on platform + if platform == StoragePlatform.S3: + # For S3, combine bucket and path + path = f"{parsed.netloc}/{parsed.path.lstrip('/')}" + + elif platform == StoragePlatform.AZURE: + if scheme in ("abfs", "abfss"): + # Format: abfss://container@account.dfs.core.windows.net/path + container = parsed.netloc.split("@")[0] + path = f"{container}/{parsed.path.lstrip('/')}" + else: + # Handle other Azure schemes + path = f"{parsed.netloc}/{parsed.path.lstrip('/')}" + + elif platform == StoragePlatform.GCS: + # For GCS, combine bucket and path + path = f"{parsed.netloc}/{parsed.path.lstrip('/')}" + + elif platform == StoragePlatform.DBFS: + # For DBFS, use path as-is + path = parsed.path.lstrip("/") + + elif platform == StoragePlatform.LOCAL: + # For local files, use full path + path = f"{parsed.netloc}/{parsed.path.lstrip('/')}" + + elif platform == StoragePlatform.HDFS: + # For HDFS, use full path + path = f"{parsed.netloc}/{parsed.path.lstrip('/')}" + + else: + return None + + # Clean up the path + path = path.rstrip("/") # Remove trailing slashes + path = re.sub(r"/+", "/", path) # Normalize multiple slashes + path = f"/{path}" + + return platform, path + + except Exception as exp: + logger.warning(f"Failed to parse storage location {location}: {exp}") + return None + + @staticmethod + def get_platform_name(platform: StoragePlatform) -> str: + """Get the platform name to use in URNs""" + + platform_names = { + StoragePlatform.S3: "s3", + StoragePlatform.AZURE: "adls", + StoragePlatform.GCS: "gcs", + StoragePlatform.DBFS: "dbfs", + StoragePlatform.LOCAL: "file", + StoragePlatform.HDFS: "hdfs", + } + return platform_names[platform] + + +class HiveStorageLineageConfig: + """Configuration for Hive storage lineage.""" + + def __init__( + self, + emit_storage_lineage: bool, + hive_storage_lineage_direction: str, + include_column_lineage: bool, + storage_platform_instance: Optional[str], + ): + if hive_storage_lineage_direction.lower() not in ["upstream", "downstream"]: + raise ValueError( + "hive_storage_lineage_direction must be either upstream or downstream" + ) + + self.emit_storage_lineage = emit_storage_lineage + self.hive_storage_lineage_direction = hive_storage_lineage_direction.lower() + self.include_column_lineage = include_column_lineage + self.storage_platform_instance = storage_platform_instance + + +@dataclass +class HiveStorageSourceReport: + """Report for tracking storage lineage statistics""" + + storage_locations_scanned: int = 0 + filtered_locations: List[str] = Field(default_factory=list) + failed_locations: List[str] = Field(default_factory=list) + + def report_location_scanned(self) -> None: + self.storage_locations_scanned += 1 + + def report_location_filtered(self, location: str) -> None: + self.filtered_locations.append(location) + + def report_location_failed(self, location: str) -> None: + self.failed_locations.append(location) + + +class HiveStorageLineage: + """Handles storage lineage for Hive tables""" + + def __init__( + self, + config: HiveStorageLineageConfig, + env: str, + convert_urns_to_lowercase: bool = False, + ): + self.config = config + self.env = env + self.convert_urns_to_lowercase = convert_urns_to_lowercase + self.report = HiveStorageSourceReport() + + def _make_dataset_platform_instance( + self, + platform: str, + instance: Optional[str], + ) -> DataPlatformInstanceClass: + """Create DataPlatformInstance aspect""" + + return DataPlatformInstanceClass( + platform=make_data_platform_urn(platform), + instance=make_dataplatform_instance_urn(platform, instance) + if instance + else None, + ) + + def _make_storage_dataset_urn( + self, + storage_location: str, + ) -> Optional[Tuple[str, str]]: + """ + Create storage dataset URN from location. + Returns tuple of (urn, platform) if successful, None otherwise. + """ + + platform_instance = None + storage_info = StoragePathParser.parse_storage_location(storage_location) + if not storage_info: + logger.debug(f"Could not parse storage location: {storage_location}") + return None + + platform, path = storage_info + platform_name = StoragePathParser.get_platform_name(platform) + + if self.convert_urns_to_lowercase: + platform_name = platform_name.lower() + path = path.lower() + if self.config.storage_platform_instance: + platform_instance = self.config.storage_platform_instance.lower() + + try: + storage_urn = make_dataset_urn_with_platform_instance( + platform=platform_name, + name=path, + env=self.env, + platform_instance=platform_instance, + ) + return storage_urn, platform_name + except Exception as exp: + logger.error(f"Failed to create URN for {platform_name}:{path}: {exp}") + return None + + def _get_fine_grained_lineages( + self, + dataset_urn: str, + storage_urn: str, + dataset_schema: SchemaMetadataClass, + storage_schema: SchemaMetadataClass, + ) -> Iterable[FineGrainedLineageClass]: + """Generate column-level lineage between dataset and storage""" + + if not self.config.include_column_lineage: + return + + for dataset_field in dataset_schema.fields: + dataset_path = dataset_field.fieldPath + + # Find matching field in storage schema + matching_field = next( + (f for f in storage_schema.fields if f.fieldPath == dataset_path), + None, + ) + + if matching_field: + if self.config.hive_storage_lineage_direction == "upstream": + yield FineGrainedLineageClass( + upstreamType=FineGrainedLineageUpstreamTypeClass.FIELD_SET, + upstreams=[ + make_schema_field_urn( + parent_urn=storage_urn, + field_path=matching_field.fieldPath, + ) + ], + downstreamType=FineGrainedLineageDownstreamTypeClass.FIELD, + downstreams=[ + make_schema_field_urn( + parent_urn=dataset_urn, + field_path=dataset_path, + ) + ], + ) + else: + yield FineGrainedLineageClass( + upstreamType=FineGrainedLineageUpstreamTypeClass.FIELD_SET, + upstreams=[ + make_schema_field_urn( + parent_urn=dataset_urn, + field_path=dataset_path, + ) + ], + downstreamType=FineGrainedLineageDownstreamTypeClass.FIELD, + downstreams=[ + make_schema_field_urn( + parent_urn=storage_urn, + field_path=matching_field.fieldPath, + ) + ], + ) + + def _create_lineage_mcp( + self, + source_urn: str, + target_urn: str, + fine_grained_lineages: Optional[Iterable[FineGrainedLineageClass]] = None, + ) -> Iterable[MetadataWorkUnit]: + """Create lineage MCP between source and target datasets""" + + lineages_list = ( + list(fine_grained_lineages) if fine_grained_lineages is not None else None + ) + + upstream_lineage = UpstreamLineageClass( + upstreams=[ + UpstreamClass(dataset=source_urn, type=DatasetLineageTypeClass.COPY) + ], + fineGrainedLineages=lineages_list, + ) + + yield MetadataWorkUnit( + id=f"{source_urn}-{target_urn}-lineage", + mcp=MetadataChangeProposalWrapper( + entityUrn=target_urn, aspect=upstream_lineage + ), + ) + + def get_storage_dataset_mcp( + self, + storage_location: str, + platform_instance: Optional[str] = None, + schema_metadata: Optional[SchemaMetadataClass] = None, + ) -> Iterable[MetadataWorkUnit]: + """ + Generate MCPs for storage dataset if needed. + This creates the storage dataset entity in DataHub. + """ + + storage_info = StoragePathParser.parse_storage_location( + storage_location, + ) + if not storage_info: + return + + platform, path = storage_info + platform_name = StoragePathParser.get_platform_name(platform) + + if self.convert_urns_to_lowercase: + platform_name = platform_name.lower() + path = path.lower() + if self.config.storage_platform_instance: + platform_instance = self.config.storage_platform_instance.lower() + + try: + storage_urn = make_dataset_urn_with_platform_instance( + platform=platform_name, + name=path, + env=self.env, + platform_instance=platform_instance, + ) + + # Dataset properties + props = DatasetPropertiesClass(name=path) + yield MetadataWorkUnit( + id=f"storage-{storage_urn}-props", + mcp=MetadataChangeProposalWrapper( + entityUrn=storage_urn, + aspect=props, + ), + ) + + # Platform instance + platform_instance_aspect = self._make_dataset_platform_instance( + platform=platform_name, + instance=platform_instance, + ) + yield MetadataWorkUnit( + id=f"storage-{storage_urn}-platform", + mcp=MetadataChangeProposalWrapper( + entityUrn=storage_urn, aspect=platform_instance_aspect + ), + ) + + # Schema if available + if schema_metadata: + storage_schema = SchemaMetadataClass( + schemaName=f"{platform.value}_schema", + platform=f"urn:li:dataPlatform:{platform.value}", + version=0, + fields=schema_metadata.fields, + hash="", + platformSchema=OtherSchemaClass(rawSchema=""), + ) + yield MetadataWorkUnit( + id=f"storage-{storage_urn}-schema", + mcp=MetadataChangeProposalWrapper( + entityUrn=storage_urn, aspect=storage_schema + ), + ) + + except Exception as e: + logger.error( + f"Failed to create storage dataset MCPs for {storage_location}: {e}" + ) + return + + def get_lineage_mcp( + self, + dataset_urn: str, + table: Dict[str, Any], + dataset_schema: Optional[SchemaMetadataClass] = None, + ) -> Iterable[MetadataWorkUnit]: + """ + Generate lineage MCP for a Hive table to its storage location. + + Args: + dataset_urn: URN of the Hive dataset + table: Hive table dictionary containing metadata + dataset_schema: Optional schema metadata for the Hive dataset + + Returns: + MetadataWorkUnit containing the lineage MCP if successful + """ + + platform_instance = None + + if not self.config.emit_storage_lineage: + return + + # Get storage location from table + storage_location = table.get("StorageDescriptor", {}).get("Location") + if not storage_location: + return + + # Create storage dataset URN + storage_info = self._make_storage_dataset_urn(storage_location) + if not storage_info: + self.report.report_location_failed(storage_location) + return + + storage_urn, storage_platform = storage_info + self.report.report_location_scanned() + + if self.config.storage_platform_instance: + platform_instance = self.config.storage_platform_instance.lower() + + # Create storage dataset entity + yield from self.get_storage_dataset_mcp( + storage_location=storage_location, + platform_instance=platform_instance, + schema_metadata=dataset_schema, + ) + + # Get storage schema if available (implement based on storage system) + storage_schema = ( + self._get_storage_schema(storage_location, dataset_schema) + if dataset_schema + else None + ) + + # Generate fine-grained lineage if schemas available + fine_grained_lineages = ( + None + if not (dataset_schema and storage_schema) + else self._get_fine_grained_lineages( + dataset_urn, storage_urn, dataset_schema, storage_schema + ) + ) + + # Create lineage MCP + if self.config.hive_storage_lineage_direction == "upstream": + yield from self._create_lineage_mcp( + source_urn=storage_urn, + target_urn=dataset_urn, + fine_grained_lineages=fine_grained_lineages, + ) + else: + yield from self._create_lineage_mcp( + source_urn=dataset_urn, + target_urn=storage_urn, + fine_grained_lineages=fine_grained_lineages, + ) + + def _get_storage_schema( + self, + storage_location: str, + table_schema: Optional[SchemaMetadataClass] = None, + ) -> Optional[SchemaMetadataClass]: + """ + Get schema metadata for storage location. + Currently supports: + - Delta tables + - Parquet files + - Spark tables + + Returns: + SchemaMetadataClass if schema can be inferred, None otherwise + """ + + if not table_schema: + return None + + storage_info = StoragePathParser.parse_storage_location(storage_location) + if not storage_info: + return None + + platform, _ = storage_info + + return SchemaMetadataClass( + schemaName=f"{platform.value}_schema", + platform=f"urn:li:dataPlatform:{platform.value}", + version=0, + fields=table_schema.fields, + hash="", + platformSchema=OtherSchemaClass(rawSchema=""), + ) + + try: from databricks_dbapi.sqlalchemy_dialects.hive import DatabricksPyhiveDialect from pyhive.sqlalchemy_hive import _type_map @@ -94,8 +617,8 @@ def dbapi_get_columns_patched(self, connection, table_name, schema=None, **kw): DatabricksPyhiveDialect.get_columns = dbapi_get_columns_patched except ModuleNotFoundError: pass -except Exception as e: - logger.warning(f"Failed to patch method due to {e}") +except Exception as exp: + logger.warning(f"Failed to patch method due to {exp}") @reflection.cache # type: ignore @@ -126,10 +649,48 @@ class HiveConfig(TwoTierSQLAlchemyConfig): # defaults scheme: str = Field(default="hive", hidden_from_docs=True) + # Overriding as table location lineage is richer implementation here than with include_table_location_lineage + include_table_location_lineage: bool = Field(default=False, hidden_from_docs=True) + + emit_storage_lineage: bool = Field( + default=False, + description="Whether to emit storage-to-Hive lineage", + ) + hive_storage_lineage_direction: str = Field( + default="upstream", + description="If 'upstream', storage is upstream to Hive. If 'downstream' storage is downstream to Hive", + ) + include_column_lineage: bool = Field( + default=True, + description="When enabled, column-level lineage will be extracted from storage", + ) + storage_platform_instance: Optional[str] = Field( + default=None, + description="Platform instance for the storage system", + ) + @validator("host_port") def clean_host_port(cls, v): return config_clean.remove_protocol(v) + @validator("hive_storage_lineage_direction") + def _validate_direction(cls, v: str) -> str: + """Validate the lineage direction.""" + if v.lower() not in ["upstream", "downstream"]: + raise ValueError( + "storage_lineage_direction must be either upstream or downstream" + ) + return v.lower() + + def get_storage_lineage_config(self) -> HiveStorageLineageConfig: + """Convert base config parameters to HiveStorageLineageConfig""" + return HiveStorageLineageConfig( + emit_storage_lineage=self.emit_storage_lineage, + hive_storage_lineage_direction=self.hive_storage_lineage_direction, + include_column_lineage=self.include_column_lineage, + storage_platform_instance=self.storage_platform_instance, + ) + @platform_name("Hive") @config_class(HiveConfig) @@ -151,12 +712,49 @@ class HiveSource(TwoTierSQLAlchemySource): def __init__(self, config, ctx): super().__init__(config, ctx, "hive") + self.storage_lineage = HiveStorageLineage( + config=config.get_storage_lineage_config(), + env=config.env, + convert_urns_to_lowercase=config.convert_urns_to_lowercase, + ) @classmethod def create(cls, config_dict, ctx): config = HiveConfig.parse_obj(config_dict) return cls(config, ctx) + def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: + """Generate workunits for tables and their storage lineage.""" + for wu in super().get_workunits_internal(): + yield wu + + if not isinstance(wu, MetadataWorkUnit): + continue + + # Get dataset URN and required aspects using workunit methods + try: + dataset_urn = wu.get_urn() + dataset_props = wu.get_aspect_of_type(DatasetPropertiesClass) + schema_metadata = wu.get_aspect_of_type(SchemaMetadataClass) + except Exception as exp: + logger.warning(f"Failed to process workunit {wu.id}: {exp}") + continue + + # Only proceed if we have the necessary properties + if dataset_props and dataset_props.customProperties: + table = { + "StorageDescriptor": { + "Location": dataset_props.customProperties.get("Location") + } + } + + if table.get("StorageDescriptor", {}).get("Location"): + yield from self.storage_lineage.get_lineage_mcp( + dataset_urn=dataset_urn, + table=table, + dataset_schema=schema_metadata, + ) + def get_schema_names(self, inspector): assert isinstance(self.config, HiveConfig) # This condition restricts the ingestion to the specified database. @@ -173,7 +771,7 @@ def get_schema_fields_for_column( pk_constraints: Optional[Dict[Any, Any]] = None, partition_keys: Optional[List[str]] = None, tags: Optional[List[str]] = None, - ) -> List[SchemaField]: + ) -> List[SchemaFieldClass]: fields = super().get_schema_fields_for_column( dataset_name, column, From 494c522405830aaec181bcd2d61b2cfe9a53f155 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergio=20G=C3=B3mez=20Villamor?= Date: Sun, 22 Dec 2024 13:21:41 +0100 Subject: [PATCH 13/39] fix(ingest/mssql): add container dataflow/ datajob entities (#12194) --- .../ingestion/source/sql/mssql/job_models.py | 26 +++ .../ingestion/source/sql/mssql/source.py | 10 + .../golden_mces_mssql_no_db_to_file.json | 207 ++++++++++++++++- .../golden_mces_mssql_no_db_with_filter.json | 162 ++++++++++++- .../golden_mces_mssql_to_file.json | 219 +++++++++++++++++- ...golden_mces_mssql_with_lower_case_urn.json | 207 ++++++++++++++++- 6 files changed, 795 insertions(+), 36 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/job_models.py b/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/job_models.py index d3941e7add0fd..0cd6261151928 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/job_models.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/job_models.py @@ -7,7 +7,9 @@ make_data_platform_urn, make_dataplatform_instance_urn, ) +from datahub.emitter.mcp_builder import DatabaseKey from datahub.metadata.schema_classes import ( + ContainerClass, DataFlowInfoClass, DataJobInfoClass, DataJobInputOutputClass, @@ -210,6 +212,18 @@ def as_datajob_info_aspect(self) -> DataJobInfoClass: status=self.status, ) + @property + def as_container_aspect(self) -> ContainerClass: + databaseKey = DatabaseKey( + platform=self.entity.flow.orchestrator, + instance=self.entity.flow.platform_instance + if self.entity.flow.platform_instance + else None, + env=self.entity.flow.env, + database=self.entity.flow.db, + ) + return ContainerClass(container=databaseKey.as_urn()) + @property def as_maybe_platform_instance_aspect(self) -> Optional[DataPlatformInstanceClass]: if self.entity.flow.platform_instance: @@ -257,6 +271,18 @@ def as_dataflow_info_aspect(self) -> DataFlowInfoClass: externalUrl=self.external_url, ) + @property + def as_container_aspect(self) -> ContainerClass: + databaseKey = DatabaseKey( + platform=self.entity.orchestrator, + instance=self.entity.platform_instance + if self.entity.platform_instance + else None, + env=self.entity.env, + database=self.entity.db, + ) + return ContainerClass(container=databaseKey.as_urn()) + @property def as_maybe_platform_instance_aspect(self) -> Optional[DataPlatformInstanceClass]: if self.entity.platform_instance: diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/source.py b/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/source.py index 9d8b67041998c..547adcc8eccc9 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/source.py @@ -639,6 +639,11 @@ def construct_job_workunits( aspect=data_job.as_datajob_info_aspect, ).as_workunit() + yield MetadataChangeProposalWrapper( + entityUrn=data_job.urn, + aspect=data_job.as_container_aspect, + ).as_workunit() + data_platform_instance_aspect = data_job.as_maybe_platform_instance_aspect if data_platform_instance_aspect: yield MetadataChangeProposalWrapper( @@ -662,6 +667,11 @@ def construct_flow_workunits( aspect=data_flow.as_dataflow_info_aspect, ).as_workunit() + yield MetadataChangeProposalWrapper( + entityUrn=data_flow.urn, + aspect=data_flow.as_container_aspect, + ).as_workunit() + data_platform_instance_aspect = data_flow.as_maybe_platform_instance_aspect if data_platform_instance_aspect: yield MetadataChangeProposalWrapper( diff --git a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_to_file.json b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_to_file.json index 72dcda25c1296..720ef0b392945 100644 --- a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_to_file.json +++ b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_to_file.json @@ -105,6 +105,43 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63", + "urn": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", @@ -113,11 +150,11 @@ "aspect": { "json": { "customProperties": { - "job_id": "c2d77890-83ba-435f-879b-1c77fa38dd47", + "job_id": "ab960f9d-30f3-4ced-b558-4f9b6671b6dd", "job_name": "Weekly Demo Data Backup", "description": "No description available.", - "date_created": "2024-12-05 16:44:43.910000", - "date_modified": "2024-12-05 16:44:44.043000", + "date_created": "2024-12-20 15:15:24.483000", + "date_modified": "2024-12-20 15:15:24.653000", "step_id": "1", "step_name": "Set database to read only", "subsystem": "TSQL", @@ -136,6 +173,22 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", @@ -154,6 +207,27 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63", + "urn": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "container", "entityUrn": "urn:li:container:7da983a1581c33cce8a106587b150f02", @@ -2103,8 +2177,8 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { - "view_definition": "CREATE VIEW Foo.PersonsView AS SELECT * FROM Foo.Persons;\n", - "is_view": "True" + "is_view": "True", + "view_definition": "CREATE VIEW Foo.PersonsView AS SELECT * FROM Foo.Persons;\n" }, "name": "PersonsView", "tags": [] @@ -2269,6 +2343,43 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63", + "urn": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", @@ -2282,8 +2393,8 @@ "code": "CREATE PROCEDURE [Foo].[Proc.With.SpecialChar] @ID INT\nAS\n SELECT @ID AS ThatDB;\n", "input parameters": "['@ID']", "parameter @ID": "{'type': 'int'}", - "date_created": "2024-12-05 16:44:43.800000", - "date_modified": "2024-12-05 16:44:43.800000" + "date_created": "2024-12-20 15:15:24.290000", + "date_modified": "2024-12-20 15:15:24.290000" }, "externalUrl": "", "name": "DemoData.Foo.Proc.With.SpecialChar", @@ -2298,6 +2409,43 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63", + "urn": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),NewProc)", @@ -2310,8 +2458,8 @@ "depending_on_procedure": "{}", "code": "CREATE PROCEDURE [Foo].[NewProc]\n AS\n BEGIN\n --insert into items table from salesreason table\n insert into Foo.Items (ID, ItemName)\n SELECT TempID, Name\n FROM Foo.SalesReason;\n\n\n IF OBJECT_ID('Foo.age_dist', 'U') IS NULL\n BEGIN\n -- Create and populate if table doesn't exist\n SELECT Age, COUNT(*) as Count\n INTO Foo.age_dist\n FROM Foo.Persons\n GROUP BY Age\n END\n ELSE\n BEGIN\n -- Update existing table\n TRUNCATE TABLE Foo.age_dist;\n\n INSERT INTO Foo.age_dist (Age, Count)\n SELECT Age, COUNT(*) as Count\n FROM Foo.Persons\n GROUP BY Age\n END\n\n SELECT ID, Age INTO #TEMPTABLE FROM NewData.FooNew.PersonsNew\n \n UPDATE DemoData.Foo.Persons\n SET Age = t.Age\n FROM DemoData.Foo.Persons p\n JOIN #TEMPTABLE t ON p.ID = t.ID\n\n END\n", "input parameters": "[]", - "date_created": "2024-12-05 16:44:43.803000", - "date_modified": "2024-12-05 16:44:43.803000" + "date_created": "2024-12-20 15:15:24.300000", + "date_modified": "2024-12-20 15:15:24.300000" }, "externalUrl": "", "name": "DemoData.Foo.NewProc", @@ -2326,6 +2474,43 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),NewProc)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),NewProc)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63", + "urn": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "container", "entityUrn": "urn:li:container:250ce23f940485303fa5e5d4f5194975", @@ -4427,8 +4612,8 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { - "view_definition": "CREATE VIEW FooNew.View1 AS\nSELECT LastName, FirstName\nFROM FooNew.PersonsNew\nWHERE Age > 18\n", - "is_view": "True" + "is_view": "True", + "view_definition": "CREATE VIEW FooNew.View1 AS\nSELECT LastName, FirstName\nFROM FooNew.PersonsNew\nWHERE Age > 18\n" }, "name": "View1", "tags": [] diff --git a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_with_filter.json b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_with_filter.json index 0df89ff1eb94d..cf3abbfc62997 100644 --- a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_with_filter.json +++ b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_with_filter.json @@ -105,6 +105,43 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63", + "urn": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", @@ -113,11 +150,11 @@ "aspect": { "json": { "customProperties": { - "job_id": "c2d77890-83ba-435f-879b-1c77fa38dd47", + "job_id": "ab960f9d-30f3-4ced-b558-4f9b6671b6dd", "job_name": "Weekly Demo Data Backup", "description": "No description available.", - "date_created": "2024-12-05 16:44:43.910000", - "date_modified": "2024-12-05 16:44:44.043000", + "date_created": "2024-12-20 15:15:24.483000", + "date_modified": "2024-12-20 15:15:24.653000", "step_id": "1", "step_name": "Set database to read only", "subsystem": "TSQL", @@ -136,6 +173,22 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", @@ -154,6 +207,27 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63", + "urn": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "container", "entityUrn": "urn:li:container:7da983a1581c33cce8a106587b150f02", @@ -2103,8 +2177,8 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { - "view_definition": "CREATE VIEW Foo.PersonsView AS SELECT * FROM Foo.Persons;\n", - "is_view": "True" + "is_view": "True", + "view_definition": "CREATE VIEW Foo.PersonsView AS SELECT * FROM Foo.Persons;\n" }, "name": "PersonsView", "tags": [] @@ -2269,6 +2343,43 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63", + "urn": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", @@ -2282,8 +2393,8 @@ "code": "CREATE PROCEDURE [Foo].[Proc.With.SpecialChar] @ID INT\nAS\n SELECT @ID AS ThatDB;\n", "input parameters": "['@ID']", "parameter @ID": "{'type': 'int'}", - "date_created": "2024-12-05 16:44:43.800000", - "date_modified": "2024-12-05 16:44:43.800000" + "date_created": "2024-12-20 15:15:24.290000", + "date_modified": "2024-12-20 15:15:24.290000" }, "externalUrl": "", "name": "DemoData.Foo.Proc.With.SpecialChar", @@ -2298,6 +2409,43 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63", + "urn": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "container", "entityUrn": "urn:li:container:250ce23f940485303fa5e5d4f5194975", diff --git a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_to_file.json b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_to_file.json index b36188405e7e1..c2289f954a36e 100644 --- a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_to_file.json +++ b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_to_file.json @@ -112,6 +112,22 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,my-instance.Weekly Demo Data Backup,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataFlow", "entityUrn": "urn:li:dataFlow:(mssql,my-instance.Weekly Demo Data Backup,PROD)", @@ -129,6 +145,31 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,my-instance.Weekly Demo Data Backup,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" + }, + { + "id": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a", + "urn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", @@ -137,11 +178,11 @@ "aspect": { "json": { "customProperties": { - "job_id": "b8907be7-52f5-4df4-a870-f4fe0679ec45", + "job_id": "ab960f9d-30f3-4ced-b558-4f9b6671b6dd", "job_name": "Weekly Demo Data Backup", "description": "No description available.", - "date_created": "2024-12-19 12:34:45.843000", - "date_modified": "2024-12-19 12:34:46.017000", + "date_created": "2024-12-20 15:15:24.483000", + "date_modified": "2024-12-20 15:15:24.653000", "step_id": "1", "step_name": "Set database to read only", "subsystem": "TSQL", @@ -160,6 +201,22 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", @@ -195,6 +252,31 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" + }, + { + "id": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a", + "urn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "container", "entityUrn": "urn:li:container:5726a09b23f60be6f661206c879a3683", @@ -2502,6 +2584,22 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,my-instance.DemoData.Foo.stored_procedures,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataFlow", "entityUrn": "urn:li:dataFlow:(mssql,my-instance.DemoData.Foo.stored_procedures,PROD)", @@ -2519,6 +2617,31 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,my-instance.DemoData.Foo.stored_procedures,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" + }, + { + "id": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a", + "urn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", @@ -2532,8 +2655,8 @@ "code": "CREATE PROCEDURE [Foo].[Proc.With.SpecialChar] @ID INT\nAS\n SELECT @ID AS ThatDB;\n", "input parameters": "['@ID']", "parameter @ID": "{'type': 'int'}", - "date_created": "2024-12-19 12:34:45.660000", - "date_modified": "2024-12-19 12:34:45.660000" + "date_created": "2024-12-20 15:15:24.290000", + "date_modified": "2024-12-20 15:15:24.290000" }, "externalUrl": "", "name": "DemoData.Foo.Proc.With.SpecialChar", @@ -2548,6 +2671,22 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", @@ -2565,6 +2704,31 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" + }, + { + "id": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a", + "urn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.DemoData.Foo.stored_procedures,PROD),NewProc)", @@ -2577,8 +2741,8 @@ "depending_on_procedure": "{}", "code": "CREATE PROCEDURE [Foo].[NewProc]\n AS\n BEGIN\n --insert into items table from salesreason table\n insert into Foo.Items (ID, ItemName)\n SELECT TempID, Name\n FROM Foo.SalesReason;\n\n\n IF OBJECT_ID('Foo.age_dist', 'U') IS NULL\n BEGIN\n -- Create and populate if table doesn't exist\n SELECT Age, COUNT(*) as Count\n INTO Foo.age_dist\n FROM Foo.Persons\n GROUP BY Age\n END\n ELSE\n BEGIN\n -- Update existing table\n TRUNCATE TABLE Foo.age_dist;\n\n INSERT INTO Foo.age_dist (Age, Count)\n SELECT Age, COUNT(*) as Count\n FROM Foo.Persons\n GROUP BY Age\n END\n\n SELECT ID, Age INTO #TEMPTABLE FROM NewData.FooNew.PersonsNew\n \n UPDATE DemoData.Foo.Persons\n SET Age = t.Age\n FROM DemoData.Foo.Persons p\n JOIN #TEMPTABLE t ON p.ID = t.ID\n\n END\n", "input parameters": "[]", - "date_created": "2024-12-19 12:34:45.667000", - "date_modified": "2024-12-19 12:34:45.667000" + "date_created": "2024-12-20 15:15:24.300000", + "date_modified": "2024-12-20 15:15:24.300000" }, "externalUrl": "", "name": "DemoData.Foo.NewProc", @@ -2593,6 +2757,22 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.DemoData.Foo.stored_procedures,PROD),NewProc)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.DemoData.Foo.stored_procedures,PROD),NewProc)", @@ -2610,6 +2790,31 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.DemoData.Foo.stored_procedures,PROD),NewProc)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" + }, + { + "id": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a", + "urn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "container", "entityUrn": "urn:li:container:5631370915311469374ef3cb5f0ebbf0", diff --git a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_with_lower_case_urn.json b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_with_lower_case_urn.json index ebcadcc11dcbf..4db18dae27b7e 100644 --- a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_with_lower_case_urn.json +++ b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_with_lower_case_urn.json @@ -105,6 +105,43 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63", + "urn": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", @@ -113,11 +150,11 @@ "aspect": { "json": { "customProperties": { - "job_id": "4130c37d-146c-43da-a671-dd9a413a44b3", + "job_id": "ab960f9d-30f3-4ced-b558-4f9b6671b6dd", "job_name": "Weekly Demo Data Backup", "description": "No description available.", - "date_created": "2024-11-22 12:58:03.260000", - "date_modified": "2024-11-22 12:58:03.440000", + "date_created": "2024-12-20 15:15:24.483000", + "date_modified": "2024-12-20 15:15:24.653000", "step_id": "1", "step_name": "Set database to read only", "subsystem": "TSQL", @@ -136,6 +173,22 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", @@ -154,6 +207,27 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63", + "urn": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "container", "entityUrn": "urn:li:container:7da983a1581c33cce8a106587b150f02", @@ -2103,8 +2177,8 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { - "view_definition": "CREATE VIEW Foo.PersonsView AS SELECT * FROM Foo.Persons;\n", - "is_view": "True" + "is_view": "True", + "view_definition": "CREATE VIEW Foo.PersonsView AS SELECT * FROM Foo.Persons;\n" }, "name": "PersonsView", "tags": [] @@ -2269,6 +2343,43 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63", + "urn": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", @@ -2282,8 +2393,8 @@ "code": "CREATE PROCEDURE [Foo].[Proc.With.SpecialChar] @ID INT\nAS\n SELECT @ID AS ThatDB;\n", "input parameters": "['@ID']", "parameter @ID": "{'type': 'int'}", - "date_created": "2024-11-22 12:58:03.137000", - "date_modified": "2024-11-22 12:58:03.137000" + "date_created": "2024-12-20 15:15:24.290000", + "date_modified": "2024-12-20 15:15:24.290000" }, "externalUrl": "", "name": "DemoData.Foo.Proc.With.SpecialChar", @@ -2298,6 +2409,43 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63", + "urn": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),NewProc)", @@ -2310,8 +2458,8 @@ "depending_on_procedure": "{}", "code": "CREATE PROCEDURE [Foo].[NewProc]\n AS\n BEGIN\n --insert into items table from salesreason table\n insert into Foo.Items (ID, ItemName)\n SELECT TempID, Name\n FROM Foo.SalesReason;\n\n\n IF OBJECT_ID('Foo.age_dist', 'U') IS NULL\n BEGIN\n -- Create and populate if table doesn't exist\n SELECT Age, COUNT(*) as Count\n INTO Foo.age_dist\n FROM Foo.Persons\n GROUP BY Age\n END\n ELSE\n BEGIN\n -- Update existing table\n TRUNCATE TABLE Foo.age_dist;\n\n INSERT INTO Foo.age_dist (Age, Count)\n SELECT Age, COUNT(*) as Count\n FROM Foo.Persons\n GROUP BY Age\n END\n\n SELECT ID, Age INTO #TEMPTABLE FROM NewData.FooNew.PersonsNew\n \n UPDATE DemoData.Foo.Persons\n SET Age = t.Age\n FROM DemoData.Foo.Persons p\n JOIN #TEMPTABLE t ON p.ID = t.ID\n\n END\n", "input parameters": "[]", - "date_created": "2024-11-22 12:58:03.140000", - "date_modified": "2024-11-22 12:58:03.140000" + "date_created": "2024-12-20 15:15:24.300000", + "date_modified": "2024-12-20 15:15:24.300000" }, "externalUrl": "", "name": "DemoData.Foo.NewProc", @@ -2326,6 +2474,43 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),NewProc)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),NewProc)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63", + "urn": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "container", "entityUrn": "urn:li:container:250ce23f940485303fa5e5d4f5194975", @@ -4427,8 +4612,8 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { - "view_definition": "CREATE VIEW FooNew.View1 AS\nSELECT LastName, FirstName\nFROM FooNew.PersonsNew\nWHERE Age > 18\n", - "is_view": "True" + "is_view": "True", + "view_definition": "CREATE VIEW FooNew.View1 AS\nSELECT LastName, FirstName\nFROM FooNew.PersonsNew\nWHERE Age > 18\n" }, "name": "View1", "tags": [] From ff262bc65e7ab3e067f51a412cfb40db6e726fea Mon Sep 17 00:00:00 2001 From: Aseem Bansal Date: Sun, 22 Dec 2024 18:24:18 +0530 Subject: [PATCH 14/39] Revert "fix(mssql): adds missing containers for dataflow and datajob entities, required for browse paths v2 generation" (#12201) --- .../ingestion/source/sql/mssql/job_models.py | 26 --- .../ingestion/source/sql/mssql/source.py | 10 - .../golden_mces_mssql_no_db_to_file.json | 207 +---------------- .../golden_mces_mssql_no_db_with_filter.json | 162 +------------ .../golden_mces_mssql_to_file.json | 219 +----------------- ...golden_mces_mssql_with_lower_case_urn.json | 207 +---------------- 6 files changed, 36 insertions(+), 795 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/job_models.py b/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/job_models.py index 0cd6261151928..d3941e7add0fd 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/job_models.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/job_models.py @@ -7,9 +7,7 @@ make_data_platform_urn, make_dataplatform_instance_urn, ) -from datahub.emitter.mcp_builder import DatabaseKey from datahub.metadata.schema_classes import ( - ContainerClass, DataFlowInfoClass, DataJobInfoClass, DataJobInputOutputClass, @@ -212,18 +210,6 @@ def as_datajob_info_aspect(self) -> DataJobInfoClass: status=self.status, ) - @property - def as_container_aspect(self) -> ContainerClass: - databaseKey = DatabaseKey( - platform=self.entity.flow.orchestrator, - instance=self.entity.flow.platform_instance - if self.entity.flow.platform_instance - else None, - env=self.entity.flow.env, - database=self.entity.flow.db, - ) - return ContainerClass(container=databaseKey.as_urn()) - @property def as_maybe_platform_instance_aspect(self) -> Optional[DataPlatformInstanceClass]: if self.entity.flow.platform_instance: @@ -271,18 +257,6 @@ def as_dataflow_info_aspect(self) -> DataFlowInfoClass: externalUrl=self.external_url, ) - @property - def as_container_aspect(self) -> ContainerClass: - databaseKey = DatabaseKey( - platform=self.entity.orchestrator, - instance=self.entity.platform_instance - if self.entity.platform_instance - else None, - env=self.entity.env, - database=self.entity.db, - ) - return ContainerClass(container=databaseKey.as_urn()) - @property def as_maybe_platform_instance_aspect(self) -> Optional[DataPlatformInstanceClass]: if self.entity.platform_instance: diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/source.py b/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/source.py index 547adcc8eccc9..9d8b67041998c 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/source.py @@ -639,11 +639,6 @@ def construct_job_workunits( aspect=data_job.as_datajob_info_aspect, ).as_workunit() - yield MetadataChangeProposalWrapper( - entityUrn=data_job.urn, - aspect=data_job.as_container_aspect, - ).as_workunit() - data_platform_instance_aspect = data_job.as_maybe_platform_instance_aspect if data_platform_instance_aspect: yield MetadataChangeProposalWrapper( @@ -667,11 +662,6 @@ def construct_flow_workunits( aspect=data_flow.as_dataflow_info_aspect, ).as_workunit() - yield MetadataChangeProposalWrapper( - entityUrn=data_flow.urn, - aspect=data_flow.as_container_aspect, - ).as_workunit() - data_platform_instance_aspect = data_flow.as_maybe_platform_instance_aspect if data_platform_instance_aspect: yield MetadataChangeProposalWrapper( diff --git a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_to_file.json b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_to_file.json index 720ef0b392945..72dcda25c1296 100644 --- a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_to_file.json +++ b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_to_file.json @@ -105,43 +105,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63", - "urn": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", @@ -150,11 +113,11 @@ "aspect": { "json": { "customProperties": { - "job_id": "ab960f9d-30f3-4ced-b558-4f9b6671b6dd", + "job_id": "c2d77890-83ba-435f-879b-1c77fa38dd47", "job_name": "Weekly Demo Data Backup", "description": "No description available.", - "date_created": "2024-12-20 15:15:24.483000", - "date_modified": "2024-12-20 15:15:24.653000", + "date_created": "2024-12-05 16:44:43.910000", + "date_modified": "2024-12-05 16:44:44.043000", "step_id": "1", "step_name": "Set database to read only", "subsystem": "TSQL", @@ -173,22 +136,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", @@ -207,27 +154,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63", - "urn": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "container", "entityUrn": "urn:li:container:7da983a1581c33cce8a106587b150f02", @@ -2177,8 +2103,8 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { - "is_view": "True", - "view_definition": "CREATE VIEW Foo.PersonsView AS SELECT * FROM Foo.Persons;\n" + "view_definition": "CREATE VIEW Foo.PersonsView AS SELECT * FROM Foo.Persons;\n", + "is_view": "True" }, "name": "PersonsView", "tags": [] @@ -2343,43 +2269,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63", - "urn": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", @@ -2393,8 +2282,8 @@ "code": "CREATE PROCEDURE [Foo].[Proc.With.SpecialChar] @ID INT\nAS\n SELECT @ID AS ThatDB;\n", "input parameters": "['@ID']", "parameter @ID": "{'type': 'int'}", - "date_created": "2024-12-20 15:15:24.290000", - "date_modified": "2024-12-20 15:15:24.290000" + "date_created": "2024-12-05 16:44:43.800000", + "date_modified": "2024-12-05 16:44:43.800000" }, "externalUrl": "", "name": "DemoData.Foo.Proc.With.SpecialChar", @@ -2409,43 +2298,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63", - "urn": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),NewProc)", @@ -2458,8 +2310,8 @@ "depending_on_procedure": "{}", "code": "CREATE PROCEDURE [Foo].[NewProc]\n AS\n BEGIN\n --insert into items table from salesreason table\n insert into Foo.Items (ID, ItemName)\n SELECT TempID, Name\n FROM Foo.SalesReason;\n\n\n IF OBJECT_ID('Foo.age_dist', 'U') IS NULL\n BEGIN\n -- Create and populate if table doesn't exist\n SELECT Age, COUNT(*) as Count\n INTO Foo.age_dist\n FROM Foo.Persons\n GROUP BY Age\n END\n ELSE\n BEGIN\n -- Update existing table\n TRUNCATE TABLE Foo.age_dist;\n\n INSERT INTO Foo.age_dist (Age, Count)\n SELECT Age, COUNT(*) as Count\n FROM Foo.Persons\n GROUP BY Age\n END\n\n SELECT ID, Age INTO #TEMPTABLE FROM NewData.FooNew.PersonsNew\n \n UPDATE DemoData.Foo.Persons\n SET Age = t.Age\n FROM DemoData.Foo.Persons p\n JOIN #TEMPTABLE t ON p.ID = t.ID\n\n END\n", "input parameters": "[]", - "date_created": "2024-12-20 15:15:24.300000", - "date_modified": "2024-12-20 15:15:24.300000" + "date_created": "2024-12-05 16:44:43.803000", + "date_modified": "2024-12-05 16:44:43.803000" }, "externalUrl": "", "name": "DemoData.Foo.NewProc", @@ -2474,43 +2326,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),NewProc)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),NewProc)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63", - "urn": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "container", "entityUrn": "urn:li:container:250ce23f940485303fa5e5d4f5194975", @@ -4612,8 +4427,8 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { - "is_view": "True", - "view_definition": "CREATE VIEW FooNew.View1 AS\nSELECT LastName, FirstName\nFROM FooNew.PersonsNew\nWHERE Age > 18\n" + "view_definition": "CREATE VIEW FooNew.View1 AS\nSELECT LastName, FirstName\nFROM FooNew.PersonsNew\nWHERE Age > 18\n", + "is_view": "True" }, "name": "View1", "tags": [] diff --git a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_with_filter.json b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_with_filter.json index cf3abbfc62997..0df89ff1eb94d 100644 --- a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_with_filter.json +++ b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_with_filter.json @@ -105,43 +105,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63", - "urn": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", @@ -150,11 +113,11 @@ "aspect": { "json": { "customProperties": { - "job_id": "ab960f9d-30f3-4ced-b558-4f9b6671b6dd", + "job_id": "c2d77890-83ba-435f-879b-1c77fa38dd47", "job_name": "Weekly Demo Data Backup", "description": "No description available.", - "date_created": "2024-12-20 15:15:24.483000", - "date_modified": "2024-12-20 15:15:24.653000", + "date_created": "2024-12-05 16:44:43.910000", + "date_modified": "2024-12-05 16:44:44.043000", "step_id": "1", "step_name": "Set database to read only", "subsystem": "TSQL", @@ -173,22 +136,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", @@ -207,27 +154,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63", - "urn": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "container", "entityUrn": "urn:li:container:7da983a1581c33cce8a106587b150f02", @@ -2177,8 +2103,8 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { - "is_view": "True", - "view_definition": "CREATE VIEW Foo.PersonsView AS SELECT * FROM Foo.Persons;\n" + "view_definition": "CREATE VIEW Foo.PersonsView AS SELECT * FROM Foo.Persons;\n", + "is_view": "True" }, "name": "PersonsView", "tags": [] @@ -2343,43 +2269,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63", - "urn": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", @@ -2393,8 +2282,8 @@ "code": "CREATE PROCEDURE [Foo].[Proc.With.SpecialChar] @ID INT\nAS\n SELECT @ID AS ThatDB;\n", "input parameters": "['@ID']", "parameter @ID": "{'type': 'int'}", - "date_created": "2024-12-20 15:15:24.290000", - "date_modified": "2024-12-20 15:15:24.290000" + "date_created": "2024-12-05 16:44:43.800000", + "date_modified": "2024-12-05 16:44:43.800000" }, "externalUrl": "", "name": "DemoData.Foo.Proc.With.SpecialChar", @@ -2409,43 +2298,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63", - "urn": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "container", "entityUrn": "urn:li:container:250ce23f940485303fa5e5d4f5194975", diff --git a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_to_file.json b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_to_file.json index c2289f954a36e..b36188405e7e1 100644 --- a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_to_file.json +++ b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_to_file.json @@ -112,22 +112,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(mssql,my-instance.Weekly Demo Data Backup,PROD)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "dataFlow", "entityUrn": "urn:li:dataFlow:(mssql,my-instance.Weekly Demo Data Backup,PROD)", @@ -145,31 +129,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(mssql,my-instance.Weekly Demo Data Backup,PROD)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)", - "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" - }, - { - "id": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a", - "urn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", @@ -178,11 +137,11 @@ "aspect": { "json": { "customProperties": { - "job_id": "ab960f9d-30f3-4ced-b558-4f9b6671b6dd", + "job_id": "b8907be7-52f5-4df4-a870-f4fe0679ec45", "job_name": "Weekly Demo Data Backup", "description": "No description available.", - "date_created": "2024-12-20 15:15:24.483000", - "date_modified": "2024-12-20 15:15:24.653000", + "date_created": "2024-12-19 12:34:45.843000", + "date_modified": "2024-12-19 12:34:46.017000", "step_id": "1", "step_name": "Set database to read only", "subsystem": "TSQL", @@ -201,22 +160,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", @@ -252,31 +195,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)", - "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" - }, - { - "id": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a", - "urn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "container", "entityUrn": "urn:li:container:5726a09b23f60be6f661206c879a3683", @@ -2584,22 +2502,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(mssql,my-instance.DemoData.Foo.stored_procedures,PROD)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "dataFlow", "entityUrn": "urn:li:dataFlow:(mssql,my-instance.DemoData.Foo.stored_procedures,PROD)", @@ -2617,31 +2519,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(mssql,my-instance.DemoData.Foo.stored_procedures,PROD)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)", - "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" - }, - { - "id": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a", - "urn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", @@ -2655,8 +2532,8 @@ "code": "CREATE PROCEDURE [Foo].[Proc.With.SpecialChar] @ID INT\nAS\n SELECT @ID AS ThatDB;\n", "input parameters": "['@ID']", "parameter @ID": "{'type': 'int'}", - "date_created": "2024-12-20 15:15:24.290000", - "date_modified": "2024-12-20 15:15:24.290000" + "date_created": "2024-12-19 12:34:45.660000", + "date_modified": "2024-12-19 12:34:45.660000" }, "externalUrl": "", "name": "DemoData.Foo.Proc.With.SpecialChar", @@ -2671,22 +2548,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", @@ -2704,31 +2565,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)", - "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" - }, - { - "id": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a", - "urn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.DemoData.Foo.stored_procedures,PROD),NewProc)", @@ -2741,8 +2577,8 @@ "depending_on_procedure": "{}", "code": "CREATE PROCEDURE [Foo].[NewProc]\n AS\n BEGIN\n --insert into items table from salesreason table\n insert into Foo.Items (ID, ItemName)\n SELECT TempID, Name\n FROM Foo.SalesReason;\n\n\n IF OBJECT_ID('Foo.age_dist', 'U') IS NULL\n BEGIN\n -- Create and populate if table doesn't exist\n SELECT Age, COUNT(*) as Count\n INTO Foo.age_dist\n FROM Foo.Persons\n GROUP BY Age\n END\n ELSE\n BEGIN\n -- Update existing table\n TRUNCATE TABLE Foo.age_dist;\n\n INSERT INTO Foo.age_dist (Age, Count)\n SELECT Age, COUNT(*) as Count\n FROM Foo.Persons\n GROUP BY Age\n END\n\n SELECT ID, Age INTO #TEMPTABLE FROM NewData.FooNew.PersonsNew\n \n UPDATE DemoData.Foo.Persons\n SET Age = t.Age\n FROM DemoData.Foo.Persons p\n JOIN #TEMPTABLE t ON p.ID = t.ID\n\n END\n", "input parameters": "[]", - "date_created": "2024-12-20 15:15:24.300000", - "date_modified": "2024-12-20 15:15:24.300000" + "date_created": "2024-12-19 12:34:45.667000", + "date_modified": "2024-12-19 12:34:45.667000" }, "externalUrl": "", "name": "DemoData.Foo.NewProc", @@ -2757,22 +2593,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.DemoData.Foo.stored_procedures,PROD),NewProc)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.DemoData.Foo.stored_procedures,PROD),NewProc)", @@ -2790,31 +2610,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.DemoData.Foo.stored_procedures,PROD),NewProc)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)", - "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" - }, - { - "id": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a", - "urn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "container", "entityUrn": "urn:li:container:5631370915311469374ef3cb5f0ebbf0", diff --git a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_with_lower_case_urn.json b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_with_lower_case_urn.json index 4db18dae27b7e..ebcadcc11dcbf 100644 --- a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_with_lower_case_urn.json +++ b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_with_lower_case_urn.json @@ -105,43 +105,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63", - "urn": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", @@ -150,11 +113,11 @@ "aspect": { "json": { "customProperties": { - "job_id": "ab960f9d-30f3-4ced-b558-4f9b6671b6dd", + "job_id": "4130c37d-146c-43da-a671-dd9a413a44b3", "job_name": "Weekly Demo Data Backup", "description": "No description available.", - "date_created": "2024-12-20 15:15:24.483000", - "date_modified": "2024-12-20 15:15:24.653000", + "date_created": "2024-11-22 12:58:03.260000", + "date_modified": "2024-11-22 12:58:03.440000", "step_id": "1", "step_name": "Set database to read only", "subsystem": "TSQL", @@ -173,22 +136,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", @@ -207,27 +154,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63", - "urn": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "container", "entityUrn": "urn:li:container:7da983a1581c33cce8a106587b150f02", @@ -2177,8 +2103,8 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { - "is_view": "True", - "view_definition": "CREATE VIEW Foo.PersonsView AS SELECT * FROM Foo.Persons;\n" + "view_definition": "CREATE VIEW Foo.PersonsView AS SELECT * FROM Foo.Persons;\n", + "is_view": "True" }, "name": "PersonsView", "tags": [] @@ -2343,43 +2269,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63", - "urn": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", @@ -2393,8 +2282,8 @@ "code": "CREATE PROCEDURE [Foo].[Proc.With.SpecialChar] @ID INT\nAS\n SELECT @ID AS ThatDB;\n", "input parameters": "['@ID']", "parameter @ID": "{'type': 'int'}", - "date_created": "2024-12-20 15:15:24.290000", - "date_modified": "2024-12-20 15:15:24.290000" + "date_created": "2024-11-22 12:58:03.137000", + "date_modified": "2024-11-22 12:58:03.137000" }, "externalUrl": "", "name": "DemoData.Foo.Proc.With.SpecialChar", @@ -2409,43 +2298,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63", - "urn": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),NewProc)", @@ -2458,8 +2310,8 @@ "depending_on_procedure": "{}", "code": "CREATE PROCEDURE [Foo].[NewProc]\n AS\n BEGIN\n --insert into items table from salesreason table\n insert into Foo.Items (ID, ItemName)\n SELECT TempID, Name\n FROM Foo.SalesReason;\n\n\n IF OBJECT_ID('Foo.age_dist', 'U') IS NULL\n BEGIN\n -- Create and populate if table doesn't exist\n SELECT Age, COUNT(*) as Count\n INTO Foo.age_dist\n FROM Foo.Persons\n GROUP BY Age\n END\n ELSE\n BEGIN\n -- Update existing table\n TRUNCATE TABLE Foo.age_dist;\n\n INSERT INTO Foo.age_dist (Age, Count)\n SELECT Age, COUNT(*) as Count\n FROM Foo.Persons\n GROUP BY Age\n END\n\n SELECT ID, Age INTO #TEMPTABLE FROM NewData.FooNew.PersonsNew\n \n UPDATE DemoData.Foo.Persons\n SET Age = t.Age\n FROM DemoData.Foo.Persons p\n JOIN #TEMPTABLE t ON p.ID = t.ID\n\n END\n", "input parameters": "[]", - "date_created": "2024-12-20 15:15:24.300000", - "date_modified": "2024-12-20 15:15:24.300000" + "date_created": "2024-11-22 12:58:03.140000", + "date_modified": "2024-11-22 12:58:03.140000" }, "externalUrl": "", "name": "DemoData.Foo.NewProc", @@ -2474,43 +2326,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),NewProc)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),NewProc)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63", - "urn": "urn:li:container:a327c3b1f5aadd4524158aeb5121be63" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "container", "entityUrn": "urn:li:container:250ce23f940485303fa5e5d4f5194975", @@ -4612,8 +4427,8 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { - "is_view": "True", - "view_definition": "CREATE VIEW FooNew.View1 AS\nSELECT LastName, FirstName\nFROM FooNew.PersonsNew\nWHERE Age > 18\n" + "view_definition": "CREATE VIEW FooNew.View1 AS\nSELECT LastName, FirstName\nFROM FooNew.PersonsNew\nWHERE Age > 18\n", + "is_view": "True" }, "name": "View1", "tags": [] From 73dce9e4180d7beef1ea6c9a7c9eeedbc551d18a Mon Sep 17 00:00:00 2001 From: david-leifker <114954101+david-leifker@users.noreply.github.com> Date: Sun, 22 Dec 2024 10:28:19 -0600 Subject: [PATCH 15/39] =?UTF-8?q?chore(bump):=20bump=20node=20version=20lo?= =?UTF-8?q?ng=20term=20support=20release=20(build=20time=20=E2=80=A6=20(#1?= =?UTF-8?q?2199)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .github/workflows/build-and-test.yml | 2 +- .github/workflows/docker-unified.yml | 2 +- datahub-web-react/build.gradle | 3 +-- datahub-web-react/package.json | 2 +- docs-website/build.gradle | 2 +- smoke-test/build.gradle | 2 +- 6 files changed, 6 insertions(+), 7 deletions(-) diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml index 1b10fe6e74372..98071b536a336 100644 --- a/.github/workflows/build-and-test.yml +++ b/.github/workflows/build-and-test.yml @@ -110,7 +110,7 @@ jobs: run: | ./gradlew :datahub-frontend:build :datahub-web-react:build --parallel env: - NODE_OPTIONS: "--max-old-space-size=3072" + NODE_OPTIONS: "--max-old-space-size=4096" - name: Gradle compile (jdk8) for legacy Spark if: ${{ matrix.command == 'except_metadata_ingestion' && needs.setup.outputs.backend_change == 'true' }} run: | diff --git a/.github/workflows/docker-unified.yml b/.github/workflows/docker-unified.yml index 16a2d29e9fd85..03a9b3afc3bc5 100644 --- a/.github/workflows/docker-unified.yml +++ b/.github/workflows/docker-unified.yml @@ -446,7 +446,7 @@ jobs: ./gradlew :datahub-frontend:dist -x test -x yarnTest -x yarnLint --parallel mv ./datahub-frontend/build/distributions/datahub-frontend-*.zip datahub-frontend.zip env: - NODE_OPTIONS: "--max-old-space-size=3072" + NODE_OPTIONS: "--max-old-space-size=4096" - name: Build and push uses: ./.github/actions/docker-custom-build-and-push with: diff --git a/datahub-web-react/build.gradle b/datahub-web-react/build.gradle index b9fffce173c5c..bf1aa401e3f56 100644 --- a/datahub-web-react/build.gradle +++ b/datahub-web-react/build.gradle @@ -16,7 +16,7 @@ node { } // Version of node to use. - version = '21.2.0' + version = '22.12.0' // Version of Yarn to use. yarnVersion = '1.22.22' @@ -93,7 +93,6 @@ task yarnLintFix(type: YarnTask, dependsOn: [yarnInstall, yarnGenerate]) { } task yarnBuild(type: YarnTask, dependsOn: [yarnInstall, yarnGenerate]) { - environment = [NODE_OPTIONS: "--max-old-space-size=3072 --openssl-legacy-provider"] args = ['run', 'build'] outputs.cacheIf { true } diff --git a/datahub-web-react/package.json b/datahub-web-react/package.json index 31c10804482f0..2d1d667a89f14 100644 --- a/datahub-web-react/package.json +++ b/datahub-web-react/package.json @@ -90,7 +90,7 @@ "analyze": "source-map-explorer 'dist/assets/*.js'", "start": "yarn run generate && vite", "ec2-dev": "yarn run generate && CI=true;export CI;vite", - "build": "yarn run generate && NODE_OPTIONS='--max-old-space-size=3072 --openssl-legacy-provider' CI=false vite build", + "build": "yarn run generate && NODE_OPTIONS='--max-old-space-size=4096 --openssl-legacy-provider' CI=false vite build", "test": "vitest", "generate": "graphql-codegen --config codegen.yml", "lint": "eslint . --ext .ts,.tsx --quiet && yarn format-check && yarn type-check", diff --git a/docs-website/build.gradle b/docs-website/build.gradle index 797863d2019fb..1be790695e87e 100644 --- a/docs-website/build.gradle +++ b/docs-website/build.gradle @@ -14,7 +14,7 @@ node { } // Version of node to use. - version = '21.2.0' + version = '22.12.0' // Version of Yarn to use. yarnVersion = '1.22.22' diff --git a/smoke-test/build.gradle b/smoke-test/build.gradle index def3e814b2ba0..73ecdcb08ea14 100644 --- a/smoke-test/build.gradle +++ b/smoke-test/build.gradle @@ -16,7 +16,7 @@ node { } // Version of node to use. - version = '21.2.0' + version = '22.12.0' // Version of Yarn to use. yarnVersion = '1.22.22' From 0562c7a190c4548e29c7845fa44e9adf0248e4de Mon Sep 17 00:00:00 2001 From: Aseem Bansal Date: Mon, 23 Dec 2024 16:56:54 +0530 Subject: [PATCH 16/39] fix(ingest): exclude aspect from migration (#12206) --- .../src/datahub/ingestion/source/datahub/config.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/datahub/config.py b/metadata-ingestion/src/datahub/ingestion/source/datahub/config.py index a3304334cb1eb..cd3c2146e6d84 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/datahub/config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/datahub/config.py @@ -14,6 +14,17 @@ DEFAULT_DATABASE_TABLE_NAME = "metadata_aspect_v2" DEFAULT_KAFKA_TOPIC_NAME = "MetadataChangeLog_Timeseries_v1" DEFAULT_DATABASE_BATCH_SIZE = 10_000 +DEFAULT_EXCLUDE_ASPECTS = { + "dataHubIngestionSourceKey", + "dataHubIngestionSourceInfo", + "datahubIngestionRunSummary", + "datahubIngestionCheckpoint", + "dataHubSecretKey", + "dataHubSecretValue", + "globalSettingsKey", + "globalSettingsInfo", + "testResults", +} class DataHubSourceConfig(StatefulIngestionConfigBase): @@ -44,7 +55,7 @@ class DataHubSourceConfig(StatefulIngestionConfigBase): ) exclude_aspects: Set[str] = Field( - default_factory=set, + default=DEFAULT_EXCLUDE_ASPECTS, description="Set of aspect names to exclude from ingestion", ) From d06980f6f3421ac5d3a3fc21d5c15f3e3057338f Mon Sep 17 00:00:00 2001 From: Mayuri Nehate <33225191+mayurinehate@users.noreply.github.com> Date: Mon, 23 Dec 2024 19:11:40 +0530 Subject: [PATCH 17/39] fix(ingest/snowflake): handle empty snowflake column upstreams (#12207) --- .../source/snowflake/snowflake_lineage_v2.py | 6 ++--- .../unit/snowflake/test_snowflake_source.py | 24 +++++++++++++++++++ 2 files changed, 27 insertions(+), 3 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py index 69f28a0e6e595..b815a6584379a 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py @@ -4,7 +4,7 @@ from datetime import datetime from typing import Any, Collection, Iterable, List, Optional, Set, Tuple, Type -from pydantic import BaseModel, validator +from pydantic import BaseModel, Field, validator from datahub.configuration.datetimes import parse_absolute_time from datahub.ingestion.api.closeable import Closeable @@ -72,8 +72,8 @@ class ColumnUpstreamJob(BaseModel): class ColumnUpstreamLineage(BaseModel): - column_name: str - upstreams: List[ColumnUpstreamJob] + column_name: Optional[str] + upstreams: List[ColumnUpstreamJob] = Field(default_factory=list) class UpstreamTableNode(BaseModel): diff --git a/metadata-ingestion/tests/unit/snowflake/test_snowflake_source.py b/metadata-ingestion/tests/unit/snowflake/test_snowflake_source.py index c735feb539608..2ff85a08f052f 100644 --- a/metadata-ingestion/tests/unit/snowflake/test_snowflake_source.py +++ b/metadata-ingestion/tests/unit/snowflake/test_snowflake_source.py @@ -18,6 +18,7 @@ DEFAULT_TEMP_TABLES_PATTERNS, SnowflakeV2Config, ) +from datahub.ingestion.source.snowflake.snowflake_lineage_v2 import UpstreamLineageEdge from datahub.ingestion.source.snowflake.snowflake_query import ( SnowflakeQuery, create_deny_regex_sql_filter, @@ -664,3 +665,26 @@ def test_create_snowsight_base_url_ap_northeast_1(): def test_snowflake_utils() -> None: assert_doctest(datahub.ingestion.source.snowflake.snowflake_utils) + + +def test_snowflake_query_result_parsing(): + db_row = { + "DOWNSTREAM_TABLE_NAME": "db.schema.downstream_table", + "DOWNSTREAM_TABLE_DOMAIN": "Table", + "UPSTREAM_TABLES": [ + { + "query_id": "01b92f61-0611-c826-000d-0103cf9b5db7", + "upstream_object_domain": "Table", + "upstream_object_name": "db.schema.upstream_table", + } + ], + "UPSTREAM_COLUMNS": [{}], + "QUERIES": [ + { + "query_id": "01b92f61-0611-c826-000d-0103cf9b5db7", + "query_text": "Query test", + "start_time": "2022-12-01 19:56:34", + } + ], + } + assert UpstreamLineageEdge.parse_obj(db_row) From dd23f9e294a72076e2cbe241cd6ce18f205bac68 Mon Sep 17 00:00:00 2001 From: Aseem Bansal Date: Mon, 23 Dec 2024 21:28:18 +0530 Subject: [PATCH 18/39] fix(ui): null dereference (#12193) --- .../styled/ERModelRelationship/ERModelRelationUtils.tsx | 2 +- .../shared/tabs/Dataset/Queries/utils/filterQueries.ts | 6 +++--- .../shared/tabs/Dataset/Schema/utils/filterSchemaRows.ts | 2 +- .../source/executions/ExecutionRequestDetailsModal.tsx | 4 ++-- datahub-web-react/src/app/lineage/utils/titleUtils.ts | 4 ++-- .../src/app/search/context/SearchResultContext.tsx | 2 +- .../src/app/search/matches/MatchedFieldList.tsx | 2 +- .../src/app/search/matches/SearchTextHighlighter.tsx | 2 +- 8 files changed, 12 insertions(+), 12 deletions(-) diff --git a/datahub-web-react/src/app/entity/shared/components/styled/ERModelRelationship/ERModelRelationUtils.tsx b/datahub-web-react/src/app/entity/shared/components/styled/ERModelRelationship/ERModelRelationUtils.tsx index 0eb198aec4803..811ebf99b123a 100644 --- a/datahub-web-react/src/app/entity/shared/components/styled/ERModelRelationship/ERModelRelationUtils.tsx +++ b/datahub-web-react/src/app/entity/shared/components/styled/ERModelRelationship/ERModelRelationUtils.tsx @@ -68,6 +68,6 @@ export function getDatasetName(datainput: any): string { datainput?.editableProperties?.name || datainput?.properties?.name || datainput?.name || - datainput?.urn?.split(',').at(1) + datainput?.urn?.split(',')?.at(1) ); } diff --git a/datahub-web-react/src/app/entity/shared/tabs/Dataset/Queries/utils/filterQueries.ts b/datahub-web-react/src/app/entity/shared/tabs/Dataset/Queries/utils/filterQueries.ts index a8ec960ea2e08..fb97c8235cbe6 100644 --- a/datahub-web-react/src/app/entity/shared/tabs/Dataset/Queries/utils/filterQueries.ts +++ b/datahub-web-react/src/app/entity/shared/tabs/Dataset/Queries/utils/filterQueries.ts @@ -10,9 +10,9 @@ export const filterQueries = (filterText, queries: Query[]) => { const lowerFilterText = filterText.toLowerCase(); return queries.filter((query) => { return ( - query.title?.toLowerCase().includes(lowerFilterText) || - query.description?.toLowerCase().includes(lowerFilterText) || - query.query?.toLowerCase().includes(lowerFilterText) + query.title?.toLowerCase()?.includes(lowerFilterText) || + query.description?.toLowerCase()?.includes(lowerFilterText) || + query.query?.toLowerCase()?.includes(lowerFilterText) ); }); }; diff --git a/datahub-web-react/src/app/entity/shared/tabs/Dataset/Schema/utils/filterSchemaRows.ts b/datahub-web-react/src/app/entity/shared/tabs/Dataset/Schema/utils/filterSchemaRows.ts index 53b76d53f886a..9c0813fc2b85a 100644 --- a/datahub-web-react/src/app/entity/shared/tabs/Dataset/Schema/utils/filterSchemaRows.ts +++ b/datahub-web-react/src/app/entity/shared/tabs/Dataset/Schema/utils/filterSchemaRows.ts @@ -12,7 +12,7 @@ function matchesTagsOrTermsOrDescription(field: SchemaField, filterText: string, .toLocaleLowerCase() .includes(filterText), ) || - field.description?.toLocaleLowerCase().includes(filterText) + field.description?.toLocaleLowerCase()?.includes(filterText) ); } diff --git a/datahub-web-react/src/app/ingest/source/executions/ExecutionRequestDetailsModal.tsx b/datahub-web-react/src/app/ingest/source/executions/ExecutionRequestDetailsModal.tsx index a7e6f516bb794..f56eb06b6af14 100644 --- a/datahub-web-react/src/app/ingest/source/executions/ExecutionRequestDetailsModal.tsx +++ b/datahub-web-react/src/app/ingest/source/executions/ExecutionRequestDetailsModal.tsx @@ -129,7 +129,7 @@ export const ExecutionDetailsModal = ({ urn, open, onClose }: Props) => { downloadFile(output, `exec-${urn}.log`); }; - const logs = (showExpandedLogs && output) || output?.split('\n').slice(0, 5).join('\n'); + const logs = (showExpandedLogs && output) || output?.split('\n')?.slice(0, 5)?.join('\n'); const result = data?.executionRequest?.result as Partial; const status = getIngestionSourceStatus(result); @@ -163,7 +163,7 @@ export const ExecutionDetailsModal = ({ urn, open, onClose }: Props) => { } catch (e) { recipeYaml = ''; } - const recipe = showExpandedRecipe ? recipeYaml : recipeYaml?.split('\n').slice(0, 5).join('\n'); + const recipe = showExpandedRecipe ? recipeYaml : recipeYaml?.split('\n')?.slice(0, 5)?.join('\n'); const areLogsExpandable = output?.split(/\r\n|\r|\n/)?.length > 5; const isRecipeExpandable = recipeYaml?.split(/\r\n|\r|\n/)?.length > 5; diff --git a/datahub-web-react/src/app/lineage/utils/titleUtils.ts b/datahub-web-react/src/app/lineage/utils/titleUtils.ts index 6bd4cfea0f09a..8bd0cbda55b33 100644 --- a/datahub-web-react/src/app/lineage/utils/titleUtils.ts +++ b/datahub-web-react/src/app/lineage/utils/titleUtils.ts @@ -124,10 +124,10 @@ function truncate(input, length) { function getLastTokenOfTitle(title?: string): string { if (!title) return ''; - const lastToken = title?.split('.').slice(-1)[0]; + const lastToken = title?.split('.')?.slice(-1)?.[0]; // if the last token does not contain any content, the string should not be tokenized on `.` - if (lastToken.replace(/\s/g, '').length === 0) { + if (lastToken?.replace(/\s/g, '')?.length === 0) { return title; } diff --git a/datahub-web-react/src/app/search/context/SearchResultContext.tsx b/datahub-web-react/src/app/search/context/SearchResultContext.tsx index 68adead005149..961a50c1d4bfe 100644 --- a/datahub-web-react/src/app/search/context/SearchResultContext.tsx +++ b/datahub-web-react/src/app/search/context/SearchResultContext.tsx @@ -40,7 +40,7 @@ export const useSearchResult = () => { }; export const useEntityType = () => { - return useSearchResultContext()?.searchResult.entity.type; + return useSearchResultContext()?.searchResult?.entity?.type; }; export const useMatchedFields = () => { diff --git a/datahub-web-react/src/app/search/matches/MatchedFieldList.tsx b/datahub-web-react/src/app/search/matches/MatchedFieldList.tsx index 0bfe000dea366..9d77d446ff3b8 100644 --- a/datahub-web-react/src/app/search/matches/MatchedFieldList.tsx +++ b/datahub-web-react/src/app/search/matches/MatchedFieldList.tsx @@ -42,7 +42,7 @@ const RenderedField = ({ field: MatchedField; }) => { const entityRegistry = useEntityRegistry(); - const query = useSearchQuery()?.trim().toLowerCase(); + const query = useSearchQuery()?.trim()?.toLowerCase(); const customRenderedField = customFieldRenderer?.(field); if (customRenderedField) return {customRenderedField}; if (isHighlightableEntityField(field)) { diff --git a/datahub-web-react/src/app/search/matches/SearchTextHighlighter.tsx b/datahub-web-react/src/app/search/matches/SearchTextHighlighter.tsx index d8da1088ea89d..7a0a0e1e41a4b 100644 --- a/datahub-web-react/src/app/search/matches/SearchTextHighlighter.tsx +++ b/datahub-web-react/src/app/search/matches/SearchTextHighlighter.tsx @@ -23,7 +23,7 @@ const SearchTextHighlighter = ({ field, text, enableFullHighlight = false }: Pro const enableNameHighlight = appConfig.config.visualConfig.searchResult?.enableNameHighlight; const matchedFields = useMatchedFieldsByGroup(field); const hasMatchedField = !!matchedFields?.length; - const normalizedSearchQuery = useSearchQuery()?.trim().toLowerCase(); + const normalizedSearchQuery = useSearchQuery()?.trim()?.toLowerCase(); const normalizedText = text.trim().toLowerCase(); const hasSubstring = hasMatchedField && !!normalizedSearchQuery && normalizedText.includes(normalizedSearchQuery); const pattern = enableFullHighlight ? HIGHLIGHT_ALL_PATTERN : undefined; From dc82251afed92ed605ce6dcc7c956396c494ca29 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Mon, 23 Dec 2024 13:03:52 -0500 Subject: [PATCH 19/39] fix(ingest): quote asset urns in patch path (#12212) --- metadata-ingestion/src/datahub/specific/dataproduct.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/metadata-ingestion/src/datahub/specific/dataproduct.py b/metadata-ingestion/src/datahub/specific/dataproduct.py index 6b7e695b4d57e..f9830a4b23df0 100644 --- a/metadata-ingestion/src/datahub/specific/dataproduct.py +++ b/metadata-ingestion/src/datahub/specific/dataproduct.py @@ -131,7 +131,7 @@ def add_asset(self, asset_urn: str) -> "DataProductPatchBuilder": self._add_patch( DataProductProperties.ASPECT_NAME, "add", - path=f"/assets/{asset_urn}", + path=f"/assets/{self.quote(asset_urn)}", value=DataProductAssociation(destinationUrn=asset_urn), ) return self @@ -140,7 +140,7 @@ def remove_asset(self, asset_urn: str) -> "DataProductPatchBuilder": self._add_patch( DataProductProperties.ASPECT_NAME, "remove", - path=f"/assets/{asset_urn}", + path=f"/assets/{self.quote(asset_urn)}", value={}, ) return self From 4c0b568887c7a3c2aa8a1e1b888ce362ce768485 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Mon, 23 Dec 2024 13:04:06 -0500 Subject: [PATCH 20/39] feat(ingest): add sql parser trace mode (#12210) --- .../datahub/sql_parsing/sqlglot_lineage.py | 20 ++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/metadata-ingestion/src/datahub/sql_parsing/sqlglot_lineage.py b/metadata-ingestion/src/datahub/sql_parsing/sqlglot_lineage.py index f387618bfaec1..bf28ab0e7b229 100644 --- a/metadata-ingestion/src/datahub/sql_parsing/sqlglot_lineage.py +++ b/metadata-ingestion/src/datahub/sql_parsing/sqlglot_lineage.py @@ -66,6 +66,7 @@ "SQL_LINEAGE_TIMEOUT_ENABLED", True ) SQL_LINEAGE_TIMEOUT_SECONDS = 10 +SQL_PARSER_TRACE = get_boolean_env_variable("DATAHUB_SQL_PARSER_TRACE", False) # These rules are a subset of the rules in sqlglot.optimizer.optimizer.RULES. @@ -365,10 +366,11 @@ def _sqlglot_force_column_normalizer( return node - # logger.debug( - # "Prior to case normalization sql %s", - # statement.sql(pretty=True, dialect=dialect), - # ) + if SQL_PARSER_TRACE: + logger.debug( + "Prior to case normalization sql %s", + statement.sql(pretty=True, dialect=dialect), + ) statement = statement.transform(_sqlglot_force_column_normalizer, copy=False) # logger.debug( # "Sql after casing normalization %s", @@ -562,7 +564,7 @@ def _select_statement_cll( # noqa: C901 ) ) - # TODO: Also extract referenced columns (aka auxillary / non-SELECT lineage) + # TODO: Also extract referenced columns (aka auxiliary / non-SELECT lineage) except (sqlglot.errors.OptimizeError, ValueError, IndexError) as e: raise SqlUnderstandingError( f"sqlglot failed to compute some lineage: {e}" @@ -1022,6 +1024,14 @@ def _sqlglot_lineage_inner( logger.debug( f"Resolved {total_schemas_resolved} of {total_tables_discovered} table schemas" ) + if SQL_PARSER_TRACE: + for qualified_table, schema_info in table_name_schema_mapping.items(): + logger.debug( + "Table name %s resolved to %s with schema %s", + qualified_table, + table_name_urn_mapping[qualified_table], + schema_info, + ) column_lineage: Optional[List[_ColumnLineageInfo]] = None try: From b6ea974630d68c61eb7c5cd624ee013817de7bd6 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Mon, 23 Dec 2024 13:04:15 -0500 Subject: [PATCH 21/39] fix(ingest): preserve certs when converting emitter to graph (#12211) --- metadata-ingestion/src/datahub/ingestion/graph/client.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/graph/client.py b/metadata-ingestion/src/datahub/ingestion/graph/client.py index 4aa937639e959..ca9a41172e5b6 100644 --- a/metadata-ingestion/src/datahub/ingestion/graph/client.py +++ b/metadata-ingestion/src/datahub/ingestion/graph/client.py @@ -188,9 +188,12 @@ def from_emitter(cls, emitter: DatahubRestEmitter) -> "DataHubGraph": retry_max_times=emitter._retry_max_times, extra_headers=emitter._session.headers, disable_ssl_verification=emitter._session.verify is False, - # TODO: Support these headers. - # ca_certificate_path=emitter._ca_certificate_path, - # client_certificate_path=emitter._client_certificate_path, + ca_certificate_path=( + emitter._session.verify + if isinstance(emitter._session.verify, str) + else None + ), + client_certificate_path=emitter._session.cert, ) ) From 21ddb5538d08b64279f3526aa250ec489f5497ed Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Mon, 23 Dec 2024 16:32:49 -0500 Subject: [PATCH 22/39] fix(ingest/mode): move sql logic to view properties aspect (#12196) --- .../src/datahub/ingestion/source/mode.py | 21 ++++++--- .../integration/mode/mode_mces_golden.json | 43 ++++++++++++++++++- 2 files changed, 55 insertions(+), 9 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/mode.py b/metadata-ingestion/src/datahub/ingestion/source/mode.py index c1ab9271ce13a..ef0b499129f97 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/mode.py +++ b/metadata-ingestion/src/datahub/ingestion/source/mode.py @@ -98,6 +98,7 @@ TagPropertiesClass, UpstreamClass, UpstreamLineageClass, + ViewPropertiesClass, ) from datahub.metadata.urns import QueryUrn from datahub.sql_parsing.sqlglot_lineage import ( @@ -930,16 +931,13 @@ def construct_query_or_dataset( dataset_props = DatasetPropertiesClass( name=report_info.get("name") if is_mode_dataset else query_data.get("name"), - description=f"""### Source Code -``` sql -{query_data.get("raw_query")} -``` - """, + description=None, externalUrl=externalUrl, customProperties=self.get_custom_props_from_dict( query_data, [ - "id" "created_at", + "id", + "created_at", "updated_at", "last_run_id", "data_source_id", @@ -949,7 +947,6 @@ def construct_query_or_dataset( ], ), ) - yield ( MetadataChangeProposalWrapper( entityUrn=query_urn, @@ -957,6 +954,16 @@ def construct_query_or_dataset( ).as_workunit() ) + if raw_query := query_data.get("raw_query"): + yield MetadataChangeProposalWrapper( + entityUrn=query_urn, + aspect=ViewPropertiesClass( + viewLogic=raw_query, + viewLanguage=QueryLanguageClass.SQL, + materialized=False, + ), + ).as_workunit() + if is_mode_dataset: space_container_key = self.gen_space_key(space_token) yield from add_dataset_to_container( diff --git a/metadata-ingestion/tests/integration/mode/mode_mces_golden.json b/metadata-ingestion/tests/integration/mode/mode_mces_golden.json index ed00dc5734680..84dbdbe89f7b5 100644 --- a/metadata-ingestion/tests/integration/mode/mode_mces_golden.json +++ b/metadata-ingestion/tests/integration/mode/mode_mces_golden.json @@ -176,6 +176,7 @@ "datasets": [ "urn:li:dataset:(urn:li:dataPlatform:mode,5450544,PROD)" ], + "dashboards": [], "lastModified": { "created": { "time": 1639169724316, @@ -253,6 +254,8 @@ "aspect": { "json": { "customProperties": { + "id": "19780522", + "created_at": "2024-09-02T07:38:43.755Z", "updated_at": "2024-09-02T07:40:44.046Z", "last_run_id": "3535709679", "data_source_id": "44763", @@ -260,7 +263,6 @@ }, "externalUrl": "https://app.mode.com/acryl/datasets/24f66e1701b6", "name": "Dataset 1", - "description": "### Source Code\n``` sql\n-- Returns first 100 rows from DATAHUB_COMMUNITY.POSTGRES_PUBLIC.COMPANY\n SELECT \n\t\tAGE,\n\t\tID,\n\t\tNAME,\n\t\t_FIVETRAN_DELETED,\n\t\t_FIVETRAN_SYNCED\n FROM DATAHUB_COMMUNITY.POSTGRES_PUBLIC.COMPANY LIMIT 100;\n\n-- Returns first 100 rows from ETHAN_TEST_DB.PUBLIC.ACCOUNT_PHONE_NUMBER\n SELECT \n\t\tCOMMUNICATION_ACCOUNT_ID,\n\t\tID,\n\t\tMMS_CAPABLE,\n\t\tPHONE_NUMBER,\n\t\tSMS_CAPABLE,\n\t\tSTATUS,\n\t\tSTATUS_TLM,\n\t\tTLM,\n\t\tVOICE_CAPABLE,\n\t\tWHEN_CREATED\n FROM ETHAN_TEST_DB.PUBLIC.ACCOUNT_PHONE_NUMBER LIMIT 100;\n \n \n```\n ", "tags": [] } }, @@ -270,6 +272,24 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mode,5450544,PROD)", + "changeType": "UPSERT", + "aspectName": "viewProperties", + "aspect": { + "json": { + "materialized": false, + "viewLogic": "-- Returns first 100 rows from DATAHUB_COMMUNITY.POSTGRES_PUBLIC.COMPANY\n SELECT \n\t\tAGE,\n\t\tID,\n\t\tNAME,\n\t\t_FIVETRAN_DELETED,\n\t\t_FIVETRAN_SYNCED\n FROM DATAHUB_COMMUNITY.POSTGRES_PUBLIC.COMPANY LIMIT 100;\n\n-- Returns first 100 rows from ETHAN_TEST_DB.PUBLIC.ACCOUNT_PHONE_NUMBER\n SELECT \n\t\tCOMMUNICATION_ACCOUNT_ID,\n\t\tID,\n\t\tMMS_CAPABLE,\n\t\tPHONE_NUMBER,\n\t\tSMS_CAPABLE,\n\t\tSTATUS,\n\t\tSTATUS_TLM,\n\t\tTLM,\n\t\tVOICE_CAPABLE,\n\t\tWHEN_CREATED\n FROM ETHAN_TEST_DB.PUBLIC.ACCOUNT_PHONE_NUMBER LIMIT 100;\n \n ", + "viewLanguage": "SQL" + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "mode-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mode,5450544,PROD)", @@ -336,13 +356,14 @@ "aspect": { "json": { "customProperties": { + "id": "10149707", + "created_at": "2021-12-10T20:55:24.361Z", "updated_at": "2021-12-10T23:12:53.273Z", "last_run_id": "1897576958", "data_source_id": "34499" }, "externalUrl": "https://app.mode.com/acryl/reports/9d2da37fa91e/details/queries/6e26a9f3d4e2", "name": "Customer and staff", - "description": "### Source Code\n``` sql\nSELECT rental.*, staff.first_name \"Staff First Name\", staff.last_name \"Staff Last Name\" FROM {{ @join_on_definition as rental }} join staff on staff.staff_id = rental.staff_id where selected_id = {{ selected_id }} \n{% form %}\nselected_id:\n type: text\n default: my_id\n{% endform %}\n```\n ", "tags": [] } }, @@ -352,6 +373,24 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mode,10149707,PROD)", + "changeType": "UPSERT", + "aspectName": "viewProperties", + "aspect": { + "json": { + "materialized": false, + "viewLogic": "SELECT rental.*, staff.first_name \"Staff First Name\", staff.last_name \"Staff Last Name\" FROM {{ @join_on_definition as rental }} join staff on staff.staff_id = rental.staff_id where selected_id = {{ selected_id }} \n{% form %}\nselected_id:\n type: text\n default: my_id\n{% endform %}", + "viewLanguage": "SQL" + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "mode-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mode,10149707,PROD)", From 047644b888b121fa3feb10a5f33bdef60b1072ce Mon Sep 17 00:00:00 2001 From: Hyejin Yoon <0327jane@gmail.com> Date: Tue, 24 Dec 2024 10:06:35 +0900 Subject: [PATCH 23/39] feat: update mlflow-related metadata models (#12174) Co-authored-by: John Joyce Co-authored-by: John Joyce --- .../src/main/resources/entity.graphql | 196 +++++++++++++++++- .../dataprocess/DataProcessInstanceOutput.pdl | 2 +- .../DataProcessInstanceProperties.pdl | 2 +- .../ml/metadata/MLModelGroupProperties.pdl | 35 ++++ .../ml/metadata/MLModelProperties.pdl | 28 ++- .../ml/metadata/MLTrainingRunProperties.pdl | 36 ++++ .../src/main/resources/entity-registry.yml | 4 + .../com.linkedin.entity.aspects.snapshot.json | 54 +++-- ...com.linkedin.entity.entities.snapshot.json | 99 +++++++-- .../com.linkedin.entity.runs.snapshot.json | 54 +++-- ...nkedin.operations.operations.snapshot.json | 54 +++-- ...m.linkedin.platform.platform.snapshot.json | 99 +++++++-- 12 files changed, 568 insertions(+), 95 deletions(-) create mode 100644 metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLTrainingRunProperties.pdl diff --git a/datahub-graphql-core/src/main/resources/entity.graphql b/datahub-graphql-core/src/main/resources/entity.graphql index e086273068ee5..9abf4e16f12dd 100644 --- a/datahub-graphql-core/src/main/resources/entity.graphql +++ b/datahub-graphql-core/src/main/resources/entity.graphql @@ -262,8 +262,16 @@ type Query { Fetch all Business Attributes """ listBusinessAttributes(input: ListBusinessAttributesInput!): ListBusinessAttributesResult + + """ + Fetch a Data Process Instance by primary key (urn) + """ + dataProcessInstance(urn: String!): DataProcessInstance + + } + """ An ERModelRelationship is a high-level abstraction that dictates what datasets fields are erModelRelationshiped. """ @@ -9832,15 +9840,45 @@ type MLModelGroup implements EntityWithRelationships & Entity & BrowsableEntity privileges: EntityPrivileges } +""" +Properties describing a group of related ML models +""" type MLModelGroupProperties { + """ + Display name of the model group + """ + name: String + """ + Detailed description of the model group's purpose and contents + """ description: String - createdAt: Long + """ + When this model group was created + """ + created: AuditStamp + """ + When this model group was last modified + """ + lastModified: AuditStamp + + """ + Version identifier for this model group + """ version: VersionTag + """ + Custom key-value properties for the model group + """ customProperties: [CustomPropertiesEntry!] + + """ + Deprecated creation timestamp + @deprecated Use the 'created' field instead + """ + createdAt: Long @deprecated(reason: "Use `created` instead") } """ @@ -9990,40 +10028,103 @@ description: String } type MLMetric { + """ + Name of the metric (e.g. accuracy, precision, recall) + """ name: String + """ + Description of what this metric measures + """ description: String + """ + The computed value of the metric + """ value: String + """ + Timestamp when this metric was recorded + """ createdAt: Long } type MLModelProperties { + """ + The display name of the model used in the UI + """ + name: String! + """ + Detailed description of the model's purpose and characteristics + """ description: String - date: Long + """ + When the model was last modified + """ + lastModified: AuditStamp + """ + Version identifier for this model + """ version: String + """ + The type/category of ML model (e.g. classification, regression) + """ type: String + """ + Mapping of hyperparameter configurations + """ hyperParameters: HyperParameterMap - hyperParams: [MLHyperParam] + """ + List of hyperparameter settings used to train this model + """ + hyperParams: [MLHyperParam] + """ + Performance metrics from model training + """ trainingMetrics: [MLMetric] + """ + Names of ML features used by this model + """ mlFeatures: [String!] + """ + Tags for categorizing and searching models + """ tags: [String!] + """ + Model groups this model belongs to + """ groups: [MLModelGroup] + """ + Additional custom properties specific to this model + """ customProperties: [CustomPropertiesEntry!] + """ + URL to view this model in external system + """ externalUrl: String + + """ + When this model was created + """ + created: AuditStamp + + """ + Deprecated timestamp for model creation + @deprecated Use 'created' field instead + """ + date: Long @deprecated(reason: "Use `created` instead") } type MLFeatureProperties { @@ -12804,3 +12905,92 @@ type CronSchedule { """ timezone: String! } + + +""" +Properties describing a data process instance's execution metadata +""" +type DataProcessInstanceProperties { + """ + The display name of this process instance + """ + name: String! + + """ + URL to view this process instance in the external system + """ + externalUrl: String + + """ + When this process instance was created + """ + created: AuditStamp + + """ + Additional custom properties specific to this process instance + """ + customProperties: [CustomPropertiesEntry!] +} + +""" +Properties specific to an ML model training run instance +""" +type MLTrainingRunProperties { + """ + Unique identifier for this training run + """ + id: String + + """ + List of URLs to access training run outputs (e.g. model artifacts, logs) + """ + outputUrls: [String] + + """ + Hyperparameters used in this training run + """ + hyperParams: [MLHyperParam] + + """ + Performance metrics recorded during this training run + """ + trainingMetrics: [MLMetric] +} + +extend type DataProcessInstance { + + """ + Additional read only properties associated with the Data Job + """ + properties: DataProcessInstanceProperties + + """ + The specific instance of the data platform that this entity belongs to + """ + dataPlatformInstance: DataPlatformInstance + + """ + Sub Types that this entity implements + """ + subTypes: SubTypes + + """ + The parent container in which the entity resides + """ + container: Container + + """ + Standardized platform urn where the data process instance is defined + """ + platform: DataPlatform! + + """ + Recursively get the lineage of containers for this entity + """ + parentContainers: ParentContainersResult + + """ + Additional properties when subtype is Training Run + """ + mlTrainingRunProperties: MLTrainingRunProperties +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/dataprocess/DataProcessInstanceOutput.pdl b/metadata-models/src/main/pegasus/com/linkedin/dataprocess/DataProcessInstanceOutput.pdl index f33c41e63efed..fe782dbe01ca9 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/dataprocess/DataProcessInstanceOutput.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/dataprocess/DataProcessInstanceOutput.pdl @@ -15,7 +15,7 @@ record DataProcessInstanceOutput { @Relationship = { "/*": { "name": "Produces", - "entityTypes": [ "dataset" ] + "entityTypes": [ "dataset", "mlModel" ] } } @Searchable = { diff --git a/metadata-models/src/main/pegasus/com/linkedin/dataprocess/DataProcessInstanceProperties.pdl b/metadata-models/src/main/pegasus/com/linkedin/dataprocess/DataProcessInstanceProperties.pdl index c63cb1a97c017..5c6bfaecf1ef4 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/dataprocess/DataProcessInstanceProperties.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/dataprocess/DataProcessInstanceProperties.pdl @@ -52,4 +52,4 @@ record DataProcessInstanceProperties includes CustomProperties, ExternalReferenc } created: AuditStamp -} +} \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLModelGroupProperties.pdl b/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLModelGroupProperties.pdl index b54e430038082..81c5e7a240f61 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLModelGroupProperties.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLModelGroupProperties.pdl @@ -4,6 +4,7 @@ import com.linkedin.common.Urn import com.linkedin.common.Time import com.linkedin.common.VersionTag import com.linkedin.common.CustomProperties +import com.linkedin.common.TimeStamp /** * Properties associated with an ML Model Group @@ -13,6 +14,17 @@ import com.linkedin.common.CustomProperties } record MLModelGroupProperties includes CustomProperties { + /** + * Display name of the MLModelGroup + */ + @Searchable = { + "fieldType": "WORD_GRAM", + "enableAutocomplete": true, + "boostScore": 10.0, + "queryByDefault": true, + } + name: optional string + /** * Documentation of the MLModelGroup */ @@ -25,8 +37,31 @@ record MLModelGroupProperties includes CustomProperties { /** * Date when the MLModelGroup was developed */ + @deprecated createdAt: optional Time + /** + * Time and Actor who created the MLModelGroup + */ + created: optional TimeStamp + + /** + * Date when the MLModelGroup was last modified + */ + lastModified: optional TimeStamp + + /** + * List of jobs (if any) used to train the model group. Visible in Lineage. + */ + @Relationship = { + "/*": { + "name": "TrainedBy", + "entityTypes": [ "dataJob" ], + "isLineage": true + } + } + trainingJobs: optional array[Urn] + /** * Version of the MLModelGroup */ diff --git a/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLModelProperties.pdl b/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLModelProperties.pdl index 621a3e1747b50..d89d07384bba1 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLModelProperties.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLModelProperties.pdl @@ -6,6 +6,7 @@ import com.linkedin.common.Time import com.linkedin.common.VersionTag import com.linkedin.common.CustomProperties import com.linkedin.common.ExternalReference +import com.linkedin.common.TimeStamp /** * Properties associated with a ML Model @@ -15,6 +16,18 @@ import com.linkedin.common.ExternalReference } record MLModelProperties includes CustomProperties, ExternalReference { + /** + * Display name of the MLModel + */ + @Searchable = { + "fieldType": "WORD_GRAM", + "enableAutocomplete": true, + "boostScore": 10.0, + "queryByDefault": true, + } + name: optional string + + /** * Documentation of the MLModel */ @@ -27,8 +40,19 @@ record MLModelProperties includes CustomProperties, ExternalReference { /** * Date when the MLModel was developed */ + @deprecated date: optional Time + /** + * Audit stamp containing who created this and when + */ + created: optional TimeStamp + + /** + * Date when the MLModel was last modified + */ + lastModified: optional TimeStamp + /** * Version of the MLModel */ @@ -93,12 +117,12 @@ record MLModelProperties includes CustomProperties, ExternalReference { deployments: optional array[Urn] /** - * List of jobs (if any) used to train the model + * List of jobs (if any) used to train the model. Visible in Lineage. Note that ML Models can also be specified as the output of a specific Data Process Instances (runs) via the DataProcessInstanceOutputs aspect. */ @Relationship = { "/*": { "name": "TrainedBy", - "entityTypes": [ "dataJob" ], + "entityTypes": [ "dataJob", "dataProcessInstance" ], "isLineage": true } } diff --git a/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLTrainingRunProperties.pdl b/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLTrainingRunProperties.pdl new file mode 100644 index 0000000000000..f8b8eeafe908b --- /dev/null +++ b/metadata-models/src/main/pegasus/com/linkedin/ml/metadata/MLTrainingRunProperties.pdl @@ -0,0 +1,36 @@ +namespace com.linkedin.ml.metadata + +import com.linkedin.common.AuditStamp +import com.linkedin.common.CustomProperties +import com.linkedin.common.ExternalReference +import com.linkedin.common.Urn +import com.linkedin.common.JobFlowUrn +import com.linkedin.common.DataJobUrn +/** + * The inputs and outputs of this training run + */ +@Aspect = { + "name": "mlTrainingRunProperties", +} +record MLTrainingRunProperties includes CustomProperties, ExternalReference { + + /** + * Run Id of the ML Training Run + */ + id: optional string + + /** + * List of URLs for the Outputs of the ML Training Run + */ + outputUrls: optional array[string] + + /** + * Hyperparameters of the ML Training Run + */ + hyperParams: optional array[MLHyperParam] + + /** + * Metrics of the ML Training Run + */ + trainingMetrics: optional array[MLMetric] +} \ No newline at end of file diff --git a/metadata-models/src/main/resources/entity-registry.yml b/metadata-models/src/main/resources/entity-registry.yml index 1c3eb5b574e20..4fe170ced69f3 100644 --- a/metadata-models/src/main/resources/entity-registry.yml +++ b/metadata-models/src/main/resources/entity-registry.yml @@ -116,6 +116,10 @@ entities: - dataProcessInstanceRunEvent - status - testResults + - dataPlatformInstance + - subTypes + - container + - mlTrainingRunProperties - name: chart category: core keyAspect: chartKey diff --git a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.aspects.snapshot.json b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.aspects.snapshot.json index 827789130d8bb..1c713fd33884b 100644 --- a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.aspects.snapshot.json +++ b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.aspects.snapshot.json @@ -3826,12 +3826,23 @@ "type" : "record", "name" : "MLModelProperties", "namespace" : "com.linkedin.ml.metadata", - "doc" : "Properties associated with a ML Model", + "doc" : "Properties associated with a ML Model\r", "include" : [ "com.linkedin.common.CustomProperties", "com.linkedin.common.ExternalReference" ], "fields" : [ { + "name" : "name", + "type" : "string", + "doc" : "Display name of the MLModel\r", + "optional" : true, + "Searchable" : { + "boostScore" : 10.0, + "enableAutocomplete" : true, + "fieldType" : "WORD_GRAM", + "queryByDefault" : true + } + }, { "name" : "description", "type" : "string", - "doc" : "Documentation of the MLModel", + "doc" : "Documentation of the MLModel\r", "optional" : true, "Searchable" : { "fieldType" : "TEXT", @@ -3840,17 +3851,28 @@ }, { "name" : "date", "type" : "com.linkedin.common.Time", - "doc" : "Date when the MLModel was developed", + "doc" : "Date when the MLModel was developed\r", + "optional" : true, + "deprecated" : true + }, { + "name" : "created", + "type" : "com.linkedin.common.TimeStamp", + "doc" : "Audit stamp containing who created this and when\r", + "optional" : true + }, { + "name" : "lastModified", + "type" : "com.linkedin.common.TimeStamp", + "doc" : "Date when the MLModel was last modified\r", "optional" : true }, { "name" : "version", "type" : "com.linkedin.common.VersionTag", - "doc" : "Version of the MLModel", + "doc" : "Version of the MLModel\r", "optional" : true }, { "name" : "type", "type" : "string", - "doc" : "Type of Algorithm or MLModel such as whether it is a Naive Bayes classifier, Convolutional Neural Network, etc", + "doc" : "Type of Algorithm or MLModel such as whether it is a Naive Bayes classifier, Convolutional Neural Network, etc\r", "optional" : true, "Searchable" : { "fieldType" : "TEXT_PARTIAL" @@ -3866,7 +3888,7 @@ "ref" : [ "string", "int", "float", "double", "boolean" ] } }, - "doc" : "Hyper Parameters of the MLModel\n\nNOTE: these are deprecated in favor of hyperParams", + "doc" : "Hyper Parameters of the MLModel\r\n\r\nNOTE: these are deprecated in favor of hyperParams\r", "optional" : true }, { "name" : "hyperParams", @@ -3901,7 +3923,7 @@ } } }, - "doc" : "Hyperparameters of the MLModel", + "doc" : "Hyperparameters of the MLModel\r", "optional" : true }, { "name" : "trainingMetrics", @@ -3936,7 +3958,7 @@ } } }, - "doc" : "Metrics of the MLModel used in training", + "doc" : "Metrics of the MLModel used in training\r", "optional" : true }, { "name" : "onlineMetrics", @@ -3944,7 +3966,7 @@ "type" : "array", "items" : "MLMetric" }, - "doc" : "Metrics of the MLModel used in production", + "doc" : "Metrics of the MLModel used in production\r", "optional" : true }, { "name" : "mlFeatures", @@ -3952,7 +3974,7 @@ "type" : "array", "items" : "com.linkedin.common.MLFeatureUrn" }, - "doc" : "List of features used for MLModel training", + "doc" : "List of features used for MLModel training\r", "optional" : true, "Relationship" : { "/*" : { @@ -3967,7 +3989,7 @@ "type" : "array", "items" : "string" }, - "doc" : "Tags for the MLModel", + "doc" : "Tags for the MLModel\r", "default" : [ ] }, { "name" : "deployments", @@ -3975,7 +3997,7 @@ "type" : "array", "items" : "com.linkedin.common.Urn" }, - "doc" : "Deployments for the MLModel", + "doc" : "Deployments for the MLModel\r", "optional" : true, "Relationship" : { "/*" : { @@ -3989,11 +4011,11 @@ "type" : "array", "items" : "com.linkedin.common.Urn" }, - "doc" : "List of jobs (if any) used to train the model", + "doc" : "List of jobs (if any) used to train the model. Visible in Lineage. Note that ML Models can also be specified as the output of a specific Data Process Instances (runs) via the DataProcessInstanceOutputs aspect.\r", "optional" : true, "Relationship" : { "/*" : { - "entityTypes" : [ "dataJob" ], + "entityTypes" : [ "dataJob", "dataProcessInstance" ], "isLineage" : true, "name" : "TrainedBy" } @@ -4004,7 +4026,7 @@ "type" : "array", "items" : "com.linkedin.common.Urn" }, - "doc" : "List of jobs (if any) that use the model", + "doc" : "List of jobs (if any) that use the model\r", "optional" : true, "Relationship" : { "/*" : { @@ -4020,7 +4042,7 @@ "type" : "array", "items" : "com.linkedin.common.Urn" }, - "doc" : "Groups the model belongs to", + "doc" : "Groups the model belongs to\r", "optional" : true, "Relationship" : { "/*" : { diff --git a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entities.snapshot.json b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entities.snapshot.json index b549cef0af84b..77d4644f3c121 100644 --- a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entities.snapshot.json +++ b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entities.snapshot.json @@ -3984,12 +3984,23 @@ "type" : "record", "name" : "MLModelProperties", "namespace" : "com.linkedin.ml.metadata", - "doc" : "Properties associated with a ML Model", + "doc" : "Properties associated with a ML Model\r", "include" : [ "com.linkedin.common.CustomProperties", "com.linkedin.common.ExternalReference" ], "fields" : [ { + "name" : "name", + "type" : "string", + "doc" : "Display name of the MLModel\r", + "optional" : true, + "Searchable" : { + "boostScore" : 10.0, + "enableAutocomplete" : true, + "fieldType" : "WORD_GRAM", + "queryByDefault" : true + } + }, { "name" : "description", "type" : "string", - "doc" : "Documentation of the MLModel", + "doc" : "Documentation of the MLModel\r", "optional" : true, "Searchable" : { "fieldType" : "TEXT", @@ -3998,17 +4009,28 @@ }, { "name" : "date", "type" : "com.linkedin.common.Time", - "doc" : "Date when the MLModel was developed", + "doc" : "Date when the MLModel was developed\r", + "optional" : true, + "deprecated" : true + }, { + "name" : "created", + "type" : "com.linkedin.common.TimeStamp", + "doc" : "Audit stamp containing who created this and when\r", + "optional" : true + }, { + "name" : "lastModified", + "type" : "com.linkedin.common.TimeStamp", + "doc" : "Date when the MLModel was last modified\r", "optional" : true }, { "name" : "version", "type" : "com.linkedin.common.VersionTag", - "doc" : "Version of the MLModel", + "doc" : "Version of the MLModel\r", "optional" : true }, { "name" : "type", "type" : "string", - "doc" : "Type of Algorithm or MLModel such as whether it is a Naive Bayes classifier, Convolutional Neural Network, etc", + "doc" : "Type of Algorithm or MLModel such as whether it is a Naive Bayes classifier, Convolutional Neural Network, etc\r", "optional" : true, "Searchable" : { "fieldType" : "TEXT_PARTIAL" @@ -4024,7 +4046,7 @@ "ref" : [ "string", "int", "float", "double", "boolean" ] } }, - "doc" : "Hyper Parameters of the MLModel\n\nNOTE: these are deprecated in favor of hyperParams", + "doc" : "Hyper Parameters of the MLModel\r\n\r\nNOTE: these are deprecated in favor of hyperParams\r", "optional" : true }, { "name" : "hyperParams", @@ -4059,7 +4081,7 @@ } } }, - "doc" : "Hyperparameters of the MLModel", + "doc" : "Hyperparameters of the MLModel\r", "optional" : true }, { "name" : "trainingMetrics", @@ -4094,7 +4116,7 @@ } } }, - "doc" : "Metrics of the MLModel used in training", + "doc" : "Metrics of the MLModel used in training\r", "optional" : true }, { "name" : "onlineMetrics", @@ -4102,7 +4124,7 @@ "type" : "array", "items" : "MLMetric" }, - "doc" : "Metrics of the MLModel used in production", + "doc" : "Metrics of the MLModel used in production\r", "optional" : true }, { "name" : "mlFeatures", @@ -4110,7 +4132,7 @@ "type" : "array", "items" : "com.linkedin.common.MLFeatureUrn" }, - "doc" : "List of features used for MLModel training", + "doc" : "List of features used for MLModel training\r", "optional" : true, "Relationship" : { "/*" : { @@ -4125,7 +4147,7 @@ "type" : "array", "items" : "string" }, - "doc" : "Tags for the MLModel", + "doc" : "Tags for the MLModel\r", "default" : [ ] }, { "name" : "deployments", @@ -4133,7 +4155,7 @@ "type" : "array", "items" : "com.linkedin.common.Urn" }, - "doc" : "Deployments for the MLModel", + "doc" : "Deployments for the MLModel\r", "optional" : true, "Relationship" : { "/*" : { @@ -4147,11 +4169,11 @@ "type" : "array", "items" : "com.linkedin.common.Urn" }, - "doc" : "List of jobs (if any) used to train the model", + "doc" : "List of jobs (if any) used to train the model. Visible in Lineage. Note that ML Models can also be specified as the output of a specific Data Process Instances (runs) via the DataProcessInstanceOutputs aspect.\r", "optional" : true, "Relationship" : { "/*" : { - "entityTypes" : [ "dataJob" ], + "entityTypes" : [ "dataJob", "dataProcessInstance" ], "isLineage" : true, "name" : "TrainedBy" } @@ -4162,7 +4184,7 @@ "type" : "array", "items" : "com.linkedin.common.Urn" }, - "doc" : "List of jobs (if any) that use the model", + "doc" : "List of jobs (if any) that use the model\r", "optional" : true, "Relationship" : { "/*" : { @@ -4178,7 +4200,7 @@ "type" : "array", "items" : "com.linkedin.common.Urn" }, - "doc" : "Groups the model belongs to", + "doc" : "Groups the model belongs to\r", "optional" : true, "Relationship" : { "/*" : { @@ -4981,12 +5003,23 @@ "type" : "record", "name" : "MLModelGroupProperties", "namespace" : "com.linkedin.ml.metadata", - "doc" : "Properties associated with an ML Model Group", + "doc" : "Properties associated with an ML Model Group\r", "include" : [ "com.linkedin.common.CustomProperties" ], "fields" : [ { + "name" : "name", + "type" : "string", + "doc" : "Display name of the MLModelGroup\r", + "optional" : true, + "Searchable" : { + "boostScore" : 10.0, + "enableAutocomplete" : true, + "fieldType" : "WORD_GRAM", + "queryByDefault" : true + } + }, { "name" : "description", "type" : "string", - "doc" : "Documentation of the MLModelGroup", + "doc" : "Documentation of the MLModelGroup\r", "optional" : true, "Searchable" : { "fieldType" : "TEXT", @@ -4995,12 +5028,38 @@ }, { "name" : "createdAt", "type" : "com.linkedin.common.Time", - "doc" : "Date when the MLModelGroup was developed", + "doc" : "Date when the MLModelGroup was developed\r", + "optional" : true, + "deprecated" : true + }, { + "name" : "created", + "type" : "com.linkedin.common.TimeStamp", + "doc" : "Time and Actor who created the MLModelGroup\r", + "optional" : true + }, { + "name" : "lastModified", + "type" : "com.linkedin.common.TimeStamp", + "doc" : "Date when the MLModelGroup was last modified\r", "optional" : true + }, { + "name" : "trainingJobs", + "type" : { + "type" : "array", + "items" : "com.linkedin.common.Urn" + }, + "doc" : "List of jobs (if any) used to train the model group. Visible in Lineage.\r", + "optional" : true, + "Relationship" : { + "/*" : { + "entityTypes" : [ "dataJob" ], + "isLineage" : true, + "name" : "TrainedBy" + } + } }, { "name" : "version", "type" : "com.linkedin.common.VersionTag", - "doc" : "Version of the MLModelGroup", + "doc" : "Version of the MLModelGroup\r", "optional" : true } ], "Aspect" : { diff --git a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.runs.snapshot.json b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.runs.snapshot.json index c8be9d063eaea..8b6def75f7a66 100644 --- a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.runs.snapshot.json +++ b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.runs.snapshot.json @@ -3550,12 +3550,23 @@ "type" : "record", "name" : "MLModelProperties", "namespace" : "com.linkedin.ml.metadata", - "doc" : "Properties associated with a ML Model", + "doc" : "Properties associated with a ML Model\r", "include" : [ "com.linkedin.common.CustomProperties", "com.linkedin.common.ExternalReference" ], "fields" : [ { + "name" : "name", + "type" : "string", + "doc" : "Display name of the MLModel\r", + "optional" : true, + "Searchable" : { + "boostScore" : 10.0, + "enableAutocomplete" : true, + "fieldType" : "WORD_GRAM", + "queryByDefault" : true + } + }, { "name" : "description", "type" : "string", - "doc" : "Documentation of the MLModel", + "doc" : "Documentation of the MLModel\r", "optional" : true, "Searchable" : { "fieldType" : "TEXT", @@ -3564,17 +3575,28 @@ }, { "name" : "date", "type" : "com.linkedin.common.Time", - "doc" : "Date when the MLModel was developed", + "doc" : "Date when the MLModel was developed\r", + "optional" : true, + "deprecated" : true + }, { + "name" : "created", + "type" : "com.linkedin.common.TimeStamp", + "doc" : "Audit stamp containing who created this and when\r", + "optional" : true + }, { + "name" : "lastModified", + "type" : "com.linkedin.common.TimeStamp", + "doc" : "Date when the MLModel was last modified\r", "optional" : true }, { "name" : "version", "type" : "com.linkedin.common.VersionTag", - "doc" : "Version of the MLModel", + "doc" : "Version of the MLModel\r", "optional" : true }, { "name" : "type", "type" : "string", - "doc" : "Type of Algorithm or MLModel such as whether it is a Naive Bayes classifier, Convolutional Neural Network, etc", + "doc" : "Type of Algorithm or MLModel such as whether it is a Naive Bayes classifier, Convolutional Neural Network, etc\r", "optional" : true, "Searchable" : { "fieldType" : "TEXT_PARTIAL" @@ -3590,7 +3612,7 @@ "ref" : [ "string", "int", "float", "double", "boolean" ] } }, - "doc" : "Hyper Parameters of the MLModel\n\nNOTE: these are deprecated in favor of hyperParams", + "doc" : "Hyper Parameters of the MLModel\r\n\r\nNOTE: these are deprecated in favor of hyperParams\r", "optional" : true }, { "name" : "hyperParams", @@ -3625,7 +3647,7 @@ } } }, - "doc" : "Hyperparameters of the MLModel", + "doc" : "Hyperparameters of the MLModel\r", "optional" : true }, { "name" : "trainingMetrics", @@ -3660,7 +3682,7 @@ } } }, - "doc" : "Metrics of the MLModel used in training", + "doc" : "Metrics of the MLModel used in training\r", "optional" : true }, { "name" : "onlineMetrics", @@ -3668,7 +3690,7 @@ "type" : "array", "items" : "MLMetric" }, - "doc" : "Metrics of the MLModel used in production", + "doc" : "Metrics of the MLModel used in production\r", "optional" : true }, { "name" : "mlFeatures", @@ -3676,7 +3698,7 @@ "type" : "array", "items" : "com.linkedin.common.MLFeatureUrn" }, - "doc" : "List of features used for MLModel training", + "doc" : "List of features used for MLModel training\r", "optional" : true, "Relationship" : { "/*" : { @@ -3691,7 +3713,7 @@ "type" : "array", "items" : "string" }, - "doc" : "Tags for the MLModel", + "doc" : "Tags for the MLModel\r", "default" : [ ] }, { "name" : "deployments", @@ -3699,7 +3721,7 @@ "type" : "array", "items" : "com.linkedin.common.Urn" }, - "doc" : "Deployments for the MLModel", + "doc" : "Deployments for the MLModel\r", "optional" : true, "Relationship" : { "/*" : { @@ -3713,11 +3735,11 @@ "type" : "array", "items" : "com.linkedin.common.Urn" }, - "doc" : "List of jobs (if any) used to train the model", + "doc" : "List of jobs (if any) used to train the model. Visible in Lineage. Note that ML Models can also be specified as the output of a specific Data Process Instances (runs) via the DataProcessInstanceOutputs aspect.\r", "optional" : true, "Relationship" : { "/*" : { - "entityTypes" : [ "dataJob" ], + "entityTypes" : [ "dataJob", "dataProcessInstance" ], "isLineage" : true, "name" : "TrainedBy" } @@ -3728,7 +3750,7 @@ "type" : "array", "items" : "com.linkedin.common.Urn" }, - "doc" : "List of jobs (if any) that use the model", + "doc" : "List of jobs (if any) that use the model\r", "optional" : true, "Relationship" : { "/*" : { @@ -3744,7 +3766,7 @@ "type" : "array", "items" : "com.linkedin.common.Urn" }, - "doc" : "Groups the model belongs to", + "doc" : "Groups the model belongs to\r", "optional" : true, "Relationship" : { "/*" : { diff --git a/metadata-service/restli-api/src/main/snapshot/com.linkedin.operations.operations.snapshot.json b/metadata-service/restli-api/src/main/snapshot/com.linkedin.operations.operations.snapshot.json index 8c7595c5e505d..e4cc5c42303ee 100644 --- a/metadata-service/restli-api/src/main/snapshot/com.linkedin.operations.operations.snapshot.json +++ b/metadata-service/restli-api/src/main/snapshot/com.linkedin.operations.operations.snapshot.json @@ -3544,12 +3544,23 @@ "type" : "record", "name" : "MLModelProperties", "namespace" : "com.linkedin.ml.metadata", - "doc" : "Properties associated with a ML Model", + "doc" : "Properties associated with a ML Model\r", "include" : [ "com.linkedin.common.CustomProperties", "com.linkedin.common.ExternalReference" ], "fields" : [ { + "name" : "name", + "type" : "string", + "doc" : "Display name of the MLModel\r", + "optional" : true, + "Searchable" : { + "boostScore" : 10.0, + "enableAutocomplete" : true, + "fieldType" : "WORD_GRAM", + "queryByDefault" : true + } + }, { "name" : "description", "type" : "string", - "doc" : "Documentation of the MLModel", + "doc" : "Documentation of the MLModel\r", "optional" : true, "Searchable" : { "fieldType" : "TEXT", @@ -3558,17 +3569,28 @@ }, { "name" : "date", "type" : "com.linkedin.common.Time", - "doc" : "Date when the MLModel was developed", + "doc" : "Date when the MLModel was developed\r", + "optional" : true, + "deprecated" : true + }, { + "name" : "created", + "type" : "com.linkedin.common.TimeStamp", + "doc" : "Audit stamp containing who created this and when\r", + "optional" : true + }, { + "name" : "lastModified", + "type" : "com.linkedin.common.TimeStamp", + "doc" : "Date when the MLModel was last modified\r", "optional" : true }, { "name" : "version", "type" : "com.linkedin.common.VersionTag", - "doc" : "Version of the MLModel", + "doc" : "Version of the MLModel\r", "optional" : true }, { "name" : "type", "type" : "string", - "doc" : "Type of Algorithm or MLModel such as whether it is a Naive Bayes classifier, Convolutional Neural Network, etc", + "doc" : "Type of Algorithm or MLModel such as whether it is a Naive Bayes classifier, Convolutional Neural Network, etc\r", "optional" : true, "Searchable" : { "fieldType" : "TEXT_PARTIAL" @@ -3584,7 +3606,7 @@ "ref" : [ "string", "int", "float", "double", "boolean" ] } }, - "doc" : "Hyper Parameters of the MLModel\n\nNOTE: these are deprecated in favor of hyperParams", + "doc" : "Hyper Parameters of the MLModel\r\n\r\nNOTE: these are deprecated in favor of hyperParams\r", "optional" : true }, { "name" : "hyperParams", @@ -3619,7 +3641,7 @@ } } }, - "doc" : "Hyperparameters of the MLModel", + "doc" : "Hyperparameters of the MLModel\r", "optional" : true }, { "name" : "trainingMetrics", @@ -3654,7 +3676,7 @@ } } }, - "doc" : "Metrics of the MLModel used in training", + "doc" : "Metrics of the MLModel used in training\r", "optional" : true }, { "name" : "onlineMetrics", @@ -3662,7 +3684,7 @@ "type" : "array", "items" : "MLMetric" }, - "doc" : "Metrics of the MLModel used in production", + "doc" : "Metrics of the MLModel used in production\r", "optional" : true }, { "name" : "mlFeatures", @@ -3670,7 +3692,7 @@ "type" : "array", "items" : "com.linkedin.common.MLFeatureUrn" }, - "doc" : "List of features used for MLModel training", + "doc" : "List of features used for MLModel training\r", "optional" : true, "Relationship" : { "/*" : { @@ -3685,7 +3707,7 @@ "type" : "array", "items" : "string" }, - "doc" : "Tags for the MLModel", + "doc" : "Tags for the MLModel\r", "default" : [ ] }, { "name" : "deployments", @@ -3693,7 +3715,7 @@ "type" : "array", "items" : "com.linkedin.common.Urn" }, - "doc" : "Deployments for the MLModel", + "doc" : "Deployments for the MLModel\r", "optional" : true, "Relationship" : { "/*" : { @@ -3707,11 +3729,11 @@ "type" : "array", "items" : "com.linkedin.common.Urn" }, - "doc" : "List of jobs (if any) used to train the model", + "doc" : "List of jobs (if any) used to train the model. Visible in Lineage. Note that ML Models can also be specified as the output of a specific Data Process Instances (runs) via the DataProcessInstanceOutputs aspect.\r", "optional" : true, "Relationship" : { "/*" : { - "entityTypes" : [ "dataJob" ], + "entityTypes" : [ "dataJob", "dataProcessInstance" ], "isLineage" : true, "name" : "TrainedBy" } @@ -3722,7 +3744,7 @@ "type" : "array", "items" : "com.linkedin.common.Urn" }, - "doc" : "List of jobs (if any) that use the model", + "doc" : "List of jobs (if any) that use the model\r", "optional" : true, "Relationship" : { "/*" : { @@ -3738,7 +3760,7 @@ "type" : "array", "items" : "com.linkedin.common.Urn" }, - "doc" : "Groups the model belongs to", + "doc" : "Groups the model belongs to\r", "optional" : true, "Relationship" : { "/*" : { diff --git a/metadata-service/restli-api/src/main/snapshot/com.linkedin.platform.platform.snapshot.json b/metadata-service/restli-api/src/main/snapshot/com.linkedin.platform.platform.snapshot.json index 75e5c9a559076..e375ac698ab51 100644 --- a/metadata-service/restli-api/src/main/snapshot/com.linkedin.platform.platform.snapshot.json +++ b/metadata-service/restli-api/src/main/snapshot/com.linkedin.platform.platform.snapshot.json @@ -3978,12 +3978,23 @@ "type" : "record", "name" : "MLModelProperties", "namespace" : "com.linkedin.ml.metadata", - "doc" : "Properties associated with a ML Model", + "doc" : "Properties associated with a ML Model\r", "include" : [ "com.linkedin.common.CustomProperties", "com.linkedin.common.ExternalReference" ], "fields" : [ { + "name" : "name", + "type" : "string", + "doc" : "Display name of the MLModel\r", + "optional" : true, + "Searchable" : { + "boostScore" : 10.0, + "enableAutocomplete" : true, + "fieldType" : "WORD_GRAM", + "queryByDefault" : true + } + }, { "name" : "description", "type" : "string", - "doc" : "Documentation of the MLModel", + "doc" : "Documentation of the MLModel\r", "optional" : true, "Searchable" : { "fieldType" : "TEXT", @@ -3992,17 +4003,28 @@ }, { "name" : "date", "type" : "com.linkedin.common.Time", - "doc" : "Date when the MLModel was developed", + "doc" : "Date when the MLModel was developed\r", + "optional" : true, + "deprecated" : true + }, { + "name" : "created", + "type" : "com.linkedin.common.TimeStamp", + "doc" : "Audit stamp containing who created this and when\r", + "optional" : true + }, { + "name" : "lastModified", + "type" : "com.linkedin.common.TimeStamp", + "doc" : "Date when the MLModel was last modified\r", "optional" : true }, { "name" : "version", "type" : "com.linkedin.common.VersionTag", - "doc" : "Version of the MLModel", + "doc" : "Version of the MLModel\r", "optional" : true }, { "name" : "type", "type" : "string", - "doc" : "Type of Algorithm or MLModel such as whether it is a Naive Bayes classifier, Convolutional Neural Network, etc", + "doc" : "Type of Algorithm or MLModel such as whether it is a Naive Bayes classifier, Convolutional Neural Network, etc\r", "optional" : true, "Searchable" : { "fieldType" : "TEXT_PARTIAL" @@ -4018,7 +4040,7 @@ "ref" : [ "string", "int", "float", "double", "boolean" ] } }, - "doc" : "Hyper Parameters of the MLModel\n\nNOTE: these are deprecated in favor of hyperParams", + "doc" : "Hyper Parameters of the MLModel\r\n\r\nNOTE: these are deprecated in favor of hyperParams\r", "optional" : true }, { "name" : "hyperParams", @@ -4053,7 +4075,7 @@ } } }, - "doc" : "Hyperparameters of the MLModel", + "doc" : "Hyperparameters of the MLModel\r", "optional" : true }, { "name" : "trainingMetrics", @@ -4088,7 +4110,7 @@ } } }, - "doc" : "Metrics of the MLModel used in training", + "doc" : "Metrics of the MLModel used in training\r", "optional" : true }, { "name" : "onlineMetrics", @@ -4096,7 +4118,7 @@ "type" : "array", "items" : "MLMetric" }, - "doc" : "Metrics of the MLModel used in production", + "doc" : "Metrics of the MLModel used in production\r", "optional" : true }, { "name" : "mlFeatures", @@ -4104,7 +4126,7 @@ "type" : "array", "items" : "com.linkedin.common.MLFeatureUrn" }, - "doc" : "List of features used for MLModel training", + "doc" : "List of features used for MLModel training\r", "optional" : true, "Relationship" : { "/*" : { @@ -4119,7 +4141,7 @@ "type" : "array", "items" : "string" }, - "doc" : "Tags for the MLModel", + "doc" : "Tags for the MLModel\r", "default" : [ ] }, { "name" : "deployments", @@ -4127,7 +4149,7 @@ "type" : "array", "items" : "com.linkedin.common.Urn" }, - "doc" : "Deployments for the MLModel", + "doc" : "Deployments for the MLModel\r", "optional" : true, "Relationship" : { "/*" : { @@ -4141,11 +4163,11 @@ "type" : "array", "items" : "com.linkedin.common.Urn" }, - "doc" : "List of jobs (if any) used to train the model", + "doc" : "List of jobs (if any) used to train the model. Visible in Lineage. Note that ML Models can also be specified as the output of a specific Data Process Instances (runs) via the DataProcessInstanceOutputs aspect.\r", "optional" : true, "Relationship" : { "/*" : { - "entityTypes" : [ "dataJob" ], + "entityTypes" : [ "dataJob", "dataProcessInstance" ], "isLineage" : true, "name" : "TrainedBy" } @@ -4156,7 +4178,7 @@ "type" : "array", "items" : "com.linkedin.common.Urn" }, - "doc" : "List of jobs (if any) that use the model", + "doc" : "List of jobs (if any) that use the model\r", "optional" : true, "Relationship" : { "/*" : { @@ -4172,7 +4194,7 @@ "type" : "array", "items" : "com.linkedin.common.Urn" }, - "doc" : "Groups the model belongs to", + "doc" : "Groups the model belongs to\r", "optional" : true, "Relationship" : { "/*" : { @@ -4975,12 +4997,23 @@ "type" : "record", "name" : "MLModelGroupProperties", "namespace" : "com.linkedin.ml.metadata", - "doc" : "Properties associated with an ML Model Group", + "doc" : "Properties associated with an ML Model Group\r", "include" : [ "com.linkedin.common.CustomProperties" ], "fields" : [ { + "name" : "name", + "type" : "string", + "doc" : "Display name of the MLModelGroup\r", + "optional" : true, + "Searchable" : { + "boostScore" : 10.0, + "enableAutocomplete" : true, + "fieldType" : "WORD_GRAM", + "queryByDefault" : true + } + }, { "name" : "description", "type" : "string", - "doc" : "Documentation of the MLModelGroup", + "doc" : "Documentation of the MLModelGroup\r", "optional" : true, "Searchable" : { "fieldType" : "TEXT", @@ -4989,12 +5022,38 @@ }, { "name" : "createdAt", "type" : "com.linkedin.common.Time", - "doc" : "Date when the MLModelGroup was developed", + "doc" : "Date when the MLModelGroup was developed\r", + "optional" : true, + "deprecated" : true + }, { + "name" : "created", + "type" : "com.linkedin.common.TimeStamp", + "doc" : "Time and Actor who created the MLModelGroup\r", + "optional" : true + }, { + "name" : "lastModified", + "type" : "com.linkedin.common.TimeStamp", + "doc" : "Date when the MLModelGroup was last modified\r", "optional" : true + }, { + "name" : "trainingJobs", + "type" : { + "type" : "array", + "items" : "com.linkedin.common.Urn" + }, + "doc" : "List of jobs (if any) used to train the model group. Visible in Lineage.\r", + "optional" : true, + "Relationship" : { + "/*" : { + "entityTypes" : [ "dataJob" ], + "isLineage" : true, + "name" : "TrainedBy" + } + } }, { "name" : "version", "type" : "com.linkedin.common.VersionTag", - "doc" : "Version of the MLModelGroup", + "doc" : "Version of the MLModelGroup\r", "optional" : true } ], "Aspect" : { From 09a9b6eef912d8f855a2cc6fdc03032f5ec7a652 Mon Sep 17 00:00:00 2001 From: Andrew Sikowitz Date: Mon, 23 Dec 2024 22:39:57 -0800 Subject: [PATCH 24/39] feat(ingest/looker): Do not emit usage for non-ingested dashboards and charts (#11647) --- .../ingestion/source/looker/looker_common.py | 9 + .../ingestion/source/looker/looker_source.py | 22 +- .../ingestion/source/looker/looker_usage.py | 40 +- .../looker/looker_mces_usage_history.json | 364 +++++++++++++++++- .../tests/integration/looker/test_looker.py | 87 ++++- 5 files changed, 482 insertions(+), 40 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py index a66962f962255..1183916e9b3fe 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py @@ -1408,6 +1408,15 @@ class LookerDashboardSourceReport(StaleEntityRemovalSourceReport): dashboards_with_activity: LossySet[str] = dataclasses_field( default_factory=LossySet ) + + # Entities that don't seem to exist, so we don't emit usage aspects for them despite having usage data + dashboards_skipped_for_usage: LossySet[str] = dataclasses_field( + default_factory=LossySet + ) + charts_skipped_for_usage: LossySet[str] = dataclasses_field( + default_factory=LossySet + ) + stage_latency: List[StageLatency] = dataclasses_field(default_factory=list) _looker_explore_registry: Optional[LookerExploreRegistry] = None total_explores: int = 0 diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py index 815c5dfb1c014..8487d5113bc1d 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py @@ -68,6 +68,7 @@ ViewField, ViewFieldType, gen_model_key, + get_urn_looker_element_id, ) from datahub.ingestion.source.looker.looker_config import LookerDashboardSourceConfig from datahub.ingestion.source.looker.looker_lib_wrapper import LookerAPI @@ -165,6 +166,9 @@ def __init__(self, config: LookerDashboardSourceConfig, ctx: PipelineContext): # Required, as we do not ingest all folders but only those that have dashboards/looks self.processed_folders: List[str] = [] + # Keep track of ingested chart urns, to omit usage for non-ingested entities + self.chart_urns: Set[str] = set() + @staticmethod def test_connection(config_dict: dict) -> TestConnectionReport: test_report = TestConnectionReport() @@ -642,6 +646,7 @@ def _make_chart_metadata_events( chart_urn = self._make_chart_urn( element_id=dashboard_element.get_urn_element_id() ) + self.chart_urns.add(chart_urn) chart_snapshot = ChartSnapshot( urn=chart_urn, aspects=[Status(removed=False)], @@ -1380,7 +1385,9 @@ def _get_folder_and_ancestors_workunits( yield from self._emit_folder_as_container(folder) def extract_usage_stat( - self, looker_dashboards: List[looker_usage.LookerDashboardForUsage] + self, + looker_dashboards: List[looker_usage.LookerDashboardForUsage], + ingested_chart_urns: Set[str], ) -> List[MetadataChangeProposalWrapper]: looks: List[looker_usage.LookerChartForUsage] = [] # filter out look from all dashboard @@ -1391,6 +1398,15 @@ def extract_usage_stat( # dedup looks looks = list({str(look.id): look for look in looks}.values()) + filtered_looks = [] + for look in looks: + if not look.id: + continue + chart_urn = self._make_chart_urn(get_urn_looker_element_id(look.id)) + if chart_urn in ingested_chart_urns: + filtered_looks.append(look) + else: + self.reporter.charts_skipped_for_usage.add(look.id) # Keep stat generators to generate entity stat aspect later stat_generator_config: looker_usage.StatGeneratorConfig = ( @@ -1414,7 +1430,7 @@ def extract_usage_stat( stat_generator_config, self.reporter, self._make_chart_urn, - looks, + filtered_looks, ) mcps: List[MetadataChangeProposalWrapper] = [] @@ -1669,7 +1685,7 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: if self.source_config.extract_usage_history: self.reporter.report_stage_start("usage_extraction") usage_mcps: List[MetadataChangeProposalWrapper] = self.extract_usage_stat( - looker_dashboards_for_usage + looker_dashboards_for_usage, self.chart_urns ) for usage_mcp in usage_mcps: yield usage_mcp.as_workunit() diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_usage.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_usage.py index ef7d64e4f42d4..098d7d73a3da8 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_usage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_usage.py @@ -42,6 +42,7 @@ TimeWindowSizeClass, _Aspect as AspectAbstract, ) +from datahub.utilities.lossy_collections import LossySet logger = logging.getLogger(__name__) @@ -170,7 +171,7 @@ def __init__( self.config = config self.looker_models = looker_models # Later it will help to find out for what are the looker entities from query result - self.id_vs_model: Dict[str, ModelForUsage] = { + self.id_to_model: Dict[str, ModelForUsage] = { self.get_id(looker_object): looker_object for looker_object in looker_models } self.post_filter = len(self.looker_models) > 100 @@ -225,6 +226,10 @@ def get_id(self, looker_object: ModelForUsage) -> str: def get_id_from_row(self, row: dict) -> str: pass + @abstractmethod + def report_skip_set(self) -> LossySet[str]: + pass + def create_mcp( self, model: ModelForUsage, aspect: Aspect ) -> MetadataChangeProposalWrapper: @@ -258,20 +263,11 @@ def _process_entity_timeseries_rows( return entity_stat_aspect - def _process_absolute_aspect(self) -> List[Tuple[ModelForUsage, AspectAbstract]]: - aspects: List[Tuple[ModelForUsage, AspectAbstract]] = [] - for looker_object in self.looker_models: - aspects.append( - (looker_object, self.to_entity_absolute_stat_aspect(looker_object)) - ) - - return aspects - def _fill_user_stat_aspect( self, entity_usage_stat: Dict[Tuple[str, str], Aspect], user_wise_rows: List[Dict], - ) -> Iterable[Tuple[ModelForUsage, Aspect]]: + ) -> Iterable[Tuple[str, Aspect]]: logger.debug("Entering fill user stat aspect") # We first resolve all the users using a threadpool to warm up the cache @@ -300,7 +296,7 @@ def _fill_user_stat_aspect( for row in user_wise_rows: # Confirm looker object was given for stat generation - looker_object = self.id_vs_model.get(self.get_id_from_row(row)) + looker_object = self.id_to_model.get(self.get_id_from_row(row)) if looker_object is None: logger.warning( "Looker object with id({}) was not register with stat generator".format( @@ -338,7 +334,7 @@ def _fill_user_stat_aspect( logger.debug("Starting to yield answers for user-wise counts") for (id, _), aspect in entity_usage_stat.items(): - yield self.id_vs_model[id], aspect + yield id, aspect def _execute_query(self, query: LookerQuery, query_name: str) -> List[Dict]: rows = [] @@ -357,7 +353,7 @@ def _execute_query(self, query: LookerQuery, query_name: str) -> List[Dict]: ) if self.post_filter: logger.debug("post filtering") - rows = [r for r in rows if self.get_id_from_row(r) in self.id_vs_model] + rows = [r for r in rows if self.get_id_from_row(r) in self.id_to_model] logger.debug("Filtered down to %d rows", len(rows)) except Exception as e: logger.warning(f"Failed to execute {query_name} query: {e}") @@ -378,7 +374,8 @@ def generate_usage_stat_mcps(self) -> Iterable[MetadataChangeProposalWrapper]: return # yield absolute stat for looker entities - for looker_object, aspect in self._process_absolute_aspect(): # type: ignore + for looker_object in self.looker_models: + aspect = self.to_entity_absolute_stat_aspect(looker_object) yield self.create_mcp(looker_object, aspect) # Execute query and process the raw json which contains stat information @@ -399,10 +396,13 @@ def generate_usage_stat_mcps(self) -> Iterable[MetadataChangeProposalWrapper]: ) user_wise_rows = self._execute_query(user_wise_query_with_filters, "user_query") # yield absolute stat for entity - for looker_object, aspect in self._fill_user_stat_aspect( + for object_id, aspect in self._fill_user_stat_aspect( entity_usage_stat, user_wise_rows ): - yield self.create_mcp(looker_object, aspect) + if object_id in self.id_to_model: + yield self.create_mcp(self.id_to_model[object_id], aspect) + else: + self.report_skip_set().add(object_id) class DashboardStatGenerator(BaseStatGenerator): @@ -425,6 +425,9 @@ def __init__( def get_stats_generator_name(self) -> str: return "DashboardStats" + def report_skip_set(self) -> LossySet[str]: + return self.report.dashboards_skipped_for_usage + def get_filter(self) -> Dict[ViewField, str]: return { HistoryViewField.HISTORY_DASHBOARD_ID: ",".join( @@ -541,6 +544,9 @@ def __init__( def get_stats_generator_name(self) -> str: return "ChartStats" + def report_skip_set(self) -> LossySet[str]: + return self.report.charts_skipped_for_usage + def get_filter(self) -> Dict[ViewField, str]: return { LookViewField.LOOK_ID: ",".join( diff --git a/metadata-ingestion/tests/integration/looker/looker_mces_usage_history.json b/metadata-ingestion/tests/integration/looker/looker_mces_usage_history.json index 594983c8fb0f2..ed0c5401c9029 100644 --- a/metadata-ingestion/tests/integration/looker/looker_mces_usage_history.json +++ b/metadata-ingestion/tests/integration/looker/looker_mces_usage_history.json @@ -1,4 +1,66 @@ [ +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.ChartSnapshot": { + "urn": "urn:li:chart:(looker,dashboard_elements.3)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.chart.ChartInfo": { + "customProperties": { + "upstream_fields": "" + }, + "title": "", + "description": "", + "lastModified": { + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + }, + "chartUrl": "https://looker.company.com/x/", + "inputs": [ + { + "string": "urn:li:dataset:(urn:li:dataPlatform:looker,look_data.explore.look_view,PROD)" + } + ] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "chart", + "entityUrn": "urn:li:chart:(looker,dashboard_elements.3)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Look" + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DashboardSnapshot": { @@ -9,7 +71,9 @@ "customProperties": {}, "title": "foo", "description": "lorem ipsum", - "charts": [], + "charts": [ + "urn:li:chart:(looker,dashboard_elements.3)" + ], "datasets": [], "dashboards": [], "lastModified": { @@ -89,6 +153,22 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "chart", + "entityUrn": "urn:li:chart:(looker,dashboard_elements.3)", + "changeType": "UPSERT", + "aspectName": "inputFields", + "aspect": { + "json": { + "fields": [] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dashboard", "entityUrn": "urn:li:dashboard:(looker,dashboards.1)", @@ -215,6 +295,98 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "container", + "entityUrn": "urn:li:container:a2a7aa63752695f9a1705faed9d03ffb", + "changeType": "UPSERT", + "aspectName": "containerProperties", + "aspect": { + "json": { + "customProperties": { + "platform": "looker", + "env": "PROD", + "model_name": "look_data" + }, + "name": "look_data", + "env": "PROD" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:a2a7aa63752695f9a1705faed9d03ffb", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:a2a7aa63752695f9a1705faed9d03ffb", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:looker" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:a2a7aa63752695f9a1705faed9d03ffb", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "LookML Model" + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:a2a7aa63752695f9a1705faed9d03ffb", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "Explore" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { @@ -389,6 +561,180 @@ "lastRunId": "no-run-id-provided" } }, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,look_data.explore.look_view,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.BrowsePaths": { + "paths": [ + "/Explore/look_data" + ] + } + }, + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "project": "lkml_samples", + "model": "look_data", + "looker.explore.label": "My Explore View", + "looker.explore.name": "look_view", + "looker.explore.file": "test_source_file.lkml" + }, + "externalUrl": "https://looker.company.com/explore/look_data/look_view", + "name": "My Explore View", + "description": "lorem ipsum", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.dataset.UpstreamLineage": { + "upstreams": [ + { + "auditStamp": { + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.underlying_view,PROD)", + "type": "VIEW" + } + ] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "look_view", + "platform": "urn:li:dataPlatform:looker", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.OtherSchema": { + "rawSchema": "" + } + }, + "fields": [ + { + "fieldPath": "dim1", + "nullable": false, + "description": "dimension one description", + "label": "Dimensions One Label", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "globalTags": { + "tags": [ + { + "tag": "urn:li:tag:Dimension" + } + ] + }, + "isPartOfKey": false + } + ], + "primaryKeys": [] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,look_data.explore.look_view,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Explore" + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,look_data.explore.look_view,PROD)", + "changeType": "UPSERT", + "aspectName": "embed", + "aspect": { + "json": { + "renderUrl": "https://looker.company.com/embed/explore/look_data/look_view" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,look_data.explore.look_view,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:a2a7aa63752695f9a1705faed9d03ffb" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,look_data.explore.look_view,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "Explore" + }, + { + "id": "urn:li:container:a2a7aa63752695f9a1705faed9d03ffb", + "urn": "urn:li:container:a2a7aa63752695f9a1705faed9d03ffb" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { @@ -747,22 +1093,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "chart", - "entityUrn": "urn:li:chart:(looker,dashboard_elements.3)", - "changeType": "UPSERT", - "aspectName": "status", - "aspect": { - "json": { - "removed": false - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "looker-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "tag", "entityUrn": "urn:li:tag:Dimension", diff --git a/metadata-ingestion/tests/integration/looker/test_looker.py b/metadata-ingestion/tests/integration/looker/test_looker.py index a39de8384efb2..c96bcc729a95d 100644 --- a/metadata-ingestion/tests/integration/looker/test_looker.py +++ b/metadata-ingestion/tests/integration/looker/test_looker.py @@ -31,7 +31,10 @@ from datahub.ingestion.api.source import SourceReport from datahub.ingestion.run.pipeline import Pipeline, PipelineInitError from datahub.ingestion.source.looker import looker_common, looker_usage -from datahub.ingestion.source.looker.looker_common import LookerExplore +from datahub.ingestion.source.looker.looker_common import ( + LookerDashboardSourceReport, + LookerExplore, +) from datahub.ingestion.source.looker.looker_config import LookerCommonConfig from datahub.ingestion.source.looker.looker_lib_wrapper import ( LookerAPI, @@ -414,7 +417,9 @@ def setup_mock_dashboard_multiple_charts(mocked_client): ) -def setup_mock_dashboard_with_usage(mocked_client): +def setup_mock_dashboard_with_usage( + mocked_client: mock.MagicMock, skip_look: bool = False +) -> None: mocked_client.all_dashboards.return_value = [Dashboard(id="1")] mocked_client.dashboard.return_value = Dashboard( id="1", @@ -437,7 +442,13 @@ def setup_mock_dashboard_with_usage(mocked_client): ), ), DashboardElement( - id="3", type="", look=LookWithQuery(id="3", view_count=30) + id="3", + type="" if skip_look else "vis", # Looks only ingested if type == `vis` + look=LookWithQuery( + id="3", + view_count=30, + query=Query(model="look_data", view="look_view"), + ), ), ], ) @@ -611,6 +622,12 @@ def side_effect_query_inline( HistoryViewField.HISTORY_DASHBOARD_USER: 1, HistoryViewField.HISTORY_DASHBOARD_RUN_COUNT: 5, }, + { + HistoryViewField.HISTORY_DASHBOARD_ID: "5", + HistoryViewField.HISTORY_CREATED_DATE: "2022-07-07", + HistoryViewField.HISTORY_DASHBOARD_USER: 1, + HistoryViewField.HISTORY_DASHBOARD_RUN_COUNT: 5, + }, ] ), looker_usage.QueryId.DASHBOARD_PER_USER_PER_DAY_USAGE_STAT: json.dumps( @@ -790,6 +807,70 @@ def test_looker_ingest_usage_history(pytestconfig, tmp_path, mock_time): ) +@freeze_time(FROZEN_TIME) +def test_looker_filter_usage_history(pytestconfig, tmp_path, mock_time): + mocked_client = mock.MagicMock() + with mock.patch("looker_sdk.init40") as mock_sdk: + mock_sdk.return_value = mocked_client + setup_mock_dashboard_with_usage(mocked_client, skip_look=True) + mocked_client.run_inline_query.side_effect = side_effect_query_inline + setup_mock_explore(mocked_client) + setup_mock_user(mocked_client) + + temp_output_file = f"{tmp_path}/looker_mces.json" + pipeline = Pipeline.create( + { + "run_id": "looker-test", + "source": { + "type": "looker", + "config": { + "base_url": "https://looker.company.com", + "client_id": "foo", + "client_secret": "bar", + "extract_usage_history": True, + "max_threads": 1, + }, + }, + "sink": { + "type": "file", + "config": { + "filename": temp_output_file, + }, + }, + } + ) + pipeline.run() + pipeline.pretty_print_summary() + pipeline.raise_from_status() + + # There should be 4 dashboardUsageStatistics aspects (one absolute and 3 timeseries) + dashboard_usage_aspect_count = 0 + # There should be 0 chartUsageStatistics -- filtered by set of ingested charts + chart_usage_aspect_count = 0 + with open(temp_output_file) as f: + temp_output_dict = json.load(f) + for element in temp_output_dict: + if ( + element.get("entityType") == "dashboard" + and element.get("aspectName") == "dashboardUsageStatistics" + ): + dashboard_usage_aspect_count = dashboard_usage_aspect_count + 1 + if ( + element.get("entityType") == "chart" + and element.get("aspectName") == "chartUsageStatistics" + ): + chart_usage_aspect_count = chart_usage_aspect_count + 1 + + assert dashboard_usage_aspect_count == 4 + assert chart_usage_aspect_count == 0 + + source_report = cast(LookerDashboardSourceReport, pipeline.source.get_report()) + # From timeseries query + assert str(source_report.dashboards_skipped_for_usage) == str(["5"]) + # From dashboard element + assert str(source_report.charts_skipped_for_usage) == str(["3"]) + + @freeze_time(FROZEN_TIME) def test_looker_ingest_stateful(pytestconfig, tmp_path, mock_time, mock_datahub_graph): output_file_name: str = "looker_mces.json" From 87e7b58ac699005ca5757e6ef47fb853d89a6583 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergio=20G=C3=B3mez=20Villamor?= Date: Tue, 24 Dec 2024 10:46:19 +0100 Subject: [PATCH 25/39] fix(tableau): retry on InternalServerError 504 (#12213) --- .../ingestion/source/tableau/tableau.py | 23 ++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py index 984cf9357199d..2b7aac2bea1d0 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py +++ b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py @@ -35,7 +35,10 @@ SiteItem, TableauAuth, ) -from tableauserverclient.server.endpoint.exceptions import NonXMLResponseError +from tableauserverclient.server.endpoint.exceptions import ( + InternalServerError, + NonXMLResponseError, +) from urllib3 import Retry import datahub.emitter.mce_builder as builder @@ -1196,6 +1199,24 @@ def get_connection_object_page( retry_on_auth_error=False, retries_remaining=retries_remaining - 1, ) + + except InternalServerError as ise: + # In some cases Tableau Server returns 504 error, which is a timeout error, so it worths to retry. + if ise.code == 504: + if retries_remaining <= 0: + raise ise + return self.get_connection_object_page( + query=query, + connection_type=connection_type, + query_filter=query_filter, + fetch_size=fetch_size, + current_cursor=current_cursor, + retry_on_auth_error=False, + retries_remaining=retries_remaining - 1, + ) + else: + raise ise + except OSError: # In tableauseverclient 0.26 (which was yanked and released in 0.28 on 2023-10-04), # the request logic was changed to use threads. From 4d990b06bd0df4f51443893e2efb39e09d9818b6 Mon Sep 17 00:00:00 2001 From: Mayuri Nehate <33225191+mayurinehate@users.noreply.github.com> Date: Tue, 24 Dec 2024 18:14:51 +0530 Subject: [PATCH 26/39] fix(ingest/snowflake): always ingest view and external table ddl lineage (#12191) --- docs/how/updating-datahub.md | 2 +- .../source/snowflake/snowflake_config.py | 28 ++----------------- .../source/snowflake/snowflake_lineage_v2.py | 13 ++------- .../source/snowflake/snowflake_query.py | 9 ------ .../source/snowflake/snowflake_schema_gen.py | 6 +--- .../source/snowflake/snowflake_shares.py | 2 +- .../source/snowflake/snowflake_v2.py | 20 +++++++++---- .../source_report/ingestion_stage.py | 1 + .../tests/integration/snowflake/common.py | 2 -- .../integration/snowflake/test_snowflake.py | 2 -- .../test_snowflake_classification.py | 1 - .../snowflake/test_snowflake_failures.py | 2 -- .../snowflake/test_snowflake_tag.py | 2 -- .../performance/snowflake/test_snowflake.py | 1 - .../unit/snowflake/test_snowflake_source.py | 23 +++++++-------- 15 files changed, 36 insertions(+), 78 deletions(-) diff --git a/docs/how/updating-datahub.md b/docs/how/updating-datahub.md index 5bc0e66fa2ff1..a742ebe0cd896 100644 --- a/docs/how/updating-datahub.md +++ b/docs/how/updating-datahub.md @@ -17,7 +17,7 @@ This file documents any backwards-incompatible changes in DataHub and assists people when migrating to a new version. ## Next - +- #12191 - Configs `include_view_lineage` and `include_view_column_lineage` are removed from snowflake ingestion source. View and External Table DDL lineage will always be ingested when definitions are available. - #11560 - The PowerBI ingestion source configuration option include_workspace_name_in_dataset_urn determines whether the workspace name is included in the PowerBI dataset's URN.
PowerBI allows to have identical name of semantic model and their tables across the workspace, It will overwrite the semantic model in-case of multi-workspace ingestion.
Entity urn with `include_workspace_name_in_dataset_urn: false` diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py index 1d1cc3c2af4f0..2b2dcf860cdb0 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py @@ -163,26 +163,13 @@ class SnowflakeConfig( default=True, description="If enabled, populates the snowflake table-to-table and s3-to-snowflake table lineage. Requires appropriate grants given to the role and Snowflake Enterprise Edition or above.", ) - include_view_lineage: bool = pydantic.Field( - default=True, - description="If enabled, populates the snowflake view->table and table->view lineages. Requires appropriate grants given to the role, and include_table_lineage to be True. view->table lineage requires Snowflake Enterprise Edition or above.", - ) + + _include_view_lineage = pydantic_removed_field("include_view_lineage") + _include_view_column_lineage = pydantic_removed_field("include_view_column_lineage") ignore_start_time_lineage: bool = False upstream_lineage_in_report: bool = False - @pydantic.root_validator(skip_on_failure=True) - def validate_include_view_lineage(cls, values): - if ( - "include_table_lineage" in values - and not values.get("include_table_lineage") - and values.get("include_view_lineage") - ): - raise ValueError( - "include_table_lineage must be True for include_view_lineage to be set." - ) - return values - class SnowflakeV2Config( SnowflakeConfig, @@ -222,11 +209,6 @@ class SnowflakeV2Config( description="Populates table->table and view->table column lineage. Requires appropriate grants given to the role and the Snowflake Enterprise Edition or above.", ) - include_view_column_lineage: bool = Field( - default=True, - description="Populates view->view and table->view column lineage using DataHub's sql parser.", - ) - use_queries_v2: bool = Field( default=False, description="If enabled, uses the new queries extractor to extract queries from snowflake.", @@ -355,10 +337,6 @@ def get_sql_alchemy_url( self, database=database, username=username, password=password, role=role ) - @property - def parse_view_ddl(self) -> bool: - return self.include_view_column_lineage - @validator("shares") def validate_shares( cls, shares: Optional[Dict[str, SnowflakeShareConfig]], values: Dict diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py index b815a6584379a..6b200590d7ab6 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py @@ -8,7 +8,6 @@ from datahub.configuration.datetimes import parse_absolute_time from datahub.ingestion.api.closeable import Closeable -from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.source.aws.s3_util import make_s3_urn_for_lineage from datahub.ingestion.source.snowflake.constants import ( LINEAGE_PERMISSION_ERROR, @@ -163,11 +162,11 @@ def get_time_window(self) -> Tuple[datetime, datetime]: self.config.end_time, ) - def get_workunits( + def add_time_based_lineage_to_aggregator( self, discovered_tables: List[str], discovered_views: List[str], - ) -> Iterable[MetadataWorkUnit]: + ) -> None: if not self._should_ingest_lineage(): return @@ -177,9 +176,7 @@ def get_workunits( # snowflake view/table -> snowflake table self.populate_table_upstreams(discovered_tables) - for mcp in self.sql_aggregator.gen_metadata(): - yield mcp.as_workunit() - + def update_state(self): if self.redundant_run_skip_handler: # Update the checkpoint state for this run. self.redundant_run_skip_handler.update_state( @@ -337,10 +334,6 @@ def _fetch_upstream_lineages_for_tables(self) -> Iterable[UpstreamLineageEdge]: start_time_millis=int(self.start_time.timestamp() * 1000), end_time_millis=int(self.end_time.timestamp() * 1000), upstreams_deny_pattern=self.config.temporary_tables_pattern, - # The self.config.include_view_lineage setting is about fetching upstreams of views. - # We always generate lineage pointing at views from tables, even if self.config.include_view_lineage is False. - # TODO: Remove this `include_view_lineage` flag, since it's effectively dead code. - include_view_lineage=True, include_column_lineage=self.config.include_column_lineage, ) try: diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_query.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_query.py index 97c398c1962d6..a94b39476b2c2 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_query.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_query.py @@ -376,7 +376,6 @@ def view_dependencies() -> str: def table_to_table_lineage_history_v2( start_time_millis: int, end_time_millis: int, - include_view_lineage: bool = True, include_column_lineage: bool = True, upstreams_deny_pattern: List[str] = DEFAULT_TEMP_TABLES_PATTERNS, ) -> str: @@ -385,14 +384,12 @@ def table_to_table_lineage_history_v2( start_time_millis, end_time_millis, upstreams_deny_pattern, - include_view_lineage, ) else: return SnowflakeQuery.table_upstreams_only( start_time_millis, end_time_millis, upstreams_deny_pattern, - include_view_lineage, ) @staticmethod @@ -677,12 +674,9 @@ def table_upstreams_with_column_lineage( start_time_millis: int, end_time_millis: int, upstreams_deny_pattern: List[str], - include_view_lineage: bool = True, ) -> str: allowed_upstream_table_domains = ( SnowflakeQuery.ACCESS_HISTORY_TABLE_VIEW_DOMAINS_FILTER - if include_view_lineage - else SnowflakeQuery.ACCESS_HISTORY_TABLE_DOMAINS_FILTER ) upstream_sql_filter = create_deny_regex_sql_filter( @@ -847,12 +841,9 @@ def table_upstreams_only( start_time_millis: int, end_time_millis: int, upstreams_deny_pattern: List[str], - include_view_lineage: bool = True, ) -> str: allowed_upstream_table_domains = ( SnowflakeQuery.ACCESS_HISTORY_TABLE_VIEW_DOMAINS_FILTER - if include_view_lineage - else SnowflakeQuery.ACCESS_HISTORY_TABLE_DOMAINS_FILTER ) upstream_sql_filter = create_deny_regex_sql_filter( diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py index 4b72b09fafe2d..8a1bf15b7a7bc 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py @@ -435,11 +435,7 @@ def _process_schema( ) if self.config.include_views: - if ( - self.aggregator - and self.config.include_view_lineage - and self.config.parse_view_ddl - ): + if self.aggregator: for view in views: view_identifier = self.identifiers.get_dataset_identifier( view.name, schema_name, db_name diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_shares.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_shares.py index 794a6f4a59f46..606acd53dc332 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_shares.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_shares.py @@ -72,7 +72,7 @@ def get_shares_workunits( assert len(sibling_dbs) == 1 # SnowflakeLineageExtractor is unaware of database->schema->table hierarchy # hence this lineage code is not written in SnowflakeLineageExtractor - # also this is not governed by configs include_table_lineage and include_view_lineage + # also this is not governed by configs include_table_lineage yield self.get_upstream_lineage_with_primary_sibling( db.name, schema.name, table_name, sibling_dbs[0] ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py index 884e6c49f5b62..954e8a29c1a1b 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py @@ -82,6 +82,7 @@ LINEAGE_EXTRACTION, METADATA_EXTRACTION, QUERIES_EXTRACTION, + VIEW_PARSING, ) from datahub.sql_parsing.sql_parsing_aggregator import SqlParsingAggregator from datahub.utilities.registries.domain_registry import DomainRegistry @@ -103,7 +104,7 @@ @capability(SourceCapability.DESCRIPTIONS, "Enabled by default") @capability( SourceCapability.LINEAGE_COARSE, - "Enabled by default, can be disabled via configuration `include_table_lineage` and `include_view_lineage`", + "Enabled by default, can be disabled via configuration `include_table_lineage`", ) @capability( SourceCapability.LINEAGE_FINE, @@ -512,15 +513,14 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: discovered_datasets = discovered_tables + discovered_views if self.config.use_queries_v2: - self.report.set_ingestion_stage("*", "View Parsing") - assert self.aggregator is not None + self.report.set_ingestion_stage("*", VIEW_PARSING) yield from auto_workunit(self.aggregator.gen_metadata()) self.report.set_ingestion_stage("*", QUERIES_EXTRACTION) schema_resolver = self.aggregator._schema_resolver - queries_extractor: SnowflakeQueriesExtractor = SnowflakeQueriesExtractor( + queries_extractor = SnowflakeQueriesExtractor( connection=self.connection, config=SnowflakeQueriesExtractorConfig( window=self.config, @@ -546,13 +546,21 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: queries_extractor.close() else: - if self.config.include_table_lineage and self.lineage_extractor: + if self.lineage_extractor: self.report.set_ingestion_stage("*", LINEAGE_EXTRACTION) - yield from self.lineage_extractor.get_workunits( + self.lineage_extractor.add_time_based_lineage_to_aggregator( discovered_tables=discovered_tables, discovered_views=discovered_views, ) + # This would emit view and external table ddl lineage + # as well as query lineage via lineage_extractor + for mcp in self.aggregator.gen_metadata(): + yield mcp.as_workunit() + + if self.lineage_extractor: + self.lineage_extractor.update_state() + if ( self.config.include_usage_stats or self.config.include_operational_stats ) and self.usage_extractor: diff --git a/metadata-ingestion/src/datahub/ingestion/source_report/ingestion_stage.py b/metadata-ingestion/src/datahub/ingestion/source_report/ingestion_stage.py index 92407eaae6e90..42b3b648bd298 100644 --- a/metadata-ingestion/src/datahub/ingestion/source_report/ingestion_stage.py +++ b/metadata-ingestion/src/datahub/ingestion/source_report/ingestion_stage.py @@ -15,6 +15,7 @@ USAGE_EXTRACTION_OPERATIONAL_STATS = "Usage Extraction Operational Stats" USAGE_EXTRACTION_USAGE_AGGREGATION = "Usage Extraction Usage Aggregation" EXTERNAL_TABLE_DDL_LINEAGE = "External table DDL Lineage" +VIEW_PARSING = "View Parsing" QUERIES_EXTRACTION = "Queries Extraction" PROFILING = "Profiling" diff --git a/metadata-ingestion/tests/integration/snowflake/common.py b/metadata-ingestion/tests/integration/snowflake/common.py index 862d27186703a..7b4f5abe1cd46 100644 --- a/metadata-ingestion/tests/integration/snowflake/common.py +++ b/metadata-ingestion/tests/integration/snowflake/common.py @@ -458,7 +458,6 @@ def default_query_results( # noqa: C901 snowflake_query.SnowflakeQuery.table_to_table_lineage_history_v2( start_time_millis=1654473600000, end_time_millis=1654586220000, - include_view_lineage=True, include_column_lineage=True, ), ): @@ -548,7 +547,6 @@ def default_query_results( # noqa: C901 snowflake_query.SnowflakeQuery.table_to_table_lineage_history_v2( start_time_millis=1654473600000, end_time_millis=1654586220000, - include_view_lineage=True, include_column_lineage=False, ), ): diff --git a/metadata-ingestion/tests/integration/snowflake/test_snowflake.py b/metadata-ingestion/tests/integration/snowflake/test_snowflake.py index 1d7470d24f768..ef4918a20e640 100644 --- a/metadata-ingestion/tests/integration/snowflake/test_snowflake.py +++ b/metadata-ingestion/tests/integration/snowflake/test_snowflake.py @@ -117,7 +117,6 @@ def test_snowflake_basic(pytestconfig, tmp_path, mock_time, mock_datahub_graph): schema_pattern=AllowDenyPattern(allow=["test_db.test_schema"]), include_technical_schema=True, include_table_lineage=True, - include_view_lineage=True, include_usage_stats=True, format_sql_queries=True, validate_upstreams_against_patterns=False, @@ -216,7 +215,6 @@ def test_snowflake_private_link_and_incremental_mcps( include_table_lineage=True, include_column_lineage=False, include_views=True, - include_view_lineage=True, include_usage_stats=False, format_sql_queries=True, incremental_lineage=False, diff --git a/metadata-ingestion/tests/integration/snowflake/test_snowflake_classification.py b/metadata-ingestion/tests/integration/snowflake/test_snowflake_classification.py index 75a9df4f28051..52453b30f740a 100644 --- a/metadata-ingestion/tests/integration/snowflake/test_snowflake_classification.py +++ b/metadata-ingestion/tests/integration/snowflake/test_snowflake_classification.py @@ -66,7 +66,6 @@ def test_snowflake_classification_perf(num_workers, num_cols_per_table, num_tabl schema_pattern=AllowDenyPattern(allow=["test_db.test_schema"]), include_technical_schema=True, include_table_lineage=False, - include_view_lineage=False, include_column_lineage=False, include_usage_stats=False, include_operational_stats=False, diff --git a/metadata-ingestion/tests/integration/snowflake/test_snowflake_failures.py b/metadata-ingestion/tests/integration/snowflake/test_snowflake_failures.py index 0b838b0bb59c3..de6e996a52642 100644 --- a/metadata-ingestion/tests/integration/snowflake/test_snowflake_failures.py +++ b/metadata-ingestion/tests/integration/snowflake/test_snowflake_failures.py @@ -49,7 +49,6 @@ def snowflake_pipeline_config(tmp_path): include_technical_schema=True, match_fully_qualified_names=True, schema_pattern=AllowDenyPattern(allow=["test_db.test_schema"]), - include_view_lineage=False, include_usage_stats=False, start_time=datetime(2022, 6, 6, 0, 0, 0, 0).replace( tzinfo=timezone.utc @@ -227,7 +226,6 @@ def test_snowflake_missing_snowflake_lineage_permission_causes_pipeline_failure( snowflake_query.SnowflakeQuery.table_to_table_lineage_history_v2( start_time_millis=1654473600000, end_time_millis=1654586220000, - include_view_lineage=True, include_column_lineage=True, ) ], diff --git a/metadata-ingestion/tests/integration/snowflake/test_snowflake_tag.py b/metadata-ingestion/tests/integration/snowflake/test_snowflake_tag.py index d5e265e783882..9bb598cb0c1c7 100644 --- a/metadata-ingestion/tests/integration/snowflake/test_snowflake_tag.py +++ b/metadata-ingestion/tests/integration/snowflake/test_snowflake_tag.py @@ -30,7 +30,6 @@ def test_snowflake_tag_pattern(): ), include_technical_schema=True, include_table_lineage=False, - include_view_lineage=False, include_column_lineage=False, include_usage_stats=False, include_operational_stats=False, @@ -74,7 +73,6 @@ def test_snowflake_tag_pattern_deny(): ), include_technical_schema=True, include_table_lineage=False, - include_view_lineage=False, include_column_lineage=False, include_usage_stats=False, include_operational_stats=False, diff --git a/metadata-ingestion/tests/performance/snowflake/test_snowflake.py b/metadata-ingestion/tests/performance/snowflake/test_snowflake.py index 5042c78c2e7b9..984d9e4295745 100644 --- a/metadata-ingestion/tests/performance/snowflake/test_snowflake.py +++ b/metadata-ingestion/tests/performance/snowflake/test_snowflake.py @@ -37,7 +37,6 @@ def run_test(): password="TST_PWD", include_technical_schema=False, include_table_lineage=True, - include_view_lineage=True, include_usage_stats=True, include_operational_stats=True, start_time=datetime(2022, 6, 6, 0, 0, 0, 0).replace(tzinfo=timezone.utc), diff --git a/metadata-ingestion/tests/unit/snowflake/test_snowflake_source.py b/metadata-ingestion/tests/unit/snowflake/test_snowflake_source.py index 2ff85a08f052f..75f32b535eb2e 100644 --- a/metadata-ingestion/tests/unit/snowflake/test_snowflake_source.py +++ b/metadata-ingestion/tests/unit/snowflake/test_snowflake_source.py @@ -257,17 +257,6 @@ def test_options_contain_connect_args(): assert connect_args is not None -def test_snowflake_config_with_view_lineage_no_table_lineage_throws_error(): - config_dict = default_config_dict.copy() - config_dict["include_view_lineage"] = True - config_dict["include_table_lineage"] = False - with pytest.raises( - ValidationError, - match="include_table_lineage must be True for include_view_lineage to be set", - ): - SnowflakeV2Config.parse_obj(config_dict) - - def test_snowflake_config_with_column_lineage_no_table_lineage_throws_error(): config_dict = default_config_dict.copy() config_dict["include_column_lineage"] = True @@ -667,6 +656,18 @@ def test_snowflake_utils() -> None: assert_doctest(datahub.ingestion.source.snowflake.snowflake_utils) +def test_using_removed_fields_causes_no_error() -> None: + assert SnowflakeV2Config.parse_obj( + { + "account_id": "test", + "username": "snowflake", + "password": "snowflake", + "include_view_lineage": "true", + "include_view_column_lineage": "true", + } + ) + + def test_snowflake_query_result_parsing(): db_row = { "DOWNSTREAM_TABLE_NAME": "db.schema.downstream_table", From d88e6c997713509d8ecdb463c42d072c5c857853 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergio=20G=C3=B3mez=20Villamor?= Date: Tue, 24 Dec 2024 16:03:36 +0100 Subject: [PATCH 27/39] fix(tableau): fixes wrong argument when reauthenticating (#12216) --- .../ingestion/source/tableau/tableau.py | 48 +++++++++++-------- .../tableau/test_tableau_ingest.py | 10 ++-- 2 files changed, 33 insertions(+), 25 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py index 2b7aac2bea1d0..508500ffe489b 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py +++ b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py @@ -621,6 +621,12 @@ def update_table( self.parsed_columns = parsed_columns +@dataclass +class SiteIdContentUrl: + site_id: str + site_content_url: str + + class TableauSourceReport(StaleEntityRemovalSourceReport): get_all_datasources_query_failed: bool = False num_get_datasource_query_failures: int = 0 @@ -773,7 +779,6 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: config=self.config, ctx=self.ctx, site=site, - site_id=site.id, report=self.report, server=self.server, platform=self.platform, @@ -792,8 +797,11 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: site_source = TableauSiteSource( config=self.config, ctx=self.ctx, - site=site, - site_id=self.server.site_id, + site=site + if site + else SiteIdContentUrl( + site_id=self.server.site_id, site_content_url=self.config.site + ), report=self.report, server=self.server, platform=self.platform, @@ -826,8 +834,7 @@ def __init__( self, config: TableauConfig, ctx: PipelineContext, - site: Optional[SiteItem], - site_id: Optional[str], + site: Union[SiteItem, SiteIdContentUrl], report: TableauSourceReport, server: Server, platform: str, @@ -838,13 +845,18 @@ def __init__( self.ctx: PipelineContext = ctx self.platform = platform - self.site: Optional[SiteItem] = site - if site_id is not None: - self.site_id: str = site_id + self.site: Optional[SiteItem] = None + if isinstance(site, SiteItem): + self.site = site + assert site.id is not None, "Site ID is required" + self.site_id = site.id + self.site_content_url = site.content_url + elif isinstance(site, SiteIdContentUrl): + self.site = None + self.site_id = site.site_id + self.site_content_url = site.site_content_url else: - assert self.site is not None, "site or site_id is required" - assert self.site.id is not None, "site_id is required when site is provided" - self.site_id = self.site.id + raise AssertionError("site or site id+content_url pair is required") self.database_tables: Dict[str, DatabaseTable] = {} self.tableau_stat_registry: Dict[str, UsageStat] = {} @@ -898,16 +910,14 @@ def dataset_browse_prefix(self) -> str: # datasets also have the env in the browse path return f"/{self.config.env.lower()}{self.no_env_browse_prefix}" - def _re_authenticate(self): + def _re_authenticate(self) -> None: + self.report.info( + message="Re-authenticating to Tableau", + context=f"site='{self.site_content_url}'", + ) # Sign-in again may not be enough because Tableau sometimes caches invalid sessions # so we need to recreate the Tableau Server object - self.server = self.config.make_tableau_client(self.site_id) - - @property - def site_content_url(self) -> Optional[str]: - if self.site and self.site.content_url: - return self.site.content_url - return None + self.server = self.config.make_tableau_client(self.site_content_url) def _populate_usage_stat_registry(self) -> None: if self.server is None: diff --git a/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py b/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py index c3a8880bf20a0..902ff243c802a 100644 --- a/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py +++ b/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py @@ -26,6 +26,7 @@ from datahub.ingestion.run.pipeline import Pipeline, PipelineContext from datahub.ingestion.source.tableau import tableau_constant as c from datahub.ingestion.source.tableau.tableau import ( + SiteIdContentUrl, TableauConfig, TableauProject, TableauSiteSource, @@ -1008,8 +1009,7 @@ def check_lineage_metadata( config=config, ctx=context, platform="tableau", - site=SiteItem(name="Site 1", content_url="site1"), - site_id="site1", + site=SiteIdContentUrl(site_id="id1", site_content_url="site1"), report=TableauSourceReport(), server=Server("https://test-tableau-server.com"), ) @@ -1313,8 +1313,7 @@ def test_permission_warning(pytestconfig, tmp_path, mock_datahub_graph): platform="tableau", config=mock.MagicMock(), ctx=mock.MagicMock(), - site=mock.MagicMock(), - site_id=None, + site=mock.MagicMock(spec=SiteItem, id="Site1", content_url="site1"), server=mock_sdk.return_value, report=reporter, ) @@ -1371,8 +1370,7 @@ def test_extract_project_hierarchy(extract_project_hierarchy, allowed_projects): config=config, ctx=context, platform="tableau", - site=SiteItem(name="Site 1", content_url="site1"), - site_id="site1", + site=mock.MagicMock(spec=SiteItem, id="Site1", content_url="site1"), report=TableauSourceReport(), server=Server("https://test-tableau-server.com"), ) From 48736a03dd56c70a3894efcbef6e95a23d8cbfdd Mon Sep 17 00:00:00 2001 From: sagar-salvi-apptware <159135491+sagar-salvi-apptware@users.noreply.github.com> Date: Wed, 25 Dec 2024 00:57:27 +0530 Subject: [PATCH 28/39] fix(ingest/looker): Add flag for Looker metadata extraction (#12205) Co-authored-by: Mayuri Nehate <33225191+mayurinehate@users.noreply.github.com> --- .../src/datahub/sql_parsing/tool_meta_extractor.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/metadata-ingestion/src/datahub/sql_parsing/tool_meta_extractor.py b/metadata-ingestion/src/datahub/sql_parsing/tool_meta_extractor.py index 5af9d9d4f0fff..d2682252e0fbf 100644 --- a/metadata-ingestion/src/datahub/sql_parsing/tool_meta_extractor.py +++ b/metadata-ingestion/src/datahub/sql_parsing/tool_meta_extractor.py @@ -40,6 +40,7 @@ def _get_last_line(query: str) -> str: class ToolMetaExtractorReport(Report): num_queries_meta_extracted: Dict[str, int] = field(default_factory=int_top_k_dict) failures: List[str] = field(default_factory=list) + looker_user_mapping_missing: Optional[bool] = None class ToolMetaExtractor: @@ -108,7 +109,9 @@ def extract_looker_user_mapping_from_graph( PlatformResource.search_by_filters(query=query, graph_client=graph) ) - if len(platform_resources) > 1: + if len(platform_resources) == 0: + report.looker_user_mapping_missing = True + elif len(platform_resources) > 1: report.failures.append( "Looker user metadata extraction failed. Found more than one looker user id mappings." ) From f4b33b59d1726dd962db4d3300f085cc60626a81 Mon Sep 17 00:00:00 2001 From: Andrew Sikowitz Date: Tue, 24 Dec 2024 11:33:06 -0800 Subject: [PATCH 29/39] fix(ingest/mode): Handle 204 response and invalid json (#12156) Co-authored-by: Aseem Bansal --- .../src/datahub/ingestion/source/mode.py | 46 +++--- .../tests/integration/mode/test_mode.py | 141 ++++++++++++++++-- 2 files changed, 151 insertions(+), 36 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/mode.py b/metadata-ingestion/src/datahub/ingestion/source/mode.py index ef0b499129f97..68ecc5d8694ac 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/mode.py +++ b/metadata-ingestion/src/datahub/ingestion/source/mode.py @@ -5,6 +5,7 @@ from dataclasses import dataclass from datetime import datetime, timezone from functools import lru_cache +from json import JSONDecodeError from typing import Dict, Iterable, List, Optional, Set, Tuple, Union import dateutil.parser as dp @@ -193,6 +194,9 @@ class HTTPError429(HTTPError): pass +ModeRequestError = (HTTPError, JSONDecodeError) + + @dataclass class ModeSourceReport(StaleEntityRemovalSourceReport): filtered_spaces: LossyList[str] = dataclasses.field(default_factory=LossyList) @@ -328,11 +332,11 @@ def __init__(self, ctx: PipelineContext, config: ModeConfig): # Test the connection try: self._get_request_json(f"{self.config.connect_uri}/api/verify") - except HTTPError as http_error: + except ModeRequestError as e: self.report.report_failure( title="Failed to Connect", message="Unable to verify connection to mode.", - context=f"Error: {str(http_error)}", + context=f"Error: {str(e)}", ) self.workspace_uri = f"{self.config.connect_uri}/api/{self.config.workspace}" @@ -521,11 +525,11 @@ def _get_creator(self, href: str) -> Optional[str]: if self.config.owner_username_instead_of_email else user_json.get("email") ) - except HTTPError as http_error: + except ModeRequestError as e: self.report.report_warning( title="Failed to retrieve Mode creator", message=f"Unable to retrieve user for {href}", - context=f"Reason: {str(http_error)}", + context=f"Reason: {str(e)}", ) return user @@ -571,11 +575,11 @@ def _get_space_name_and_tokens(self) -> dict: logging.debug(f"Skipping space {space_name} due to space pattern") continue space_info[s.get("token", "")] = s.get("name", "") - except HTTPError as http_error: + except ModeRequestError as e: self.report.report_failure( title="Failed to Retrieve Spaces", message="Unable to retrieve spaces / collections for workspace.", - context=f"Workspace: {self.workspace_uri}, Error: {str(http_error)}", + context=f"Workspace: {self.workspace_uri}, Error: {str(e)}", ) return space_info @@ -721,11 +725,11 @@ def _get_data_sources(self) -> List[dict]: try: ds_json = self._get_request_json(f"{self.workspace_uri}/data_sources") data_sources = ds_json.get("_embedded", {}).get("data_sources", []) - except HTTPError as http_error: + except ModeRequestError as e: self.report.report_failure( title="Failed to retrieve Data Sources", message="Unable to retrieve data sources from Mode.", - context=f"Error: {str(http_error)}", + context=f"Error: {str(e)}", ) return data_sources @@ -812,11 +816,11 @@ def _get_definition(self, definition_name): if definition.get("name", "") == definition_name: return definition.get("source", "") - except HTTPError as http_error: + except ModeRequestError as e: self.report.report_failure( title="Failed to Retrieve Definition", message="Unable to retrieve definition from Mode.", - context=f"Definition Name: {definition_name}, Error: {str(http_error)}", + context=f"Definition Name: {definition_name}, Error: {str(e)}", ) return None @@ -1382,11 +1386,11 @@ def _get_reports(self, space_token: str) -> List[dict]: f"{self.workspace_uri}/spaces/{space_token}/reports" ) reports = reports_json.get("_embedded", {}).get("reports", {}) - except HTTPError as http_error: + except ModeRequestError as e: self.report.report_failure( title="Failed to Retrieve Reports for Space", message="Unable to retrieve reports for space token.", - context=f"Space Token: {space_token}, Error: {str(http_error)}", + context=f"Space Token: {space_token}, Error: {str(e)}", ) return reports @@ -1400,11 +1404,11 @@ def _get_datasets(self, space_token: str) -> List[dict]: url = f"{self.workspace_uri}/spaces/{space_token}/datasets" datasets_json = self._get_request_json(url) datasets = datasets_json.get("_embedded", {}).get("reports", []) - except HTTPError as http_error: + except ModeRequestError as e: self.report.report_failure( title="Failed to Retrieve Datasets for Space", message=f"Unable to retrieve datasets for space token {space_token}.", - context=f"Error: {str(http_error)}", + context=f"Error: {str(e)}", ) return datasets @@ -1416,11 +1420,11 @@ def _get_queries(self, report_token: str) -> list: f"{self.workspace_uri}/reports/{report_token}/queries" ) queries = queries_json.get("_embedded", {}).get("queries", {}) - except HTTPError as http_error: + except ModeRequestError as e: self.report.report_failure( title="Failed to Retrieve Queries", message="Unable to retrieve queries for report token.", - context=f"Report Token: {report_token}, Error: {str(http_error)}", + context=f"Report Token: {report_token}, Error: {str(e)}", ) return queries @@ -1433,11 +1437,11 @@ def _get_last_query_run( f"{self.workspace_uri}/reports/{report_token}/runs/{report_run_id}/query_runs{query_run_id}" ) queries = queries_json.get("_embedded", {}).get("queries", {}) - except HTTPError as http_error: + except ModeRequestError as e: self.report.report_failure( title="Failed to Retrieve Queries for Report", message="Unable to retrieve queries for report token.", - context=f"Report Token:{report_token}, Error: {str(http_error)}", + context=f"Report Token:{report_token}, Error: {str(e)}", ) return {} return queries @@ -1451,13 +1455,13 @@ def _get_charts(self, report_token: str, query_token: str) -> list: f"/queries/{query_token}/charts" ) charts = charts_json.get("_embedded", {}).get("charts", {}) - except HTTPError as http_error: + except ModeRequestError as e: self.report.report_failure( title="Failed to Retrieve Charts", message="Unable to retrieve charts from Mode.", context=f"Report Token: {report_token}, " f"Query token: {query_token}, " - f"Error: {str(http_error)}", + f"Error: {str(e)}", ) return charts @@ -1477,6 +1481,8 @@ def get_request(): response = self.session.get( url, timeout=self.config.api_options.timeout ) + if response.status_code == 204: # No content, don't parse json + return {} return response.json() except HTTPError as http_error: error_response = http_error.response diff --git a/metadata-ingestion/tests/integration/mode/test_mode.py b/metadata-ingestion/tests/integration/mode/test_mode.py index ce7533d5611e4..7f1e3935aa0fa 100644 --- a/metadata-ingestion/tests/integration/mode/test_mode.py +++ b/metadata-ingestion/tests/integration/mode/test_mode.py @@ -1,11 +1,14 @@ import json import pathlib +from typing import Sequence from unittest.mock import patch +import pytest from freezegun import freeze_time from requests.models import HTTPError from datahub.configuration.common import PipelineExecutionError +from datahub.ingestion.api.source import StructuredLogEntry from datahub.ingestion.run.pipeline import Pipeline from tests.test_helpers import mce_helpers @@ -28,7 +31,7 @@ "https://app.mode.com/api/acryl/reports/24f66e1701b6/queries": "dataset_queries_24f66e1701b6.json", } -RESPONSE_ERROR_LIST = ["https://app.mode.com/api/acryl/spaces/75737b70402e/reports"] +ERROR_URL = "https://app.mode.com/api/acryl/spaces/75737b70402e/reports" test_resources_dir = pathlib.Path(__file__).parent @@ -49,6 +52,14 @@ def mount(self, prefix, adaptor): return self def get(self, url, timeout=40): + if self.error_list is not None and self.url in self.error_list: + http_error_msg = "{} Client Error: {} for url: {}".format( + 400, + "Simulate error", + self.url, + ) + raise HTTPError(http_error_msg, response=self) + self.url = url self.timeout = timeout response_json_path = f"{test_resources_dir}/setup/{JSON_RESPONSE_MAP.get(url)}" @@ -57,29 +68,46 @@ def get(self, url, timeout=40): self.json_data = data return self - def raise_for_status(self): - if self.error_list is not None and self.url in self.error_list: - http_error_msg = "{} Client Error: {} for url: {}".format( - 400, - "Simulate error", - self.url, - ) - raise HTTPError(http_error_msg, response=self) + +class MockResponseJson(MockResponse): + def __init__( + self, + status_code: int = 200, + *, + json_empty_list: Sequence[str] = (), + json_error_list: Sequence[str] = (), + ): + super().__init__(None, status_code) + self.json_empty_list = json_empty_list + self.json_error_list = json_error_list + + def json(self): + if self.url in self.json_empty_list: + return json.loads("") # Shouldn't be called + if self.url in self.json_error_list: + return json.loads("{") + return super().json() + + def get(self, url, timeout=40): + response = super().get(url, timeout) + if self.url in self.json_empty_list: + response.status_code = 204 + return response -def mocked_requests_sucess(*args, **kwargs): +def mocked_requests_success(*args, **kwargs): return MockResponse(None, 200) def mocked_requests_failure(*args, **kwargs): - return MockResponse(RESPONSE_ERROR_LIST, 200) + return MockResponse([ERROR_URL], 200) @freeze_time(FROZEN_TIME) def test_mode_ingest_success(pytestconfig, tmp_path): with patch( "datahub.ingestion.source.mode.requests.Session", - side_effect=mocked_requests_sucess, + side_effect=mocked_requests_success, ): pipeline = Pipeline.create( { @@ -142,8 +170,89 @@ def test_mode_ingest_failure(pytestconfig, tmp_path): } ) pipeline.run() - try: + with pytest.raises(PipelineExecutionError) as exec_error: pipeline.raise_from_status() - except PipelineExecutionError as exec_error: - assert exec_error.args[0] == "Source reported errors" - assert len(exec_error.args[1].failures) == 1 + assert exec_error.value.args[0] == "Source reported errors" + assert len(exec_error.value.args[1].failures) == 1 + error_dict: StructuredLogEntry + _level, error_dict = exec_error.value.args[1].failures[0] + error = next(iter(error_dict.context)) + assert "Simulate error" in error + assert ERROR_URL in error + + +@freeze_time(FROZEN_TIME) +def test_mode_ingest_json_empty(pytestconfig, tmp_path): + with patch( + "datahub.ingestion.source.mode.requests.Session", + side_effect=lambda *args, **kwargs: MockResponseJson( + json_empty_list=["https://app.mode.com/api/modeuser"] + ), + ): + global test_resources_dir + test_resources_dir = pytestconfig.rootpath / "tests/integration/mode" + + pipeline = Pipeline.create( + { + "run_id": "mode-test", + "source": { + "type": "mode", + "config": { + "token": "xxxx", + "password": "xxxx", + "connect_uri": "https://app.mode.com/", + "workspace": "acryl", + }, + }, + "sink": { + "type": "file", + "config": { + "filename": f"{tmp_path}/mode_mces.json", + }, + }, + } + ) + pipeline.run() + pipeline.raise_from_status(raise_warnings=True) + + +@freeze_time(FROZEN_TIME) +def test_mode_ingest_json_failure(pytestconfig, tmp_path): + with patch( + "datahub.ingestion.source.mode.requests.Session", + side_effect=lambda *args, **kwargs: MockResponseJson( + json_error_list=["https://app.mode.com/api/modeuser"] + ), + ): + global test_resources_dir + test_resources_dir = pytestconfig.rootpath / "tests/integration/mode" + + pipeline = Pipeline.create( + { + "run_id": "mode-test", + "source": { + "type": "mode", + "config": { + "token": "xxxx", + "password": "xxxx", + "connect_uri": "https://app.mode.com/", + "workspace": "acryl", + }, + }, + "sink": { + "type": "file", + "config": { + "filename": f"{tmp_path}/mode_mces.json", + }, + }, + } + ) + pipeline.run() + pipeline.raise_from_status(raise_warnings=False) + with pytest.raises(PipelineExecutionError) as exec_error: + pipeline.raise_from_status(raise_warnings=True) + assert len(exec_error.value.args[1].warnings) > 0 + error_dict: StructuredLogEntry + _level, error_dict = exec_error.value.args[1].warnings[0] + error = next(iter(error_dict.context)) + assert "Expecting property name enclosed in double quotes" in error From 756b199506d57449c60a3a28901f7d22fe89f9f1 Mon Sep 17 00:00:00 2001 From: Andrew Sikowitz Date: Tue, 24 Dec 2024 14:56:35 -0800 Subject: [PATCH 30/39] fix(ingest/glue): Add additional checks and logging when specifying catalog_id (#12168) --- .../src/datahub/ingestion/source/aws/glue.py | 14 +++++- .../tests/unit/glue/glue_mces_golden.json | 2 +- .../glue/glue_mces_golden_table_lineage.json | 2 +- .../glue_mces_platform_instance_golden.json | 2 +- .../tests/unit/glue/test_glue_source.py | 43 +++++++++++++++++-- .../tests/unit/glue/test_glue_source_stubs.py | 8 ++-- 6 files changed, 59 insertions(+), 12 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py b/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py index 37c146218e263..7a5ed154d40bc 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py +++ b/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py @@ -52,6 +52,7 @@ platform_name, support_status, ) +from datahub.ingestion.api.report import EntityFilterReport from datahub.ingestion.api.source import MetadataWorkUnitProcessor from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.source.aws import s3_util @@ -115,7 +116,6 @@ logger = logging.getLogger(__name__) - DEFAULT_PLATFORM = "glue" VALID_PLATFORMS = [DEFAULT_PLATFORM, "athena"] @@ -220,6 +220,7 @@ def platform_validator(cls, v: str) -> str: class GlueSourceReport(StaleEntityRemovalSourceReport): tables_scanned = 0 filtered: List[str] = dataclass_field(default_factory=list) + databases: EntityFilterReport = EntityFilterReport.field(type="database") num_job_script_location_missing: int = 0 num_job_script_location_invalid: int = 0 @@ -668,6 +669,7 @@ def get_datajob_wu(self, node: Dict[str, Any], job_name: str) -> MetadataWorkUni return MetadataWorkUnit(id=f'{job_name}-{node["Id"]}', mce=mce) def get_all_databases(self) -> Iterable[Mapping[str, Any]]: + logger.debug("Getting all databases") # see https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/glue/paginator/GetDatabases.html paginator = self.glue_client.get_paginator("get_databases") @@ -684,10 +686,18 @@ def get_all_databases(self) -> Iterable[Mapping[str, Any]]: pattern += "[?!TargetDatabase]" for database in paginator_response.search(pattern): - if self.source_config.database_pattern.allowed(database["Name"]): + if (not self.source_config.database_pattern.allowed(database["Name"])) or ( + self.source_config.catalog_id + and database.get("CatalogId") + and database.get("CatalogId") != self.source_config.catalog_id + ): + self.report.databases.dropped(database["Name"]) + else: + self.report.databases.processed(database["Name"]) yield database def get_tables_from_database(self, database: Mapping[str, Any]) -> Iterable[Dict]: + logger.debug(f"Getting tables from database {database['Name']}") # see https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/glue/paginator/GetTables.html paginator = self.glue_client.get_paginator("get_tables") database_name = database["Name"] diff --git a/metadata-ingestion/tests/unit/glue/glue_mces_golden.json b/metadata-ingestion/tests/unit/glue/glue_mces_golden.json index 87971de12fbb3..71d7c31b222bd 100644 --- a/metadata-ingestion/tests/unit/glue/glue_mces_golden.json +++ b/metadata-ingestion/tests/unit/glue/glue_mces_golden.json @@ -124,7 +124,7 @@ "CreateTime": "June 01, 2021 at 14:55:13" }, "name": "empty-database", - "qualifiedName": "arn:aws:glue:us-west-2:123412341234:database/empty-database", + "qualifiedName": "arn:aws:glue:us-west-2:000000000000:database/empty-database", "env": "PROD" } } diff --git a/metadata-ingestion/tests/unit/glue/glue_mces_golden_table_lineage.json b/metadata-ingestion/tests/unit/glue/glue_mces_golden_table_lineage.json index e2dd4cec97c2e..22bb4b53b91ef 100644 --- a/metadata-ingestion/tests/unit/glue/glue_mces_golden_table_lineage.json +++ b/metadata-ingestion/tests/unit/glue/glue_mces_golden_table_lineage.json @@ -124,7 +124,7 @@ "CreateTime": "June 01, 2021 at 14:55:13" }, "name": "empty-database", - "qualifiedName": "arn:aws:glue:us-west-2:123412341234:database/empty-database", + "qualifiedName": "arn:aws:glue:us-west-2:000000000000:database/empty-database", "env": "PROD" } } diff --git a/metadata-ingestion/tests/unit/glue/glue_mces_platform_instance_golden.json b/metadata-ingestion/tests/unit/glue/glue_mces_platform_instance_golden.json index 0b883062763f4..b700335c26e5a 100644 --- a/metadata-ingestion/tests/unit/glue/glue_mces_platform_instance_golden.json +++ b/metadata-ingestion/tests/unit/glue/glue_mces_platform_instance_golden.json @@ -129,7 +129,7 @@ "CreateTime": "June 01, 2021 at 14:55:13" }, "name": "empty-database", - "qualifiedName": "arn:aws:glue:us-west-2:123412341234:database/empty-database", + "qualifiedName": "arn:aws:glue:us-west-2:000000000000:database/empty-database", "env": "PROD" } } diff --git a/metadata-ingestion/tests/unit/glue/test_glue_source.py b/metadata-ingestion/tests/unit/glue/test_glue_source.py index 693fd6bc336fd..9e3f260a23f1c 100644 --- a/metadata-ingestion/tests/unit/glue/test_glue_source.py +++ b/metadata-ingestion/tests/unit/glue/test_glue_source.py @@ -35,8 +35,8 @@ validate_all_providers_have_committed_successfully, ) from tests.unit.glue.test_glue_source_stubs import ( - databases_1, - databases_2, + empty_database, + flights_database, get_bucket_tagging, get_databases_delta_response, get_databases_response, @@ -64,6 +64,7 @@ tables_2, tables_profiling_1, target_database_tables, + test_database, ) FROZEN_TIME = "2020-04-14 07:00:00" @@ -310,6 +311,40 @@ def test_config_without_platform(): assert source.platform == "glue" +def test_get_databases_filters_by_catalog(): + def format_databases(databases): + return set(d["Name"] for d in databases) + + all_catalogs_source: GlueSource = GlueSource( + config=GlueSourceConfig(aws_region="us-west-2"), + ctx=PipelineContext(run_id="glue-source-test"), + ) + with Stubber(all_catalogs_source.glue_client) as glue_stubber: + glue_stubber.add_response("get_databases", get_databases_response, {}) + + expected = [flights_database, test_database, empty_database] + actual = all_catalogs_source.get_all_databases() + assert format_databases(actual) == format_databases(expected) + assert all_catalogs_source.report.databases.dropped_entities.as_obj() == [] + + catalog_id = "123412341234" + single_catalog_source: GlueSource = GlueSource( + config=GlueSourceConfig(catalog_id=catalog_id, aws_region="us-west-2"), + ctx=PipelineContext(run_id="glue-source-test"), + ) + with Stubber(single_catalog_source.glue_client) as glue_stubber: + glue_stubber.add_response( + "get_databases", get_databases_response, {"CatalogId": catalog_id} + ) + + expected = [flights_database, test_database] + actual = single_catalog_source.get_all_databases() + assert format_databases(actual) == format_databases(expected) + assert single_catalog_source.report.databases.dropped_entities.as_obj() == [ + "empty-database" + ] + + @freeze_time(FROZEN_TIME) def test_glue_stateful(pytestconfig, tmp_path, mock_time, mock_datahub_graph): deleted_actor_golden_mcs = "{}/glue_deleted_actor_mces_golden.json".format( @@ -357,8 +392,8 @@ def test_glue_stateful(pytestconfig, tmp_path, mock_time, mock_datahub_graph): tables_on_first_call = tables_1 tables_on_second_call = tables_2 mock_get_all_databases_and_tables.side_effect = [ - (databases_1, tables_on_first_call), - (databases_2, tables_on_second_call), + ([flights_database], tables_on_first_call), + ([test_database], tables_on_second_call), ] pipeline_run1 = run_and_get_pipeline(pipeline_config_dict) diff --git a/metadata-ingestion/tests/unit/glue/test_glue_source_stubs.py b/metadata-ingestion/tests/unit/glue/test_glue_source_stubs.py index dba1eea3010c2..43bf62fd4e3b8 100644 --- a/metadata-ingestion/tests/unit/glue/test_glue_source_stubs.py +++ b/metadata-ingestion/tests/unit/glue/test_glue_source_stubs.py @@ -88,12 +88,14 @@ "Permissions": ["ALL"], } ], - "CatalogId": "123412341234", + "CatalogId": "000000000000", }, ] } -databases_1 = [{"Name": "flights-database", "CatalogId": "123412341234"}] -databases_2 = [{"Name": "test-database", "CatalogId": "123412341234"}] +flights_database = {"Name": "flights-database", "CatalogId": "123412341234"} +test_database = {"Name": "test-database", "CatalogId": "123412341234"} +empty_database = {"Name": "empty-database", "CatalogId": "000000000000"} + tables_1 = [ { "Name": "avro", From 16698da509ec5c9f86188db7a16c38cea19d61bf Mon Sep 17 00:00:00 2001 From: Aseem Bansal Date: Thu, 26 Dec 2024 18:10:09 +0530 Subject: [PATCH 31/39] fix(ingest/gc): misc fixes in gc source (#12226) --- .../datahub/ingestion/source/gc/datahub_gc.py | 23 +++++-- .../source/gc/execution_request_cleanup.py | 61 +++++++++++++++---- .../source_report/ingestion_stage.py | 1 + 3 files changed, 68 insertions(+), 17 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/gc/datahub_gc.py b/metadata-ingestion/src/datahub/ingestion/source/gc/datahub_gc.py index 4eecbb4d9d717..168b787b85e8b 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/gc/datahub_gc.py +++ b/metadata-ingestion/src/datahub/ingestion/source/gc/datahub_gc.py @@ -34,6 +34,7 @@ SoftDeletedEntitiesCleanupConfig, SoftDeletedEntitiesReport, ) +from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport logger = logging.getLogger(__name__) @@ -86,6 +87,7 @@ class DataHubGcSourceReport( DataProcessCleanupReport, SoftDeletedEntitiesReport, DatahubExecutionRequestCleanupReport, + IngestionStageReport, ): expired_tokens_revoked: int = 0 @@ -139,31 +141,40 @@ def get_workunits_internal( ) -> Iterable[MetadataWorkUnit]: if self.config.cleanup_expired_tokens: try: + self.report.report_ingestion_stage_start("Expired Token Cleanup") self.revoke_expired_tokens() except Exception as e: self.report.failure("While trying to cleanup expired token ", exc=e) if self.config.truncate_indices: try: + self.report.report_ingestion_stage_start("Truncate Indices") self.truncate_indices() except Exception as e: self.report.failure("While trying to truncate indices ", exc=e) if self.config.soft_deleted_entities_cleanup.enabled: try: + self.report.report_ingestion_stage_start( + "Soft Deleted Entities Cleanup" + ) self.soft_deleted_entities_cleanup.cleanup_soft_deleted_entities() except Exception as e: self.report.failure( "While trying to cleanup soft deleted entities ", exc=e ) - if self.config.execution_request_cleanup.enabled: - try: - self.execution_request_cleanup.run() - except Exception as e: - self.report.failure("While trying to cleanup execution request ", exc=e) if self.config.dataprocess_cleanup.enabled: try: + self.report.report_ingestion_stage_start("Data Process Cleanup") yield from self.dataprocess_cleanup.get_workunits_internal() except Exception as e: self.report.failure("While trying to cleanup data process ", exc=e) + if self.config.execution_request_cleanup.enabled: + try: + self.report.report_ingestion_stage_start("Execution request Cleanup") + self.execution_request_cleanup.run() + except Exception as e: + self.report.failure("While trying to cleanup execution request ", exc=e) + # Otherwise last stage's duration does not get calculated. + self.report.report_ingestion_stage_start("End") yield from [] def truncate_indices(self) -> None: @@ -281,6 +292,8 @@ def revoke_expired_tokens(self) -> None: list_access_tokens = expired_tokens_res.get("listAccessTokens", {}) tokens = list_access_tokens.get("tokens", []) total = list_access_tokens.get("total", 0) + if tokens == []: + break for token in tokens: self.report.expired_tokens_revoked += 1 token_id = token["id"] diff --git a/metadata-ingestion/src/datahub/ingestion/source/gc/execution_request_cleanup.py b/metadata-ingestion/src/datahub/ingestion/source/gc/execution_request_cleanup.py index 3baf858e44cdc..170a6ada3e336 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/gc/execution_request_cleanup.py +++ b/metadata-ingestion/src/datahub/ingestion/source/gc/execution_request_cleanup.py @@ -1,3 +1,4 @@ +import datetime import logging import time from typing import Any, Dict, Iterator, Optional @@ -42,16 +43,28 @@ class DatahubExecutionRequestCleanupConfig(ConfigModel): description="Global switch for this cleanup task", ) + runtime_limit_seconds: int = Field( + default=3600, + description="Maximum runtime in seconds for the cleanup task", + ) + + max_read_errors: int = Field( + default=10, + description="Maximum number of read errors before aborting", + ) + def keep_history_max_milliseconds(self): return self.keep_history_max_days * 24 * 3600 * 1000 class DatahubExecutionRequestCleanupReport(SourceReport): - execution_request_cleanup_records_read: int = 0 - execution_request_cleanup_records_preserved: int = 0 - execution_request_cleanup_records_deleted: int = 0 - execution_request_cleanup_read_errors: int = 0 - execution_request_cleanup_delete_errors: int = 0 + ergc_records_read: int = 0 + ergc_records_preserved: int = 0 + ergc_records_deleted: int = 0 + ergc_read_errors: int = 0 + ergc_delete_errors: int = 0 + ergc_start_time: Optional[datetime.datetime] = None + ergc_end_time: Optional[datetime.datetime] = None class CleanupRecord(BaseModel): @@ -124,6 +137,13 @@ def _scroll_execution_requests( params.update(overrides) while True: + if self._reached_runtime_limit(): + break + if self.report.ergc_read_errors >= self.config.max_read_errors: + self.report.failure( + f"ergc({self.instance_id}): too many read errors, aborting." + ) + break try: url = f"{self.graph.config.server}/openapi/v2/entity/{DATAHUB_EXECUTION_REQUEST_ENTITY_NAME}" response = self.graph._session.get(url, headers=headers, params=params) @@ -141,7 +161,7 @@ def _scroll_execution_requests( logger.error( f"ergc({self.instance_id}): failed to fetch next batch of execution requests: {e}" ) - self.report.execution_request_cleanup_read_errors += 1 + self.report.ergc_read_errors += 1 def _scroll_garbage_records(self): state: Dict[str, Dict] = {} @@ -150,7 +170,7 @@ def _scroll_garbage_records(self): running_guard_timeout = now_ms - 30 * 24 * 3600 * 1000 for entry in self._scroll_execution_requests(): - self.report.execution_request_cleanup_records_read += 1 + self.report.ergc_records_read += 1 key = entry.ingestion_source # Always delete corrupted records @@ -171,7 +191,7 @@ def _scroll_garbage_records(self): # Do not delete if number of requests is below minimum if state[key]["count"] < self.config.keep_history_min_count: - self.report.execution_request_cleanup_records_preserved += 1 + self.report.ergc_records_preserved += 1 continue # Do not delete if number of requests do not exceed allowed maximum, @@ -179,7 +199,7 @@ def _scroll_garbage_records(self): if (state[key]["count"] < self.config.keep_history_max_count) and ( entry.requested_at > state[key]["cutoffTimestamp"] ): - self.report.execution_request_cleanup_records_preserved += 1 + self.report.ergc_records_preserved += 1 continue # Do not delete if status is RUNNING or PENDING and created within last month. If the record is >month old and it did not @@ -188,7 +208,7 @@ def _scroll_garbage_records(self): "RUNNING", "PENDING", ]: - self.report.execution_request_cleanup_records_preserved += 1 + self.report.ergc_records_preserved += 1 continue # Otherwise delete current record @@ -200,7 +220,7 @@ def _scroll_garbage_records(self): f"record timestamp: {entry.requested_at}." ) ) - self.report.execution_request_cleanup_records_deleted += 1 + self.report.ergc_records_deleted += 1 yield entry def _delete_entry(self, entry: CleanupRecord) -> None: @@ -210,17 +230,31 @@ def _delete_entry(self, entry: CleanupRecord) -> None: ) self.graph.delete_entity(entry.urn, True) except Exception as e: - self.report.execution_request_cleanup_delete_errors += 1 + self.report.ergc_delete_errors += 1 logger.error( f"ergc({self.instance_id}): failed to delete ExecutionRequest {entry.request_id}: {e}" ) + def _reached_runtime_limit(self) -> bool: + if ( + self.config.runtime_limit_seconds + and self.report.ergc_start_time + and ( + datetime.datetime.now() - self.report.ergc_start_time + >= datetime.timedelta(seconds=self.config.runtime_limit_seconds) + ) + ): + logger.info(f"ergc({self.instance_id}): max runtime reached.") + return True + return False + def run(self) -> None: if not self.config.enabled: logger.info( f"ergc({self.instance_id}): ExecutionRequest cleaner is disabled." ) return + self.report.ergc_start_time = datetime.datetime.now() logger.info( ( @@ -232,8 +266,11 @@ def run(self) -> None: ) for entry in self._scroll_garbage_records(): + if self._reached_runtime_limit(): + break self._delete_entry(entry) + self.report.ergc_end_time = datetime.datetime.now() logger.info( f"ergc({self.instance_id}): Finished cleanup of ExecutionRequest records." ) diff --git a/metadata-ingestion/src/datahub/ingestion/source_report/ingestion_stage.py b/metadata-ingestion/src/datahub/ingestion/source_report/ingestion_stage.py index 42b3b648bd298..ce683e64b3f46 100644 --- a/metadata-ingestion/src/datahub/ingestion/source_report/ingestion_stage.py +++ b/metadata-ingestion/src/datahub/ingestion/source_report/ingestion_stage.py @@ -42,4 +42,5 @@ def report_ingestion_stage_start(self, stage: str) -> None: self._timer = PerfTimer() self.ingestion_stage = f"{stage} at {datetime.now(timezone.utc)}" + logger.info(f"Stage started: {self.ingestion_stage}") self._timer.start() From fe43f076dadc754313864f7a9ca286223ebb0275 Mon Sep 17 00:00:00 2001 From: Chakru <161002324+chakru-r@users.noreply.github.com> Date: Thu, 26 Dec 2024 21:22:16 +0530 Subject: [PATCH 32/39] Parallelize smoke test (#12225) --- .github/workflows/docker-unified.yml | 50 +++++++++++++------ smoke-test/.gitignore | 4 +- smoke-test/build.gradle | 38 ++++++-------- smoke-test/conftest.py | 52 ++++++++++++++++++++ smoke-test/smoke.sh | 25 ++++++---- smoke-test/tests/cypress/integration_test.py | 49 ++++++++++++------ 6 files changed, 155 insertions(+), 63 deletions(-) diff --git a/.github/workflows/docker-unified.yml b/.github/workflows/docker-unified.yml index 03a9b3afc3bc5..47c26068347c0 100644 --- a/.github/workflows/docker-unified.yml +++ b/.github/workflows/docker-unified.yml @@ -1011,18 +1011,39 @@ jobs: needs: setup outputs: matrix: ${{ steps.set-matrix.outputs.matrix }} + cypress_batch_count: ${{ steps.set-batch-count.outputs.cypress_batch_count }} + python_batch_count: ${{ steps.set-batch-count.outputs.python_batch_count }} steps: + - id: set-batch-count + # Tests are split simply to ensure the configured number of batches for parallelization. This may need some + # increase as a new tests added increase the duration where an additional parallel batch helps. + # python_batch_count is used to split pytests in the smoke-test (batches of actual test functions) + # cypress_batch_count is used to split the collection of cypress test specs into batches. + run: | + echo "cypress_batch_count=11" >> "$GITHUB_OUTPUT" + echo "python_batch_count=5" >> "$GITHUB_OUTPUT" + - id: set-matrix + # For m batches for python and n batches for cypress, we need a test matrix of python x m + cypress x n. + # while the github action matrix generation can handle these two parts individually, there isnt a way to use the + # two generated matrices for the same job. So, produce that matrix with scripting and use the include directive + # to add it to the test matrix. run: | - if [ '${{ needs.setup.outputs.frontend_only }}' == 'true' ]; then - echo 'matrix=["cypress_suite1","cypress_rest"]' >> "$GITHUB_OUTPUT" - elif [ '${{ needs.setup.outputs.ingestion_only }}' == 'true' ]; then - echo 'matrix=["no_cypress_suite0","no_cypress_suite1"]' >> "$GITHUB_OUTPUT" - elif [[ '${{ needs.setup.outputs.backend_change }}' == 'true' || '${{ needs.setup.outputs.smoke_test_change }}' == 'true' ]]; then - echo 'matrix=["no_cypress_suite0","no_cypress_suite1","cypress_suite1","cypress_rest"]' >> "$GITHUB_OUTPUT" - else - echo 'matrix=[]' >> "$GITHUB_OUTPUT" + python_batch_count=${{ steps.set-batch-count.outputs.python_batch_count }} + python_matrix=$(printf "{\"test_strategy\":\"pytests\",\"batch\":\"0\",\"batch_count\":\"$python_batch_count\"}"; for ((i=1;i> "$GITHUB_OUTPUT" smoke_test: name: Run Smoke Tests @@ -1043,8 +1064,7 @@ jobs: ] strategy: fail-fast: false - matrix: - test_strategy: ${{ fromJson(needs.smoke_test_matrix.outputs.matrix) }} + matrix: ${{ fromJson(needs.smoke_test_matrix.outputs.matrix) }} if: ${{ always() && !failure() && !cancelled() && needs.smoke_test_matrix.outputs.matrix != '[]' }} steps: - name: Free up disk space @@ -1220,6 +1240,8 @@ jobs: CYPRESS_RECORD_KEY: ${{ secrets.CYPRESS_RECORD_KEY }} CLEANUP_DATA: "false" TEST_STRATEGY: ${{ matrix.test_strategy }} + BATCH_COUNT: ${{ matrix.batch_count }} + BATCH_NUMBER: ${{ matrix.batch }} run: | echo "$DATAHUB_VERSION" ./gradlew --stop @@ -1230,25 +1252,25 @@ jobs: if: failure() run: | docker ps -a - TEST_STRATEGY="-${{ matrix.test_strategy }}" + TEST_STRATEGY="-${{ matrix.test_strategy }}-${{ matrix.batch }}" source .github/scripts/docker_logs.sh - name: Upload logs uses: actions/upload-artifact@v3 if: failure() with: - name: docker-logs-${{ matrix.test_strategy }} + name: docker-logs-${{ matrix.test_strategy }}-${{ matrix.batch }} path: "docker_logs/*.log" retention-days: 5 - name: Upload screenshots uses: actions/upload-artifact@v3 if: failure() with: - name: cypress-snapshots-${{ matrix.test_strategy }} + name: cypress-snapshots-${{ matrix.test_strategy }}-${{ matrix.batch }} path: smoke-test/tests/cypress/cypress/screenshots/ - uses: actions/upload-artifact@v3 if: always() with: - name: Test Results (smoke tests) ${{ matrix.test_strategy }} + name: Test Results (smoke tests) ${{ matrix.test_strategy }} ${{ matrix.batch }} path: | **/build/reports/tests/test/** **/build/test-results/test/** diff --git a/smoke-test/.gitignore b/smoke-test/.gitignore index b8af2eef535a0..d8cfd65ff81b9 100644 --- a/smoke-test/.gitignore +++ b/smoke-test/.gitignore @@ -29,6 +29,8 @@ share/python-wheels/ .installed.cfg *.egg MANIFEST +**/cypress/node_modules + # PyInstaller # Usually these files are written by a python script from a template @@ -132,4 +134,4 @@ dmypy.json # Pyre type checker .pyre/ junit* -tests/cypress/onboarding.json \ No newline at end of file +tests/cypress/onboarding.json diff --git a/smoke-test/build.gradle b/smoke-test/build.gradle index 73ecdcb08ea14..60d08e0206cda 100644 --- a/smoke-test/build.gradle +++ b/smoke-test/build.gradle @@ -91,39 +91,31 @@ task pythonLintFix(type: Exec, dependsOn: installDev) { * The following tasks assume an already running quickstart. * ./gradlew quickstart (or another variation `quickstartDebug`) */ -task noCypressSuite0(type: Exec, dependsOn: [installDev, ':metadata-ingestion:installDev']) { - environment 'RUN_QUICKSTART', 'false' - environment 'TEST_STRATEGY', 'no_cypress_suite0' - - workingDir = project.projectDir - commandLine 'bash', '-c', - "source ${venv_name}/bin/activate && set -x && " + - "./smoke.sh" -} +// ./gradlew :smoke-test:pytest -PbatchNumber=2 (default 0) +task pytest(type: Exec, dependsOn: [installDev, ':metadata-ingestion:installDev']) { + // Get BATCH_NUMBER from command line argument with default value of 0 + def batchNumber = project.hasProperty('batchNumber') ? project.property('batchNumber') : '0' -task noCypressSuite1(type: Exec, dependsOn: [installDev, ':metadata-ingestion:installDev']) { environment 'RUN_QUICKSTART', 'false' - environment 'TEST_STRATEGY', 'no_cypress_suite1' + environment 'TEST_STRATEGY', 'pytests' + environment 'BATCH_COUNT', 5 + environment 'BATCH_NUMBER', batchNumber workingDir = project.projectDir commandLine 'bash', '-c', "source ${venv_name}/bin/activate && set -x && " + - "./smoke.sh" + "./smoke.sh" } -task cypressSuite1(type: Exec, dependsOn: [installDev, ':metadata-ingestion:installDev']) { - environment 'RUN_QUICKSTART', 'false' - environment 'TEST_STRATEGY', 'cypress_suite1' - - workingDir = project.projectDir - commandLine 'bash', '-c', - "source ${venv_name}/bin/activate && set -x && " + - "./smoke.sh" -} +// ./gradlew :smoke-test:cypressTest -PbatchNumber=2 (default 0) +task cypressTest(type: Exec, dependsOn: [installDev, ':metadata-ingestion:installDev']) { + // Get BATCH_NUMBER from command line argument with default value of 0 + def batchNumber = project.hasProperty('batchNumber') ? project.property('batchNumber') : '0' -task cypressRest(type: Exec, dependsOn: [installDev, ':metadata-ingestion:installDev']) { environment 'RUN_QUICKSTART', 'false' - environment 'TEST_STRATEGY', 'cypress_rest' + environment 'TEST_STRATEGY', 'cypress' + environment 'BATCH_COUNT', 11 + environment 'BATCH_NUMBER', batchNumber workingDir = project.projectDir commandLine 'bash', '-c', diff --git a/smoke-test/conftest.py b/smoke-test/conftest.py index 6d148db9886a4..d48a92b22ab48 100644 --- a/smoke-test/conftest.py +++ b/smoke-test/conftest.py @@ -1,6 +1,8 @@ import os import pytest +from typing import List, Tuple +from _pytest.nodes import Item import requests from datahub.ingestion.graph.client import DatahubClientConfig, DataHubGraph @@ -45,3 +47,53 @@ def graph_client(auth_session) -> DataHubGraph: def pytest_sessionfinish(session, exitstatus): """whole test run finishes.""" send_message(exitstatus) + + +def get_batch_start_end(num_tests: int) -> Tuple[int, int]: + batch_count_env = os.getenv("BATCH_COUNT", 1) + batch_count = int(batch_count_env) + + batch_number_env = os.getenv("BATCH_NUMBER", 0) + batch_number = int(batch_number_env) + + if batch_count == 0 or batch_count > num_tests: + raise ValueError( + f"Invalid batch count {batch_count}: must be >0 and <= {num_tests} (num_tests)" + ) + if batch_number >= batch_count: + raise ValueError( + f"Invalid batch number: {batch_number}, must be less than {batch_count} (zer0 based index)" + ) + + batch_size = round(num_tests / batch_count) + + batch_start = batch_size * batch_number + batch_end = batch_start + batch_size + # We must have exactly as many batches as specified by BATCH_COUNT. + if ( + num_tests - batch_end < batch_size + ): # We must have exactly as many batches as specified by BATCH_COUNT, put the remaining in the last batch. + batch_end = num_tests + + if batch_count > 0: + print(f"Running tests for batch {batch_number} of {batch_count}") + + return batch_start, batch_end + + +def pytest_collection_modifyitems( + session: pytest.Session, config: pytest.Config, items: List[Item] +) -> None: + if os.getenv("TEST_STRATEGY") == "cypress": + return # We launch cypress via pytests, but needs a different batching mechanism at cypress level. + + # If BATCH_COUNT and BATCH_ENV vars are set, splits the pytests to batches and runs filters only the BATCH_NUMBER + # batch for execution. Enables multiple parallel launches. Current implementation assumes all test are of equal + # weight for batching. TODO. A weighted batching method can help make batches more equal sized by cost. + # this effectively is a no-op if BATCH_COUNT=1 + start_index, end_index = get_batch_start_end(num_tests=len(items)) + + items.sort(key=lambda x: x.nodeid) # we want the order to be stable across batches + # replace items with the filtered list + print(f"Running tests for batch {start_index}-{end_index}") + items[:] = items[start_index:end_index] diff --git a/smoke-test/smoke.sh b/smoke-test/smoke.sh index 888a60f488e1f..ec8188ebf5f4d 100755 --- a/smoke-test/smoke.sh +++ b/smoke-test/smoke.sh @@ -34,15 +34,20 @@ source ./set-cypress-creds.sh # set environment variables for the test source ./set-test-env-vars.sh -# no_cypress_suite0, no_cypress_suite1, cypress_suite1, cypress_rest -if [[ -z "${TEST_STRATEGY}" ]]; then - pytest -rP --durations=20 -vv --continue-on-collection-errors --junit-xml=junit.smoke.xml +# TEST_STRATEGY: +# if set to pytests, runs all pytests, skips cypress tests(though cypress test launch is via a pytest). +# if set tp cypress, runs all cypress tests +# if blank, runs all. +# When invoked via the github action, BATCH_COUNT and BATCH_NUM env vars are set to run a slice of those tests per +# worker for parallelism. docker-unified.yml generates a test matrix of pytests/cypress in batches. As number of tests +# increase, the batch_count config (in docker-unified.yml) may need adjustment. +if [[ "${TEST_STRATEGY}" == "pytests" ]]; then + #pytests only - github test matrix runs pytests in one of the runners when applicable. + pytest -rP --durations=20 -vv --continue-on-collection-errors --junit-xml=junit.smoke-pytests.xml -k 'not test_run_cypress' +elif [[ "${TEST_STRATEGY}" == "cypress" ]]; then + # run only cypress tests. The test inspects BATCH_COUNT and BATCH_NUMBER and runs only a subset of tests in that batch. + # github workflow test matrix will invoke this in multiple runners for each batch. + pytest -rP --durations=20 -vv --continue-on-collection-errors --junit-xml=junit.smoke-cypress${BATCH_NUMBER}.xml tests/cypress/integration_test.py else - if [ "$TEST_STRATEGY" == "no_cypress_suite0" ]; then - pytest -rP --durations=20 -vv --continue-on-collection-errors --junit-xml=junit.smoke_non_cypress.xml -k 'not test_run_cypress' -m 'not no_cypress_suite1' - elif [ "$TEST_STRATEGY" == "no_cypress_suite1" ]; then - pytest -rP --durations=20 -vv --continue-on-collection-errors --junit-xml=junit.smoke_non_cypress.xml -m 'no_cypress_suite1' - else - pytest -rP --durations=20 -vv --continue-on-collection-errors --junit-xml=junit.smoke_cypress_${TEST_STRATEGY}.xml tests/cypress/integration_test.py - fi + pytest -rP --durations=20 -vv --continue-on-collection-errors --junit-xml=junit.smoke-all.xml fi diff --git a/smoke-test/tests/cypress/integration_test.py b/smoke-test/tests/cypress/integration_test.py index 0d824a96810d0..33c67a923c278 100644 --- a/smoke-test/tests/cypress/integration_test.py +++ b/smoke-test/tests/cypress/integration_test.py @@ -1,10 +1,11 @@ import datetime import os import subprocess -from typing import List, Set +from typing import List import pytest +from conftest import get_batch_start_end from tests.setup.lineage.ingest_time_lineage import ( get_time_lineage_urns, ingest_time_lineage, @@ -169,10 +170,29 @@ def ingest_cleanup_data(auth_session, graph_client): print("deleted onboarding data") -def _get_spec_map(items: Set[str]) -> str: - if len(items) == 0: - return "" - return ",".join([f"**/{item}/*.js" for item in items]) +def _get_js_files(base_path: str): + file_paths = [] + for root, dirs, files in os.walk(base_path): + for file in files: + if file.endswith(".js"): + file_paths.append(os.path.relpath(os.path.join(root, file), base_path)) + return sorted(file_paths) # sort to make the order stable across batch runs + + +def _get_cypress_tests_batch(): + """ + Batching is configured via env vars BATCH_COUNT and BATCH_NUMBER. All cypress tests are split into exactly + BATCH_COUNT batches. When BATCH_NUMBER env var is set (zero based index), that batch alone is run. + Github workflow via test_matrix, runs all batches in parallel to speed up the test elapsed time. + If either of these vars are not set, all tests are run sequentially. + :return: + """ + all_tests = _get_js_files("tests/cypress/cypress/e2e") + + batch_start, batch_end = get_batch_start_end(num_tests=len(all_tests)) + + return all_tests[batch_start:batch_end] + # return test_batches[int(batch_number)] #if BATCH_NUMBER was set, we this test just runs that one batch. def test_run_cypress(auth_session): @@ -182,24 +202,23 @@ def test_run_cypress(auth_session): test_strategy = os.getenv("TEST_STRATEGY", None) if record_key: record_arg = " --record " - tag_arg = f" --tag {test_strategy} " + batch_number = os.getenv("BATCH_NUMBER") + batch_count = os.getenv("BATCH_COUNT") + if batch_number and batch_count: + batch_suffix = f"-{batch_number}{batch_count}" + else: + batch_suffix = "" + tag_arg = f" --tag {test_strategy}{batch_suffix}" else: record_arg = " " rest_specs = set(os.listdir("tests/cypress/cypress/e2e")) cypress_suite1_specs = {"mutations", "search", "views"} rest_specs.difference_update(set(cypress_suite1_specs)) - strategy_spec_map = { - "cypress_suite1": cypress_suite1_specs, - "cypress_rest": rest_specs, - } print(f"test strategy is {test_strategy}") test_spec_arg = "" - if test_strategy is not None: - specs = strategy_spec_map.get(test_strategy) - assert specs is not None - specs_str = _get_spec_map(specs) - test_spec_arg = f" --spec '{specs_str}' " + specs_str = ",".join([f"**/{f}" for f in _get_cypress_tests_batch()]) + test_spec_arg = f" --spec '{specs_str}' " print("Running Cypress tests with command") command = f"NO_COLOR=1 npx cypress run {record_arg} {test_spec_arg} {tag_arg}" From 5708bd9beba6ea08d66a37506867380a45718df3 Mon Sep 17 00:00:00 2001 From: david-leifker <114954101+david-leifker@users.noreply.github.com> Date: Thu, 26 Dec 2024 11:14:15 -0600 Subject: [PATCH 33/39] chore(bump): spring minor version bump 6.1.14 (#12228) --- build.gradle | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build.gradle b/build.gradle index a3d807a733349..8929b4e644972 100644 --- a/build.gradle +++ b/build.gradle @@ -35,7 +35,7 @@ buildscript { ext.pegasusVersion = '29.57.0' ext.mavenVersion = '3.6.3' ext.versionGradle = '8.11.1' - ext.springVersion = '6.1.13' + ext.springVersion = '6.1.14' ext.springBootVersion = '3.2.9' ext.springKafkaVersion = '3.1.6' ext.openTelemetryVersion = '1.18.0' From a920e9bec8ed8f0aa6ecfd482c8659afa4f3e034 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Thu, 26 Dec 2024 15:33:39 -0500 Subject: [PATCH 34/39] fix(ingest/lookml): emit warnings for resolution failures (#12215) --- .../source/looker/looker_dataclasses.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_dataclasses.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_dataclasses.py index 327c9ebf99bd2..d771821a14d88 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_dataclasses.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_dataclasses.py @@ -186,16 +186,16 @@ def resolve_includes( f"traversal_path={traversal_path}, included_files = {included_files}, seen_so_far: {seen_so_far}" ) if "*" not in inc and not included_files: - reporter.report_failure( + reporter.warning( title="Error Resolving Include", - message=f"Cannot resolve include {inc}", - context=f"Path: {path}", + message="Cannot resolve included file", + context=f"Include: {inc}, path: {path}, traversal_path: {traversal_path}", ) elif not included_files: - reporter.report_failure( + reporter.warning( title="Error Resolving Include", - message=f"Did not resolve anything for wildcard include {inc}", - context=f"Path: {path}", + message="Did not find anything matching the wildcard include", + context=f"Include: {inc}, path: {path}, traversal_path: {traversal_path}", ) # only load files that we haven't seen so far included_files = [x for x in included_files if x not in seen_so_far] @@ -231,9 +231,7 @@ def resolve_includes( source_config, reporter, seen_so_far, - traversal_path=traversal_path - + "." - + pathlib.Path(included_file).stem, + traversal_path=f"{traversal_path} -> {pathlib.Path(included_file).stem}", ) ) except Exception as e: From e1998dd371b1002ae8893d7a63d84e5bace079c7 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Thu, 26 Dec 2024 15:34:00 -0500 Subject: [PATCH 35/39] chore(ingest): remove `enable_logging` helper (#12222) --- .../powerbi/rest_api_wrapper/data_resolver.py | 31 ++-- .../powerbi/test_admin_only_api.py | 12 -- .../tests/integration/powerbi/test_powerbi.py | 141 +++++++----------- .../integration/powerbi/test_profiling.py | 10 -- .../tableau/test_tableau_ingest.py | 29 ---- 5 files changed, 70 insertions(+), 153 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py index e1301edef10b8..161975fa635fd 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py @@ -84,13 +84,14 @@ def __init__( tenant_id: str, metadata_api_timeout: int, ): - self.__access_token: Optional[str] = None - self.__access_token_expiry_time: Optional[datetime] = None - self.__tenant_id = tenant_id + self._access_token: Optional[str] = None + self._access_token_expiry_time: Optional[datetime] = None + + self._tenant_id = tenant_id # Test connection by generating access token logger.info(f"Trying to connect to {self._get_authority_url()}") # Power-Bi Auth (Service Principal Auth) - self.__msal_client = msal.ConfidentialClientApplication( + self._msal_client = msal.ConfidentialClientApplication( client_id, client_credential=client_secret, authority=DataResolverBase.AUTHORITY + tenant_id, @@ -168,18 +169,18 @@ def _get_app( pass def _get_authority_url(self): - return f"{DataResolverBase.AUTHORITY}{self.__tenant_id}" + return f"{DataResolverBase.AUTHORITY}{self._tenant_id}" def get_authorization_header(self): return {Constant.Authorization: self.get_access_token()} - def get_access_token(self): - if self.__access_token is not None and not self._is_access_token_expired(): - return self.__access_token + def get_access_token(self) -> str: + if self._access_token is not None and not self._is_access_token_expired(): + return self._access_token logger.info("Generating PowerBi access token") - auth_response = self.__msal_client.acquire_token_for_client( + auth_response = self._msal_client.acquire_token_for_client( scopes=[DataResolverBase.SCOPE] ) @@ -193,24 +194,24 @@ def get_access_token(self): logger.info("Generated PowerBi access token") - self.__access_token = "Bearer {}".format( + self._access_token = "Bearer {}".format( auth_response.get(Constant.ACCESS_TOKEN) ) safety_gap = 300 - self.__access_token_expiry_time = datetime.now() + timedelta( + self._access_token_expiry_time = datetime.now() + timedelta( seconds=( max(auth_response.get(Constant.ACCESS_TOKEN_EXPIRY, 0) - safety_gap, 0) ) ) - logger.debug(f"{Constant.PBIAccessToken}={self.__access_token}") + logger.debug(f"{Constant.PBIAccessToken}={self._access_token}") - return self.__access_token + return self._access_token def _is_access_token_expired(self) -> bool: - if not self.__access_token_expiry_time: + if not self._access_token_expiry_time: return True - return self.__access_token_expiry_time < datetime.now() + return self._access_token_expiry_time < datetime.now() def get_dashboards(self, workspace: Workspace) -> List[Dashboard]: """ diff --git a/metadata-ingestion/tests/integration/powerbi/test_admin_only_api.py b/metadata-ingestion/tests/integration/powerbi/test_admin_only_api.py index b636c12cfda06..00dc79ed38cfb 100644 --- a/metadata-ingestion/tests/integration/powerbi/test_admin_only_api.py +++ b/metadata-ingestion/tests/integration/powerbi/test_admin_only_api.py @@ -1,5 +1,3 @@ -import logging -import sys from typing import Any, Dict from unittest import mock @@ -483,12 +481,6 @@ def register_mock_admin_api(request_mock: Any, override_data: dict = {}) -> None ) -def enable_logging(): - # set logging to console - logging.getLogger().addHandler(logging.StreamHandler(sys.stdout)) - logging.getLogger().setLevel(logging.DEBUG) - - def mock_msal_cca(*args, **kwargs): class MsalClient: def acquire_token_for_client(self, *args, **kwargs): @@ -527,8 +519,6 @@ def default_source_config(): @freeze_time(FROZEN_TIME) @mock.patch("msal.ConfidentialClientApplication", side_effect=mock_msal_cca) def test_admin_only_apis(mock_msal, pytestconfig, tmp_path, mock_time, requests_mock): - enable_logging() - test_resources_dir = pytestconfig.rootpath / "tests/integration/powerbi" register_mock_admin_api(request_mock=requests_mock) @@ -567,8 +557,6 @@ def test_admin_only_apis(mock_msal, pytestconfig, tmp_path, mock_time, requests_ def test_most_config_and_modified_since( mock_msal, pytestconfig, tmp_path, mock_time, requests_mock ): - enable_logging() - test_resources_dir = pytestconfig.rootpath / "tests/integration/powerbi" register_mock_admin_api(request_mock=requests_mock) diff --git a/metadata-ingestion/tests/integration/powerbi/test_powerbi.py b/metadata-ingestion/tests/integration/powerbi/test_powerbi.py index edde11ff87d29..739be7cc8408d 100644 --- a/metadata-ingestion/tests/integration/powerbi/test_powerbi.py +++ b/metadata-ingestion/tests/integration/powerbi/test_powerbi.py @@ -1,8 +1,6 @@ import datetime import json -import logging import re -import sys from pathlib import Path from typing import Any, Dict, List, Optional, Union, cast from unittest import mock @@ -31,29 +29,21 @@ FROZEN_TIME = "2022-02-03 07:00:00" -def enable_logging(): - # set logging to console - logging.getLogger().addHandler(logging.StreamHandler(sys.stdout)) - logging.getLogger().setLevel(logging.DEBUG) - - -class MsalClient: - call_num = 0 - token: Dict[str, Any] = { - "access_token": "dummy", - } - - @staticmethod - def acquire_token_for_client(*args, **kwargs): - MsalClient.call_num += 1 - return MsalClient.token +def mock_msal_cca(*args, **kwargs): + class MsalClient: + def __init__(self): + self.call_num = 0 + self.token: Dict[str, Any] = { + "access_token": "dummy", + } - @staticmethod - def reset(): - MsalClient.call_num = 0 + def acquire_token_for_client(self, *args, **kwargs): + self.call_num += 1 + return self.token + def reset(self): + self.call_num = 0 -def mock_msal_cca(*args, **kwargs): return MsalClient() @@ -154,8 +144,6 @@ def test_powerbi_ingest( mock_time: datetime.datetime, requests_mock: Any, ) -> None: - enable_logging() - test_resources_dir = pytestconfig.rootpath / "tests/integration/powerbi" register_mock_api(pytestconfig=pytestconfig, request_mock=requests_mock) @@ -199,8 +187,6 @@ def test_powerbi_workspace_type_filter( mock_time: datetime.datetime, requests_mock: Any, ) -> None: - enable_logging() - test_resources_dir = pytestconfig.rootpath / "tests/integration/powerbi" register_mock_api( @@ -260,8 +246,6 @@ def test_powerbi_ingest_patch_disabled( mock_time: datetime.datetime, requests_mock: Any, ) -> None: - enable_logging() - test_resources_dir = pytestconfig.rootpath / "tests/integration/powerbi" register_mock_api(pytestconfig=pytestconfig, request_mock=requests_mock) @@ -327,8 +311,6 @@ def test_powerbi_platform_instance_ingest( mock_time: datetime.datetime, requests_mock: Any, ) -> None: - enable_logging() - test_resources_dir = pytestconfig.rootpath / "tests/integration/powerbi" register_mock_api(pytestconfig=pytestconfig, request_mock=requests_mock) @@ -515,8 +497,6 @@ def test_extract_reports( mock_time: datetime.datetime, requests_mock: Any, ) -> None: - enable_logging() - test_resources_dir = pytestconfig.rootpath / "tests/integration/powerbi" register_mock_api(pytestconfig=pytestconfig, request_mock=requests_mock) @@ -561,8 +541,6 @@ def test_extract_lineage( mock_time: datetime.datetime, requests_mock: Any, ) -> None: - enable_logging() - test_resources_dir = pytestconfig.rootpath / "tests/integration/powerbi" register_mock_api(pytestconfig=pytestconfig, request_mock=requests_mock) @@ -660,8 +638,6 @@ def test_admin_access_is_not_allowed( mock_time: datetime.datetime, requests_mock: Any, ) -> None: - enable_logging() - test_resources_dir = pytestconfig.rootpath / "tests/integration/powerbi" register_mock_api( @@ -723,8 +699,6 @@ def test_workspace_container( mock_time: datetime.datetime, requests_mock: Any, ) -> None: - enable_logging() - test_resources_dir = pytestconfig.rootpath / "tests/integration/powerbi" register_mock_api(pytestconfig=pytestconfig, request_mock=requests_mock) @@ -764,85 +738,84 @@ def test_workspace_container( ) -@mock.patch("msal.ConfidentialClientApplication", side_effect=mock_msal_cca) def test_access_token_expiry_with_long_expiry( - mock_msal: MagicMock, pytestconfig: pytest.Config, tmp_path: str, mock_time: datetime.datetime, requests_mock: Any, ) -> None: - enable_logging() - register_mock_api(pytestconfig=pytestconfig, request_mock=requests_mock) - pipeline = Pipeline.create( - { - "run_id": "powerbi-test", - "source": { - "type": "powerbi", - "config": { - **default_source_config(), + mock_msal = mock_msal_cca() + + with mock.patch("msal.ConfidentialClientApplication", return_value=mock_msal): + pipeline = Pipeline.create( + { + "run_id": "powerbi-test", + "source": { + "type": "powerbi", + "config": { + **default_source_config(), + }, }, - }, - "sink": { - "type": "file", - "config": { - "filename": f"{tmp_path}/powerbi_access_token_mces.json", + "sink": { + "type": "file", + "config": { + "filename": f"{tmp_path}/powerbi_access_token_mces.json", + }, }, - }, - } - ) + } + ) # for long expiry, the token should only be requested once. - MsalClient.token = { + mock_msal.token = { "access_token": "dummy2", "expires_in": 3600, } + mock_msal.reset() - MsalClient.reset() pipeline.run() # We expect the token to be requested twice (once for AdminApiResolver and one for RegularApiResolver) - assert MsalClient.call_num == 2 + assert mock_msal.call_num == 2 -@mock.patch("msal.ConfidentialClientApplication", side_effect=mock_msal_cca) def test_access_token_expiry_with_short_expiry( - mock_msal: MagicMock, pytestconfig: pytest.Config, tmp_path: str, mock_time: datetime.datetime, requests_mock: Any, ) -> None: - enable_logging() - register_mock_api(pytestconfig=pytestconfig, request_mock=requests_mock) - pipeline = Pipeline.create( - { - "run_id": "powerbi-test", - "source": { - "type": "powerbi", - "config": { - **default_source_config(), + mock_msal = mock_msal_cca() + with mock.patch("msal.ConfidentialClientApplication", return_value=mock_msal): + pipeline = Pipeline.create( + { + "run_id": "powerbi-test", + "source": { + "type": "powerbi", + "config": { + **default_source_config(), + }, }, - }, - "sink": { - "type": "file", - "config": { - "filename": f"{tmp_path}/powerbi_access_token_mces.json", + "sink": { + "type": "file", + "config": { + "filename": f"{tmp_path}/powerbi_access_token_mces.json", + }, }, - }, - } - ) + } + ) # for short expiry, the token should be requested when expires. - MsalClient.token = { + mock_msal.token = { "access_token": "dummy", "expires_in": 0, } + mock_msal.reset() + pipeline.run() - assert MsalClient.call_num > 2 + assert mock_msal.call_num > 2 def dataset_type_mapping_set_to_all_platform(pipeline: Pipeline) -> None: @@ -940,8 +913,6 @@ def test_dataset_type_mapping_error( def test_server_to_platform_map( mock_msal, pytestconfig, tmp_path, mock_time, requests_mock ): - enable_logging() - test_resources_dir = pytestconfig.rootpath / "tests/integration/powerbi" new_config: dict = { **default_source_config(), @@ -1416,8 +1387,6 @@ def test_powerbi_cross_workspace_reference_info_message( mock_time: datetime.datetime, requests_mock: Any, ) -> None: - enable_logging() - register_mock_api( pytestconfig=pytestconfig, request_mock=requests_mock, @@ -1495,8 +1464,6 @@ def common_app_ingest( output_mcp_path: str, override_config: dict = {}, ) -> Pipeline: - enable_logging() - register_mock_api( pytestconfig=pytestconfig, request_mock=requests_mock, diff --git a/metadata-ingestion/tests/integration/powerbi/test_profiling.py b/metadata-ingestion/tests/integration/powerbi/test_profiling.py index 4b48bed003b1e..78d35cf31a26d 100644 --- a/metadata-ingestion/tests/integration/powerbi/test_profiling.py +++ b/metadata-ingestion/tests/integration/powerbi/test_profiling.py @@ -1,5 +1,3 @@ -import logging -import sys from typing import Any, Dict from unittest import mock @@ -271,12 +269,6 @@ def register_mock_admin_api(request_mock: Any, override_data: dict = {}) -> None ) -def enable_logging(): - # set logging to console - logging.getLogger().addHandler(logging.StreamHandler(sys.stdout)) - logging.getLogger().setLevel(logging.DEBUG) - - def mock_msal_cca(*args, **kwargs): class MsalClient: def acquire_token_for_client(self, *args, **kwargs): @@ -311,8 +303,6 @@ def default_source_config(): @freeze_time(FROZEN_TIME) @mock.patch("msal.ConfidentialClientApplication", side_effect=mock_msal_cca) def test_profiling(mock_msal, pytestconfig, tmp_path, mock_time, requests_mock): - enable_logging() - test_resources_dir = pytestconfig.rootpath / "tests/integration/powerbi" register_mock_admin_api(request_mock=requests_mock) diff --git a/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py b/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py index 902ff243c802a..71e5ad10c2fc5 100644 --- a/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py +++ b/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py @@ -1,7 +1,5 @@ import json -import logging import pathlib -import sys from typing import Any, Dict, List, cast from unittest import mock @@ -88,12 +86,6 @@ } -def enable_logging(): - # set logging to console - logging.getLogger().addHandler(logging.StreamHandler(sys.stdout)) - logging.getLogger().setLevel(logging.DEBUG) - - def read_response(file_name): response_json_path = f"{test_resources_dir}/setup/{file_name}" with open(response_json_path) as file: @@ -376,7 +368,6 @@ def tableau_ingest_common( @freeze_time(FROZEN_TIME) @pytest.mark.integration def test_tableau_ingest(pytestconfig, tmp_path, mock_datahub_graph): - enable_logging() output_file_name: str = "tableau_mces.json" golden_file_name: str = "tableau_mces_golden.json" tableau_ingest_common( @@ -454,7 +445,6 @@ def mock_data() -> List[dict]: @freeze_time(FROZEN_TIME) @pytest.mark.integration def test_tableau_cll_ingest(pytestconfig, tmp_path, mock_datahub_graph): - enable_logging() output_file_name: str = "tableau_mces_cll.json" golden_file_name: str = "tableau_cll_mces_golden.json" @@ -481,7 +471,6 @@ def test_tableau_cll_ingest(pytestconfig, tmp_path, mock_datahub_graph): @freeze_time(FROZEN_TIME) @pytest.mark.integration def test_project_pattern(pytestconfig, tmp_path, mock_datahub_graph): - enable_logging() output_file_name: str = "tableau_project_pattern_mces.json" golden_file_name: str = "tableau_mces_golden.json" @@ -505,7 +494,6 @@ def test_project_pattern(pytestconfig, tmp_path, mock_datahub_graph): @freeze_time(FROZEN_TIME) @pytest.mark.integration def test_project_path_pattern(pytestconfig, tmp_path, mock_datahub_graph): - enable_logging() output_file_name: str = "tableau_project_path_mces.json" golden_file_name: str = "tableau_project_path_mces_golden.json" @@ -529,8 +517,6 @@ def test_project_path_pattern(pytestconfig, tmp_path, mock_datahub_graph): @freeze_time(FROZEN_TIME) @pytest.mark.integration def test_project_hierarchy(pytestconfig, tmp_path, mock_datahub_graph): - enable_logging() - output_file_name: str = "tableau_nested_project_mces.json" golden_file_name: str = "tableau_nested_project_mces_golden.json" @@ -554,7 +540,6 @@ def test_project_hierarchy(pytestconfig, tmp_path, mock_datahub_graph): @freeze_time(FROZEN_TIME) @pytest.mark.integration def test_extract_all_project(pytestconfig, tmp_path, mock_datahub_graph): - enable_logging() output_file_name: str = "tableau_extract_all_project_mces.json" golden_file_name: str = "tableau_extract_all_project_mces_golden.json" @@ -644,7 +629,6 @@ def test_project_path_pattern_deny(pytestconfig, tmp_path, mock_datahub_graph): def test_tableau_ingest_with_platform_instance( pytestconfig, tmp_path, mock_datahub_graph ): - enable_logging() output_file_name: str = "tableau_with_platform_instance_mces.json" golden_file_name: str = "tableau_with_platform_instance_mces_golden.json" @@ -691,7 +675,6 @@ def test_tableau_ingest_with_platform_instance( def test_lineage_overrides(): - enable_logging() # Simple - specify platform instance to presto table assert ( TableauUpstreamReference( @@ -745,7 +728,6 @@ def test_lineage_overrides(): def test_database_hostname_to_platform_instance_map(): - enable_logging() # Simple - snowflake table assert ( TableauUpstreamReference( @@ -916,7 +898,6 @@ def test_tableau_stateful(pytestconfig, tmp_path, mock_time, mock_datahub_graph) def test_tableau_no_verify(): - enable_logging() # This test ensures that we can connect to a self-signed certificate # when ssl_verify is set to False. @@ -941,7 +922,6 @@ def test_tableau_no_verify(): @freeze_time(FROZEN_TIME) @pytest.mark.integration_batch_2 def test_tableau_signout_timeout(pytestconfig, tmp_path, mock_datahub_graph): - enable_logging() output_file_name: str = "tableau_signout_timeout_mces.json" golden_file_name: str = "tableau_signout_timeout_mces_golden.json" tableau_ingest_common( @@ -1073,7 +1053,6 @@ def test_get_all_datasources_failure(pytestconfig, tmp_path, mock_datahub_graph) @freeze_time(FROZEN_TIME) @pytest.mark.integration def test_tableau_ingest_multiple_sites(pytestconfig, tmp_path, mock_datahub_graph): - enable_logging() output_file_name: str = "tableau_mces_multiple_sites.json" golden_file_name: str = "tableau_multiple_sites_mces_golden.json" @@ -1135,7 +1114,6 @@ def test_tableau_ingest_multiple_sites(pytestconfig, tmp_path, mock_datahub_grap @freeze_time(FROZEN_TIME) @pytest.mark.integration def test_tableau_ingest_sites_as_container(pytestconfig, tmp_path, mock_datahub_graph): - enable_logging() output_file_name: str = "tableau_mces_ingest_sites_as_container.json" golden_file_name: str = "tableau_sites_as_container_mces_golden.json" @@ -1159,7 +1137,6 @@ def test_tableau_ingest_sites_as_container(pytestconfig, tmp_path, mock_datahub_ @freeze_time(FROZEN_TIME) @pytest.mark.integration def test_site_name_pattern(pytestconfig, tmp_path, mock_datahub_graph): - enable_logging() output_file_name: str = "tableau_site_name_pattern_mces.json" golden_file_name: str = "tableau_site_name_pattern_mces_golden.json" @@ -1183,7 +1160,6 @@ def test_site_name_pattern(pytestconfig, tmp_path, mock_datahub_graph): @freeze_time(FROZEN_TIME) @pytest.mark.integration def test_permission_ingestion(pytestconfig, tmp_path, mock_datahub_graph): - enable_logging() output_file_name: str = "tableau_permission_ingestion_mces.json" golden_file_name: str = "tableau_permission_ingestion_mces_golden.json" @@ -1209,7 +1185,6 @@ def test_permission_ingestion(pytestconfig, tmp_path, mock_datahub_graph): @freeze_time(FROZEN_TIME) @pytest.mark.integration def test_no_hidden_assets(pytestconfig, tmp_path, mock_datahub_graph): - enable_logging() output_file_name: str = "tableau_no_hidden_assets_mces.json" golden_file_name: str = "tableau_no_hidden_assets_mces_golden.json" @@ -1232,7 +1207,6 @@ def test_no_hidden_assets(pytestconfig, tmp_path, mock_datahub_graph): @freeze_time(FROZEN_TIME) @pytest.mark.integration def test_ingest_tags_disabled(pytestconfig, tmp_path, mock_datahub_graph): - enable_logging() output_file_name: str = "tableau_ingest_tags_disabled_mces.json" golden_file_name: str = "tableau_ingest_tags_disabled_mces_golden.json" @@ -1254,7 +1228,6 @@ def test_ingest_tags_disabled(pytestconfig, tmp_path, mock_datahub_graph): @freeze_time(FROZEN_TIME) @pytest.mark.integration def test_hidden_asset_tags(pytestconfig, tmp_path, mock_datahub_graph): - enable_logging() output_file_name: str = "tableau_hidden_asset_tags_mces.json" golden_file_name: str = "tableau_hidden_asset_tags_mces_golden.json" @@ -1277,8 +1250,6 @@ def test_hidden_asset_tags(pytestconfig, tmp_path, mock_datahub_graph): @freeze_time(FROZEN_TIME) @pytest.mark.integration def test_hidden_assets_without_ingest_tags(pytestconfig, tmp_path, mock_datahub_graph): - enable_logging() - new_config = config_source_default.copy() new_config["tags_for_hidden_assets"] = ["hidden", "private"] new_config["ingest_tags"] = False From 172736a9b3d291d6cb8fcaf775114abdf8853e1f Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Thu, 26 Dec 2024 21:34:31 -0500 Subject: [PATCH 36/39] feat(ingest/dbt): support "Explore" page in dbt cloud (#12223) --- docs/how/updating-datahub.md | 1 + .../src/datahub/ingestion/source/dbt/dbt_cloud.py | 13 ++++++++++--- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/docs/how/updating-datahub.md b/docs/how/updating-datahub.md index a742ebe0cd896..d6620fde0bf79 100644 --- a/docs/how/updating-datahub.md +++ b/docs/how/updating-datahub.md @@ -42,6 +42,7 @@ This file documents any backwards-incompatible changes in DataHub and assists pe - #12077: `Kafka` source no longer ingests schemas from schema registry as separate entities by default, set `ingest_schemas_as_entities` to `true` to ingest them - OpenAPI Update: PIT Keep Alive parameter added to scroll. NOTE: This parameter requires the `pointInTimeCreationEnabled` feature flag to be enabled and the `elasticSearch.implementation` configuration to be `elasticsearch`. This feature is not supported for OpenSearch at this time and the parameter will not be respected without both of these set. - OpenAPI Update 2: Previously there was an incorrectly marked parameter named `sort` on the generic list entities endpoint for v3. This parameter is deprecated and only supports a single string value while the documentation indicates it supports a list of strings. This documentation error has been fixed and the correct field, `sortCriteria`, is now documented which supports a list of strings. +- #12223: For dbt Cloud ingestion, the "View in dbt" link will point at the "Explore" page in the dbt Cloud UI. You can revert to the old behavior of linking to the dbt Cloud IDE by setting `external_url_mode: ide". ### Breaking Changes diff --git a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_cloud.py b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_cloud.py index 66c5ef7179af4..5042f6d69b261 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_cloud.py +++ b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_cloud.py @@ -1,7 +1,7 @@ import logging from datetime import datetime from json import JSONDecodeError -from typing import Dict, List, Optional, Tuple +from typing import Dict, List, Literal, Optional, Tuple from urllib.parse import urlparse import dateutil.parser @@ -62,6 +62,11 @@ class DBTCloudConfig(DBTCommonConfig): description="The ID of the run to ingest metadata from. If not specified, we'll default to the latest run.", ) + external_url_mode: Literal["explore", "ide"] = Field( + default="explore", + description='Where should the "View in dbt" link point to - either the "Explore" UI or the dbt Cloud IDE', + ) + @root_validator(pre=True) def set_metadata_endpoint(cls, values: dict) -> dict: if values.get("access_url") and not values.get("metadata_endpoint"): @@ -527,5 +532,7 @@ def _parse_into_dbt_column( ) def get_external_url(self, node: DBTNode) -> Optional[str]: - # TODO: Once dbt Cloud supports deep linking to specific files, we can use that. - return f"{self.config.access_url}/develop/{self.config.account_id}/projects/{self.config.project_id}" + if self.config.external_url_mode == "explore": + return f"{self.config.access_url}/explore/{self.config.account_id}/projects/{self.config.project_id}/environments/production/details/{node.dbt_name}" + else: + return f"{self.config.access_url}/develop/{self.config.account_id}/projects/{self.config.project_id}" From 3ca8d09100eb649a5f191a32f2af8d300424818b Mon Sep 17 00:00:00 2001 From: Mayuri Nehate <33225191+mayurinehate@users.noreply.github.com> Date: Fri, 27 Dec 2024 11:40:00 +0530 Subject: [PATCH 37/39] feat(ingest/snowflake): support email_as_user_identifier for queries v2 (#12219) --- .../source/snowflake/snowflake_config.py | 19 ++++--- .../source/snowflake/snowflake_queries.py | 45 ++++++++++++--- .../source/snowflake/snowflake_query.py | 6 +- .../source/snowflake/snowflake_usage_v2.py | 7 +-- .../source/snowflake/snowflake_utils.py | 40 ++++++++------ .../snowflake/test_snowflake_queries.py | 55 +++++++++++++++++++ 6 files changed, 132 insertions(+), 40 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py index 2b2dcf860cdb0..12e5fb72b00de 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py @@ -138,12 +138,20 @@ class SnowflakeIdentifierConfig( description="Whether to convert dataset urns to lowercase.", ) - -class SnowflakeUsageConfig(BaseUsageConfig): email_domain: Optional[str] = pydantic.Field( default=None, description="Email domain of your organization so users can be displayed on UI appropriately.", ) + + email_as_user_identifier: bool = Field( + default=True, + description="Format user urns as an email, if the snowflake user's email is set. If `email_domain` is " + "provided, generates email addresses for snowflake users with unset emails, based on their " + "username.", + ) + + +class SnowflakeUsageConfig(BaseUsageConfig): apply_view_usage_to_tables: bool = pydantic.Field( default=False, description="Whether to apply view's usage to its base tables. If set to True, usage is applied to base tables only.", @@ -267,13 +275,6 @@ class SnowflakeV2Config( " Map of share name -> details of share.", ) - email_as_user_identifier: bool = Field( - default=True, - description="Format user urns as an email, if the snowflake user's email is set. If `email_domain` is " - "provided, generates email addresses for snowflake users with unset emails, based on their " - "username.", - ) - include_assertion_results: bool = Field( default=False, description="Whether to ingest assertion run results for assertions created using Datahub" diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py index 174aad0bddd4a..36825dc33fe7d 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py @@ -66,6 +66,11 @@ logger = logging.getLogger(__name__) +# Define a type alias +UserName = str +UserEmail = str +UsersMapping = Dict[UserName, UserEmail] + class SnowflakeQueriesExtractorConfig(ConfigModel): # TODO: Support stateful ingestion for the time windows. @@ -114,11 +119,13 @@ class SnowflakeQueriesSourceConfig( class SnowflakeQueriesExtractorReport(Report): copy_history_fetch_timer: PerfTimer = dataclasses.field(default_factory=PerfTimer) query_log_fetch_timer: PerfTimer = dataclasses.field(default_factory=PerfTimer) + users_fetch_timer: PerfTimer = dataclasses.field(default_factory=PerfTimer) audit_log_load_timer: PerfTimer = dataclasses.field(default_factory=PerfTimer) sql_aggregator: Optional[SqlAggregatorReport] = None num_ddl_queries_dropped: int = 0 + num_users: int = 0 @dataclass @@ -225,6 +232,9 @@ def is_allowed_table(self, name: str) -> bool: def get_workunits_internal( self, ) -> Iterable[MetadataWorkUnit]: + with self.report.users_fetch_timer: + users = self.fetch_users() + # TODO: Add some logic to check if the cached audit log is stale or not. audit_log_file = self.local_temp_path / "audit_log.sqlite" use_cached_audit_log = audit_log_file.exists() @@ -248,7 +258,7 @@ def get_workunits_internal( queries.append(entry) with self.report.query_log_fetch_timer: - for entry in self.fetch_query_log(): + for entry in self.fetch_query_log(users): queries.append(entry) with self.report.audit_log_load_timer: @@ -263,6 +273,25 @@ def get_workunits_internal( shared_connection.close() audit_log_file.unlink(missing_ok=True) + def fetch_users(self) -> UsersMapping: + users: UsersMapping = dict() + with self.structured_reporter.report_exc("Error fetching users from Snowflake"): + logger.info("Fetching users from Snowflake") + query = SnowflakeQuery.get_all_users() + resp = self.connection.query(query) + + for row in resp: + try: + users[row["NAME"]] = row["EMAIL"] + self.report.num_users += 1 + except Exception as e: + self.structured_reporter.warning( + "Error parsing user row", + context=f"{row}", + exc=e, + ) + return users + def fetch_copy_history(self) -> Iterable[KnownLineageMapping]: # Derived from _populate_external_lineage_from_copy_history. @@ -298,7 +327,7 @@ def fetch_copy_history(self) -> Iterable[KnownLineageMapping]: yield result def fetch_query_log( - self, + self, users: UsersMapping ) -> Iterable[Union[PreparsedQuery, TableRename, TableSwap]]: query_log_query = _build_enriched_query_log_query( start_time=self.config.window.start_time, @@ -319,7 +348,7 @@ def fetch_query_log( assert isinstance(row, dict) try: - entry = self._parse_audit_log_row(row) + entry = self._parse_audit_log_row(row, users) except Exception as e: self.structured_reporter.warning( "Error parsing query log row", @@ -331,7 +360,7 @@ def fetch_query_log( yield entry def _parse_audit_log_row( - self, row: Dict[str, Any] + self, row: Dict[str, Any], users: UsersMapping ) -> Optional[Union[TableRename, TableSwap, PreparsedQuery]]: json_fields = { "DIRECT_OBJECTS_ACCESSED", @@ -430,9 +459,11 @@ def _parse_audit_log_row( ) ) - # TODO: Fetch email addresses from Snowflake to map user -> email - # TODO: Support email_domain fallback for generating user urns. - user = CorpUserUrn(self.identifiers.snowflake_identifier(res["user_name"])) + user = CorpUserUrn( + self.identifiers.get_user_identifier( + res["user_name"], users.get(res["user_name"]) + ) + ) timestamp: datetime = res["query_start_time"] timestamp = timestamp.astimezone(timezone.utc) diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_query.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_query.py index a94b39476b2c2..40bcfb514efd2 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_query.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_query.py @@ -947,4 +947,8 @@ def dmf_assertion_results(start_time_millis: int, end_time_millis: int) -> str: AND METRIC_NAME ilike '{pattern}' escape '{escape_pattern}' ORDER BY MEASUREMENT_TIME ASC; -""" + """ + + @staticmethod + def get_all_users() -> str: + return """SELECT name as "NAME", email as "EMAIL" FROM SNOWFLAKE.ACCOUNT_USAGE.USERS""" diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_usage_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_usage_v2.py index aff15386c5083..4bdf559f293b5 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_usage_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_usage_v2.py @@ -342,10 +342,9 @@ def _map_user_counts( filtered_user_counts.append( DatasetUserUsageCounts( user=make_user_urn( - self.get_user_identifier( + self.identifiers.get_user_identifier( user_count["user_name"], user_email, - self.config.email_as_user_identifier, ) ), count=user_count["total"], @@ -453,9 +452,7 @@ def _get_operation_aspect_work_unit( reported_time: int = int(time.time() * 1000) last_updated_timestamp: int = int(start_time.timestamp() * 1000) user_urn = make_user_urn( - self.get_user_identifier( - user_name, user_email, self.config.email_as_user_identifier - ) + self.identifiers.get_user_identifier(user_name, user_email) ) # NOTE: In earlier `snowflake-usage` connector this was base_objects_accessed, which is incorrect diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_utils.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_utils.py index 8e0c97aa135e8..885bee1ccdb90 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_utils.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_utils.py @@ -300,6 +300,28 @@ def get_quoted_identifier_for_schema(db_name, schema_name): def get_quoted_identifier_for_table(db_name, schema_name, table_name): return f'"{db_name}"."{schema_name}"."{table_name}"' + # Note - decide how to construct user urns. + # Historically urns were created using part before @ from user's email. + # Users without email were skipped from both user entries as well as aggregates. + # However email is not mandatory field in snowflake user, user_name is always present. + def get_user_identifier( + self, + user_name: str, + user_email: Optional[str], + ) -> str: + if user_email: + return self.snowflake_identifier( + user_email + if self.identifier_config.email_as_user_identifier is True + else user_email.split("@")[0] + ) + return self.snowflake_identifier( + f"{user_name}@{self.identifier_config.email_domain}" + if self.identifier_config.email_as_user_identifier is True + and self.identifier_config.email_domain is not None + else user_name + ) + class SnowflakeCommonMixin(SnowflakeStructuredReportMixin): platform = "snowflake" @@ -315,24 +337,6 @@ def structured_reporter(self) -> SourceReport: def identifiers(self) -> SnowflakeIdentifierBuilder: return SnowflakeIdentifierBuilder(self.config, self.report) - # Note - decide how to construct user urns. - # Historically urns were created using part before @ from user's email. - # Users without email were skipped from both user entries as well as aggregates. - # However email is not mandatory field in snowflake user, user_name is always present. - def get_user_identifier( - self, - user_name: str, - user_email: Optional[str], - email_as_user_identifier: bool, - ) -> str: - if user_email: - return self.identifiers.snowflake_identifier( - user_email - if email_as_user_identifier is True - else user_email.split("@")[0] - ) - return self.identifiers.snowflake_identifier(user_name) - # TODO: Revisit this after stateful ingestion can commit checkpoint # for failures that do not affect the checkpoint # TODO: Add additional parameters to match the signature of the .warning and .failure methods diff --git a/metadata-ingestion/tests/integration/snowflake/test_snowflake_queries.py b/metadata-ingestion/tests/integration/snowflake/test_snowflake_queries.py index 82f5691bcee3d..ae0f23d93215d 100644 --- a/metadata-ingestion/tests/integration/snowflake/test_snowflake_queries.py +++ b/metadata-ingestion/tests/integration/snowflake/test_snowflake_queries.py @@ -22,3 +22,58 @@ def test_source_close_cleans_tmp(snowflake_connect, tmp_path): # This closes QueriesExtractor which in turn closes SqlParsingAggregator source.close() assert len(os.listdir(tmp_path)) == 0 + + +@patch("snowflake.connector.connect") +def test_user_identifiers_email_as_identifier(snowflake_connect, tmp_path): + source = SnowflakeQueriesSource.create( + { + "connection": { + "account_id": "ABC12345.ap-south-1.aws", + "username": "TST_USR", + "password": "TST_PWD", + }, + "email_as_user_identifier": True, + "email_domain": "example.com", + }, + PipelineContext("run-id"), + ) + assert ( + source.identifiers.get_user_identifier("username", "username@example.com") + == "username@example.com" + ) + assert ( + source.identifiers.get_user_identifier("username", None) + == "username@example.com" + ) + + # We'd do best effort to use email as identifier, but would keep username as is, + # if email can't be formed. + source.identifiers.identifier_config.email_domain = None + + assert ( + source.identifiers.get_user_identifier("username", "username@example.com") + == "username@example.com" + ) + + assert source.identifiers.get_user_identifier("username", None) == "username" + + +@patch("snowflake.connector.connect") +def test_user_identifiers_username_as_identifier(snowflake_connect, tmp_path): + source = SnowflakeQueriesSource.create( + { + "connection": { + "account_id": "ABC12345.ap-south-1.aws", + "username": "TST_USR", + "password": "TST_PWD", + }, + "email_as_user_identifier": False, + }, + PipelineContext("run-id"), + ) + assert ( + source.identifiers.get_user_identifier("username", "username@example.com") + == "username" + ) + assert source.identifiers.get_user_identifier("username", None) == "username" From 29e4528ae5117ccdb6f0685b8571c2afdcc19f57 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergio=20G=C3=B3mez=20Villamor?= Date: Fri, 27 Dec 2024 11:12:40 +0100 Subject: [PATCH 38/39] fix(tableau): retry if 502 error code (#12233) --- .../datahub/ingestion/source/tableau/tableau.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py index 508500ffe489b..df59cae3fad23 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py +++ b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py @@ -186,6 +186,15 @@ except ImportError: REAUTHENTICATE_ERRORS = (NonXMLResponseError,) +RETRIABLE_ERROR_CODES = [ + 408, # Request Timeout + 429, # Too Many Requests + 500, # Internal Server Error + 502, # Bad Gateway + 503, # Service Unavailable + 504, # Gateway Timeout +] + logger: logging.Logger = logging.getLogger(__name__) # Replace / with | @@ -287,7 +296,7 @@ def make_tableau_client(self, site: str) -> Server: max_retries=Retry( total=self.max_retries, backoff_factor=1, - status_forcelist=[429, 500, 502, 503, 504], + status_forcelist=RETRIABLE_ERROR_CODES, ) ) server._session.mount("http://", adapter) @@ -1212,9 +1221,11 @@ def get_connection_object_page( except InternalServerError as ise: # In some cases Tableau Server returns 504 error, which is a timeout error, so it worths to retry. - if ise.code == 504: + # Extended with other retryable errors. + if ise.code in RETRIABLE_ERROR_CODES: if retries_remaining <= 0: raise ise + logger.info(f"Retrying query due to error {ise.code}") return self.get_connection_object_page( query=query, connection_type=connection_type, From d7de7eb2a65385b0a4458f9c26cc8b1a42158cc1 Mon Sep 17 00:00:00 2001 From: Aseem Bansal Date: Fri, 27 Dec 2024 17:51:43 +0530 Subject: [PATCH 39/39] ci: remove qodana (#12227) --- .github/workflows/qodana-scan.yml | 23 ----------------------- 1 file changed, 23 deletions(-) delete mode 100644 .github/workflows/qodana-scan.yml diff --git a/.github/workflows/qodana-scan.yml b/.github/workflows/qodana-scan.yml deleted file mode 100644 index 750cf24ad38e5..0000000000000 --- a/.github/workflows/qodana-scan.yml +++ /dev/null @@ -1,23 +0,0 @@ -name: Qodana -on: - workflow_dispatch: - pull_request: - push: - branches: - - master - -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: true - -jobs: - qodana: - runs-on: ubuntu-latest - steps: - - uses: acryldata/sane-checkout-action@v3 - - name: "Qodana Scan" - uses: JetBrains/qodana-action@v2022.3.4 - - uses: github/codeql-action/upload-sarif@v2 - with: - sarif_file: ${{ runner.temp }}/qodana/results/qodana.sarif.json - cache-default-branch-only: true