From 75f63a2311b9959e4c0fbec54f30d802cba707e2 Mon Sep 17 00:00:00 2001 From: Mayuri N Date: Wed, 18 Oct 2023 22:38:28 +0530 Subject: [PATCH 1/7] feat(ingest): support view lineage for all sqlalchemy sources Additional Changes: 1. Support incremental lineage for all sqlalchemy sources 2. Keep column level lineage enabled and incremental lineage disabled by default 3. Monkey-patch hive dialect to extract hive view definitions to extract lineage 4. Fix incremental_lineage_helper for empty upstreams Pending Followup Changes: 1. Support postgres-like partial view definitions --- .../src/datahub/configuration/common.py | 2 +- .../datahub/emitter/sql_parsing_builder.py | 5 +- .../api/incremental_lineage_helper.py | 13 +- .../src/datahub/ingestion/api/source.py | 24 + .../source/snowflake/snowflake_lineage_v2.py | 13 +- .../source/snowflake/snowflake_v2.py | 7 - .../src/datahub/ingestion/source/sql/hive.py | 86 ++- .../ingestion/source/sql/sql_common.py | 102 ++- .../ingestion/source/sql/sql_config.py | 19 +- .../datahub/ingestion/source/sql/teradata.py | 54 +- .../source/sql/two_tier_sql_source.py | 6 +- .../datahub/ingestion/source/sql/vertica.py | 2 +- .../source/state/stateful_ingestion_base.py | 3 +- .../src/datahub/utilities/sqlglot_lineage.py | 3 + .../hive/hive_mces_all_db_golden.json | 581 +++++++++++++++--- .../integration/hive/hive_mces_golden.json | 530 ++++++++++++++-- .../tests/integration/hive/hive_setup.sql | 22 +- .../mysql/mysql_mces_no_db_golden.json | 272 ++++++-- .../test_incremental_lineage_helper.py | 21 + 19 files changed, 1473 insertions(+), 292 deletions(-) diff --git a/metadata-ingestion/src/datahub/configuration/common.py b/metadata-ingestion/src/datahub/configuration/common.py index c909b89eb0c2dd..73ac4baac48c0f 100644 --- a/metadata-ingestion/src/datahub/configuration/common.py +++ b/metadata-ingestion/src/datahub/configuration/common.py @@ -283,7 +283,7 @@ class VersionedConfig(ConfigModel): class LineageConfig(ConfigModel): 
incremental_lineage: bool = Field( - default=True, + default=False, description="When enabled, emits lineage as incremental to existing lineage already in DataHub. When disabled, re-states lineage on each run.", ) diff --git a/metadata-ingestion/src/datahub/emitter/sql_parsing_builder.py b/metadata-ingestion/src/datahub/emitter/sql_parsing_builder.py index 071d590f270f8b..fd0b080d4fe143 100644 --- a/metadata-ingestion/src/datahub/emitter/sql_parsing_builder.py +++ b/metadata-ingestion/src/datahub/emitter/sql_parsing_builder.py @@ -106,6 +106,7 @@ def process_sql_parsing_result( user: Optional[UserUrn] = None, custom_operation_type: Optional[str] = None, include_urns: Optional[Set[DatasetUrn]] = None, + include_column_lineage: bool = True, ) -> Iterable[MetadataWorkUnit]: """Process a single query and yield any generated workunits. @@ -130,7 +131,9 @@ def process_sql_parsing_result( _merge_lineage_data( downstream_urn=downstream_urn, upstream_urns=result.in_tables, - column_lineage=result.column_lineage, + column_lineage=result.column_lineage + if include_column_lineage + else None, upstream_edges=self._lineage_map[downstream_urn], query_timestamp=query_timestamp, is_view_ddl=is_view_ddl, diff --git a/metadata-ingestion/src/datahub/ingestion/api/incremental_lineage_helper.py b/metadata-ingestion/src/datahub/ingestion/api/incremental_lineage_helper.py index 9478c5cf7efa26..945b201ca5758c 100644 --- a/metadata-ingestion/src/datahub/ingestion/api/incremental_lineage_helper.py +++ b/metadata-ingestion/src/datahub/ingestion/api/incremental_lineage_helper.py @@ -130,10 +130,13 @@ def auto_incremental_lineage( if len(wu.metadata.proposedSnapshot.aspects) > 0: yield wu - yield _lineage_wu_via_read_modify_write( - graph, urn, lineage_aspect, wu.metadata.systemMetadata - ) if lineage_aspect.fineGrainedLineages else _convert_upstream_lineage_to_patch( - urn, lineage_aspect, wu.metadata.systemMetadata - ) + if lineage_aspect.fineGrainedLineages: + yield 
_lineage_wu_via_read_modify_write( + graph, urn, lineage_aspect, wu.metadata.systemMetadata + ) + elif lineage_aspect.upstreams: + yield _convert_upstream_lineage_to_patch( + urn, lineage_aspect, wu.metadata.systemMetadata + ) else: yield wu diff --git a/metadata-ingestion/src/datahub/ingestion/api/source.py b/metadata-ingestion/src/datahub/ingestion/api/source.py index b86844b1c4c831..f6309e82a92f1d 100644 --- a/metadata-ingestion/src/datahub/ingestion/api/source.py +++ b/metadata-ingestion/src/datahub/ingestion/api/source.py @@ -26,6 +26,7 @@ from datahub.emitter.mcp_builder import mcps_from_mce from datahub.ingestion.api.closeable import Closeable from datahub.ingestion.api.common import PipelineContext, RecordEnvelope, WorkUnit +from datahub.ingestion.api.incremental_lineage_helper import auto_incremental_lineage from datahub.ingestion.api.report import Report from datahub.ingestion.api.source_helpers import ( auto_browse_path_v2, @@ -215,12 +216,35 @@ def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: ) ): auto_lowercase_dataset_urns = auto_lowercase_urns + + incremental_lineage_processor: Optional[MetadataWorkUnitProcessor] = None + if ( + self.ctx.pipeline_config + and self.ctx.pipeline_config.source + and self.ctx.pipeline_config.source.config + ): + incremental_lineage = ( + hasattr( + self.ctx.pipeline_config.source.config, + "incremental_lineage", + ) + and self.ctx.pipeline_config.source.config.incremental_lineage + ) or ( + hasattr(self.ctx.pipeline_config.source.config, "get") + and self.ctx.pipeline_config.source.config.get("incremental_lineage") + ) + incremental_lineage_processor = partial( + auto_incremental_lineage, + self.ctx.graph, + incremental_lineage, + ) return [ auto_lowercase_dataset_urns, auto_status_aspect, auto_materialize_referenced_tags, browse_path_processor, partial(auto_workunit_reporter, self.get_report()), + incremental_lineage_processor, ] @staticmethod diff --git 
a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py index 0a15c352fc8420..c9046b9a7aa65c 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py @@ -200,14 +200,15 @@ def _gen_workunit_from_sql_parsing_result( self, dataset_identifier: str, result: SqlParsingResult, - ) -> MetadataWorkUnit: + ) -> Iterable[MetadataWorkUnit]: upstreams, fine_upstreams = self.get_upstreams_from_sql_parsing_result( self.dataset_urn_builder(dataset_identifier), result ) - self.report.num_views_with_upstreams += 1 - return self._create_upstream_lineage_workunit( - dataset_identifier, upstreams, fine_upstreams - ) + if upstreams: + self.report.num_views_with_upstreams += 1 + yield self._create_upstream_lineage_workunit( + dataset_identifier, upstreams, fine_upstreams + ) def _gen_workunits_from_query_result( self, @@ -251,7 +252,7 @@ def get_view_upstream_workunits( ) if result: views_processed.add(view_identifier) - yield self._gen_workunit_from_sql_parsing_result( + yield from self._gen_workunit_from_sql_parsing_result( view_identifier, result ) self.report.view_lineage_parse_secs = timer.elapsed_seconds() diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py index a5c07d9a3870c6..215116b4c33fb0 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py @@ -4,7 +4,6 @@ import os.path import platform from dataclasses import dataclass -from functools import partial from typing import Callable, Dict, Iterable, List, Optional, Union import pandas as pd @@ -27,7 +26,6 @@ platform_name, support_status, ) -from 
datahub.ingestion.api.incremental_lineage_helper import auto_incremental_lineage from datahub.ingestion.api.source import ( CapabilityReport, MetadataWorkUnitProcessor, @@ -513,11 +511,6 @@ def _init_schema_resolver(self) -> SchemaResolver: def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: return [ *super().get_workunit_processors(), - partial( - auto_incremental_lineage, - self.ctx.graph, - self.config.incremental_lineage, - ), StaleEntityRemovalHandler.create( self, self.config, self.ctx ).workunit_processor, diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/hive.py b/metadata-ingestion/src/datahub/ingestion/source/sql/hive.py index 63b21bc82edddd..0c183b2f52f6c6 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/hive.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/hive.py @@ -1,7 +1,7 @@ import json import logging import re -from typing import Any, Dict, List, Optional +from typing import Any, Dict, Iterable, List, Optional, Union from pydantic.class_validators import validator from pydantic.fields import Field @@ -9,7 +9,10 @@ # This import verifies that the dependencies are available. 
from pyhive import hive # noqa: F401 from pyhive.sqlalchemy_hive import HiveDate, HiveDecimal, HiveTimestamp +from sqlalchemy.engine.reflection import Inspector +from datahub.emitter.mce_builder import make_dataset_urn_with_platform_instance +from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.ingestion.api.decorators import ( SourceCapability, SupportStatus, @@ -18,8 +21,10 @@ platform_name, support_status, ) +from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.extractor import schema_util -from datahub.ingestion.source.sql.sql_common import register_custom_type +from datahub.ingestion.source.sql.sql_common import SqlWorkUnit, register_custom_type +from datahub.ingestion.source.sql.sql_config import SQLCommonConfig from datahub.ingestion.source.sql.two_tier_sql_source import ( TwoTierSQLAlchemyConfig, TwoTierSQLAlchemySource, @@ -31,6 +36,7 @@ SchemaField, TimeTypeClass, ) +from datahub.metadata.schema_classes import ViewPropertiesClass from datahub.utilities import config_clean from datahub.utilities.hive_schema_to_avro import get_avro_schema_for_hive_column @@ -90,19 +96,39 @@ def dbapi_get_columns_patched(self, connection, table_name, schema=None, **kw): logger.warning(f"Failed to patch method due to {e}") +try: + from pyhive.sqlalchemy_hive import HiveDialect + + @reflection.cache # type: ignore + def get_view_names_patched(self, connection, schema=None, **kw): + query = "SHOW VIEWS" + if schema: + query += " IN " + self.identifier_preparer.quote_identifier(schema) + return [row[0] for row in connection.execute(query)] + + @reflection.cache # type: ignore + def get_view_definition_patched(self, connection, view_name, schema=None, **kw): + full_table = self.identifier_preparer.quote_identifier(view_name) + if schema: + full_table = "{}.{}".format( + self.identifier_preparer.quote_identifier(schema), + self.identifier_preparer.quote_identifier(view_name), + ) + row = connection.execute("SHOW CREATE TABLE 
{}".format(full_table)).fetchone() + return row[0] + + HiveDialect.get_view_names = get_view_names_patched + HiveDialect.get_view_definition = get_view_definition_patched +except ModuleNotFoundError: + pass +except Exception as e: + logger.warning(f"Failed to patch method due to {e}") + + class HiveConfig(TwoTierSQLAlchemyConfig): # defaults scheme = Field(default="hive", hidden_from_docs=True) - # Hive SQLAlchemy connector returns views as tables. - # See https://github.com/dropbox/PyHive/blob/b21c507a24ed2f2b0cf15b0b6abb1c43f31d3ee0/pyhive/sqlalchemy_hive.py#L270-L273. - # Disabling views helps us prevent this duplication. - include_views = Field( - default=False, - hidden_from_docs=True, - description="Hive SQLAlchemy connector returns views as tables. See https://github.com/dropbox/PyHive/blob/b21c507a24ed2f2b0cf15b0b6abb1c43f31d3ee0/pyhive/sqlalchemy_hive.py#L270-L273. Disabling views helps us prevent this duplication.", - ) - @validator("host_port") def clean_host_port(cls, v): return config_clean.remove_protocol(v) @@ -174,3 +200,41 @@ def get_schema_fields_for_column( return new_fields return fields + + # Hive SQLAlchemy connector returns views as tables in get_table_names. + # See https://github.com/dropbox/PyHive/blob/b21c507a24ed2f2b0cf15b0b6abb1c43f31d3ee0/pyhive/sqlalchemy_hive.py#L270-L273. + # This override makes sure that we ingest view definitions for views + def _process_view( + self, + dataset_name: str, + inspector: Inspector, + schema: str, + view: str, + sql_config: SQLCommonConfig, + ) -> Iterable[Union[SqlWorkUnit, MetadataWorkUnit]]: + dataset_urn = make_dataset_urn_with_platform_instance( + self.platform, + dataset_name, + self.config.platform_instance, + self.config.env, + ) + + try: + view_definition = inspector.get_view_definition(view, schema) + if view_definition is None: + view_definition = "" + else: + # Some dialects return a TextClause instead of a raw string, + # so we need to convert them to a string. 
+ view_definition = str(view_definition) + except NotImplementedError: + view_definition = "" + + if view_definition: + view_properties_aspect = ViewPropertiesClass( + materialized=False, viewLanguage="SQL", viewLogic=view_definition + ) + yield MetadataChangeProposalWrapper( + entityUrn=dataset_urn, + aspect=view_properties_aspect, + ).as_workunit() diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py index 6524eea8222d41..26908eb92cf6df 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py @@ -8,6 +8,7 @@ Dict, Iterable, List, + MutableMapping, Optional, Set, Tuple, @@ -29,6 +30,7 @@ make_tag_urn, ) from datahub.emitter.mcp import MetadataChangeProposalWrapper +from datahub.emitter.sql_parsing_builder import SqlParsingBuilder from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.api.source import MetadataWorkUnitProcessor from datahub.ingestion.api.workunit import MetadataWorkUnit @@ -88,9 +90,15 @@ ViewPropertiesClass, ) from datahub.telemetry import telemetry +from datahub.utilities.file_backed_collections import FileBackedDict from datahub.utilities.lossy_collections import LossyList from datahub.utilities.registries.domain_registry import DomainRegistry from datahub.utilities.sqlalchemy_query_combiner import SQLAlchemyQueryCombinerReport +from datahub.utilities.sqlglot_lineage import ( + SchemaResolver, + SqlParsingResult, + sqlglot_lineage, +) if TYPE_CHECKING: from datahub.ingestion.source.ge_data_profiler import ( @@ -112,6 +120,11 @@ class SQLSourceReport(StaleEntityRemovalSourceReport): query_combiner: Optional[SQLAlchemyQueryCombinerReport] = None + num_view_definitions_parsed: int = 0 + num_view_definitions_failed_parsing: int = 0 + num_view_definitions_failed_column_parsing: int = 0 + view_definitions_parsing_failures: LossyList[str] = 
field(default_factory=LossyList) + def report_entity_scanned(self, name: str, ent_type: str = "table") -> None: """ Entity could be a view or a table @@ -322,6 +335,18 @@ def __init__(self, config: SQLCommonConfig, ctx: PipelineContext, platform: str) cached_domains=[k for k in self.config.domain], graph=self.ctx.graph ) + self.views_processed: Set[str] = set() + self.schema_resolver: SchemaResolver = SchemaResolver( + platform=self.platform, + platform_instance=self.config.platform_instance, + env=self.config.env, + ) + self._view_definition_cache: MutableMapping[str, str] + if self.config.use_file_backed_cache: + self._view_definition_cache = FileBackedDict[str]() + else: + self._view_definition_cache = {} + def warn(self, log: logging.Logger, key: str, reason: str) -> None: self.report.report_warning(key, reason[:100]) log.warning(f"{key} => {reason}") @@ -515,6 +540,33 @@ def get_workunits_internal(self) -> Iterable[Union[MetadataWorkUnit, SqlWorkUnit profile_requests, profiler, platform=self.platform ) + if self.config.include_view_lineage: + yield from self.get_view_lineage() + + def get_view_lineage(self) -> Iterable[MetadataWorkUnit]: + builder = SqlParsingBuilder( + generate_lineage=True, + generate_usage_statistics=False, + generate_operations=False, + ) + for dataset_name in self._view_definition_cache.keys(): + view_definition = self._view_definition_cache[dataset_name] + result = self._run_sql_parser( + dataset_name, + view_definition, + self.schema_resolver, + ) + if result and result.out_tables: + # This does not yield any workunits but we use + # yield here to execute this method + yield from builder.process_sql_parsing_result( + result=result, + query=view_definition, + is_view_ddl=True, + include_column_lineage=self.config.include_view_column_lineage, + ) + yield from builder.gen_workunits() + def get_identifier( self, *, schema: str, entity: str, inspector: Inspector, **kwargs: Any ) -> str: @@ -661,6 +713,8 @@ def _process_table( schema_fields, 
) dataset_snapshot.aspects.append(schema_metadata) + if self.config.include_view_lineage: + self.schema_resolver.add_schema_metadata(dataset_urn, schema_metadata) db_name = self.get_db_name(inspector) yield from self.add_table_to_schema_container( @@ -865,6 +919,12 @@ def _process_view( view: str, sql_config: SQLCommonConfig, ) -> Iterable[Union[SqlWorkUnit, MetadataWorkUnit]]: + dataset_urn = make_dataset_urn_with_platform_instance( + self.platform, + dataset_name, + self.config.platform_instance, + self.config.env, + ) try: columns = inspector.get_columns(view, schema) except KeyError: @@ -880,6 +940,8 @@ def _process_view( columns, canonical_schema=schema_fields, ) + if self.config.include_view_lineage: + self.schema_resolver.add_schema_metadata(dataset_urn, schema_metadata) description, properties, _ = self.get_table_properties(inspector, schema, view) try: view_definition = inspector.get_view_definition(view, schema) @@ -893,12 +955,9 @@ def _process_view( view_definition = "" properties["view_definition"] = view_definition properties["is_view"] = "True" - dataset_urn = make_dataset_urn_with_platform_instance( - self.platform, - dataset_name, - self.config.platform_instance, - self.config.env, - ) + if view_definition and self.config.include_view_lineage: + self._view_definition_cache[dataset_name] = view_definition + dataset_snapshot = DatasetSnapshot( urn=dataset_urn, aspects=[StatusClass(removed=False)], @@ -945,6 +1004,37 @@ def _process_view( domain_registry=self.domain_registry, ) + def _run_sql_parser( + self, dataset_identifier: str, query: str, schema_resolver: SchemaResolver + ) -> Optional[SqlParsingResult]: + try: + database, schema = self.get_db_schema(dataset_identifier) + except ValueError: + logger.warning(f"Invalid view identifier: {dataset_identifier}") + return None + raw_lineage = sqlglot_lineage( + query, + schema_resolver=schema_resolver, + default_db=database, + default_schema=schema, + ) + if raw_lineage.debug_info.table_error: + 
logger.debug( + f"Failed to parse lineage for view {dataset_identifier}: " + f"{raw_lineage.debug_info.table_error}" + ) + self.report.num_view_definitions_failed_parsing += 1 + return None + elif raw_lineage.debug_info.column_error: + self.report.num_view_definitions_failed_column_parsing += 1 + else: + self.report.num_view_definitions_parsed += 1 + return raw_lineage + + def get_db_schema(self, dataset_identifier: str) -> Tuple[Optional[str], str]: + database, schema, _view = dataset_identifier.split(".") + return database, schema + def get_profiler_instance(self, inspector: Inspector) -> "DatahubGEProfiler": from datahub.ingestion.source.ge_data_profiler import DatahubGEProfiler diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_config.py b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_config.py index 57aae32b361cf5..095b8e64431719 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_config.py @@ -6,7 +6,7 @@ from pydantic import Field from sqlalchemy.engine import URL -from datahub.configuration.common import AllowDenyPattern, ConfigModel +from datahub.configuration.common import AllowDenyPattern, ConfigModel, LineageConfig from datahub.configuration.source_common import ( DatasetSourceConfigMixin, LowerCaseDatasetUrnConfigMixin, @@ -28,6 +28,7 @@ class SQLCommonConfig( StatefulIngestionConfigBase, DatasetSourceConfigMixin, LowerCaseDatasetUrnConfigMixin, + LineageConfig, ): options: dict = pydantic.Field( default_factory=dict, @@ -70,6 +71,22 @@ class SQLCommonConfig( description="If the source supports it, include table lineage to the underlying storage location.", ) + include_view_lineage: bool = Field( + default=True, + description="Populates view->view and table->view lineage using DataHub's sql parser.", + ) + + include_view_column_lineage: bool = Field( + default=True, + description="Populates column-level lineage for view->view and 
table->view lineage using DataHub's sql parser." + " Requires `include_view_lineage` to be enabled.", + ) + + use_file_backed_cache: bool = Field( + default=True, + description="Whether to use a file backed cache for the view definitions.", + ) + profiling: GEProfilingConfig = GEProfilingConfig() # Custom Stateful Ingestion settings stateful_ingestion: Optional[StatefulStaleMetadataRemovalConfig] = None diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/teradata.py b/metadata-ingestion/src/datahub/ingestion/source/sql/teradata.py index e628e4dbd34467..899a7b6697c0a5 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/teradata.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/teradata.py @@ -1,7 +1,7 @@ import logging from dataclasses import dataclass from datetime import datetime -from typing import Iterable, MutableMapping, Optional, Union +from typing import Iterable, Optional, Union # This import verifies that the dependencies are available. import teradatasqlalchemy # noqa: F401 @@ -33,14 +33,11 @@ from datahub.ingestion.source.usage.usage_common import BaseUsageConfig from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport from datahub.ingestion.source_report.time_window import BaseTimeWindowReport -from datahub.metadata._schema_classes import SchemaMetadataClass, ViewPropertiesClass from datahub.metadata.com.linkedin.pegasus2avro.schema import ( BytesTypeClass, TimeTypeClass, ) -from datahub.utilities.file_backed_collections import FileBackedDict from datahub.utilities.sqlglot_lineage import SchemaResolver, sqlglot_lineage -from datahub.utilities.urns.dataset_urn import DatasetUrn logger: logging.Logger = logging.getLogger(__name__) @@ -87,11 +84,6 @@ class TeradataConfig(BaseTeradataConfig, BaseTimeWindowConfig): "This requires to have the table lineage feature enabled.", ) - include_view_lineage = Field( - default=True, - description="Whether to include view lineage in the ingestion. 
" - "This requires to have the view lineage feature enabled.", - ) usage: BaseUsageConfig = Field( description="The usage config to use when generating usage statistics", default=BaseUsageConfig(), @@ -107,11 +99,6 @@ class TeradataConfig(BaseTeradataConfig, BaseTimeWindowConfig): description="Generate usage statistic.", ) - use_file_backed_cache: bool = Field( - default=True, - description="Whether to use a file backed cache for the view definitions.", - ) - @platform_name("Teradata") @config_class(TeradataConfig) @@ -143,8 +130,6 @@ class TeradataSource(TwoTierSQLAlchemySource): and "timestamp" < TIMESTAMP '{end_time}' """ - _view_definition_cache: MutableMapping[str, str] - def __init__(self, config: TeradataConfig, ctx: PipelineContext): super().__init__(config, ctx, "teradata") @@ -167,34 +152,11 @@ def __init__(self, config: TeradataConfig, ctx: PipelineContext): env=self.config.env, ) - if self.config.use_file_backed_cache: - self._view_definition_cache = FileBackedDict[str]() - else: - self._view_definition_cache = {} - @classmethod def create(cls, config_dict, ctx): config = TeradataConfig.parse_obj(config_dict) return cls(config, ctx) - def get_view_lineage(self) -> Iterable[MetadataWorkUnit]: - for key in self._view_definition_cache.keys(): - view_definition = self._view_definition_cache[key] - dataset_urn = DatasetUrn.create_from_string(key) - - db_name: Optional[str] = None - # We need to get the default db from the dataset urn otherwise the builder generates the wrong urns - if "." 
in dataset_urn.get_dataset_name(): - db_name = dataset_urn.get_dataset_name().split(".", 1)[0] - - self.report.num_view_ddl_parsed += 1 - if self.report.num_view_ddl_parsed % 1000 == 0: - logger.info(f"Parsed {self.report.num_queries_parsed} view ddl") - - yield from self.gen_lineage_from_query( - query=view_definition, default_database=db_name, is_view_ddl=True - ) - def get_audit_log_mcps(self) -> Iterable[MetadataWorkUnit]: engine = self.get_metadata_engine() for entry in engine.execute( @@ -252,19 +214,7 @@ def get_metadata_engine(self) -> Engine: def get_workunits_internal(self) -> Iterable[Union[MetadataWorkUnit, SqlWorkUnit]]: # Add all schemas to the schema resolver - for wu in super().get_workunits_internal(): - urn = wu.get_urn() - schema_metadata = wu.get_aspect_of_type(SchemaMetadataClass) - if schema_metadata: - self.schema_resolver.add_schema_metadata(urn, schema_metadata) - view_properties = wu.get_aspect_of_type(ViewPropertiesClass) - if view_properties and self.config.include_view_lineage: - self._view_definition_cache[urn] = view_properties.viewLogic - yield wu - - if self.config.include_view_lineage: - self.report.report_ingestion_stage_start("view lineage extraction") - yield from self.get_view_lineage() + yield from super().get_workunits_internal() if self.config.include_table_lineage or self.config.include_usage_statistics: self.report.report_ingestion_stage_start("audit log extraction") diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/two_tier_sql_source.py b/metadata-ingestion/src/datahub/ingestion/source/sql/two_tier_sql_source.py index 7a49551dc12351..efb1d3ffe119fc 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/two_tier_sql_source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/two_tier_sql_source.py @@ -1,6 +1,6 @@ import typing import urllib.parse -from typing import Any, Dict, Iterable, Optional +from typing import Any, Dict, Iterable, Optional, Tuple from pydantic.fields import Field from 
sqlalchemy import create_engine, inspect @@ -71,6 +71,10 @@ def __init__(self, config, ctx, platform): super().__init__(config, ctx, platform) self.config: TwoTierSQLAlchemyConfig = config + def get_db_schema(self, dataset_identifier: str) -> Tuple[Optional[str], str]: + schema, _view = dataset_identifier.split(".", 1) + return None, schema + def get_database_container_key(self, db_name: str, schema: str) -> ContainerKey: # Because our overridden get_allowed_schemas method returns db_name as the schema name, # the db_name and schema here will be the same. Hence, we just ignore the schema parameter. diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/vertica.py b/metadata-ingestion/src/datahub/ingestion/source/sql/vertica.py index a417cae2b1ab03..b89db755853bc3 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/vertica.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/vertica.py @@ -86,7 +86,7 @@ class VerticaConfig(BasicSQLAlchemyConfig): default=True, description="Whether Models should be ingested." 
) - include_view_lineage: Optional[bool] = pydantic.Field( + include_view_lineage: bool = pydantic.Field( default=True, description="If the source supports it, include view lineage to the underlying storage location.", ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/state/stateful_ingestion_base.py b/metadata-ingestion/src/datahub/ingestion/source/state/stateful_ingestion_base.py index be97e9380f1f57..7fb2cf9813cab1 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/state/stateful_ingestion_base.py +++ b/metadata-ingestion/src/datahub/ingestion/source/state/stateful_ingestion_base.py @@ -11,7 +11,6 @@ ConfigModel, ConfigurationError, DynamicTypedConfig, - LineageConfig, ) from datahub.configuration.time_window_config import BaseTimeWindowConfig from datahub.configuration.validate_field_rename import pydantic_renamed_field @@ -100,7 +99,7 @@ class StatefulIngestionConfigBase(GenericModel, Generic[CustomConfig]): ) -class StatefulLineageConfigMixin(LineageConfig): +class StatefulLineageConfigMixin: enable_stateful_lineage_ingestion: bool = Field( default=True, description="Enable stateful lineage ingestion." diff --git a/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py b/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py index c830ec8c02fd44..322c236732102f 100644 --- a/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py +++ b/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py @@ -588,6 +588,9 @@ def _schema_aware_fuzzy_column_resolve( except sqlglot.errors.OptimizeError as e: # This is not a fatal error, so we can continue. logger.debug("sqlglot failed to annotate types: %s", e) + except sqlglot.errors.ParseError as e: + # This is not a fatal error, so we can continue. 
+ logger.debug("sqlglot failed to parse types: %s", e) column_lineage = [] diff --git a/metadata-ingestion/tests/integration/hive/hive_mces_all_db_golden.json b/metadata-ingestion/tests/integration/hive/hive_mces_all_db_golden.json index f3b6d2b8138cc5..6774d4c7055b99 100644 --- a/metadata-ingestion/tests/integration/hive/hive_mces_all_db_golden.json +++ b/metadata-ingestion/tests/integration/hive/hive_mces_all_db_golden.json @@ -16,7 +16,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -31,7 +32,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -46,7 +48,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -63,7 +66,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -78,7 +82,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -93,7 +98,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -111,7 +117,7 @@ "customProperties": { "Database:": "db1", "Owner:": "root", - "CreateTime:": "Wed Jul 05 17:38:26 UTC 2023", + "CreateTime:": "Thu Oct 19 13:26:12 UTC 2023", "LastAccessTime:": "UNKNOWN", "Retention:": "0", "Location:": "hdfs://namenode:8020/user/hive/warehouse/db1.db/_test_table_underscore", @@ -121,7 +127,7 @@ "Table Parameters: numRows": "0", "Table Parameters: rawDataSize": "0", "Table Parameters: totalSize": "0", - "Table Parameters: transient_lastDdlTime": "1688578706", + "Table Parameters: transient_lastDdlTime": "1697721972", "SerDe Library:": 
"org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe", "InputFormat:": "org.apache.hadoop.mapred.TextInputFormat", "OutputFormat:": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat", @@ -187,7 +193,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -204,7 +211,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -224,7 +232,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -239,7 +248,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -257,17 +267,19 @@ "customProperties": { "Database:": "db1", "Owner:": "root", - "CreateTime:": "Wed Jul 05 17:38:26 UTC 2023", + "CreateTime:": "Thu Oct 19 13:26:12 UTC 2023", "LastAccessTime:": "UNKNOWN", "Retention:": "0", "Location:": "hdfs://namenode:8020/user/hive/warehouse/db1.db/array_struct_test", "Table Type:": "MANAGED_TABLE", "Table Parameters: COLUMN_STATS_ACCURATE": "{\\\"BASIC_STATS\\\":\\\"true\\\"}", + "Table Parameters: another.comment": "This table has no partitions", + "Table Parameters: comment": "This table has array of structs", "Table Parameters: numFiles": "1", "Table Parameters: numRows": "1", "Table Parameters: rawDataSize": "32", "Table Parameters: totalSize": "33", - "Table Parameters: transient_lastDdlTime": "1688578710", + "Table Parameters: transient_lastDdlTime": "1697721976", "SerDe Library:": "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe", "InputFormat:": "org.apache.hadoop.mapred.TextInputFormat", "OutputFormat:": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat", @@ -278,6 +290,7 @@ "Storage Desc Params: serialization.format": "1" }, "name": "array_struct_test", + 
"description": "This table has array of structs", "tags": [] } }, @@ -304,6 +317,7 @@ { "fieldPath": "property_id", "nullable": true, + "description": "id of property", "type": { "type": { "com.linkedin.pegasus2avro.schema.NumberType": {} @@ -316,6 +330,7 @@ { "fieldPath": "[version=2.0].[type=struct].[type=array].[type=struct].service", "nullable": true, + "description": "service types and providers", "type": { "type": { "com.linkedin.pegasus2avro.schema.ArrayType": { @@ -368,7 +383,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -385,7 +401,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -405,7 +422,189 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:hive,db1.array_struct_test_view,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:ded36d15fcfbbb939830549697122661" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "hive-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:hive,db1.array_struct_test_view,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "Database:": "db1", + "Owner:": "root", + "CreateTime:": "Thu Oct 19 13:26:18 UTC 2023", + "LastAccessTime:": "UNKNOWN", + "Retention:": "0", + "Table Type:": "VIRTUAL_VIEW", + "Table Parameters: transient_lastDdlTime": "1697721978", + "SerDe Library:": "null", + 
"InputFormat:": "org.apache.hadoop.mapred.TextInputFormat", + "OutputFormat:": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat", + "Compressed:": "No", + "Num Buckets:": "-1", + "Bucket Columns:": "[]", + "Sort Columns:": "[]", + "View Original Text:": "select * from db1.array_struct_test", + "View Expanded Text:": "select `array_struct_test`.`property_id`, `array_struct_test`.`service` from `db1`.`array_struct_test`", + "View Rewrite Enabled:": "No" + }, + "name": "array_struct_test_view", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "db1.array_struct_test_view", + "platform": "urn:li:dataPlatform:hive", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "property_id", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "int", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=array].[type=struct].service", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.ArrayType": { + "nestedType": [ + "record" + ] + } + } + }, + "nativeDataType": "array>>", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"array>>\"}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=array].[type=struct].service.[type=string].type", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + }, + { + "fieldPath": 
"[version=2.0].[type=struct].[type=array].[type=struct].service.[type=array].[type=int].provider", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.ArrayType": { + "nestedType": [ + "int" + ] + } + } + }, + "nativeDataType": "array", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"array\"}" + } + ] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "hive-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:hive,db1.array_struct_test_view,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Table" + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "hive-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:hive,db1.array_struct_test_view,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:ded36d15fcfbbb939830549697122661", + "urn": "urn:li:container:ded36d15fcfbbb939830549697122661" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -420,7 +619,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -438,7 +638,7 @@ "customProperties": { "Database:": "db1", "Owner:": "root", - "CreateTime:": "Wed Jul 05 17:38:30 UTC 2023", + "CreateTime:": "Thu Oct 19 13:26:18 UTC 2023", "LastAccessTime:": "UNKNOWN", "Retention:": "0", "Location:": "hdfs://namenode:8020/user/hive/warehouse/db1.db/map_test", @@ -448,7 +648,7 @@ "Table Parameters: numRows": "0", "Table Parameters: rawDataSize": "0", "Table Parameters: totalSize": "0", - "Table Parameters: transient_lastDdlTime": 
"1688578710", + "Table Parameters: transient_lastDdlTime": "1697721978", "SerDe Library:": "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe", "InputFormat:": "org.apache.hadoop.mapred.TextInputFormat", "OutputFormat:": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat", @@ -518,7 +718,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -535,7 +736,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -555,7 +757,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -570,7 +773,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -588,7 +792,7 @@ "customProperties": { "Database:": "db1", "Owner:": "root", - "CreateTime:": "Wed Jul 05 17:38:30 UTC 2023", + "CreateTime:": "Thu Oct 19 13:26:18 UTC 2023", "LastAccessTime:": "UNKNOWN", "Retention:": "0", "Location:": "hdfs://namenode:8020/user/hive/warehouse/db1.db/nested_struct_test", @@ -598,7 +802,7 @@ "Table Parameters: numRows": "0", "Table Parameters: rawDataSize": "0", "Table Parameters: totalSize": "0", - "Table Parameters: transient_lastDdlTime": "1688578710", + "Table Parameters: transient_lastDdlTime": "1697721978", "SerDe Library:": "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe", "InputFormat:": "org.apache.hadoop.mapred.TextInputFormat", "OutputFormat:": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat", @@ -717,7 +921,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -734,7 +939,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + 
"lastRunId": "no-run-id-provided" } }, { @@ -754,7 +960,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -769,7 +976,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -787,16 +995,17 @@ "customProperties": { "Database:": "db1", "Owner:": "root", - "CreateTime:": "Wed Jul 05 17:38:22 UTC 2023", + "CreateTime:": "Thu Oct 19 13:26:08 UTC 2023", "LastAccessTime:": "UNKNOWN", "Retention:": "0", "Location:": "hdfs://namenode:8020/user/hive/warehouse/db1.db/pokes", "Table Type:": "MANAGED_TABLE", "Table Parameters: numFiles": "1", + "Table Parameters: numPartitions": "1", "Table Parameters: numRows": "0", "Table Parameters: rawDataSize": "0", "Table Parameters: totalSize": "5812", - "Table Parameters: transient_lastDdlTime": "1688578704", + "Table Parameters: transient_lastDdlTime": "1697721968", "SerDe Library:": "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe", "InputFormat:": "org.apache.hadoop.mapred.TextInputFormat", "OutputFormat:": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat", @@ -853,6 +1062,18 @@ "nativeDataType": "string", "recursive": false, "isPartOfKey": false + }, + { + "fieldPath": "baz", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false } ] } @@ -862,7 +1083,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -879,7 +1101,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -899,7 +1122,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": 
"no-run-id-provided" } }, { @@ -914,7 +1138,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -932,7 +1157,7 @@ "customProperties": { "Database:": "db1", "Owner:": "root", - "CreateTime:": "Wed Jul 05 17:38:26 UTC 2023", + "CreateTime:": "Thu Oct 19 13:26:12 UTC 2023", "LastAccessTime:": "UNKNOWN", "Retention:": "0", "Location:": "hdfs://namenode:8020/user/hive/warehouse/db1.db/struct_test", @@ -942,7 +1167,7 @@ "Table Parameters: numRows": "0", "Table Parameters: rawDataSize": "0", "Table Parameters: totalSize": "0", - "Table Parameters: transient_lastDdlTime": "1688578706", + "Table Parameters: transient_lastDdlTime": "1697721972", "SerDe Library:": "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe", "InputFormat:": "org.apache.hadoop.mapred.TextInputFormat", "OutputFormat:": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat", @@ -1039,7 +1264,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1056,7 +1282,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1076,7 +1303,188 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:hive,db1.struct_test_view_materialized,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:ded36d15fcfbbb939830549697122661" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "hive-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": 
"urn:li:dataset:(urn:li:dataPlatform:hive,db1.struct_test_view_materialized,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "Database:": "db1", + "Owner:": "root", + "CreateTime:": "Thu Oct 19 13:26:18 UTC 2023", + "LastAccessTime:": "UNKNOWN", + "Retention:": "0", + "Location:": "hdfs://namenode:8020/user/hive/warehouse/db1.db/struct_test_view_materialized", + "Table Type:": "MATERIALIZED_VIEW", + "Table Parameters: numFiles": "0", + "Table Parameters: totalSize": "0", + "Table Parameters: transient_lastDdlTime": "1697721978", + "SerDe Library:": "org.apache.hadoop.hive.ql.io.orc.OrcSerde", + "InputFormat:": "org.apache.hadoop.hive.ql.io.orc.OrcInputFormat", + "OutputFormat:": "org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat", + "Compressed:": "No", + "Num Buckets:": "-1", + "Bucket Columns:": "[]", + "Sort Columns:": "[]", + "View Original Text:": "select * from db1.struct_test", + "View Expanded Text:": "null", + "View Rewrite Enabled:": "No" + }, + "name": "struct_test_view_materialized", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "db1.struct_test_view_materialized", + "platform": "urn:li:dataPlatform:hive", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "property_id", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "int", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=struct].service", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.RecordType": 
{} + } + }, + "nativeDataType": "struct>", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"struct>\"}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=struct].service.[type=string].type", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=struct].service.[type=array].[type=int].provider", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.ArrayType": { + "nestedType": [ + "int" + ] + } + } + }, + "nativeDataType": "array", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"array\"}" + } + ] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "hive-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:hive,db1.struct_test_view_materialized,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Table" + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "hive-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:hive,db1.struct_test_view_materialized,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:ded36d15fcfbbb939830549697122661", + "urn": "urn:li:container:ded36d15fcfbbb939830549697122661" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1091,7 +1499,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": 
"hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1109,7 +1518,7 @@ "customProperties": { "Database:": "db1", "Owner:": "root", - "CreateTime:": "Wed Jul 05 17:38:30 UTC 2023", + "CreateTime:": "Thu Oct 19 13:26:18 UTC 2023", "LastAccessTime:": "UNKNOWN", "Retention:": "0", "Location:": "hdfs://namenode:8020/user/hive/warehouse/db1.db/union_test", @@ -1119,10 +1528,10 @@ "Table Parameters: numRows": "0", "Table Parameters: rawDataSize": "0", "Table Parameters: totalSize": "0", - "Table Parameters: transient_lastDdlTime": "1688578710", - "SerDe Library:": "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe", - "InputFormat:": "org.apache.hadoop.mapred.TextInputFormat", - "OutputFormat:": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat", + "Table Parameters: transient_lastDdlTime": "1697721978", + "SerDe Library:": "org.apache.hadoop.hive.ql.io.orc.OrcSerde", + "InputFormat:": "org.apache.hadoop.hive.ql.io.orc.OrcInputFormat", + "OutputFormat:": "org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat", "Compressed:": "No", "Num Buckets:": "-1", "Bucket Columns:": "[]", @@ -1285,7 +1694,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1302,7 +1712,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1322,7 +1733,26 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:hive,db1.array_struct_test_view,PROD)", + "changeType": "UPSERT", + "aspectName": "viewProperties", + "aspect": { + "json": { + "materialized": false, + "viewLogic": "CREATE VIEW `db1.array_struct_test_view` AS select `array_struct_test`.`property_id`, `array_struct_test`.`service` from `db1`.`array_struct_test`", + 
"viewLanguage": "SQL" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1342,7 +1772,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1357,7 +1788,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1372,7 +1804,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1389,7 +1822,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1404,7 +1838,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1419,7 +1854,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1437,7 +1873,7 @@ "customProperties": { "Database:": "db2", "Owner:": "root", - "CreateTime:": "Wed Jul 05 17:38:24 UTC 2023", + "CreateTime:": "Thu Oct 19 13:26:10 UTC 2023", "LastAccessTime:": "UNKNOWN", "Retention:": "0", "Location:": "hdfs://namenode:8020/user/hive/warehouse/db2.db/pokes", @@ -1446,7 +1882,7 @@ "Table Parameters: numRows": "0", "Table Parameters: rawDataSize": "0", "Table Parameters: totalSize": "5812", - "Table Parameters: transient_lastDdlTime": "1688578706", + "Table Parameters: transient_lastDdlTime": "1697721971", "SerDe Library:": "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe", "InputFormat:": "org.apache.hadoop.mapred.TextInputFormat", "OutputFormat:": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat", @@ -1454,10 +1890,7 @@ "Num Buckets:": "-1", "Bucket Columns:": "[]", "Sort Columns:": 
"[]", - "Storage Desc Params: serialization.format": "1", - "Table:": "db2.pokes", - "Constraint Name:": "pk_1173723383_1683022998392_0", - "Column Names:": "foo" + "Storage Desc Params: serialization.format": "1" }, "name": "pokes", "tags": [] @@ -1515,7 +1948,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1532,7 +1966,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1552,7 +1987,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1572,7 +2008,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1587,7 +2024,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1602,7 +2040,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1619,7 +2058,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1634,7 +2074,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } } ] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/hive/hive_mces_golden.json b/metadata-ingestion/tests/integration/hive/hive_mces_golden.json index 08f281f398909b..e93924049f626c 100644 --- a/metadata-ingestion/tests/integration/hive/hive_mces_golden.json +++ b/metadata-ingestion/tests/integration/hive/hive_mces_golden.json @@ -16,7 +16,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, 
- "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -31,7 +32,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -46,7 +48,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -63,7 +66,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -78,7 +82,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -93,7 +98,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -111,7 +117,7 @@ "customProperties": { "Database:": "db1", "Owner:": "root", - "CreateTime:": "Wed Jul 05 17:38:26 UTC 2023", + "CreateTime:": "Thu Oct 19 13:26:12 UTC 2023", "LastAccessTime:": "UNKNOWN", "Retention:": "0", "Location:": "hdfs://namenode:8020/user/hive/warehouse/db1.db/_test_table_underscore", @@ -121,7 +127,7 @@ "Table Parameters: numRows": "0", "Table Parameters: rawDataSize": "0", "Table Parameters: totalSize": "0", - "Table Parameters: transient_lastDdlTime": "1688578706", + "Table Parameters: transient_lastDdlTime": "1697721972", "SerDe Library:": "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe", "InputFormat:": "org.apache.hadoop.mapred.TextInputFormat", "OutputFormat:": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat", @@ -187,7 +193,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -204,7 +211,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { 
@@ -224,7 +232,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -239,7 +248,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -257,17 +267,19 @@ "customProperties": { "Database:": "db1", "Owner:": "root", - "CreateTime:": "Wed Jul 05 17:38:26 UTC 2023", + "CreateTime:": "Thu Oct 19 13:26:12 UTC 2023", "LastAccessTime:": "UNKNOWN", "Retention:": "0", "Location:": "hdfs://namenode:8020/user/hive/warehouse/db1.db/array_struct_test", "Table Type:": "MANAGED_TABLE", "Table Parameters: COLUMN_STATS_ACCURATE": "{\\\"BASIC_STATS\\\":\\\"true\\\"}", + "Table Parameters: another.comment": "This table has no partitions", + "Table Parameters: comment": "This table has array of structs", "Table Parameters: numFiles": "1", "Table Parameters: numRows": "1", "Table Parameters: rawDataSize": "32", "Table Parameters: totalSize": "33", - "Table Parameters: transient_lastDdlTime": "1688578710", + "Table Parameters: transient_lastDdlTime": "1697721976", "SerDe Library:": "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe", "InputFormat:": "org.apache.hadoop.mapred.TextInputFormat", "OutputFormat:": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat", @@ -278,6 +290,7 @@ "Storage Desc Params: serialization.format": "1" }, "name": "array_struct_test", + "description": "This table has array of structs", "tags": [] } }, @@ -304,6 +317,7 @@ { "fieldPath": "property_id", "nullable": true, + "description": "id of property", "type": { "type": { "com.linkedin.pegasus2avro.schema.NumberType": {} @@ -316,6 +330,7 @@ { "fieldPath": "[version=2.0].[type=struct].[type=array].[type=struct].service", "nullable": true, + "description": "service types and providers", "type": { "type": { "com.linkedin.pegasus2avro.schema.ArrayType": { @@ -368,7 +383,8 @@ }, "systemMetadata": { "lastObserved": 
1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -385,7 +401,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -405,7 +422,189 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:hive,db1.array_struct_test_view,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:ded36d15fcfbbb939830549697122661" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "hive-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:hive,db1.array_struct_test_view,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "Database:": "db1", + "Owner:": "root", + "CreateTime:": "Thu Oct 19 13:26:18 UTC 2023", + "LastAccessTime:": "UNKNOWN", + "Retention:": "0", + "Table Type:": "VIRTUAL_VIEW", + "Table Parameters: transient_lastDdlTime": "1697721978", + "SerDe Library:": "null", + "InputFormat:": "org.apache.hadoop.mapred.TextInputFormat", + "OutputFormat:": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat", + "Compressed:": "No", + "Num Buckets:": "-1", + "Bucket Columns:": "[]", + "Sort Columns:": "[]", + "View Original Text:": "select * from db1.array_struct_test", + "View Expanded Text:": "select `array_struct_test`.`property_id`, `array_struct_test`.`service` from `db1`.`array_struct_test`", + "View Rewrite Enabled:": "No" + }, + "name": "array_struct_test_view", + "tags": [] + } + }, + { 
+ "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "db1.array_struct_test_view", + "platform": "urn:li:dataPlatform:hive", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "property_id", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "int", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=array].[type=struct].service", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.ArrayType": { + "nestedType": [ + "record" + ] + } + } + }, + "nativeDataType": "array>>", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"array>>\"}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=array].[type=struct].service.[type=string].type", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=array].[type=struct].service.[type=array].[type=int].provider", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.ArrayType": { + "nestedType": [ + "int" + ] + } + } + }, + "nativeDataType": "array", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"array\"}" + } + ] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "hive-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": 
"urn:li:dataset:(urn:li:dataPlatform:hive,db1.array_struct_test_view,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Table" + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "hive-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:hive,db1.array_struct_test_view,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:ded36d15fcfbbb939830549697122661", + "urn": "urn:li:container:ded36d15fcfbbb939830549697122661" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -420,7 +619,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -438,7 +638,7 @@ "customProperties": { "Database:": "db1", "Owner:": "root", - "CreateTime:": "Wed Jul 05 17:38:30 UTC 2023", + "CreateTime:": "Thu Oct 19 13:26:18 UTC 2023", "LastAccessTime:": "UNKNOWN", "Retention:": "0", "Location:": "hdfs://namenode:8020/user/hive/warehouse/db1.db/map_test", @@ -448,7 +648,7 @@ "Table Parameters: numRows": "0", "Table Parameters: rawDataSize": "0", "Table Parameters: totalSize": "0", - "Table Parameters: transient_lastDdlTime": "1688578710", + "Table Parameters: transient_lastDdlTime": "1697721978", "SerDe Library:": "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe", "InputFormat:": "org.apache.hadoop.mapred.TextInputFormat", "OutputFormat:": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat", @@ -518,7 +718,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -535,7 +736,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": 
"hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -555,7 +757,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -570,7 +773,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -588,7 +792,7 @@ "customProperties": { "Database:": "db1", "Owner:": "root", - "CreateTime:": "Wed Jul 05 17:38:30 UTC 2023", + "CreateTime:": "Thu Oct 19 13:26:18 UTC 2023", "LastAccessTime:": "UNKNOWN", "Retention:": "0", "Location:": "hdfs://namenode:8020/user/hive/warehouse/db1.db/nested_struct_test", @@ -598,7 +802,7 @@ "Table Parameters: numRows": "0", "Table Parameters: rawDataSize": "0", "Table Parameters: totalSize": "0", - "Table Parameters: transient_lastDdlTime": "1688578710", + "Table Parameters: transient_lastDdlTime": "1697721978", "SerDe Library:": "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe", "InputFormat:": "org.apache.hadoop.mapred.TextInputFormat", "OutputFormat:": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat", @@ -717,7 +921,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -734,7 +939,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -754,7 +960,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -769,7 +976,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -787,16 +995,17 @@ "customProperties": { "Database:": "db1", "Owner:": "root", - "CreateTime:": "Wed Jul 05 17:38:22 UTC 2023", + "CreateTime:": "Thu Oct 19 13:26:08 UTC 2023", 
"LastAccessTime:": "UNKNOWN", "Retention:": "0", "Location:": "hdfs://namenode:8020/user/hive/warehouse/db1.db/pokes", "Table Type:": "MANAGED_TABLE", "Table Parameters: numFiles": "1", + "Table Parameters: numPartitions": "1", "Table Parameters: numRows": "0", "Table Parameters: rawDataSize": "0", "Table Parameters: totalSize": "5812", - "Table Parameters: transient_lastDdlTime": "1688578704", + "Table Parameters: transient_lastDdlTime": "1697721968", "SerDe Library:": "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe", "InputFormat:": "org.apache.hadoop.mapred.TextInputFormat", "OutputFormat:": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat", @@ -853,6 +1062,18 @@ "nativeDataType": "string", "recursive": false, "isPartOfKey": false + }, + { + "fieldPath": "baz", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false } ] } @@ -862,7 +1083,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -879,7 +1101,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -899,7 +1122,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -914,7 +1138,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -932,7 +1157,7 @@ "customProperties": { "Database:": "db1", "Owner:": "root", - "CreateTime:": "Wed Jul 05 17:38:26 UTC 2023", + "CreateTime:": "Thu Oct 19 13:26:12 UTC 2023", "LastAccessTime:": "UNKNOWN", "Retention:": "0", "Location:": "hdfs://namenode:8020/user/hive/warehouse/db1.db/struct_test", @@ -942,7 +1167,7 @@ "Table Parameters: numRows": "0", 
"Table Parameters: rawDataSize": "0", "Table Parameters: totalSize": "0", - "Table Parameters: transient_lastDdlTime": "1688578706", + "Table Parameters: transient_lastDdlTime": "1697721972", "SerDe Library:": "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe", "InputFormat:": "org.apache.hadoop.mapred.TextInputFormat", "OutputFormat:": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat", @@ -1039,7 +1264,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1056,7 +1282,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1076,7 +1303,188 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:hive,db1.struct_test_view_materialized,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:ded36d15fcfbbb939830549697122661" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "hive-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:hive,db1.struct_test_view_materialized,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "Database:": "db1", + "Owner:": "root", + "CreateTime:": "Thu Oct 19 13:26:18 UTC 2023", + "LastAccessTime:": "UNKNOWN", + "Retention:": "0", + "Location:": "hdfs://namenode:8020/user/hive/warehouse/db1.db/struct_test_view_materialized", + "Table Type:": "MATERIALIZED_VIEW", + "Table Parameters: numFiles": "0", + 
"Table Parameters: totalSize": "0", + "Table Parameters: transient_lastDdlTime": "1697721978", + "SerDe Library:": "org.apache.hadoop.hive.ql.io.orc.OrcSerde", + "InputFormat:": "org.apache.hadoop.hive.ql.io.orc.OrcInputFormat", + "OutputFormat:": "org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat", + "Compressed:": "No", + "Num Buckets:": "-1", + "Bucket Columns:": "[]", + "Sort Columns:": "[]", + "View Original Text:": "select * from db1.struct_test", + "View Expanded Text:": "null", + "View Rewrite Enabled:": "No" + }, + "name": "struct_test_view_materialized", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "db1.struct_test_view_materialized", + "platform": "urn:li:dataPlatform:hive", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "property_id", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "int", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=struct].service", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.RecordType": {} + } + }, + "nativeDataType": "struct>", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"struct>\"}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=struct].service.[type=string].type", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + }, + { + "fieldPath": 
"[version=2.0].[type=struct].[type=struct].service.[type=array].[type=int].provider", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.ArrayType": { + "nestedType": [ + "int" + ] + } + } + }, + "nativeDataType": "array", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"array\"}" + } + ] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "hive-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:hive,db1.struct_test_view_materialized,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Table" + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "hive-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:hive,db1.struct_test_view_materialized,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:ded36d15fcfbbb939830549697122661", + "urn": "urn:li:container:ded36d15fcfbbb939830549697122661" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1091,7 +1499,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1109,7 +1518,7 @@ "customProperties": { "Database:": "db1", "Owner:": "root", - "CreateTime:": "Wed Jul 05 17:38:30 UTC 2023", + "CreateTime:": "Thu Oct 19 13:26:18 UTC 2023", "LastAccessTime:": "UNKNOWN", "Retention:": "0", "Location:": "hdfs://namenode:8020/user/hive/warehouse/db1.db/union_test", @@ -1119,10 +1528,10 @@ "Table Parameters: numRows": "0", "Table Parameters: rawDataSize": "0", "Table Parameters: totalSize": "0", - "Table Parameters: 
transient_lastDdlTime": "1688578710", - "SerDe Library:": "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe", - "InputFormat:": "org.apache.hadoop.mapred.TextInputFormat", - "OutputFormat:": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat", + "Table Parameters: transient_lastDdlTime": "1697721978", + "SerDe Library:": "org.apache.hadoop.hive.ql.io.orc.OrcSerde", + "InputFormat:": "org.apache.hadoop.hive.ql.io.orc.OrcInputFormat", + "OutputFormat:": "org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat", "Compressed:": "No", "Num Buckets:": "-1", "Bucket Columns:": "[]", @@ -1285,7 +1694,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1302,7 +1712,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1322,7 +1733,26 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:hive,db1.array_struct_test_view,PROD)", + "changeType": "UPSERT", + "aspectName": "viewProperties", + "aspect": { + "json": { + "materialized": false, + "viewLogic": "CREATE VIEW `db1.array_struct_test_view` AS select `array_struct_test`.`property_id`, `array_struct_test`.`service` from `db1`.`array_struct_test`", + "viewLanguage": "SQL" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } } ] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/hive/hive_setup.sql b/metadata-ingestion/tests/integration/hive/hive_setup.sql index 8fb8498894bc06..323a78e24d10b3 100644 --- a/metadata-ingestion/tests/integration/hive/hive_setup.sql +++ b/metadata-ingestion/tests/integration/hive/hive_setup.sql @@ -1,10 +1,10 @@ CREATE DATABASE 
IF NOT EXISTS db1; CREATE DATABASE IF NOT EXISTS db2; -- Setup a "pokes" example table. -CREATE TABLE IF NOT EXISTS db1.pokes (foo INT, bar STRING); -LOAD DATA LOCAL INPATH '/opt/hive/examples/files/kv1.txt' OVERWRITE INTO TABLE db1.pokes; +CREATE TABLE IF NOT EXISTS db1.pokes (foo INT, bar STRING) PARTITIONED BY (baz STRING); +LOAD DATA LOCAL INPATH '/opt/hive/examples/files/kv1.txt' OVERWRITE INTO TABLE db1.pokes PARTITION (baz='dummy'); -CREATE TABLE IF NOT EXISTS db2.pokes (foo INT, bar STRING, CONSTRAINT pk_1173723383_1683022998392_0 primary key(foo) DISABLE NOVALIDATE NORELY); +CREATE TABLE IF NOT EXISTS db2.pokes (foo INT, bar STRING); LOAD DATA LOCAL INPATH '/opt/hive/examples/files/kv1.txt' OVERWRITE INTO TABLE db2.pokes; -- Setup a table with a special character. @@ -23,12 +23,12 @@ CREATE TABLE IF NOT EXISTS db1.struct_test CREATE TABLE IF NOT EXISTS db1.array_struct_test ( - property_id INT, + property_id INT COMMENT 'id of property', service array - >> -); + >> COMMENT 'service types and providers' +) TBLPROPERTIES ('comment' = 'This table has array of structs', 'another.comment' = 'This table has no partitions');; WITH test_data as ( @@ -39,6 +39,9 @@ test_data as ( INSERT INTO TABLE db1.array_struct_test select * from test_data; +CREATE MATERIALIZED VIEW db1.struct_test_view_materialized as select * from db1.struct_test; +CREATE VIEW db1.array_struct_test_view as select * from db1.array_struct_test; + CREATE TABLE IF NOT EXISTS db1.nested_struct_test ( property_id INT, @@ -50,9 +53,6 @@ CREATE TABLE IF NOT EXISTS db1.nested_struct_test CREATE TABLE db1.union_test( foo UNIONTYPE, struct, struct> -); +) STORED AS ORC ; -CREATE TABLE db1.map_test( - KeyValue String, - RecordId map -); \ No newline at end of file +CREATE TABLE db1.map_test(KeyValue String, RecordId map); \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/mysql/mysql_mces_no_db_golden.json b/metadata-ingestion/tests/integration/mysql/mysql_mces_no_db_golden.json 
index 4aaefb48d33e15..38b03ce238d1c8 100644 --- a/metadata-ingestion/tests/integration/mysql/mysql_mces_no_db_golden.json +++ b/metadata-ingestion/tests/integration/mysql/mysql_mces_no_db_golden.json @@ -16,7 +16,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -31,7 +32,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -46,7 +48,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -63,7 +66,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -78,7 +82,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -93,7 +98,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -213,7 +219,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -230,7 +237,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -250,7 +258,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -265,7 +274,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -361,7 +371,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": 
"no-run-id-provided" } }, { @@ -378,7 +389,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -398,7 +410,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -554,7 +567,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -969,7 +983,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -989,7 +1004,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1004,7 +1020,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1019,7 +1036,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1036,7 +1054,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1053,7 +1072,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1068,7 +1088,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1083,7 +1104,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1215,7 +1237,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + 
"lastRunId": "no-run-id-provided" } }, { @@ -1232,7 +1255,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1249,7 +1273,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1269,7 +1294,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1284,7 +1310,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1418,7 +1445,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1435,7 +1463,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1452,7 +1481,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1472,7 +1502,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1487,7 +1518,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1586,7 +1618,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1603,7 +1636,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1620,7 +1654,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": 
"mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1637,7 +1672,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1657,7 +1693,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1677,7 +1714,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1692,7 +1730,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1707,7 +1746,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1724,7 +1764,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1739,7 +1780,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1754,7 +1796,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1874,7 +1917,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1891,7 +1935,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1911,7 +1956,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1926,7 +1972,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": 
"mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2022,7 +2069,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2039,7 +2087,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2059,7 +2108,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2182,7 +2232,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2233,7 +2284,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2253,7 +2305,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2268,7 +2321,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2283,7 +2337,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2300,7 +2355,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2315,7 +2371,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2330,7 +2387,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2390,7 +2448,8 @@ }, "systemMetadata": { "lastObserved": 
1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2407,7 +2466,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2427,7 +2487,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2442,7 +2503,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2502,7 +2564,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2519,7 +2582,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2539,7 +2603,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2568,7 +2633,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2597,7 +2663,79 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mysql,metagalaxy.metadata_index_view,PROD)", + "changeType": "UPSERT", + "aspectName": "upstreamLineage", + "aspect": { + "json": { + "upstreams": [ + { + "auditStamp": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:mysql,metagalaxy.metadata_index,PROD)", + "type": "VIEW" + } + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + 
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mysql,metagalaxy.metadata_index,PROD),doubleVal)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mysql,metagalaxy.metadata_index_view,PROD),doubleVal)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mysql,metagalaxy.metadata_index,PROD),id)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mysql,metagalaxy.metadata_index_view,PROD),id)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mysql,metagalaxy.metadata_index,PROD),path)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mysql,metagalaxy.metadata_index_view,PROD),path)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mysql,metagalaxy.metadata_index,PROD),urn)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mysql,metagalaxy.metadata_index_view,PROD),urn)" + ], + "confidenceScore": 1.0 + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } } ] \ No newline at end of file diff --git a/metadata-ingestion/tests/unit/api/source_helpers/test_incremental_lineage_helper.py b/metadata-ingestion/tests/unit/api/source_helpers/test_incremental_lineage_helper.py index 54a22d860285ce..e8485106c6a818 100644 --- a/metadata-ingestion/tests/unit/api/source_helpers/test_incremental_lineage_helper.py +++ b/metadata-ingestion/tests/unit/api/source_helpers/test_incremental_lineage_helper.py @@ -104,6 +104,27 @@ def test_incremental_table_lineage(tmp_path, 
pytestconfig): ) +def test_incremental_table_lineage_empty_upstreams(tmp_path, pytestconfig): + + urn = make_dataset_urn(platform, "dataset1") + aspect = make_lineage_aspect( + "dataset1", + upstreams=[], + ) + + processed_wus = auto_incremental_lineage( + graph=None, + incremental_lineage=True, + stream=[ + MetadataChangeProposalWrapper( + entityUrn=urn, aspect=aspect, systemMetadata=system_metadata + ).as_workunit() + ], + ) + + assert [wu.metadata for wu in processed_wus] == [] + + @pytest.mark.parametrize( "gms_aspect,current_aspect,output_aspect", [ From 93b7ba1a0acd24d8094493c027a7764d2cf001fd Mon Sep 17 00:00:00 2001 From: Mayuri N Date: Fri, 20 Oct 2023 19:16:56 +0530 Subject: [PATCH 2/7] revert incremental lineage helper refactor to source.py - primarily to reduce adverse effect on other sources, such as dbt which have their own flavour of incremental lineage implementation --- .../src/datahub/ingestion/api/source.py | 23 ------------------- .../ingestion/source/dbt/dbt_common.py | 5 ++++ .../source/snowflake/snowflake_lineage_v2.py | 1 - .../source/snowflake/snowflake_v2.py | 7 ++++++ .../ingestion/source/sql/sql_common.py | 7 ++++++ .../ingestion/source_config/sql/snowflake.py | 6 ++++- .../snowflake/test_snowflake_failures.py | 3 ++- 7 files changed, 26 insertions(+), 26 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/api/source.py b/metadata-ingestion/src/datahub/ingestion/api/source.py index f6309e82a92f1d..8940642f7008a7 100644 --- a/metadata-ingestion/src/datahub/ingestion/api/source.py +++ b/metadata-ingestion/src/datahub/ingestion/api/source.py @@ -26,7 +26,6 @@ from datahub.emitter.mcp_builder import mcps_from_mce from datahub.ingestion.api.closeable import Closeable from datahub.ingestion.api.common import PipelineContext, RecordEnvelope, WorkUnit -from datahub.ingestion.api.incremental_lineage_helper import auto_incremental_lineage from datahub.ingestion.api.report import Report from datahub.ingestion.api.source_helpers
import ( auto_browse_path_v2, @@ -217,34 +216,12 @@ def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: ): auto_lowercase_dataset_urns = auto_lowercase_urns - incremental_lineage_processor: Optional[MetadataWorkUnitProcessor] = None - if ( - self.ctx.pipeline_config - and self.ctx.pipeline_config.source - and self.ctx.pipeline_config.source.config - ): - incremental_lineage = ( - hasattr( - self.ctx.pipeline_config.source.config, - "incremental_lineage", - ) - and self.ctx.pipeline_config.source.config.incremental_lineage - ) or ( - hasattr(self.ctx.pipeline_config.source.config, "get") - and self.ctx.pipeline_config.source.config.get("incremental_lineage") - ) - incremental_lineage_processor = partial( - auto_incremental_lineage, - self.ctx.graph, - incremental_lineage, - ) return [ auto_lowercase_dataset_urns, auto_status_aspect, auto_materialize_referenced_tags, browse_path_processor, partial(auto_workunit_reporter, self.get_report()), - incremental_lineage_processor, ] @staticmethod diff --git a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py index 48d2118a9b0917..c4de24bf192f16 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py @@ -280,6 +280,11 @@ class DBTCommonConfig( default=False, description="When enabled, dbt test warnings will be treated as failures.", ) + # override fault value to True. + incremental_lineage: bool = Field( + default=True, + description="When enabled, emits lineage as incremental to existing lineage already in DataHub. 
When disabled, re-states lineage on each run.", + ) @validator("target_platform") def validate_target_platform_value(cls, target_platform: str) -> str: diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py index c9046b9a7aa65c..9649054dbe6cbb 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py @@ -136,7 +136,6 @@ def get_workunits( return self._populate_external_lineage_map(discovered_tables) - if self.config.include_view_lineage: if len(discovered_views) > 0: yield from self.get_view_upstream_workunits( diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py index 215116b4c33fb0..a5c07d9a3870c6 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py @@ -4,6 +4,7 @@ import os.path import platform from dataclasses import dataclass +from functools import partial from typing import Callable, Dict, Iterable, List, Optional, Union import pandas as pd @@ -26,6 +27,7 @@ platform_name, support_status, ) +from datahub.ingestion.api.incremental_lineage_helper import auto_incremental_lineage from datahub.ingestion.api.source import ( CapabilityReport, MetadataWorkUnitProcessor, @@ -511,6 +513,11 @@ def _init_schema_resolver(self) -> SchemaResolver: def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: return [ *super().get_workunit_processors(), + partial( + auto_incremental_lineage, + self.ctx.graph, + self.config.incremental_lineage, + ), StaleEntityRemovalHandler.create( self, self.config, self.ctx ).workunit_processor, diff --git 
a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py index 26908eb92cf6df..a75f612ab6aebf 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py @@ -2,6 +2,7 @@ import logging import traceback from dataclasses import dataclass, field +from functools import partial from typing import ( TYPE_CHECKING, Any, @@ -32,6 +33,7 @@ from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.emitter.sql_parsing_builder import SqlParsingBuilder from datahub.ingestion.api.common import PipelineContext +from datahub.ingestion.api.incremental_lineage_helper import auto_incremental_lineage from datahub.ingestion.api.source import MetadataWorkUnitProcessor from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.source.common.subtypes import ( @@ -483,6 +485,11 @@ def get_schema_level_workunits( def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: return [ *super().get_workunit_processors(), + partial( + auto_incremental_lineage, + self.ctx.graph, + self.config.incremental_lineage, + ), StaleEntityRemovalHandler.create( self, self.config, self.ctx ).workunit_processor, diff --git a/metadata-ingestion/src/datahub/ingestion/source_config/sql/snowflake.py b/metadata-ingestion/src/datahub/ingestion/source_config/sql/snowflake.py index 0d72fc52da0cab..6e0b55991a2e51 100644 --- a/metadata-ingestion/src/datahub/ingestion/source_config/sql/snowflake.py +++ b/metadata-ingestion/src/datahub/ingestion/source_config/sql/snowflake.py @@ -168,7 +168,11 @@ def _check_oauth_config(oauth_config: Optional[OAuthConfiguration]) -> None: @pydantic.validator("include_view_lineage") def validate_include_view_lineage(cls, v, values): - if not values.get("include_table_lineage") and v: + if ( + "include_table_lineage" in values + and not 
values.get("include_table_lineage") + and v + ): raise ValueError( "include_table_lineage must be True for include_view_lineage to be set." ) diff --git a/metadata-ingestion/tests/integration/snowflake/test_snowflake_failures.py b/metadata-ingestion/tests/integration/snowflake/test_snowflake_failures.py index cd53b8f7db4f64..4b0dd2b1045a37 100644 --- a/metadata-ingestion/tests/integration/snowflake/test_snowflake_failures.py +++ b/metadata-ingestion/tests/integration/snowflake/test_snowflake_failures.py @@ -287,8 +287,9 @@ def test_snowflake_unexpected_snowflake_view_lineage_error_causes_pipeline_warni SnowflakeV2Config, cast(PipelineConfig, snowflake_pipeline_config1).source.config, ) + config.include_table_lineage = True config.include_view_lineage = True - config.incremental_lineage = False + pipeline = Pipeline(snowflake_pipeline_config1) pipeline.run() pipeline.raise_from_status() # pipeline should not fail From 7c186978acc2096d9c9ddb9832c906a2585d5dd1 Mon Sep 17 00:00:00 2001 From: Mayuri N Date: Mon, 23 Oct 2023 13:42:39 +0530 Subject: [PATCH 3/7] fix tests, address review comments --- .../src/datahub/ingestion/source/sql/hive.py | 45 +++++++++---------- .../ingestion/source_config/sql/snowflake.py | 8 ++-- .../src/datahub/utilities/sqlglot_lineage.py | 7 +-- 3 files changed, 26 insertions(+), 34 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/hive.py b/metadata-ingestion/src/datahub/ingestion/source/sql/hive.py index 0c183b2f52f6c6..d081acb6c1effa 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/hive.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/hive.py @@ -8,7 +8,7 @@ # This import verifies that the dependencies are available. 
from pyhive import hive # noqa: F401 -from pyhive.sqlalchemy_hive import HiveDate, HiveDecimal, HiveTimestamp +from pyhive.sqlalchemy_hive import HiveDate, HiveDecimal, HiveDialect, HiveTimestamp from sqlalchemy.engine.reflection import Inspector from datahub.emitter.mce_builder import make_dataset_urn_with_platform_instance @@ -96,33 +96,28 @@ def dbapi_get_columns_patched(self, connection, table_name, schema=None, **kw): logger.warning(f"Failed to patch method due to {e}") -try: - from pyhive.sqlalchemy_hive import HiveDialect +@reflection.cache # type: ignore +def get_view_names_patched(self, connection, schema=None, **kw): + query = "SHOW VIEWS" + if schema: + query += " IN " + self.identifier_preparer.quote_identifier(schema) + return [row[0] for row in connection.execute(query)] - @reflection.cache # type: ignore - def get_view_names_patched(self, connection, schema=None, **kw): - query = "SHOW VIEWS" - if schema: - query += " IN " + self.identifier_preparer.quote_identifier(schema) - return [row[0] for row in connection.execute(query)] - @reflection.cache # type: ignore - def get_view_definition_patched(self, connection, view_name, schema=None, **kw): - full_table = self.identifier_preparer.quote_identifier(view_name) - if schema: - full_table = "{}.{}".format( - self.identifier_preparer.quote_identifier(schema), - self.identifier_preparer.quote_identifier(view_name), - ) - row = connection.execute("SHOW CREATE TABLE {}".format(full_table)).fetchone() - return row[0] +@reflection.cache # type: ignore +def get_view_definition_patched(self, connection, view_name, schema=None, **kw): + full_table = self.identifier_preparer.quote_identifier(view_name) + if schema: + full_table = "{}.{}".format( + self.identifier_preparer.quote_identifier(schema), + self.identifier_preparer.quote_identifier(view_name), + ) + row = connection.execute("SHOW CREATE TABLE {}".format(full_table)).fetchone() + return row[0] - HiveDialect.get_view_names = get_view_names_patched - 
HiveDialect.get_view_definition = get_view_definition_patched -except ModuleNotFoundError: - pass -except Exception as e: - logger.warning(f"Failed to patch method due to {e}") + +HiveDialect.get_view_names = get_view_names_patched +HiveDialect.get_view_definition = get_view_definition_patched class HiveConfig(TwoTierSQLAlchemyConfig): diff --git a/metadata-ingestion/src/datahub/ingestion/source_config/sql/snowflake.py b/metadata-ingestion/src/datahub/ingestion/source_config/sql/snowflake.py index 6e0b55991a2e51..c3e8c175f1de54 100644 --- a/metadata-ingestion/src/datahub/ingestion/source_config/sql/snowflake.py +++ b/metadata-ingestion/src/datahub/ingestion/source_config/sql/snowflake.py @@ -166,17 +166,17 @@ def _check_oauth_config(oauth_config: Optional[OAuthConfiguration]) -> None: "but should be set when using use_certificate false for oauth_config" ) - @pydantic.validator("include_view_lineage") - def validate_include_view_lineage(cls, v, values): + @pydantic.root_validator() + def validate_include_view_lineage(cls, values): if ( "include_table_lineage" in values and not values.get("include_table_lineage") - and v + and values.get("include_view_lineage") ): raise ValueError( "include_table_lineage must be True for include_view_lineage to be set." ) - return v + return values def get_sql_alchemy_url( self, diff --git a/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py b/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py index 322c236732102f..ec42124d774a28 100644 --- a/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py +++ b/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py @@ -585,12 +585,9 @@ def _schema_aware_fuzzy_column_resolve( statement = sqlglot.optimizer.annotate_types.annotate_types( statement, schema=sqlglot_db_schema ) - except sqlglot.errors.OptimizeError as e: + except (sqlglot.errors.OptimizeError, sqlglot.errors.ParseError) as e: # This is not a fatal error, so we can continue. 
- logger.debug("sqlglot failed to annotate types: %s", e) - except sqlglot.errors.ParseError as e: - # This is not a fatal error, so we can continue. - logger.debug("sqlglot failed to parse types: %s", e) + logger.debug("sqlglot failed to annotate or parse types: %s", e) column_lineage = [] From b74e7949271437fde38881be019650283d45ba6d Mon Sep 17 00:00:00 2001 From: Mayuri N Date: Wed, 25 Oct 2023 14:57:02 +0530 Subject: [PATCH 4/7] feat(ingest): support view lineage extraction for postgres, trino fallback to native postgres view lineage extraction for failed views --- .../datahub/ingestion/source/sql/postgres.py | 20 +- .../ingestion/source/sql/sql_common.py | 29 +- .../src/datahub/utilities/sqlglot_lineage.py | 16 + .../postgres_all_db_mces_with_db_golden.json | 324 ++++++++++++++---- ..._db_to_file_with_db_estimate_row_count.yml | 2 +- .../postgres_mces_with_db_golden.json | 264 ++++++++++++-- ...res_to_file_with_db_estimate_row_count.yml | 2 +- .../trino/trino_hive_mces_golden.json | 211 +++++++++--- 8 files changed, 703 insertions(+), 165 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/postgres.py b/metadata-ingestion/src/datahub/ingestion/source/sql/postgres.py index a6a9d8e2c8597c..4f133c6459a0ff 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/postgres.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/postgres.py @@ -103,10 +103,6 @@ class BasePostgresConfig(BasicSQLAlchemyConfig): class PostgresConfig(BasePostgresConfig): - include_view_lineage = Field( - default=False, description="Include table lineage for views" - ) - database_pattern: AllowDenyPattern = Field( default=AllowDenyPattern.allow_all(), description=( @@ -183,9 +179,10 @@ def get_inspectors(self) -> Iterable[Inspector]: def get_workunits_internal(self) -> Iterable[Union[MetadataWorkUnit, SqlWorkUnit]]: yield from super().get_workunits_internal() - for inspector in self.get_inspectors(): - if self.config.include_view_lineage: - yield 
from self._get_view_lineage_workunits(inspector) + if self.views_failed_parsing: + for inspector in self.get_inspectors(): + if self.config.include_view_lineage: + yield from self._get_view_lineage_workunits(inspector) def _get_view_lineage_elements( self, inspector: Inspector @@ -245,11 +242,14 @@ def _get_view_lineage_workunits( dependent_view, dependent_schema = key # Construct a lineage object. + view_identifier = self.get_identifier( + schema=dependent_schema, entity=dependent_view, inspector=inspector + ) + if view_identifier not in self.views_failed_parsing: + return urn = mce_builder.make_dataset_urn_with_platform_instance( platform=self.platform, - name=self.get_identifier( - schema=dependent_schema, entity=dependent_view, inspector=inspector - ), + name=view_identifier, platform_instance=self.config.platform_instance, env=self.config.env, ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py index 4ee9b2fada34b0..51909eaf4ed550 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py @@ -98,6 +98,7 @@ SchemaResolver, SqlParsingResult, sqlglot_lineage, + view_definition_lineage_helper, ) if TYPE_CHECKING: @@ -334,7 +335,7 @@ def __init__(self, config: SQLCommonConfig, ctx: PipelineContext, platform: str) cached_domains=[k for k in self.config.domain], graph=self.ctx.graph ) - self.views_processed: Set[str] = set() + self.views_failed_parsing: Set[str] = set() self.schema_resolver: SchemaResolver = SchemaResolver( platform=self.platform, platform_instance=self.config.platform_instance, @@ -569,6 +570,8 @@ def get_view_lineage(self) -> Iterable[MetadataWorkUnit]: is_view_ddl=True, include_column_lineage=self.config.include_view_column_lineage, ) + else: + self.views_failed_parsing.add(dataset_name) yield from builder.gen_workunits() def get_identifier( @@ -1009,12 +1012,12 
@@ def _process_view( ) def _run_sql_parser( - self, dataset_identifier: str, query: str, schema_resolver: SchemaResolver + self, view_identifier: str, query: str, schema_resolver: SchemaResolver ) -> Optional[SqlParsingResult]: try: - database, schema = self.get_db_schema(dataset_identifier) + database, schema = self.get_db_schema(view_identifier) except ValueError: - logger.warning(f"Invalid view identifier: {dataset_identifier}") + logger.warning(f"Invalid view identifier: {view_identifier}") return None raw_lineage = sqlglot_lineage( query, @@ -1022,18 +1025,32 @@ def _run_sql_parser( default_db=database, default_schema=schema, ) + view_urn = make_dataset_urn_with_platform_instance( + self.platform, + view_identifier, + self.config.platform_instance, + self.config.env, + ) + if raw_lineage.debug_info.table_error: logger.debug( - f"Failed to parse lineage for view {dataset_identifier}: " + f"Failed to parse lineage for view {view_identifier}: " f"{raw_lineage.debug_info.table_error}" ) self.report.num_view_definitions_failed_parsing += 1 + self.report.view_definitions_parsing_failures.append( + f"Table-level sql parsing error for view {view_identifier}: {raw_lineage.debug_info.table_error}" + ) return None + elif raw_lineage.debug_info.column_error: self.report.num_view_definitions_failed_column_parsing += 1 + self.report.view_definitions_parsing_failures.append( + f"Column-level sql parsing error for view {view_identifier}: {raw_lineage.debug_info.column_error}" + ) else: self.report.num_view_definitions_parsed += 1 - return raw_lineage + return view_definition_lineage_helper(raw_lineage, view_urn) def get_db_schema(self, dataset_identifier: str) -> Tuple[Optional[str], str]: database, schema, _view = dataset_identifier.split(".") diff --git a/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py b/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py index 389b818e11ba05..7f7beaac0a72b5 100644 --- 
a/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py +++ b/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py @@ -1156,3 +1156,19 @@ def create_lineage_sql_parsed_result( finally: if needs_close: schema_resolver.close() + + +def view_definition_lineage_helper( + result: SqlParsingResult, view_urn: str +) -> SqlParsingResult: + if result.query_type is QueryType.SELECT: + # Some platforms (e.g. postgres) store only <select statement> from view definition + # `create view V as <select statement>` . For such view definitions, `result.out_tables` and + # `result.column_lineage[].downstream` are empty in `sqlglot_lineage` response, whereas + # Here, we inject view V's urn in `result.out_tables` and `result.column_lineage[].downstream` + # to get complete lineage result. + result.out_tables = [view_urn] + if result.column_lineage: + for col_result in result.column_lineage: + col_result.downstream.table = view_urn + return result diff --git a/metadata-ingestion/tests/integration/postgres/postgres_all_db_mces_with_db_golden.json b/metadata-ingestion/tests/integration/postgres/postgres_all_db_mces_with_db_golden.json index 535ce964c6058f..b9b2a3b2141a8c 100644 --- a/metadata-ingestion/tests/integration/postgres/postgres_all_db_mces_with_db_golden.json +++ b/metadata-ingestion/tests/integration/postgres/postgres_all_db_mces_with_db_golden.json @@ -16,7 +16,8 @@ }, "systemMetadata": { "lastObserved": 1646575200000, - "runId": "postgres-test" + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" } }, { @@ -31,7 +32,8 @@ }, "systemMetadata": { "lastObserved": 1646575200000, - "runId": "postgres-test" + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" } }, { @@ -46,7 +48,8 @@ }, "systemMetadata": { "lastObserved": 1646575200000, - "runId": "postgres-test" + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" } }, { @@ -63,7 +66,8 @@ }, "systemMetadata": { "lastObserved": 1646575200000, - "runId": "postgres-test" + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" } }, { @@ -78,7 +82,8 @@ },
"systemMetadata": { "lastObserved": 1646575200000, - "runId": "postgres-test" + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" } }, { @@ -99,7 +104,8 @@ }, "systemMetadata": { "lastObserved": 1646575200000, - "runId": "postgres-test" + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" } }, { @@ -114,7 +120,8 @@ }, "systemMetadata": { "lastObserved": 1646575200000, - "runId": "postgres-test" + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" } }, { @@ -129,7 +136,8 @@ }, "systemMetadata": { "lastObserved": 1646575200000, - "runId": "postgres-test" + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" } }, { @@ -146,7 +154,8 @@ }, "systemMetadata": { "lastObserved": 1646575200000, - "runId": "postgres-test" + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" } }, { @@ -161,7 +170,8 @@ }, "systemMetadata": { "lastObserved": 1646575200000, - "runId": "postgres-test" + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" } }, { @@ -181,7 +191,8 @@ }, "systemMetadata": { "lastObserved": 1646575200000, - "runId": "postgres-test" + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" } }, { @@ -201,7 +212,8 @@ }, "systemMetadata": { "lastObserved": 1646575200000, - "runId": "postgres-test" + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" } }, { @@ -216,7 +228,8 @@ }, "systemMetadata": { "lastObserved": 1646575200000, - "runId": "postgres-test" + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" } }, { @@ -231,7 +244,8 @@ }, "systemMetadata": { "lastObserved": 1646575200000, - "runId": "postgres-test" + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" } }, { @@ -248,7 +262,8 @@ }, "systemMetadata": { "lastObserved": 1646575200000, - "runId": "postgres-test" + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" } }, { @@ -263,7 +278,8 @@ }, "systemMetadata": { "lastObserved": 1646575200000, - "runId": "postgres-test" + "runId": "postgres-test", + 
"lastRunId": "no-run-id-provided" } }, { @@ -284,7 +300,8 @@ }, "systemMetadata": { "lastObserved": 1646575200000, - "runId": "postgres-test" + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" } }, { @@ -299,7 +316,8 @@ }, "systemMetadata": { "lastObserved": 1646575200000, - "runId": "postgres-test" + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" } }, { @@ -314,7 +332,8 @@ }, "systemMetadata": { "lastObserved": 1646575200000, - "runId": "postgres-test" + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" } }, { @@ -331,7 +350,8 @@ }, "systemMetadata": { "lastObserved": 1646575200000, - "runId": "postgres-test" + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" } }, { @@ -346,7 +366,8 @@ }, "systemMetadata": { "lastObserved": 1646575200000, - "runId": "postgres-test" + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" } }, { @@ -366,7 +387,8 @@ }, "systemMetadata": { "lastObserved": 1646575200000, - "runId": "postgres-test" + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" } }, { @@ -381,7 +403,8 @@ }, "systemMetadata": { "lastObserved": 1646575200000, - "runId": "postgres-test" + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" } }, { @@ -537,7 +560,8 @@ }, "systemMetadata": { "lastObserved": 1646575200000, - "runId": "postgres-test" + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" } }, { @@ -554,7 +578,186 @@ }, "systemMetadata": { "lastObserved": 1646575200000, - "runId": "postgres-test" + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,postgrestest.public.metadata_aspect_v2,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:a6097853edba03be190d99ece4b307ff", + "urn": "urn:li:container:a6097853edba03be190d99ece4b307ff" + }, + { + "id": 
"urn:li:container:51904fc8cd5cc729bc630decff284525", + "urn": "urn:li:container:51904fc8cd5cc729bc630decff284525" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1646575200000, + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,postgrestest.public.metadata_aspect_view,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:51904fc8cd5cc729bc630decff284525" + } + }, + "systemMetadata": { + "lastObserved": 1646575200000, + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:postgres,postgrestest.public.metadata_aspect_view,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "view_definition": " SELECT metadata_aspect_v2.urn,\n metadata_aspect_v2.aspect\n FROM metadata_aspect_v2\n WHERE (metadata_aspect_v2.version = 0);", + "is_view": "True" + }, + "name": "metadata_aspect_view", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "postgrestest.public.metadata_aspect_view", + "platform": "urn:li:dataPlatform:postgres", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "urn", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR(length=500)", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": 
"aspect", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR(length=200)", + "recursive": false, + "isPartOfKey": false + } + ] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1646575200000, + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,postgrestest.public.metadata_aspect_view,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "View" + ] + } + }, + "systemMetadata": { + "lastObserved": 1646575200000, + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,postgrestest.public.metadata_aspect_view,PROD)", + "changeType": "UPSERT", + "aspectName": "viewProperties", + "aspect": { + "json": { + "materialized": false, + "viewLogic": " SELECT metadata_aspect_v2.urn,\n metadata_aspect_v2.aspect\n FROM metadata_aspect_v2\n WHERE (metadata_aspect_v2.version = 0);", + "viewLanguage": "SQL" + } + }, + "systemMetadata": { + "lastObserved": 1646575200000, + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,postgrestest.public.metadata_aspect_view,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:a6097853edba03be190d99ece4b307ff", + "urn": "urn:li:container:a6097853edba03be190d99ece4b307ff" + }, + { + "id": "urn:li:container:51904fc8cd5cc729bc630decff284525", + "urn": "urn:li:container:51904fc8cd5cc729bc630decff284525" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1646575200000, + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" } }, { @@ -634,31 +837,8 @@ }, "systemMetadata": { "lastObserved": 
1646575200000, - "runId": "postgres-test" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,postgrestest.public.metadata_aspect_v2,PROD)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:container:a6097853edba03be190d99ece4b307ff", - "urn": "urn:li:container:a6097853edba03be190d99ece4b307ff" - }, - { - "id": "urn:li:container:51904fc8cd5cc729bc630decff284525", - "urn": "urn:li:container:51904fc8cd5cc729bc630decff284525" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1646575200000, - "runId": "postgres-test" + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" } }, { @@ -675,29 +855,39 @@ "actor": "urn:li:corpuser:unknown" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:postgres,postgrestest.public.metadata_aspect_v2,PROD)", - "type": "TRANSFORMED" + "type": "VIEW" + } + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,postgrestest.public.metadata_aspect_v2,PROD),aspect)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,postgrestest.public.metadata_aspect_view,PROD),aspect)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,postgrestest.public.metadata_aspect_v2,PROD),urn)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,postgrestest.public.metadata_aspect_view,PROD),urn)" + ], + "confidenceScore": 1.0 } ] } }, "systemMetadata": { "lastObserved": 1646575200000, - "runId": "postgres-test" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,postgrestest.public.metadata_aspect_view,PROD)", - "changeType": "UPSERT", - "aspectName": 
"status", - "aspect": { - "json": { - "removed": false - } - }, - "systemMetadata": { - "lastObserved": 1646575200000, - "runId": "postgres-test" + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" } } ] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/postgres/postgres_all_db_to_file_with_db_estimate_row_count.yml b/metadata-ingestion/tests/integration/postgres/postgres_all_db_to_file_with_db_estimate_row_count.yml index b390d9246677e9..2bfa39a65363b5 100644 --- a/metadata-ingestion/tests/integration/postgres/postgres_all_db_to_file_with_db_estimate_row_count.yml +++ b/metadata-ingestion/tests/integration/postgres/postgres_all_db_to_file_with_db_estimate_row_count.yml @@ -25,7 +25,7 @@ source: include_field_distinct_value_frequencies: false include_field_histogram: false catch_exceptions: true - include_views: false + include_views: true sink: type: file config: diff --git a/metadata-ingestion/tests/integration/postgres/postgres_mces_with_db_golden.json b/metadata-ingestion/tests/integration/postgres/postgres_mces_with_db_golden.json index bf36a39a8c103f..f6fa0a0ed032ef 100644 --- a/metadata-ingestion/tests/integration/postgres/postgres_mces_with_db_golden.json +++ b/metadata-ingestion/tests/integration/postgres/postgres_mces_with_db_golden.json @@ -16,7 +16,8 @@ }, "systemMetadata": { "lastObserved": 1646575200000, - "runId": "postgres-test" + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" } }, { @@ -31,7 +32,8 @@ }, "systemMetadata": { "lastObserved": 1646575200000, - "runId": "postgres-test" + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" } }, { @@ -46,7 +48,8 @@ }, "systemMetadata": { "lastObserved": 1646575200000, - "runId": "postgres-test" + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" } }, { @@ -63,7 +66,8 @@ }, "systemMetadata": { "lastObserved": 1646575200000, - "runId": "postgres-test" + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" } }, { @@ -78,7 
+82,8 @@ }, "systemMetadata": { "lastObserved": 1646575200000, - "runId": "postgres-test" + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" } }, { @@ -99,7 +104,8 @@ }, "systemMetadata": { "lastObserved": 1646575200000, - "runId": "postgres-test" + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" } }, { @@ -114,7 +120,8 @@ }, "systemMetadata": { "lastObserved": 1646575200000, - "runId": "postgres-test" + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" } }, { @@ -129,7 +136,8 @@ }, "systemMetadata": { "lastObserved": 1646575200000, - "runId": "postgres-test" + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" } }, { @@ -146,7 +154,8 @@ }, "systemMetadata": { "lastObserved": 1646575200000, - "runId": "postgres-test" + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" } }, { @@ -161,7 +170,8 @@ }, "systemMetadata": { "lastObserved": 1646575200000, - "runId": "postgres-test" + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" } }, { @@ -181,7 +191,8 @@ }, "systemMetadata": { "lastObserved": 1646575200000, - "runId": "postgres-test" + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" } }, { @@ -196,7 +207,8 @@ }, "systemMetadata": { "lastObserved": 1646575200000, - "runId": "postgres-test" + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" } }, { @@ -352,7 +364,8 @@ }, "systemMetadata": { "lastObserved": 1646575200000, - "runId": "postgres-test" + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" } }, { @@ -369,7 +382,186 @@ }, "systemMetadata": { "lastObserved": 1646575200000, - "runId": "postgres-test" + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,postgrestest.public.metadata_aspect_v2,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": 
"urn:li:container:a6097853edba03be190d99ece4b307ff", + "urn": "urn:li:container:a6097853edba03be190d99ece4b307ff" + }, + { + "id": "urn:li:container:51904fc8cd5cc729bc630decff284525", + "urn": "urn:li:container:51904fc8cd5cc729bc630decff284525" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1646575200000, + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,postgrestest.public.metadata_aspect_view,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:51904fc8cd5cc729bc630decff284525" + } + }, + "systemMetadata": { + "lastObserved": 1646575200000, + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:postgres,postgrestest.public.metadata_aspect_view,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "view_definition": " SELECT metadata_aspect_v2.urn,\n metadata_aspect_v2.aspect\n FROM metadata_aspect_v2\n WHERE (metadata_aspect_v2.version = 0);", + "is_view": "True" + }, + "name": "metadata_aspect_view", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "postgrestest.public.metadata_aspect_view", + "platform": "urn:li:dataPlatform:postgres", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "urn", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": 
{} + } + }, + "nativeDataType": "VARCHAR(length=500)", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "aspect", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR(length=200)", + "recursive": false, + "isPartOfKey": false + } + ] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1646575200000, + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,postgrestest.public.metadata_aspect_view,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "View" + ] + } + }, + "systemMetadata": { + "lastObserved": 1646575200000, + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,postgrestest.public.metadata_aspect_view,PROD)", + "changeType": "UPSERT", + "aspectName": "viewProperties", + "aspect": { + "json": { + "materialized": false, + "viewLogic": " SELECT metadata_aspect_v2.urn,\n metadata_aspect_v2.aspect\n FROM metadata_aspect_v2\n WHERE (metadata_aspect_v2.version = 0);", + "viewLanguage": "SQL" + } + }, + "systemMetadata": { + "lastObserved": 1646575200000, + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,postgrestest.public.metadata_aspect_view,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:a6097853edba03be190d99ece4b307ff", + "urn": "urn:li:container:a6097853edba03be190d99ece4b307ff" + }, + { + "id": "urn:li:container:51904fc8cd5cc729bc630decff284525", + "urn": "urn:li:container:51904fc8cd5cc729bc630decff284525" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1646575200000, + 
"runId": "postgres-test", + "lastRunId": "no-run-id-provided" } }, { @@ -391,31 +583,57 @@ }, "systemMetadata": { "lastObserved": 1646575200000, - "runId": "postgres-test" + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,postgrestest.public.metadata_aspect_v2,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,postgrestest.public.metadata_aspect_view,PROD)", "changeType": "UPSERT", - "aspectName": "browsePathsV2", + "aspectName": "upstreamLineage", "aspect": { "json": { - "path": [ + "upstreams": [ { - "id": "urn:li:container:a6097853edba03be190d99ece4b307ff", - "urn": "urn:li:container:a6097853edba03be190d99ece4b307ff" + "auditStamp": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:postgres,postgrestest.public.metadata_aspect_v2,PROD)", + "type": "VIEW" + } + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,postgrestest.public.metadata_aspect_v2,PROD),aspect)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,postgrestest.public.metadata_aspect_view,PROD),aspect)" + ], + "confidenceScore": 1.0 }, { - "id": "urn:li:container:51904fc8cd5cc729bc630decff284525", - "urn": "urn:li:container:51904fc8cd5cc729bc630decff284525" + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,postgrestest.public.metadata_aspect_v2,PROD),urn)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,postgrestest.public.metadata_aspect_view,PROD),urn)" + ], + "confidenceScore": 1.0 } ] } }, "systemMetadata": { "lastObserved": 1646575200000, - "runId": "postgres-test" + "runId": "postgres-test", + "lastRunId": 
"no-run-id-provided" } } ] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/postgres/postgres_to_file_with_db_estimate_row_count.yml b/metadata-ingestion/tests/integration/postgres/postgres_to_file_with_db_estimate_row_count.yml index a489877d52a23f..4a2cc543f2d011 100644 --- a/metadata-ingestion/tests/integration/postgres/postgres_to_file_with_db_estimate_row_count.yml +++ b/metadata-ingestion/tests/integration/postgres/postgres_to_file_with_db_estimate_row_count.yml @@ -13,7 +13,7 @@ source: profile_table_row_count_estimate_only: true turn_off_expensive_profiling_metrics: true catch_exceptions: true - include_views: false + include_views: true sink: type: file config: diff --git a/metadata-ingestion/tests/integration/trino/trino_hive_mces_golden.json b/metadata-ingestion/tests/integration/trino/trino_hive_mces_golden.json index 19961e48b4a336..c43223c68a6b64 100644 --- a/metadata-ingestion/tests/integration/trino/trino_hive_mces_golden.json +++ b/metadata-ingestion/tests/integration/trino/trino_hive_mces_golden.json @@ -16,7 +16,8 @@ }, "systemMetadata": { "lastObserved": 1632398400000, - "runId": "trino-hive-test" + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -31,7 +32,8 @@ }, "systemMetadata": { "lastObserved": 1632398400000, - "runId": "trino-hive-test" + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -46,7 +48,8 @@ }, "systemMetadata": { "lastObserved": 1632398400000, - "runId": "trino-hive-test" + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -63,7 +66,8 @@ }, "systemMetadata": { "lastObserved": 1632398400000, - "runId": "trino-hive-test" + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -78,7 +82,8 @@ }, "systemMetadata": { "lastObserved": 1632398400000, - "runId": "trino-hive-test" + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -99,7 +104,8 @@ }, "systemMetadata": { "lastObserved": 
1632398400000, - "runId": "trino-hive-test" + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -114,7 +120,8 @@ }, "systemMetadata": { "lastObserved": 1632398400000, - "runId": "trino-hive-test" + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -129,7 +136,8 @@ }, "systemMetadata": { "lastObserved": 1632398400000, - "runId": "trino-hive-test" + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -146,7 +154,8 @@ }, "systemMetadata": { "lastObserved": 1632398400000, - "runId": "trino-hive-test" + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -161,7 +170,8 @@ }, "systemMetadata": { "lastObserved": 1632398400000, - "runId": "trino-hive-test" + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -181,7 +191,8 @@ }, "systemMetadata": { "lastObserved": 1632398400000, - "runId": "trino-hive-test" + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -196,7 +207,8 @@ }, "systemMetadata": { "lastObserved": 1632398400000, - "runId": "trino-hive-test" + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -219,7 +231,7 @@ "numrows": "1", "rawdatasize": "32", "totalsize": "33", - "transient_lastddltime": "1688422059" + "transient_lastddltime": "1698223433" }, "name": "array_struct_test", "description": "This table has array of structs", @@ -315,7 +327,8 @@ }, "systemMetadata": { "lastObserved": 1632398400000, - "runId": "trino-hive-test" + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -332,7 +345,8 @@ }, "systemMetadata": { "lastObserved": 1632398400000, - "runId": "trino-hive-test" + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -356,7 +370,8 @@ }, "systemMetadata": { "lastObserved": 1632398400000, - "runId": "trino-hive-test" + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -371,7 +386,8 @@ }, "systemMetadata": { "lastObserved": 
1632398400000, - "runId": "trino-hive-test" + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -392,7 +408,7 @@ "numrows": "0", "rawdatasize": "0", "totalsize": "0", - "transient_lastddltime": "1688422063" + "transient_lastddltime": "1698223435" }, "name": "map_test", "tags": [] @@ -454,7 +470,8 @@ }, "systemMetadata": { "lastObserved": 1632398400000, - "runId": "trino-hive-test" + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -471,7 +488,8 @@ }, "systemMetadata": { "lastObserved": 1632398400000, - "runId": "trino-hive-test" + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -495,7 +513,8 @@ }, "systemMetadata": { "lastObserved": 1632398400000, - "runId": "trino-hive-test" + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -510,7 +529,8 @@ }, "systemMetadata": { "lastObserved": 1632398400000, - "runId": "trino-hive-test" + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -531,7 +551,7 @@ "numrows": "0", "rawdatasize": "0", "totalsize": "0", - "transient_lastddltime": "1688422062" + "transient_lastddltime": "1698223435" }, "name": "nested_struct_test", "tags": [] @@ -642,7 +662,8 @@ }, "systemMetadata": { "lastObserved": 1632398400000, - "runId": "trino-hive-test" + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -659,7 +680,8 @@ }, "systemMetadata": { "lastObserved": 1632398400000, - "runId": "trino-hive-test" + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -683,7 +705,8 @@ }, "systemMetadata": { "lastObserved": 1632398400000, - "runId": "trino-hive-test" + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -698,7 +721,8 @@ }, "systemMetadata": { "lastObserved": 1632398400000, - "runId": "trino-hive-test" + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -714,7 +738,7 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { 
"customProperties": { - "transient_lastddltime": "1688421792" + "transient_lastddltime": "1698223429" }, "name": "pokes", "tags": [] @@ -784,7 +808,8 @@ }, "systemMetadata": { "lastObserved": 1632398400000, - "runId": "trino-hive-test" + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -801,7 +826,8 @@ }, "systemMetadata": { "lastObserved": 1632398400000, - "runId": "trino-hive-test" + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -825,7 +851,8 @@ }, "systemMetadata": { "lastObserved": 1632398400000, - "runId": "trino-hive-test" + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -840,7 +867,8 @@ }, "systemMetadata": { "lastObserved": 1632398400000, - "runId": "trino-hive-test" + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -861,7 +889,7 @@ "numrows": "0", "rawdatasize": "0", "totalsize": "0", - "transient_lastddltime": "1688421808" + "transient_lastddltime": "1698223431" }, "name": "struct_test", "tags": [] @@ -950,7 +978,8 @@ }, "systemMetadata": { "lastObserved": 1632398400000, - "runId": "trino-hive-test" + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -967,7 +996,8 @@ }, "systemMetadata": { "lastObserved": 1632398400000, - "runId": "trino-hive-test" + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -991,7 +1021,8 @@ }, "systemMetadata": { "lastObserved": 1632398400000, - "runId": "trino-hive-test" + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1006,7 +1037,8 @@ }, "systemMetadata": { "lastObserved": 1632398400000, - "runId": "trino-hive-test" + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1024,7 +1056,7 @@ "customProperties": { "numfiles": "0", "totalsize": "0", - "transient_lastddltime": "1688422062" + "transient_lastddltime": "1698223435" }, "name": "struct_test_view_materialized", "tags": [] @@ -1113,7 +1145,8 @@ }, "systemMetadata": { 
"lastObserved": 1632398400000, - "runId": "trino-hive-test" + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1130,7 +1163,8 @@ }, "systemMetadata": { "lastObserved": 1632398400000, - "runId": "trino-hive-test" + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1154,7 +1188,8 @@ }, "systemMetadata": { "lastObserved": 1632398400000, - "runId": "trino-hive-test" + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1169,7 +1204,8 @@ }, "systemMetadata": { "lastObserved": 1632398400000, - "runId": "trino-hive-test" + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1190,7 +1226,7 @@ "numrows": "0", "rawdatasize": "0", "totalsize": "0", - "transient_lastddltime": "1688421807" + "transient_lastddltime": "1698223431" }, "name": "_test_table_underscore", "tags": [] @@ -1248,7 +1284,8 @@ }, "systemMetadata": { "lastObserved": 1632398400000, - "runId": "trino-hive-test" + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1265,7 +1302,8 @@ }, "systemMetadata": { "lastObserved": 1632398400000, - "runId": "trino-hive-test" + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1289,7 +1327,8 @@ }, "systemMetadata": { "lastObserved": 1632398400000, - "runId": "trino-hive-test" + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1304,7 +1343,8 @@ }, "systemMetadata": { "lastObserved": 1632398400000, - "runId": "trino-hive-test" + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1325,7 +1365,7 @@ "numrows": "0", "rawdatasize": "0", "totalsize": "0", - "transient_lastddltime": "1688422062" + "transient_lastddltime": "1698223435" }, "name": "union_test", "tags": [] @@ -1467,7 +1507,8 @@ }, "systemMetadata": { "lastObserved": 1632398400000, - "runId": "trino-hive-test" + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1484,7 +1525,8 @@ }, "systemMetadata": { 
"lastObserved": 1632398400000, - "runId": "trino-hive-test" + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1508,7 +1550,8 @@ }, "systemMetadata": { "lastObserved": 1632398400000, - "runId": "trino-hive-test" + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1523,7 +1566,8 @@ }, "systemMetadata": { "lastObserved": 1632398400000, - "runId": "trino-hive-test" + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1539,7 +1583,7 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { - "transient_lastddltime": "1688422062", + "transient_lastddltime": "1698223435", "view_definition": "SELECT \"property_id\", \"service\"\nFROM \"db1\".\"array_struct_test\"", "is_view": "True" }, @@ -1634,7 +1678,8 @@ }, "systemMetadata": { "lastObserved": 1632398400000, - "runId": "trino-hive-test" + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1651,7 +1696,8 @@ }, "systemMetadata": { "lastObserved": 1632398400000, - "runId": "trino-hive-test" + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1668,7 +1714,57 @@ }, "systemMetadata": { "lastObserved": 1632398400000, - "runId": "trino-hive-test" + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:trino,hivedb.db1.array_struct_test_view,PROD)", + "changeType": "UPSERT", + "aspectName": "upstreamLineage", + "aspect": { + "json": { + "upstreams": [ + { + "auditStamp": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:trino,hivedb.db1.array_struct_test,PROD)", + "type": "VIEW" + } + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:trino,hivedb.db1.array_struct_test,PROD),property_id)" + ], + "downstreamType": "FIELD", + "downstreams": 
[ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:trino,hivedb.db1.array_struct_test_view,PROD),property_id)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:trino,hivedb.db1.array_struct_test,PROD),service)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:trino,hivedb.db1.array_struct_test_view,PROD),service)" + ], + "confidenceScore": 1.0 + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1632398400000, + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1692,7 +1788,8 @@ }, "systemMetadata": { "lastObserved": 1632398400000, - "runId": "trino-hive-test" + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" } } ] \ No newline at end of file From 337ca34718ae19c52d3d0b34f7769e8568e1b304 Mon Sep 17 00:00:00 2001 From: Mayuri N Date: Thu, 26 Oct 2023 09:25:00 +0530 Subject: [PATCH 5/7] fix tests --- metadata-ingestion/setup.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index 417588a4336555..30d13401923f15 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -101,6 +101,11 @@ "grpcio-tools>=1.44.0,<2", } +usage_common = { + "sqlparse", +} + + sql_common = { # Required for all SQL sources. 
# This is temporary lower bound that we're open to loosening/tightening as requirements show up @@ -115,7 +120,7 @@ # https://github.com/ipython/traitlets/issues/741 "traitlets<5.2.2", "greenlet", -} +} | usage_common sqllineage_lib = { "sqllineage==1.3.8", @@ -243,10 +248,6 @@ powerbi_report_server = {"requests", "requests_ntlm"} -usage_common = { - "sqlparse", -} - databricks = { # 0.1.11 appears to have authentication issues with azure databricks "databricks-sdk>=0.9.0", From 1fe4b02122287ff61ccc6c8f5b59f9e18e429bc5 Mon Sep 17 00:00:00 2001 From: Mayuri N Date: Thu, 26 Oct 2023 12:14:54 +0530 Subject: [PATCH 6/7] add sqlglot dependency --- metadata-ingestion/setup.py | 45 ++++++++++++++++++++----------------- 1 file changed, 24 insertions(+), 21 deletions(-) diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index 1bfa7d1f801dbd..7f7826abe20952 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -105,22 +105,31 @@ "sqlparse", } +sqlglot_lib = { + # Using an Acryl fork of sqlglot. + # https://github.com/tobymao/sqlglot/compare/main...hsheth2:sqlglot:hsheth?expand=1 + "acryl-sqlglot==18.5.2.dev45", +} -sql_common = { - # Required for all SQL sources. - # This is temporary lower bound that we're open to loosening/tightening as requirements show up - "sqlalchemy>=1.4.39, <2", - # Required for SQL profiling. - "great-expectations>=0.15.12, <=0.15.50", - # scipy version restricted to reduce backtracking, used by great-expectations, - "scipy>=1.7.2", - # GE added handling for higher version of jinja2 - # https://github.com/great-expectations/great_expectations/pull/5382/files - # datahub does not depend on traitlets directly but great expectations does. - # https://github.com/ipython/traitlets/issues/741 - "traitlets<5.2.2", - "greenlet", -} | usage_common +sql_common = ( + { + # Required for all SQL sources. 
+ # This is temporary lower bound that we're open to loosening/tightening as requirements show up + "sqlalchemy>=1.4.39, <2", + # Required for SQL profiling. + "great-expectations>=0.15.12, <=0.15.50", + # scipy version restricted to reduce backtracking, used by great-expectations, + "scipy>=1.7.2", + # GE added handling for higher version of jinja2 + # https://github.com/great-expectations/great_expectations/pull/5382/files + # datahub does not depend on traitlets directly but great expectations does. + # https://github.com/ipython/traitlets/issues/741 + "traitlets<5.2.2", + "greenlet", + } + | usage_common + | sqlglot_lib +) sqllineage_lib = { "sqllineage==1.3.8", @@ -130,12 +139,6 @@ "sqlparse==0.4.4", } -sqlglot_lib = { - # Using an Acryl fork of sqlglot. - # https://github.com/tobymao/sqlglot/compare/main...hsheth2:sqlglot:hsheth?expand=1 - "acryl-sqlglot==18.5.2.dev45", -} - aws_common = { # AWS Python SDK "boto3", From c5ca9b29b8e563810d2b7a96ef8e59056d6ae3aa Mon Sep 17 00:00:00 2001 From: Mayuri N Date: Thu, 26 Oct 2023 12:19:53 +0530 Subject: [PATCH 7/7] update comment --- metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py b/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py index 7f7beaac0a72b5..1d74b205698140 100644 --- a/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py +++ b/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py @@ -1164,7 +1164,8 @@ def view_definition_lineage_helper( if result.query_type is QueryType.SELECT: # Some platforms (e.g. postgres) store only <select statement> from view definition # `create view V as <select statement>` . For such view definitions, `result.out_tables` and - # `result.column_lineage[].downstream` are empty in `sqlglot_lineage` response, whereas + # `result.column_lineage[].downstream` are empty in `sqlglot_lineage` response, whereas upstream + # details and downstream column details are extracted correctly. 
# Here, we inject view V's urn in `result.out_tables` and `result.column_lineage[].downstream` # to get complete lineage result. result.out_tables = [view_urn]