feat(ingest): support view lineage for all sqlalchemy sources #9039
Changes from 1 commit
```diff
@@ -130,10 +130,13 @@ def auto_incremental_lineage(
     if len(wu.metadata.proposedSnapshot.aspects) > 0:
         yield wu

-    yield _lineage_wu_via_read_modify_write(
-        graph, urn, lineage_aspect, wu.metadata.systemMetadata
-    ) if lineage_aspect.fineGrainedLineages else _convert_upstream_lineage_to_patch(
-        urn, lineage_aspect, wu.metadata.systemMetadata
-    )
+    if lineage_aspect.fineGrainedLineages:
+        yield _lineage_wu_via_read_modify_write(
+            graph, urn, lineage_aspect, wu.metadata.systemMetadata
+        )
+    elif lineage_aspect.upstreams:
+        yield _convert_upstream_lineage_to_patch(
+            urn, lineage_aspect, wu.metadata.systemMetadata
+        )
     else:
         yield wu
```

Review comment: If there is a table-level, upstream-only lineage aspect with empty upstreams, we ignore it as part of incremental lineage.
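To make the new branching concrete, here is a minimal, self-contained sketch of the routing it performs. `LineageAspect` and `route_lineage` are hypothetical stand-ins for illustration, not DataHub classes; only the three-way branch structure mirrors the hunk above.

```python
from dataclasses import dataclass, field
from typing import List


@dataclass
class LineageAspect:
    # Hypothetical simplification of the upstream lineage aspect; only the
    # two fields the branch inspects are modeled.
    upstreams: List[str] = field(default_factory=list)
    fineGrainedLineages: List[str] = field(default_factory=list)


def route_lineage(aspect: LineageAspect) -> str:
    # Column-level (fine-grained) lineage is merged with the server's copy,
    # so it takes the read-modify-write path via the graph.
    if aspect.fineGrainedLineages:
        return "read-modify-write"
    # Table-level lineage with at least one upstream can be converted to a
    # patch without consulting the graph.
    elif aspect.upstreams:
        return "patch"
    # Empty upstreams: ignored for incremental-lineage purposes (in the hunk,
    # the original workunit is yielded unchanged).
    else:
        return "ignore"


assert route_lineage(LineageAspect(fineGrainedLineages=["colA->colB"])) == "read-modify-write"
assert route_lineage(LineageAspect(upstreams=["urn:li:dataset:a"])) == "patch"
assert route_lineage(LineageAspect()) == "ignore"
```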
```diff
@@ -200,14 +200,15 @@ def _gen_workunit_from_sql_parsing_result(
         self,
         dataset_identifier: str,
         result: SqlParsingResult,
-    ) -> MetadataWorkUnit:
+    ) -> Iterable[MetadataWorkUnit]:
         upstreams, fine_upstreams = self.get_upstreams_from_sql_parsing_result(
             self.dataset_urn_builder(dataset_identifier), result
         )
-        self.report.num_views_with_upstreams += 1
-        return self._create_upstream_lineage_workunit(
-            dataset_identifier, upstreams, fine_upstreams
-        )
+        if upstreams:
+            self.report.num_views_with_upstreams += 1
+            yield self._create_upstream_lineage_workunit(
+                dataset_identifier, upstreams, fine_upstreams
+            )

     def _gen_workunits_from_query_result(
         self,
```

Review comment: We should not emit upstream lineage if no upstreams are found.

```diff
@@ -251,7 +252,7 @@ def get_view_upstream_workunits(
             )
             if result:
                 views_processed.add(view_identifier)
-                yield self._gen_workunit_from_sql_parsing_result(
+                yield from self._gen_workunit_from_sql_parsing_result(
                     view_identifier, result
                 )
         self.report.view_lineage_parse_secs = timer.elapsed_seconds()
```
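Both hunks implement one pattern: a helper that used to return exactly one workunit becomes a generator that may yield zero, and its caller switches from `yield` to `yield from`. A toy model of the pattern follows; the names are illustrative, not DataHub APIs.

```python
from typing import Dict, Iterable, List


def gen_lineage_workunits(upstreams: List[str]) -> Iterable[str]:
    # Returning Iterable instead of a single value lets the function emit
    # zero workunits when SQL parsing found no upstreams.
    if upstreams:
        yield f"upstream-lineage({', '.join(upstreams)})"


def get_view_upstream_workunits(views: Dict[str, List[str]]) -> Iterable[str]:
    for view, upstreams in views.items():
        # `yield from` is required once the callee is a generator; a bare
        # `yield` would emit the generator object itself, not its items.
        yield from gen_lineage_workunits(upstreams)


print(list(get_view_upstream_workunits({"v1": ["t1"], "v2": []})))
# ['upstream-lineage(t1)'] -- v2 produces nothing
```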
```diff
@@ -1,15 +1,18 @@
 import json
 import logging
 import re
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, Iterable, List, Optional, Union

 from pydantic.class_validators import validator
 from pydantic.fields import Field

 # This import verifies that the dependencies are available.
 from pyhive import hive  # noqa: F401
 from pyhive.sqlalchemy_hive import HiveDate, HiveDecimal, HiveTimestamp
 from sqlalchemy.engine.reflection import Inspector

+from datahub.emitter.mce_builder import make_dataset_urn_with_platform_instance
+from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.decorators import (
     SourceCapability,
     SupportStatus,
@@ -18,8 +21,10 @@
     platform_name,
     support_status,
 )
+from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.extractor import schema_util
-from datahub.ingestion.source.sql.sql_common import register_custom_type
+from datahub.ingestion.source.sql.sql_common import SqlWorkUnit, register_custom_type
+from datahub.ingestion.source.sql.sql_config import SQLCommonConfig
 from datahub.ingestion.source.sql.two_tier_sql_source import (
     TwoTierSQLAlchemyConfig,
     TwoTierSQLAlchemySource,
@@ -31,6 +36,7 @@
     SchemaField,
     TimeTypeClass,
 )
+from datahub.metadata.schema_classes import ViewPropertiesClass
 from datahub.utilities import config_clean
 from datahub.utilities.hive_schema_to_avro import get_avro_schema_for_hive_column
```
```diff
@@ -90,19 +96,39 @@ def dbapi_get_columns_patched(self, connection, table_name, schema=None, **kw):
         logger.warning(f"Failed to patch method due to {e}")


+try:
+    from pyhive.sqlalchemy_hive import HiveDialect
+
+    @reflection.cache  # type: ignore
+    def get_view_names_patched(self, connection, schema=None, **kw):
+        query = "SHOW VIEWS"
+        if schema:
+            query += " IN " + self.identifier_preparer.quote_identifier(schema)
+        return [row[0] for row in connection.execute(query)]
+
+    @reflection.cache  # type: ignore
+    def get_view_definition_patched(self, connection, view_name, schema=None, **kw):
+        full_table = self.identifier_preparer.quote_identifier(view_name)
+        if schema:
+            full_table = "{}.{}".format(
+                self.identifier_preparer.quote_identifier(schema),
+                self.identifier_preparer.quote_identifier(view_name),
+            )
+        row = connection.execute("SHOW CREATE TABLE {}".format(full_table)).fetchone()
+        return row[0]
+
+    HiveDialect.get_view_names = get_view_names_patched
+    HiveDialect.get_view_definition = get_view_definition_patched
+except ModuleNotFoundError:
+    pass
+except Exception as e:
+    logger.warning(f"Failed to patch method due to {e}")
+
+
 class HiveConfig(TwoTierSQLAlchemyConfig):
     # defaults
     scheme = Field(default="hive", hidden_from_docs=True)

+    # Hive SQLAlchemy connector returns views as tables.
+    # See https://github.com/dropbox/PyHive/blob/b21c507a24ed2f2b0cf15b0b6abb1c43f31d3ee0/pyhive/sqlalchemy_hive.py#L270-L273.
+    # Disabling views helps us prevent this duplication.
+    include_views = Field(
+        default=False,
+        hidden_from_docs=True,
+        description="Hive SQLAlchemy connector returns views as tables. See https://github.com/dropbox/PyHive/blob/b21c507a24ed2f2b0cf15b0b6abb1c43f31d3ee0/pyhive/sqlalchemy_hive.py#L270-L273. Disabling views helps us prevent this duplication.",
+    )
+
     @validator("host_port")
     def clean_host_port(cls, v):
         return config_clean.remove_protocol(v)
```

Review comment (on the HiveDialect patch): Alternatively, we could also move this code to the acryl PyHive fork (https://github.com/acryldata/PyHive). This seemed simpler and easier to test end to end; open to suggestions here.
Reply: This seems fine for now, and we can fix it up when we refactor sql_common next week.

Review comment (on the broad exception handler): Failure to patch should cause the source to fail to load, right?
Reply: Right, let me remove this exception handling.
Follow-up: Done.
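The patched methods plug into SQLAlchemy's standard reflection API, so callers never invoke them directly. Below is a hedged usage sketch of how they get exercised once the patch is applied; the connection string and schema name are placeholders, and it requires pyhive to be installed.

```python
from sqlalchemy import create_engine, inspect

# Placeholder Hive connection; no query runs until reflection is invoked.
engine = create_engine("hive://localhost:10000/default")
inspector = inspect(engine)

# Dispatches to get_view_names_patched, which issues "SHOW VIEWS IN `default`".
views = inspector.get_view_names(schema="default")

for view in views:
    # Dispatches to get_view_definition_patched, which issues
    # "SHOW CREATE TABLE `default`.`<view>`" and returns the first row.
    ddl = inspector.get_view_definition(view, schema="default")
    print(view, ddl)
```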
```diff
@@ -174,3 +200,41 @@ def get_schema_fields_for_column(
             return new_fields

         return fields
+
+    # Hive SQLAlchemy connector returns views as tables in get_table_names.
+    # See https://github.com/dropbox/PyHive/blob/b21c507a24ed2f2b0cf15b0b6abb1c43f31d3ee0/pyhive/sqlalchemy_hive.py#L270-L273.
+    # This override makes sure that we ingest view definitions for views.
+    def _process_view(
+        self,
+        dataset_name: str,
+        inspector: Inspector,
+        schema: str,
+        view: str,
+        sql_config: SQLCommonConfig,
+    ) -> Iterable[Union[SqlWorkUnit, MetadataWorkUnit]]:
+        dataset_urn = make_dataset_urn_with_platform_instance(
+            self.platform,
+            dataset_name,
+            self.config.platform_instance,
+            self.config.env,
+        )
+
+        try:
+            view_definition = inspector.get_view_definition(view, schema)
+            if view_definition is None:
+                view_definition = ""
+            else:
+                # Some dialects return a TextClause instead of a raw string,
+                # so we need to convert them to a string.
+                view_definition = str(view_definition)
+        except NotImplementedError:
+            view_definition = ""
+
+        if view_definition:
+            view_properties_aspect = ViewPropertiesClass(
+                materialized=False, viewLanguage="SQL", viewLogic=view_definition
+            )
+            yield MetadataChangeProposalWrapper(
+                entityUrn=dataset_urn,
+                aspect=view_properties_aspect,
+            ).as_workunit()
```
Review comment: Incremental lineage requires the presence of a DataHubGraph, which is available by default only when using the DataHub REST sink. We plan to keep this default enabled in managed ingestion.
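A hedged sketch of what that default looks like in practice: a recipe whose sink is datahub-rest, so the pipeline can construct a DataHubGraph for the read-modify-write path. The placement of the `incremental_lineage` flag and all connection values are illustrative assumptions, not something this PR confirms.

```python
from datahub.ingestion.run.pipeline import Pipeline

pipeline = Pipeline.create(
    {
        "source": {
            "type": "hive",
            "config": {
                "host_port": "localhost:10000",  # placeholder
                "incremental_lineage": True,  # assumed source-level flag
            },
        },
        # The datahub-rest sink (unlike e.g. a file sink) gives the pipeline
        # a graph client, which incremental lineage needs.
        "sink": {
            "type": "datahub-rest",
            "config": {"server": "http://localhost:8080"},  # placeholder
        },
    }
)
pipeline.run()
pipeline.raise_from_status()
```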