diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml
index 532ba1102ed57..412c962cb6e36 100644
--- a/.github/workflows/build-and-test.yml
+++ b/.github/workflows/build-and-test.yml
@@ -83,6 +83,7 @@ jobs:
- uses: gradle/actions/setup-gradle@v3
- name: Gradle build (and test) for NOT metadata ingestion
if: ${{ matrix.command == 'except_metadata_ingestion' && needs.setup.outputs.backend_change == 'true' }}
+ # datahub-schematron:cli excluded due to dependency on metadata-ingestion
run: |
./gradlew build \
-x :metadata-ingestion:build \
@@ -100,6 +101,7 @@ jobs:
-x :metadata-ingestion-modules:gx-plugin:check \
-x :datahub-frontend:build \
-x :datahub-web-react:build \
+ -x :metadata-integration:java:datahub-schematron:cli:test \
--parallel
- name: Gradle build (and test) for frontend
if: ${{ matrix.command == 'frontend' && needs.setup.outputs.frontend_change == 'true' }}
diff --git a/.github/workflows/check-datahub-jars.yml b/.github/workflows/check-datahub-jars.yml
index becf8126dc45b..7a49f32729ec1 100644
--- a/.github/workflows/check-datahub-jars.yml
+++ b/.github/workflows/check-datahub-jars.yml
@@ -40,4 +40,5 @@ jobs:
- name: check ${{ matrix.command }} jar
run: |
./gradlew :metadata-integration:java:${{ matrix.command }}:build --info
+ ./gradlew :metadata-integration:java:${{ matrix.command }}:checkShadowJar
./gradlew :metadata-integration:java:${{ matrix.command }}:javadoc
diff --git a/build.gradle b/build.gradle
index e3c4f5efe6bb6..be4d7ee8a562b 100644
--- a/build.gradle
+++ b/build.gradle
@@ -48,6 +48,7 @@ buildscript {
// see also datahub-frontend/play.gradle
ext.playVersion = '2.8.22'
ext.playScalaVersion = '2.13'
+ ext.akkaVersion = '2.6.21' // 2.7.0+ has incompatible license
ext.log4jVersion = '2.23.1'
ext.slf4jVersion = '1.7.36'
ext.logbackClassic = '1.4.14'
@@ -105,7 +106,14 @@ project.ext.spec = [
]
project.ext.externalDependency = [
- 'akkaHttp': "com.typesafe.akka:akka-http-core_$playScalaVersion:10.2.10",
+ 'akkaHttp': "com.typesafe.akka:akka-http-core_$playScalaVersion:10.2.10", // max version due to licensing
+ 'akkaActor': "com.typesafe.akka:akka-actor_$playScalaVersion:$akkaVersion",
+ 'akkaStream': "com.typesafe.akka:akka-stream_$playScalaVersion:$akkaVersion",
+ 'akkaActorTyped': "com.typesafe.akka:akka-actor-typed_$playScalaVersion:$akkaVersion",
+ 'akkaSlf4j': "com.typesafe.akka:akka-slf4j_$playScalaVersion:$akkaVersion",
+ 'akkaJackson': "com.typesafe.akka:akka-serialization-jackson_$playScalaVersion:$akkaVersion",
+ 'akkaParsing': "com.typesafe.akka:akka-parsing_$playScalaVersion:$akkaVersion",
+ 'akkaProtobuf': "com.typesafe.akka:akka-protobuf-v3_$playScalaVersion:$akkaVersion",
'antlr4Runtime': 'org.antlr:antlr4-runtime:4.9.3',
'antlr4': 'org.antlr:antlr4:4.9.3',
'assertJ': 'org.assertj:assertj-core:3.11.1',
diff --git a/datahub-frontend/play.gradle b/datahub-frontend/play.gradle
index 266962721a80a..d513c3c232d9a 100644
--- a/datahub-frontend/play.gradle
+++ b/datahub-frontend/play.gradle
@@ -55,6 +55,13 @@ dependencies {
implementation externalDependency.antlr4Runtime
implementation externalDependency.antlr4
implementation externalDependency.akkaHttp
+ implementation externalDependency.akkaActor
+ implementation externalDependency.akkaStream
+ implementation externalDependency.akkaActorTyped
+ implementation externalDependency.akkaSlf4j
+ implementation externalDependency.akkaJackson
+ implementation externalDependency.akkaParsing
+ implementation externalDependency.akkaProtobuf
implementation externalDependency.jerseyCore
implementation externalDependency.jerseyGuava
diff --git a/datahub-web-react/src/app/ingest/source/builder/sources.json b/datahub-web-react/src/app/ingest/source/builder/sources.json
index 4ec2d4300aff6..537e429c1dd69 100644
--- a/datahub-web-react/src/app/ingest/source/builder/sources.json
+++ b/datahub-web-react/src/app/ingest/source/builder/sources.json
@@ -318,6 +318,14 @@
"docsUrl": "https://datahubproject.io/docs/generated/ingestion/sources/cassandra",
"recipe": "source:\n type: cassandra\n config:\n # Credentials for on prem cassandra\n contact_point: localhost\n port: 9042\n username: admin\n password: password\n\n # Or\n # Credentials Astra Cloud\n #cloud_config:\n # secure_connect_bundle: Path to Secure Connect Bundle (.zip)\n # token: Application Token\n\n # Optional Allow / Deny extraction of particular keyspaces.\n keyspace_pattern:\n allow: [.*]\n\n # Optional Allow / Deny extraction of particular tables.\n table_pattern:\n allow: [.*]"
},
+ {
+ "urn": "urn:li:dataPlatform:iceberg",
+ "name": "iceberg",
+ "displayName": "Iceberg",
+ "description": "Ingest databases and tables from any Iceberg catalog implementation",
+ "docsUrl": "https://datahubproject.io/docs/generated/ingestion/sources/iceberg",
+ "recipe": "source:\n type: \"iceberg\"\n config:\n env: dev\n # each thread will open internet connections to fetch manifest files independently, \n # this value needs to be adjusted with ulimit\n processing_threads: 1 \n # a single catalog definition with a form of a dictionary\n catalog: \n demo: # name of the catalog\n type: \"rest\" # other types are available\n uri: \"uri\"\n s3.access-key-id: \"access-key\"\n s3.secret-access-key: \"secret-access-key\"\n s3.region: \"aws-region\"\n profiling:\n enabled: false\n"
+ },
{
"urn": "urn:li:dataPlatform:neo4j",
"name": "neo4j",
diff --git a/docs/automations/snowflake-tag-propagation.md b/docs/automations/snowflake-tag-propagation.md
index b72224642b0f0..8eded451644cc 100644
--- a/docs/automations/snowflake-tag-propagation.md
+++ b/docs/automations/snowflake-tag-propagation.md
@@ -4,6 +4,8 @@ import FeatureAvailability from '@site/src/components/FeatureAvailability';
+> Note that this Automation is currently in open **Beta**. If you have any questions or issues, please reach out to your Acryl representative.
+
## Introduction
Snowflake Tag Propagation is an automation that allows you to sync DataHub Glossary Terms and Tags on
@@ -15,6 +17,41 @@ both columns and tables back to Snowflake. This automation is available in DataH
- Automatically Add DataHub Tags to Snowflake Tables and Columns
- Automatically Remove DataHub Glossary Terms and Tags from Snowflake Tables and Columns when they are removed in DataHub
+## Prerequisites
+
+### Permissions Required for Tag Management
+
+- `CREATE TAG`: Required to create new tags in Snowflake.
+Ensure the user or role has this privilege on the specific schema or database where tags will be created.
+- `APPLY TAG`: Required to assign tags to Snowflake objects such as tables, columns, or other database objects.
+This permission must be granted at the database, schema, or object level depending on the scope.
+
+
+### Permissions Required for Object Access
+
+- `USAGE` on the database and schema: Allows access to the database and schema to view and apply changes.
+- `SELECT` on the objects (tables, views, etc.): Enables the automation to read metadata and verify existing tags.
+
+### Example Permission Grant Statements
+
+To grant the necessary permissions for a specific role (`DATAHUB_AUTOMATION_ROLE`), you can use the following SQL commands:
+
+```sql
+-- Tag management permissions
+GRANT CREATE TAG ON SCHEMA your_database.your_schema TO ROLE DATAHUB_AUTOMATION_ROLE;
+GRANT APPLY TAG ON SCHEMA your_database.your_schema TO ROLE DATAHUB_AUTOMATION_ROLE;
+
+-- Object access for metadata operations
+GRANT USAGE ON DATABASE your_database TO ROLE DATAHUB_AUTOMATION_ROLE;
+GRANT USAGE ON SCHEMA your_database.your_schema TO ROLE DATAHUB_AUTOMATION_ROLE;
+GRANT SELECT ON ALL TABLES IN SCHEMA your_database.your_schema TO ROLE DATAHUB_AUTOMATION_ROLE;
+
+-- Future privileges for tagging
+GRANT SELECT ON FUTURE TABLES IN SCHEMA your_database.your_schema TO ROLE DATAHUB_AUTOMATION_ROLE;
+GRANT APPLY TAG ON FUTURE TABLES IN SCHEMA your_database.your_schema TO ROLE DATAHUB_AUTOMATION_ROLE;
+```
+
+
## Enabling Snowflake Tag Sync
1. **Navigate to Automations**: Click on 'Govern' > 'Automations' in the navigation bar.
diff --git a/docs/managed-datahub/release-notes/v_0_3_7.md b/docs/managed-datahub/release-notes/v_0_3_7.md
index af23b5ae1541b..94cbdd79dbf5e 100644
--- a/docs/managed-datahub/release-notes/v_0_3_7.md
+++ b/docs/managed-datahub/release-notes/v_0_3_7.md
@@ -7,7 +7,7 @@ Release Availability Date
Recommended CLI/SDK
---
-- `v0.14.1.11` with release notes at https://github.com/datahub/datahub/releases/tag/v0.14.1.11
+- `v0.14.1.12` with release notes at https://github.com/datahub/datahub/releases/tag/v0.14.1.12
If you are using an older CLI/SDK version, then please upgrade it. This applies for all CLI/SDK usages, if you are using it through your terminal, GitHub Actions, Airflow, in Python SDK somewhere, Java SDK, etc. This is a strong recommendation to upgrade, as we keep on pushing fixes in the CLI, and it helps us support you better.
@@ -19,6 +19,26 @@ If you are using an older CLI/SDK version, then please upgrade it. This applies
## Release Changelog
---
+### v0.3.7.4
+
+- [#11935](https://github.com/datahub-project/datahub/pull/11935) - Added environment variable for enabling stricter URN validation rules `STRICT_URN_VALIDATION_ENABLED` [[1](https://datahubproject.io/docs/what/urn/#restrictions)].
+- [Automations] Filter out self-nodes in glossary term propagation
+- [Remote Executor] Allow dashes in executor ids.
+- [Search] Fix Nested Filter Counts in Primary Search
+- [Search] Fix white screen of death on empty search result
+- [Columns Tab] Support searching nested struct columns correctly in V2 UI.
+- [Logo] Fix fit of custom logo for V2 UI nav bar.
+- [Structured Properties] Better handling for special characters in structured properties
+- [Lineage] Improvements to handling lineage cycles
+- [Metadata Tests] Improve Reliability of Metadata Tests Action Application
+- [Slack Integration] Minor improvement in authentication redirect to integrate with Slack
+- [Columns Tab] Properly display nullable status in column sidebar (bug fix)
+- [Columns Tab] Fix merging of sibling schemas between V2 and V1 field paths.
+- [Documentation] Support group authors for institutional memory aspect
+
+
+### v0.3.7
+
- All changes in https://github.com/datahub-project/datahub/releases/tag/v0.14.1
- Note Breaking Changes: https://datahubproject.io/docs/how/updating-datahub/#0141
@@ -96,7 +116,7 @@ If you are using an older CLI/SDK version, then please upgrade it. This applies
- Improved UX for setting up and managing SSO
- Ingestion changes
- - In addition to the improvements listed here: https://github.com/acryldata/datahub/releases/tag/v0.14.1.11
+ - In addition to the improvements listed here: https://github.com/acryldata/datahub/releases/tag/v0.14.1.12
- PowerBI: Support for PowerBI Apps and cross-workspace lineage
- Fivetran: Major improvements to configurability and improved reliability with large Fivetran setups
- Snowflake & BigQuery: Improved handling of temporary tables and swap statements when generating lineage
diff --git a/docs/what/urn.md b/docs/what/urn.md
index 2f4dffb985653..c7fb0555cd992 100644
--- a/docs/what/urn.md
+++ b/docs/what/urn.md
@@ -35,11 +35,17 @@ urn:li:dataset:(urn:li:dataPlatform:hdfs,PageViewEvent,EI)
## Restrictions
-There are a few restrictions when creating an urn:
+There are a few restrictions when creating a URN:
-1. Commas are reserved character in URN fields: `,`
-2. Parentheses are reserved characters in URN fields: `(` or `)`
-3. Colons are reserved characters in URN fields: `:`
-4. Urn separator UTF-8 character `␟`
+The following characters are not allowed anywhere in the URN:
+
+1. Parentheses are reserved characters in URN fields: `(` or `)`
+2. The "unit separator" unicode character `␟` (U+241F)
+
+The following characters are not allowed within a URN tuple:
+
+1. Commas are reserved characters in URN tuples: `,`
+
+Example: `urn:li:dashboard:(looker,dashboards.thelook)` is a valid URN, but `urn:li:dashboard:(looker,dashboards.the,look)` is invalid.
Please do not use these characters when creating or generating urns. One approach is to use URL encoding for the characters.
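As a small illustration of the URL-encoding approach mentioned above (a minimal sketch using only the Python standard library; the dashboard name is made up):

```python
from urllib.parse import quote

# A tuple member containing a reserved comma; percent-encode it so the comma
# no longer acts as a tuple separator inside the URN.
raw_name = "dashboards.the,look"   # hypothetical name containing a comma
encoded_name = quote(raw_name)     # "," becomes "%2C"; "." is left as-is
urn = f"urn:li:dashboard:(looker,{encoded_name})"
print(urn)  # urn:li:dashboard:(looker,dashboards.the%2Clook)
```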
diff --git a/metadata-ingestion/docs/sources/iceberg/iceberg.md b/metadata-ingestion/docs/sources/iceberg/iceberg.md
index 7e40315a2e319..92aac5ffa6ce5 100644
--- a/metadata-ingestion/docs/sources/iceberg/iceberg.md
+++ b/metadata-ingestion/docs/sources/iceberg/iceberg.md
@@ -18,6 +18,8 @@ This ingestion source maps the following Source System Concepts to DataHub Conce
## Troubleshooting
-### [Common Issue]
+### Exceptions while increasing `processing_threads`
-[Provide description of common issues with this integration and steps to resolve]
+Each processing thread will open several files/sockets to download manifest files from blob storage. If you see
+exceptions when increasing the `processing_threads` configuration parameter, try increasing the limit of open
+files (e.g. using `ulimit` on Linux).
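As a reference point for the advice above, a minimal sketch of inspecting and raising the open-file limit from Python before an ingestion run (assumes a Unix-like system; the target value of 4096 is illustrative):

```python
import resource

# Current soft/hard limits for open file descriptors in this process.
soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
print(f"open files: soft={soft}, hard={hard}")

# Raise the soft limit towards the hard limit, similar to `ulimit -n` in a shell.
desired = 4096
new_soft = desired if hard == resource.RLIM_INFINITY else min(desired, hard)
if new_soft > soft:
    resource.setrlimit(resource.RLIMIT_NOFILE, (new_soft, hard))
```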
diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py
index a9915e1bd745d..c6d55fb5bcc56 100644
--- a/metadata-ingestion/setup.py
+++ b/metadata-ingestion/setup.py
@@ -14,8 +14,8 @@
)
base_requirements = {
- # Typing extension should be >=3.10.0.2 ideally but we can't restrict due to a Airflow 2.1 dependency conflict.
- "typing_extensions>=3.7.4.3",
+ # Our min version of typing_extensions is somewhat constrained by Airflow.
+ "typing_extensions>=3.10.0.2",
# Actual dependencies.
"typing-inspect",
# pydantic 1.8.2 is incompatible with mypy 0.910.
@@ -249,7 +249,8 @@
iceberg_common = {
# Iceberg Python SDK
- "pyiceberg>=0.4,<0.7",
+    # Kept at 0.4.0 because higher versions require pydantic>2; bump this dependency as soon as we are fine with that.
+ "pyiceberg>=0.4.0",
}
mssql_common = {
@@ -775,7 +776,7 @@
"trino = datahub.ingestion.source.sql.trino:TrinoSource",
"starburst-trino-usage = datahub.ingestion.source.usage.starburst_trino_usage:TrinoUsageSource",
"nifi = datahub.ingestion.source.nifi:NifiSource",
- "powerbi = datahub.ingestion.source.powerbi:PowerBiDashboardSource",
+ "powerbi = datahub.ingestion.source.powerbi.powerbi:PowerBiDashboardSource",
"powerbi-report-server = datahub.ingestion.source.powerbi_report_server:PowerBiReportServerDashboardSource",
"iceberg = datahub.ingestion.source.iceberg.iceberg:IcebergSource",
"vertica = datahub.ingestion.source.sql.vertica:VerticaSource",
diff --git a/metadata-ingestion/src/datahub/ingestion/graph/client.py b/metadata-ingestion/src/datahub/ingestion/graph/client.py
index 759aebcfd46b0..4aa937639e959 100644
--- a/metadata-ingestion/src/datahub/ingestion/graph/client.py
+++ b/metadata-ingestion/src/datahub/ingestion/graph/client.py
@@ -67,6 +67,7 @@
SystemMetadataClass,
TelemetryClientIdClass,
)
+from datahub.telemetry.telemetry import telemetry_instance
from datahub.utilities.perf_timer import PerfTimer
from datahub.utilities.str_enum import StrEnum
from datahub.utilities.urns.urn import Urn, guess_entity_type
@@ -1819,4 +1820,5 @@ def get_default_graph() -> DataHubGraph:
graph_config = config_utils.load_client_config()
graph = DataHubGraph(graph_config)
graph.test_connection()
+ telemetry_instance.set_context(server=graph)
return graph
diff --git a/metadata-ingestion/src/datahub/ingestion/run/pipeline.py b/metadata-ingestion/src/datahub/ingestion/run/pipeline.py
index 7c3a42c3e0893..667129ff83584 100644
--- a/metadata-ingestion/src/datahub/ingestion/run/pipeline.py
+++ b/metadata-ingestion/src/datahub/ingestion/run/pipeline.py
@@ -44,7 +44,8 @@
)
from datahub.ingestion.transformer.transform_registry import transform_registry
from datahub.metadata.schema_classes import MetadataChangeProposalClass
-from datahub.telemetry import stats, telemetry
+from datahub.telemetry import stats
+from datahub.telemetry.telemetry import telemetry_instance
from datahub.utilities._custom_package_loader import model_version_name
from datahub.utilities.global_warning_util import (
clear_global_warnings,
@@ -273,8 +274,9 @@ def __init__(
if self.graph is None and isinstance(self.sink, DatahubRestSink):
with _add_init_error_context("setup default datahub client"):
self.graph = self.sink.emitter.to_graph()
+ self.graph.test_connection()
self.ctx.graph = self.graph
- telemetry.telemetry_instance.update_capture_exception_context(server=self.graph)
+ telemetry_instance.set_context(server=self.graph)
with set_graph_context(self.graph):
with _add_init_error_context("configure reporters"):
@@ -615,7 +617,7 @@ def log_ingestion_stats(self) -> None:
sink_warnings = len(self.sink.get_report().warnings)
global_warnings = len(get_global_warnings())
- telemetry.telemetry_instance.ping(
+ telemetry_instance.ping(
"ingest_stats",
{
"source_type": self.source_type,
@@ -637,7 +639,6 @@ def log_ingestion_stats(self) -> None:
),
"has_pipeline_name": bool(self.config.pipeline_name),
},
- self.ctx.graph,
)
def _approx_all_vals(self, d: LossyList[Any]) -> int:
diff --git a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py
index 4598ae388b827..499e7e1231d05 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py
@@ -53,19 +53,7 @@
make_assertion_from_test,
make_assertion_result_from_test,
)
-from datahub.ingestion.source.sql.sql_types import (
- ATHENA_SQL_TYPES_MAP,
- BIGQUERY_TYPES_MAP,
- POSTGRES_TYPES_MAP,
- SNOWFLAKE_TYPES_MAP,
- SPARK_SQL_TYPES_MAP,
- TRINO_SQL_TYPES_MAP,
- VERTICA_SQL_TYPES_MAP,
- resolve_athena_modified_type,
- resolve_postgres_modified_type,
- resolve_trino_modified_type,
- resolve_vertica_modified_type,
-)
+from datahub.ingestion.source.sql.sql_types import resolve_sql_type
from datahub.ingestion.source.state.stale_entity_removal_handler import (
StaleEntityRemovalHandler,
StaleEntityRemovalSourceReport,
@@ -89,17 +77,11 @@
from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import DatasetSnapshot
from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
from datahub.metadata.com.linkedin.pegasus2avro.schema import (
- BooleanTypeClass,
- DateTypeClass,
MySqlDDL,
NullTypeClass,
- NumberTypeClass,
- RecordType,
SchemaField,
SchemaFieldDataType,
SchemaMetadata,
- StringTypeClass,
- TimeTypeClass,
)
from datahub.metadata.schema_classes import (
DataPlatformInstanceClass,
@@ -804,28 +786,6 @@ def make_mapping_upstream_lineage(
)
-# See https://github.com/fishtown-analytics/dbt/blob/master/core/dbt/adapters/sql/impl.py
-_field_type_mapping = {
- "boolean": BooleanTypeClass,
- "date": DateTypeClass,
- "time": TimeTypeClass,
- "numeric": NumberTypeClass,
- "text": StringTypeClass,
- "timestamp with time zone": DateTypeClass,
- "timestamp without time zone": DateTypeClass,
- "integer": NumberTypeClass,
- "float8": NumberTypeClass,
- "struct": RecordType,
- **POSTGRES_TYPES_MAP,
- **SNOWFLAKE_TYPES_MAP,
- **BIGQUERY_TYPES_MAP,
- **SPARK_SQL_TYPES_MAP,
- **TRINO_SQL_TYPES_MAP,
- **ATHENA_SQL_TYPES_MAP,
- **VERTICA_SQL_TYPES_MAP,
-}
-
-
def get_column_type(
report: DBTSourceReport,
dataset_name: str,
@@ -835,24 +795,10 @@ def get_column_type(
"""
Maps known DBT types to datahub types
"""
- TypeClass: Any = _field_type_mapping.get(column_type) if column_type else None
-
- if TypeClass is None and column_type:
- # resolve a modified type
- if dbt_adapter == "trino":
- TypeClass = resolve_trino_modified_type(column_type)
- elif dbt_adapter == "athena":
- TypeClass = resolve_athena_modified_type(column_type)
- elif dbt_adapter == "postgres" or dbt_adapter == "redshift":
- # Redshift uses a variant of Postgres, so we can use the same logic.
- TypeClass = resolve_postgres_modified_type(column_type)
- elif dbt_adapter == "vertica":
- TypeClass = resolve_vertica_modified_type(column_type)
- elif dbt_adapter == "snowflake":
- # Snowflake types are uppercase, so we check that.
- TypeClass = _field_type_mapping.get(column_type.upper())
-
- # if still not found, report the warning
+
+ TypeClass = resolve_sql_type(column_type, dbt_adapter)
+
+    # if the type cannot be resolved, report a warning
if TypeClass is None:
if column_type:
report.info(
@@ -861,9 +807,9 @@ def get_column_type(
context=f"{dataset_name} - {column_type}",
log=False,
)
- TypeClass = NullTypeClass
+ TypeClass = NullTypeClass()
- return SchemaFieldDataType(type=TypeClass())
+ return SchemaFieldDataType(type=TypeClass)
@platform_name("dbt")
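A minimal sketch of the consolidated `resolve_sql_type` helper as it is used above (import paths and the instance-or-None return convention are taken from this diff; the sample column type and adapter name are illustrative, and an environment with the DataHub package installed is assumed):

```python
from datahub.ingestion.source.sql.sql_types import resolve_sql_type
from datahub.metadata.com.linkedin.pegasus2avro.schema import (
    NullTypeClass,
    SchemaFieldDataType,
)

# resolve_sql_type() returns a type instance (or None) for a column type string
# and an adapter/platform name, replacing the per-adapter branching shown above.
type_instance = resolve_sql_type("numeric", "postgres")
field_type = SchemaFieldDataType(type=type_instance or NullTypeClass())
```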
diff --git a/metadata-ingestion/src/datahub/ingestion/source/feast.py b/metadata-ingestion/src/datahub/ingestion/source/feast.py
index e097fd1f221ea..6330fe0291660 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/feast.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/feast.py
@@ -42,10 +42,14 @@
from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
from datahub.metadata.schema_classes import (
BrowsePathsClass,
+ GlobalTagsClass,
MLFeaturePropertiesClass,
MLFeatureTablePropertiesClass,
MLPrimaryKeyPropertiesClass,
+ OwnerClass,
+ OwnershipClass,
StatusClass,
+ TagAssociationClass,
)
# FIXME: ValueType module cannot be used as a type
@@ -91,6 +95,24 @@ class FeastRepositorySourceConfig(ConfigModel):
environment: str = Field(
default=DEFAULT_ENV, description="Environment to use when constructing URNs"
)
+ # owner_mappings example:
+ # This must be added to the recipe in order to extract owners, otherwise NO owners will be extracted
+ # owner_mappings:
+ # - feast_owner_name: ""
+ # datahub_owner_urn: "urn:li:corpGroup:"
+ # datahub_ownership_type: "BUSINESS_OWNER"
+ owner_mappings: Optional[List[Dict[str, str]]] = Field(
+ default=None, description="Mapping of owner names to owner types"
+ )
+ enable_owner_extraction: bool = Field(
+ default=False,
+ description="If this is disabled, then we NEVER try to map owners. "
+ "If this is enabled, then owner_mappings is REQUIRED to extract ownership.",
+ )
+ enable_tag_extraction: bool = Field(
+ default=False,
+ description="If this is disabled, then we NEVER try to extract tags.",
+ )
@platform_name("Feast")
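To make the owner-extraction behaviour described in the config comments concrete, here is a small sketch of how an `owner_mappings` entry resolves a Feast owner name (plain Python mirroring the lookup in `_create_owner_association`; the owner names and URN are illustrative, and the rest of the Feast source configuration is omitted):

```python
# Recipe fragment (as a Python dict) that turns owner and tag extraction on.
config_fragment = {
    "enable_owner_extraction": True,
    "enable_tag_extraction": True,
    "owner_mappings": [
        {
            "feast_owner_name": "data-platform-team",
            "datahub_owner_urn": "urn:li:corpGroup:data-platform-team",
            "datahub_ownership_type": "BUSINESS_OWNER",
        }
    ],
}

# Mirror of the lookup performed by _create_owner_association().
feast_owner = "data-platform-team"
for mapping in config_fragment["owner_mappings"]:
    if mapping["feast_owner_name"] == feast_owner:
        ownership_type = mapping.get("datahub_ownership_type", "TECHNICAL_OWNER")
        print(mapping["datahub_owner_urn"], ownership_type)
```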
@@ -215,10 +237,15 @@ def _get_entity_workunit(
"""
feature_view_name = f"{self.feature_store.project}.{feature_view.name}"
+ aspects = (
+ [StatusClass(removed=False)]
+ + self._get_tags(entity)
+ + self._get_owners(entity)
+ )
entity_snapshot = MLPrimaryKeySnapshot(
urn=builder.make_ml_primary_key_urn(feature_view_name, entity.name),
- aspects=[StatusClass(removed=False)],
+ aspects=aspects,
)
entity_snapshot.aspects.append(
@@ -243,10 +270,11 @@ def _get_feature_workunit(
Generate an MLFeature work unit for a Feast feature.
"""
feature_view_name = f"{self.feature_store.project}.{feature_view.name}"
+ aspects = [StatusClass(removed=False)] + self._get_tags(field)
feature_snapshot = MLFeatureSnapshot(
urn=builder.make_ml_feature_urn(feature_view_name, field.name),
- aspects=[StatusClass(removed=False)],
+ aspects=aspects,
)
feature_sources = []
@@ -295,13 +323,18 @@ def _get_feature_view_workunit(self, feature_view: FeatureView) -> MetadataWorkU
"""
feature_view_name = f"{self.feature_store.project}.{feature_view.name}"
+ aspects = (
+ [
+ BrowsePathsClass(paths=[f"/feast/{self.feature_store.project}"]),
+ StatusClass(removed=False),
+ ]
+ + self._get_tags(feature_view)
+ + self._get_owners(feature_view)
+ )
feature_view_snapshot = MLFeatureTableSnapshot(
urn=builder.make_ml_feature_table_urn("feast", feature_view_name),
- aspects=[
- BrowsePathsClass(paths=[f"/feast/{self.feature_store.project}"]),
- StatusClass(removed=False),
- ],
+ aspects=aspects,
)
feature_view_snapshot.aspects.append(
@@ -360,6 +393,64 @@ def _get_on_demand_feature_view_workunit(
return MetadataWorkUnit(id=on_demand_feature_view_name, mce=mce)
+ # If a tag is specified in a Feast object, then the tag will be ingested into Datahub if enable_tag_extraction is
+ # True, otherwise NO tags will be ingested
+ def _get_tags(self, obj: Union[Entity, FeatureView, FeastField]) -> list:
+ """
+ Extracts tags from the given object and returns a list of aspects.
+ """
+        aspects: List[GlobalTagsClass] = []
+
+ # Extract tags
+ if self.source_config.enable_tag_extraction:
+ if obj.tags.get("name"):
+ tag_name: str = obj.tags["name"]
+ tag_association = TagAssociationClass(
+ tag=builder.make_tag_urn(tag_name)
+ )
+ global_tags_aspect = GlobalTagsClass(tags=[tag_association])
+ aspects.append(global_tags_aspect)
+
+ return aspects
+
+ # If an owner is specified in a Feast object, it will only be ingested into Datahub if owner_mappings is specified
+ # and enable_owner_extraction is True in FeastRepositorySourceConfig, otherwise NO owners will be ingested
+ def _get_owners(self, obj: Union[Entity, FeatureView, FeastField]) -> list:
+ """
+ Extracts owners from the given object and returns a list of aspects.
+ """
+        aspects: List[OwnershipClass] = []
+
+ # Extract owner
+ if self.source_config.enable_owner_extraction:
+ owner = getattr(obj, "owner", None)
+ if owner:
+ # Create owner association, skipping if None
+ owner_association = self._create_owner_association(owner)
+ if owner_association: # Only add valid owner associations
+ owners_aspect = OwnershipClass(owners=[owner_association])
+ aspects.append(owners_aspect)
+
+ return aspects
+
+ def _create_owner_association(self, owner: str) -> Optional[OwnerClass]:
+ """
+ Create an OwnerClass instance for the given owner using the owner mappings.
+ """
+ if self.source_config.owner_mappings is not None:
+ for mapping in self.source_config.owner_mappings:
+ if mapping["feast_owner_name"] == owner:
+ ownership_type_class: str = mapping.get(
+ "datahub_ownership_type", "TECHNICAL_OWNER"
+ )
+ datahub_owner_urn = mapping.get("datahub_owner_urn")
+ if datahub_owner_urn:
+ return OwnerClass(
+ owner=datahub_owner_urn,
+ type=ownership_type_class,
+ )
+ return None
+
@classmethod
def create(cls, config_dict, ctx):
config = FeastRepositorySourceConfig.parse_obj(config_dict)
diff --git a/metadata-ingestion/src/datahub/ingestion/source/gc/datahub_gc.py b/metadata-ingestion/src/datahub/ingestion/source/gc/datahub_gc.py
index c4b4186f45fc3..52807ca2a3f02 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/gc/datahub_gc.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/gc/datahub_gc.py
@@ -144,15 +144,32 @@ def get_workunits_internal(
self,
) -> Iterable[MetadataWorkUnit]:
if self.config.cleanup_expired_tokens:
- self.revoke_expired_tokens()
+ try:
+ self.revoke_expired_tokens()
+ except Exception as e:
+ self.report.failure("While trying to cleanup expired token ", exc=e)
if self.config.truncate_indices:
- self.truncate_indices()
+ try:
+ self.truncate_indices()
+ except Exception as e:
+ self.report.failure("While trying to truncate indices ", exc=e)
if self.dataprocess_cleanup:
- yield from self.dataprocess_cleanup.get_workunits_internal()
+ try:
+ yield from self.dataprocess_cleanup.get_workunits_internal()
+ except Exception as e:
+ self.report.failure("While trying to cleanup data process ", exc=e)
if self.soft_deleted_entities_cleanup:
- self.soft_deleted_entities_cleanup.cleanup_soft_deleted_entities()
+ try:
+ self.soft_deleted_entities_cleanup.cleanup_soft_deleted_entities()
+ except Exception as e:
+ self.report.failure(
+ "While trying to cleanup soft deleted entities ", exc=e
+ )
if self.execution_request_cleanup:
- self.execution_request_cleanup.run()
+ try:
+ self.execution_request_cleanup.run()
+ except Exception as e:
+ self.report.failure("While trying to cleanup execution request ", exc=e)
yield from []
def truncate_indices(self) -> None:
diff --git a/metadata-ingestion/src/datahub/ingestion/source/gc/dataprocess_cleanup.py b/metadata-ingestion/src/datahub/ingestion/source/gc/dataprocess_cleanup.py
index 130f2c9c2e12f..0f35e1a67fede 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/gc/dataprocess_cleanup.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/gc/dataprocess_cleanup.py
@@ -404,7 +404,9 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
try:
self.delete_dpi_from_datajobs(datajob_entity)
except Exception as e:
- logger.error(f"While trying to delete {datajob_entity} got {e}")
+ self.report.failure(
+ f"While trying to delete {datajob_entity} ", exc=e
+ )
if (
datajob_entity.total_runs == 0
and self.config.delete_empty_data_jobs
diff --git a/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg.py b/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg.py
index 258a4b9ad6daf..5931873f54236 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg.py
@@ -9,6 +9,7 @@
NoSuchIcebergTableError,
NoSuchNamespaceError,
NoSuchPropertyException,
+ NoSuchTableError,
)
from pyiceberg.schema import Schema, SchemaVisitorPerPrimitiveType, visit
from pyiceberg.table import Table
@@ -104,7 +105,7 @@
@capability(SourceCapability.DESCRIPTIONS, "Enabled by default.")
@capability(
SourceCapability.OWNERSHIP,
- "Optionally enabled via configuration by specifying which Iceberg table property holds user or group ownership.",
+ "Automatically ingests ownership information from table properties based on `user_ownership_property` and `group_ownership_property`",
)
@capability(SourceCapability.DELETION_DETECTION, "Enabled via stateful ingestion")
class IcebergSource(StatefulIngestionSourceBase):
@@ -192,9 +193,7 @@ def _process_dataset(dataset_path: Identifier) -> Iterable[MetadataWorkUnit]:
table = thread_local.local_catalog.load_table(dataset_path)
time_taken = timer.elapsed_seconds()
self.report.report_table_load_time(time_taken)
- LOGGER.debug(
- f"Loaded table: {table.identifier}, time taken: {time_taken}"
- )
+ LOGGER.debug(f"Loaded table: {table.name()}, time taken: {time_taken}")
yield from self._create_iceberg_workunit(dataset_name, table)
except NoSuchPropertyException as e:
self.report.report_warning(
@@ -206,12 +205,20 @@ def _process_dataset(dataset_path: Identifier) -> Iterable[MetadataWorkUnit]:
)
except NoSuchIcebergTableError as e:
self.report.report_warning(
- "no-iceberg-table",
+ "not-an-iceberg-table",
f"Failed to create workunit for {dataset_name}. {e}",
)
LOGGER.warning(
f"NoSuchIcebergTableError while processing table {dataset_path}, skipping it.",
)
+ except NoSuchTableError as e:
+ self.report.report_warning(
+ "no-such-table",
+ f"Failed to create workunit for {dataset_name}. {e}",
+ )
+ LOGGER.warning(
+ f"NoSuchTableError while processing table {dataset_path}, skipping it.",
+ )
except Exception as e:
self.report.report_failure("general", f"Failed to create workunit: {e}")
LOGGER.exception(
diff --git a/metadata-ingestion/src/datahub/ingestion/source/kafka/kafka.py b/metadata-ingestion/src/datahub/ingestion/source/kafka/kafka.py
index e57dc853a83c6..709ba431f0f87 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/kafka/kafka.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/kafka/kafka.py
@@ -148,7 +148,7 @@ def get_kafka_consumer(
) -> confluent_kafka.Consumer:
consumer = confluent_kafka.Consumer(
{
- "group.id": "test",
+ "group.id": "datahub-kafka-ingestion",
"bootstrap.servers": connection.bootstrap,
**connection.consumer_config,
}
@@ -164,6 +164,25 @@ def get_kafka_consumer(
return consumer
+def get_kafka_admin_client(
+ connection: KafkaConsumerConnectionConfig,
+) -> AdminClient:
+ client = AdminClient(
+ {
+ "group.id": "datahub-kafka-ingestion",
+ "bootstrap.servers": connection.bootstrap,
+ **connection.consumer_config,
+ }
+ )
+ if CallableConsumerConfig.is_callable_config(connection.consumer_config):
+ # As per documentation, we need to explicitly call the poll method to make sure OAuth callback gets executed
+ # https://docs.confluent.io/platform/current/clients/confluent-kafka-python/html/index.html#kafka-client-configuration
+ logger.debug("Initiating polling for kafka admin client")
+ client.poll(timeout=30)
+ logger.debug("Initiated polling for kafka admin client")
+ return client
+
+
@dataclass
class KafkaSourceReport(StaleEntityRemovalSourceReport):
topics_scanned: int = 0
@@ -278,13 +297,7 @@ def __init__(self, config: KafkaSourceConfig, ctx: PipelineContext):
def init_kafka_admin_client(self) -> None:
try:
# TODO: Do we require separate config than existing consumer_config ?
- self.admin_client = AdminClient(
- {
- "group.id": "test",
- "bootstrap.servers": self.source_config.connection.bootstrap,
- **self.source_config.connection.consumer_config,
- }
- )
+ self.admin_client = get_kafka_admin_client(self.source_config.connection)
except Exception as e:
logger.debug(e, exc_info=e)
self.report.report_warning(
diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/__init__.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/__init__.py
index 1068f335e8f8e..e69de29bb2d1d 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/__init__.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/__init__.py
@@ -1 +0,0 @@
-from datahub.ingestion.source.powerbi.powerbi import PowerBiDashboardSource
diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py
index 91fa2e96be2cc..f7458c4eb4d5b 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py
@@ -173,7 +173,7 @@ class SupportedDataPlatform(Enum):
datahub_data_platform_name="redshift",
)
- DATABRICK_SQL = DataPlatformPair(
+ DATABRICKS_SQL = DataPlatformPair(
powerbi_data_platform_name="Databricks", datahub_data_platform_name="databricks"
)
@@ -313,8 +313,8 @@ class PowerBiDashboardSourceConfig(
" Note: This field works in conjunction with 'workspace_type_filter' and both must be considered when filtering workspaces.",
)
- # Dataset type mapping PowerBI support many type of data-sources. Here user need to define what type of PowerBI
- # DataSource need to be mapped to corresponding DataHub Platform DataSource. For example PowerBI `Snowflake` is
+    # Dataset type mapping: PowerBI supports many types of data sources. Here the user needs to define which type of PowerBI
+    # DataSource should be mapped to the corresponding DataHub Platform DataSource. For example, PowerBI `Snowflake` is
# mapped to DataHub `snowflake` PowerBI `PostgreSQL` is mapped to DataHub `postgres` and so on.
dataset_type_mapping: Union[
Dict[str, str], Dict[str, PlatformDetail]
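For reference, a sketch of the simple form of the `dataset_type_mapping` described in the comment above, expressed as a Python dict (the entries follow the examples in the comment; the per-platform `PlatformDetail` form and the rest of the PowerBI connection settings are omitted):

```python
# PowerBI data-source type -> DataHub platform name.
dataset_type_mapping = {
    "Snowflake": "snowflake",
    "PostgreSQL": "postgres",
}
```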
diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/data_classes.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/data_classes.py
index bb0c0c2f79bbd..f1691b5df68a9 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/data_classes.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/data_classes.py
@@ -1,10 +1,14 @@
import os
from abc import ABC
from dataclasses import dataclass
-from typing import Any, Dict, Optional
+from enum import Enum
+from typing import Any, Dict, List, Optional
from lark import Tree
+from datahub.ingestion.source.powerbi.config import DataPlatformPair
+from datahub.sql_parsing.sqlglot_lineage import ColumnLineageInfo
+
TRACE_POWERBI_MQUERY_PARSER = os.getenv("DATAHUB_TRACE_POWERBI_MQUERY_PARSER", False)
@@ -30,7 +34,7 @@ class IdentifierAccessor(AbstractIdentifierAccessor):
"[Schema="public",Item="order_date"]" is "items" in ItemSelector. Data of items varies as per DataSource
- "public_order_date" is in "next" of ItemSelector. The "next" will be None if this identifier is leaf i.e. table
+ "public_order_date" is in "next" of ItemSelector. The "next" will be None if this identifier is leaf i.e., table
"""
@@ -53,3 +57,31 @@ class ReferencedTable:
database: str
schema: str
table: str
+
+
+@dataclass
+class DataPlatformTable:
+ data_platform_pair: DataPlatformPair
+ urn: str
+
+
+@dataclass
+class Lineage:
+ upstreams: List[DataPlatformTable]
+ column_lineage: List[ColumnLineageInfo]
+
+ @staticmethod
+ def empty() -> "Lineage":
+ return Lineage(upstreams=[], column_lineage=[])
+
+
+class FunctionName(Enum):
+ NATIVE_QUERY = "Value.NativeQuery"
+ POSTGRESQL_DATA_ACCESS = "PostgreSQL.Database"
+ ORACLE_DATA_ACCESS = "Oracle.Database"
+ SNOWFLAKE_DATA_ACCESS = "Snowflake.Databases"
+ MSSQL_DATA_ACCESS = "Sql.Database"
+ DATABRICK_DATA_ACCESS = "Databricks.Catalogs"
+ GOOGLE_BIGQUERY_DATA_ACCESS = "GoogleBigQuery.Database"
+ AMAZON_REDSHIFT_DATA_ACCESS = "AmazonRedshift.Database"
+ DATABRICK_MULTI_CLOUD_DATA_ACCESS = "DatabricksMultiCloud.Catalogs"
diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/parser.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/parser.py
index 97698a3d0d56c..2a5de7494920b 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/parser.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/parser.py
@@ -7,6 +7,7 @@
import lark
from lark import Lark, Tree
+import datahub.ingestion.source.powerbi.m_query.data_classes
from datahub.ingestion.api.common import PipelineContext
from datahub.ingestion.source.powerbi.config import (
PowerBiDashboardSourceConfig,
@@ -65,7 +66,7 @@ def get_upstream_tables(
ctx: PipelineContext,
config: PowerBiDashboardSourceConfig,
parameters: Dict[str, str] = {},
-) -> List[resolver.Lineage]:
+) -> List[datahub.ingestion.source.powerbi.m_query.data_classes.Lineage]:
if table.expression is None:
logger.debug(f"There is no M-Query expression in table {table.full_name}")
return []
@@ -127,12 +128,14 @@ def get_upstream_tables(
reporter.m_query_parse_successes += 1
try:
- lineage: List[resolver.Lineage] = resolver.MQueryResolver(
+ lineage: List[
+ datahub.ingestion.source.powerbi.m_query.data_classes.Lineage
+ ] = resolver.MQueryResolver(
table=table,
parse_tree=parse_tree,
reporter=reporter,
parameters=parameters,
- ).resolve_to_data_platform_table_list(
+ ).resolve_to_lineage(
ctx=ctx,
config=config,
platform_instance_resolver=platform_instance_resolver,
diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/pattern_handler.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/pattern_handler.py
new file mode 100644
index 0000000000000..13d97a7029029
--- /dev/null
+++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/pattern_handler.py
@@ -0,0 +1,920 @@
+import logging
+from abc import ABC, abstractmethod
+from enum import Enum
+from typing import Dict, List, Optional, Tuple, Type, Union, cast
+
+from lark import Tree
+
+from datahub.emitter import mce_builder as builder
+from datahub.ingestion.api.common import PipelineContext
+from datahub.ingestion.source.powerbi.config import (
+ Constant,
+ DataBricksPlatformDetail,
+ DataPlatformPair,
+ PlatformDetail,
+ PowerBiDashboardSourceConfig,
+ PowerBiDashboardSourceReport,
+ PowerBIPlatformDetail,
+ SupportedDataPlatform,
+)
+from datahub.ingestion.source.powerbi.dataplatform_instance_resolver import (
+ AbstractDataPlatformInstanceResolver,
+)
+from datahub.ingestion.source.powerbi.m_query import native_sql_parser, tree_function
+from datahub.ingestion.source.powerbi.m_query.data_classes import (
+ AbstractIdentifierAccessor,
+ DataAccessFunctionDetail,
+ DataPlatformTable,
+ FunctionName,
+ IdentifierAccessor,
+ Lineage,
+ ReferencedTable,
+)
+from datahub.ingestion.source.powerbi.rest_api_wrapper.data_classes import Table
+from datahub.sql_parsing.sqlglot_lineage import SqlParsingResult
+
+logger = logging.getLogger(__name__)
+
+
+def get_next_item(items: List[str], item: str) -> Optional[str]:
+ if item in items:
+ try:
+ index = items.index(item)
+ return items[index + 1]
+ except IndexError:
+            logger.debug(f'item:"{item}" is the last element; no next item in list: {items}')
+ return None
+
+
+def urn_to_lowercase(value: str, flag: bool) -> str:
+ if flag is True:
+ return value.lower()
+
+ return value
+
+
+def make_urn(
+ config: PowerBiDashboardSourceConfig,
+ platform_instance_resolver: AbstractDataPlatformInstanceResolver,
+ data_platform_pair: DataPlatformPair,
+ server: str,
+ qualified_table_name: str,
+) -> str:
+ platform_detail: PlatformDetail = platform_instance_resolver.get_platform_instance(
+ PowerBIPlatformDetail(
+ data_platform_pair=data_platform_pair,
+ data_platform_server=server,
+ )
+ )
+
+ return builder.make_dataset_urn_with_platform_instance(
+ platform=data_platform_pair.datahub_data_platform_name,
+ platform_instance=platform_detail.platform_instance,
+ env=platform_detail.env,
+ name=urn_to_lowercase(
+ qualified_table_name, config.convert_lineage_urns_to_lowercase
+ ),
+ )
+
+
+class AbstractLineage(ABC):
+ """
+    Base class that shares common functionality among the different data platforms for M-Query parsing.
+
+    To create a qualified table name we need to parse the M-Query data-access functions (https://learn.microsoft.com/en-us/powerquery-m/accessing-data-functions);
+    each data-access function follows a defined pattern to access the database name, schema name and table name, for example, see the M-Query below.
+
+ let
+ Source = Sql.Database("localhost", "library"),
+ dbo_book_issue = Source{[Schema="dbo",Item="book_issue"]}[Data]
+ in
+ dbo_book_issue
+
+    This is an MSSQL M-Query, and Sql.Database is the data-access function for MSSQL. When this function is present, the database name is in the second argument of the first statement, while the schema name and table name are in the second statement. The second statement can be repeated to access different tables from MSSQL.
+
+    TwoStepDataAccessPattern extends AbstractLineage and provides the common functionality for data platforms that follow the above type of M-Query pattern.
+
+    The data-access function varies per data platform, for example MySQL.Database for MySQL, PostgreSQL.Database for Postgres and Oracle.Database for Oracle, and the number of statements needed to
+    find the database name, schema name and table name also varies per data platform.
+
+    Value.NativeQuery is the function used to execute a native query inside M-Query, for example, see the M-Query below.
+
+ let
+ Source = Value.NativeQuery(AmazonRedshift.Database("redshift-url","dev"), "select * from dev.public.category", null, [EnableFolding=true])
+ in
+ Source
+
+    In this M-Query the database name is available in the first argument, and the rest of the detail, i.e. the database and schema, is available in the native query.
+
+    NativeQueryLineage extends AbstractLineage to support Redshift and Snowflake native query parsing.
+
+ """
+
+ ctx: PipelineContext
+ table: Table
+ config: PowerBiDashboardSourceConfig
+ reporter: PowerBiDashboardSourceReport
+ platform_instance_resolver: AbstractDataPlatformInstanceResolver
+
+ def __init__(
+ self,
+ ctx: PipelineContext,
+ table: Table,
+ config: PowerBiDashboardSourceConfig,
+ reporter: PowerBiDashboardSourceReport,
+ platform_instance_resolver: AbstractDataPlatformInstanceResolver,
+ ) -> None:
+ super().__init__()
+ self.ctx = ctx
+ self.table = table
+ self.config = config
+ self.reporter = reporter
+ self.platform_instance_resolver = platform_instance_resolver
+
+ @abstractmethod
+ def create_lineage(
+ self, data_access_func_detail: DataAccessFunctionDetail
+ ) -> Lineage:
+ pass
+
+ @abstractmethod
+ def get_platform_pair(self) -> DataPlatformPair:
+ pass
+
+ @staticmethod
+ def get_db_detail_from_argument(
+ arg_list: Tree,
+ ) -> Tuple[Optional[str], Optional[str]]:
+ arguments: List[str] = tree_function.strip_char_from_list(
+ values=tree_function.remove_whitespaces_from_list(
+ tree_function.token_values(arg_list)
+ ),
+ )
+
+ if len(arguments) < 2:
+ logger.debug(f"Expected minimum 2 arguments, but got {len(arguments)}")
+ return None, None
+
+ return arguments[0], arguments[1]
+
+ @staticmethod
+ def create_reference_table(
+ arg_list: Tree,
+ table_detail: Dict[str, str],
+ ) -> Optional[ReferencedTable]:
+ arguments: List[str] = tree_function.strip_char_from_list(
+ values=tree_function.remove_whitespaces_from_list(
+ tree_function.token_values(arg_list)
+ ),
+ )
+
+ logger.debug(f"Processing arguments {arguments}")
+
+ if (
+ len(arguments)
+ >= 4 # [0] is warehouse FQDN.
+ # [1] is endpoint, we are not using it.
+ # [2] is "Catalog" key
+ # [3] is catalog's value
+ ):
+ return ReferencedTable(
+ warehouse=arguments[0],
+ catalog=arguments[3],
+ # As per my observation, database and catalog names are same in M-Query
+ database=table_detail["Database"]
+ if table_detail.get("Database")
+ else arguments[3],
+ schema=table_detail["Schema"],
+ table=table_detail.get("Table") or table_detail["View"],
+ )
+ elif len(arguments) == 2:
+ return ReferencedTable(
+ warehouse=arguments[0],
+ database=table_detail["Database"],
+ schema=table_detail["Schema"],
+ table=table_detail.get("Table") or table_detail["View"],
+ catalog=None,
+ )
+
+ return None
+
+ def parse_custom_sql(
+ self, query: str, server: str, database: Optional[str], schema: Optional[str]
+ ) -> Lineage:
+ dataplatform_tables: List[DataPlatformTable] = []
+
+ platform_detail: PlatformDetail = (
+ self.platform_instance_resolver.get_platform_instance(
+ PowerBIPlatformDetail(
+ data_platform_pair=self.get_platform_pair(),
+ data_platform_server=server,
+ )
+ )
+ )
+
+ query = native_sql_parser.remove_drop_statement(
+ native_sql_parser.remove_special_characters(query)
+ )
+
+ parsed_result: Optional[
+ "SqlParsingResult"
+ ] = native_sql_parser.parse_custom_sql(
+ ctx=self.ctx,
+ query=query,
+ platform=self.get_platform_pair().datahub_data_platform_name,
+ platform_instance=platform_detail.platform_instance,
+ env=platform_detail.env,
+ database=database,
+ schema=schema,
+ )
+
+ if parsed_result is None:
+ self.reporter.info(
+ title=Constant.SQL_PARSING_FAILURE,
+ message="Fail to parse native sql present in PowerBI M-Query",
+ context=f"table-name={self.table.full_name}, sql={query}",
+ )
+ return Lineage.empty()
+
+ if parsed_result.debug_info and parsed_result.debug_info.table_error:
+ self.reporter.warning(
+ title=Constant.SQL_PARSING_FAILURE,
+ message="Fail to parse native sql present in PowerBI M-Query",
+ context=f"table-name={self.table.full_name}, error={parsed_result.debug_info.table_error},sql={query}",
+ )
+ return Lineage.empty()
+
+ for urn in parsed_result.in_tables:
+ dataplatform_tables.append(
+ DataPlatformTable(
+ data_platform_pair=self.get_platform_pair(),
+ urn=urn,
+ )
+ )
+
+ logger.debug(f"Native Query parsed result={parsed_result}")
+ logger.debug(f"Generated dataplatform_tables={dataplatform_tables}")
+
+ return Lineage(
+ upstreams=dataplatform_tables,
+ column_lineage=(
+ parsed_result.column_lineage
+ if parsed_result.column_lineage is not None
+ else []
+ ),
+ )
+
+
+class AmazonRedshiftLineage(AbstractLineage):
+ def get_platform_pair(self) -> DataPlatformPair:
+ return SupportedDataPlatform.AMAZON_REDSHIFT.value
+
+ def create_lineage(
+ self, data_access_func_detail: DataAccessFunctionDetail
+ ) -> Lineage:
+ logger.debug(
+ f"Processing AmazonRedshift data-access function detail {data_access_func_detail}"
+ )
+
+ server, db_name = self.get_db_detail_from_argument(
+ data_access_func_detail.arg_list
+ )
+ if db_name is None or server is None:
+            return Lineage.empty()  # Return an empty Lineage
+
+ schema_name: str = cast(
+ IdentifierAccessor, data_access_func_detail.identifier_accessor
+ ).items["Name"]
+
+ table_name: str = cast(
+ IdentifierAccessor,
+ cast(IdentifierAccessor, data_access_func_detail.identifier_accessor).next,
+ ).items["Name"]
+
+ qualified_table_name: str = f"{db_name}.{schema_name}.{table_name}"
+
+ urn = make_urn(
+ config=self.config,
+ platform_instance_resolver=self.platform_instance_resolver,
+ data_platform_pair=self.get_platform_pair(),
+ server=server,
+ qualified_table_name=qualified_table_name,
+ )
+
+ return Lineage(
+ upstreams=[
+ DataPlatformTable(
+ data_platform_pair=self.get_platform_pair(),
+ urn=urn,
+ )
+ ],
+ column_lineage=[],
+ )
+
+
+class OracleLineage(AbstractLineage):
+ def get_platform_pair(self) -> DataPlatformPair:
+ return SupportedDataPlatform.ORACLE.value
+
+ @staticmethod
+ def _get_server_and_db_name(value: str) -> Tuple[Optional[str], Optional[str]]:
+ error_message: str = (
+ f"The target argument ({value}) should in the format of :/["
+ ".]"
+ )
+ splitter_result: List[str] = value.split("/")
+ if len(splitter_result) != 2:
+ logger.debug(error_message)
+ return None, None
+
+ db_name = splitter_result[1].split(".")[0]
+
+ return tree_function.strip_char_from_list([splitter_result[0]])[0], db_name
+
+ def create_lineage(
+ self, data_access_func_detail: DataAccessFunctionDetail
+ ) -> Lineage:
+ logger.debug(
+ f"Processing Oracle data-access function detail {data_access_func_detail}"
+ )
+
+ arguments: List[str] = tree_function.remove_whitespaces_from_list(
+ tree_function.token_values(data_access_func_detail.arg_list)
+ )
+
+ server, db_name = self._get_server_and_db_name(arguments[0])
+
+ if db_name is None or server is None:
+ return Lineage.empty()
+
+ schema_name: str = cast(
+ IdentifierAccessor, data_access_func_detail.identifier_accessor
+ ).items["Schema"]
+
+ table_name: str = cast(
+ IdentifierAccessor,
+ cast(IdentifierAccessor, data_access_func_detail.identifier_accessor).next,
+ ).items["Name"]
+
+ qualified_table_name: str = f"{db_name}.{schema_name}.{table_name}"
+
+ urn = make_urn(
+ config=self.config,
+ platform_instance_resolver=self.platform_instance_resolver,
+ data_platform_pair=self.get_platform_pair(),
+ server=server,
+ qualified_table_name=qualified_table_name,
+ )
+
+ return Lineage(
+ upstreams=[
+ DataPlatformTable(
+ data_platform_pair=self.get_platform_pair(),
+ urn=urn,
+ )
+ ],
+ column_lineage=[],
+ )
+
+
+class DatabricksLineage(AbstractLineage):
+ def form_qualified_table_name(
+ self,
+ table_reference: ReferencedTable,
+ data_platform_pair: DataPlatformPair,
+ ) -> str:
+ platform_detail: PlatformDetail = (
+ self.platform_instance_resolver.get_platform_instance(
+ PowerBIPlatformDetail(
+ data_platform_pair=data_platform_pair,
+ data_platform_server=table_reference.warehouse,
+ )
+ )
+ )
+
+ metastore: Optional[str] = None
+
+ qualified_table_name: str = f"{table_reference.database}.{table_reference.schema}.{table_reference.table}"
+
+ if isinstance(platform_detail, DataBricksPlatformDetail):
+ metastore = platform_detail.metastore
+
+ if metastore is not None:
+ return f"{metastore}.{qualified_table_name}"
+
+ return qualified_table_name
+
+ def create_lineage(
+ self, data_access_func_detail: DataAccessFunctionDetail
+ ) -> Lineage:
+ logger.debug(
+ f"Processing Databrick data-access function detail {data_access_func_detail}"
+ )
+ table_detail: Dict[str, str] = {}
+ temp_accessor: Optional[
+ Union[IdentifierAccessor, AbstractIdentifierAccessor]
+ ] = data_access_func_detail.identifier_accessor
+
+ while temp_accessor:
+ if isinstance(temp_accessor, IdentifierAccessor):
+ # Condition to handle databricks M-query pattern where table, schema and database all are present in
+ # the same invoke statement
+ if all(
+ element in temp_accessor.items
+ for element in ["Item", "Schema", "Catalog"]
+ ):
+ table_detail["Schema"] = temp_accessor.items["Schema"]
+ table_detail["Table"] = temp_accessor.items["Item"]
+ else:
+ table_detail[temp_accessor.items["Kind"]] = temp_accessor.items[
+ "Name"
+ ]
+
+ if temp_accessor.next is not None:
+ temp_accessor = temp_accessor.next
+ else:
+ break
+ else:
+ logger.debug(
+ "expecting instance to be IdentifierAccessor, please check if parsing is done properly"
+ )
+ return Lineage.empty()
+
+ table_reference = self.create_reference_table(
+ arg_list=data_access_func_detail.arg_list,
+ table_detail=table_detail,
+ )
+
+ if table_reference:
+ qualified_table_name: str = self.form_qualified_table_name(
+ table_reference=table_reference,
+ data_platform_pair=self.get_platform_pair(),
+ )
+
+ urn = make_urn(
+ config=self.config,
+ platform_instance_resolver=self.platform_instance_resolver,
+ data_platform_pair=self.get_platform_pair(),
+ server=table_reference.warehouse,
+ qualified_table_name=qualified_table_name,
+ )
+
+ return Lineage(
+ upstreams=[
+ DataPlatformTable(
+ data_platform_pair=self.get_platform_pair(),
+ urn=urn,
+ )
+ ],
+ column_lineage=[],
+ )
+
+ return Lineage.empty()
+
+ def get_platform_pair(self) -> DataPlatformPair:
+ return SupportedDataPlatform.DATABRICKS_SQL.value
+
+
+class TwoStepDataAccessPattern(AbstractLineage, ABC):
+ """
+ These are the DataSource for which PowerBI Desktop generates default M-Query of the following pattern
+ let
+ Source = Sql.Database("localhost", "library"),
+ dbo_book_issue = Source{[Schema="dbo",Item="book_issue"]}[Data]
+ in
+ dbo_book_issue
+ """
+
+ def two_level_access_pattern(
+ self, data_access_func_detail: DataAccessFunctionDetail
+ ) -> Lineage:
+ logger.debug(
+ f"Processing {self.get_platform_pair().powerbi_data_platform_name} data-access function detail {data_access_func_detail}"
+ )
+
+ server, db_name = self.get_db_detail_from_argument(
+ data_access_func_detail.arg_list
+ )
+ if server is None or db_name is None:
+            return Lineage.empty()  # Return an empty Lineage
+
+ schema_name: str = cast(
+ IdentifierAccessor, data_access_func_detail.identifier_accessor
+ ).items["Schema"]
+
+ table_name: str = cast(
+ IdentifierAccessor, data_access_func_detail.identifier_accessor
+ ).items["Item"]
+
+ qualified_table_name: str = f"{db_name}.{schema_name}.{table_name}"
+
+ logger.debug(
+ f"Platform({self.get_platform_pair().datahub_data_platform_name}) qualified_table_name= {qualified_table_name}"
+ )
+
+ urn = make_urn(
+ config=self.config,
+ platform_instance_resolver=self.platform_instance_resolver,
+ data_platform_pair=self.get_platform_pair(),
+ server=server,
+ qualified_table_name=qualified_table_name,
+ )
+ return Lineage(
+ upstreams=[
+ DataPlatformTable(
+ data_platform_pair=self.get_platform_pair(),
+ urn=urn,
+ )
+ ],
+ column_lineage=[],
+ )
+
+
+class PostgresLineage(TwoStepDataAccessPattern):
+ def create_lineage(
+ self, data_access_func_detail: DataAccessFunctionDetail
+ ) -> Lineage:
+ return self.two_level_access_pattern(data_access_func_detail)
+
+ def get_platform_pair(self) -> DataPlatformPair:
+ return SupportedDataPlatform.POSTGRES_SQL.value
+
+
+class MSSqlLineage(TwoStepDataAccessPattern):
+ # https://learn.microsoft.com/en-us/sql/relational-databases/security/authentication-access/ownership-and-user-schema-separation?view=sql-server-ver16
+ DEFAULT_SCHEMA = "dbo" # Default schema name in MS-SQL is dbo
+
+ def get_platform_pair(self) -> DataPlatformPair:
+ return SupportedDataPlatform.MS_SQL.value
+
+ def create_urn_using_old_parser(
+ self, query: str, db_name: str, server: str
+ ) -> List[DataPlatformTable]:
+ dataplatform_tables: List[DataPlatformTable] = []
+
+ tables: List[str] = native_sql_parser.get_tables(query)
+
+ for parsed_table in tables:
+ components = [v.strip("[]") for v in parsed_table.split(".")]
+ if len(components) == 3:
+ database, schema, table = components
+ elif len(components) == 2:
+ schema, table = components
+ database = db_name
+ elif len(components) == 1:
+ (table,) = components
+ database = db_name
+ schema = MSSqlLineage.DEFAULT_SCHEMA
+ else:
+ self.reporter.warning(
+ title="Invalid table format",
+ message="The advanced SQL lineage feature (enable_advance_lineage_sql_construct) is disabled. Please either enable this feature or ensure the table is referenced as .. in the SQL.",
+ context=f"table-name={self.table.full_name}",
+ )
+ continue
+
+ qualified_table_name = f"{database}.{schema}.{table}"
+ urn = make_urn(
+ config=self.config,
+ platform_instance_resolver=self.platform_instance_resolver,
+ data_platform_pair=self.get_platform_pair(),
+ server=server,
+ qualified_table_name=qualified_table_name,
+ )
+ dataplatform_tables.append(
+ DataPlatformTable(
+ data_platform_pair=self.get_platform_pair(),
+ urn=urn,
+ )
+ )
+
+ logger.debug(f"Generated upstream tables = {dataplatform_tables}")
+
+ return dataplatform_tables
+
+ def create_lineage(
+ self, data_access_func_detail: DataAccessFunctionDetail
+ ) -> Lineage:
+ arguments: List[str] = tree_function.strip_char_from_list(
+ values=tree_function.remove_whitespaces_from_list(
+ tree_function.token_values(data_access_func_detail.arg_list)
+ ),
+ )
+
+ server, database = self.get_db_detail_from_argument(
+ data_access_func_detail.arg_list
+ )
+ if server is None or database is None:
+            return Lineage.empty()  # Return an empty Lineage
+
+ assert server
+        assert database  # to silence the lint
+
+ query: Optional[str] = get_next_item(arguments, "Query")
+ if query:
+ if self.config.enable_advance_lineage_sql_construct is False:
+ # Use previous parser to generate URN to keep backward compatibility
+ return Lineage(
+ upstreams=self.create_urn_using_old_parser(
+ query=query,
+ db_name=database,
+ server=server,
+ ),
+ column_lineage=[],
+ )
+
+ return self.parse_custom_sql(
+ query=query,
+ database=database,
+ server=server,
+ schema=MSSqlLineage.DEFAULT_SCHEMA,
+ )
+
+ # It is the regular MS-SQL case (no inline native query)
+ logger.debug("Handling the regular case")
+ return self.two_level_access_pattern(data_access_func_detail)
+
+
+class ThreeStepDataAccessPattern(AbstractLineage, ABC):
+ def get_datasource_server(
+ self, arguments: List[str], data_access_func_detail: DataAccessFunctionDetail
+ ) -> str:
+ return tree_function.strip_char_from_list([arguments[0]])[0]
+
+ def create_lineage(
+ self, data_access_func_detail: DataAccessFunctionDetail
+ ) -> Lineage:
+ logger.debug(
+ f"Processing {self.get_platform_pair().datahub_data_platform_name} function detail {data_access_func_detail}"
+ )
+
+ arguments: List[str] = tree_function.remove_whitespaces_from_list(
+ tree_function.token_values(data_access_func_detail.arg_list)
+ )
+ # First is database name
+ db_name: str = data_access_func_detail.identifier_accessor.items["Name"] # type: ignore
+ # Second is schema name
+ schema_name: str = cast(
+ IdentifierAccessor, data_access_func_detail.identifier_accessor.next # type: ignore
+ ).items["Name"]
+ # Third is table name
+ table_name: str = cast(
+ IdentifierAccessor, data_access_func_detail.identifier_accessor.next.next # type: ignore
+ ).items["Name"]
+
+ qualified_table_name: str = f"{db_name}.{schema_name}.{table_name}"
+
+ logger.debug(
+ f"{self.get_platform_pair().datahub_data_platform_name} qualified_table_name {qualified_table_name}"
+ )
+
+ server: str = self.get_datasource_server(arguments, data_access_func_detail)
+
+ urn = make_urn(
+ config=self.config,
+ platform_instance_resolver=self.platform_instance_resolver,
+ data_platform_pair=self.get_platform_pair(),
+ server=server,
+ qualified_table_name=qualified_table_name,
+ )
+
+ return Lineage(
+ upstreams=[
+ DataPlatformTable(
+ data_platform_pair=self.get_platform_pair(),
+ urn=urn,
+ )
+ ],
+ column_lineage=[],
+ )
+
+
+class SnowflakeLineage(ThreeStepDataAccessPattern):
+ def get_platform_pair(self) -> DataPlatformPair:
+ return SupportedDataPlatform.SNOWFLAKE.value
+
+
+class GoogleBigQueryLineage(ThreeStepDataAccessPattern):
+ def get_platform_pair(self) -> DataPlatformPair:
+ return SupportedDataPlatform.GOOGLE_BIGQUERY.value
+
+ def get_datasource_server(
+ self, arguments: List[str], data_access_func_detail: DataAccessFunctionDetail
+ ) -> str:
+ # In Google BigQuery, the server is the project name
+ # The condition below silences the lint; identifier_accessor is not going to be None here
+ return (
+ data_access_func_detail.identifier_accessor.items["Name"]
+ if data_access_func_detail.identifier_accessor is not None
+ else ""
+ )
+
+
+class NativeQueryLineage(AbstractLineage):
+ SUPPORTED_NATIVE_QUERY_DATA_PLATFORM: dict = {
+ SupportedDataPlatform.SNOWFLAKE.value.powerbi_data_platform_name: SupportedDataPlatform.SNOWFLAKE,
+ SupportedDataPlatform.AMAZON_REDSHIFT.value.powerbi_data_platform_name: SupportedDataPlatform.AMAZON_REDSHIFT,
+ SupportedDataPlatform.DatabricksMultiCloud_SQL.value.powerbi_data_platform_name: SupportedDataPlatform.DatabricksMultiCloud_SQL,
+ }
+ current_data_platform: SupportedDataPlatform = SupportedDataPlatform.SNOWFLAKE
+
+ def get_platform_pair(self) -> DataPlatformPair:
+ return self.current_data_platform.value
+
+ @staticmethod
+ def is_native_parsing_supported(data_access_function_name: str) -> bool:
+ return (
+ data_access_function_name
+ in NativeQueryLineage.SUPPORTED_NATIVE_QUERY_DATA_PLATFORM
+ )
+
+ def create_urn_using_old_parser(self, query: str, server: str) -> Lineage:
+ dataplatform_tables: List[DataPlatformTable] = []
+
+ tables: List[str] = native_sql_parser.get_tables(query)
+
+ for qualified_table_name in tables:
+ if len(qualified_table_name.split(".")) != 3:
+ logger.debug(
+ f"Skipping table {qualified_table_name} as it is not as per qualified_table_name format"
+ )
+ continue
+
+ urn = make_urn(
+ config=self.config,
+ platform_instance_resolver=self.platform_instance_resolver,
+ data_platform_pair=self.get_platform_pair(),
+ server=server,
+ qualified_table_name=qualified_table_name,
+ )
+
+ dataplatform_tables.append(
+ DataPlatformTable(
+ data_platform_pair=self.get_platform_pair(),
+ urn=urn,
+ )
+ )
+
+ logger.debug(f"Generated dataplatform_tables {dataplatform_tables}")
+
+ return Lineage(
+ upstreams=dataplatform_tables,
+ column_lineage=[],
+ )
+
+ def get_db_name(self, data_access_tokens: List[str]) -> Optional[str]:
+ if (
+ data_access_tokens[0]
+ != SupportedDataPlatform.DatabricksMultiCloud_SQL.value.powerbi_data_platform_name
+ ):
+ return None
+
+ database: Optional[str] = get_next_item(data_access_tokens, "Database")
+
+ if (
+ database and database != Constant.M_QUERY_NULL
+ ): # database name is explicitly set
+ return database
+
+ return get_next_item( # database name is set in Name argument
+ data_access_tokens, "Name"
+ ) or get_next_item( # If both above arguments are not available, then try Catalog
+ data_access_tokens, "Catalog"
+ )
+
+ def create_lineage(
+ self, data_access_func_detail: DataAccessFunctionDetail
+ ) -> Lineage:
+ t1: Tree = cast(
+ Tree, tree_function.first_arg_list_func(data_access_func_detail.arg_list)
+ )
+ flat_argument_list: List[Tree] = tree_function.flat_argument_list(t1)
+
+ if len(flat_argument_list) != 2:
+ logger.debug(
+ f"Expecting 2 argument, actual argument count is {len(flat_argument_list)}"
+ )
+ logger.debug(f"Flat argument list = {flat_argument_list}")
+ return Lineage.empty()
+
+ data_access_tokens: List[str] = tree_function.remove_whitespaces_from_list(
+ tree_function.token_values(flat_argument_list[0])
+ )
+
+ if not self.is_native_parsing_supported(data_access_tokens[0]):
+ logger.debug(
+ f"Unsupported native-query data-platform = {data_access_tokens[0]}"
+ )
+ logger.debug(
+ f"NativeQuery is supported only for {self.SUPPORTED_NATIVE_QUERY_DATA_PLATFORM}"
+ )
+
+ return Lineage.empty()
+
+ if len(data_access_tokens[0]) < 3:
+ logger.debug(
+ f"Server is not available in argument list for data-platform {data_access_tokens[0]}. Returning empty "
+ "list"
+ )
+ return Lineage.empty()
+
+ self.current_data_platform = self.SUPPORTED_NATIVE_QUERY_DATA_PLATFORM[
+ data_access_tokens[0]
+ ]
+ # The second argument is the query
+ sql_query: str = tree_function.strip_char_from_list(
+ values=tree_function.remove_whitespaces_from_list(
+ tree_function.token_values(flat_argument_list[1])
+ ),
+ )[
+ 0
+ ] # Remove any whitespace and double-quote characters
+
+ server = tree_function.strip_char_from_list([data_access_tokens[2]])[0]
+
+ if self.config.enable_advance_lineage_sql_construct is False:
+ # Use previous parser to generate URN to keep backward compatibility
+ return self.create_urn_using_old_parser(
+ query=sql_query,
+ server=server,
+ )
+
+ database_name: Optional[str] = self.get_db_name(data_access_tokens)
+
+ return self.parse_custom_sql(
+ query=sql_query,
+ server=server,
+ database=database_name,
+ schema=None,
+ )
+
+
+class SupportedPattern(Enum):
+ DATABRICKS_QUERY = (
+ DatabricksLineage,
+ FunctionName.DATABRICK_DATA_ACCESS,
+ )
+
+ DATABRICKS_MULTI_CLOUD = (
+ DatabricksLineage,
+ FunctionName.DATABRICK_MULTI_CLOUD_DATA_ACCESS,
+ )
+
+ POSTGRES_SQL = (
+ PostgresLineage,
+ FunctionName.POSTGRESQL_DATA_ACCESS,
+ )
+
+ ORACLE = (
+ OracleLineage,
+ FunctionName.ORACLE_DATA_ACCESS,
+ )
+
+ SNOWFLAKE = (
+ SnowflakeLineage,
+ FunctionName.SNOWFLAKE_DATA_ACCESS,
+ )
+
+ MS_SQL = (
+ MSSqlLineage,
+ FunctionName.MSSQL_DATA_ACCESS,
+ )
+
+ GOOGLE_BIG_QUERY = (
+ GoogleBigQueryLineage,
+ FunctionName.GOOGLE_BIGQUERY_DATA_ACCESS,
+ )
+
+ AMAZON_REDSHIFT = (
+ AmazonRedshiftLineage,
+ FunctionName.AMAZON_REDSHIFT_DATA_ACCESS,
+ )
+
+ NATIVE_QUERY = (
+ NativeQueryLineage,
+ FunctionName.NATIVE_QUERY,
+ )
+
+ def handler(self) -> Type[AbstractLineage]:
+ return self.value[0]
+
+ def function_name(self) -> str:
+ return self.value[1].value
+
+ @staticmethod
+ def get_function_names() -> List[str]:
+ functions: List[str] = []
+ for supported_resolver in SupportedPattern:
+ functions.append(supported_resolver.function_name())
+
+ return functions
+
+ @staticmethod
+ def get_pattern_handler(function_name: str) -> Optional["SupportedPattern"]:
+ logger.debug(f"Looking for pattern-handler for {function_name}")
+ for supported_resolver in SupportedPattern:
+ if function_name == supported_resolver.function_name():
+ return supported_resolver
+ logger.debug(f"pattern-handler not found for function_name {function_name}")
+ return None
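
Taken together, the new pattern_handler module is a dispatch table for M-Query data-access functions: SupportedPattern maps a function name to an AbstractLineage subclass, which is instantiated with the source context and asked for lineage. A minimal sketch of that flow (not part of the patch; it assumes ctx, table, config, reporter, platform_instance_resolver and data_access_func_detail are already built the way MQueryResolver builds them):

from datahub.ingestion.source.powerbi.m_query.pattern_handler import SupportedPattern

# "Sql.Database" is the MS-SQL data-access function, so this resolves to SupportedPattern.MS_SQL.
pattern = SupportedPattern.get_pattern_handler("Sql.Database")
if pattern is not None:
    # handler() returns the AbstractLineage subclass (here MSSqlLineage); instantiate it and ask for lineage.
    handler = pattern.handler()(
        ctx=ctx,
        table=table,
        config=config,
        reporter=reporter,
        platform_instance_resolver=platform_instance_resolver,
    )
    lineage = handler.create_lineage(data_access_func_detail)  # Lineage(upstreams=[...], column_lineage=[...])
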
diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py
index a40e67d08da5b..81a0e1ef2d79b 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py
@@ -1,286 +1,33 @@
import logging
from abc import ABC, abstractmethod
-from dataclasses import dataclass
-from enum import Enum
-from typing import Any, Dict, List, Optional, Tuple, Type, Union, cast
+from typing import Any, Dict, List, Optional, Tuple, Union, cast
from lark import Tree
-import datahub.emitter.mce_builder as builder
from datahub.ingestion.api.common import PipelineContext
from datahub.ingestion.source.powerbi.config import (
- Constant,
- DataBricksPlatformDetail,
- DataPlatformPair,
- PlatformDetail,
PowerBiDashboardSourceConfig,
PowerBiDashboardSourceReport,
- PowerBIPlatformDetail,
- SupportedDataPlatform,
)
from datahub.ingestion.source.powerbi.dataplatform_instance_resolver import (
AbstractDataPlatformInstanceResolver,
)
-from datahub.ingestion.source.powerbi.m_query import native_sql_parser, tree_function
+from datahub.ingestion.source.powerbi.m_query import tree_function
from datahub.ingestion.source.powerbi.m_query.data_classes import (
TRACE_POWERBI_MQUERY_PARSER,
- AbstractIdentifierAccessor,
DataAccessFunctionDetail,
IdentifierAccessor,
- ReferencedTable,
+ Lineage,
+)
+from datahub.ingestion.source.powerbi.m_query.pattern_handler import (
+ AbstractLineage,
+ SupportedPattern,
)
from datahub.ingestion.source.powerbi.rest_api_wrapper.data_classes import Table
-from datahub.sql_parsing.sqlglot_lineage import ColumnLineageInfo, SqlParsingResult
logger = logging.getLogger(__name__)
-@dataclass
-class DataPlatformTable:
- data_platform_pair: DataPlatformPair
- urn: str
-
-
-@dataclass
-class Lineage:
- upstreams: List[DataPlatformTable]
- column_lineage: List[ColumnLineageInfo]
-
- @staticmethod
- def empty() -> "Lineage":
- return Lineage(upstreams=[], column_lineage=[])
-
-
-def urn_to_lowercase(value: str, flag: bool) -> str:
- if flag is True:
- return value.lower()
-
- return value
-
-
-def urn_creator(
- config: PowerBiDashboardSourceConfig,
- platform_instance_resolver: AbstractDataPlatformInstanceResolver,
- data_platform_pair: DataPlatformPair,
- server: str,
- qualified_table_name: str,
-) -> str:
- platform_detail: PlatformDetail = platform_instance_resolver.get_platform_instance(
- PowerBIPlatformDetail(
- data_platform_pair=data_platform_pair,
- data_platform_server=server,
- )
- )
-
- return builder.make_dataset_urn_with_platform_instance(
- platform=data_platform_pair.datahub_data_platform_name,
- platform_instance=platform_detail.platform_instance,
- env=platform_detail.env,
- name=urn_to_lowercase(
- qualified_table_name, config.convert_lineage_urns_to_lowercase
- ),
- )
-
-
-def get_next_item(items: List[str], item: str) -> Optional[str]:
- if item in items:
- try:
- index = items.index(item)
- return items[index + 1]
- except IndexError:
- logger.debug(f'item:"{item}", not found in item-list: {items}')
- return None
-
-
-class AbstractDataPlatformTableCreator(ABC):
- """
- Base class to share common functionalities among different dataplatform for M-Query parsing.
-
- To create qualified table name we need to parse M-Query data-access-functions(https://learn.microsoft.com/en-us/powerquery-m/accessing-data-functions) and
- the data-access-functions has some define pattern to access database-name, schema-name and table-name, for example see below M-Query.
-
- let
- Source = Sql.Database("localhost", "library"),
- dbo_book_issue = Source{[Schema="dbo",Item="book_issue"]}[Data]
- in
- dbo_book_issue
-
- It is MSSQL M-Query and Sql.Database is the data-access-function to access MSSQL. If this function is available in M-Query then database name is available in second argument
- of first statement and schema-name and table-name is available in second statement. second statement can be repeated to access different tables from MSSQL.
-
- DefaultTwoStepDataAccessSources extends the AbstractDataPlatformTableCreator and provides the common functionalities for data-platform which has above type of M-Query pattern
-
- data-access-function varies as per data-platform for example for MySQL.Database for MySQL, PostgreSQL.Database for Postgres and Oracle.Database for Oracle and number of statement to
- find out database-name , schema-name and table-name also varies as per dataplatform.
-
- Value.NativeQuery is one of the function which is used to execute native query inside M-Query, for example see below M-Query
-
- let
- Source = Value.NativeQuery(AmazonRedshift.Database("redshift-url","dev"), "select * from dev.public.category", null, [EnableFolding=true])
- in
- Source
-
- In this M-Query database-name is available in first argument and rest of the detail i.e database & schema is available in native query.
-
- NativeQueryDataPlatformTableCreator extends AbstractDataPlatformTableCreator to support Redshift and Snowflake native query parsing.
-
- """
-
- ctx: PipelineContext
- table: Table
- config: PowerBiDashboardSourceConfig
- reporter: PowerBiDashboardSourceReport
- platform_instance_resolver: AbstractDataPlatformInstanceResolver
-
- def __init__(
- self,
- ctx: PipelineContext,
- table: Table,
- config: PowerBiDashboardSourceConfig,
- reporter: PowerBiDashboardSourceReport,
- platform_instance_resolver: AbstractDataPlatformInstanceResolver,
- ) -> None:
- super().__init__()
- self.ctx = ctx
- self.table = table
- self.config = config
- self.reporter = reporter
- self.platform_instance_resolver = platform_instance_resolver
-
- @abstractmethod
- def create_lineage(
- self, data_access_func_detail: DataAccessFunctionDetail
- ) -> Lineage:
- pass
-
- @abstractmethod
- def get_platform_pair(self) -> DataPlatformPair:
- pass
-
- @staticmethod
- def get_db_detail_from_argument(
- arg_list: Tree,
- ) -> Tuple[Optional[str], Optional[str]]:
- arguments: List[str] = tree_function.strip_char_from_list(
- values=tree_function.remove_whitespaces_from_list(
- tree_function.token_values(arg_list)
- ),
- )
-
- if len(arguments) < 2:
- logger.debug(f"Expected minimum 2 arguments, but got {len(arguments)}")
- return None, None
-
- return arguments[0], arguments[1]
-
- @staticmethod
- def create_reference_table(
- arg_list: Tree,
- table_detail: Dict[str, str],
- ) -> Optional[ReferencedTable]:
- arguments: List[str] = tree_function.strip_char_from_list(
- values=tree_function.remove_whitespaces_from_list(
- tree_function.token_values(arg_list)
- ),
- )
-
- logger.debug(f"Processing arguments {arguments}")
-
- if (
- len(arguments)
- >= 4 # [0] is warehouse FQDN.
- # [1] is endpoint, we are not using it.
- # [2] is "Catalog" key
- # [3] is catalog's value
- ):
- return ReferencedTable(
- warehouse=arguments[0],
- catalog=arguments[3],
- # As per my observation, database and catalog names are same in M-Query
- database=table_detail["Database"]
- if table_detail.get("Database")
- else arguments[3],
- schema=table_detail["Schema"],
- table=table_detail.get("Table") or table_detail["View"],
- )
- elif len(arguments) == 2:
- return ReferencedTable(
- warehouse=arguments[0],
- database=table_detail["Database"],
- schema=table_detail["Schema"],
- table=table_detail.get("Table") or table_detail["View"],
- catalog=None,
- )
-
- return None
-
- def parse_custom_sql(
- self, query: str, server: str, database: Optional[str], schema: Optional[str]
- ) -> Lineage:
- dataplatform_tables: List[DataPlatformTable] = []
-
- platform_detail: PlatformDetail = (
- self.platform_instance_resolver.get_platform_instance(
- PowerBIPlatformDetail(
- data_platform_pair=self.get_platform_pair(),
- data_platform_server=server,
- )
- )
- )
-
- query = native_sql_parser.remove_drop_statement(
- native_sql_parser.remove_special_characters(query)
- )
-
- parsed_result: Optional[
- "SqlParsingResult"
- ] = native_sql_parser.parse_custom_sql(
- ctx=self.ctx,
- query=query,
- platform=self.get_platform_pair().datahub_data_platform_name,
- platform_instance=platform_detail.platform_instance,
- env=platform_detail.env,
- database=database,
- schema=schema,
- )
-
- if parsed_result is None:
- self.reporter.info(
- title=Constant.SQL_PARSING_FAILURE,
- message="Fail to parse native sql present in PowerBI M-Query",
- context=f"table-name={self.table.full_name}, sql={query}",
- )
- return Lineage.empty()
-
- if parsed_result.debug_info and parsed_result.debug_info.table_error:
- self.reporter.warning(
- title=Constant.SQL_PARSING_FAILURE,
- message="Fail to parse native sql present in PowerBI M-Query",
- context=f"table-name={self.table.full_name}, error={parsed_result.debug_info.table_error},sql={query}",
- )
- return Lineage.empty()
-
- for urn in parsed_result.in_tables:
- dataplatform_tables.append(
- DataPlatformTable(
- data_platform_pair=self.get_platform_pair(),
- urn=urn,
- )
- )
-
- logger.debug(f"Native Query parsed result={parsed_result}")
- logger.debug(f"Generated dataplatform_tables={dataplatform_tables}")
-
- return Lineage(
- upstreams=dataplatform_tables,
- column_lineage=(
- parsed_result.column_lineage
- if parsed_result.column_lineage is not None
- else []
- ),
- )
-
-
class AbstractDataAccessMQueryResolver(ABC):
table: Table
parse_tree: Tree
@@ -299,10 +46,10 @@ def __init__(
self.parse_tree = parse_tree
self.reporter = reporter
self.parameters = parameters
- self.data_access_functions = SupportedResolver.get_function_names()
+ self.data_access_functions = SupportedPattern.get_function_names()
@abstractmethod
- def resolve_to_data_platform_table_list(
+ def resolve_to_lineage(
self,
ctx: PipelineContext,
config: PowerBiDashboardSourceConfig,
@@ -318,7 +65,7 @@ class MQueryResolver(AbstractDataAccessMQueryResolver, ABC):
This class has generic code to process M-Query tokens and create instance of DataAccessFunctionDetail.
Once DataAccessFunctionDetail instance is initialized thereafter MQueryResolver generates the DataPlatformTable with the help of AbstractDataPlatformTableCreator
- (see method resolve_to_data_platform_table_list).
+ (see method resolve_to_lineage).
Classes which extended from AbstractDataPlatformTableCreator know how to convert generated DataAccessFunctionDetail instance
to the respective DataPlatformTable instance as per dataplatform.
@@ -602,7 +349,7 @@ def internal(
return table_links
- def resolve_to_data_platform_table_list(
+ def resolve_to_lineage(
self,
ctx: PipelineContext,
config: PowerBiDashboardSourceConfig,
@@ -630,7 +377,7 @@ def resolve_to_data_platform_table_list(
# Each item is data-access function
for f_detail in table_links:
# Get & Check if we support data-access-function available in M-Query
- supported_resolver = SupportedResolver.get_resolver(
+ supported_resolver = SupportedPattern.get_pattern_handler(
f_detail.data_access_function_name
)
if supported_resolver is None:
@@ -643,11 +390,9 @@ def resolve_to_data_platform_table_list(
)
continue
- # From supported_resolver enum get respective resolver like AmazonRedshift or Snowflake or Oracle or NativeQuery and create instance of it
- # & also pass additional information that will be need to generate urn
- table_qualified_name_creator: (
- AbstractDataPlatformTableCreator
- ) = supported_resolver.get_table_full_name_creator()(
+ # From the supported_resolver enum, get the respective handler (e.g. AmazonRedshift, Snowflake, Oracle or NativeQuery) and create an instance of it,
+ # also passing the additional information that will be needed to generate lineage
+ pattern_handler: (AbstractLineage) = supported_resolver.handler()(
ctx=ctx,
table=self.table,
config=config,
@@ -655,673 +400,6 @@ def resolve_to_data_platform_table_list(
platform_instance_resolver=platform_instance_resolver,
)
- lineage.append(table_qualified_name_creator.create_lineage(f_detail))
+ lineage.append(pattern_handler.create_lineage(f_detail))
return lineage
-
-
-class DefaultTwoStepDataAccessSources(AbstractDataPlatformTableCreator, ABC):
- """
- These are the DataSource for which PowerBI Desktop generates default M-Query of following pattern
- let
- Source = Sql.Database("localhost", "library"),
- dbo_book_issue = Source{[Schema="dbo",Item="book_issue"]}[Data]
- in
- dbo_book_issue
- """
-
- def two_level_access_pattern(
- self, data_access_func_detail: DataAccessFunctionDetail
- ) -> Lineage:
- logger.debug(
- f"Processing {self.get_platform_pair().powerbi_data_platform_name} data-access function detail {data_access_func_detail}"
- )
-
- server, db_name = self.get_db_detail_from_argument(
- data_access_func_detail.arg_list
- )
- if server is None or db_name is None:
- return Lineage.empty() # Return an empty list
-
- schema_name: str = cast(
- IdentifierAccessor, data_access_func_detail.identifier_accessor
- ).items["Schema"]
-
- table_name: str = cast(
- IdentifierAccessor, data_access_func_detail.identifier_accessor
- ).items["Item"]
-
- qualified_table_name: str = f"{db_name}.{schema_name}.{table_name}"
-
- logger.debug(
- f"Platform({self.get_platform_pair().datahub_data_platform_name}) qualified_table_name= {qualified_table_name}"
- )
-
- urn = urn_creator(
- config=self.config,
- platform_instance_resolver=self.platform_instance_resolver,
- data_platform_pair=self.get_platform_pair(),
- server=server,
- qualified_table_name=qualified_table_name,
- )
- return Lineage(
- upstreams=[
- DataPlatformTable(
- data_platform_pair=self.get_platform_pair(),
- urn=urn,
- )
- ],
- column_lineage=[],
- )
-
-
-class PostgresDataPlatformTableCreator(DefaultTwoStepDataAccessSources):
- def create_lineage(
- self, data_access_func_detail: DataAccessFunctionDetail
- ) -> Lineage:
- return self.two_level_access_pattern(data_access_func_detail)
-
- def get_platform_pair(self) -> DataPlatformPair:
- return SupportedDataPlatform.POSTGRES_SQL.value
-
-
-class MSSqlDataPlatformTableCreator(DefaultTwoStepDataAccessSources):
- # https://learn.microsoft.com/en-us/sql/relational-databases/security/authentication-access/ownership-and-user-schema-separation?view=sql-server-ver16
- DEFAULT_SCHEMA = "dbo" # Default schema name in MS-SQL is dbo
-
- def get_platform_pair(self) -> DataPlatformPair:
- return SupportedDataPlatform.MS_SQL.value
-
- def create_urn_using_old_parser(
- self, query: str, db_name: str, server: str
- ) -> List[DataPlatformTable]:
- dataplatform_tables: List[DataPlatformTable] = []
-
- tables: List[str] = native_sql_parser.get_tables(query)
-
- for parsed_table in tables:
- # components: List[str] = [v.strip("[]") for v in parsed_table.split(".")]
- components = [v.strip("[]") for v in parsed_table.split(".")]
- if len(components) == 3:
- database, schema, table = components
- elif len(components) == 2:
- schema, table = components
- database = db_name
- elif len(components) == 1:
- (table,) = components
- database = db_name
- schema = MSSqlDataPlatformTableCreator.DEFAULT_SCHEMA
- else:
- self.reporter.warning(
- title="Invalid table format",
- message="The advanced SQL lineage feature (enable_advance_lineage_sql_construct) is disabled. Please either enable this feature or ensure the table is referenced as .. in the SQL.",
- context=f"table-name={self.table.full_name}",
- )
- continue
-
- qualified_table_name = f"{database}.{schema}.{table}"
- urn = urn_creator(
- config=self.config,
- platform_instance_resolver=self.platform_instance_resolver,
- data_platform_pair=self.get_platform_pair(),
- server=server,
- qualified_table_name=qualified_table_name,
- )
- dataplatform_tables.append(
- DataPlatformTable(
- data_platform_pair=self.get_platform_pair(),
- urn=urn,
- )
- )
-
- logger.debug(f"Generated upstream tables = {dataplatform_tables}")
-
- return dataplatform_tables
-
- def create_lineage(
- self, data_access_func_detail: DataAccessFunctionDetail
- ) -> Lineage:
- arguments: List[str] = tree_function.strip_char_from_list(
- values=tree_function.remove_whitespaces_from_list(
- tree_function.token_values(data_access_func_detail.arg_list)
- ),
- )
-
- server, database = self.get_db_detail_from_argument(
- data_access_func_detail.arg_list
- )
- if server is None or database is None:
- return Lineage.empty() # Return an empty list
-
- assert server
- assert database # to silent the lint
-
- query: Optional[str] = get_next_item(arguments, "Query")
- if query:
- if self.config.enable_advance_lineage_sql_construct is False:
- # Use previous parser to generate URN to keep backward compatibility
- return Lineage(
- upstreams=self.create_urn_using_old_parser(
- query=query,
- db_name=database,
- server=server,
- ),
- column_lineage=[],
- )
-
- return self.parse_custom_sql(
- query=query,
- database=database,
- server=server,
- schema=MSSqlDataPlatformTableCreator.DEFAULT_SCHEMA,
- )
-
- # It is a regular case of MS-SQL
- logger.debug("Handling with regular case")
- return self.two_level_access_pattern(data_access_func_detail)
-
-
-class OracleDataPlatformTableCreator(AbstractDataPlatformTableCreator):
- def get_platform_pair(self) -> DataPlatformPair:
- return SupportedDataPlatform.ORACLE.value
-
- @staticmethod
- def _get_server_and_db_name(value: str) -> Tuple[Optional[str], Optional[str]]:
- error_message: str = (
- f"The target argument ({value}) should in the format of :/["
- ".]"
- )
- splitter_result: List[str] = value.split("/")
- if len(splitter_result) != 2:
- logger.debug(error_message)
- return None, None
-
- db_name = splitter_result[1].split(".")[0]
-
- return tree_function.strip_char_from_list([splitter_result[0]])[0], db_name
-
- def create_lineage(
- self, data_access_func_detail: DataAccessFunctionDetail
- ) -> Lineage:
- logger.debug(
- f"Processing Oracle data-access function detail {data_access_func_detail}"
- )
-
- arguments: List[str] = tree_function.remove_whitespaces_from_list(
- tree_function.token_values(data_access_func_detail.arg_list)
- )
-
- server, db_name = self._get_server_and_db_name(arguments[0])
-
- if db_name is None or server is None:
- return Lineage.empty()
-
- schema_name: str = cast(
- IdentifierAccessor, data_access_func_detail.identifier_accessor
- ).items["Schema"]
-
- table_name: str = cast(
- IdentifierAccessor,
- cast(IdentifierAccessor, data_access_func_detail.identifier_accessor).next,
- ).items["Name"]
-
- qualified_table_name: str = f"{db_name}.{schema_name}.{table_name}"
-
- urn = urn_creator(
- config=self.config,
- platform_instance_resolver=self.platform_instance_resolver,
- data_platform_pair=self.get_platform_pair(),
- server=server,
- qualified_table_name=qualified_table_name,
- )
-
- return Lineage(
- upstreams=[
- DataPlatformTable(
- data_platform_pair=self.get_platform_pair(),
- urn=urn,
- )
- ],
- column_lineage=[],
- )
-
-
-class DatabrickDataPlatformTableCreator(AbstractDataPlatformTableCreator):
- def form_qualified_table_name(
- self,
- table_reference: ReferencedTable,
- data_platform_pair: DataPlatformPair,
- ) -> str:
- platform_detail: PlatformDetail = (
- self.platform_instance_resolver.get_platform_instance(
- PowerBIPlatformDetail(
- data_platform_pair=data_platform_pair,
- data_platform_server=table_reference.warehouse,
- )
- )
- )
-
- metastore: Optional[str] = None
-
- qualified_table_name: str = f"{table_reference.database}.{table_reference.schema}.{table_reference.table}"
-
- if isinstance(platform_detail, DataBricksPlatformDetail):
- metastore = platform_detail.metastore
-
- if metastore is not None:
- return f"{metastore}.{qualified_table_name}"
-
- return qualified_table_name
-
- def create_lineage(
- self, data_access_func_detail: DataAccessFunctionDetail
- ) -> Lineage:
- logger.debug(
- f"Processing Databrick data-access function detail {data_access_func_detail}"
- )
- table_detail: Dict[str, str] = {}
- temp_accessor: Optional[
- Union[IdentifierAccessor, AbstractIdentifierAccessor]
- ] = data_access_func_detail.identifier_accessor
-
- while temp_accessor:
- if isinstance(temp_accessor, IdentifierAccessor):
- # Condition to handle databricks M-query pattern where table, schema and database all are present in
- # the same invoke statement
- if all(
- element in temp_accessor.items
- for element in ["Item", "Schema", "Catalog"]
- ):
- table_detail["Schema"] = temp_accessor.items["Schema"]
- table_detail["Table"] = temp_accessor.items["Item"]
- else:
- table_detail[temp_accessor.items["Kind"]] = temp_accessor.items[
- "Name"
- ]
-
- if temp_accessor.next is not None:
- temp_accessor = temp_accessor.next
- else:
- break
- else:
- logger.debug(
- "expecting instance to be IdentifierAccessor, please check if parsing is done properly"
- )
- return Lineage.empty()
-
- table_reference = self.create_reference_table(
- arg_list=data_access_func_detail.arg_list,
- table_detail=table_detail,
- )
-
- if table_reference:
- qualified_table_name: str = self.form_qualified_table_name(
- table_reference=table_reference,
- data_platform_pair=self.get_platform_pair(),
- )
-
- urn = urn_creator(
- config=self.config,
- platform_instance_resolver=self.platform_instance_resolver,
- data_platform_pair=self.get_platform_pair(),
- server=table_reference.warehouse,
- qualified_table_name=qualified_table_name,
- )
-
- return Lineage(
- upstreams=[
- DataPlatformTable(
- data_platform_pair=self.get_platform_pair(),
- urn=urn,
- )
- ],
- column_lineage=[],
- )
-
- return Lineage.empty()
-
- def get_platform_pair(self) -> DataPlatformPair:
- return SupportedDataPlatform.DATABRICK_SQL.value
-
-
-class DefaultThreeStepDataAccessSources(AbstractDataPlatformTableCreator, ABC):
- def get_datasource_server(
- self, arguments: List[str], data_access_func_detail: DataAccessFunctionDetail
- ) -> str:
- return tree_function.strip_char_from_list([arguments[0]])[0]
-
- def create_lineage(
- self, data_access_func_detail: DataAccessFunctionDetail
- ) -> Lineage:
- logger.debug(
- f"Processing {self.get_platform_pair().datahub_data_platform_name} function detail {data_access_func_detail}"
- )
-
- arguments: List[str] = tree_function.remove_whitespaces_from_list(
- tree_function.token_values(data_access_func_detail.arg_list)
- )
- # First is database name
- db_name: str = data_access_func_detail.identifier_accessor.items["Name"] # type: ignore
- # Second is schema name
- schema_name: str = cast(
- IdentifierAccessor, data_access_func_detail.identifier_accessor.next # type: ignore
- ).items["Name"]
- # Third is table name
- table_name: str = cast(
- IdentifierAccessor, data_access_func_detail.identifier_accessor.next.next # type: ignore
- ).items["Name"]
-
- qualified_table_name: str = f"{db_name}.{schema_name}.{table_name}"
-
- logger.debug(
- f"{self.get_platform_pair().datahub_data_platform_name} qualified_table_name {qualified_table_name}"
- )
-
- server: str = self.get_datasource_server(arguments, data_access_func_detail)
-
- urn = urn_creator(
- config=self.config,
- platform_instance_resolver=self.platform_instance_resolver,
- data_platform_pair=self.get_platform_pair(),
- server=server,
- qualified_table_name=qualified_table_name,
- )
-
- return Lineage(
- upstreams=[
- DataPlatformTable(
- data_platform_pair=self.get_platform_pair(),
- urn=urn,
- )
- ],
- column_lineage=[],
- )
-
-
-class SnowflakeDataPlatformTableCreator(DefaultThreeStepDataAccessSources):
- def get_platform_pair(self) -> DataPlatformPair:
- return SupportedDataPlatform.SNOWFLAKE.value
-
-
-class GoogleBigQueryDataPlatformTableCreator(DefaultThreeStepDataAccessSources):
- def get_platform_pair(self) -> DataPlatformPair:
- return SupportedDataPlatform.GOOGLE_BIGQUERY.value
-
- def get_datasource_server(
- self, arguments: List[str], data_access_func_detail: DataAccessFunctionDetail
- ) -> str:
- # In Google BigQuery server is project-name
- # condition to silent lint, it is not going to be None
- return (
- data_access_func_detail.identifier_accessor.items["Name"]
- if data_access_func_detail.identifier_accessor is not None
- else ""
- )
-
-
-class AmazonRedshiftDataPlatformTableCreator(AbstractDataPlatformTableCreator):
- def get_platform_pair(self) -> DataPlatformPair:
- return SupportedDataPlatform.AMAZON_REDSHIFT.value
-
- def create_lineage(
- self, data_access_func_detail: DataAccessFunctionDetail
- ) -> Lineage:
- logger.debug(
- f"Processing AmazonRedshift data-access function detail {data_access_func_detail}"
- )
-
- server, db_name = self.get_db_detail_from_argument(
- data_access_func_detail.arg_list
- )
- if db_name is None or server is None:
- return Lineage.empty() # Return empty list
-
- schema_name: str = cast(
- IdentifierAccessor, data_access_func_detail.identifier_accessor
- ).items["Name"]
-
- table_name: str = cast(
- IdentifierAccessor,
- cast(IdentifierAccessor, data_access_func_detail.identifier_accessor).next,
- ).items["Name"]
-
- qualified_table_name: str = f"{db_name}.{schema_name}.{table_name}"
-
- urn = urn_creator(
- config=self.config,
- platform_instance_resolver=self.platform_instance_resolver,
- data_platform_pair=self.get_platform_pair(),
- server=server,
- qualified_table_name=qualified_table_name,
- )
-
- return Lineage(
- upstreams=[
- DataPlatformTable(
- data_platform_pair=self.get_platform_pair(),
- urn=urn,
- )
- ],
- column_lineage=[],
- )
-
-
-class NativeQueryDataPlatformTableCreator(AbstractDataPlatformTableCreator):
- SUPPORTED_NATIVE_QUERY_DATA_PLATFORM: dict = {
- SupportedDataPlatform.SNOWFLAKE.value.powerbi_data_platform_name: SupportedDataPlatform.SNOWFLAKE,
- SupportedDataPlatform.AMAZON_REDSHIFT.value.powerbi_data_platform_name: SupportedDataPlatform.AMAZON_REDSHIFT,
- SupportedDataPlatform.DatabricksMultiCloud_SQL.value.powerbi_data_platform_name: SupportedDataPlatform.DatabricksMultiCloud_SQL,
- }
- current_data_platform: SupportedDataPlatform = SupportedDataPlatform.SNOWFLAKE
-
- def get_platform_pair(self) -> DataPlatformPair:
- return self.current_data_platform.value
-
- @staticmethod
- def is_native_parsing_supported(data_access_function_name: str) -> bool:
- return (
- data_access_function_name
- in NativeQueryDataPlatformTableCreator.SUPPORTED_NATIVE_QUERY_DATA_PLATFORM
- )
-
- def create_urn_using_old_parser(self, query: str, server: str) -> Lineage:
- dataplatform_tables: List[DataPlatformTable] = []
-
- tables: List[str] = native_sql_parser.get_tables(query)
-
- for qualified_table_name in tables:
- if len(qualified_table_name.split(".")) != 3:
- logger.debug(
- f"Skipping table {qualified_table_name} as it is not as per qualified_table_name format"
- )
- continue
-
- urn = urn_creator(
- config=self.config,
- platform_instance_resolver=self.platform_instance_resolver,
- data_platform_pair=self.get_platform_pair(),
- server=server,
- qualified_table_name=qualified_table_name,
- )
-
- dataplatform_tables.append(
- DataPlatformTable(
- data_platform_pair=self.get_platform_pair(),
- urn=urn,
- )
- )
-
- logger.debug(f"Generated dataplatform_tables {dataplatform_tables}")
-
- return Lineage(
- upstreams=dataplatform_tables,
- column_lineage=[],
- )
-
- def get_db_name(self, data_access_tokens: List[str]) -> Optional[str]:
- if (
- data_access_tokens[0]
- != SupportedDataPlatform.DatabricksMultiCloud_SQL.value.powerbi_data_platform_name
- ):
- return None
-
- database: Optional[str] = get_next_item(data_access_tokens, "Database")
-
- if (
- database and database != Constant.M_QUERY_NULL
- ): # database name is explicitly set
- return database
-
- return get_next_item( # database name is set in Name argument
- data_access_tokens, "Name"
- ) or get_next_item( # If both above arguments are not available, then try Catalog
- data_access_tokens, "Catalog"
- )
-
- def create_lineage(
- self, data_access_func_detail: DataAccessFunctionDetail
- ) -> Lineage:
- t1: Tree = cast(
- Tree, tree_function.first_arg_list_func(data_access_func_detail.arg_list)
- )
- flat_argument_list: List[Tree] = tree_function.flat_argument_list(t1)
-
- if len(flat_argument_list) != 2:
- logger.debug(
- f"Expecting 2 argument, actual argument count is {len(flat_argument_list)}"
- )
- logger.debug(f"Flat argument list = {flat_argument_list}")
- return Lineage.empty()
-
- data_access_tokens: List[str] = tree_function.remove_whitespaces_from_list(
- tree_function.token_values(flat_argument_list[0])
- )
-
- if not self.is_native_parsing_supported(data_access_tokens[0]):
- logger.debug(
- f"Unsupported native-query data-platform = {data_access_tokens[0]}"
- )
- logger.debug(
- f"NativeQuery is supported only for {self.SUPPORTED_NATIVE_QUERY_DATA_PLATFORM}"
- )
-
- return Lineage.empty()
-
- if len(data_access_tokens[0]) < 3:
- logger.debug(
- f"Server is not available in argument list for data-platform {data_access_tokens[0]}. Returning empty "
- "list"
- )
- return Lineage.empty()
-
- self.current_data_platform = self.SUPPORTED_NATIVE_QUERY_DATA_PLATFORM[
- data_access_tokens[0]
- ]
- # The First argument is the query
- sql_query: str = tree_function.strip_char_from_list(
- values=tree_function.remove_whitespaces_from_list(
- tree_function.token_values(flat_argument_list[1])
- ),
- )[
- 0
- ] # Remove any whitespaces and double quotes character
-
- server = tree_function.strip_char_from_list([data_access_tokens[2]])[0]
-
- if self.config.enable_advance_lineage_sql_construct is False:
- # Use previous parser to generate URN to keep backward compatibility
- return self.create_urn_using_old_parser(
- query=sql_query,
- server=server,
- )
-
- database_name: Optional[str] = self.get_db_name(data_access_tokens)
-
- return self.parse_custom_sql(
- query=sql_query,
- server=server,
- database=database_name,
- schema=None,
- )
-
-
-class FunctionName(Enum):
- NATIVE_QUERY = "Value.NativeQuery"
- POSTGRESQL_DATA_ACCESS = "PostgreSQL.Database"
- ORACLE_DATA_ACCESS = "Oracle.Database"
- SNOWFLAKE_DATA_ACCESS = "Snowflake.Databases"
- MSSQL_DATA_ACCESS = "Sql.Database"
- DATABRICK_DATA_ACCESS = "Databricks.Catalogs"
- GOOGLE_BIGQUERY_DATA_ACCESS = "GoogleBigQuery.Database"
- AMAZON_REDSHIFT_DATA_ACCESS = "AmazonRedshift.Database"
- DATABRICK_MULTI_CLOUD_DATA_ACCESS = "DatabricksMultiCloud.Catalogs"
-
-
-class SupportedResolver(Enum):
- DATABRICKS_QUERY = (
- DatabrickDataPlatformTableCreator,
- FunctionName.DATABRICK_DATA_ACCESS,
- )
-
- DATABRICKS_MULTI_CLOUD = (
- DatabrickDataPlatformTableCreator,
- FunctionName.DATABRICK_MULTI_CLOUD_DATA_ACCESS,
- )
-
- POSTGRES_SQL = (
- PostgresDataPlatformTableCreator,
- FunctionName.POSTGRESQL_DATA_ACCESS,
- )
-
- ORACLE = (
- OracleDataPlatformTableCreator,
- FunctionName.ORACLE_DATA_ACCESS,
- )
-
- SNOWFLAKE = (
- SnowflakeDataPlatformTableCreator,
- FunctionName.SNOWFLAKE_DATA_ACCESS,
- )
-
- MS_SQL = (
- MSSqlDataPlatformTableCreator,
- FunctionName.MSSQL_DATA_ACCESS,
- )
-
- GOOGLE_BIG_QUERY = (
- GoogleBigQueryDataPlatformTableCreator,
- FunctionName.GOOGLE_BIGQUERY_DATA_ACCESS,
- )
-
- AMAZON_REDSHIFT = (
- AmazonRedshiftDataPlatformTableCreator,
- FunctionName.AMAZON_REDSHIFT_DATA_ACCESS,
- )
-
- NATIVE_QUERY = (
- NativeQueryDataPlatformTableCreator,
- FunctionName.NATIVE_QUERY,
- )
-
- def get_table_full_name_creator(self) -> Type[AbstractDataPlatformTableCreator]:
- return self.value[0]
-
- def get_function_name(self) -> str:
- return self.value[1].value
-
- @staticmethod
- def get_function_names() -> List[str]:
- functions: List[str] = []
- for supported_resolver in SupportedResolver:
- functions.append(supported_resolver.get_function_name())
-
- return functions
-
- @staticmethod
- def get_resolver(function_name: str) -> Optional["SupportedResolver"]:
- logger.debug(f"Looking for resolver {function_name}")
- for supported_resolver in SupportedResolver:
- if function_name == supported_resolver.get_function_name():
- return supported_resolver
- logger.debug(f"Resolver not found for function_name {function_name}")
- return None
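
For call sites, the resolver.py hunk is mostly a rename plus a relocation: DataPlatformTable, Lineage and FunctionName move to data_classes, the handler classes move to pattern_handler, and the entry point becomes resolve_to_lineage instead of resolve_to_data_platform_table_list. A hedged sketch of the renamed call, assuming the remaining parameter and the List[Lineage] return type carry over unchanged from the old method (only ctx and config are visible in the hunk above):

# resolver is an MQueryResolver that parser.get_upstream_tables() builds from a table's M-Query expression.
lineages = resolver.resolve_to_lineage(
    ctx=ctx,
    config=config,
    platform_instance_resolver=platform_instance_resolver,
)
for lineage in lineages:
    print(lineage.upstreams, lineage.column_lineage)
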
diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/validator.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/validator.py
index ca2abf97c9f30..b52977aaa41fb 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/validator.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/validator.py
@@ -1,7 +1,7 @@
import logging
from typing import Optional, Tuple
-from datahub.ingestion.source.powerbi.m_query import resolver
+import datahub.ingestion.source.powerbi.m_query.data_classes
logger = logging.getLogger(__name__)
@@ -14,12 +14,18 @@ def validate_parse_tree(
:param native_query_enabled: Whether user want to extract lineage from native query
:return: True or False.
"""
- function_names = [fun.value for fun in resolver.FunctionName]
+ function_names = [
+ fun.value
+ for fun in datahub.ingestion.source.powerbi.m_query.data_classes.FunctionName
+ ]
if not any(fun in expression for fun in function_names):
return False, "DataAccess function is not present in M-Query expression."
if native_query_enabled is False:
- if resolver.FunctionName.NATIVE_QUERY.value in function_names:
+ if (
+ datahub.ingestion.source.powerbi.m_query.data_classes.FunctionName.NATIVE_QUERY.value
+ in function_names
+ ):
return (
False,
"Lineage extraction from native query is disabled. Enable native_query_parsing in recipe",
diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py
index cef2d098aebc4..044946a5d308d 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py
@@ -10,6 +10,7 @@
import more_itertools
import datahub.emitter.mce_builder as builder
+import datahub.ingestion.source.powerbi.m_query.data_classes
import datahub.ingestion.source.powerbi.rest_api_wrapper.data_classes as powerbi_data_classes
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.emitter.mcp_builder import ContainerKey, gen_containers
@@ -42,12 +43,13 @@
Constant,
PowerBiDashboardSourceConfig,
PowerBiDashboardSourceReport,
+ SupportedDataPlatform,
)
from datahub.ingestion.source.powerbi.dataplatform_instance_resolver import (
AbstractDataPlatformInstanceResolver,
create_dataplatform_instance_resolver,
)
-from datahub.ingestion.source.powerbi.m_query import parser, resolver
+from datahub.ingestion.source.powerbi.m_query import parser
from datahub.ingestion.source.powerbi.rest_api_wrapper.powerbi_api import PowerBiAPI
from datahub.ingestion.source.state.stale_entity_removal_handler import (
StaleEntityRemovalHandler,
@@ -182,7 +184,9 @@ def extract_dataset_schema(
return [schema_mcp]
def make_fine_grained_lineage_class(
- self, lineage: resolver.Lineage, dataset_urn: str
+ self,
+ lineage: datahub.ingestion.source.powerbi.m_query.data_classes.Lineage,
+ dataset_urn: str,
) -> List[FineGrainedLineage]:
fine_grained_lineages: List[FineGrainedLineage] = []
@@ -234,7 +238,9 @@ def extract_lineage(
upstream: List[UpstreamClass] = []
cll_lineage: List[FineGrainedLineage] = []
- upstream_lineage: List[resolver.Lineage] = parser.get_upstream_tables(
+ upstream_lineage: List[
+ datahub.ingestion.source.powerbi.m_query.data_classes.Lineage
+ ] = parser.get_upstream_tables(
table=table,
reporter=self.__reporter,
platform_instance_resolver=self.__dataplatform_instance_resolver,
@@ -1294,7 +1300,7 @@ def get_allowed_workspaces(self) -> List[powerbi_data_classes.Workspace]:
def validate_dataset_type_mapping(self):
powerbi_data_platforms: List[str] = [
data_platform.value.powerbi_data_platform_name
- for data_platform in resolver.SupportedDataPlatform
+ for data_platform in SupportedDataPlatform
]
for key in self.source_config.dataset_type_mapping.keys():
@@ -1481,7 +1487,7 @@ def _get_dashboard_patch_work_unit(
def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
# As modified_workspaces is not idempotent, hence workunit processors are run later for each workspace_id
- # This will result in creating checkpoint for each workspace_id
+ # This will result in creating a checkpoint for each workspace_id
if self.source_config.modified_since:
return [] # Handle these in get_workunits_internal
else:
@@ -1492,7 +1498,7 @@ def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
"""
- Datahub Ingestion framework invoke this method
+ Datahub Ingestion framework invokes this method
"""
logger.info("PowerBi plugin execution is started")
# Validate dataset type mapping
diff --git a/metadata-ingestion/src/datahub/ingestion/source/qlik_sense/data_classes.py b/metadata-ingestion/src/datahub/ingestion/source/qlik_sense/data_classes.py
index 672fcbceb0603..a43f5f32493f2 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/qlik_sense/data_classes.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/qlik_sense/data_classes.py
@@ -15,6 +15,7 @@
TimeType,
)
+# TODO: Replace with standardized types in sql_types.py
FIELD_TYPE_MAPPING: Dict[
str,
Type[
diff --git a/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift.py
index 4bc4c1451c262..06cbb7fbae27c 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift.py
@@ -222,6 +222,7 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
```
"""
+ # TODO: Replace with standardized types in sql_types.py
REDSHIFT_FIELD_TYPE_MAPPINGS: Dict[
str,
Type[
diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py
index d4442749a0622..2bd8e8017f549 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py
@@ -103,6 +103,7 @@
logger = logging.getLogger(__name__)
# https://docs.snowflake.com/en/sql-reference/intro-summary-data-types.html
+# TODO: Move to the standardized types in sql_types.py
SNOWFLAKE_FIELD_TYPE_MAPPINGS = {
"DATE": DateType,
"BIGINT": NumberType,
diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/athena.py b/metadata-ingestion/src/datahub/ingestion/source/sql/athena.py
index 71cfd0268ee6b..6f7decc79b1df 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/sql/athena.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/sql/athena.py
@@ -26,6 +26,7 @@
platform_name,
support_status,
)
+from datahub.ingestion.api.source import StructuredLogLevel
from datahub.ingestion.api.workunit import MetadataWorkUnit
from datahub.ingestion.source.aws.s3_util import make_s3_urn
from datahub.ingestion.source.common.subtypes import DatasetContainerSubTypes
@@ -35,6 +36,7 @@
register_custom_type,
)
from datahub.ingestion.source.sql.sql_config import SQLCommonConfig, make_sqlalchemy_uri
+from datahub.ingestion.source.sql.sql_report import SQLSourceReport
from datahub.ingestion.source.sql.sql_utils import (
add_table_to_schema_container,
gen_database_container,
@@ -48,6 +50,15 @@
get_schema_fields_for_sqlalchemy_column,
)
+try:
+ from typing_extensions import override
+except ImportError:
+ _F = typing.TypeVar("_F", bound=typing.Callable[..., typing.Any])
+
+ def override(f: _F, /) -> _F: # noqa: F811
+ return f
+
+
logger = logging.getLogger(__name__)
assert STRUCT, "required type modules are not available"
@@ -322,12 +333,15 @@ class AthenaSource(SQLAlchemySource):
- Profiling when enabled.
"""
- table_partition_cache: Dict[str, Dict[str, Partitionitem]] = {}
+ config: AthenaConfig
+ report: SQLSourceReport
def __init__(self, config, ctx):
super().__init__(config, ctx, "athena")
self.cursor: Optional[BaseCursor] = None
+ self.table_partition_cache: Dict[str, Dict[str, Partitionitem]] = {}
+
@classmethod
def create(cls, config_dict, ctx):
config = AthenaConfig.parse_obj(config_dict)
@@ -452,6 +466,7 @@ def add_table_to_schema_container(
)
# It seems like database/schema filter in the connection string does not work and this to work around that
+ @override
def get_schema_names(self, inspector: Inspector) -> List[str]:
athena_config = typing.cast(AthenaConfig, self.config)
schemas = inspector.get_schema_names()
@@ -459,34 +474,42 @@ def get_schema_names(self, inspector: Inspector) -> List[str]:
return [schema for schema in schemas if schema == athena_config.database]
return schemas
- # Overwrite to get partitions
+ @classmethod
+ def _casted_partition_key(cls, key: str) -> str:
+ # We need to cast the partition keys to a VARCHAR, since otherwise
+ # Athena may throw an error during concatenation / comparison.
+ return f"CAST({key} as VARCHAR)"
+
+ @override
def get_partitions(
self, inspector: Inspector, schema: str, table: str
- ) -> List[str]:
- partitions = []
-
- athena_config = typing.cast(AthenaConfig, self.config)
-
- if not athena_config.extract_partitions:
- return []
+ ) -> Optional[List[str]]:
+ if not self.config.extract_partitions:
+ return None
if not self.cursor:
- return []
+ return None
metadata: AthenaTableMetadata = self.cursor.get_table_metadata(
table_name=table, schema_name=schema
)
- if metadata.partition_keys:
- for key in metadata.partition_keys:
- if key.name:
- partitions.append(key.name)
-
- if not partitions:
- return []
+ partitions = []
+ for key in metadata.partition_keys:
+ if key.name:
+ partitions.append(key.name)
+ if not partitions:
+ return []
- # We create an artiificaial concatenated partition key to be able to query max partition easier
- part_concat = "|| '-' ||".join(partitions)
+ with self.report.report_exc(
+ message="Failed to extract partition details",
+ context=f"{schema}.{table}",
+ level=StructuredLogLevel.WARN,
+ ):
+ # We create an artificial concatenated partition key to be able to query the max partition more easily
+ part_concat = " || '-' || ".join(
+ self._casted_partition_key(key) for key in partitions
+ )
max_partition_query = f'select {",".join(partitions)} from "{schema}"."{table}$partitions" where {part_concat} = (select max({part_concat}) from "{schema}"."{table}$partitions")'
ret = self.cursor.execute(max_partition_query)
max_partition: Dict[str, str] = {}
@@ -500,9 +523,8 @@ def get_partitions(
partitions=partitions,
max_partition=max_partition,
)
- return partitions
- return []
+ return partitions
# Overwrite to modify the creation of schema fields
def get_schema_fields_for_column(
@@ -551,7 +573,9 @@ def generate_partition_profiler_query(
if partition and partition.max_partition:
max_partition_filters = []
for key, value in partition.max_partition.items():
- max_partition_filters.append(f"CAST({key} as VARCHAR) = '{value}'")
+ max_partition_filters.append(
+ f"{self._casted_partition_key(key)} = '{value}'"
+ )
max_partition = str(partition.max_partition)
return (
max_partition,
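
To make the partition handling concrete: for a hypothetical table partitioned by year and month, the casted key and the max-partition lookup built above come out roughly as in this sketch (the schema and table names are made up; the string layout mirrors the expressions in the hunk rather than the exact query Athena receives):

partitions = ["year", "month"]

# Same construction as AthenaSource._casted_partition_key, joined with " || '-' || ".
part_concat = " || '-' || ".join(f"CAST({key} as VARCHAR)" for key in partitions)
# part_concat == "CAST(year as VARCHAR) || '-' || CAST(month as VARCHAR)"

max_partition_query = (
    f'select {",".join(partitions)} from "web"."events$partitions" '
    f'where {part_concat} = (select max({part_concat}) from "web"."events$partitions")'
)

# generate_partition_profiler_query reuses the same casted key when filtering on the
# recorded max partition, e.g. CAST(year as VARCHAR) = '2024'.
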
diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_types.py b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_types.py
index 8ea4209784063..89ca160ba1f48 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_types.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_types.py
@@ -1,5 +1,5 @@
import re
-from typing import Any, Dict, ValuesView
+from typing import Any, Dict, Optional, Type, Union, ValuesView
from datahub.metadata.com.linkedin.pegasus2avro.schema import (
ArrayType,
@@ -16,14 +16,28 @@
UnionType,
)
-# these can be obtained by running `select format_type(oid, null),* from pg_type;`
-# we've omitted the types without a meaningful DataHub type (e.g. postgres-specific types, index vectors, etc.)
-# (run `\copy (select format_type(oid, null),* from pg_type) to 'pg_type.csv' csv header;` to get a CSV)
+DATAHUB_FIELD_TYPE = Union[
+ ArrayType,
+ BooleanType,
+ BytesType,
+ DateType,
+ EnumType,
+ MapType,
+ NullType,
+ NumberType,
+ RecordType,
+ StringType,
+ TimeType,
+ UnionType,
+]
-# we map from format_type since this is what dbt uses
-# see https://github.com/fishtown-analytics/dbt/blob/master/plugins/postgres/dbt/include/postgres/macros/catalog.sql#L22
-# see https://www.npgsql.org/dev/types.html for helpful type annotations
+# These can be obtained by running `select format_type(oid, null),* from pg_type;`
+# We've omitted the types without a meaningful DataHub type (e.g. postgres-specific types, index vectors, etc.)
+# (run `\copy (select format_type(oid, null),* from pg_type) to 'pg_type.csv' csv header;` to get a CSV)
+# We map from format_type since this is what dbt uses.
+# See https://github.com/fishtown-analytics/dbt/blob/master/plugins/postgres/dbt/include/postgres/macros/catalog.sql#L22
+# See https://www.npgsql.org/dev/types.html for helpful type annotations
POSTGRES_TYPES_MAP: Dict[str, Any] = {
"boolean": BooleanType,
"bytea": BytesType,
@@ -430,3 +444,54 @@ def resolve_vertica_modified_type(type_string: str) -> Any:
"geography": None,
"uuid": StringType,
}
+
+
+_merged_mapping = {
+ "boolean": BooleanType,
+ "date": DateType,
+ "time": TimeType,
+ "numeric": NumberType,
+ "text": StringType,
+ "timestamp with time zone": DateType,
+ "timestamp without time zone": DateType,
+ "integer": NumberType,
+ "float8": NumberType,
+ "struct": RecordType,
+ **POSTGRES_TYPES_MAP,
+ **SNOWFLAKE_TYPES_MAP,
+ **BIGQUERY_TYPES_MAP,
+ **SPARK_SQL_TYPES_MAP,
+ **TRINO_SQL_TYPES_MAP,
+ **ATHENA_SQL_TYPES_MAP,
+ **VERTICA_SQL_TYPES_MAP,
+}
+
+
+def resolve_sql_type(
+ column_type: Optional[str],
+ platform: Optional[str] = None,
+) -> Optional[DATAHUB_FIELD_TYPE]:
+ # In theory, we should use the platform-specific mapping where available.
+ # However, the types don't ever conflict, so the merged mapping is fine.
+ TypeClass: Optional[Type[DATAHUB_FIELD_TYPE]] = (
+ _merged_mapping.get(column_type) if column_type else None
+ )
+
+ if TypeClass is None and column_type:
+ # resolve a modified type
+ if platform == "trino":
+ TypeClass = resolve_trino_modified_type(column_type)
+ elif platform == "athena":
+ TypeClass = resolve_athena_modified_type(column_type)
+ elif platform == "postgres" or platform == "redshift":
+ # Redshift uses a variant of Postgres, so we can use the same logic.
+ TypeClass = resolve_postgres_modified_type(column_type)
+ elif platform == "vertica":
+ TypeClass = resolve_vertica_modified_type(column_type)
+ elif platform == "snowflake":
+ # Snowflake types are uppercase, so we check that.
+ TypeClass = _merged_mapping.get(column_type.upper())
+
+ if TypeClass:
+ return TypeClass()
+ return None
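
A small usage sketch of the new helper; the assertions follow directly from the merged mapping above and are illustrative rather than an exhaustive contract:

from datahub.ingestion.source.sql.sql_types import resolve_sql_type
from datahub.metadata.com.linkedin.pegasus2avro.schema import BooleanType, NumberType

assert isinstance(resolve_sql_type("boolean"), BooleanType)  # direct hit in _merged_mapping
assert isinstance(resolve_sql_type("integer", platform="postgres"), NumberType)  # also a direct hit
assert resolve_sql_type("definitely-not-a-type") is None  # unknown types resolve to None
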
diff --git a/metadata-ingestion/src/datahub/ingestion/source/unity/proxy_types.py b/metadata-ingestion/src/datahub/ingestion/source/unity/proxy_types.py
index f84f6c1b0c08d..9c5752c518df1 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/unity/proxy_types.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/unity/proxy_types.py
@@ -33,6 +33,7 @@
logger = logging.getLogger(__name__)
+# TODO: (maybe) Replace with standardized types in sql_types.py
DATA_TYPE_REGISTRY: dict = {
ColumnTypeName.BOOLEAN: BooleanTypeClass,
ColumnTypeName.BYTE: BytesTypeClass,
diff --git a/metadata-ingestion/src/datahub/telemetry/telemetry.py b/metadata-ingestion/src/datahub/telemetry/telemetry.py
index 4faf04ee2d2c7..22b2cb6a101af 100644
--- a/metadata-ingestion/src/datahub/telemetry/telemetry.py
+++ b/metadata-ingestion/src/datahub/telemetry/telemetry.py
@@ -7,7 +7,7 @@
import uuid
from functools import wraps
from pathlib import Path
-from typing import Any, Callable, Dict, List, Optional, TypeVar
+from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, TypeVar
from mixpanel import Consumer, Mixpanel
from typing_extensions import ParamSpec
@@ -16,10 +16,12 @@
from datahub.cli.config_utils import DATAHUB_ROOT_FOLDER
from datahub.cli.env_utils import get_boolean_env_variable
from datahub.configuration.common import ExceptionWithProps
-from datahub.ingestion.graph.client import DataHubGraph
from datahub.metadata.schema_classes import _custom_package_path
from datahub.utilities.perf_timer import PerfTimer
+if TYPE_CHECKING:
+ from datahub.ingestion.graph.client import DataHubGraph
+
logger = logging.getLogger(__name__)
DATAHUB_FOLDER = Path(DATAHUB_ROOT_FOLDER)
@@ -117,7 +119,11 @@ class Telemetry:
tracking_init: bool = False
sentry_enabled: bool = False
+ context_properties: Dict[str, Any] = {}
+
def __init__(self):
+ self.context_properties = {}
+
if SENTRY_DSN:
self.sentry_enabled = True
try:
@@ -157,6 +163,9 @@ def __init__(self):
except Exception as e:
logger.debug(f"Error connecting to mixpanel: {e}")
+ # Initialize the default properties for all events.
+ self.set_context()
+
def update_config(self) -> bool:
"""
Update the config file with the current client ID and enabled status.
@@ -238,18 +247,22 @@ def load_config(self) -> bool:
return False
- def update_capture_exception_context(
+ def set_context(
self,
- server: Optional[DataHubGraph] = None,
+ server: Optional["DataHubGraph"] = None,
properties: Optional[Dict[str, Any]] = None,
) -> None:
+ self.context_properties = {
+ **self._server_props(server),
+ **(properties or {}),
+ }
+
if self.sentry_enabled:
from sentry_sdk import set_tag
properties = {
**_default_telemetry_properties(),
- **self._server_props(server),
- **(properties or {}),
+ **self.context_properties,
}
for key in properties:
@@ -297,7 +310,6 @@ def ping(
self,
event_name: str,
properties: Optional[Dict[str, Any]] = None,
- server: Optional[DataHubGraph] = None,
) -> None:
"""
Send a single telemetry event.
@@ -323,14 +335,15 @@ def ping(
properties = {
**_default_telemetry_properties(),
- **self._server_props(server),
+ **self.context_properties,
**properties,
}
self.mp.track(self.client_id, event_name, properties)
except Exception as e:
logger.debug(f"Error reporting telemetry: {e}")
- def _server_props(self, server: Optional[DataHubGraph]) -> Dict[str, str]:
+ @classmethod
+ def _server_props(cls, server: Optional["DataHubGraph"]) -> Dict[str, str]:
if not server:
return {
"server_type": "n/a",
@@ -435,6 +448,7 @@ def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> _T:
**call_props,
"status": "error",
**_error_props(e),
+ "code": e.code,
},
)
telemetry_instance.capture_exception(e)
diff --git a/metadata-ingestion/src/datahub/utilities/urn_encoder.py b/metadata-ingestion/src/datahub/utilities/urn_encoder.py
index 88c0a128b8e46..4f19eeff3e70f 100644
--- a/metadata-ingestion/src/datahub/utilities/urn_encoder.py
+++ b/metadata-ingestion/src/datahub/utilities/urn_encoder.py
@@ -4,7 +4,8 @@
# NOTE: Frontend relies on encoding these three characters. Specifically, we decode and encode schema fields for column level lineage.
# If this changes, make appropriate changes to datahub-web-react/src/app/lineage/utils/columnLineageUtils.ts
# We also rely on encoding these exact three characters when generating schemaField urns in our graphQL layer. Update SchemaFieldUtils if this changes.
-RESERVED_CHARS = {",", "(", ")"}
+# Also see https://datahubproject.io/docs/what/urn/#restrictions
+RESERVED_CHARS = {",", "(", ")", "␟"}
RESERVED_CHARS_EXTENDED = RESERVED_CHARS.union({"%"})
diff --git a/metadata-ingestion/tests/integration/dbt/test_dbt.py b/metadata-ingestion/tests/integration/dbt/test_dbt.py
index 390d8d7698dd4..c6a3dc4fd590b 100644
--- a/metadata-ingestion/tests/integration/dbt/test_dbt.py
+++ b/metadata-ingestion/tests/integration/dbt/test_dbt.py
@@ -11,12 +11,6 @@
from datahub.ingestion.run.pipeline_config import PipelineConfig, SourceConfig
from datahub.ingestion.source.dbt.dbt_common import DBTEntitiesEnabled, EmitDirective
from datahub.ingestion.source.dbt.dbt_core import DBTCoreConfig, DBTCoreSource
-from datahub.ingestion.source.sql.sql_types import (
- ATHENA_SQL_TYPES_MAP,
- TRINO_SQL_TYPES_MAP,
- resolve_athena_modified_type,
- resolve_trino_modified_type,
-)
from tests.test_helpers import mce_helpers, test_connection_helpers
FROZEN_TIME = "2022-02-03 07:00:00"
@@ -362,69 +356,6 @@ def test_dbt_tests(test_resources_dir, pytestconfig, tmp_path, mock_time, **kwar
)
-@pytest.mark.parametrize(
- "data_type, expected_data_type",
- [
- ("boolean", "boolean"),
- ("tinyint", "tinyint"),
- ("smallint", "smallint"),
- ("int", "int"),
- ("integer", "integer"),
- ("bigint", "bigint"),
- ("real", "real"),
- ("double", "double"),
- ("decimal(10,0)", "decimal"),
- ("varchar(20)", "varchar"),
- ("char", "char"),
- ("varbinary", "varbinary"),
- ("json", "json"),
- ("date", "date"),
- ("time", "time"),
- ("time(12)", "time"),
- ("timestamp", "timestamp"),
- ("timestamp(3)", "timestamp"),
- ("row(x bigint, y double)", "row"),
- ("array(row(x bigint, y double))", "array"),
- ("map(varchar, varchar)", "map"),
- ],
-)
-def test_resolve_trino_modified_type(data_type, expected_data_type):
- assert (
- resolve_trino_modified_type(data_type)
- == TRINO_SQL_TYPES_MAP[expected_data_type]
- )
-
-
-@pytest.mark.parametrize(
- "data_type, expected_data_type",
- [
- ("boolean", "boolean"),
- ("tinyint", "tinyint"),
- ("smallint", "smallint"),
- ("int", "int"),
- ("integer", "integer"),
- ("bigint", "bigint"),
- ("float", "float"),
- ("double", "double"),
- ("decimal(10,0)", "decimal"),
- ("varchar(20)", "varchar"),
- ("char", "char"),
- ("binary", "binary"),
- ("date", "date"),
- ("timestamp", "timestamp"),
- ("timestamp(3)", "timestamp"),
- ("struct", "struct"),
- ("array>", "array"),
- ("map", "map"),
- ],
-)
-def test_resolve_athena_modified_type(data_type, expected_data_type):
- assert (
- resolve_athena_modified_type(data_type)
- == ATHENA_SQL_TYPES_MAP[expected_data_type]
- )
-
-
@pytest.mark.integration
@freeze_time(FROZEN_TIME)
def test_dbt_tests_only_assertions(
diff --git a/metadata-ingestion/tests/integration/feast/feast_repository_mces_golden.json b/metadata-ingestion/tests/integration/feast/feast_repository_mces_golden.json
index 1b91925289845..a4fd9843c5cf4 100644
--- a/metadata-ingestion/tests/integration/feast/feast_repository_mces_golden.json
+++ b/metadata-ingestion/tests/integration/feast/feast_repository_mces_golden.json
@@ -9,8 +9,33 @@
"removed": false
}
},
+ {
+ "com.linkedin.pegasus2avro.common.GlobalTags": {
+ "tags": [
+ {
+ "tag": "urn:li:tag:deprecated"
+ }
+ ]
+ }
+ },
+ {
+ "com.linkedin.pegasus2avro.common.Ownership": {
+ "owners": [
+ {
+ "owner": "urn:li:corpGroup:MOCK_OWNER",
+ "type": "BUSINESS_OWNER"
+ }
+ ],
+ "ownerTypes": {},
+ "lastModified": {
+ "time": 0,
+ "actor": "urn:li:corpuser:unknown"
+ }
+ }
+ },
{
"com.linkedin.pegasus2avro.ml.metadata.MLPrimaryKeyProperties": {
+ "customProperties": {},
"description": "Driver ID",
"dataType": "ORDINAL",
"sources": [
@@ -23,7 +48,8 @@
},
"systemMetadata": {
"lastObserved": 1586847600000,
- "runId": "feast-repository-test"
+ "runId": "feast-repository-test",
+ "lastRunId": "no-run-id-provided"
}
},
{
@@ -36,8 +62,18 @@
"removed": false
}
},
+ {
+ "com.linkedin.pegasus2avro.common.GlobalTags": {
+ "tags": [
+ {
+ "tag": "urn:li:tag:needs_documentation"
+ }
+ ]
+ }
+ },
{
"com.linkedin.pegasus2avro.ml.metadata.MLFeatureProperties": {
+ "customProperties": {},
"description": "Conv rate",
"dataType": "CONTINUOUS",
"sources": [
@@ -50,7 +86,8 @@
},
"systemMetadata": {
"lastObserved": 1586847600000,
- "runId": "feast-repository-test"
+ "runId": "feast-repository-test",
+ "lastRunId": "no-run-id-provided"
}
},
{
@@ -65,6 +102,7 @@
},
{
"com.linkedin.pegasus2avro.ml.metadata.MLFeatureProperties": {
+ "customProperties": {},
"description": "Acc rate",
"dataType": "CONTINUOUS",
"sources": [
@@ -77,7 +115,8 @@
},
"systemMetadata": {
"lastObserved": 1586847600000,
- "runId": "feast-repository-test"
+ "runId": "feast-repository-test",
+ "lastRunId": "no-run-id-provided"
}
},
{
@@ -92,6 +131,7 @@
},
{
"com.linkedin.pegasus2avro.ml.metadata.MLFeatureProperties": {
+ "customProperties": {},
"description": "Avg daily trips",
"dataType": "ORDINAL",
"sources": [
@@ -104,7 +144,8 @@
},
"systemMetadata": {
"lastObserved": 1586847600000,
- "runId": "feast-repository-test"
+ "runId": "feast-repository-test",
+ "lastRunId": "no-run-id-provided"
}
},
{
@@ -119,6 +160,7 @@
},
{
"com.linkedin.pegasus2avro.ml.metadata.MLFeatureProperties": {
+ "customProperties": {},
"description": "String feature",
"dataType": "TEXT",
"sources": [
@@ -131,7 +173,8 @@
},
"systemMetadata": {
"lastObserved": 1586847600000,
- "runId": "feast-repository-test"
+ "runId": "feast-repository-test",
+ "lastRunId": "no-run-id-provided"
}
},
{
@@ -151,6 +194,30 @@
"removed": false
}
},
+ {
+ "com.linkedin.pegasus2avro.common.GlobalTags": {
+ "tags": [
+ {
+ "tag": "urn:li:tag:deprecated"
+ }
+ ]
+ }
+ },
+ {
+ "com.linkedin.pegasus2avro.common.Ownership": {
+ "owners": [
+ {
+ "owner": "urn:li:corpGroup:MOCK_OWNER",
+ "type": "BUSINESS_OWNER"
+ }
+ ],
+ "ownerTypes": {},
+ "lastModified": {
+ "time": 0,
+ "actor": "urn:li:corpuser:unknown"
+ }
+ }
+ },
{
"com.linkedin.pegasus2avro.ml.metadata.MLFeatureTableProperties": {
"customProperties": {},
@@ -170,7 +237,8 @@
},
"systemMetadata": {
"lastObserved": 1586847600000,
- "runId": "feast-repository-test"
+ "runId": "feast-repository-test",
+ "lastRunId": "no-run-id-provided"
}
},
{
@@ -189,7 +257,8 @@
},
"systemMetadata": {
"lastObserved": 1586847600000,
- "runId": "feast-repository-test"
+ "runId": "feast-repository-test",
+ "lastRunId": "no-run-id-provided"
}
},
{
@@ -204,6 +273,7 @@
},
{
"com.linkedin.pegasus2avro.ml.metadata.MLFeatureProperties": {
+ "customProperties": {},
"dataType": "CONTINUOUS",
"sources": [
"urn:li:dataset:(urn:li:dataPlatform:request,vals_to_add,PROD)",
@@ -216,7 +286,8 @@
},
"systemMetadata": {
"lastObserved": 1586847600000,
- "runId": "feast-repository-test"
+ "runId": "feast-repository-test",
+ "lastRunId": "no-run-id-provided"
}
},
{
@@ -231,6 +302,7 @@
},
{
"com.linkedin.pegasus2avro.ml.metadata.MLFeatureProperties": {
+ "customProperties": {},
"dataType": "CONTINUOUS",
"sources": [
"urn:li:dataset:(urn:li:dataPlatform:request,vals_to_add,PROD)",
@@ -243,7 +315,8 @@
},
"systemMetadata": {
"lastObserved": 1586847600000,
- "runId": "feast-repository-test"
+ "runId": "feast-repository-test",
+ "lastRunId": "no-run-id-provided"
}
},
{
@@ -278,7 +351,8 @@
},
"systemMetadata": {
"lastObserved": 1586847600000,
- "runId": "feast-repository-test"
+ "runId": "feast-repository-test",
+ "lastRunId": "no-run-id-provided"
}
},
{
@@ -297,7 +371,40 @@
},
"systemMetadata": {
"lastObserved": 1586847600000,
- "runId": "feast-repository-test"
+ "runId": "feast-repository-test",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "tag",
+ "entityUrn": "urn:li:tag:deprecated",
+ "changeType": "UPSERT",
+ "aspectName": "tagKey",
+ "aspect": {
+ "json": {
+ "name": "deprecated"
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1586847600000,
+ "runId": "feast-repository-test",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "tag",
+ "entityUrn": "urn:li:tag:needs_documentation",
+ "changeType": "UPSERT",
+ "aspectName": "tagKey",
+ "aspect": {
+ "json": {
+ "name": "needs_documentation"
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1586847600000,
+ "runId": "feast-repository-test",
+ "lastRunId": "no-run-id-provided"
}
}
]
\ No newline at end of file
diff --git a/metadata-ingestion/tests/integration/feast/feature_store/data/registry.db b/metadata-ingestion/tests/integration/feast/feature_store/data/registry.db
index a511ff56c9770..5dca29d92afe5 100644
Binary files a/metadata-ingestion/tests/integration/feast/feature_store/data/registry.db and b/metadata-ingestion/tests/integration/feast/feature_store/data/registry.db differ
diff --git a/metadata-ingestion/tests/integration/feast/feature_store/features.py b/metadata-ingestion/tests/integration/feast/feature_store/features.py
index a6e6cd3616e92..dcfd417637958 100644
--- a/metadata-ingestion/tests/integration/feast/feature_store/features.py
+++ b/metadata-ingestion/tests/integration/feast/feature_store/features.py
@@ -19,6 +19,8 @@
join_keys=["driver_id"],
value_type=ValueType.INT64,
description="Driver ID",
+ owner="MOCK_OWNER",
+ tags={"name": "deprecated"},
)
driver_hourly_stats_view = FeatureView(
@@ -29,7 +31,7 @@
Field(
name="conv_rate",
dtype=feast.types.Float64,
- tags=dict(description="Conv rate"),
+ tags={"name": "needs_documentation", "description": "Conv rate"},
),
Field(
name="acc_rate",
@@ -49,7 +51,8 @@
],
online=True,
source=driver_hourly_stats_source,
- tags={},
+ tags={"name": "deprecated"},
+ owner="MOCK_OWNER",
)
input_request = RequestSource(
diff --git a/metadata-ingestion/tests/integration/feast/test_feast_repository.py b/metadata-ingestion/tests/integration/feast/test_feast_repository.py
index a6bdce6722289..7f04337145dc3 100644
--- a/metadata-ingestion/tests/integration/feast/test_feast_repository.py
+++ b/metadata-ingestion/tests/integration/feast/test_feast_repository.py
@@ -19,6 +19,15 @@ def test_feast_repository_ingest(pytestconfig, tmp_path, mock_time):
"config": {
"path": str(test_resources_dir / "feature_store"),
"environment": "PROD",
+ "enable_tag_extraction": True,
+ "enable_owner_extraction": True,
+ "owner_mappings": [
+ {
+ "feast_owner_name": "MOCK_OWNER",
+ "datahub_owner_urn": "urn:li:corpGroup:MOCK_OWNER",
+ "datahub_ownership_type": "BUSINESS_OWNER",
+ }
+ ],
},
},
"sink": {
diff --git a/metadata-ingestion/tests/integration/kafka/test_kafka.py b/metadata-ingestion/tests/integration/kafka/test_kafka.py
index 597889c8440b7..7462f177684b7 100644
--- a/metadata-ingestion/tests/integration/kafka/test_kafka.py
+++ b/metadata-ingestion/tests/integration/kafka/test_kafka.py
@@ -128,11 +128,32 @@ def test_kafka_oauth_callback(
pipeline.run()
- is_found: bool = False
+ # Initialize flags to track oauth events
+ checks = {
+ "consumer_polling": False,
+ "consumer_oauth_callback": False,
+ "admin_polling": False,
+ "admin_oauth_callback": False,
+ }
+
+ # Read log file and check for oauth events
with open(log_file, "r") as file:
- for line_number, line in enumerate(file, 1):
+ for line in file:
+ # Check for polling events
+ if "Initiating polling for kafka admin client" in line:
+ checks["admin_polling"] = True
+ elif "Initiating polling for kafka consumer" in line:
+ checks["consumer_polling"] = True
+
+ # Check for oauth callbacks
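+            # Callbacks seen before admin polling starts are attributed to the consumer; later ones to the admin client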
if oauth.MESSAGE in line:
- is_found = True
- break
-
- assert is_found
+ if checks["consumer_polling"] and not checks["admin_polling"]:
+ checks["consumer_oauth_callback"] = True
+ elif checks["consumer_polling"] and checks["admin_polling"]:
+ checks["admin_oauth_callback"] = True
+
+ # Verify all oauth events occurred
+ assert checks["consumer_polling"], "Consumer polling was not initiated"
+ assert checks["consumer_oauth_callback"], "Consumer oauth callback not found"
+ assert checks["admin_polling"], "Admin polling was not initiated"
+ assert checks["admin_oauth_callback"], "Admin oauth callback not found"
diff --git a/metadata-ingestion/tests/integration/powerbi/test_m_parser.py b/metadata-ingestion/tests/integration/powerbi/test_m_parser.py
index f22998b47b900..63821f9038a88 100644
--- a/metadata-ingestion/tests/integration/powerbi/test_m_parser.py
+++ b/metadata-ingestion/tests/integration/powerbi/test_m_parser.py
@@ -7,6 +7,7 @@
import pytest
from lark import Tree
+import datahub.ingestion.source.powerbi.m_query.data_classes
import datahub.ingestion.source.powerbi.rest_api_wrapper.data_classes as powerbi_data_classes
from datahub.ingestion.api.common import PipelineContext
from datahub.ingestion.api.source import StructuredLogLevel
@@ -18,8 +19,11 @@
AbstractDataPlatformInstanceResolver,
create_dataplatform_instance_resolver,
)
-from datahub.ingestion.source.powerbi.m_query import parser, resolver, tree_function
-from datahub.ingestion.source.powerbi.m_query.resolver import DataPlatformTable, Lineage
+from datahub.ingestion.source.powerbi.m_query import parser, tree_function
+from datahub.ingestion.source.powerbi.m_query.data_classes import (
+ DataPlatformTable,
+ Lineage,
+)
pytestmark = pytest.mark.integration_batch_2
@@ -62,7 +66,9 @@
]
-def get_data_platform_tables_with_dummy_table(q: str) -> List[resolver.Lineage]:
+def get_data_platform_tables_with_dummy_table(
+ q: str,
+) -> List[datahub.ingestion.source.powerbi.m_query.data_classes.Lineage]:
table: powerbi_data_classes.Table = powerbi_data_classes.Table(
columns=[],
measures=[],
@@ -759,7 +765,9 @@ def test_sqlglot_parser():
}
)
- lineage: List[resolver.Lineage] = parser.get_upstream_tables(
+ lineage: List[
+ datahub.ingestion.source.powerbi.m_query.data_classes.Lineage
+ ] = parser.get_upstream_tables(
table,
reporter,
ctx=ctx,
@@ -806,7 +814,9 @@ def test_sqlglot_parser():
def test_databricks_multi_cloud():
q = M_QUERIES[25]
- lineage: List[resolver.Lineage] = get_data_platform_tables_with_dummy_table(q=q)
+ lineage: List[
+ datahub.ingestion.source.powerbi.m_query.data_classes.Lineage
+ ] = get_data_platform_tables_with_dummy_table(q=q)
assert len(lineage) == 1
@@ -823,7 +833,9 @@ def test_databricks_multi_cloud():
def test_databricks_catalog_pattern_1():
q = M_QUERIES[26]
- lineage: List[resolver.Lineage] = get_data_platform_tables_with_dummy_table(q=q)
+ lineage: List[
+ datahub.ingestion.source.powerbi.m_query.data_classes.Lineage
+ ] = get_data_platform_tables_with_dummy_table(q=q)
assert len(lineage) == 1
@@ -892,7 +904,9 @@ def test_sqlglot_parser_2():
}
)
- lineage: List[resolver.Lineage] = parser.get_upstream_tables(
+ lineage: List[
+ datahub.ingestion.source.powerbi.m_query.data_classes.Lineage
+ ] = parser.get_upstream_tables(
table,
reporter,
ctx=ctx,
@@ -951,7 +965,9 @@ def test_databricks_regular_case_with_view():
def test_snowflake_double_double_quotes():
q = M_QUERIES[30]
- lineage: List[resolver.Lineage] = get_data_platform_tables_with_dummy_table(q=q)
+ lineage: List[
+ datahub.ingestion.source.powerbi.m_query.data_classes.Lineage
+ ] = get_data_platform_tables_with_dummy_table(q=q)
assert len(lineage) == 1
@@ -968,7 +984,9 @@ def test_snowflake_double_double_quotes():
def test_databricks_multicloud():
q = M_QUERIES[31]
- lineage: List[resolver.Lineage] = get_data_platform_tables_with_dummy_table(q=q)
+ lineage: List[
+ datahub.ingestion.source.powerbi.m_query.data_classes.Lineage
+ ] = get_data_platform_tables_with_dummy_table(q=q)
assert len(lineage) == 1
@@ -985,7 +1003,9 @@ def test_databricks_multicloud():
def test_snowflake_multi_function_call():
q = M_QUERIES[32]
- lineage: List[resolver.Lineage] = get_data_platform_tables_with_dummy_table(q=q)
+ lineage: List[
+ datahub.ingestion.source.powerbi.m_query.data_classes.Lineage
+ ] = get_data_platform_tables_with_dummy_table(q=q)
assert len(lineage) == 1
@@ -1002,7 +1022,9 @@ def test_snowflake_multi_function_call():
def test_mssql_drop_with_select():
q = M_QUERIES[33]
- lineage: List[resolver.Lineage] = get_data_platform_tables_with_dummy_table(q=q)
+ lineage: List[
+ datahub.ingestion.source.powerbi.m_query.data_classes.Lineage
+ ] = get_data_platform_tables_with_dummy_table(q=q)
assert len(lineage) == 1
@@ -1062,7 +1084,9 @@ def test_empty_string_in_m_query():
# TRIM(TRIM(TRIM(AGENT_NAME, '\"\"'), '+'), '\\'') is in Query
q = "let\n Source = Value.NativeQuery(Snowflake.Databases(\"bu10758.ap-unknown-2.fakecomputing.com\",\"operations_analytics_warehouse_prod\",[Role=\"OPERATIONS_ANALYTICS_MEMBER\"]){[Name=\"OPERATIONS_ANALYTICS\"]}[Data], \"select #(lf)UPPER(REPLACE(AGENT_NAME,'-','')) AS CLIENT_DIRECTOR,#(lf)TRIM(TRIM(TRIM(AGENT_NAME, '\"\"'), '+'), '\\'') AS TRIM_AGENT_NAME,#(lf)TIER,#(lf)UPPER(MANAGER),#(lf)TEAM_TYPE,#(lf)DATE_TARGET,#(lf)MONTHID,#(lf)TARGET_TEAM,#(lf)SELLER_EMAIL,#(lf)concat((UPPER(REPLACE(AGENT_NAME,'-',''))), MONTHID) as AGENT_KEY,#(lf)UNIT_TARGET AS SME_Quota,#(lf)AMV_TARGET AS Revenue_Quota,#(lf)SERVICE_QUOTA,#(lf)BL_TARGET,#(lf)SOFTWARE_QUOTA as Software_Quota#(lf)#(lf)from OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_SME_UNIT_TARGETS inner join OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_SME_UNIT #(lf)#(lf)where YEAR_TARGET >= 2022#(lf)and TEAM_TYPE = 'Accounting'#(lf)and TARGET_TEAM = 'Enterprise'#(lf)AND TIER = 'Client Director'\", null, [EnableFolding=true])\nin\n Source"
- lineage: List[resolver.Lineage] = get_data_platform_tables_with_dummy_table(q=q)
+ lineage: List[
+ datahub.ingestion.source.powerbi.m_query.data_classes.Lineage
+ ] = get_data_platform_tables_with_dummy_table(q=q)
assert len(lineage) == 1
@@ -1084,7 +1108,9 @@ def test_double_quotes_in_alias():
# SELECT CAST(sales_date AS DATE) AS \"\"Date\"\" in query
q = 'let \n Source = Sql.Database("abc.com", "DB", [Query="SELECT CAST(sales_date AS DATE) AS ""Date"",#(lf) SUM(cshintrpret) / 60.0 AS ""Total Order All Items"",#(lf)#(tab)#(tab)#(tab) SUM(cshintrpret) / 60.0 - LAG(SUM(cshintrpret) / 60.0, 1) OVER (ORDER BY CAST(sales_date AS DATE)) AS ""Total minute difference"",#(lf)#(tab)#(tab)#(tab) SUM(sale_price) / 60.0 - LAG(SUM(sale_price) / 60.0, 1) OVER (ORDER BY CAST(sales_date AS DATE)) AS ""Normal minute difference""#(lf) FROM [DB].[dbo].[sales_t]#(lf) WHERE sales_date >= GETDATE() - 365#(lf) GROUP BY CAST(sales_date AS DATE),#(lf)#(tab)#(tab)CAST(sales_date AS TIME);"]) \n in \n Source'
- lineage: List[resolver.Lineage] = get_data_platform_tables_with_dummy_table(q=q)
+ lineage: List[
+ datahub.ingestion.source.powerbi.m_query.data_classes.Lineage
+ ] = get_data_platform_tables_with_dummy_table(q=q)
assert len(lineage) == 1
diff --git a/metadata-ingestion/tests/unit/test_athena_source.py b/metadata-ingestion/tests/unit/test_athena_source.py
index 875cf3800daf8..f8b6220d18273 100644
--- a/metadata-ingestion/tests/unit/test_athena_source.py
+++ b/metadata-ingestion/tests/unit/test_athena_source.py
@@ -93,7 +93,8 @@ def test_athena_get_table_properties():
"CreateTime": datetime.now(),
"LastAccessTime": datetime.now(),
"PartitionKeys": [
- {"Name": "testKey", "Type": "string", "Comment": "testComment"}
+ {"Name": "year", "Type": "string", "Comment": "testComment"},
+ {"Name": "month", "Type": "string", "Comment": "testComment"},
],
"Parameters": {
"comment": "testComment",
@@ -112,8 +113,18 @@ def test_athena_get_table_properties():
response=table_metadata
)
+ # Mock partition query results
+ mock_cursor.execute.return_value.description = [
+ ["year"],
+ ["month"],
+ ]
+ mock_cursor.execute.return_value.__iter__.return_value = [["2023", "12"]]
+
ctx = PipelineContext(run_id="test")
source = AthenaSource(config=config, ctx=ctx)
+ source.cursor = mock_cursor
+
+ # Test table properties
description, custom_properties, location = source.get_table_properties(
inspector=mock_inspector, table=table, schema=schema
)
@@ -124,13 +135,35 @@ def test_athena_get_table_properties():
"last_access_time": "2020-04-14 07:00:00",
"location": "s3://testLocation",
"outputformat": "testOutputFormat",
- "partition_keys": '[{"name": "testKey", "type": "string", "comment": "testComment"}]',
+ "partition_keys": '[{"name": "year", "type": "string", "comment": "testComment"}, {"name": "month", "type": "string", "comment": "testComment"}]',
"serde.serialization.lib": "testSerde",
"table_type": "testType",
}
-
assert location == make_s3_urn("s3://testLocation", "PROD")
+ # Test partition functionality
+ partitions = source.get_partitions(
+ inspector=mock_inspector, schema=schema, table=table
+ )
+ assert partitions == ["year", "month"]
+
+ # Verify the correct SQL query was generated for partitions
+ expected_query = """\
+select year,month from "test_schema"."test_table$partitions" \
+where CAST(year as VARCHAR) || '-' || CAST(month as VARCHAR) = \
+(select max(CAST(year as VARCHAR) || '-' || CAST(month as VARCHAR)) \
+from "test_schema"."test_table$partitions")"""
+ mock_cursor.execute.assert_called_once()
+ actual_query = mock_cursor.execute.call_args[0][0]
+ assert actual_query == expected_query
+
+ # Verify partition cache was populated correctly
+ assert source.table_partition_cache[schema][table].partitions == partitions
+ assert source.table_partition_cache[schema][table].max_partition == {
+ "year": "2023",
+ "month": "12",
+ }
+
def test_get_column_type_simple_types():
assert isinstance(
@@ -214,3 +247,9 @@ def test_column_type_complex_combination():
assert isinstance(
result._STRUCT_fields[2][1].item_type._STRUCT_fields[1][1], types.String
)
+
+
+def test_casted_partition_key():
+ from datahub.ingestion.source.sql.athena import AthenaSource
+
+ assert AthenaSource._casted_partition_key("test_col") == "CAST(test_col as VARCHAR)"
diff --git a/metadata-ingestion/tests/unit/test_powerbi_parser.py b/metadata-ingestion/tests/unit/test_powerbi_parser.py
index 31579f0c0abd3..a487a3a5b87f8 100644
--- a/metadata-ingestion/tests/unit/test_powerbi_parser.py
+++ b/metadata-ingestion/tests/unit/test_powerbi_parser.py
@@ -8,9 +8,7 @@
from datahub.ingestion.source.powerbi.dataplatform_instance_resolver import (
ResolvePlatformInstanceFromDatasetTypeMapping,
)
-from datahub.ingestion.source.powerbi.m_query.resolver import (
- MSSqlDataPlatformTableCreator,
-)
+from datahub.ingestion.source.powerbi.m_query.pattern_handler import MSSqlLineage
from datahub.ingestion.source.powerbi.rest_api_wrapper.data_classes import Table
@@ -27,7 +25,7 @@ def creator():
full_name="db.schema.test_table",
)
- return MSSqlDataPlatformTableCreator(
+ return MSSqlLineage(
ctx=PipelineContext(run_id="test-run-id"),
table=table,
reporter=PowerBiDashboardSourceReport(),
diff --git a/metadata-ingestion/tests/unit/test_sql_types.py b/metadata-ingestion/tests/unit/test_sql_types.py
new file mode 100644
index 0000000000000..ebe5ade115cdd
--- /dev/null
+++ b/metadata-ingestion/tests/unit/test_sql_types.py
@@ -0,0 +1,78 @@
+import pytest
+
+from datahub.ingestion.source.sql.sql_types import (
+ ATHENA_SQL_TYPES_MAP,
+ TRINO_SQL_TYPES_MAP,
+ resolve_athena_modified_type,
+ resolve_sql_type,
+ resolve_trino_modified_type,
+)
+from datahub.metadata.schema_classes import BooleanTypeClass, StringTypeClass
+
+
+@pytest.mark.parametrize(
+ "data_type, expected_data_type",
+ [
+ ("boolean", "boolean"),
+ ("tinyint", "tinyint"),
+ ("smallint", "smallint"),
+ ("int", "int"),
+ ("integer", "integer"),
+ ("bigint", "bigint"),
+ ("real", "real"),
+ ("double", "double"),
+ ("decimal(10,0)", "decimal"),
+ ("varchar(20)", "varchar"),
+ ("char", "char"),
+ ("varbinary", "varbinary"),
+ ("json", "json"),
+ ("date", "date"),
+ ("time", "time"),
+ ("time(12)", "time"),
+ ("timestamp", "timestamp"),
+ ("timestamp(3)", "timestamp"),
+ ("row(x bigint, y double)", "row"),
+ ("array(row(x bigint, y double))", "array"),
+ ("map(varchar, varchar)", "map"),
+ ],
+)
+def test_resolve_trino_modified_type(data_type, expected_data_type):
+ assert (
+ resolve_trino_modified_type(data_type)
+ == TRINO_SQL_TYPES_MAP[expected_data_type]
+ )
+
+
+@pytest.mark.parametrize(
+ "data_type, expected_data_type",
+ [
+ ("boolean", "boolean"),
+ ("tinyint", "tinyint"),
+ ("smallint", "smallint"),
+ ("int", "int"),
+ ("integer", "integer"),
+ ("bigint", "bigint"),
+ ("float", "float"),
+ ("double", "double"),
+ ("decimal(10,0)", "decimal"),
+ ("varchar(20)", "varchar"),
+ ("char", "char"),
+ ("binary", "binary"),
+ ("date", "date"),
+ ("timestamp", "timestamp"),
+ ("timestamp(3)", "timestamp"),
+ ("struct", "struct"),
+ ("array>", "array"),
+ ("map", "map"),
+ ],
+)
+def test_resolve_athena_modified_type(data_type, expected_data_type):
+ assert (
+ resolve_athena_modified_type(data_type)
+ == ATHENA_SQL_TYPES_MAP[expected_data_type]
+ )
+
+
+def test_resolve_sql_type() -> None:
+ assert resolve_sql_type("boolean") == BooleanTypeClass()
+ assert resolve_sql_type("varchar") == StringTypeClass()
diff --git a/metadata-ingestion/tests/unit/urns/test_urn.py b/metadata-ingestion/tests/unit/urns/test_urn.py
index 1bf48082fec8c..73badb3d1b423 100644
--- a/metadata-ingestion/tests/unit/urns/test_urn.py
+++ b/metadata-ingestion/tests/unit/urns/test_urn.py
@@ -1,6 +1,12 @@
import pytest
-from datahub.metadata.urns import DatasetUrn, Urn
+from datahub.metadata.urns import (
+ CorpUserUrn,
+ DashboardUrn,
+ DataPlatformUrn,
+ DatasetUrn,
+ Urn,
+)
from datahub.utilities.urns.error import InvalidUrnError
pytestmark = pytest.mark.filterwarnings("ignore::DeprecationWarning")
@@ -36,20 +42,51 @@ def test_url_encode_urn() -> None:
def test_invalid_urn() -> None:
with pytest.raises(InvalidUrnError):
- Urn.create_from_string("urn:li:abc")
+ Urn.from_string("urn:li:abc")
with pytest.raises(InvalidUrnError):
- Urn.create_from_string("urn:li:abc:")
+ Urn.from_string("urn:li:abc:")
with pytest.raises(InvalidUrnError):
- Urn.create_from_string("urn:li:abc:()")
+ Urn.from_string("urn:li:abc:()")
with pytest.raises(InvalidUrnError):
- Urn.create_from_string("urn:li:abc:(abc,)")
+ Urn.from_string("urn:li:abc:(abc,)")
+
+ with pytest.raises(InvalidUrnError):
+ Urn.from_string("urn:li:corpuser:abc)")
+
+
+def test_urn_colon() -> None:
+ # Colon characters are valid in urns, and should not mess up parsing.
+
+ urn = Urn.from_string(
+ "urn:li:dashboard:(looker,dashboards.thelook::customer_lookup)"
+ )
+ assert isinstance(urn, DashboardUrn)
+
+ assert DataPlatformUrn.from_string("urn:li:dataPlatform:abc:def")
+ assert DatasetUrn.from_string(
+ "urn:li:dataset:(urn:li:dataPlatform:abc:def,table_name,PROD)"
+ )
+ assert Urn.from_string("urn:li:corpuser:foo:bar@example.com")
+
+ # I'm not sure why you'd ever want this, but technically it's a valid urn.
+ urn = Urn.from_string("urn:li:corpuser::")
+ assert isinstance(urn, CorpUserUrn)
+ assert urn.username == ":"
+ assert urn == CorpUserUrn(":")
+
+
+def test_urn_coercion() -> None:
+ urn = CorpUserUrn("foo␟bar")
+ assert urn.urn() == "urn:li:corpuser:foo%E2%90%9Fbar"
+
+ assert urn == Urn.from_string(urn.urn())
def test_urn_type_dispatch() -> None:
- urn = Urn.from_string("urn:li:dataset:(urn:li:dataPlatform:abc,def,prod)")
+ urn = Urn.from_string("urn:li:dataset:(urn:li:dataPlatform:abc,def,PROD)")
assert isinstance(urn, DatasetUrn)
with pytest.raises(InvalidUrnError, match="Passed an urn of type corpuser"):
diff --git a/metadata-integration/java/datahub-protobuf/scripts/check_jar.sh b/metadata-integration/java/datahub-protobuf/scripts/check_jar.sh
index bd0c28f0f8698..66c70f0b85769 100755
--- a/metadata-integration/java/datahub-protobuf/scripts/check_jar.sh
+++ b/metadata-integration/java/datahub-protobuf/scripts/check_jar.sh
@@ -44,7 +44,9 @@ jar -tvf $jarFile |\
grep -v "mime.types" |\
grep -v "com/ibm/.*" |\
grep -v "org/glassfish/" |\
- grep -v "LICENSE"
+ grep -v "LICENSE" |\
+ grep -v "org/apache/avro" |\
+ grep -v "org/apache"
if [ $? -ne 0 ]; then
echo "✅ No unexpected class paths found in ${jarFile}"
diff --git a/metadata-integration/java/datahub-schematron/lib/build.gradle b/metadata-integration/java/datahub-schematron/lib/build.gradle
index 83dec1039f7be..3ba22ff4cb7b5 100644
--- a/metadata-integration/java/datahub-schematron/lib/build.gradle
+++ b/metadata-integration/java/datahub-schematron/lib/build.gradle
@@ -45,10 +45,6 @@ jacocoTestReport {
test.finalizedBy jacocoTestReport
-task checkShadowJar(type: Exec) {
- commandLine 'sh', '-c', 'scripts/check_jar.sh'
-}
-
configurations {
provided
implementation.extendsFrom provided
diff --git a/metadata-integration/java/datahub-schematron/lib/src/main/java/io/datahubproject/schematron/converters/avro/AvroSchemaConverter.java b/metadata-integration/java/datahub-schematron/lib/src/main/java/io/datahubproject/schematron/converters/avro/AvroSchemaConverter.java
index c199f8e6dcb92..0ddb357db76ba 100644
--- a/metadata-integration/java/datahub-schematron/lib/src/main/java/io/datahubproject/schematron/converters/avro/AvroSchemaConverter.java
+++ b/metadata-integration/java/datahub-schematron/lib/src/main/java/io/datahubproject/schematron/converters/avro/AvroSchemaConverter.java
@@ -345,7 +345,8 @@ private void processArrayField(
log.debug("Array Field Path before expand: {}", fieldPath.asString());
fieldPath = fieldPath.popLast();
fieldPath =
- fieldPath.clonePlus(new FieldElement(List.of("array"), new ArrayList<>(), null, null));
+ fieldPath.clonePlus(
+ new FieldElement(Collections.singletonList("array"), new ArrayList<>(), null, null));
Schema.Field elementField =
new Schema.Field(
field.name(),
@@ -400,7 +401,9 @@ private void processMapField(
FieldPath valueFieldPath =
fieldPath
.popLast()
- .clonePlus(new FieldElement(List.of("map"), new ArrayList<>(), null, null));
+ .clonePlus(
+ new FieldElement(
+ Collections.singletonList("map"), new ArrayList<>(), null, null));
processField(valueField, valueFieldPath, defaultNullable, fields, isNullable, mapDataHubType);
} else {
SchemaField mapField =
@@ -434,7 +437,7 @@ private void processUnionField(
unionTypes.stream()
.filter(s -> s.getType() != Schema.Type.NULL)
.findFirst()
- .orElseThrow();
+ .orElseThrow(NoSuchElementException::new);
processField(
new Schema.Field(field.name(), nonNullSchema, field.doc()),
@@ -476,7 +479,8 @@ private void processUnionField(
FieldPath indexedFieldPath = fieldPath.popLast();
indexedFieldPath =
indexedFieldPath.clonePlus(
- new FieldElement(List.of("union"), new ArrayList<>(), null, null));
+ new FieldElement(
+ Collections.singletonList("union"), new ArrayList<>(), null, null));
log.debug("TypeIndex: {}, Indexed Field path : {}", typeIndex, indexedFieldPath.asString());
// FieldPath unionFieldPath =
// fieldPath.expandType(getDiscriminatedType(unionSchema),
diff --git a/metadata-integration/java/datahub-schematron/lib/src/main/java/io/datahubproject/schematron/models/FieldPath.java b/metadata-integration/java/datahub-schematron/lib/src/main/java/io/datahubproject/schematron/models/FieldPath.java
index e51aa1221c54e..b4b72fcc031a5 100644
--- a/metadata-integration/java/datahub-schematron/lib/src/main/java/io/datahubproject/schematron/models/FieldPath.java
+++ b/metadata-integration/java/datahub-schematron/lib/src/main/java/io/datahubproject/schematron/models/FieldPath.java
@@ -2,6 +2,7 @@
import com.linkedin.schema.*;
import java.util.ArrayList;
+import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Objects;
@@ -117,8 +118,8 @@ public FieldPath expandType(String type, Object typeSchema) {
.getPath()
.add(
new FieldElement(
- new ArrayList<>(List.of(type)),
- new ArrayList<>(List.of(typeSchema.toString())),
+ new ArrayList<>(Collections.singletonList(type)),
+ new ArrayList<>(Collections.singletonList(typeSchema.toString())),
null,
null));
}
diff --git a/metadata-io/metadata-io-api/build.gradle b/metadata-io/metadata-io-api/build.gradle
index b8028fad07bb6..5273177b75281 100644
--- a/metadata-io/metadata-io-api/build.gradle
+++ b/metadata-io/metadata-io-api/build.gradle
@@ -16,3 +16,7 @@ dependencies {
testImplementation externalDependency.lombok
testAnnotationProcessor externalDependency.lombok
}
+
+test {
+ environment 'STRICT_URN_VALIDATION_ENABLED', 'true'
+}
\ No newline at end of file
diff --git a/metadata-io/metadata-io-api/src/main/java/com/linkedin/metadata/entity/validation/ValidationApiUtils.java b/metadata-io/metadata-io-api/src/main/java/com/linkedin/metadata/entity/validation/ValidationApiUtils.java
index c2e1c47eca1fd..5e1f09fcc6439 100644
--- a/metadata-io/metadata-io-api/src/main/java/com/linkedin/metadata/entity/validation/ValidationApiUtils.java
+++ b/metadata-io/metadata-io-api/src/main/java/com/linkedin/metadata/entity/validation/ValidationApiUtils.java
@@ -30,7 +30,8 @@ public class ValidationApiUtils {
// Related to BrowsePathv2
public static final String URN_DELIMITER_SEPARATOR = "␟";
// https://datahubproject.io/docs/what/urn/#restrictions
-  public static final Set<String> ILLEGAL_URN_COMPONENT_CHARACTERS = Set.of(":", "(", ")", ",");
+  public static final Set<String> ILLEGAL_URN_COMPONENT_CHARACTERS = Set.of("(", ")");
+  public static final Set<String> ILLEGAL_URN_TUPLE_CHARACTERS = Set.of(",");
/**
* Validates a {@link RecordTemplate} and throws {@link ValidationException} if validation fails.
@@ -86,11 +87,10 @@ public static void validateUrn(
"Error: URN cannot contain " + URN_DELIMITER_SEPARATOR + " character");
}
+ int totalParts = urn.getEntityKey().getParts().size();
     List<String> illegalComponents =
urn.getEntityKey().getParts().stream()
- .flatMap(ValidationApiUtils::processUrnPartRecursively)
- .filter(
- urnPart -> ILLEGAL_URN_COMPONENT_CHARACTERS.stream().anyMatch(urnPart::contains))
+ .flatMap(part -> processUrnPartRecursively(part, totalParts))
.collect(Collectors.toList());
if (!illegalComponents.isEmpty()) {
@@ -114,15 +114,25 @@ public static void validateUrn(
}
/** Recursively process URN parts with URL decoding */
-  private static Stream<String> processUrnPartRecursively(String urnPart) {
+  private static Stream<String> processUrnPartRecursively(String urnPart, int totalParts) {
String decodedPart =
URLDecoder.decode(URLEncodingFixer.fixURLEncoding(urnPart), StandardCharsets.UTF_8);
if (decodedPart.startsWith("urn:li:")) {
// Recursively process nested URN after decoding
+ int nestedParts = UrnUtils.getUrn(decodedPart).getEntityKey().getParts().size();
return UrnUtils.getUrn(decodedPart).getEntityKey().getParts().stream()
- .flatMap(ValidationApiUtils::processUrnPartRecursively);
+ .flatMap(part -> processUrnPartRecursively(part, nestedParts));
}
- return Stream.of(decodedPart);
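+    // Commas are only illegal inside multi-part (tuple) urn keys; single-part keys may contain them.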
+ if (totalParts > 1) {
+ if (ILLEGAL_URN_TUPLE_CHARACTERS.stream().anyMatch(c -> urnPart.contains(c))) {
+ return Stream.of(urnPart);
+ }
+ }
+ if (ILLEGAL_URN_COMPONENT_CHARACTERS.stream().anyMatch(c -> urnPart.contains(c))) {
+ return Stream.of(urnPart);
+ }
+
+ return Stream.empty();
}
/**
diff --git a/metadata-io/metadata-io-api/src/test/java/com/linkedin/metadata/entity/validation/ValidationApiUtilsTest.java b/metadata-io/metadata-io-api/src/test/java/com/linkedin/metadata/entity/validation/ValidationApiUtilsTest.java
index e683e594d8766..a2c9a15d92f90 100644
--- a/metadata-io/metadata-io-api/src/test/java/com/linkedin/metadata/entity/validation/ValidationApiUtilsTest.java
+++ b/metadata-io/metadata-io-api/src/test/java/com/linkedin/metadata/entity/validation/ValidationApiUtilsTest.java
@@ -18,10 +18,36 @@ public void testValidateDatasetUrn() {
// If no exception is thrown, test passes
}
- @Test(expectedExceptions = IllegalArgumentException.class)
+ @Test
public void testSimpleUrnColon() {
- Urn invalidUrn = UrnUtils.getUrn("urn:li:corpuser:foo:bar");
- ValidationApiUtils.validateUrn(entityRegistry, invalidUrn, true);
+ ValidationApiUtils.validateUrn(
+ entityRegistry, UrnUtils.getUrn("urn:li:corpuser:foo:bar"), true);
+ ValidationApiUtils.validateUrn(
+ entityRegistry, UrnUtils.getUrn("urn:li:dataPlatform:abc:def"), true);
+ ValidationApiUtils.validateUrn(
+ entityRegistry, UrnUtils.getUrn("urn:li:corpuser:foo:bar@example.com"), true);
+ // If no exception is thrown, test passes
+ }
+
+ @Test
+ public void testSimpleUrnComma() {
+ ValidationApiUtils.validateUrn(entityRegistry, UrnUtils.getUrn("urn:li:corpuser:,"), true);
+ // If no exception is thrown, test passes
+ }
+
+ @Test(expectedExceptions = IllegalArgumentException.class)
+ public void testTupleUrnComma() {
+ ValidationApiUtils.validateUrn(
+ entityRegistry, UrnUtils.getUrn("urn:li:dashboard:(looker,dashboards,thelook)"), true);
+ }
+
+ @Test(expectedExceptions = IllegalArgumentException.class)
+ public void testFabricTypeCasing() {
+ // prod != PROD
+ ValidationApiUtils.validateUrn(
+ entityRegistry,
+ UrnUtils.getUrn("urn:li:dataset:(urn:li:dataPlatform:abc:def,table_name,prod)"),
+ true);
}
@Test
@@ -34,7 +60,7 @@ public void testComplexUrnColon() throws URISyntaxException {
}
@Test(expectedExceptions = IllegalArgumentException.class)
- public void testUrnFabricType() {
+ public void testFabricTypeParen() {
Urn invalidUrn = UrnUtils.getUrn("urn:li:dataset:(urn:li:dataPlatform:hdfs,/path/to/data,())");
ValidationApiUtils.validateUrn(entityRegistry, invalidUrn, true);
}
@@ -83,20 +109,20 @@ public void testValidComplexUrn() {
UrnUtils.getUrn(
"urn:li:dataset:(urn:li:dataPlatform:bigquery,myproject.dataset.table,PROD)");
- ValidationApiUtils.validateUrn(entityRegistry, validUrn);
+ ValidationApiUtils.validateUrn(entityRegistry, validUrn, true);
// If no exception is thrown, test passes
}
@Test(expectedExceptions = NullPointerException.class)
public void testUrnNull() {
- ValidationApiUtils.validateUrn(entityRegistry, null);
+ ValidationApiUtils.validateUrn(entityRegistry, null, true);
}
@Test
public void testValidPartialUrlEncode() {
Urn validUrn = UrnUtils.getUrn("urn:li:assertion:123=-%28__% weekly__%29");
- ValidationApiUtils.validateUrn(entityRegistry, validUrn);
+ ValidationApiUtils.validateUrn(entityRegistry, validUrn, true);
// If no exception is thrown, test passes
}
@@ -106,7 +132,23 @@ public void testValidPartialUrlEncode2() {
UrnUtils.getUrn(
"urn:li:dataset:(urn:li:dataPlatform:s3,urn:li:dataset:%28urn:li:dataPlatform:s3%2Ctest-datalake-concepts%prog_maintenance%2CPROD%29,PROD)");
- ValidationApiUtils.validateUrn(entityRegistry, validUrn);
+ ValidationApiUtils.validateUrn(entityRegistry, validUrn, true);
+ // If no exception is thrown, test passes
+ }
+
+ @Test
+ public void testValidColon() {
+ Urn validUrn =
+ UrnUtils.getUrn("urn:li:dashboard:(looker,dashboards.thelook::cohort_data_tool)");
+
+ ValidationApiUtils.validateUrn(entityRegistry, validUrn, true);
+ // If no exception is thrown, test passes
+ }
+
+ @Test
+ public void testNoTupleComma() {
+ Urn invalidUrn = UrnUtils.getUrn("urn:li:corpuser:,");
+ ValidationApiUtils.validateUrn(entityRegistry, invalidUrn, true);
// If no exception is thrown, test passes
}
}
diff --git a/metadata-service/configuration/src/main/resources/bootstrap_mcps.yaml b/metadata-service/configuration/src/main/resources/bootstrap_mcps.yaml
index f9497258c384f..0e283dfdfc93c 100644
--- a/metadata-service/configuration/src/main/resources/bootstrap_mcps.yaml
+++ b/metadata-service/configuration/src/main/resources/bootstrap_mcps.yaml
@@ -38,7 +38,7 @@ bootstrap:
# Ingestion Recipes
- name: ingestion-datahub-gc
- version: v4
+ version: v5
optional: false
mcps_location: "bootstrap_mcps/ingestion-datahub-gc.yaml"
values_env: "DATAHUB_GC_BOOTSTRAP_VALUES"
diff --git a/metadata-service/configuration/src/main/resources/bootstrap_mcps/ingestion-datahub-gc.yaml b/metadata-service/configuration/src/main/resources/bootstrap_mcps/ingestion-datahub-gc.yaml
index 395eb5db53424..c0c5be85b16b1 100644
--- a/metadata-service/configuration/src/main/resources/bootstrap_mcps/ingestion-datahub-gc.yaml
+++ b/metadata-service/configuration/src/main/resources/bootstrap_mcps/ingestion-datahub-gc.yaml
@@ -19,6 +19,7 @@
config:
cleanup_expired_tokens: {{cleanup_expired_tokens}}{{^cleanup_expired_tokens}}false{{/cleanup_expired_tokens}}
truncate_indices: {{truncate_indices}}{{^truncate_indices}}true{{/truncate_indices}}
+ truncate_index_older_than_days: {{truncate_indices_retention_days}}{{^truncate_indices_retention_days}}30{{/truncate_indices_retention_days}}
dataprocess_cleanup:
retention_days: {{dataprocess_cleanup.retention_days}}{{^dataprocess_cleanup.retention_days}}10{{/dataprocess_cleanup.retention_days}}
delete_empty_data_jobs: {{dataprocess_cleanup.delete_empty_data_jobs}}{{^dataprocess_cleanup.delete_empty_data_jobs}}true{{/dataprocess_cleanup.delete_empty_data_jobs}}
diff --git a/metadata-service/configuration/src/main/resources/search_config.yaml b/metadata-service/configuration/src/main/resources/search_config.yaml
index e93f8af8b1d6c..47494c8cb1ca4 100644
--- a/metadata-service/configuration/src/main/resources/search_config.yaml
+++ b/metadata-service/configuration/src/main/resources/search_config.yaml
@@ -65,9 +65,9 @@ queryConfigurations:
boost_mode: replace
# Criteria for exact-match only
- # Contains quotes, is a single term with `_`, `.`, or `-` (normally consider for tokenization) then use exact match query
+  # Contains quotes, then use exact match query
- queryRegex: >-
- ^["'].+["']$|^[a-zA-Z0-9]\S+[_.-]\S+[a-zA-Z0-9]$
+ ^["'].+["']$
simpleQuery: false
prefixMatchQuery: true
exactMatchQuery: true
diff --git a/smoke-test/tests/cypress/cypress/e2e/siblings/siblings.js b/smoke-test/tests/cypress/cypress/e2e/siblings/siblings.js
index fb772bd7af1e7..57617d7721e59 100644
--- a/smoke-test/tests/cypress/cypress/e2e/siblings/siblings.js
+++ b/smoke-test/tests/cypress/cypress/e2e/siblings/siblings.js
@@ -98,7 +98,7 @@ describe("siblings", () => {
it("will combine results in search", () => {
cy.login();
- cy.visit("/search?page=1&query=raw_orders");
+ cy.visit("/search?page=1&query=%22raw_orders%22");
cy.contains("Showing 1 - 2 of ");