diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml index 532ba1102ed57..412c962cb6e36 100644 --- a/.github/workflows/build-and-test.yml +++ b/.github/workflows/build-and-test.yml @@ -83,6 +83,7 @@ jobs: - uses: gradle/actions/setup-gradle@v3 - name: Gradle build (and test) for NOT metadata ingestion if: ${{ matrix.command == 'except_metadata_ingestion' && needs.setup.outputs.backend_change == 'true' }} + # datahub-schematron:cli excluded due to dependency on metadata-ingestion run: | ./gradlew build \ -x :metadata-ingestion:build \ @@ -100,6 +101,7 @@ jobs: -x :metadata-ingestion-modules:gx-plugin:check \ -x :datahub-frontend:build \ -x :datahub-web-react:build \ + -x :metadata-integration:java:datahub-schematron:cli:test \ --parallel - name: Gradle build (and test) for frontend if: ${{ matrix.command == 'frontend' && needs.setup.outputs.frontend_change == 'true' }} diff --git a/.github/workflows/check-datahub-jars.yml b/.github/workflows/check-datahub-jars.yml index becf8126dc45b..7a49f32729ec1 100644 --- a/.github/workflows/check-datahub-jars.yml +++ b/.github/workflows/check-datahub-jars.yml @@ -40,4 +40,5 @@ jobs: - name: check ${{ matrix.command }} jar run: | ./gradlew :metadata-integration:java:${{ matrix.command }}:build --info + ./gradlew :metadata-integration:java:${{ matrix.command }}:checkShadowJar ./gradlew :metadata-integration:java:${{ matrix.command }}:javadoc diff --git a/build.gradle b/build.gradle index e3c4f5efe6bb6..be4d7ee8a562b 100644 --- a/build.gradle +++ b/build.gradle @@ -48,6 +48,7 @@ buildscript { // see also datahub-frontend/play.gradle ext.playVersion = '2.8.22' ext.playScalaVersion = '2.13' + ext.akkaVersion = '2.6.21' // 2.7.0+ has incompatible license ext.log4jVersion = '2.23.1' ext.slf4jVersion = '1.7.36' ext.logbackClassic = '1.4.14' @@ -105,7 +106,14 @@ project.ext.spec = [ ] project.ext.externalDependency = [ - 'akkaHttp': "com.typesafe.akka:akka-http-core_$playScalaVersion:10.2.10", + 'akkaHttp': "com.typesafe.akka:akka-http-core_$playScalaVersion:10.2.10", // max version due to licensing + 'akkaActor': "com.typesafe.akka:akka-actor_$playScalaVersion:$akkaVersion", + 'akkaStream': "com.typesafe.akka:akka-stream_$playScalaVersion:$akkaVersion", + 'akkaActorTyped': "com.typesafe.akka:akka-actor-typed_$playScalaVersion:$akkaVersion", + 'akkaSlf4j': "com.typesafe.akka:akka-slf4j_$playScalaVersion:$akkaVersion", + 'akkaJackson': "com.typesafe.akka:akka-serialization-jackson_$playScalaVersion:$akkaVersion", + 'akkaParsing': "com.typesafe.akka:akka-parsing_$playScalaVersion:$akkaVersion", + 'akkaProtobuf': "com.typesafe.akka:akka-protobuf-v3_$playScalaVersion:$akkaVersion", 'antlr4Runtime': 'org.antlr:antlr4-runtime:4.9.3', 'antlr4': 'org.antlr:antlr4:4.9.3', 'assertJ': 'org.assertj:assertj-core:3.11.1', diff --git a/datahub-frontend/play.gradle b/datahub-frontend/play.gradle index 266962721a80a..d513c3c232d9a 100644 --- a/datahub-frontend/play.gradle +++ b/datahub-frontend/play.gradle @@ -55,6 +55,13 @@ dependencies { implementation externalDependency.antlr4Runtime implementation externalDependency.antlr4 implementation externalDependency.akkaHttp + implementation externalDependency.akkaActor + implementation externalDependency.akkaStream + implementation externalDependency.akkaActorTyped + implementation externalDependency.akkaSlf4j + implementation externalDependency.akkaJackson + implementation externalDependency.akkaParsing + implementation externalDependency.akkaProtobuf implementation externalDependency.jerseyCore 
implementation externalDependency.jerseyGuava diff --git a/datahub-web-react/src/app/ingest/source/builder/sources.json b/datahub-web-react/src/app/ingest/source/builder/sources.json index 4ec2d4300aff6..537e429c1dd69 100644 --- a/datahub-web-react/src/app/ingest/source/builder/sources.json +++ b/datahub-web-react/src/app/ingest/source/builder/sources.json @@ -318,6 +318,14 @@ "docsUrl": "https://datahubproject.io/docs/generated/ingestion/sources/cassandra", "recipe": "source:\n type: cassandra\n config:\n # Credentials for on prem cassandra\n contact_point: localhost\n port: 9042\n username: admin\n password: password\n\n # Or\n # Credentials Astra Cloud\n #cloud_config:\n # secure_connect_bundle: Path to Secure Connect Bundle (.zip)\n # token: Application Token\n\n # Optional Allow / Deny extraction of particular keyspaces.\n keyspace_pattern:\n allow: [.*]\n\n # Optional Allow / Deny extraction of particular tables.\n table_pattern:\n allow: [.*]" }, + { + "urn": "urn:li:dataPlatform:iceberg", + "name": "iceberg", + "displayName": "Iceberg", + "description": "Ingest databases and tables from any Iceberg catalog implementation", + "docsUrl": "https://datahubproject.io/docs/generated/ingestion/sources/iceberg", + "recipe": "source:\n type: \"iceberg\"\n config:\n env: dev\n # each thread will open internet connections to fetch manifest files independently, \n # this value needs to be adjusted with ulimit\n processing_threads: 1 \n # a single catalog definition with a form of a dictionary\n catalog: \n demo: # name of the catalog\n type: \"rest\" # other types are available\n uri: \"uri\"\n s3.access-key-id: \"access-key\"\n s3.secret-access-key: \"secret-access-key\"\n s3.region: \"aws-region\"\n profiling:\n enabled: false\n" + }, { "urn": "urn:li:dataPlatform:neo4j", "name": "neo4j", diff --git a/docs/automations/snowflake-tag-propagation.md b/docs/automations/snowflake-tag-propagation.md index b72224642b0f0..8eded451644cc 100644 --- a/docs/automations/snowflake-tag-propagation.md +++ b/docs/automations/snowflake-tag-propagation.md @@ -4,6 +4,8 @@ import FeatureAvailability from '@site/src/components/FeatureAvailability'; +> Note that this Automation is currently in open **Beta**. If you have any questions or issues, please reach out to your Acryl representative. + ## Introduction Snowflake Tag Propagation is an automation that allows you to sync DataHub Glossary Terms and Tags on @@ -15,6 +17,41 @@ both columns and tables back to Snowflake. This automation is available in DataH - Automatically Add DataHub Tags to Snowflake Tables and Columns - Automatically Remove DataHub Glossary Terms and Tags from Snowflake Tables and Columns when they are removed in DataHub +## Prerequisites + +### Permissions Required for Tag Management + +- `CREATE TAG`: Required to create new tags in Snowflake. +Ensure the user or role has this privilege on the specific schema or database where tags will be created. +- `APPLY TAG`: Required to assign tags to Snowflake objects such as tables, columns, or other database objects. +This permission must be granted at the database, schema, or object level depending on the scope. + + +### Permissions Required for Object Access + +- `USAGE` on the database and schema: Allows access to the database and schema to view and apply changes. +- `SELECT` on the objects (tables, views, etc.): Enables the automation to read metadata and verify existing tags. 
+ +### Example Permission Grant Statements + +To grant the necessary permissions for a specific role (DATAHUB_AUTOMATION_ROLE), you can use the following SQL commands: + +```sql +-- Tag management permissions +GRANT CREATE TAG ON SCHEMA your_database.your_schema TO ROLE DATAHUB_AUTOMATION_ROLE; +GRANT APPLY TAG ON SCHEMA your_database.your_schema TO ROLE DATAHUB_AUTOMATION_ROLE; + +-- Object access for metadata operations +GRANT USAGE ON DATABASE your_database TO ROLE DATAHUB_AUTOMATION_ROLE; +GRANT USAGE ON SCHEMA your_database.your_schema TO ROLE DATAHUB_AUTOMATION_ROLE; +GRANT SELECT ON ALL TABLES IN SCHEMA your_database.your_schema TO ROLE DATAHUB_AUTOMATION_ROLE; + +-- Future privileges for tagging +GRANT SELECT ON FUTURE TABLES IN SCHEMA your_database.your_schema TO ROLE DATAHUB_AUTOMATION_ROLE; +GRANT APPLY TAG ON FUTURE TABLES IN SCHEMA your_database.your_schema TO ROLE DATAHUB_AUTOMATION_ROLE; +``` + + ## Enabling Snowflake Tag Sync 1. **Navigate to Automations**: Click on 'Govern' > 'Automations' in the navigation bar. diff --git a/docs/managed-datahub/release-notes/v_0_3_7.md b/docs/managed-datahub/release-notes/v_0_3_7.md index af23b5ae1541b..94cbdd79dbf5e 100644 --- a/docs/managed-datahub/release-notes/v_0_3_7.md +++ b/docs/managed-datahub/release-notes/v_0_3_7.md @@ -7,7 +7,7 @@ Release Availability Date Recommended CLI/SDK --- -- `v0.14.1.11` with release notes at https://github.com/datahub/datahub/releases/tag/v0.14.1.11 +- `v0.14.1.12` with release notes at https://github.com/datahub/datahub/releases/tag/v0.14.1.12 If you are using an older CLI/SDK version, then please upgrade it. This applies for all CLI/SDK usages, if you are using it through your terminal, GitHub Actions, Airflow, in Python SDK somewhere, Java SDK, etc. This is a strong recommendation to upgrade, as we keep on pushing fixes in the CLI, and it helps us support you better. @@ -19,6 +19,26 @@ If you are using an older CLI/SDK version, then please upgrade it. This applies ## Release Changelog --- +### v0.3.7.4 + +- [#11935](https://github.com/datahub-project/datahub/pull/11935) - Added environment variable for enabling stricter URN validation rules `STRICT_URN_VALIDATION_ENABLED` [[1](https://datahubproject.io/docs/what/urn/#restrictions)]. +- [Automations] Filter out self-nodes in glossary term propagation +- [Remote Executor] Allow dashes in executor ids. +- [Search] Fix Nested Filter Counts in Primary Search +- [Search] Fix white screen of death on empty search result +- [Columns Tab] Support searching nested struct columns correctly in V2 UI. +- [Logo] Fix fit of custom logo for V2 UI nav bar. +- [Structured Properties] Better handling for special characters in structured properties +- [Lineage] Improvements to handling lineage cycles +- [Metadata Tests] Improve Reliability of Metadata Tests Action Application +- [Slack Integration] Minor improvement in authentication redirect to integrate with Slack +- [Columns Tab] Properly display nullable status in column sidebar (bug fix) +- [Columns Tab] Fix merging of sibling schemas between V2 and V1 field paths. +- [Documentation] Support group authors for institutional memory aspect + + +### v0.3.7 + - All changes in https://github.com/datahub-project/datahub/releases/tag/v0.14.1 - Note Breaking Changes: https://datahubproject.io/docs/how/updating-datahub/#0141 @@ -96,7 +116,7 @@ If you are using an older CLI/SDK version, then please upgrade it. 
This applies - Improved UX for setting up and managing SSO - Ingestion changes - - In addition to the improvements listed here: https://github.com/acryldata/datahub/releases/tag/v0.14.1.11 + - In addition to the improvements listed here: https://github.com/acryldata/datahub/releases/tag/v0.14.1.12 - PowerBI: Support for PowerBI Apps and cross-workspace lineage - Fivetran: Major improvements to configurability and improved reliability with large Fivetran setups - Snowflake & BigQuery: Improved handling of temporary tables and swap statements when generating lineage diff --git a/docs/what/urn.md b/docs/what/urn.md index 2f4dffb985653..c7fb0555cd992 100644 --- a/docs/what/urn.md +++ b/docs/what/urn.md @@ -35,11 +35,17 @@ urn:li:dataset:(urn:li:dataPlatform:hdfs,PageViewEvent,EI) ## Restrictions -There are a few restrictions when creating an urn: +There are a few restrictions when creating a URN: -1. Commas are reserved character in URN fields: `,` -2. Parentheses are reserved characters in URN fields: `(` or `)` -3. Colons are reserved characters in URN fields: `:` -4. Urn separator UTF-8 character `␟` +The following characters are not allowed anywhere in a URN: + +1. Parentheses are reserved characters in URN fields: `(` or `)` +2. The "unit separator" unicode character `␟` (U+241F) + +The following characters are not allowed within a URN tuple: + +1. Commas are reserved characters in URN tuples: `,` + +Example: `urn:li:dashboard:(looker,dashboards.thelook)` is a valid URN, but `urn:li:dashboard:(looker,dashboards.the,look)` is invalid. Please do not use these characters when creating or generating urns. One approach is to use URL encoding for the characters. diff --git a/metadata-ingestion/docs/sources/iceberg/iceberg.md b/metadata-ingestion/docs/sources/iceberg/iceberg.md index 7e40315a2e319..92aac5ffa6ce5 100644 --- a/metadata-ingestion/docs/sources/iceberg/iceberg.md +++ b/metadata-ingestion/docs/sources/iceberg/iceberg.md @@ -18,6 +18,8 @@ This ingestion source maps the following Source System Concepts to DataHub Conce ## Troubleshooting -### [Common Issue] +### Exceptions while increasing `processing_threads` -[Provide description of common issues with this integration and steps to resolve] +Each processing thread will open several files/sockets to download manifest files from blob storage. If you experience +exceptions when increasing the `processing_threads` configuration parameter, try increasing the limit of open +files (e.g., using `ulimit` on Linux). diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index a9915e1bd745d..c6d55fb5bcc56 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -14,8 +14,8 @@ ) base_requirements = { - # Typing extension should be >=3.10.0.2 ideally but we can't restrict due to a Airflow 2.1 dependency conflict. - "typing_extensions>=3.7.4.3", + # Our min version of typing_extensions is somewhat constrained by Airflow. + "typing_extensions>=3.10.0.2", # Actual dependencies. "typing-inspect", # pydantic 1.8.2 is incompatible with mypy 0.910. 
@@ -249,7 +249,8 @@ iceberg_common = { # Iceberg Python SDK - "pyiceberg>=0.4,<0.7", + # Kept at 0.4.0 due to higher versions requiring pydantic>2, as soon as we are fine with it, bump this dependency + "pyiceberg>=0.4.0", } mssql_common = { @@ -775,7 +776,7 @@ "trino = datahub.ingestion.source.sql.trino:TrinoSource", "starburst-trino-usage = datahub.ingestion.source.usage.starburst_trino_usage:TrinoUsageSource", "nifi = datahub.ingestion.source.nifi:NifiSource", - "powerbi = datahub.ingestion.source.powerbi:PowerBiDashboardSource", + "powerbi = datahub.ingestion.source.powerbi.powerbi:PowerBiDashboardSource", "powerbi-report-server = datahub.ingestion.source.powerbi_report_server:PowerBiReportServerDashboardSource", "iceberg = datahub.ingestion.source.iceberg.iceberg:IcebergSource", "vertica = datahub.ingestion.source.sql.vertica:VerticaSource", diff --git a/metadata-ingestion/src/datahub/ingestion/graph/client.py b/metadata-ingestion/src/datahub/ingestion/graph/client.py index 759aebcfd46b0..4aa937639e959 100644 --- a/metadata-ingestion/src/datahub/ingestion/graph/client.py +++ b/metadata-ingestion/src/datahub/ingestion/graph/client.py @@ -67,6 +67,7 @@ SystemMetadataClass, TelemetryClientIdClass, ) +from datahub.telemetry.telemetry import telemetry_instance from datahub.utilities.perf_timer import PerfTimer from datahub.utilities.str_enum import StrEnum from datahub.utilities.urns.urn import Urn, guess_entity_type @@ -1819,4 +1820,5 @@ def get_default_graph() -> DataHubGraph: graph_config = config_utils.load_client_config() graph = DataHubGraph(graph_config) graph.test_connection() + telemetry_instance.set_context(server=graph) return graph diff --git a/metadata-ingestion/src/datahub/ingestion/run/pipeline.py b/metadata-ingestion/src/datahub/ingestion/run/pipeline.py index 7c3a42c3e0893..667129ff83584 100644 --- a/metadata-ingestion/src/datahub/ingestion/run/pipeline.py +++ b/metadata-ingestion/src/datahub/ingestion/run/pipeline.py @@ -44,7 +44,8 @@ ) from datahub.ingestion.transformer.transform_registry import transform_registry from datahub.metadata.schema_classes import MetadataChangeProposalClass -from datahub.telemetry import stats, telemetry +from datahub.telemetry import stats +from datahub.telemetry.telemetry import telemetry_instance from datahub.utilities._custom_package_loader import model_version_name from datahub.utilities.global_warning_util import ( clear_global_warnings, @@ -273,8 +274,9 @@ def __init__( if self.graph is None and isinstance(self.sink, DatahubRestSink): with _add_init_error_context("setup default datahub client"): self.graph = self.sink.emitter.to_graph() + self.graph.test_connection() self.ctx.graph = self.graph - telemetry.telemetry_instance.update_capture_exception_context(server=self.graph) + telemetry_instance.set_context(server=self.graph) with set_graph_context(self.graph): with _add_init_error_context("configure reporters"): @@ -615,7 +617,7 @@ def log_ingestion_stats(self) -> None: sink_warnings = len(self.sink.get_report().warnings) global_warnings = len(get_global_warnings()) - telemetry.telemetry_instance.ping( + telemetry_instance.ping( "ingest_stats", { "source_type": self.source_type, @@ -637,7 +639,6 @@ def log_ingestion_stats(self) -> None: ), "has_pipeline_name": bool(self.config.pipeline_name), }, - self.ctx.graph, ) def _approx_all_vals(self, d: LossyList[Any]) -> int: diff --git a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py index 
4598ae388b827..499e7e1231d05 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py @@ -53,19 +53,7 @@ make_assertion_from_test, make_assertion_result_from_test, ) -from datahub.ingestion.source.sql.sql_types import ( - ATHENA_SQL_TYPES_MAP, - BIGQUERY_TYPES_MAP, - POSTGRES_TYPES_MAP, - SNOWFLAKE_TYPES_MAP, - SPARK_SQL_TYPES_MAP, - TRINO_SQL_TYPES_MAP, - VERTICA_SQL_TYPES_MAP, - resolve_athena_modified_type, - resolve_postgres_modified_type, - resolve_trino_modified_type, - resolve_vertica_modified_type, -) +from datahub.ingestion.source.sql.sql_types import resolve_sql_type from datahub.ingestion.source.state.stale_entity_removal_handler import ( StaleEntityRemovalHandler, StaleEntityRemovalSourceReport, @@ -89,17 +77,11 @@ from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import DatasetSnapshot from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent from datahub.metadata.com.linkedin.pegasus2avro.schema import ( - BooleanTypeClass, - DateTypeClass, MySqlDDL, NullTypeClass, - NumberTypeClass, - RecordType, SchemaField, SchemaFieldDataType, SchemaMetadata, - StringTypeClass, - TimeTypeClass, ) from datahub.metadata.schema_classes import ( DataPlatformInstanceClass, @@ -804,28 +786,6 @@ def make_mapping_upstream_lineage( ) -# See https://github.com/fishtown-analytics/dbt/blob/master/core/dbt/adapters/sql/impl.py -_field_type_mapping = { - "boolean": BooleanTypeClass, - "date": DateTypeClass, - "time": TimeTypeClass, - "numeric": NumberTypeClass, - "text": StringTypeClass, - "timestamp with time zone": DateTypeClass, - "timestamp without time zone": DateTypeClass, - "integer": NumberTypeClass, - "float8": NumberTypeClass, - "struct": RecordType, - **POSTGRES_TYPES_MAP, - **SNOWFLAKE_TYPES_MAP, - **BIGQUERY_TYPES_MAP, - **SPARK_SQL_TYPES_MAP, - **TRINO_SQL_TYPES_MAP, - **ATHENA_SQL_TYPES_MAP, - **VERTICA_SQL_TYPES_MAP, -} - - def get_column_type( report: DBTSourceReport, dataset_name: str, @@ -835,24 +795,10 @@ def get_column_type( """ Maps known DBT types to datahub types """ - TypeClass: Any = _field_type_mapping.get(column_type) if column_type else None - - if TypeClass is None and column_type: - # resolve a modified type - if dbt_adapter == "trino": - TypeClass = resolve_trino_modified_type(column_type) - elif dbt_adapter == "athena": - TypeClass = resolve_athena_modified_type(column_type) - elif dbt_adapter == "postgres" or dbt_adapter == "redshift": - # Redshift uses a variant of Postgres, so we can use the same logic. - TypeClass = resolve_postgres_modified_type(column_type) - elif dbt_adapter == "vertica": - TypeClass = resolve_vertica_modified_type(column_type) - elif dbt_adapter == "snowflake": - # Snowflake types are uppercase, so we check that. 
- TypeClass = _field_type_mapping.get(column_type.upper()) - - # if still not found, report the warning + + TypeClass = resolve_sql_type(column_type, dbt_adapter) + + # if still not found, report a warning if TypeClass is None: if column_type: report.info( @@ -861,9 +807,9 @@ def get_column_type( context=f"{dataset_name} - {column_type}", log=False, ) - TypeClass = NullTypeClass + TypeClass = NullTypeClass() - return SchemaFieldDataType(type=TypeClass()) + return SchemaFieldDataType(type=TypeClass) @platform_name("dbt") diff --git a/metadata-ingestion/src/datahub/ingestion/source/feast.py b/metadata-ingestion/src/datahub/ingestion/source/feast.py index e097fd1f221ea..6330fe0291660 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/feast.py +++ b/metadata-ingestion/src/datahub/ingestion/source/feast.py @@ -42,10 +42,14 @@ from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent from datahub.metadata.schema_classes import ( BrowsePathsClass, + GlobalTagsClass, MLFeaturePropertiesClass, MLFeatureTablePropertiesClass, MLPrimaryKeyPropertiesClass, + OwnerClass, + OwnershipClass, StatusClass, + TagAssociationClass, ) # FIXME: ValueType module cannot be used as a type @@ -91,6 +95,24 @@ class FeastRepositorySourceConfig(ConfigModel): environment: str = Field( default=DEFAULT_ENV, description="Environment to use when constructing URNs" ) + # owner_mappings example: + # This must be added to the recipe in order to extract owners, otherwise NO owners will be extracted + # owner_mappings: + # - feast_owner_name: "" + # datahub_owner_urn: "urn:li:corpGroup:" + # datahub_ownership_type: "BUSINESS_OWNER" + owner_mappings: Optional[List[Dict[str, str]]] = Field( + default=None, description="Mapping of owner names to owner types" + ) + enable_owner_extraction: bool = Field( + default=False, + description="If this is disabled, then we NEVER try to map owners. " + "If this is enabled, then owner_mappings is REQUIRED to extract ownership.", + ) + enable_tag_extraction: bool = Field( + default=False, + description="If this is disabled, then we NEVER try to extract tags.", + ) @platform_name("Feast") @@ -215,10 +237,15 @@ def _get_entity_workunit( """ feature_view_name = f"{self.feature_store.project}.{feature_view.name}" + aspects = ( + [StatusClass(removed=False)] + + self._get_tags(entity) + + self._get_owners(entity) + ) entity_snapshot = MLPrimaryKeySnapshot( urn=builder.make_ml_primary_key_urn(feature_view_name, entity.name), - aspects=[StatusClass(removed=False)], + aspects=aspects, ) entity_snapshot.aspects.append( @@ -243,10 +270,11 @@ def _get_feature_workunit( Generate an MLFeature work unit for a Feast feature. 
""" feature_view_name = f"{self.feature_store.project}.{feature_view.name}" + aspects = [StatusClass(removed=False)] + self._get_tags(field) feature_snapshot = MLFeatureSnapshot( urn=builder.make_ml_feature_urn(feature_view_name, field.name), - aspects=[StatusClass(removed=False)], + aspects=aspects, ) feature_sources = [] @@ -295,13 +323,18 @@ def _get_feature_view_workunit(self, feature_view: FeatureView) -> MetadataWorkU """ feature_view_name = f"{self.feature_store.project}.{feature_view.name}" + aspects = ( + [ + BrowsePathsClass(paths=[f"/feast/{self.feature_store.project}"]), + StatusClass(removed=False), + ] + + self._get_tags(feature_view) + + self._get_owners(feature_view) + ) feature_view_snapshot = MLFeatureTableSnapshot( urn=builder.make_ml_feature_table_urn("feast", feature_view_name), - aspects=[ - BrowsePathsClass(paths=[f"/feast/{self.feature_store.project}"]), - StatusClass(removed=False), - ], + aspects=aspects, ) feature_view_snapshot.aspects.append( @@ -360,6 +393,64 @@ def _get_on_demand_feature_view_workunit( return MetadataWorkUnit(id=on_demand_feature_view_name, mce=mce) + # If a tag is specified in a Feast object, then the tag will be ingested into Datahub if enable_tag_extraction is + # True, otherwise NO tags will be ingested + def _get_tags(self, obj: Union[Entity, FeatureView, FeastField]) -> list: + """ + Extracts tags from the given object and returns a list of aspects. + """ + aspects: List[Union[GlobalTagsClass]] = [] + + # Extract tags + if self.source_config.enable_tag_extraction: + if obj.tags.get("name"): + tag_name: str = obj.tags["name"] + tag_association = TagAssociationClass( + tag=builder.make_tag_urn(tag_name) + ) + global_tags_aspect = GlobalTagsClass(tags=[tag_association]) + aspects.append(global_tags_aspect) + + return aspects + + # If an owner is specified in a Feast object, it will only be ingested into Datahub if owner_mappings is specified + # and enable_owner_extraction is True in FeastRepositorySourceConfig, otherwise NO owners will be ingested + def _get_owners(self, obj: Union[Entity, FeatureView, FeastField]) -> list: + """ + Extracts owners from the given object and returns a list of aspects. + """ + aspects: List[Union[OwnershipClass]] = [] + + # Extract owner + if self.source_config.enable_owner_extraction: + owner = getattr(obj, "owner", None) + if owner: + # Create owner association, skipping if None + owner_association = self._create_owner_association(owner) + if owner_association: # Only add valid owner associations + owners_aspect = OwnershipClass(owners=[owner_association]) + aspects.append(owners_aspect) + + return aspects + + def _create_owner_association(self, owner: str) -> Optional[OwnerClass]: + """ + Create an OwnerClass instance for the given owner using the owner mappings. 
+ """ + if self.source_config.owner_mappings is not None: + for mapping in self.source_config.owner_mappings: + if mapping["feast_owner_name"] == owner: + ownership_type_class: str = mapping.get( + "datahub_ownership_type", "TECHNICAL_OWNER" + ) + datahub_owner_urn = mapping.get("datahub_owner_urn") + if datahub_owner_urn: + return OwnerClass( + owner=datahub_owner_urn, + type=ownership_type_class, + ) + return None + @classmethod def create(cls, config_dict, ctx): config = FeastRepositorySourceConfig.parse_obj(config_dict) diff --git a/metadata-ingestion/src/datahub/ingestion/source/gc/datahub_gc.py b/metadata-ingestion/src/datahub/ingestion/source/gc/datahub_gc.py index c4b4186f45fc3..52807ca2a3f02 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/gc/datahub_gc.py +++ b/metadata-ingestion/src/datahub/ingestion/source/gc/datahub_gc.py @@ -144,15 +144,32 @@ def get_workunits_internal( self, ) -> Iterable[MetadataWorkUnit]: if self.config.cleanup_expired_tokens: - self.revoke_expired_tokens() + try: + self.revoke_expired_tokens() + except Exception as e: + self.report.failure("While trying to cleanup expired token ", exc=e) if self.config.truncate_indices: - self.truncate_indices() + try: + self.truncate_indices() + except Exception as e: + self.report.failure("While trying to truncate indices ", exc=e) if self.dataprocess_cleanup: - yield from self.dataprocess_cleanup.get_workunits_internal() + try: + yield from self.dataprocess_cleanup.get_workunits_internal() + except Exception as e: + self.report.failure("While trying to cleanup data process ", exc=e) if self.soft_deleted_entities_cleanup: - self.soft_deleted_entities_cleanup.cleanup_soft_deleted_entities() + try: + self.soft_deleted_entities_cleanup.cleanup_soft_deleted_entities() + except Exception as e: + self.report.failure( + "While trying to cleanup soft deleted entities ", exc=e + ) if self.execution_request_cleanup: - self.execution_request_cleanup.run() + try: + self.execution_request_cleanup.run() + except Exception as e: + self.report.failure("While trying to cleanup execution request ", exc=e) yield from [] def truncate_indices(self) -> None: diff --git a/metadata-ingestion/src/datahub/ingestion/source/gc/dataprocess_cleanup.py b/metadata-ingestion/src/datahub/ingestion/source/gc/dataprocess_cleanup.py index 130f2c9c2e12f..0f35e1a67fede 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/gc/dataprocess_cleanup.py +++ b/metadata-ingestion/src/datahub/ingestion/source/gc/dataprocess_cleanup.py @@ -404,7 +404,9 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: try: self.delete_dpi_from_datajobs(datajob_entity) except Exception as e: - logger.error(f"While trying to delete {datajob_entity} got {e}") + self.report.failure( + f"While trying to delete {datajob_entity} ", exc=e + ) if ( datajob_entity.total_runs == 0 and self.config.delete_empty_data_jobs diff --git a/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg.py b/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg.py index 258a4b9ad6daf..5931873f54236 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg.py +++ b/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg.py @@ -9,6 +9,7 @@ NoSuchIcebergTableError, NoSuchNamespaceError, NoSuchPropertyException, + NoSuchTableError, ) from pyiceberg.schema import Schema, SchemaVisitorPerPrimitiveType, visit from pyiceberg.table import Table @@ -104,7 +105,7 @@ @capability(SourceCapability.DESCRIPTIONS, "Enabled by default.") @capability( 
SourceCapability.OWNERSHIP, - "Optionally enabled via configuration by specifying which Iceberg table property holds user or group ownership.", + "Automatically ingests ownership information from table properties based on `user_ownership_property` and `group_ownership_property`", ) @capability(SourceCapability.DELETION_DETECTION, "Enabled via stateful ingestion") class IcebergSource(StatefulIngestionSourceBase): @@ -192,9 +193,7 @@ def _process_dataset(dataset_path: Identifier) -> Iterable[MetadataWorkUnit]: table = thread_local.local_catalog.load_table(dataset_path) time_taken = timer.elapsed_seconds() self.report.report_table_load_time(time_taken) - LOGGER.debug( - f"Loaded table: {table.identifier}, time taken: {time_taken}" - ) + LOGGER.debug(f"Loaded table: {table.name()}, time taken: {time_taken}") yield from self._create_iceberg_workunit(dataset_name, table) except NoSuchPropertyException as e: self.report.report_warning( @@ -206,12 +205,20 @@ def _process_dataset(dataset_path: Identifier) -> Iterable[MetadataWorkUnit]: ) except NoSuchIcebergTableError as e: self.report.report_warning( - "no-iceberg-table", + "not-an-iceberg-table", f"Failed to create workunit for {dataset_name}. {e}", ) LOGGER.warning( f"NoSuchIcebergTableError while processing table {dataset_path}, skipping it.", ) + except NoSuchTableError as e: + self.report.report_warning( + "no-such-table", + f"Failed to create workunit for {dataset_name}. {e}", + ) + LOGGER.warning( + f"NoSuchTableError while processing table {dataset_path}, skipping it.", + ) except Exception as e: self.report.report_failure("general", f"Failed to create workunit: {e}") LOGGER.exception( diff --git a/metadata-ingestion/src/datahub/ingestion/source/kafka/kafka.py b/metadata-ingestion/src/datahub/ingestion/source/kafka/kafka.py index e57dc853a83c6..709ba431f0f87 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/kafka/kafka.py +++ b/metadata-ingestion/src/datahub/ingestion/source/kafka/kafka.py @@ -148,7 +148,7 @@ def get_kafka_consumer( ) -> confluent_kafka.Consumer: consumer = confluent_kafka.Consumer( { - "group.id": "test", + "group.id": "datahub-kafka-ingestion", "bootstrap.servers": connection.bootstrap, **connection.consumer_config, } @@ -164,6 +164,25 @@ def get_kafka_consumer( return consumer +def get_kafka_admin_client( + connection: KafkaConsumerConnectionConfig, +) -> AdminClient: + client = AdminClient( + { + "group.id": "datahub-kafka-ingestion", + "bootstrap.servers": connection.bootstrap, + **connection.consumer_config, + } + ) + if CallableConsumerConfig.is_callable_config(connection.consumer_config): + # As per documentation, we need to explicitly call the poll method to make sure OAuth callback gets executed + # https://docs.confluent.io/platform/current/clients/confluent-kafka-python/html/index.html#kafka-client-configuration + logger.debug("Initiating polling for kafka admin client") + client.poll(timeout=30) + logger.debug("Initiated polling for kafka admin client") + return client + + @dataclass class KafkaSourceReport(StaleEntityRemovalSourceReport): topics_scanned: int = 0 @@ -278,13 +297,7 @@ def __init__(self, config: KafkaSourceConfig, ctx: PipelineContext): def init_kafka_admin_client(self) -> None: try: # TODO: Do we require separate config than existing consumer_config ? 
- self.admin_client = AdminClient( - { - "group.id": "test", - "bootstrap.servers": self.source_config.connection.bootstrap, - **self.source_config.connection.consumer_config, - } - ) + self.admin_client = get_kafka_admin_client(self.source_config.connection) except Exception as e: logger.debug(e, exc_info=e) self.report.report_warning( diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/__init__.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/__init__.py index 1068f335e8f8e..e69de29bb2d1d 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/__init__.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/__init__.py @@ -1 +0,0 @@ -from datahub.ingestion.source.powerbi.powerbi import PowerBiDashboardSource diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py index 91fa2e96be2cc..f7458c4eb4d5b 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py @@ -173,7 +173,7 @@ class SupportedDataPlatform(Enum): datahub_data_platform_name="redshift", ) - DATABRICK_SQL = DataPlatformPair( + DATABRICKS_SQL = DataPlatformPair( powerbi_data_platform_name="Databricks", datahub_data_platform_name="databricks" ) @@ -313,8 +313,8 @@ class PowerBiDashboardSourceConfig( " Note: This field works in conjunction with 'workspace_type_filter' and both must be considered when filtering workspaces.", ) - # Dataset type mapping PowerBI support many type of data-sources. Here user need to define what type of PowerBI - # DataSource need to be mapped to corresponding DataHub Platform DataSource. For example PowerBI `Snowflake` is + # Dataset type mapping PowerBI support many type of data-sources. Here user needs to define what type of PowerBI + # DataSource needs to be mapped to corresponding DataHub Platform DataSource. For example, PowerBI `Snowflake` is # mapped to DataHub `snowflake` PowerBI `PostgreSQL` is mapped to DataHub `postgres` and so on. dataset_type_mapping: Union[ Dict[str, str], Dict[str, PlatformDetail] diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/data_classes.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/data_classes.py index bb0c0c2f79bbd..f1691b5df68a9 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/data_classes.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/data_classes.py @@ -1,10 +1,14 @@ import os from abc import ABC from dataclasses import dataclass -from typing import Any, Dict, Optional +from enum import Enum +from typing import Any, Dict, List, Optional from lark import Tree +from datahub.ingestion.source.powerbi.config import DataPlatformPair +from datahub.sql_parsing.sqlglot_lineage import ColumnLineageInfo + TRACE_POWERBI_MQUERY_PARSER = os.getenv("DATAHUB_TRACE_POWERBI_MQUERY_PARSER", False) @@ -30,7 +34,7 @@ class IdentifierAccessor(AbstractIdentifierAccessor): "[Schema="public",Item="order_date"]" is "items" in ItemSelector. Data of items varies as per DataSource - "public_order_date" is in "next" of ItemSelector. The "next" will be None if this identifier is leaf i.e. table + "public_order_date" is in "next" of ItemSelector. 
The "next" will be None if this identifier is leaf i.e., table """ @@ -53,3 +57,31 @@ class ReferencedTable: database: str schema: str table: str + + +@dataclass +class DataPlatformTable: + data_platform_pair: DataPlatformPair + urn: str + + +@dataclass +class Lineage: + upstreams: List[DataPlatformTable] + column_lineage: List[ColumnLineageInfo] + + @staticmethod + def empty() -> "Lineage": + return Lineage(upstreams=[], column_lineage=[]) + + +class FunctionName(Enum): + NATIVE_QUERY = "Value.NativeQuery" + POSTGRESQL_DATA_ACCESS = "PostgreSQL.Database" + ORACLE_DATA_ACCESS = "Oracle.Database" + SNOWFLAKE_DATA_ACCESS = "Snowflake.Databases" + MSSQL_DATA_ACCESS = "Sql.Database" + DATABRICK_DATA_ACCESS = "Databricks.Catalogs" + GOOGLE_BIGQUERY_DATA_ACCESS = "GoogleBigQuery.Database" + AMAZON_REDSHIFT_DATA_ACCESS = "AmazonRedshift.Database" + DATABRICK_MULTI_CLOUD_DATA_ACCESS = "DatabricksMultiCloud.Catalogs" diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/parser.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/parser.py index 97698a3d0d56c..2a5de7494920b 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/parser.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/parser.py @@ -7,6 +7,7 @@ import lark from lark import Lark, Tree +import datahub.ingestion.source.powerbi.m_query.data_classes from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.source.powerbi.config import ( PowerBiDashboardSourceConfig, @@ -65,7 +66,7 @@ def get_upstream_tables( ctx: PipelineContext, config: PowerBiDashboardSourceConfig, parameters: Dict[str, str] = {}, -) -> List[resolver.Lineage]: +) -> List[datahub.ingestion.source.powerbi.m_query.data_classes.Lineage]: if table.expression is None: logger.debug(f"There is no M-Query expression in table {table.full_name}") return [] @@ -127,12 +128,14 @@ def get_upstream_tables( reporter.m_query_parse_successes += 1 try: - lineage: List[resolver.Lineage] = resolver.MQueryResolver( + lineage: List[ + datahub.ingestion.source.powerbi.m_query.data_classes.Lineage + ] = resolver.MQueryResolver( table=table, parse_tree=parse_tree, reporter=reporter, parameters=parameters, - ).resolve_to_data_platform_table_list( + ).resolve_to_lineage( ctx=ctx, config=config, platform_instance_resolver=platform_instance_resolver, diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/pattern_handler.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/pattern_handler.py new file mode 100644 index 0000000000000..13d97a7029029 --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/pattern_handler.py @@ -0,0 +1,920 @@ +import logging +from abc import ABC, abstractmethod +from enum import Enum +from typing import Dict, List, Optional, Tuple, Type, Union, cast + +from lark import Tree + +from datahub.emitter import mce_builder as builder +from datahub.ingestion.api.common import PipelineContext +from datahub.ingestion.source.powerbi.config import ( + Constant, + DataBricksPlatformDetail, + DataPlatformPair, + PlatformDetail, + PowerBiDashboardSourceConfig, + PowerBiDashboardSourceReport, + PowerBIPlatformDetail, + SupportedDataPlatform, +) +from datahub.ingestion.source.powerbi.dataplatform_instance_resolver import ( + AbstractDataPlatformInstanceResolver, +) +from datahub.ingestion.source.powerbi.m_query import native_sql_parser, tree_function +from datahub.ingestion.source.powerbi.m_query.data_classes import ( + 
AbstractIdentifierAccessor, + DataAccessFunctionDetail, + DataPlatformTable, + FunctionName, + IdentifierAccessor, + Lineage, + ReferencedTable, +) +from datahub.ingestion.source.powerbi.rest_api_wrapper.data_classes import Table +from datahub.sql_parsing.sqlglot_lineage import SqlParsingResult + +logger = logging.getLogger(__name__) + + +def get_next_item(items: List[str], item: str) -> Optional[str]: + if item in items: + try: + index = items.index(item) + return items[index + 1] + except IndexError: + logger.debug(f'item:"{item}", not found in item-list: {items}') + return None + + +def urn_to_lowercase(value: str, flag: bool) -> str: + if flag is True: + return value.lower() + + return value + + +def make_urn( + config: PowerBiDashboardSourceConfig, + platform_instance_resolver: AbstractDataPlatformInstanceResolver, + data_platform_pair: DataPlatformPair, + server: str, + qualified_table_name: str, +) -> str: + platform_detail: PlatformDetail = platform_instance_resolver.get_platform_instance( + PowerBIPlatformDetail( + data_platform_pair=data_platform_pair, + data_platform_server=server, + ) + ) + + return builder.make_dataset_urn_with_platform_instance( + platform=data_platform_pair.datahub_data_platform_name, + platform_instance=platform_detail.platform_instance, + env=platform_detail.env, + name=urn_to_lowercase( + qualified_table_name, config.convert_lineage_urns_to_lowercase + ), + ) + + +class AbstractLineage(ABC): + """ + Base class to share common functionalities among different dataplatform for M-Query parsing. + + To create qualified table name we need to parse M-Query data-access-functions(https://learn.microsoft.com/en-us/powerquery-m/accessing-data-functions) and + the data-access-functions has some define pattern to access database-name, schema-name and table-name, for example, see below M-Query. + + let + Source = Sql.Database("localhost", "library"), + dbo_book_issue = Source{[Schema="dbo",Item="book_issue"]}[Data] + in + dbo_book_issue + + It is MSSQL M-Query and Sql.Database is the data-access-function to access MSSQL. If this function is available in M-Query then database name is available in the second argument of the first statement and schema-name and table-name is available in the second statement. the second statement can be repeated to access different tables from MSSQL. + + DefaultTwoStepDataAccessSources extends the AbstractDataPlatformTableCreator and provides the common functionalities for data-platform which has above type of M-Query pattern + + data-access-function varies as per data-platform for example for MySQL.Database for MySQL, PostgreSQL.Database for Postgres and Oracle.Database for Oracle and number of statement to + find out database-name , schema-name and table-name also varies as per dataplatform. + + Value.NativeQuery is one of the functions which is used to execute a native query inside M-Query, for example see below M-Query + + let + Source = Value.NativeQuery(AmazonRedshift.Database("redshift-url","dev"), "select * from dev.public.category", null, [EnableFolding=true]) + in + Source + + In this M-Query database-name is available in first argument and rest of the detail i.e database & schema is available in native query. + + NativeQueryDataPlatformTableCreator extends AbstractDataPlatformTableCreator to support Redshift and Snowflake native query parsing. 
+ + """ + + ctx: PipelineContext + table: Table + config: PowerBiDashboardSourceConfig + reporter: PowerBiDashboardSourceReport + platform_instance_resolver: AbstractDataPlatformInstanceResolver + + def __init__( + self, + ctx: PipelineContext, + table: Table, + config: PowerBiDashboardSourceConfig, + reporter: PowerBiDashboardSourceReport, + platform_instance_resolver: AbstractDataPlatformInstanceResolver, + ) -> None: + super().__init__() + self.ctx = ctx + self.table = table + self.config = config + self.reporter = reporter + self.platform_instance_resolver = platform_instance_resolver + + @abstractmethod + def create_lineage( + self, data_access_func_detail: DataAccessFunctionDetail + ) -> Lineage: + pass + + @abstractmethod + def get_platform_pair(self) -> DataPlatformPair: + pass + + @staticmethod + def get_db_detail_from_argument( + arg_list: Tree, + ) -> Tuple[Optional[str], Optional[str]]: + arguments: List[str] = tree_function.strip_char_from_list( + values=tree_function.remove_whitespaces_from_list( + tree_function.token_values(arg_list) + ), + ) + + if len(arguments) < 2: + logger.debug(f"Expected minimum 2 arguments, but got {len(arguments)}") + return None, None + + return arguments[0], arguments[1] + + @staticmethod + def create_reference_table( + arg_list: Tree, + table_detail: Dict[str, str], + ) -> Optional[ReferencedTable]: + arguments: List[str] = tree_function.strip_char_from_list( + values=tree_function.remove_whitespaces_from_list( + tree_function.token_values(arg_list) + ), + ) + + logger.debug(f"Processing arguments {arguments}") + + if ( + len(arguments) + >= 4 # [0] is warehouse FQDN. + # [1] is endpoint, we are not using it. + # [2] is "Catalog" key + # [3] is catalog's value + ): + return ReferencedTable( + warehouse=arguments[0], + catalog=arguments[3], + # As per my observation, database and catalog names are same in M-Query + database=table_detail["Database"] + if table_detail.get("Database") + else arguments[3], + schema=table_detail["Schema"], + table=table_detail.get("Table") or table_detail["View"], + ) + elif len(arguments) == 2: + return ReferencedTable( + warehouse=arguments[0], + database=table_detail["Database"], + schema=table_detail["Schema"], + table=table_detail.get("Table") or table_detail["View"], + catalog=None, + ) + + return None + + def parse_custom_sql( + self, query: str, server: str, database: Optional[str], schema: Optional[str] + ) -> Lineage: + dataplatform_tables: List[DataPlatformTable] = [] + + platform_detail: PlatformDetail = ( + self.platform_instance_resolver.get_platform_instance( + PowerBIPlatformDetail( + data_platform_pair=self.get_platform_pair(), + data_platform_server=server, + ) + ) + ) + + query = native_sql_parser.remove_drop_statement( + native_sql_parser.remove_special_characters(query) + ) + + parsed_result: Optional[ + "SqlParsingResult" + ] = native_sql_parser.parse_custom_sql( + ctx=self.ctx, + query=query, + platform=self.get_platform_pair().datahub_data_platform_name, + platform_instance=platform_detail.platform_instance, + env=platform_detail.env, + database=database, + schema=schema, + ) + + if parsed_result is None: + self.reporter.info( + title=Constant.SQL_PARSING_FAILURE, + message="Fail to parse native sql present in PowerBI M-Query", + context=f"table-name={self.table.full_name}, sql={query}", + ) + return Lineage.empty() + + if parsed_result.debug_info and parsed_result.debug_info.table_error: + self.reporter.warning( + title=Constant.SQL_PARSING_FAILURE, + message="Fail to parse native sql present 
in PowerBI M-Query", + context=f"table-name={self.table.full_name}, error={parsed_result.debug_info.table_error},sql={query}", + ) + return Lineage.empty() + + for urn in parsed_result.in_tables: + dataplatform_tables.append( + DataPlatformTable( + data_platform_pair=self.get_platform_pair(), + urn=urn, + ) + ) + + logger.debug(f"Native Query parsed result={parsed_result}") + logger.debug(f"Generated dataplatform_tables={dataplatform_tables}") + + return Lineage( + upstreams=dataplatform_tables, + column_lineage=( + parsed_result.column_lineage + if parsed_result.column_lineage is not None + else [] + ), + ) + + +class AmazonRedshiftLineage(AbstractLineage): + def get_platform_pair(self) -> DataPlatformPair: + return SupportedDataPlatform.AMAZON_REDSHIFT.value + + def create_lineage( + self, data_access_func_detail: DataAccessFunctionDetail + ) -> Lineage: + logger.debug( + f"Processing AmazonRedshift data-access function detail {data_access_func_detail}" + ) + + server, db_name = self.get_db_detail_from_argument( + data_access_func_detail.arg_list + ) + if db_name is None or server is None: + return Lineage.empty() # Return an empty list + + schema_name: str = cast( + IdentifierAccessor, data_access_func_detail.identifier_accessor + ).items["Name"] + + table_name: str = cast( + IdentifierAccessor, + cast(IdentifierAccessor, data_access_func_detail.identifier_accessor).next, + ).items["Name"] + + qualified_table_name: str = f"{db_name}.{schema_name}.{table_name}" + + urn = make_urn( + config=self.config, + platform_instance_resolver=self.platform_instance_resolver, + data_platform_pair=self.get_platform_pair(), + server=server, + qualified_table_name=qualified_table_name, + ) + + return Lineage( + upstreams=[ + DataPlatformTable( + data_platform_pair=self.get_platform_pair(), + urn=urn, + ) + ], + column_lineage=[], + ) + + +class OracleLineage(AbstractLineage): + def get_platform_pair(self) -> DataPlatformPair: + return SupportedDataPlatform.ORACLE.value + + @staticmethod + def _get_server_and_db_name(value: str) -> Tuple[Optional[str], Optional[str]]: + error_message: str = ( + f"The target argument ({value}) should in the format of :/[" + ".]" + ) + splitter_result: List[str] = value.split("/") + if len(splitter_result) != 2: + logger.debug(error_message) + return None, None + + db_name = splitter_result[1].split(".")[0] + + return tree_function.strip_char_from_list([splitter_result[0]])[0], db_name + + def create_lineage( + self, data_access_func_detail: DataAccessFunctionDetail + ) -> Lineage: + logger.debug( + f"Processing Oracle data-access function detail {data_access_func_detail}" + ) + + arguments: List[str] = tree_function.remove_whitespaces_from_list( + tree_function.token_values(data_access_func_detail.arg_list) + ) + + server, db_name = self._get_server_and_db_name(arguments[0]) + + if db_name is None or server is None: + return Lineage.empty() + + schema_name: str = cast( + IdentifierAccessor, data_access_func_detail.identifier_accessor + ).items["Schema"] + + table_name: str = cast( + IdentifierAccessor, + cast(IdentifierAccessor, data_access_func_detail.identifier_accessor).next, + ).items["Name"] + + qualified_table_name: str = f"{db_name}.{schema_name}.{table_name}" + + urn = make_urn( + config=self.config, + platform_instance_resolver=self.platform_instance_resolver, + data_platform_pair=self.get_platform_pair(), + server=server, + qualified_table_name=qualified_table_name, + ) + + return Lineage( + upstreams=[ + DataPlatformTable( + 
data_platform_pair=self.get_platform_pair(), + urn=urn, + ) + ], + column_lineage=[], + ) + + +class DatabricksLineage(AbstractLineage): + def form_qualified_table_name( + self, + table_reference: ReferencedTable, + data_platform_pair: DataPlatformPair, + ) -> str: + platform_detail: PlatformDetail = ( + self.platform_instance_resolver.get_platform_instance( + PowerBIPlatformDetail( + data_platform_pair=data_platform_pair, + data_platform_server=table_reference.warehouse, + ) + ) + ) + + metastore: Optional[str] = None + + qualified_table_name: str = f"{table_reference.database}.{table_reference.schema}.{table_reference.table}" + + if isinstance(platform_detail, DataBricksPlatformDetail): + metastore = platform_detail.metastore + + if metastore is not None: + return f"{metastore}.{qualified_table_name}" + + return qualified_table_name + + def create_lineage( + self, data_access_func_detail: DataAccessFunctionDetail + ) -> Lineage: + logger.debug( + f"Processing Databrick data-access function detail {data_access_func_detail}" + ) + table_detail: Dict[str, str] = {} + temp_accessor: Optional[ + Union[IdentifierAccessor, AbstractIdentifierAccessor] + ] = data_access_func_detail.identifier_accessor + + while temp_accessor: + if isinstance(temp_accessor, IdentifierAccessor): + # Condition to handle databricks M-query pattern where table, schema and database all are present in + # the same invoke statement + if all( + element in temp_accessor.items + for element in ["Item", "Schema", "Catalog"] + ): + table_detail["Schema"] = temp_accessor.items["Schema"] + table_detail["Table"] = temp_accessor.items["Item"] + else: + table_detail[temp_accessor.items["Kind"]] = temp_accessor.items[ + "Name" + ] + + if temp_accessor.next is not None: + temp_accessor = temp_accessor.next + else: + break + else: + logger.debug( + "expecting instance to be IdentifierAccessor, please check if parsing is done properly" + ) + return Lineage.empty() + + table_reference = self.create_reference_table( + arg_list=data_access_func_detail.arg_list, + table_detail=table_detail, + ) + + if table_reference: + qualified_table_name: str = self.form_qualified_table_name( + table_reference=table_reference, + data_platform_pair=self.get_platform_pair(), + ) + + urn = make_urn( + config=self.config, + platform_instance_resolver=self.platform_instance_resolver, + data_platform_pair=self.get_platform_pair(), + server=table_reference.warehouse, + qualified_table_name=qualified_table_name, + ) + + return Lineage( + upstreams=[ + DataPlatformTable( + data_platform_pair=self.get_platform_pair(), + urn=urn, + ) + ], + column_lineage=[], + ) + + return Lineage.empty() + + def get_platform_pair(self) -> DataPlatformPair: + return SupportedDataPlatform.DATABRICKS_SQL.value + + +class TwoStepDataAccessPattern(AbstractLineage, ABC): + """ + These are the DataSource for which PowerBI Desktop generates default M-Query of the following pattern + let + Source = Sql.Database("localhost", "library"), + dbo_book_issue = Source{[Schema="dbo",Item="book_issue"]}[Data] + in + dbo_book_issue + """ + + def two_level_access_pattern( + self, data_access_func_detail: DataAccessFunctionDetail + ) -> Lineage: + logger.debug( + f"Processing {self.get_platform_pair().powerbi_data_platform_name} data-access function detail {data_access_func_detail}" + ) + + server, db_name = self.get_db_detail_from_argument( + data_access_func_detail.arg_list + ) + if server is None or db_name is None: + return Lineage.empty() # Return an empty list + + schema_name: str = cast( + 
IdentifierAccessor, data_access_func_detail.identifier_accessor + ).items["Schema"] + + table_name: str = cast( + IdentifierAccessor, data_access_func_detail.identifier_accessor + ).items["Item"] + + qualified_table_name: str = f"{db_name}.{schema_name}.{table_name}" + + logger.debug( + f"Platform({self.get_platform_pair().datahub_data_platform_name}) qualified_table_name= {qualified_table_name}" + ) + + urn = make_urn( + config=self.config, + platform_instance_resolver=self.platform_instance_resolver, + data_platform_pair=self.get_platform_pair(), + server=server, + qualified_table_name=qualified_table_name, + ) + return Lineage( + upstreams=[ + DataPlatformTable( + data_platform_pair=self.get_platform_pair(), + urn=urn, + ) + ], + column_lineage=[], + ) + + +class PostgresLineage(TwoStepDataAccessPattern): + def create_lineage( + self, data_access_func_detail: DataAccessFunctionDetail + ) -> Lineage: + return self.two_level_access_pattern(data_access_func_detail) + + def get_platform_pair(self) -> DataPlatformPair: + return SupportedDataPlatform.POSTGRES_SQL.value + + +class MSSqlLineage(TwoStepDataAccessPattern): + # https://learn.microsoft.com/en-us/sql/relational-databases/security/authentication-access/ownership-and-user-schema-separation?view=sql-server-ver16 + DEFAULT_SCHEMA = "dbo" # Default schema name in MS-SQL is dbo + + def get_platform_pair(self) -> DataPlatformPair: + return SupportedDataPlatform.MS_SQL.value + + def create_urn_using_old_parser( + self, query: str, db_name: str, server: str + ) -> List[DataPlatformTable]: + dataplatform_tables: List[DataPlatformTable] = [] + + tables: List[str] = native_sql_parser.get_tables(query) + + for parsed_table in tables: + # components: List[str] = [v.strip("[]") for v in parsed_table.split(".")] + components = [v.strip("[]") for v in parsed_table.split(".")] + if len(components) == 3: + database, schema, table = components + elif len(components) == 2: + schema, table = components + database = db_name + elif len(components) == 1: + (table,) = components + database = db_name + schema = MSSqlLineage.DEFAULT_SCHEMA + else: + self.reporter.warning( + title="Invalid table format", + message="The advanced SQL lineage feature (enable_advance_lineage_sql_construct) is disabled. Please either enable this feature or ensure the table is referenced as .. 
in the SQL.", + context=f"table-name={self.table.full_name}", + ) + continue + + qualified_table_name = f"{database}.{schema}.{table}" + urn = make_urn( + config=self.config, + platform_instance_resolver=self.platform_instance_resolver, + data_platform_pair=self.get_platform_pair(), + server=server, + qualified_table_name=qualified_table_name, + ) + dataplatform_tables.append( + DataPlatformTable( + data_platform_pair=self.get_platform_pair(), + urn=urn, + ) + ) + + logger.debug(f"Generated upstream tables = {dataplatform_tables}") + + return dataplatform_tables + + def create_lineage( + self, data_access_func_detail: DataAccessFunctionDetail + ) -> Lineage: + arguments: List[str] = tree_function.strip_char_from_list( + values=tree_function.remove_whitespaces_from_list( + tree_function.token_values(data_access_func_detail.arg_list) + ), + ) + + server, database = self.get_db_detail_from_argument( + data_access_func_detail.arg_list + ) + if server is None or database is None: + return Lineage.empty() # Return an empty list + + assert server + assert database # to silent the lint + + query: Optional[str] = get_next_item(arguments, "Query") + if query: + if self.config.enable_advance_lineage_sql_construct is False: + # Use previous parser to generate URN to keep backward compatibility + return Lineage( + upstreams=self.create_urn_using_old_parser( + query=query, + db_name=database, + server=server, + ), + column_lineage=[], + ) + + return self.parse_custom_sql( + query=query, + database=database, + server=server, + schema=MSSqlLineage.DEFAULT_SCHEMA, + ) + + # It is a regular case of MS-SQL + logger.debug("Handling with regular case") + return self.two_level_access_pattern(data_access_func_detail) + + +class ThreeStepDataAccessPattern(AbstractLineage, ABC): + def get_datasource_server( + self, arguments: List[str], data_access_func_detail: DataAccessFunctionDetail + ) -> str: + return tree_function.strip_char_from_list([arguments[0]])[0] + + def create_lineage( + self, data_access_func_detail: DataAccessFunctionDetail + ) -> Lineage: + logger.debug( + f"Processing {self.get_platform_pair().datahub_data_platform_name} function detail {data_access_func_detail}" + ) + + arguments: List[str] = tree_function.remove_whitespaces_from_list( + tree_function.token_values(data_access_func_detail.arg_list) + ) + # First is database name + db_name: str = data_access_func_detail.identifier_accessor.items["Name"] # type: ignore + # Second is schema name + schema_name: str = cast( + IdentifierAccessor, data_access_func_detail.identifier_accessor.next # type: ignore + ).items["Name"] + # Third is table name + table_name: str = cast( + IdentifierAccessor, data_access_func_detail.identifier_accessor.next.next # type: ignore + ).items["Name"] + + qualified_table_name: str = f"{db_name}.{schema_name}.{table_name}" + + logger.debug( + f"{self.get_platform_pair().datahub_data_platform_name} qualified_table_name {qualified_table_name}" + ) + + server: str = self.get_datasource_server(arguments, data_access_func_detail) + + urn = make_urn( + config=self.config, + platform_instance_resolver=self.platform_instance_resolver, + data_platform_pair=self.get_platform_pair(), + server=server, + qualified_table_name=qualified_table_name, + ) + + return Lineage( + upstreams=[ + DataPlatformTable( + data_platform_pair=self.get_platform_pair(), + urn=urn, + ) + ], + column_lineage=[], + ) + + +class SnowflakeLineage(ThreeStepDataAccessPattern): + def get_platform_pair(self) -> DataPlatformPair: + return 
SupportedDataPlatform.SNOWFLAKE.value + + +class GoogleBigQueryLineage(ThreeStepDataAccessPattern): + def get_platform_pair(self) -> DataPlatformPair: + return SupportedDataPlatform.GOOGLE_BIGQUERY.value + + def get_datasource_server( + self, arguments: List[str], data_access_func_detail: DataAccessFunctionDetail + ) -> str: + # In Google BigQuery server is project-name + # condition to silent lint, it is not going to be None + return ( + data_access_func_detail.identifier_accessor.items["Name"] + if data_access_func_detail.identifier_accessor is not None + else "" + ) + + +class NativeQueryLineage(AbstractLineage): + SUPPORTED_NATIVE_QUERY_DATA_PLATFORM: dict = { + SupportedDataPlatform.SNOWFLAKE.value.powerbi_data_platform_name: SupportedDataPlatform.SNOWFLAKE, + SupportedDataPlatform.AMAZON_REDSHIFT.value.powerbi_data_platform_name: SupportedDataPlatform.AMAZON_REDSHIFT, + SupportedDataPlatform.DatabricksMultiCloud_SQL.value.powerbi_data_platform_name: SupportedDataPlatform.DatabricksMultiCloud_SQL, + } + current_data_platform: SupportedDataPlatform = SupportedDataPlatform.SNOWFLAKE + + def get_platform_pair(self) -> DataPlatformPair: + return self.current_data_platform.value + + @staticmethod + def is_native_parsing_supported(data_access_function_name: str) -> bool: + return ( + data_access_function_name + in NativeQueryLineage.SUPPORTED_NATIVE_QUERY_DATA_PLATFORM + ) + + def create_urn_using_old_parser(self, query: str, server: str) -> Lineage: + dataplatform_tables: List[DataPlatformTable] = [] + + tables: List[str] = native_sql_parser.get_tables(query) + + for qualified_table_name in tables: + if len(qualified_table_name.split(".")) != 3: + logger.debug( + f"Skipping table {qualified_table_name} as it is not as per qualified_table_name format" + ) + continue + + urn = make_urn( + config=self.config, + platform_instance_resolver=self.platform_instance_resolver, + data_platform_pair=self.get_platform_pair(), + server=server, + qualified_table_name=qualified_table_name, + ) + + dataplatform_tables.append( + DataPlatformTable( + data_platform_pair=self.get_platform_pair(), + urn=urn, + ) + ) + + logger.debug(f"Generated dataplatform_tables {dataplatform_tables}") + + return Lineage( + upstreams=dataplatform_tables, + column_lineage=[], + ) + + def get_db_name(self, data_access_tokens: List[str]) -> Optional[str]: + if ( + data_access_tokens[0] + != SupportedDataPlatform.DatabricksMultiCloud_SQL.value.powerbi_data_platform_name + ): + return None + + database: Optional[str] = get_next_item(data_access_tokens, "Database") + + if ( + database and database != Constant.M_QUERY_NULL + ): # database name is explicitly set + return database + + return get_next_item( # database name is set in Name argument + data_access_tokens, "Name" + ) or get_next_item( # If both above arguments are not available, then try Catalog + data_access_tokens, "Catalog" + ) + + def create_lineage( + self, data_access_func_detail: DataAccessFunctionDetail + ) -> Lineage: + t1: Tree = cast( + Tree, tree_function.first_arg_list_func(data_access_func_detail.arg_list) + ) + flat_argument_list: List[Tree] = tree_function.flat_argument_list(t1) + + if len(flat_argument_list) != 2: + logger.debug( + f"Expecting 2 argument, actual argument count is {len(flat_argument_list)}" + ) + logger.debug(f"Flat argument list = {flat_argument_list}") + return Lineage.empty() + + data_access_tokens: List[str] = tree_function.remove_whitespaces_from_list( + tree_function.token_values(flat_argument_list[0]) + ) + + if not 
self.is_native_parsing_supported(data_access_tokens[0]): + logger.debug( + f"Unsupported native-query data-platform = {data_access_tokens[0]}" + ) + logger.debug( + f"NativeQuery is supported only for {self.SUPPORTED_NATIVE_QUERY_DATA_PLATFORM}" + ) + + return Lineage.empty() + + if len(data_access_tokens[0]) < 3: + logger.debug( + f"Server is not available in argument list for data-platform {data_access_tokens[0]}. Returning empty " + "list" + ) + return Lineage.empty() + + self.current_data_platform = self.SUPPORTED_NATIVE_QUERY_DATA_PLATFORM[ + data_access_tokens[0] + ] + # The First argument is the query + sql_query: str = tree_function.strip_char_from_list( + values=tree_function.remove_whitespaces_from_list( + tree_function.token_values(flat_argument_list[1]) + ), + )[ + 0 + ] # Remove any whitespaces and double quotes character + + server = tree_function.strip_char_from_list([data_access_tokens[2]])[0] + + if self.config.enable_advance_lineage_sql_construct is False: + # Use previous parser to generate URN to keep backward compatibility + return self.create_urn_using_old_parser( + query=sql_query, + server=server, + ) + + database_name: Optional[str] = self.get_db_name(data_access_tokens) + + return self.parse_custom_sql( + query=sql_query, + server=server, + database=database_name, + schema=None, + ) + + +class SupportedPattern(Enum): + DATABRICKS_QUERY = ( + DatabricksLineage, + FunctionName.DATABRICK_DATA_ACCESS, + ) + + DATABRICKS_MULTI_CLOUD = ( + DatabricksLineage, + FunctionName.DATABRICK_MULTI_CLOUD_DATA_ACCESS, + ) + + POSTGRES_SQL = ( + PostgresLineage, + FunctionName.POSTGRESQL_DATA_ACCESS, + ) + + ORACLE = ( + OracleLineage, + FunctionName.ORACLE_DATA_ACCESS, + ) + + SNOWFLAKE = ( + SnowflakeLineage, + FunctionName.SNOWFLAKE_DATA_ACCESS, + ) + + MS_SQL = ( + MSSqlLineage, + FunctionName.MSSQL_DATA_ACCESS, + ) + + GOOGLE_BIG_QUERY = ( + GoogleBigQueryLineage, + FunctionName.GOOGLE_BIGQUERY_DATA_ACCESS, + ) + + AMAZON_REDSHIFT = ( + AmazonRedshiftLineage, + FunctionName.AMAZON_REDSHIFT_DATA_ACCESS, + ) + + NATIVE_QUERY = ( + NativeQueryLineage, + FunctionName.NATIVE_QUERY, + ) + + def handler(self) -> Type[AbstractLineage]: + return self.value[0] + + def function_name(self) -> str: + return self.value[1].value + + @staticmethod + def get_function_names() -> List[str]: + functions: List[str] = [] + for supported_resolver in SupportedPattern: + functions.append(supported_resolver.function_name()) + + return functions + + @staticmethod + def get_pattern_handler(function_name: str) -> Optional["SupportedPattern"]: + logger.debug(f"Looking for pattern-handler for {function_name}") + for supported_resolver in SupportedPattern: + if function_name == supported_resolver.function_name(): + return supported_resolver + logger.debug(f"pattern-handler not found for function_name {function_name}") + return None diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py index a40e67d08da5b..81a0e1ef2d79b 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py @@ -1,286 +1,33 @@ import logging from abc import ABC, abstractmethod -from dataclasses import dataclass -from enum import Enum -from typing import Any, Dict, List, Optional, Tuple, Type, Union, cast +from typing import Any, Dict, List, Optional, Tuple, Union, cast from lark import Tree -import 
datahub.emitter.mce_builder as builder from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.source.powerbi.config import ( - Constant, - DataBricksPlatformDetail, - DataPlatformPair, - PlatformDetail, PowerBiDashboardSourceConfig, PowerBiDashboardSourceReport, - PowerBIPlatformDetail, - SupportedDataPlatform, ) from datahub.ingestion.source.powerbi.dataplatform_instance_resolver import ( AbstractDataPlatformInstanceResolver, ) -from datahub.ingestion.source.powerbi.m_query import native_sql_parser, tree_function +from datahub.ingestion.source.powerbi.m_query import tree_function from datahub.ingestion.source.powerbi.m_query.data_classes import ( TRACE_POWERBI_MQUERY_PARSER, - AbstractIdentifierAccessor, DataAccessFunctionDetail, IdentifierAccessor, - ReferencedTable, + Lineage, +) +from datahub.ingestion.source.powerbi.m_query.pattern_handler import ( + AbstractLineage, + SupportedPattern, ) from datahub.ingestion.source.powerbi.rest_api_wrapper.data_classes import Table -from datahub.sql_parsing.sqlglot_lineage import ColumnLineageInfo, SqlParsingResult logger = logging.getLogger(__name__) -@dataclass -class DataPlatformTable: - data_platform_pair: DataPlatformPair - urn: str - - -@dataclass -class Lineage: - upstreams: List[DataPlatformTable] - column_lineage: List[ColumnLineageInfo] - - @staticmethod - def empty() -> "Lineage": - return Lineage(upstreams=[], column_lineage=[]) - - -def urn_to_lowercase(value: str, flag: bool) -> str: - if flag is True: - return value.lower() - - return value - - -def urn_creator( - config: PowerBiDashboardSourceConfig, - platform_instance_resolver: AbstractDataPlatformInstanceResolver, - data_platform_pair: DataPlatformPair, - server: str, - qualified_table_name: str, -) -> str: - platform_detail: PlatformDetail = platform_instance_resolver.get_platform_instance( - PowerBIPlatformDetail( - data_platform_pair=data_platform_pair, - data_platform_server=server, - ) - ) - - return builder.make_dataset_urn_with_platform_instance( - platform=data_platform_pair.datahub_data_platform_name, - platform_instance=platform_detail.platform_instance, - env=platform_detail.env, - name=urn_to_lowercase( - qualified_table_name, config.convert_lineage_urns_to_lowercase - ), - ) - - -def get_next_item(items: List[str], item: str) -> Optional[str]: - if item in items: - try: - index = items.index(item) - return items[index + 1] - except IndexError: - logger.debug(f'item:"{item}", not found in item-list: {items}') - return None - - -class AbstractDataPlatformTableCreator(ABC): - """ - Base class to share common functionalities among different dataplatform for M-Query parsing. - - To create qualified table name we need to parse M-Query data-access-functions(https://learn.microsoft.com/en-us/powerquery-m/accessing-data-functions) and - the data-access-functions has some define pattern to access database-name, schema-name and table-name, for example see below M-Query. - - let - Source = Sql.Database("localhost", "library"), - dbo_book_issue = Source{[Schema="dbo",Item="book_issue"]}[Data] - in - dbo_book_issue - - It is MSSQL M-Query and Sql.Database is the data-access-function to access MSSQL. If this function is available in M-Query then database name is available in second argument - of first statement and schema-name and table-name is available in second statement. second statement can be repeated to access different tables from MSSQL. 
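To make the two-step pattern described above concrete, here is a minimal editorial sketch (not part of this change); it only re-derives the qualified table name from the Sql.Database example quoted in the docstring.

# Illustrative sketch only -- values come from the docstring example above, where
# Sql.Database("localhost", "library") picks the database and
# Source{[Schema="dbo",Item="book_issue"]}[Data] picks the schema and item.
db_name = "library"
schema_name = "dbo"
table_name = "book_issue"
qualified_table_name = f"{db_name}.{schema_name}.{table_name}"
assert qualified_table_name == "library.dbo.book_issue"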
- - DefaultTwoStepDataAccessSources extends the AbstractDataPlatformTableCreator and provides the common functionalities for data-platform which has above type of M-Query pattern - - data-access-function varies as per data-platform for example for MySQL.Database for MySQL, PostgreSQL.Database for Postgres and Oracle.Database for Oracle and number of statement to - find out database-name , schema-name and table-name also varies as per dataplatform. - - Value.NativeQuery is one of the function which is used to execute native query inside M-Query, for example see below M-Query - - let - Source = Value.NativeQuery(AmazonRedshift.Database("redshift-url","dev"), "select * from dev.public.category", null, [EnableFolding=true]) - in - Source - - In this M-Query database-name is available in first argument and rest of the detail i.e database & schema is available in native query. - - NativeQueryDataPlatformTableCreator extends AbstractDataPlatformTableCreator to support Redshift and Snowflake native query parsing. - - """ - - ctx: PipelineContext - table: Table - config: PowerBiDashboardSourceConfig - reporter: PowerBiDashboardSourceReport - platform_instance_resolver: AbstractDataPlatformInstanceResolver - - def __init__( - self, - ctx: PipelineContext, - table: Table, - config: PowerBiDashboardSourceConfig, - reporter: PowerBiDashboardSourceReport, - platform_instance_resolver: AbstractDataPlatformInstanceResolver, - ) -> None: - super().__init__() - self.ctx = ctx - self.table = table - self.config = config - self.reporter = reporter - self.platform_instance_resolver = platform_instance_resolver - - @abstractmethod - def create_lineage( - self, data_access_func_detail: DataAccessFunctionDetail - ) -> Lineage: - pass - - @abstractmethod - def get_platform_pair(self) -> DataPlatformPair: - pass - - @staticmethod - def get_db_detail_from_argument( - arg_list: Tree, - ) -> Tuple[Optional[str], Optional[str]]: - arguments: List[str] = tree_function.strip_char_from_list( - values=tree_function.remove_whitespaces_from_list( - tree_function.token_values(arg_list) - ), - ) - - if len(arguments) < 2: - logger.debug(f"Expected minimum 2 arguments, but got {len(arguments)}") - return None, None - - return arguments[0], arguments[1] - - @staticmethod - def create_reference_table( - arg_list: Tree, - table_detail: Dict[str, str], - ) -> Optional[ReferencedTable]: - arguments: List[str] = tree_function.strip_char_from_list( - values=tree_function.remove_whitespaces_from_list( - tree_function.token_values(arg_list) - ), - ) - - logger.debug(f"Processing arguments {arguments}") - - if ( - len(arguments) - >= 4 # [0] is warehouse FQDN. - # [1] is endpoint, we are not using it. 
- # [2] is "Catalog" key - # [3] is catalog's value - ): - return ReferencedTable( - warehouse=arguments[0], - catalog=arguments[3], - # As per my observation, database and catalog names are same in M-Query - database=table_detail["Database"] - if table_detail.get("Database") - else arguments[3], - schema=table_detail["Schema"], - table=table_detail.get("Table") or table_detail["View"], - ) - elif len(arguments) == 2: - return ReferencedTable( - warehouse=arguments[0], - database=table_detail["Database"], - schema=table_detail["Schema"], - table=table_detail.get("Table") or table_detail["View"], - catalog=None, - ) - - return None - - def parse_custom_sql( - self, query: str, server: str, database: Optional[str], schema: Optional[str] - ) -> Lineage: - dataplatform_tables: List[DataPlatformTable] = [] - - platform_detail: PlatformDetail = ( - self.platform_instance_resolver.get_platform_instance( - PowerBIPlatformDetail( - data_platform_pair=self.get_platform_pair(), - data_platform_server=server, - ) - ) - ) - - query = native_sql_parser.remove_drop_statement( - native_sql_parser.remove_special_characters(query) - ) - - parsed_result: Optional[ - "SqlParsingResult" - ] = native_sql_parser.parse_custom_sql( - ctx=self.ctx, - query=query, - platform=self.get_platform_pair().datahub_data_platform_name, - platform_instance=platform_detail.platform_instance, - env=platform_detail.env, - database=database, - schema=schema, - ) - - if parsed_result is None: - self.reporter.info( - title=Constant.SQL_PARSING_FAILURE, - message="Fail to parse native sql present in PowerBI M-Query", - context=f"table-name={self.table.full_name}, sql={query}", - ) - return Lineage.empty() - - if parsed_result.debug_info and parsed_result.debug_info.table_error: - self.reporter.warning( - title=Constant.SQL_PARSING_FAILURE, - message="Fail to parse native sql present in PowerBI M-Query", - context=f"table-name={self.table.full_name}, error={parsed_result.debug_info.table_error},sql={query}", - ) - return Lineage.empty() - - for urn in parsed_result.in_tables: - dataplatform_tables.append( - DataPlatformTable( - data_platform_pair=self.get_platform_pair(), - urn=urn, - ) - ) - - logger.debug(f"Native Query parsed result={parsed_result}") - logger.debug(f"Generated dataplatform_tables={dataplatform_tables}") - - return Lineage( - upstreams=dataplatform_tables, - column_lineage=( - parsed_result.column_lineage - if parsed_result.column_lineage is not None - else [] - ), - ) - - class AbstractDataAccessMQueryResolver(ABC): table: Table parse_tree: Tree @@ -299,10 +46,10 @@ def __init__( self.parse_tree = parse_tree self.reporter = reporter self.parameters = parameters - self.data_access_functions = SupportedResolver.get_function_names() + self.data_access_functions = SupportedPattern.get_function_names() @abstractmethod - def resolve_to_data_platform_table_list( + def resolve_to_lineage( self, ctx: PipelineContext, config: PowerBiDashboardSourceConfig, @@ -318,7 +65,7 @@ class MQueryResolver(AbstractDataAccessMQueryResolver, ABC): This class has generic code to process M-Query tokens and create instance of DataAccessFunctionDetail. Once DataAccessFunctionDetail instance is initialized thereafter MQueryResolver generates the DataPlatformTable with the help of AbstractDataPlatformTableCreator - (see method resolve_to_data_platform_table_list). + (see method resolve_to_lineage). 
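As a quick illustration of the handler lookup this docstring refers to, a hedged sketch follows (editorial, not part of the diff; it assumes the FunctionName values are unchanged by the move into data_classes.py).

# Illustrative sketch only: resolve_to_lineage() maps each data-access function
# name to its pattern handler roughly like this.
from datahub.ingestion.source.powerbi.m_query.pattern_handler import SupportedPattern

pattern = SupportedPattern.get_pattern_handler("Sql.Database")
assert pattern is SupportedPattern.MS_SQL
handler_cls = pattern.handler()  # MSSqlLineage, instantiated with ctx/table/config/reporter/resolver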
Classes which extended from AbstractDataPlatformTableCreator know how to convert generated DataAccessFunctionDetail instance to the respective DataPlatformTable instance as per dataplatform. @@ -602,7 +349,7 @@ def internal( return table_links - def resolve_to_data_platform_table_list( + def resolve_to_lineage( self, ctx: PipelineContext, config: PowerBiDashboardSourceConfig, @@ -630,7 +377,7 @@ def resolve_to_data_platform_table_list( # Each item is data-access function for f_detail in table_links: # Get & Check if we support data-access-function available in M-Query - supported_resolver = SupportedResolver.get_resolver( + supported_resolver = SupportedPattern.get_pattern_handler( f_detail.data_access_function_name ) if supported_resolver is None: @@ -643,11 +390,9 @@ def resolve_to_data_platform_table_list( ) continue - # From supported_resolver enum get respective resolver like AmazonRedshift or Snowflake or Oracle or NativeQuery and create instance of it - # & also pass additional information that will be need to generate urn - table_qualified_name_creator: ( - AbstractDataPlatformTableCreator - ) = supported_resolver.get_table_full_name_creator()( + # From supported_resolver enum get respective handler like AmazonRedshift or Snowflake or Oracle or NativeQuery and create instance of it + # & also pass additional information that will be need to generate lineage + pattern_handler: (AbstractLineage) = supported_resolver.handler()( ctx=ctx, table=self.table, config=config, @@ -655,673 +400,6 @@ def resolve_to_data_platform_table_list( platform_instance_resolver=platform_instance_resolver, ) - lineage.append(table_qualified_name_creator.create_lineage(f_detail)) + lineage.append(pattern_handler.create_lineage(f_detail)) return lineage - - -class DefaultTwoStepDataAccessSources(AbstractDataPlatformTableCreator, ABC): - """ - These are the DataSource for which PowerBI Desktop generates default M-Query of following pattern - let - Source = Sql.Database("localhost", "library"), - dbo_book_issue = Source{[Schema="dbo",Item="book_issue"]}[Data] - in - dbo_book_issue - """ - - def two_level_access_pattern( - self, data_access_func_detail: DataAccessFunctionDetail - ) -> Lineage: - logger.debug( - f"Processing {self.get_platform_pair().powerbi_data_platform_name} data-access function detail {data_access_func_detail}" - ) - - server, db_name = self.get_db_detail_from_argument( - data_access_func_detail.arg_list - ) - if server is None or db_name is None: - return Lineage.empty() # Return an empty list - - schema_name: str = cast( - IdentifierAccessor, data_access_func_detail.identifier_accessor - ).items["Schema"] - - table_name: str = cast( - IdentifierAccessor, data_access_func_detail.identifier_accessor - ).items["Item"] - - qualified_table_name: str = f"{db_name}.{schema_name}.{table_name}" - - logger.debug( - f"Platform({self.get_platform_pair().datahub_data_platform_name}) qualified_table_name= {qualified_table_name}" - ) - - urn = urn_creator( - config=self.config, - platform_instance_resolver=self.platform_instance_resolver, - data_platform_pair=self.get_platform_pair(), - server=server, - qualified_table_name=qualified_table_name, - ) - return Lineage( - upstreams=[ - DataPlatformTable( - data_platform_pair=self.get_platform_pair(), - urn=urn, - ) - ], - column_lineage=[], - ) - - -class PostgresDataPlatformTableCreator(DefaultTwoStepDataAccessSources): - def create_lineage( - self, data_access_func_detail: DataAccessFunctionDetail - ) -> Lineage: - return 
self.two_level_access_pattern(data_access_func_detail) - - def get_platform_pair(self) -> DataPlatformPair: - return SupportedDataPlatform.POSTGRES_SQL.value - - -class MSSqlDataPlatformTableCreator(DefaultTwoStepDataAccessSources): - # https://learn.microsoft.com/en-us/sql/relational-databases/security/authentication-access/ownership-and-user-schema-separation?view=sql-server-ver16 - DEFAULT_SCHEMA = "dbo" # Default schema name in MS-SQL is dbo - - def get_platform_pair(self) -> DataPlatformPair: - return SupportedDataPlatform.MS_SQL.value - - def create_urn_using_old_parser( - self, query: str, db_name: str, server: str - ) -> List[DataPlatformTable]: - dataplatform_tables: List[DataPlatformTable] = [] - - tables: List[str] = native_sql_parser.get_tables(query) - - for parsed_table in tables: - # components: List[str] = [v.strip("[]") for v in parsed_table.split(".")] - components = [v.strip("[]") for v in parsed_table.split(".")] - if len(components) == 3: - database, schema, table = components - elif len(components) == 2: - schema, table = components - database = db_name - elif len(components) == 1: - (table,) = components - database = db_name - schema = MSSqlDataPlatformTableCreator.DEFAULT_SCHEMA - else: - self.reporter.warning( - title="Invalid table format", - message="The advanced SQL lineage feature (enable_advance_lineage_sql_construct) is disabled. Please either enable this feature or ensure the table is referenced as .. in the SQL.", - context=f"table-name={self.table.full_name}", - ) - continue - - qualified_table_name = f"{database}.{schema}.{table}" - urn = urn_creator( - config=self.config, - platform_instance_resolver=self.platform_instance_resolver, - data_platform_pair=self.get_platform_pair(), - server=server, - qualified_table_name=qualified_table_name, - ) - dataplatform_tables.append( - DataPlatformTable( - data_platform_pair=self.get_platform_pair(), - urn=urn, - ) - ) - - logger.debug(f"Generated upstream tables = {dataplatform_tables}") - - return dataplatform_tables - - def create_lineage( - self, data_access_func_detail: DataAccessFunctionDetail - ) -> Lineage: - arguments: List[str] = tree_function.strip_char_from_list( - values=tree_function.remove_whitespaces_from_list( - tree_function.token_values(data_access_func_detail.arg_list) - ), - ) - - server, database = self.get_db_detail_from_argument( - data_access_func_detail.arg_list - ) - if server is None or database is None: - return Lineage.empty() # Return an empty list - - assert server - assert database # to silent the lint - - query: Optional[str] = get_next_item(arguments, "Query") - if query: - if self.config.enable_advance_lineage_sql_construct is False: - # Use previous parser to generate URN to keep backward compatibility - return Lineage( - upstreams=self.create_urn_using_old_parser( - query=query, - db_name=database, - server=server, - ), - column_lineage=[], - ) - - return self.parse_custom_sql( - query=query, - database=database, - server=server, - schema=MSSqlDataPlatformTableCreator.DEFAULT_SCHEMA, - ) - - # It is a regular case of MS-SQL - logger.debug("Handling with regular case") - return self.two_level_access_pattern(data_access_func_detail) - - -class OracleDataPlatformTableCreator(AbstractDataPlatformTableCreator): - def get_platform_pair(self) -> DataPlatformPair: - return SupportedDataPlatform.ORACLE.value - - @staticmethod - def _get_server_and_db_name(value: str) -> Tuple[Optional[str], Optional[str]]: - error_message: str = ( - f"The target argument ({value}) should in the format 
of :/[" - ".]" - ) - splitter_result: List[str] = value.split("/") - if len(splitter_result) != 2: - logger.debug(error_message) - return None, None - - db_name = splitter_result[1].split(".")[0] - - return tree_function.strip_char_from_list([splitter_result[0]])[0], db_name - - def create_lineage( - self, data_access_func_detail: DataAccessFunctionDetail - ) -> Lineage: - logger.debug( - f"Processing Oracle data-access function detail {data_access_func_detail}" - ) - - arguments: List[str] = tree_function.remove_whitespaces_from_list( - tree_function.token_values(data_access_func_detail.arg_list) - ) - - server, db_name = self._get_server_and_db_name(arguments[0]) - - if db_name is None or server is None: - return Lineage.empty() - - schema_name: str = cast( - IdentifierAccessor, data_access_func_detail.identifier_accessor - ).items["Schema"] - - table_name: str = cast( - IdentifierAccessor, - cast(IdentifierAccessor, data_access_func_detail.identifier_accessor).next, - ).items["Name"] - - qualified_table_name: str = f"{db_name}.{schema_name}.{table_name}" - - urn = urn_creator( - config=self.config, - platform_instance_resolver=self.platform_instance_resolver, - data_platform_pair=self.get_platform_pair(), - server=server, - qualified_table_name=qualified_table_name, - ) - - return Lineage( - upstreams=[ - DataPlatformTable( - data_platform_pair=self.get_platform_pair(), - urn=urn, - ) - ], - column_lineage=[], - ) - - -class DatabrickDataPlatformTableCreator(AbstractDataPlatformTableCreator): - def form_qualified_table_name( - self, - table_reference: ReferencedTable, - data_platform_pair: DataPlatformPair, - ) -> str: - platform_detail: PlatformDetail = ( - self.platform_instance_resolver.get_platform_instance( - PowerBIPlatformDetail( - data_platform_pair=data_platform_pair, - data_platform_server=table_reference.warehouse, - ) - ) - ) - - metastore: Optional[str] = None - - qualified_table_name: str = f"{table_reference.database}.{table_reference.schema}.{table_reference.table}" - - if isinstance(platform_detail, DataBricksPlatformDetail): - metastore = platform_detail.metastore - - if metastore is not None: - return f"{metastore}.{qualified_table_name}" - - return qualified_table_name - - def create_lineage( - self, data_access_func_detail: DataAccessFunctionDetail - ) -> Lineage: - logger.debug( - f"Processing Databrick data-access function detail {data_access_func_detail}" - ) - table_detail: Dict[str, str] = {} - temp_accessor: Optional[ - Union[IdentifierAccessor, AbstractIdentifierAccessor] - ] = data_access_func_detail.identifier_accessor - - while temp_accessor: - if isinstance(temp_accessor, IdentifierAccessor): - # Condition to handle databricks M-query pattern where table, schema and database all are present in - # the same invoke statement - if all( - element in temp_accessor.items - for element in ["Item", "Schema", "Catalog"] - ): - table_detail["Schema"] = temp_accessor.items["Schema"] - table_detail["Table"] = temp_accessor.items["Item"] - else: - table_detail[temp_accessor.items["Kind"]] = temp_accessor.items[ - "Name" - ] - - if temp_accessor.next is not None: - temp_accessor = temp_accessor.next - else: - break - else: - logger.debug( - "expecting instance to be IdentifierAccessor, please check if parsing is done properly" - ) - return Lineage.empty() - - table_reference = self.create_reference_table( - arg_list=data_access_func_detail.arg_list, - table_detail=table_detail, - ) - - if table_reference: - qualified_table_name: str = self.form_qualified_table_name( - 
table_reference=table_reference, - data_platform_pair=self.get_platform_pair(), - ) - - urn = urn_creator( - config=self.config, - platform_instance_resolver=self.platform_instance_resolver, - data_platform_pair=self.get_platform_pair(), - server=table_reference.warehouse, - qualified_table_name=qualified_table_name, - ) - - return Lineage( - upstreams=[ - DataPlatformTable( - data_platform_pair=self.get_platform_pair(), - urn=urn, - ) - ], - column_lineage=[], - ) - - return Lineage.empty() - - def get_platform_pair(self) -> DataPlatformPair: - return SupportedDataPlatform.DATABRICK_SQL.value - - -class DefaultThreeStepDataAccessSources(AbstractDataPlatformTableCreator, ABC): - def get_datasource_server( - self, arguments: List[str], data_access_func_detail: DataAccessFunctionDetail - ) -> str: - return tree_function.strip_char_from_list([arguments[0]])[0] - - def create_lineage( - self, data_access_func_detail: DataAccessFunctionDetail - ) -> Lineage: - logger.debug( - f"Processing {self.get_platform_pair().datahub_data_platform_name} function detail {data_access_func_detail}" - ) - - arguments: List[str] = tree_function.remove_whitespaces_from_list( - tree_function.token_values(data_access_func_detail.arg_list) - ) - # First is database name - db_name: str = data_access_func_detail.identifier_accessor.items["Name"] # type: ignore - # Second is schema name - schema_name: str = cast( - IdentifierAccessor, data_access_func_detail.identifier_accessor.next # type: ignore - ).items["Name"] - # Third is table name - table_name: str = cast( - IdentifierAccessor, data_access_func_detail.identifier_accessor.next.next # type: ignore - ).items["Name"] - - qualified_table_name: str = f"{db_name}.{schema_name}.{table_name}" - - logger.debug( - f"{self.get_platform_pair().datahub_data_platform_name} qualified_table_name {qualified_table_name}" - ) - - server: str = self.get_datasource_server(arguments, data_access_func_detail) - - urn = urn_creator( - config=self.config, - platform_instance_resolver=self.platform_instance_resolver, - data_platform_pair=self.get_platform_pair(), - server=server, - qualified_table_name=qualified_table_name, - ) - - return Lineage( - upstreams=[ - DataPlatformTable( - data_platform_pair=self.get_platform_pair(), - urn=urn, - ) - ], - column_lineage=[], - ) - - -class SnowflakeDataPlatformTableCreator(DefaultThreeStepDataAccessSources): - def get_platform_pair(self) -> DataPlatformPair: - return SupportedDataPlatform.SNOWFLAKE.value - - -class GoogleBigQueryDataPlatformTableCreator(DefaultThreeStepDataAccessSources): - def get_platform_pair(self) -> DataPlatformPair: - return SupportedDataPlatform.GOOGLE_BIGQUERY.value - - def get_datasource_server( - self, arguments: List[str], data_access_func_detail: DataAccessFunctionDetail - ) -> str: - # In Google BigQuery server is project-name - # condition to silent lint, it is not going to be None - return ( - data_access_func_detail.identifier_accessor.items["Name"] - if data_access_func_detail.identifier_accessor is not None - else "" - ) - - -class AmazonRedshiftDataPlatformTableCreator(AbstractDataPlatformTableCreator): - def get_platform_pair(self) -> DataPlatformPair: - return SupportedDataPlatform.AMAZON_REDSHIFT.value - - def create_lineage( - self, data_access_func_detail: DataAccessFunctionDetail - ) -> Lineage: - logger.debug( - f"Processing AmazonRedshift data-access function detail {data_access_func_detail}" - ) - - server, db_name = self.get_db_detail_from_argument( - data_access_func_detail.arg_list - ) - if 
db_name is None or server is None: - return Lineage.empty() # Return empty list - - schema_name: str = cast( - IdentifierAccessor, data_access_func_detail.identifier_accessor - ).items["Name"] - - table_name: str = cast( - IdentifierAccessor, - cast(IdentifierAccessor, data_access_func_detail.identifier_accessor).next, - ).items["Name"] - - qualified_table_name: str = f"{db_name}.{schema_name}.{table_name}" - - urn = urn_creator( - config=self.config, - platform_instance_resolver=self.platform_instance_resolver, - data_platform_pair=self.get_platform_pair(), - server=server, - qualified_table_name=qualified_table_name, - ) - - return Lineage( - upstreams=[ - DataPlatformTable( - data_platform_pair=self.get_platform_pair(), - urn=urn, - ) - ], - column_lineage=[], - ) - - -class NativeQueryDataPlatformTableCreator(AbstractDataPlatformTableCreator): - SUPPORTED_NATIVE_QUERY_DATA_PLATFORM: dict = { - SupportedDataPlatform.SNOWFLAKE.value.powerbi_data_platform_name: SupportedDataPlatform.SNOWFLAKE, - SupportedDataPlatform.AMAZON_REDSHIFT.value.powerbi_data_platform_name: SupportedDataPlatform.AMAZON_REDSHIFT, - SupportedDataPlatform.DatabricksMultiCloud_SQL.value.powerbi_data_platform_name: SupportedDataPlatform.DatabricksMultiCloud_SQL, - } - current_data_platform: SupportedDataPlatform = SupportedDataPlatform.SNOWFLAKE - - def get_platform_pair(self) -> DataPlatformPair: - return self.current_data_platform.value - - @staticmethod - def is_native_parsing_supported(data_access_function_name: str) -> bool: - return ( - data_access_function_name - in NativeQueryDataPlatformTableCreator.SUPPORTED_NATIVE_QUERY_DATA_PLATFORM - ) - - def create_urn_using_old_parser(self, query: str, server: str) -> Lineage: - dataplatform_tables: List[DataPlatformTable] = [] - - tables: List[str] = native_sql_parser.get_tables(query) - - for qualified_table_name in tables: - if len(qualified_table_name.split(".")) != 3: - logger.debug( - f"Skipping table {qualified_table_name} as it is not as per qualified_table_name format" - ) - continue - - urn = urn_creator( - config=self.config, - platform_instance_resolver=self.platform_instance_resolver, - data_platform_pair=self.get_platform_pair(), - server=server, - qualified_table_name=qualified_table_name, - ) - - dataplatform_tables.append( - DataPlatformTable( - data_platform_pair=self.get_platform_pair(), - urn=urn, - ) - ) - - logger.debug(f"Generated dataplatform_tables {dataplatform_tables}") - - return Lineage( - upstreams=dataplatform_tables, - column_lineage=[], - ) - - def get_db_name(self, data_access_tokens: List[str]) -> Optional[str]: - if ( - data_access_tokens[0] - != SupportedDataPlatform.DatabricksMultiCloud_SQL.value.powerbi_data_platform_name - ): - return None - - database: Optional[str] = get_next_item(data_access_tokens, "Database") - - if ( - database and database != Constant.M_QUERY_NULL - ): # database name is explicitly set - return database - - return get_next_item( # database name is set in Name argument - data_access_tokens, "Name" - ) or get_next_item( # If both above arguments are not available, then try Catalog - data_access_tokens, "Catalog" - ) - - def create_lineage( - self, data_access_func_detail: DataAccessFunctionDetail - ) -> Lineage: - t1: Tree = cast( - Tree, tree_function.first_arg_list_func(data_access_func_detail.arg_list) - ) - flat_argument_list: List[Tree] = tree_function.flat_argument_list(t1) - - if len(flat_argument_list) != 2: - logger.debug( - f"Expecting 2 argument, actual argument count is {len(flat_argument_list)}" 
- ) - logger.debug(f"Flat argument list = {flat_argument_list}") - return Lineage.empty() - - data_access_tokens: List[str] = tree_function.remove_whitespaces_from_list( - tree_function.token_values(flat_argument_list[0]) - ) - - if not self.is_native_parsing_supported(data_access_tokens[0]): - logger.debug( - f"Unsupported native-query data-platform = {data_access_tokens[0]}" - ) - logger.debug( - f"NativeQuery is supported only for {self.SUPPORTED_NATIVE_QUERY_DATA_PLATFORM}" - ) - - return Lineage.empty() - - if len(data_access_tokens[0]) < 3: - logger.debug( - f"Server is not available in argument list for data-platform {data_access_tokens[0]}. Returning empty " - "list" - ) - return Lineage.empty() - - self.current_data_platform = self.SUPPORTED_NATIVE_QUERY_DATA_PLATFORM[ - data_access_tokens[0] - ] - # The First argument is the query - sql_query: str = tree_function.strip_char_from_list( - values=tree_function.remove_whitespaces_from_list( - tree_function.token_values(flat_argument_list[1]) - ), - )[ - 0 - ] # Remove any whitespaces and double quotes character - - server = tree_function.strip_char_from_list([data_access_tokens[2]])[0] - - if self.config.enable_advance_lineage_sql_construct is False: - # Use previous parser to generate URN to keep backward compatibility - return self.create_urn_using_old_parser( - query=sql_query, - server=server, - ) - - database_name: Optional[str] = self.get_db_name(data_access_tokens) - - return self.parse_custom_sql( - query=sql_query, - server=server, - database=database_name, - schema=None, - ) - - -class FunctionName(Enum): - NATIVE_QUERY = "Value.NativeQuery" - POSTGRESQL_DATA_ACCESS = "PostgreSQL.Database" - ORACLE_DATA_ACCESS = "Oracle.Database" - SNOWFLAKE_DATA_ACCESS = "Snowflake.Databases" - MSSQL_DATA_ACCESS = "Sql.Database" - DATABRICK_DATA_ACCESS = "Databricks.Catalogs" - GOOGLE_BIGQUERY_DATA_ACCESS = "GoogleBigQuery.Database" - AMAZON_REDSHIFT_DATA_ACCESS = "AmazonRedshift.Database" - DATABRICK_MULTI_CLOUD_DATA_ACCESS = "DatabricksMultiCloud.Catalogs" - - -class SupportedResolver(Enum): - DATABRICKS_QUERY = ( - DatabrickDataPlatformTableCreator, - FunctionName.DATABRICK_DATA_ACCESS, - ) - - DATABRICKS_MULTI_CLOUD = ( - DatabrickDataPlatformTableCreator, - FunctionName.DATABRICK_MULTI_CLOUD_DATA_ACCESS, - ) - - POSTGRES_SQL = ( - PostgresDataPlatformTableCreator, - FunctionName.POSTGRESQL_DATA_ACCESS, - ) - - ORACLE = ( - OracleDataPlatformTableCreator, - FunctionName.ORACLE_DATA_ACCESS, - ) - - SNOWFLAKE = ( - SnowflakeDataPlatformTableCreator, - FunctionName.SNOWFLAKE_DATA_ACCESS, - ) - - MS_SQL = ( - MSSqlDataPlatformTableCreator, - FunctionName.MSSQL_DATA_ACCESS, - ) - - GOOGLE_BIG_QUERY = ( - GoogleBigQueryDataPlatformTableCreator, - FunctionName.GOOGLE_BIGQUERY_DATA_ACCESS, - ) - - AMAZON_REDSHIFT = ( - AmazonRedshiftDataPlatformTableCreator, - FunctionName.AMAZON_REDSHIFT_DATA_ACCESS, - ) - - NATIVE_QUERY = ( - NativeQueryDataPlatformTableCreator, - FunctionName.NATIVE_QUERY, - ) - - def get_table_full_name_creator(self) -> Type[AbstractDataPlatformTableCreator]: - return self.value[0] - - def get_function_name(self) -> str: - return self.value[1].value - - @staticmethod - def get_function_names() -> List[str]: - functions: List[str] = [] - for supported_resolver in SupportedResolver: - functions.append(supported_resolver.get_function_name()) - - return functions - - @staticmethod - def get_resolver(function_name: str) -> Optional["SupportedResolver"]: - logger.debug(f"Looking for resolver {function_name}") - for 
supported_resolver in SupportedResolver: - if function_name == supported_resolver.get_function_name(): - return supported_resolver - logger.debug(f"Resolver not found for function_name {function_name}") - return None diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/validator.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/validator.py index ca2abf97c9f30..b52977aaa41fb 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/validator.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/validator.py @@ -1,7 +1,7 @@ import logging from typing import Optional, Tuple -from datahub.ingestion.source.powerbi.m_query import resolver +import datahub.ingestion.source.powerbi.m_query.data_classes logger = logging.getLogger(__name__) @@ -14,12 +14,18 @@ def validate_parse_tree( :param native_query_enabled: Whether user want to extract lineage from native query :return: True or False. """ - function_names = [fun.value for fun in resolver.FunctionName] + function_names = [ + fun.value + for fun in datahub.ingestion.source.powerbi.m_query.data_classes.FunctionName + ] if not any(fun in expression for fun in function_names): return False, "DataAccess function is not present in M-Query expression." if native_query_enabled is False: - if resolver.FunctionName.NATIVE_QUERY.value in function_names: + if ( + datahub.ingestion.source.powerbi.m_query.data_classes.FunctionName.NATIVE_QUERY.value + in function_names + ): return ( False, "Lineage extraction from native query is disabled. Enable native_query_parsing in recipe", diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py index cef2d098aebc4..044946a5d308d 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py @@ -10,6 +10,7 @@ import more_itertools import datahub.emitter.mce_builder as builder +import datahub.ingestion.source.powerbi.m_query.data_classes import datahub.ingestion.source.powerbi.rest_api_wrapper.data_classes as powerbi_data_classes from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.emitter.mcp_builder import ContainerKey, gen_containers @@ -42,12 +43,13 @@ Constant, PowerBiDashboardSourceConfig, PowerBiDashboardSourceReport, + SupportedDataPlatform, ) from datahub.ingestion.source.powerbi.dataplatform_instance_resolver import ( AbstractDataPlatformInstanceResolver, create_dataplatform_instance_resolver, ) -from datahub.ingestion.source.powerbi.m_query import parser, resolver +from datahub.ingestion.source.powerbi.m_query import parser from datahub.ingestion.source.powerbi.rest_api_wrapper.powerbi_api import PowerBiAPI from datahub.ingestion.source.state.stale_entity_removal_handler import ( StaleEntityRemovalHandler, @@ -182,7 +184,9 @@ def extract_dataset_schema( return [schema_mcp] def make_fine_grained_lineage_class( - self, lineage: resolver.Lineage, dataset_urn: str + self, + lineage: datahub.ingestion.source.powerbi.m_query.data_classes.Lineage, + dataset_urn: str, ) -> List[FineGrainedLineage]: fine_grained_lineages: List[FineGrainedLineage] = [] @@ -234,7 +238,9 @@ def extract_lineage( upstream: List[UpstreamClass] = [] cll_lineage: List[FineGrainedLineage] = [] - upstream_lineage: List[resolver.Lineage] = parser.get_upstream_tables( + upstream_lineage: List[ + datahub.ingestion.source.powerbi.m_query.data_classes.Lineage + ] = 
parser.get_upstream_tables( table=table, reporter=self.__reporter, platform_instance_resolver=self.__dataplatform_instance_resolver, @@ -1294,7 +1300,7 @@ def get_allowed_workspaces(self) -> List[powerbi_data_classes.Workspace]: def validate_dataset_type_mapping(self): powerbi_data_platforms: List[str] = [ data_platform.value.powerbi_data_platform_name - for data_platform in resolver.SupportedDataPlatform + for data_platform in SupportedDataPlatform ] for key in self.source_config.dataset_type_mapping.keys(): @@ -1481,7 +1487,7 @@ def _get_dashboard_patch_work_unit( def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: # As modified_workspaces is not idempotent, hence workunit processors are run later for each workspace_id - # This will result in creating checkpoint for each workspace_id + # This will result in creating a checkpoint for each workspace_id if self.source_config.modified_since: return [] # Handle these in get_workunits_internal else: @@ -1492,7 +1498,7 @@ def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: """ - Datahub Ingestion framework invoke this method + Datahub Ingestion framework invokes this method """ logger.info("PowerBi plugin execution is started") # Validate dataset type mapping diff --git a/metadata-ingestion/src/datahub/ingestion/source/qlik_sense/data_classes.py b/metadata-ingestion/src/datahub/ingestion/source/qlik_sense/data_classes.py index 672fcbceb0603..a43f5f32493f2 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/qlik_sense/data_classes.py +++ b/metadata-ingestion/src/datahub/ingestion/source/qlik_sense/data_classes.py @@ -15,6 +15,7 @@ TimeType, ) +# TODO: Replace with standardized types in sql_types.py FIELD_TYPE_MAPPING: Dict[ str, Type[ diff --git a/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift.py index 4bc4c1451c262..06cbb7fbae27c 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift.py +++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift.py @@ -222,6 +222,7 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource): ``` """ + # TODO: Replace with standardized types in sql_types.py REDSHIFT_FIELD_TYPE_MAPPINGS: Dict[ str, Type[ diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py index d4442749a0622..2bd8e8017f549 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py @@ -103,6 +103,7 @@ logger = logging.getLogger(__name__) # https://docs.snowflake.com/en/sql-reference/intro-summary-data-types.html +# TODO: Move to the standardized types in sql_types.py SNOWFLAKE_FIELD_TYPE_MAPPINGS = { "DATE": DateType, "BIGINT": NumberType, diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/athena.py b/metadata-ingestion/src/datahub/ingestion/source/sql/athena.py index 71cfd0268ee6b..6f7decc79b1df 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/athena.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/athena.py @@ -26,6 +26,7 @@ platform_name, support_status, ) +from datahub.ingestion.api.source import StructuredLogLevel from datahub.ingestion.api.workunit import MetadataWorkUnit from 
datahub.ingestion.source.aws.s3_util import make_s3_urn from datahub.ingestion.source.common.subtypes import DatasetContainerSubTypes @@ -35,6 +36,7 @@ register_custom_type, ) from datahub.ingestion.source.sql.sql_config import SQLCommonConfig, make_sqlalchemy_uri +from datahub.ingestion.source.sql.sql_report import SQLSourceReport from datahub.ingestion.source.sql.sql_utils import ( add_table_to_schema_container, gen_database_container, @@ -48,6 +50,15 @@ get_schema_fields_for_sqlalchemy_column, ) +try: + from typing_extensions import override +except ImportError: + _F = typing.TypeVar("_F", bound=typing.Callable[..., typing.Any]) + + def override(f: _F, /) -> _F: # noqa: F811 + return f + + logger = logging.getLogger(__name__) assert STRUCT, "required type modules are not available" @@ -322,12 +333,15 @@ class AthenaSource(SQLAlchemySource): - Profiling when enabled. """ - table_partition_cache: Dict[str, Dict[str, Partitionitem]] = {} + config: AthenaConfig + report: SQLSourceReport def __init__(self, config, ctx): super().__init__(config, ctx, "athena") self.cursor: Optional[BaseCursor] = None + self.table_partition_cache: Dict[str, Dict[str, Partitionitem]] = {} + @classmethod def create(cls, config_dict, ctx): config = AthenaConfig.parse_obj(config_dict) @@ -452,6 +466,7 @@ def add_table_to_schema_container( ) # It seems like database/schema filter in the connection string does not work and this to work around that + @override def get_schema_names(self, inspector: Inspector) -> List[str]: athena_config = typing.cast(AthenaConfig, self.config) schemas = inspector.get_schema_names() @@ -459,34 +474,42 @@ def get_schema_names(self, inspector: Inspector) -> List[str]: return [schema for schema in schemas if schema == athena_config.database] return schemas - # Overwrite to get partitions + @classmethod + def _casted_partition_key(cls, key: str) -> str: + # We need to cast the partition keys to a VARCHAR, since otherwise + # Athena may throw an error during concatenation / comparison. 
+ return f"CAST({key} as VARCHAR)" + + @override def get_partitions( self, inspector: Inspector, schema: str, table: str - ) -> List[str]: - partitions = [] - - athena_config = typing.cast(AthenaConfig, self.config) - - if not athena_config.extract_partitions: - return [] + ) -> Optional[List[str]]: + if not self.config.extract_partitions: + return None if not self.cursor: - return [] + return None metadata: AthenaTableMetadata = self.cursor.get_table_metadata( table_name=table, schema_name=schema ) - if metadata.partition_keys: - for key in metadata.partition_keys: - if key.name: - partitions.append(key.name) - - if not partitions: - return [] + partitions = [] + for key in metadata.partition_keys: + if key.name: + partitions.append(key.name) + if not partitions: + return [] - # We create an artiificaial concatenated partition key to be able to query max partition easier - part_concat = "|| '-' ||".join(partitions) + with self.report.report_exc( + message="Failed to extract partition details", + context=f"{schema}.{table}", + level=StructuredLogLevel.WARN, + ): + # We create an artifical concatenated partition key to be able to query max partition easier + part_concat = " || '-' || ".join( + self._casted_partition_key(key) for key in partitions + ) max_partition_query = f'select {",".join(partitions)} from "{schema}"."{table}$partitions" where {part_concat} = (select max({part_concat}) from "{schema}"."{table}$partitions")' ret = self.cursor.execute(max_partition_query) max_partition: Dict[str, str] = {} @@ -500,9 +523,8 @@ def get_partitions( partitions=partitions, max_partition=max_partition, ) - return partitions - return [] + return partitions # Overwrite to modify the creation of schema fields def get_schema_fields_for_column( @@ -551,7 +573,9 @@ def generate_partition_profiler_query( if partition and partition.max_partition: max_partition_filters = [] for key, value in partition.max_partition.items(): - max_partition_filters.append(f"CAST({key} as VARCHAR) = '{value}'") + max_partition_filters.append( + f"{self._casted_partition_key(key)} = '{value}'" + ) max_partition = str(partition.max_partition) return ( max_partition, diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_types.py b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_types.py index 8ea4209784063..89ca160ba1f48 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_types.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_types.py @@ -1,5 +1,5 @@ import re -from typing import Any, Dict, ValuesView +from typing import Any, Dict, Optional, Type, Union, ValuesView from datahub.metadata.com.linkedin.pegasus2avro.schema import ( ArrayType, @@ -16,14 +16,28 @@ UnionType, ) -# these can be obtained by running `select format_type(oid, null),* from pg_type;` -# we've omitted the types without a meaningful DataHub type (e.g. postgres-specific types, index vectors, etc.) 
-# (run `\copy (select format_type(oid, null),* from pg_type) to 'pg_type.csv' csv header;` to get a CSV) +DATAHUB_FIELD_TYPE = Union[ + ArrayType, + BooleanType, + BytesType, + DateType, + EnumType, + MapType, + NullType, + NumberType, + RecordType, + StringType, + TimeType, + UnionType, +] -# we map from format_type since this is what dbt uses -# see https://github.com/fishtown-analytics/dbt/blob/master/plugins/postgres/dbt/include/postgres/macros/catalog.sql#L22 -# see https://www.npgsql.org/dev/types.html for helpful type annotations +# These can be obtained by running `select format_type(oid, null),* from pg_type;` +# We've omitted the types without a meaningful DataHub type (e.g. postgres-specific types, index vectors, etc.) +# (run `\copy (select format_type(oid, null),* from pg_type) to 'pg_type.csv' csv header;` to get a CSV) +# We map from format_type since this is what dbt uses. +# See https://github.com/fishtown-analytics/dbt/blob/master/plugins/postgres/dbt/include/postgres/macros/catalog.sql#L22 +# See https://www.npgsql.org/dev/types.html for helpful type annotations POSTGRES_TYPES_MAP: Dict[str, Any] = { "boolean": BooleanType, "bytea": BytesType, @@ -430,3 +444,54 @@ def resolve_vertica_modified_type(type_string: str) -> Any: "geography": None, "uuid": StringType, } + + +_merged_mapping = { + "boolean": BooleanType, + "date": DateType, + "time": TimeType, + "numeric": NumberType, + "text": StringType, + "timestamp with time zone": DateType, + "timestamp without time zone": DateType, + "integer": NumberType, + "float8": NumberType, + "struct": RecordType, + **POSTGRES_TYPES_MAP, + **SNOWFLAKE_TYPES_MAP, + **BIGQUERY_TYPES_MAP, + **SPARK_SQL_TYPES_MAP, + **TRINO_SQL_TYPES_MAP, + **ATHENA_SQL_TYPES_MAP, + **VERTICA_SQL_TYPES_MAP, +} + + +def resolve_sql_type( + column_type: Optional[str], + platform: Optional[str] = None, +) -> Optional[DATAHUB_FIELD_TYPE]: + # In theory, we should use the platform-specific mapping where available. + # However, the types don't ever conflict, so the merged mapping is fine. + TypeClass: Optional[Type[DATAHUB_FIELD_TYPE]] = ( + _merged_mapping.get(column_type) if column_type else None + ) + + if TypeClass is None and column_type: + # resolve a modified type + if platform == "trino": + TypeClass = resolve_trino_modified_type(column_type) + elif platform == "athena": + TypeClass = resolve_athena_modified_type(column_type) + elif platform == "postgres" or platform == "redshift": + # Redshift uses a variant of Postgres, so we can use the same logic. + TypeClass = resolve_postgres_modified_type(column_type) + elif platform == "vertica": + TypeClass = resolve_vertica_modified_type(column_type) + elif platform == "snowflake": + # Snowflake types are uppercase, so we check that. 
+ TypeClass = _merged_mapping.get(column_type.upper()) + + if TypeClass: + return TypeClass() + return None diff --git a/metadata-ingestion/src/datahub/ingestion/source/unity/proxy_types.py b/metadata-ingestion/src/datahub/ingestion/source/unity/proxy_types.py index f84f6c1b0c08d..9c5752c518df1 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/unity/proxy_types.py +++ b/metadata-ingestion/src/datahub/ingestion/source/unity/proxy_types.py @@ -33,6 +33,7 @@ logger = logging.getLogger(__name__) +# TODO: (maybe) Replace with standardized types in sql_types.py DATA_TYPE_REGISTRY: dict = { ColumnTypeName.BOOLEAN: BooleanTypeClass, ColumnTypeName.BYTE: BytesTypeClass, diff --git a/metadata-ingestion/src/datahub/telemetry/telemetry.py b/metadata-ingestion/src/datahub/telemetry/telemetry.py index 4faf04ee2d2c7..22b2cb6a101af 100644 --- a/metadata-ingestion/src/datahub/telemetry/telemetry.py +++ b/metadata-ingestion/src/datahub/telemetry/telemetry.py @@ -7,7 +7,7 @@ import uuid from functools import wraps from pathlib import Path -from typing import Any, Callable, Dict, List, Optional, TypeVar +from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, TypeVar from mixpanel import Consumer, Mixpanel from typing_extensions import ParamSpec @@ -16,10 +16,12 @@ from datahub.cli.config_utils import DATAHUB_ROOT_FOLDER from datahub.cli.env_utils import get_boolean_env_variable from datahub.configuration.common import ExceptionWithProps -from datahub.ingestion.graph.client import DataHubGraph from datahub.metadata.schema_classes import _custom_package_path from datahub.utilities.perf_timer import PerfTimer +if TYPE_CHECKING: + from datahub.ingestion.graph.client import DataHubGraph + logger = logging.getLogger(__name__) DATAHUB_FOLDER = Path(DATAHUB_ROOT_FOLDER) @@ -117,7 +119,11 @@ class Telemetry: tracking_init: bool = False sentry_enabled: bool = False + context_properties: Dict[str, Any] = {} + def __init__(self): + self.context_properties = {} + if SENTRY_DSN: self.sentry_enabled = True try: @@ -157,6 +163,9 @@ def __init__(self): except Exception as e: logger.debug(f"Error connecting to mixpanel: {e}") + # Initialize the default properties for all events. + self.set_context() + def update_config(self) -> bool: """ Update the config file with the current client ID and enabled status. @@ -238,18 +247,22 @@ def load_config(self) -> bool: return False - def update_capture_exception_context( + def set_context( self, - server: Optional[DataHubGraph] = None, + server: Optional["DataHubGraph"] = None, properties: Optional[Dict[str, Any]] = None, ) -> None: + self.context_properties = { + **self._server_props(server), + **(properties or {}), + } + if self.sentry_enabled: from sentry_sdk import set_tag properties = { **_default_telemetry_properties(), - **self._server_props(server), - **(properties or {}), + **self.context_properties, } for key in properties: @@ -297,7 +310,6 @@ def ping( self, event_name: str, properties: Optional[Dict[str, Any]] = None, - server: Optional[DataHubGraph] = None, ) -> None: """ Send a single telemetry event. 
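The telemetry changes around this point replace the per-call `server` argument with a one-time context. A hedged sketch of the resulting call pattern (editorial illustration, not part of the diff; the property values are arbitrary):

# Illustrative sketch only: set the context once, then later ping() calls merge in
# self.context_properties instead of taking a `server` argument.
from datahub.telemetry.telemetry import telemetry_instance

# Pass a connected DataHubGraph when available; None falls back to the "n/a" server props.
telemetry_instance.set_context(server=None, properties={"pipeline_name": "example"})
telemetry_instance.ping("example-event", {"status": "completed"})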
@@ -323,14 +335,15 @@ def ping( properties = { **_default_telemetry_properties(), - **self._server_props(server), + **self.context_properties, **properties, } self.mp.track(self.client_id, event_name, properties) except Exception as e: logger.debug(f"Error reporting telemetry: {e}") - def _server_props(self, server: Optional[DataHubGraph]) -> Dict[str, str]: + @classmethod + def _server_props(cls, server: Optional["DataHubGraph"]) -> Dict[str, str]: if not server: return { "server_type": "n/a", @@ -435,6 +448,7 @@ def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> _T: **call_props, "status": "error", **_error_props(e), + "code": e.code, }, ) telemetry_instance.capture_exception(e) diff --git a/metadata-ingestion/src/datahub/utilities/urn_encoder.py b/metadata-ingestion/src/datahub/utilities/urn_encoder.py index 88c0a128b8e46..4f19eeff3e70f 100644 --- a/metadata-ingestion/src/datahub/utilities/urn_encoder.py +++ b/metadata-ingestion/src/datahub/utilities/urn_encoder.py @@ -4,7 +4,8 @@ # NOTE: Frontend relies on encoding these three characters. Specifically, we decode and encode schema fields for column level lineage. # If this changes, make appropriate changes to datahub-web-react/src/app/lineage/utils/columnLineageUtils.ts # We also rely on encoding these exact three characters when generating schemaField urns in our graphQL layer. Update SchemaFieldUtils if this changes. -RESERVED_CHARS = {",", "(", ")"} +# Also see https://datahubproject.io/docs/what/urn/#restrictions +RESERVED_CHARS = {",", "(", ")", "␟"} RESERVED_CHARS_EXTENDED = RESERVED_CHARS.union({"%"}) diff --git a/metadata-ingestion/tests/integration/dbt/test_dbt.py b/metadata-ingestion/tests/integration/dbt/test_dbt.py index 390d8d7698dd4..c6a3dc4fd590b 100644 --- a/metadata-ingestion/tests/integration/dbt/test_dbt.py +++ b/metadata-ingestion/tests/integration/dbt/test_dbt.py @@ -11,12 +11,6 @@ from datahub.ingestion.run.pipeline_config import PipelineConfig, SourceConfig from datahub.ingestion.source.dbt.dbt_common import DBTEntitiesEnabled, EmitDirective from datahub.ingestion.source.dbt.dbt_core import DBTCoreConfig, DBTCoreSource -from datahub.ingestion.source.sql.sql_types import ( - ATHENA_SQL_TYPES_MAP, - TRINO_SQL_TYPES_MAP, - resolve_athena_modified_type, - resolve_trino_modified_type, -) from tests.test_helpers import mce_helpers, test_connection_helpers FROZEN_TIME = "2022-02-03 07:00:00" @@ -362,69 +356,6 @@ def test_dbt_tests(test_resources_dir, pytestconfig, tmp_path, mock_time, **kwar ) -@pytest.mark.parametrize( - "data_type, expected_data_type", - [ - ("boolean", "boolean"), - ("tinyint", "tinyint"), - ("smallint", "smallint"), - ("int", "int"), - ("integer", "integer"), - ("bigint", "bigint"), - ("real", "real"), - ("double", "double"), - ("decimal(10,0)", "decimal"), - ("varchar(20)", "varchar"), - ("char", "char"), - ("varbinary", "varbinary"), - ("json", "json"), - ("date", "date"), - ("time", "time"), - ("time(12)", "time"), - ("timestamp", "timestamp"), - ("timestamp(3)", "timestamp"), - ("row(x bigint, y double)", "row"), - ("array(row(x bigint, y double))", "array"), - ("map(varchar, varchar)", "map"), - ], -) -def test_resolve_trino_modified_type(data_type, expected_data_type): - assert ( - resolve_trino_modified_type(data_type) - == TRINO_SQL_TYPES_MAP[expected_data_type] - ) - - -@pytest.mark.parametrize( - "data_type, expected_data_type", - [ - ("boolean", "boolean"), - ("tinyint", "tinyint"), - ("smallint", "smallint"), - ("int", "int"), - ("integer", "integer"), - ("bigint", "bigint"), - ("float", 
"float"), - ("double", "double"), - ("decimal(10,0)", "decimal"), - ("varchar(20)", "varchar"), - ("char", "char"), - ("binary", "binary"), - ("date", "date"), - ("timestamp", "timestamp"), - ("timestamp(3)", "timestamp"), - ("struct", "struct"), - ("array>", "array"), - ("map", "map"), - ], -) -def test_resolve_athena_modified_type(data_type, expected_data_type): - assert ( - resolve_athena_modified_type(data_type) - == ATHENA_SQL_TYPES_MAP[expected_data_type] - ) - - @pytest.mark.integration @freeze_time(FROZEN_TIME) def test_dbt_tests_only_assertions( diff --git a/metadata-ingestion/tests/integration/feast/feast_repository_mces_golden.json b/metadata-ingestion/tests/integration/feast/feast_repository_mces_golden.json index 1b91925289845..a4fd9843c5cf4 100644 --- a/metadata-ingestion/tests/integration/feast/feast_repository_mces_golden.json +++ b/metadata-ingestion/tests/integration/feast/feast_repository_mces_golden.json @@ -9,8 +9,33 @@ "removed": false } }, + { + "com.linkedin.pegasus2avro.common.GlobalTags": { + "tags": [ + { + "tag": "urn:li:tag:deprecated" + } + ] + } + }, + { + "com.linkedin.pegasus2avro.common.Ownership": { + "owners": [ + { + "owner": "urn:li:corpGroup:MOCK_OWNER", + "type": "BUSINESS_OWNER" + } + ], + "ownerTypes": {}, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } + }, { "com.linkedin.pegasus2avro.ml.metadata.MLPrimaryKeyProperties": { + "customProperties": {}, "description": "Driver ID", "dataType": "ORDINAL", "sources": [ @@ -23,7 +48,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "feast-repository-test" + "runId": "feast-repository-test", + "lastRunId": "no-run-id-provided" } }, { @@ -36,8 +62,18 @@ "removed": false } }, + { + "com.linkedin.pegasus2avro.common.GlobalTags": { + "tags": [ + { + "tag": "urn:li:tag:needs_documentation" + } + ] + } + }, { "com.linkedin.pegasus2avro.ml.metadata.MLFeatureProperties": { + "customProperties": {}, "description": "Conv rate", "dataType": "CONTINUOUS", "sources": [ @@ -50,7 +86,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "feast-repository-test" + "runId": "feast-repository-test", + "lastRunId": "no-run-id-provided" } }, { @@ -65,6 +102,7 @@ }, { "com.linkedin.pegasus2avro.ml.metadata.MLFeatureProperties": { + "customProperties": {}, "description": "Acc rate", "dataType": "CONTINUOUS", "sources": [ @@ -77,7 +115,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "feast-repository-test" + "runId": "feast-repository-test", + "lastRunId": "no-run-id-provided" } }, { @@ -92,6 +131,7 @@ }, { "com.linkedin.pegasus2avro.ml.metadata.MLFeatureProperties": { + "customProperties": {}, "description": "Avg daily trips", "dataType": "ORDINAL", "sources": [ @@ -104,7 +144,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "feast-repository-test" + "runId": "feast-repository-test", + "lastRunId": "no-run-id-provided" } }, { @@ -119,6 +160,7 @@ }, { "com.linkedin.pegasus2avro.ml.metadata.MLFeatureProperties": { + "customProperties": {}, "description": "String feature", "dataType": "TEXT", "sources": [ @@ -131,7 +173,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "feast-repository-test" + "runId": "feast-repository-test", + "lastRunId": "no-run-id-provided" } }, { @@ -151,6 +194,30 @@ "removed": false } }, + { + "com.linkedin.pegasus2avro.common.GlobalTags": { + "tags": [ + { + "tag": "urn:li:tag:deprecated" + } + ] + } + }, + { + "com.linkedin.pegasus2avro.common.Ownership": { + "owners": [ + { + 
"owner": "urn:li:corpGroup:MOCK_OWNER", + "type": "BUSINESS_OWNER" + } + ], + "ownerTypes": {}, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } + }, { "com.linkedin.pegasus2avro.ml.metadata.MLFeatureTableProperties": { "customProperties": {}, @@ -170,7 +237,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "feast-repository-test" + "runId": "feast-repository-test", + "lastRunId": "no-run-id-provided" } }, { @@ -189,7 +257,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "feast-repository-test" + "runId": "feast-repository-test", + "lastRunId": "no-run-id-provided" } }, { @@ -204,6 +273,7 @@ }, { "com.linkedin.pegasus2avro.ml.metadata.MLFeatureProperties": { + "customProperties": {}, "dataType": "CONTINUOUS", "sources": [ "urn:li:dataset:(urn:li:dataPlatform:request,vals_to_add,PROD)", @@ -216,7 +286,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "feast-repository-test" + "runId": "feast-repository-test", + "lastRunId": "no-run-id-provided" } }, { @@ -231,6 +302,7 @@ }, { "com.linkedin.pegasus2avro.ml.metadata.MLFeatureProperties": { + "customProperties": {}, "dataType": "CONTINUOUS", "sources": [ "urn:li:dataset:(urn:li:dataPlatform:request,vals_to_add,PROD)", @@ -243,7 +315,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "feast-repository-test" + "runId": "feast-repository-test", + "lastRunId": "no-run-id-provided" } }, { @@ -278,7 +351,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "feast-repository-test" + "runId": "feast-repository-test", + "lastRunId": "no-run-id-provided" } }, { @@ -297,7 +371,40 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "feast-repository-test" + "runId": "feast-repository-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:deprecated", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "deprecated" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "feast-repository-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:needs_documentation", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "needs_documentation" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "feast-repository-test", + "lastRunId": "no-run-id-provided" } } ] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/feast/feature_store/data/registry.db b/metadata-ingestion/tests/integration/feast/feature_store/data/registry.db index a511ff56c9770..5dca29d92afe5 100644 Binary files a/metadata-ingestion/tests/integration/feast/feature_store/data/registry.db and b/metadata-ingestion/tests/integration/feast/feature_store/data/registry.db differ diff --git a/metadata-ingestion/tests/integration/feast/feature_store/features.py b/metadata-ingestion/tests/integration/feast/feature_store/features.py index a6e6cd3616e92..dcfd417637958 100644 --- a/metadata-ingestion/tests/integration/feast/feature_store/features.py +++ b/metadata-ingestion/tests/integration/feast/feature_store/features.py @@ -19,6 +19,8 @@ join_keys=["driver_id"], value_type=ValueType.INT64, description="Driver ID", + owner="MOCK_OWNER", + tags={"name": "deprecated"}, ) driver_hourly_stats_view = FeatureView( @@ -29,7 +31,7 @@ Field( name="conv_rate", dtype=feast.types.Float64, - tags=dict(description="Conv rate"), + tags={"name": 
"needs_documentation", "description": "Conv rate"}, ), Field( name="acc_rate", @@ -49,7 +51,8 @@ ], online=True, source=driver_hourly_stats_source, - tags={}, + tags={"name": "deprecated"}, + owner="MOCK_OWNER", ) input_request = RequestSource( diff --git a/metadata-ingestion/tests/integration/feast/test_feast_repository.py b/metadata-ingestion/tests/integration/feast/test_feast_repository.py index a6bdce6722289..7f04337145dc3 100644 --- a/metadata-ingestion/tests/integration/feast/test_feast_repository.py +++ b/metadata-ingestion/tests/integration/feast/test_feast_repository.py @@ -19,6 +19,15 @@ def test_feast_repository_ingest(pytestconfig, tmp_path, mock_time): "config": { "path": str(test_resources_dir / "feature_store"), "environment": "PROD", + "enable_tag_extraction": True, + "enable_owner_extraction": True, + "owner_mappings": [ + { + "feast_owner_name": "MOCK_OWNER", + "datahub_owner_urn": "urn:li:corpGroup:MOCK_OWNER", + "datahub_ownership_type": "BUSINESS_OWNER", + } + ], }, }, "sink": { diff --git a/metadata-ingestion/tests/integration/kafka/test_kafka.py b/metadata-ingestion/tests/integration/kafka/test_kafka.py index 597889c8440b7..7462f177684b7 100644 --- a/metadata-ingestion/tests/integration/kafka/test_kafka.py +++ b/metadata-ingestion/tests/integration/kafka/test_kafka.py @@ -128,11 +128,32 @@ def test_kafka_oauth_callback( pipeline.run() - is_found: bool = False + # Initialize flags to track oauth events + checks = { + "consumer_polling": False, + "consumer_oauth_callback": False, + "admin_polling": False, + "admin_oauth_callback": False, + } + + # Read log file and check for oauth events with open(log_file, "r") as file: - for line_number, line in enumerate(file, 1): + for line in file: + # Check for polling events + if "Initiating polling for kafka admin client" in line: + checks["admin_polling"] = True + elif "Initiating polling for kafka consumer" in line: + checks["consumer_polling"] = True + + # Check for oauth callbacks if oauth.MESSAGE in line: - is_found = True - break - - assert is_found + if checks["consumer_polling"] and not checks["admin_polling"]: + checks["consumer_oauth_callback"] = True + elif checks["consumer_polling"] and checks["admin_polling"]: + checks["admin_oauth_callback"] = True + + # Verify all oauth events occurred + assert checks["consumer_polling"], "Consumer polling was not initiated" + assert checks["consumer_oauth_callback"], "Consumer oauth callback not found" + assert checks["admin_polling"], "Admin polling was not initiated" + assert checks["admin_oauth_callback"], "Admin oauth callback not found" diff --git a/metadata-ingestion/tests/integration/powerbi/test_m_parser.py b/metadata-ingestion/tests/integration/powerbi/test_m_parser.py index f22998b47b900..63821f9038a88 100644 --- a/metadata-ingestion/tests/integration/powerbi/test_m_parser.py +++ b/metadata-ingestion/tests/integration/powerbi/test_m_parser.py @@ -7,6 +7,7 @@ import pytest from lark import Tree +import datahub.ingestion.source.powerbi.m_query.data_classes import datahub.ingestion.source.powerbi.rest_api_wrapper.data_classes as powerbi_data_classes from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.api.source import StructuredLogLevel @@ -18,8 +19,11 @@ AbstractDataPlatformInstanceResolver, create_dataplatform_instance_resolver, ) -from datahub.ingestion.source.powerbi.m_query import parser, resolver, tree_function -from datahub.ingestion.source.powerbi.m_query.resolver import DataPlatformTable, Lineage +from 
datahub.ingestion.source.powerbi.m_query import parser, tree_function +from datahub.ingestion.source.powerbi.m_query.data_classes import ( + DataPlatformTable, + Lineage, +) pytestmark = pytest.mark.integration_batch_2 @@ -62,7 +66,9 @@ ] -def get_data_platform_tables_with_dummy_table(q: str) -> List[resolver.Lineage]: +def get_data_platform_tables_with_dummy_table( + q: str, +) -> List[datahub.ingestion.source.powerbi.m_query.data_classes.Lineage]: table: powerbi_data_classes.Table = powerbi_data_classes.Table( columns=[], measures=[], @@ -759,7 +765,9 @@ def test_sqlglot_parser(): } ) - lineage: List[resolver.Lineage] = parser.get_upstream_tables( + lineage: List[ + datahub.ingestion.source.powerbi.m_query.data_classes.Lineage + ] = parser.get_upstream_tables( table, reporter, ctx=ctx, @@ -806,7 +814,9 @@ def test_sqlglot_parser(): def test_databricks_multi_cloud(): q = M_QUERIES[25] - lineage: List[resolver.Lineage] = get_data_platform_tables_with_dummy_table(q=q) + lineage: List[ + datahub.ingestion.source.powerbi.m_query.data_classes.Lineage + ] = get_data_platform_tables_with_dummy_table(q=q) assert len(lineage) == 1 @@ -823,7 +833,9 @@ def test_databricks_multi_cloud(): def test_databricks_catalog_pattern_1(): q = M_QUERIES[26] - lineage: List[resolver.Lineage] = get_data_platform_tables_with_dummy_table(q=q) + lineage: List[ + datahub.ingestion.source.powerbi.m_query.data_classes.Lineage + ] = get_data_platform_tables_with_dummy_table(q=q) assert len(lineage) == 1 @@ -892,7 +904,9 @@ def test_sqlglot_parser_2(): } ) - lineage: List[resolver.Lineage] = parser.get_upstream_tables( + lineage: List[ + datahub.ingestion.source.powerbi.m_query.data_classes.Lineage + ] = parser.get_upstream_tables( table, reporter, ctx=ctx, @@ -951,7 +965,9 @@ def test_databricks_regular_case_with_view(): def test_snowflake_double_double_quotes(): q = M_QUERIES[30] - lineage: List[resolver.Lineage] = get_data_platform_tables_with_dummy_table(q=q) + lineage: List[ + datahub.ingestion.source.powerbi.m_query.data_classes.Lineage + ] = get_data_platform_tables_with_dummy_table(q=q) assert len(lineage) == 1 @@ -968,7 +984,9 @@ def test_snowflake_double_double_quotes(): def test_databricks_multicloud(): q = M_QUERIES[31] - lineage: List[resolver.Lineage] = get_data_platform_tables_with_dummy_table(q=q) + lineage: List[ + datahub.ingestion.source.powerbi.m_query.data_classes.Lineage + ] = get_data_platform_tables_with_dummy_table(q=q) assert len(lineage) == 1 @@ -985,7 +1003,9 @@ def test_databricks_multicloud(): def test_snowflake_multi_function_call(): q = M_QUERIES[32] - lineage: List[resolver.Lineage] = get_data_platform_tables_with_dummy_table(q=q) + lineage: List[ + datahub.ingestion.source.powerbi.m_query.data_classes.Lineage + ] = get_data_platform_tables_with_dummy_table(q=q) assert len(lineage) == 1 @@ -1002,7 +1022,9 @@ def test_snowflake_multi_function_call(): def test_mssql_drop_with_select(): q = M_QUERIES[33] - lineage: List[resolver.Lineage] = get_data_platform_tables_with_dummy_table(q=q) + lineage: List[ + datahub.ingestion.source.powerbi.m_query.data_classes.Lineage + ] = get_data_platform_tables_with_dummy_table(q=q) assert len(lineage) == 1 @@ -1062,7 +1084,9 @@ def test_empty_string_in_m_query(): # TRIM(TRIM(TRIM(AGENT_NAME, '\"\"'), '+'), '\\'') is in Query q = "let\n Source = Value.NativeQuery(Snowflake.Databases(\"bu10758.ap-unknown-2.fakecomputing.com\",\"operations_analytics_warehouse_prod\",[Role=\"OPERATIONS_ANALYTICS_MEMBER\"]){[Name=\"OPERATIONS_ANALYTICS\"]}[Data], \"select 
#(lf)UPPER(REPLACE(AGENT_NAME,'-','')) AS CLIENT_DIRECTOR,#(lf)TRIM(TRIM(TRIM(AGENT_NAME, '\"\"'), '+'), '\\'') AS TRIM_AGENT_NAME,#(lf)TIER,#(lf)UPPER(MANAGER),#(lf)TEAM_TYPE,#(lf)DATE_TARGET,#(lf)MONTHID,#(lf)TARGET_TEAM,#(lf)SELLER_EMAIL,#(lf)concat((UPPER(REPLACE(AGENT_NAME,'-',''))), MONTHID) as AGENT_KEY,#(lf)UNIT_TARGET AS SME_Quota,#(lf)AMV_TARGET AS Revenue_Quota,#(lf)SERVICE_QUOTA,#(lf)BL_TARGET,#(lf)SOFTWARE_QUOTA as Software_Quota#(lf)#(lf)from OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_SME_UNIT_TARGETS inner join OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_SME_UNIT #(lf)#(lf)where YEAR_TARGET >= 2022#(lf)and TEAM_TYPE = 'Accounting'#(lf)and TARGET_TEAM = 'Enterprise'#(lf)AND TIER = 'Client Director'\", null, [EnableFolding=true])\nin\n Source" - lineage: List[resolver.Lineage] = get_data_platform_tables_with_dummy_table(q=q) + lineage: List[ + datahub.ingestion.source.powerbi.m_query.data_classes.Lineage + ] = get_data_platform_tables_with_dummy_table(q=q) assert len(lineage) == 1 @@ -1084,7 +1108,9 @@ def test_double_quotes_in_alias(): # SELECT CAST(sales_date AS DATE) AS \"\"Date\"\" in query q = 'let \n Source = Sql.Database("abc.com", "DB", [Query="SELECT CAST(sales_date AS DATE) AS ""Date"",#(lf) SUM(cshintrpret) / 60.0 AS ""Total Order All Items"",#(lf)#(tab)#(tab)#(tab) SUM(cshintrpret) / 60.0 - LAG(SUM(cshintrpret) / 60.0, 1) OVER (ORDER BY CAST(sales_date AS DATE)) AS ""Total minute difference"",#(lf)#(tab)#(tab)#(tab) SUM(sale_price) / 60.0 - LAG(SUM(sale_price) / 60.0, 1) OVER (ORDER BY CAST(sales_date AS DATE)) AS ""Normal minute difference""#(lf) FROM [DB].[dbo].[sales_t]#(lf) WHERE sales_date >= GETDATE() - 365#(lf) GROUP BY CAST(sales_date AS DATE),#(lf)#(tab)#(tab)CAST(sales_date AS TIME);"]) \n in \n Source' - lineage: List[resolver.Lineage] = get_data_platform_tables_with_dummy_table(q=q) + lineage: List[ + datahub.ingestion.source.powerbi.m_query.data_classes.Lineage + ] = get_data_platform_tables_with_dummy_table(q=q) assert len(lineage) == 1 diff --git a/metadata-ingestion/tests/unit/test_athena_source.py b/metadata-ingestion/tests/unit/test_athena_source.py index 875cf3800daf8..f8b6220d18273 100644 --- a/metadata-ingestion/tests/unit/test_athena_source.py +++ b/metadata-ingestion/tests/unit/test_athena_source.py @@ -93,7 +93,8 @@ def test_athena_get_table_properties(): "CreateTime": datetime.now(), "LastAccessTime": datetime.now(), "PartitionKeys": [ - {"Name": "testKey", "Type": "string", "Comment": "testComment"} + {"Name": "year", "Type": "string", "Comment": "testComment"}, + {"Name": "month", "Type": "string", "Comment": "testComment"}, ], "Parameters": { "comment": "testComment", @@ -112,8 +113,18 @@ def test_athena_get_table_properties(): response=table_metadata ) + # Mock partition query results + mock_cursor.execute.return_value.description = [ + ["year"], + ["month"], + ] + mock_cursor.execute.return_value.__iter__.return_value = [["2023", "12"]] + ctx = PipelineContext(run_id="test") source = AthenaSource(config=config, ctx=ctx) + source.cursor = mock_cursor + + # Test table properties description, custom_properties, location = source.get_table_properties( inspector=mock_inspector, table=table, schema=schema ) @@ -124,13 +135,35 @@ def test_athena_get_table_properties(): "last_access_time": "2020-04-14 07:00:00", "location": "s3://testLocation", "outputformat": "testOutputFormat", - "partition_keys": '[{"name": "testKey", "type": "string", "comment": "testComment"}]', + "partition_keys": '[{"name": "year", "type": "string", "comment": "testComment"}, 
{"name": "month", "type": "string", "comment": "testComment"}]', "serde.serialization.lib": "testSerde", "table_type": "testType", } - assert location == make_s3_urn("s3://testLocation", "PROD") + # Test partition functionality + partitions = source.get_partitions( + inspector=mock_inspector, schema=schema, table=table + ) + assert partitions == ["year", "month"] + + # Verify the correct SQL query was generated for partitions + expected_query = """\ +select year,month from "test_schema"."test_table$partitions" \ +where CAST(year as VARCHAR) || '-' || CAST(month as VARCHAR) = \ +(select max(CAST(year as VARCHAR) || '-' || CAST(month as VARCHAR)) \ +from "test_schema"."test_table$partitions")""" + mock_cursor.execute.assert_called_once() + actual_query = mock_cursor.execute.call_args[0][0] + assert actual_query == expected_query + + # Verify partition cache was populated correctly + assert source.table_partition_cache[schema][table].partitions == partitions + assert source.table_partition_cache[schema][table].max_partition == { + "year": "2023", + "month": "12", + } + def test_get_column_type_simple_types(): assert isinstance( @@ -214,3 +247,9 @@ def test_column_type_complex_combination(): assert isinstance( result._STRUCT_fields[2][1].item_type._STRUCT_fields[1][1], types.String ) + + +def test_casted_partition_key(): + from datahub.ingestion.source.sql.athena import AthenaSource + + assert AthenaSource._casted_partition_key("test_col") == "CAST(test_col as VARCHAR)" diff --git a/metadata-ingestion/tests/unit/test_powerbi_parser.py b/metadata-ingestion/tests/unit/test_powerbi_parser.py index 31579f0c0abd3..a487a3a5b87f8 100644 --- a/metadata-ingestion/tests/unit/test_powerbi_parser.py +++ b/metadata-ingestion/tests/unit/test_powerbi_parser.py @@ -8,9 +8,7 @@ from datahub.ingestion.source.powerbi.dataplatform_instance_resolver import ( ResolvePlatformInstanceFromDatasetTypeMapping, ) -from datahub.ingestion.source.powerbi.m_query.resolver import ( - MSSqlDataPlatformTableCreator, -) +from datahub.ingestion.source.powerbi.m_query.pattern_handler import MSSqlLineage from datahub.ingestion.source.powerbi.rest_api_wrapper.data_classes import Table @@ -27,7 +25,7 @@ def creator(): full_name="db.schema.test_table", ) - return MSSqlDataPlatformTableCreator( + return MSSqlLineage( ctx=PipelineContext(run_id="test-run-id"), table=table, reporter=PowerBiDashboardSourceReport(), diff --git a/metadata-ingestion/tests/unit/test_sql_types.py b/metadata-ingestion/tests/unit/test_sql_types.py new file mode 100644 index 0000000000000..ebe5ade115cdd --- /dev/null +++ b/metadata-ingestion/tests/unit/test_sql_types.py @@ -0,0 +1,78 @@ +import pytest + +from datahub.ingestion.source.sql.sql_types import ( + ATHENA_SQL_TYPES_MAP, + TRINO_SQL_TYPES_MAP, + resolve_athena_modified_type, + resolve_sql_type, + resolve_trino_modified_type, +) +from datahub.metadata.schema_classes import BooleanTypeClass, StringTypeClass + + +@pytest.mark.parametrize( + "data_type, expected_data_type", + [ + ("boolean", "boolean"), + ("tinyint", "tinyint"), + ("smallint", "smallint"), + ("int", "int"), + ("integer", "integer"), + ("bigint", "bigint"), + ("real", "real"), + ("double", "double"), + ("decimal(10,0)", "decimal"), + ("varchar(20)", "varchar"), + ("char", "char"), + ("varbinary", "varbinary"), + ("json", "json"), + ("date", "date"), + ("time", "time"), + ("time(12)", "time"), + ("timestamp", "timestamp"), + ("timestamp(3)", "timestamp"), + ("row(x bigint, y double)", "row"), + ("array(row(x bigint, y double))", "array"), + 
("map(varchar, varchar)", "map"), + ], +) +def test_resolve_trino_modified_type(data_type, expected_data_type): + assert ( + resolve_trino_modified_type(data_type) + == TRINO_SQL_TYPES_MAP[expected_data_type] + ) + + +@pytest.mark.parametrize( + "data_type, expected_data_type", + [ + ("boolean", "boolean"), + ("tinyint", "tinyint"), + ("smallint", "smallint"), + ("int", "int"), + ("integer", "integer"), + ("bigint", "bigint"), + ("float", "float"), + ("double", "double"), + ("decimal(10,0)", "decimal"), + ("varchar(20)", "varchar"), + ("char", "char"), + ("binary", "binary"), + ("date", "date"), + ("timestamp", "timestamp"), + ("timestamp(3)", "timestamp"), + ("struct", "struct"), + ("array>", "array"), + ("map", "map"), + ], +) +def test_resolve_athena_modified_type(data_type, expected_data_type): + assert ( + resolve_athena_modified_type(data_type) + == ATHENA_SQL_TYPES_MAP[expected_data_type] + ) + + +def test_resolve_sql_type() -> None: + assert resolve_sql_type("boolean") == BooleanTypeClass() + assert resolve_sql_type("varchar") == StringTypeClass() diff --git a/metadata-ingestion/tests/unit/urns/test_urn.py b/metadata-ingestion/tests/unit/urns/test_urn.py index 1bf48082fec8c..73badb3d1b423 100644 --- a/metadata-ingestion/tests/unit/urns/test_urn.py +++ b/metadata-ingestion/tests/unit/urns/test_urn.py @@ -1,6 +1,12 @@ import pytest -from datahub.metadata.urns import DatasetUrn, Urn +from datahub.metadata.urns import ( + CorpUserUrn, + DashboardUrn, + DataPlatformUrn, + DatasetUrn, + Urn, +) from datahub.utilities.urns.error import InvalidUrnError pytestmark = pytest.mark.filterwarnings("ignore::DeprecationWarning") @@ -36,20 +42,51 @@ def test_url_encode_urn() -> None: def test_invalid_urn() -> None: with pytest.raises(InvalidUrnError): - Urn.create_from_string("urn:li:abc") + Urn.from_string("urn:li:abc") with pytest.raises(InvalidUrnError): - Urn.create_from_string("urn:li:abc:") + Urn.from_string("urn:li:abc:") with pytest.raises(InvalidUrnError): - Urn.create_from_string("urn:li:abc:()") + Urn.from_string("urn:li:abc:()") with pytest.raises(InvalidUrnError): - Urn.create_from_string("urn:li:abc:(abc,)") + Urn.from_string("urn:li:abc:(abc,)") + + with pytest.raises(InvalidUrnError): + Urn.from_string("urn:li:corpuser:abc)") + + +def test_urn_colon() -> None: + # Colon characters are valid in urns, and should not mess up parsing. + + urn = Urn.from_string( + "urn:li:dashboard:(looker,dashboards.thelook::customer_lookup)" + ) + assert isinstance(urn, DashboardUrn) + + assert DataPlatformUrn.from_string("urn:li:dataPlatform:abc:def") + assert DatasetUrn.from_string( + "urn:li:dataset:(urn:li:dataPlatform:abc:def,table_name,PROD)" + ) + assert Urn.from_string("urn:li:corpuser:foo:bar@example.com") + + # I'm not sure why you'd ever want this, but technically it's a valid urn. 
+ urn = Urn.from_string("urn:li:corpuser::") + assert isinstance(urn, CorpUserUrn) + assert urn.username == ":" + assert urn == CorpUserUrn(":") + + +def test_urn_coercion() -> None: + urn = CorpUserUrn("foo␟bar") + assert urn.urn() == "urn:li:corpuser:foo%E2%90%9Fbar" + + assert urn == Urn.from_string(urn.urn()) def test_urn_type_dispatch() -> None: - urn = Urn.from_string("urn:li:dataset:(urn:li:dataPlatform:abc,def,prod)") + urn = Urn.from_string("urn:li:dataset:(urn:li:dataPlatform:abc,def,PROD)") assert isinstance(urn, DatasetUrn) with pytest.raises(InvalidUrnError, match="Passed an urn of type corpuser"): diff --git a/metadata-integration/java/datahub-protobuf/scripts/check_jar.sh b/metadata-integration/java/datahub-protobuf/scripts/check_jar.sh index bd0c28f0f8698..66c70f0b85769 100755 --- a/metadata-integration/java/datahub-protobuf/scripts/check_jar.sh +++ b/metadata-integration/java/datahub-protobuf/scripts/check_jar.sh @@ -44,7 +44,9 @@ jar -tvf $jarFile |\ grep -v "mime.types" |\ grep -v "com/ibm/.*" |\ grep -v "org/glassfish/" |\ - grep -v "LICENSE" + grep -v "LICENSE" |\ + grep -v "org/apache/avro" |\ + grep -v "org/apache" if [ $? -ne 0 ]; then echo "✅ No unexpected class paths found in ${jarFile}" diff --git a/metadata-integration/java/datahub-schematron/lib/build.gradle b/metadata-integration/java/datahub-schematron/lib/build.gradle index 83dec1039f7be..3ba22ff4cb7b5 100644 --- a/metadata-integration/java/datahub-schematron/lib/build.gradle +++ b/metadata-integration/java/datahub-schematron/lib/build.gradle @@ -45,10 +45,6 @@ jacocoTestReport { test.finalizedBy jacocoTestReport -task checkShadowJar(type: Exec) { - commandLine 'sh', '-c', 'scripts/check_jar.sh' -} - configurations { provided implementation.extendsFrom provided diff --git a/metadata-integration/java/datahub-schematron/lib/src/main/java/io/datahubproject/schematron/converters/avro/AvroSchemaConverter.java b/metadata-integration/java/datahub-schematron/lib/src/main/java/io/datahubproject/schematron/converters/avro/AvroSchemaConverter.java index c199f8e6dcb92..0ddb357db76ba 100644 --- a/metadata-integration/java/datahub-schematron/lib/src/main/java/io/datahubproject/schematron/converters/avro/AvroSchemaConverter.java +++ b/metadata-integration/java/datahub-schematron/lib/src/main/java/io/datahubproject/schematron/converters/avro/AvroSchemaConverter.java @@ -345,7 +345,8 @@ private void processArrayField( log.debug("Array Field Path before expand: {}", fieldPath.asString()); fieldPath = fieldPath.popLast(); fieldPath = - fieldPath.clonePlus(new FieldElement(List.of("array"), new ArrayList<>(), null, null)); + fieldPath.clonePlus( + new FieldElement(Collections.singletonList("array"), new ArrayList<>(), null, null)); Schema.Field elementField = new Schema.Field( field.name(), @@ -400,7 +401,9 @@ private void processMapField( FieldPath valueFieldPath = fieldPath .popLast() - .clonePlus(new FieldElement(List.of("map"), new ArrayList<>(), null, null)); + .clonePlus( + new FieldElement( + Collections.singletonList("map"), new ArrayList<>(), null, null)); processField(valueField, valueFieldPath, defaultNullable, fields, isNullable, mapDataHubType); } else { SchemaField mapField = @@ -434,7 +437,7 @@ private void processUnionField( unionTypes.stream() .filter(s -> s.getType() != Schema.Type.NULL) .findFirst() - .orElseThrow(); + .orElseThrow(NoSuchElementException::new); processField( new Schema.Field(field.name(), nonNullSchema, field.doc()), @@ -476,7 +479,8 @@ private void processUnionField( FieldPath 
indexedFieldPath = fieldPath.popLast();
     indexedFieldPath =
         indexedFieldPath.clonePlus(
-            new FieldElement(List.of("union"), new ArrayList<>(), null, null));
+            new FieldElement(
+                Collections.singletonList("union"), new ArrayList<>(), null, null));
     log.debug("TypeIndex: {}, Indexed Field path : {}", typeIndex, indexedFieldPath.asString());
     //        FieldPath unionFieldPath =
     //            fieldPath.expandType(getDiscriminatedType(unionSchema),
diff --git a/metadata-integration/java/datahub-schematron/lib/src/main/java/io/datahubproject/schematron/models/FieldPath.java b/metadata-integration/java/datahub-schematron/lib/src/main/java/io/datahubproject/schematron/models/FieldPath.java
index e51aa1221c54e..b4b72fcc031a5 100644
--- a/metadata-integration/java/datahub-schematron/lib/src/main/java/io/datahubproject/schematron/models/FieldPath.java
+++ b/metadata-integration/java/datahub-schematron/lib/src/main/java/io/datahubproject/schematron/models/FieldPath.java
@@ -2,6 +2,7 @@
 import com.linkedin.schema.*;
 import java.util.ArrayList;
+import java.util.Collections;
 import java.util.List;
 import java.util.Map;
 import java.util.Objects;
@@ -117,8 +118,8 @@ public FieldPath expandType(String type, Object typeSchema) {
         .getPath()
         .add(
             new FieldElement(
-                new ArrayList<>(List.of(type)),
-                new ArrayList<>(List.of(typeSchema.toString())),
+                new ArrayList<>(Collections.singletonList(type)),
+                new ArrayList<>(Collections.singletonList(typeSchema.toString())),
                 null,
                 null));
   }
diff --git a/metadata-io/metadata-io-api/build.gradle b/metadata-io/metadata-io-api/build.gradle
index b8028fad07bb6..5273177b75281 100644
--- a/metadata-io/metadata-io-api/build.gradle
+++ b/metadata-io/metadata-io-api/build.gradle
@@ -16,3 +16,7 @@ dependencies {
   testImplementation externalDependency.lombok
   testAnnotationProcessor externalDependency.lombok
 }
+
+test {
+  environment 'STRICT_URN_VALIDATION_ENABLED', 'true'
+}
\ No newline at end of file
diff --git a/metadata-io/metadata-io-api/src/main/java/com/linkedin/metadata/entity/validation/ValidationApiUtils.java b/metadata-io/metadata-io-api/src/main/java/com/linkedin/metadata/entity/validation/ValidationApiUtils.java
index c2e1c47eca1fd..5e1f09fcc6439 100644
--- a/metadata-io/metadata-io-api/src/main/java/com/linkedin/metadata/entity/validation/ValidationApiUtils.java
+++ b/metadata-io/metadata-io-api/src/main/java/com/linkedin/metadata/entity/validation/ValidationApiUtils.java
@@ -30,7 +30,8 @@ public class ValidationApiUtils {
   // Related to BrowsePathv2
   public static final String URN_DELIMITER_SEPARATOR = "␟";
   // https://datahubproject.io/docs/what/urn/#restrictions
-  public static final Set<String> ILLEGAL_URN_COMPONENT_CHARACTERS = Set.of(":", "(", ")", ",");
+  public static final Set<String> ILLEGAL_URN_COMPONENT_CHARACTERS = Set.of("(", ")");
+  public static final Set<String> ILLEGAL_URN_TUPLE_CHARACTERS = Set.of(",");
 
   /**
    * Validates a {@link RecordTemplate} and throws {@link ValidationException} if validation fails.
@@ -86,11 +87,10 @@ public static void validateUrn(
           "Error: URN cannot contain " + URN_DELIMITER_SEPARATOR + " character");
     }
 
+    int totalParts = urn.getEntityKey().getParts().size();
     List<String> illegalComponents =
         urn.getEntityKey().getParts().stream()
-            .flatMap(ValidationApiUtils::processUrnPartRecursively)
-            .filter(
-                urnPart -> ILLEGAL_URN_COMPONENT_CHARACTERS.stream().anyMatch(urnPart::contains))
+            .flatMap(part -> processUrnPartRecursively(part, totalParts))
             .collect(Collectors.toList());
 
     if (!illegalComponents.isEmpty()) {
@@ -114,15 +114,25 @@ public static void validateUrn(
   }
 
   /** Recursively process URN parts with URL decoding */
-  private static Stream<String> processUrnPartRecursively(String urnPart) {
+  private static Stream<String> processUrnPartRecursively(String urnPart, int totalParts) {
     String decodedPart =
         URLDecoder.decode(URLEncodingFixer.fixURLEncoding(urnPart), StandardCharsets.UTF_8);
     if (decodedPart.startsWith("urn:li:")) {
       // Recursively process nested URN after decoding
+      int nestedParts = UrnUtils.getUrn(decodedPart).getEntityKey().getParts().size();
       return UrnUtils.getUrn(decodedPart).getEntityKey().getParts().stream()
-          .flatMap(ValidationApiUtils::processUrnPartRecursively);
+          .flatMap(part -> processUrnPartRecursively(part, nestedParts));
     }
-    return Stream.of(decodedPart);
+    if (totalParts > 1) {
+      if (ILLEGAL_URN_TUPLE_CHARACTERS.stream().anyMatch(c -> urnPart.contains(c))) {
+        return Stream.of(urnPart);
+      }
+    }
+    if (ILLEGAL_URN_COMPONENT_CHARACTERS.stream().anyMatch(c -> urnPart.contains(c))) {
+      return Stream.of(urnPart);
+    }
+
+    return Stream.empty();
   }
 
   /**
diff --git a/metadata-io/metadata-io-api/src/test/java/com/linkedin/metadata/entity/validation/ValidationApiUtilsTest.java b/metadata-io/metadata-io-api/src/test/java/com/linkedin/metadata/entity/validation/ValidationApiUtilsTest.java
index e683e594d8766..a2c9a15d92f90 100644
--- a/metadata-io/metadata-io-api/src/test/java/com/linkedin/metadata/entity/validation/ValidationApiUtilsTest.java
+++ b/metadata-io/metadata-io-api/src/test/java/com/linkedin/metadata/entity/validation/ValidationApiUtilsTest.java
@@ -18,10 +18,36 @@ public void testValidateDatasetUrn() {
     // If no exception is thrown, test passes
   }
 
-  @Test(expectedExceptions = IllegalArgumentException.class)
+  @Test
   public void testSimpleUrnColon() {
-    Urn invalidUrn = UrnUtils.getUrn("urn:li:corpuser:foo:bar");
-    ValidationApiUtils.validateUrn(entityRegistry, invalidUrn, true);
+    ValidationApiUtils.validateUrn(
+        entityRegistry, UrnUtils.getUrn("urn:li:corpuser:foo:bar"), true);
+    ValidationApiUtils.validateUrn(
+        entityRegistry, UrnUtils.getUrn("urn:li:dataPlatform:abc:def"), true);
+    ValidationApiUtils.validateUrn(
+        entityRegistry, UrnUtils.getUrn("urn:li:corpuser:foo:bar@example.com"), true);
+    // If no exception is thrown, test passes
+  }
+
+  @Test
+  public void testSimpleUrnComma() {
+    ValidationApiUtils.validateUrn(entityRegistry, UrnUtils.getUrn("urn:li:corpuser:,"), true);
+    // If no exception is thrown, test passes
+  }
+
+  @Test(expectedExceptions = IllegalArgumentException.class)
+  public void testTupleUrnComma() {
+    ValidationApiUtils.validateUrn(
+        entityRegistry, UrnUtils.getUrn("urn:li:dashboard:(looker,dashboards,thelook)"), true);
+  }
+
+  @Test(expectedExceptions = IllegalArgumentException.class)
+  public void testFabricTypeCasing() {
+    // prod != PROD
+    ValidationApiUtils.validateUrn(
+        entityRegistry,
+        UrnUtils.getUrn("urn:li:dataset:(urn:li:dataPlatform:abc:def,table_name,prod)"),
+        true);
   }
 
   @Test
@@ -34,7 +60,7 @@ public void 
testComplexUrnColon() throws URISyntaxException { } @Test(expectedExceptions = IllegalArgumentException.class) - public void testUrnFabricType() { + public void testFabricTypeParen() { Urn invalidUrn = UrnUtils.getUrn("urn:li:dataset:(urn:li:dataPlatform:hdfs,/path/to/data,())"); ValidationApiUtils.validateUrn(entityRegistry, invalidUrn, true); } @@ -83,20 +109,20 @@ public void testValidComplexUrn() { UrnUtils.getUrn( "urn:li:dataset:(urn:li:dataPlatform:bigquery,myproject.dataset.table,PROD)"); - ValidationApiUtils.validateUrn(entityRegistry, validUrn); + ValidationApiUtils.validateUrn(entityRegistry, validUrn, true); // If no exception is thrown, test passes } @Test(expectedExceptions = NullPointerException.class) public void testUrnNull() { - ValidationApiUtils.validateUrn(entityRegistry, null); + ValidationApiUtils.validateUrn(entityRegistry, null, true); } @Test public void testValidPartialUrlEncode() { Urn validUrn = UrnUtils.getUrn("urn:li:assertion:123=-%28__% weekly__%29"); - ValidationApiUtils.validateUrn(entityRegistry, validUrn); + ValidationApiUtils.validateUrn(entityRegistry, validUrn, true); // If no exception is thrown, test passes } @@ -106,7 +132,23 @@ public void testValidPartialUrlEncode2() { UrnUtils.getUrn( "urn:li:dataset:(urn:li:dataPlatform:s3,urn:li:dataset:%28urn:li:dataPlatform:s3%2Ctest-datalake-concepts%prog_maintenance%2CPROD%29,PROD)"); - ValidationApiUtils.validateUrn(entityRegistry, validUrn); + ValidationApiUtils.validateUrn(entityRegistry, validUrn, true); + // If no exception is thrown, test passes + } + + @Test + public void testValidColon() { + Urn validUrn = + UrnUtils.getUrn("urn:li:dashboard:(looker,dashboards.thelook::cohort_data_tool)"); + + ValidationApiUtils.validateUrn(entityRegistry, validUrn, true); + // If no exception is thrown, test passes + } + + @Test + public void testNoTupleComma() { + Urn invalidUrn = UrnUtils.getUrn("urn:li:corpuser:,"); + ValidationApiUtils.validateUrn(entityRegistry, invalidUrn, true); // If no exception is thrown, test passes } } diff --git a/metadata-service/configuration/src/main/resources/bootstrap_mcps.yaml b/metadata-service/configuration/src/main/resources/bootstrap_mcps.yaml index f9497258c384f..0e283dfdfc93c 100644 --- a/metadata-service/configuration/src/main/resources/bootstrap_mcps.yaml +++ b/metadata-service/configuration/src/main/resources/bootstrap_mcps.yaml @@ -38,7 +38,7 @@ bootstrap: # Ingestion Recipes - name: ingestion-datahub-gc - version: v4 + version: v5 optional: false mcps_location: "bootstrap_mcps/ingestion-datahub-gc.yaml" values_env: "DATAHUB_GC_BOOTSTRAP_VALUES" diff --git a/metadata-service/configuration/src/main/resources/bootstrap_mcps/ingestion-datahub-gc.yaml b/metadata-service/configuration/src/main/resources/bootstrap_mcps/ingestion-datahub-gc.yaml index 395eb5db53424..c0c5be85b16b1 100644 --- a/metadata-service/configuration/src/main/resources/bootstrap_mcps/ingestion-datahub-gc.yaml +++ b/metadata-service/configuration/src/main/resources/bootstrap_mcps/ingestion-datahub-gc.yaml @@ -19,6 +19,7 @@ config: cleanup_expired_tokens: {{cleanup_expired_tokens}}{{^cleanup_expired_tokens}}false{{/cleanup_expired_tokens}} truncate_indices: {{truncate_indices}}{{^truncate_indices}}true{{/truncate_indices}} + truncate_index_older_than_days: {{truncate_indices_retention_days}}{{^truncate_indices_retention_days}}30{{/truncate_indices_retention_days}} dataprocess_cleanup: retention_days: 
{{dataprocess_cleanup.retention_days}}{{^dataprocess_cleanup.retention_days}}10{{/dataprocess_cleanup.retention_days}} delete_empty_data_jobs: {{dataprocess_cleanup.delete_empty_data_jobs}}{{^dataprocess_cleanup.delete_empty_data_jobs}}true{{/dataprocess_cleanup.delete_empty_data_jobs}} diff --git a/metadata-service/configuration/src/main/resources/search_config.yaml b/metadata-service/configuration/src/main/resources/search_config.yaml index e93f8af8b1d6c..47494c8cb1ca4 100644 --- a/metadata-service/configuration/src/main/resources/search_config.yaml +++ b/metadata-service/configuration/src/main/resources/search_config.yaml @@ -65,9 +65,9 @@ queryConfigurations: boost_mode: replace # Criteria for exact-match only - # Contains quotes, is a single term with `_`, `.`, or `-` (normally consider for tokenization) then use exact match query + # Contains quotes then use exact match query - queryRegex: >- - ^["'].+["']$|^[a-zA-Z0-9]\S+[_.-]\S+[a-zA-Z0-9]$ + ^["'].+["']$ simpleQuery: false prefixMatchQuery: true exactMatchQuery: true diff --git a/smoke-test/tests/cypress/cypress/e2e/siblings/siblings.js b/smoke-test/tests/cypress/cypress/e2e/siblings/siblings.js index fb772bd7af1e7..57617d7721e59 100644 --- a/smoke-test/tests/cypress/cypress/e2e/siblings/siblings.js +++ b/smoke-test/tests/cypress/cypress/e2e/siblings/siblings.js @@ -98,7 +98,7 @@ describe("siblings", () => { it("will combine results in search", () => { cy.login(); - cy.visit("/search?page=1&query=raw_orders"); + cy.visit("/search?page=1&query=%22raw_orders%22"); cy.contains("Showing 1 - 2 of ");