
Merge branch 'master' into feat(ingestion/neo4j)
# Conflicts:
#	datahub-web-react/src/app/ingest/source/builder/sources.json
keith-fullsight committed Nov 29, 2024
2 parents 01dfc26 + 94f104d commit bf888c0
Showing 56 changed files with 1,875 additions and 1,243 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/build-and-test.yml
@@ -83,6 +83,7 @@ jobs:
- uses: gradle/actions/setup-gradle@v3
- name: Gradle build (and test) for NOT metadata ingestion
if: ${{ matrix.command == 'except_metadata_ingestion' && needs.setup.outputs.backend_change == 'true' }}
# datahub-schematron:cli excluded due to dependency on metadata-ingestion
run: |
./gradlew build \
-x :metadata-ingestion:build \
@@ -100,6 +101,7 @@ jobs:
-x :metadata-ingestion-modules:gx-plugin:check \
-x :datahub-frontend:build \
-x :datahub-web-react:build \
-x :metadata-integration:java:datahub-schematron:cli:test \
--parallel
- name: Gradle build (and test) for frontend
if: ${{ matrix.command == 'frontend' && needs.setup.outputs.frontend_change == 'true' }}
1 change: 1 addition & 0 deletions .github/workflows/check-datahub-jars.yml
@@ -40,4 +40,5 @@ jobs:
- name: check ${{ matrix.command }} jar
run: |
./gradlew :metadata-integration:java:${{ matrix.command }}:build --info
./gradlew :metadata-integration:java:${{ matrix.command }}:checkShadowJar
./gradlew :metadata-integration:java:${{ matrix.command }}:javadoc
10 changes: 9 additions & 1 deletion build.gradle
@@ -48,6 +48,7 @@ buildscript {
// see also datahub-frontend/play.gradle
ext.playVersion = '2.8.22'
ext.playScalaVersion = '2.13'
ext.akkaVersion = '2.6.21' // 2.7.0+ has incompatible license
ext.log4jVersion = '2.23.1'
ext.slf4jVersion = '1.7.36'
ext.logbackClassic = '1.4.14'
@@ -105,7 +106,14 @@ project.ext.spec = [
]

project.ext.externalDependency = [
'akkaHttp': "com.typesafe.akka:akka-http-core_$playScalaVersion:10.2.10",
'akkaHttp': "com.typesafe.akka:akka-http-core_$playScalaVersion:10.2.10", // max version due to licensing
'akkaActor': "com.typesafe.akka:akka-actor_$playScalaVersion:$akkaVersion",
'akkaStream': "com.typesafe.akka:akka-stream_$playScalaVersion:$akkaVersion",
'akkaActorTyped': "com.typesafe.akka:akka-actor-typed_$playScalaVersion:$akkaVersion",
'akkaSlf4j': "com.typesafe.akka:akka-slf4j_$playScalaVersion:$akkaVersion",
'akkaJackson': "com.typesafe.akka:akka-serialization-jackson_$playScalaVersion:$akkaVersion",
'akkaParsing': "com.typesafe.akka:akka-parsing_$playScalaVersion:$akkaVersion",
'akkaProtobuf': "com.typesafe.akka:akka-protobuf-v3_$playScalaVersion:$akkaVersion",
'antlr4Runtime': 'org.antlr:antlr4-runtime:4.9.3',
'antlr4': 'org.antlr:antlr4:4.9.3',
'assertJ': 'org.assertj:assertj-core:3.11.1',
7 changes: 7 additions & 0 deletions datahub-frontend/play.gradle
@@ -55,6 +55,13 @@ dependencies {
implementation externalDependency.antlr4Runtime
implementation externalDependency.antlr4
implementation externalDependency.akkaHttp
implementation externalDependency.akkaActor
implementation externalDependency.akkaStream
implementation externalDependency.akkaActorTyped
implementation externalDependency.akkaSlf4j
implementation externalDependency.akkaJackson
implementation externalDependency.akkaParsing
implementation externalDependency.akkaProtobuf

implementation externalDependency.jerseyCore
implementation externalDependency.jerseyGuava
8 changes: 8 additions & 0 deletions datahub-web-react/src/app/ingest/source/builder/sources.json
@@ -318,6 +318,14 @@
"docsUrl": "https://datahubproject.io/docs/generated/ingestion/sources/cassandra",
"recipe": "source:\n type: cassandra\n config:\n # Credentials for on prem cassandra\n contact_point: localhost\n port: 9042\n username: admin\n password: password\n\n # Or\n # Credentials Astra Cloud\n #cloud_config:\n # secure_connect_bundle: Path to Secure Connect Bundle (.zip)\n # token: Application Token\n\n # Optional Allow / Deny extraction of particular keyspaces.\n keyspace_pattern:\n allow: [.*]\n\n # Optional Allow / Deny extraction of particular tables.\n table_pattern:\n allow: [.*]"
},
{
"urn": "urn:li:dataPlatform:iceberg",
"name": "iceberg",
"displayName": "Iceberg",
"description": "Ingest databases and tables from any Iceberg catalog implementation",
"docsUrl": "https://datahubproject.io/docs/generated/ingestion/sources/iceberg",
"recipe": "source:\n type: \"iceberg\"\n config:\n env: dev\n # each thread will open internet connections to fetch manifest files independently, \n # this value needs to be adjusted with ulimit\n processing_threads: 1 \n # a single catalog definition with a form of a dictionary\n catalog: \n demo: # name of the catalog\n type: \"rest\" # other types are available\n uri: \"uri\"\n s3.access-key-id: \"access-key\"\n s3.secret-access-key: \"secret-access-key\"\n s3.region: \"aws-region\"\n profiling:\n enabled: false\n"
},
{
"urn": "urn:li:dataPlatform:neo4j",
"name": "neo4j",
37 changes: 37 additions & 0 deletions docs/automations/snowflake-tag-propagation.md
@@ -4,6 +4,8 @@ import FeatureAvailability from '@site/src/components/FeatureAvailability';

<FeatureAvailability saasOnly />

> Note that this Automation is currently in open **Beta**. If you have any questions or issues, please reach out to your Acryl representative.

## Introduction

Snowflake Tag Propagation is an automation that allows you to sync DataHub Glossary Terms and Tags on
@@ -15,6 +17,41 @@ both columns and tables back to Snowflake. This automation is available in DataH
- Automatically Add DataHub Tags to Snowflake Tables and Columns
- Automatically Remove DataHub Glossary Terms and Tags from Snowflake Tables and Columns when they are removed in DataHub

## Prerequisites

### Permissions Required for Tag Management

- `CREATE TAG`: Required to create new tags in Snowflake.
Ensure the user or role has this privilege on the specific schema or database where tags will be created.
- `APPLY TAG`: Required to assign tags to Snowflake objects such as tables, columns, or other database objects.
This permission must be granted at the database, schema, or object level depending on the scope.


### Permissions Required for Object Access

- `USAGE` on the database and schema: Allows access to the database and schema to view and apply changes.
- `SELECT` on the objects (tables, views, etc.): Enables the automation to read metadata and verify existing tags.

### Example Permission Grant Statements

To grant the necessary permissions for a specific role (DATAHUB_AUTOMATION_ROLE), you can use the following SQL commands:

```sql
-- Tag management permissions
GRANT CREATE TAG ON SCHEMA your_database.your_schema TO ROLE DATAHUB_AUTOMATION_ROLE;
GRANT APPLY TAG ON SCHEMA your_database.your_schema TO ROLE DATAHUB_AUTOMATION_ROLE;

-- Object access for metadata operations
GRANT USAGE ON DATABASE your_database TO ROLE DATAHUB_AUTOMATION_ROLE;
GRANT USAGE ON SCHEMA your_database.your_schema TO ROLE DATAHUB_AUTOMATION_ROLE;
GRANT SELECT ON ALL TABLES IN SCHEMA your_database.your_schema TO ROLE DATAHUB_AUTOMATION_ROLE;

-- Future privileges for tagging
GRANT SELECT ON FUTURE TABLES IN SCHEMA your_database.your_schema TO ROLE DATAHUB_AUTOMATION_ROLE;
GRANT APPLY TAG ON FUTURE TABLES IN SCHEMA your_database.your_schema TO ROLE DATAHUB_AUTOMATION_ROLE;
```


## Enabling Snowflake Tag Sync

1. **Navigate to Automations**: Click on 'Govern' > 'Automations' in the navigation bar.
24 changes: 22 additions & 2 deletions docs/managed-datahub/release-notes/v_0_3_7.md
@@ -7,7 +7,7 @@ Release Availability Date

Recommended CLI/SDK
---
- `v0.14.1.11` with release notes at https://github.com/datahub/datahub/releases/tag/v0.14.1.11
- `v0.14.1.12` with release notes at https://github.com/datahub/datahub/releases/tag/v0.14.1.12

If you are using an older CLI/SDK version, then please upgrade it. This applies for all CLI/SDK usages, if you are using it through your terminal, GitHub Actions, Airflow, in Python SDK somewhere, Java SDK, etc. This is a strong recommendation to upgrade, as we keep on pushing fixes in the CLI, and it helps us support you better.

@@ -19,6 +19,26 @@ If you are using an older CLI/SDK version, then please upgrade it. This applies
## Release Changelog
---

### v0.3.7.4

- [#11935](https://github.com/datahub-project/datahub/pull/11935) - Added environment variable for enabling stricter URN validation rules `STRICT_URN_VALIDATION_ENABLED` [[1](https://datahubproject.io/docs/what/urn/#restrictions)].
- [Automations] Filter out self-nodes in glossary term propagation
- [Remote Executor] Allow dashes in executor ids.
- [Search] Fix Nested Filter Counts in Primary Search
- [Search] Fix white screen of death on empty search result
- [Columns Tab] Support searching nested struct columns correctly in V2 UI.
- [Logo] Fix fit of custom logo for V2 UI nav bar.
- [Structured Properties] Better handling for special characters in structured properties
- [Lineage] Improvements to handling lineage cycles
- [Metadata Tests] Improve Reliability of Metadata Tests Action Application
- [Slack Integration] Minor improvement in authentication redirect to integrate with Slack
- [Columns Tab] Properly display nullable status in the column sidebar (bug fix)
- [Columns Tab] Fixing merging of sibling schemas between V2 and V1 field paths.
- [Documentation] Support group authors for institutional memory aspect


### v0.3.7

- All changes in https://github.com/datahub-project/datahub/releases/tag/v0.14.1
- Note Breaking Changes: https://datahubproject.io/docs/how/updating-datahub/#0141

@@ -96,7 +116,7 @@ If you are using an older CLI/SDK version, then please upgrade it. This applies
- Improved UX for setting up and managing SSO

- Ingestion changes
- In addition to the improvements listed here: https://github.com/acryldata/datahub/releases/tag/v0.14.1.11
- In addition to the improvements listed here: https://github.com/acryldata/datahub/releases/tag/v0.14.1.12
- PowerBI: Support for PowerBI Apps and cross-workspace lineage
- Fivetran: Major improvements to configurability and improved reliability with large Fivetran setups
- Snowflake & BigQuery: Improved handling of temporary tables and swap statements when generating lineage
16 changes: 11 additions & 5 deletions docs/what/urn.md
@@ -35,11 +35,17 @@ urn:li:dataset:(urn:li:dataPlatform:hdfs,PageViewEvent,EI)

## Restrictions

There are a few restrictions when creating an urn:
There are a few restrictions when creating a URN:

1. Commas are reserved character in URN fields: `,`
2. Parentheses are reserved characters in URN fields: `(` or `)`
3. Colons are reserved characters in URN fields: `:`
4. Urn separator UTF-8 character ``
The following characters are not allowed anywhere in a URN:

1. Parentheses are reserved characters in URN fields: `(` or `)`
2. The "unit separator" unicode character `` (U+241F)

The following characters are not allowed within a URN tuple:

1. Commas are reserved characters in URN tuples: `,`

Example: `urn:li:dashboard:(looker,dashboards.thelook)` is a valid URN, but `urn:li:dashboard:(looker,dashboards.the,look)` is invalid.

Please do not use these characters when creating or generating URNs. One approach is to use URL encoding for the reserved characters.
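
For illustration, here is a minimal Python sketch of the URL-encoding approach; the helper name and example values are hypothetical, not part of DataHub's API:

```python
from urllib.parse import quote


def encode_urn_part(value: str) -> str:
    """Percent-encode a value before embedding it in a URN tuple.

    Reserved characters such as ',', '(' and ')' are encoded so they no
    longer act as URN delimiters.
    """
    return quote(value, safe="")


# A dashboard id containing a comma would otherwise break the URN tuple.
dashboard_id = "dashboards.the,look"
urn = f"urn:li:dashboard:(looker,{encode_urn_part(dashboard_id)})"
print(urn)  # urn:li:dashboard:(looker,dashboards.the%2Clook)
```
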
6 changes: 4 additions & 2 deletions metadata-ingestion/docs/sources/iceberg/iceberg.md
@@ -18,6 +18,8 @@ This ingestion source maps the following Source System Concepts to DataHub Conce

## Troubleshooting

### [Common Issue]
### Exceptions while increasing `processing_threads`

[Provide description of common issues with this integration and steps to resolve]
Each processing thread opens several files/sockets to download manifest files from blob storage. If you see
exceptions after increasing the `processing_threads` configuration parameter, try raising the limit on open
files (e.g. using `ulimit` on Linux).
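
As a rough illustration (not part of this source's code; the approach and values here are assumptions), the open-file limit can also be inspected and raised from Python before launching an ingestion run:

```python
import resource

# Check the current soft/hard limits on open file descriptors (Linux/macOS only).
soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
print(f"open-file limit: soft={soft}, hard={hard}")

# Raise the soft limit toward the hard limit before running an ingestion with a
# high `processing_threads` value; this mirrors `ulimit -n` in a shell session.
if hard != resource.RLIM_INFINITY and soft < hard:
    resource.setrlimit(resource.RLIMIT_NOFILE, (hard, hard))
```
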
9 changes: 5 additions & 4 deletions metadata-ingestion/setup.py
@@ -14,8 +14,8 @@
)

base_requirements = {
# Typing extension should be >=3.10.0.2 ideally but we can't restrict due to a Airflow 2.1 dependency conflict.
"typing_extensions>=3.7.4.3",
# Our min version of typing_extensions is somewhat constrained by Airflow.
"typing_extensions>=3.10.0.2",
# Actual dependencies.
"typing-inspect",
# pydantic 1.8.2 is incompatible with mypy 0.910.
@@ -249,7 +249,8 @@

iceberg_common = {
# Iceberg Python SDK
"pyiceberg>=0.4,<0.7",
# Kept at 0.4.0 due to higher versions requiring pydantic>2; as soon as we are fine with that, bump this dependency
"pyiceberg>=0.4.0",
}

mssql_common = {
Expand Down Expand Up @@ -775,7 +776,7 @@
"trino = datahub.ingestion.source.sql.trino:TrinoSource",
"starburst-trino-usage = datahub.ingestion.source.usage.starburst_trino_usage:TrinoUsageSource",
"nifi = datahub.ingestion.source.nifi:NifiSource",
"powerbi = datahub.ingestion.source.powerbi:PowerBiDashboardSource",
"powerbi = datahub.ingestion.source.powerbi.powerbi:PowerBiDashboardSource",
"powerbi-report-server = datahub.ingestion.source.powerbi_report_server:PowerBiReportServerDashboardSource",
"iceberg = datahub.ingestion.source.iceberg.iceberg:IcebergSource",
"vertica = datahub.ingestion.source.sql.vertica:VerticaSource",
2 changes: 2 additions & 0 deletions metadata-ingestion/src/datahub/ingestion/graph/client.py
@@ -67,6 +67,7 @@
SystemMetadataClass,
TelemetryClientIdClass,
)
from datahub.telemetry.telemetry import telemetry_instance
from datahub.utilities.perf_timer import PerfTimer
from datahub.utilities.str_enum import StrEnum
from datahub.utilities.urns.urn import Urn, guess_entity_type
@@ -1819,4 +1820,5 @@ def get_default_graph() -> DataHubGraph:
graph_config = config_utils.load_client_config()
graph = DataHubGraph(graph_config)
graph.test_connection()
telemetry_instance.set_context(server=graph)
return graph
9 changes: 5 additions & 4 deletions metadata-ingestion/src/datahub/ingestion/run/pipeline.py
@@ -44,7 +44,8 @@
)
from datahub.ingestion.transformer.transform_registry import transform_registry
from datahub.metadata.schema_classes import MetadataChangeProposalClass
from datahub.telemetry import stats, telemetry
from datahub.telemetry import stats
from datahub.telemetry.telemetry import telemetry_instance
from datahub.utilities._custom_package_loader import model_version_name
from datahub.utilities.global_warning_util import (
clear_global_warnings,
@@ -273,8 +274,9 @@ def __init__(
if self.graph is None and isinstance(self.sink, DatahubRestSink):
with _add_init_error_context("setup default datahub client"):
self.graph = self.sink.emitter.to_graph()
self.graph.test_connection()
self.ctx.graph = self.graph
telemetry.telemetry_instance.update_capture_exception_context(server=self.graph)
telemetry_instance.set_context(server=self.graph)

with set_graph_context(self.graph):
with _add_init_error_context("configure reporters"):
@@ -615,7 +617,7 @@ def log_ingestion_stats(self) -> None:
sink_warnings = len(self.sink.get_report().warnings)
global_warnings = len(get_global_warnings())

telemetry.telemetry_instance.ping(
telemetry_instance.ping(
"ingest_stats",
{
"source_type": self.source_type,
Expand All @@ -637,7 +639,6 @@ def log_ingestion_stats(self) -> None:
),
"has_pipeline_name": bool(self.config.pipeline_name),
},
self.ctx.graph,
)

def _approx_all_vals(self, d: LossyList[Any]) -> int:
68 changes: 7 additions & 61 deletions metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py
@@ -53,19 +53,7 @@
make_assertion_from_test,
make_assertion_result_from_test,
)
from datahub.ingestion.source.sql.sql_types import (
ATHENA_SQL_TYPES_MAP,
BIGQUERY_TYPES_MAP,
POSTGRES_TYPES_MAP,
SNOWFLAKE_TYPES_MAP,
SPARK_SQL_TYPES_MAP,
TRINO_SQL_TYPES_MAP,
VERTICA_SQL_TYPES_MAP,
resolve_athena_modified_type,
resolve_postgres_modified_type,
resolve_trino_modified_type,
resolve_vertica_modified_type,
)
from datahub.ingestion.source.sql.sql_types import resolve_sql_type
from datahub.ingestion.source.state.stale_entity_removal_handler import (
StaleEntityRemovalHandler,
StaleEntityRemovalSourceReport,
@@ -89,17 +77,11 @@
from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import DatasetSnapshot
from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
from datahub.metadata.com.linkedin.pegasus2avro.schema import (
BooleanTypeClass,
DateTypeClass,
MySqlDDL,
NullTypeClass,
NumberTypeClass,
RecordType,
SchemaField,
SchemaFieldDataType,
SchemaMetadata,
StringTypeClass,
TimeTypeClass,
)
from datahub.metadata.schema_classes import (
DataPlatformInstanceClass,
@@ -804,28 +786,6 @@ def make_mapping_upstream_lineage(
)


# See https://github.com/fishtown-analytics/dbt/blob/master/core/dbt/adapters/sql/impl.py
_field_type_mapping = {
"boolean": BooleanTypeClass,
"date": DateTypeClass,
"time": TimeTypeClass,
"numeric": NumberTypeClass,
"text": StringTypeClass,
"timestamp with time zone": DateTypeClass,
"timestamp without time zone": DateTypeClass,
"integer": NumberTypeClass,
"float8": NumberTypeClass,
"struct": RecordType,
**POSTGRES_TYPES_MAP,
**SNOWFLAKE_TYPES_MAP,
**BIGQUERY_TYPES_MAP,
**SPARK_SQL_TYPES_MAP,
**TRINO_SQL_TYPES_MAP,
**ATHENA_SQL_TYPES_MAP,
**VERTICA_SQL_TYPES_MAP,
}


def get_column_type(
report: DBTSourceReport,
dataset_name: str,
@@ -835,24 +795,10 @@ def get_column_type(
"""
Maps known DBT types to datahub types
"""
TypeClass: Any = _field_type_mapping.get(column_type) if column_type else None

if TypeClass is None and column_type:
# resolve a modified type
if dbt_adapter == "trino":
TypeClass = resolve_trino_modified_type(column_type)
elif dbt_adapter == "athena":
TypeClass = resolve_athena_modified_type(column_type)
elif dbt_adapter == "postgres" or dbt_adapter == "redshift":
# Redshift uses a variant of Postgres, so we can use the same logic.
TypeClass = resolve_postgres_modified_type(column_type)
elif dbt_adapter == "vertica":
TypeClass = resolve_vertica_modified_type(column_type)
elif dbt_adapter == "snowflake":
# Snowflake types are uppercase, so we check that.
TypeClass = _field_type_mapping.get(column_type.upper())

# if still not found, report the warning

TypeClass = resolve_sql_type(column_type, dbt_adapter)

# if still not found, report a warning
if TypeClass is None:
if column_type:
report.info(
Expand All @@ -861,9 +807,9 @@ def get_column_type(
context=f"{dataset_name} - {column_type}",
log=False,
)
TypeClass = NullTypeClass
TypeClass = NullTypeClass()

return SchemaFieldDataType(type=TypeClass())
return SchemaFieldDataType(type=TypeClass)


@platform_name("dbt")