
Commit

Merge branch 'master' into fix/ING-761-dremio-ownership
mayurinehate authored Nov 29, 2024
2 parents 573808c + a46de1e commit 2f3d864
Showing 108 changed files with 7,427 additions and 2,647 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/build-and-test.yml
@@ -83,6 +83,7 @@ jobs:
- uses: gradle/actions/setup-gradle@v3
- name: Gradle build (and test) for NOT metadata ingestion
if: ${{ matrix.command == 'except_metadata_ingestion' && needs.setup.outputs.backend_change == 'true' }}
# datahub-schematron:cli excluded due to dependency on metadata-ingestion
run: |
./gradlew build \
-x :metadata-ingestion:build \
@@ -100,6 +101,7 @@ jobs:
-x :metadata-ingestion-modules:gx-plugin:check \
-x :datahub-frontend:build \
-x :datahub-web-react:build \
-x :metadata-integration:java:datahub-schematron:cli:test \
--parallel
- name: Gradle build (and test) for frontend
if: ${{ matrix.command == 'frontend' && needs.setup.outputs.frontend_change == 'true' }}
1 change: 1 addition & 0 deletions .github/workflows/check-datahub-jars.yml
@@ -40,4 +40,5 @@ jobs:
- name: check ${{ matrix.command }} jar
run: |
./gradlew :metadata-integration:java:${{ matrix.command }}:build --info
./gradlew :metadata-integration:java:${{ matrix.command }}:checkShadowJar
./gradlew :metadata-integration:java:${{ matrix.command }}:javadoc
11 changes: 10 additions & 1 deletion build.gradle
@@ -48,6 +48,7 @@ buildscript {
// see also datahub-frontend/play.gradle
ext.playVersion = '2.8.22'
ext.playScalaVersion = '2.13'
ext.akkaVersion = '2.6.21' // 2.7.0+ has incompatible license
ext.log4jVersion = '2.23.1'
ext.slf4jVersion = '1.7.36'
ext.logbackClassic = '1.4.14'
@@ -105,7 +106,14 @@ project.ext.spec = [
]

project.ext.externalDependency = [
'akkaHttp': "com.typesafe.akka:akka-http-core_$playScalaVersion:10.2.10",
'akkaHttp': "com.typesafe.akka:akka-http-core_$playScalaVersion:10.2.10", // max version due to licensing
'akkaActor': "com.typesafe.akka:akka-actor_$playScalaVersion:$akkaVersion",
'akkaStream': "com.typesafe.akka:akka-stream_$playScalaVersion:$akkaVersion",
'akkaActorTyped': "com.typesafe.akka:akka-actor-typed_$playScalaVersion:$akkaVersion",
'akkaSlf4j': "com.typesafe.akka:akka-slf4j_$playScalaVersion:$akkaVersion",
'akkaJackson': "com.typesafe.akka:akka-serialization-jackson_$playScalaVersion:$akkaVersion",
'akkaParsing': "com.typesafe.akka:akka-parsing_$playScalaVersion:$akkaVersion",
'akkaProtobuf': "com.typesafe.akka:akka-protobuf-v3_$playScalaVersion:$akkaVersion",
'antlr4Runtime': 'org.antlr:antlr4-runtime:4.9.3',
'antlr4': 'org.antlr:antlr4:4.9.3',
'assertJ': 'org.assertj:assertj-core:3.11.1',
@@ -350,6 +358,7 @@ allprojects {
}
}
}

}

configure(subprojects.findAll {! it.name.startsWith('spark-lineage')}) {
7 changes: 7 additions & 0 deletions datahub-frontend/play.gradle
@@ -55,6 +55,13 @@ dependencies {
implementation externalDependency.antlr4Runtime
implementation externalDependency.antlr4
implementation externalDependency.akkaHttp
implementation externalDependency.akkaActor
implementation externalDependency.akkaStream
implementation externalDependency.akkaActorTyped
implementation externalDependency.akkaSlf4j
implementation externalDependency.akkaJackson
implementation externalDependency.akkaParsing
implementation externalDependency.akkaProtobuf

implementation externalDependency.jerseyCore
implementation externalDependency.jerseyGuava
@@ -386,7 +386,11 @@ private static EntityClient initMockEntityClient(
Mockito.when(
client.searchAcrossEntities(
any(),
Mockito.eq(entityTypes),
Mockito.argThat(
argument ->
argument != null
&& argument.containsAll(entityTypes)
&& entityTypes.containsAll(argument)),
Mockito.eq(query),
Mockito.eq(filter),
Mockito.eq(start),
@@ -409,7 +413,11 @@ private static void verifyMockEntityClient(
Mockito.verify(mockClient, Mockito.times(1))
.searchAcrossEntities(
any(),
Mockito.eq(entityTypes),
Mockito.argThat(
argument ->
argument != null
&& argument.containsAll(entityTypes)
&& entityTypes.containsAll(argument)),
Mockito.eq(query),
Mockito.eq(filter),
Mockito.eq(start),
@@ -462,7 +462,11 @@ private static EntityClient initMockEntityClient(
Mockito.when(
client.searchAcrossEntities(
any(),
Mockito.eq(entityTypes),
Mockito.argThat(
argument ->
argument != null
&& argument.containsAll(entityTypes)
&& entityTypes.containsAll(argument)),
Mockito.eq(query),
Mockito.eq(filter),
Mockito.eq(start),
@@ -483,7 +487,11 @@ private static void verifyMockEntityClient(
Mockito.verify(mockClient, Mockito.times(1))
.searchAcrossEntities(
any(),
Mockito.eq(entityTypes),
Mockito.argThat(
argument ->
argument != null
&& argument.containsAll(entityTypes)
&& entityTypes.containsAll(argument)),
Mockito.eq(query),
Mockito.eq(filter),
Mockito.eq(start),
@@ -193,7 +193,9 @@ export const IngestionSourceList = () => {

const formatExtraArgs = (extraArgs): StringMapEntryInput[] => {
if (extraArgs === null || extraArgs === undefined) return [];
return extraArgs.map((entry) => ({ key: entry.key, value: entry.value }));
return extraArgs
.filter((entry) => entry.value !== null && entry.value !== undefined && entry.value !== '')
.map((entry) => ({ key: entry.key, value: entry.value }));
};

const createOrUpdateIngestionSource = (
8 changes: 8 additions & 0 deletions datahub-web-react/src/app/ingest/source/builder/sources.json
@@ -317,5 +317,13 @@
"displayName": "CassandraDB",
"docsUrl": "https://datahubproject.io/docs/generated/ingestion/sources/cassandra",
"recipe": "source:\n type: cassandra\n config:\n # Credentials for on prem cassandra\n contact_point: localhost\n port: 9042\n username: admin\n password: password\n\n # Or\n # Credentials Astra Cloud\n #cloud_config:\n # secure_connect_bundle: Path to Secure Connect Bundle (.zip)\n # token: Application Token\n\n # Optional Allow / Deny extraction of particular keyspaces.\n keyspace_pattern:\n allow: [.*]\n\n # Optional Allow / Deny extraction of particular tables.\n table_pattern:\n allow: [.*]"
},
{
"urn": "urn:li:dataPlatform:iceberg",
"name": "iceberg",
"displayName": "Iceberg",
"description": "Ingest databases and tables from any Iceberg catalog implementation",
"docsUrl": "https://datahubproject.io/docs/generated/ingestion/sources/iceberg",
"recipe": "source:\n type: \"iceberg\"\n config:\n env: dev\n # each thread will open internet connections to fetch manifest files independently, \n # this value needs to be adjusted with ulimit\n processing_threads: 1 \n # a single catalog definition with a form of a dictionary\n catalog: \n demo: # name of the catalog\n type: \"rest\" # other types are available\n uri: \"uri\"\n s3.access-key-id: \"access-key\"\n s3.secret-access-key: \"secret-access-key\"\n s3.region: \"aws-region\"\n profiling:\n enabled: false\n"
}
]
1 change: 1 addition & 0 deletions docs-website/sidebars.js
@@ -989,6 +989,7 @@ module.exports = {
// "metadata-ingestion/examples/structured_properties/README"
// "smoke-test/tests/openapi/README"
// "docs/SECURITY_STANCE"
// "metadata-integration/java/datahub-schematron/README"
// ],
],
};
37 changes: 37 additions & 0 deletions docs/automations/snowflake-tag-propagation.md
@@ -4,6 +4,8 @@ import FeatureAvailability from '@site/src/components/FeatureAvailability';

<FeatureAvailability saasOnly />

> Note that this Automation is currently in open **Beta**. If you have any questions or issues, please reach out to your Acryl representative.

## Introduction

Snowflake Tag Propagation is an automation that allows you to sync DataHub Glossary Terms and Tags on
@@ -15,6 +17,41 @@ both columns and tables back to Snowflake. This automation is available in DataH
- Automatically Add DataHub Tags to Snowflake Tables and Columns
- Automatically Remove DataHub Glossary Terms and Tags from Snowflake Tables and Columns when they are removed in DataHub

## Prerequisites

### Permissions Required for Tag Management

- `CREATE TAG`: Required to create new tags in Snowflake.
Ensure the user or role has this privilege on the specific schema or database where tags will be created.
- `APPLY TAG`: Required to assign tags to Snowflake objects such as tables, columns, or other database objects.
This permission must be granted at the database, schema, or object level depending on the scope.


### Permissions Required for Object Access

- `USAGE` on the database and schema: Allows access to the database and schema to view and apply changes.
- `SELECT` on the objects (tables, views, etc.): Enables the automation to read metadata and verify existing tags.

### Example Permission Grant Statements

To grant the necessary permissions for a specific role (DATAHUB_AUTOMATION_ROLE), you can use the following SQL commands:

```sql
-- Tag management permissions
GRANT CREATE TAG ON SCHEMA your_database.your_schema TO ROLE DATAHUB_AUTOMATION_ROLE;
GRANT APPLY TAG ON SCHEMA your_database.your_schema TO ROLE DATAHUB_AUTOMATION_ROLE;

-- Object access for metadata operations
GRANT USAGE ON DATABASE your_database TO ROLE DATAHUB_AUTOMATION_ROLE;
GRANT USAGE ON SCHEMA your_database.your_schema TO ROLE DATAHUB_AUTOMATION_ROLE;
GRANT SELECT ON ALL TABLES IN SCHEMA your_database.your_schema TO ROLE DATAHUB_AUTOMATION_ROLE;

-- Future privileges for tagging
GRANT SELECT ON FUTURE TABLES IN SCHEMA your_database.your_schema TO ROLE DATAHUB_AUTOMATION_ROLE;
GRANT APPLY TAG ON FUTURE TABLES IN SCHEMA your_database.your_schema TO ROLE DATAHUB_AUTOMATION_ROLE;
```


## Enabling Snowflake Tag Sync

1. **Navigate to Automations**: Click on 'Govern' > 'Automations' in the navigation bar.
3 changes: 3 additions & 0 deletions docs/how/updating-datahub.md
@@ -88,6 +88,9 @@ This file documents any backwards-incompatible changes in DataHub and assists pe
### Other Notable Changes

- Downgrade to previous version is not automatically supported.
- Data Product Properties Unset side effect introduced
  - Previously, Data Products could be set as linked to multiple Datasets if modified directly via the REST API rather than linked through the UI or GraphQL. This side effect aligns the REST API behavior with the GraphQL behavior by enforcing the 1-to-1 constraint between Data Products and Datasets.
  - NOTE: There is a pathological write pattern for Data Products that can cause write-processing issues with this side effect. If you constantly change all of the Datasets associated with a Data Product back and forth between multiple Data Products, it will result in a high volume of writes due to the need to unset the previous associations.

## 0.14.0.2

27 changes: 25 additions & 2 deletions docs/managed-datahub/release-notes/v_0_3_7.md
@@ -7,7 +7,7 @@ Release Availability Date

Recommended CLI/SDK
---
- `v0.14.1.11` with release notes at https://github.com/datahub/datahub/releases/tag/v0.14.1.11
- `v0.14.1.12` with release notes at https://github.com/datahub/datahub/releases/tag/v0.14.1.12

If you are using an older CLI/SDK version, then please upgrade it. This applies for all CLI/SDK usages, if you are using it through your terminal, GitHub Actions, Airflow, in Python SDK somewhere, Java SDK, etc. This is a strong recommendation to upgrade, as we keep on pushing fixes in the CLI, and it helps us support you better.

@@ -19,6 +19,26 @@ If you are using an older CLI/SDK version, then please upgrade it. This applies
## Release Changelog
---

### v0.3.7.4

- [#11935](https://github.com/datahub-project/datahub/pull/11935) - Added environment variable for enabling stricter URN validation rules `STRICT_URN_VALIDATION_ENABLED` [[1](https://datahubproject.io/docs/what/urn/#restrictions)].
- [Automations] Filter out self-nodes in glossary term propagation
- [Remote Executor] Allow dashes in executor ids.
- [Search] Fix Nested Filter Counts in Primary Search
- [Search] Fix white screen of death on empty search result
- [Columns Tab] Support searching nested struct columns correctly in V2 UI.
- [Logo] Fix fit of custom logo for V2 UI nav bar.
- [Structured Properties] Better handling for special characters in structured properties
- [Lineage] Improvements to handling lineage cycles
- [Metadata Tests] Improve Reliability of Metadata Tests Action Application
- [Slack Integration] Minor improvement in authentication redirect to integrate with Slack
- [Columns Tab] Properly display nullable status in column sidebar (bug fix)
- [Columns Tab] Fixing merging of sibling schemas between V2 and V1 field paths.
- [Documentation] Support group authors for institutional memory aspect


### v0.3.7

- All changes in https://github.com/datahub-project/datahub/releases/tag/v0.14.1
- Note Breaking Changes: https://datahubproject.io/docs/how/updating-datahub/#0141

@@ -96,7 +116,7 @@ If you are using an older CLI/SDK version, then please upgrade it. This applies
- Improved UX for setting up and managing SSO

- Ingestion changes
- In addition to the improvements listed here: https://github.com/acryldata/datahub/releases/tag/v0.14.1.11
- In addition to the improvements listed here: https://github.com/acryldata/datahub/releases/tag/v0.14.1.12
- PowerBI: Support for PowerBI Apps and cross-workspace lineage
- Fivetran: Major improvements to configurability and improved reliability with large Fivetran setups
- Snowflake & BigQuery: Improved handling of temporary tables and swap statements when generating lineage
@@ -120,3 +140,6 @@ If you are using an older CLI/SDK version, then please upgrade it. This applies
- (system / internal) Exclude form-prompt tests in live Metadata Tests evaluation
- (system / internal) Exclude form-prompt tests in stored Metadata Test results
- Elasticsearch reindex time limit of 8h removed
- Data Product Properties Unset side effect introduced
  - Previously, Data Products could be set as linked to multiple Datasets if modified directly via the REST API rather than linked through the UI or GraphQL. This side effect aligns the REST API behavior with the GraphQL behavior by enforcing the 1-to-1 constraint between Data Products and Datasets.
  - NOTE: There is a pathological write pattern for Data Products that can cause write-processing issues with this side effect. If you constantly change all of the Datasets associated with a Data Product back and forth between multiple Data Products, it will result in a high volume of writes due to the need to unset the previous associations.
16 changes: 11 additions & 5 deletions docs/what/urn.md
@@ -35,11 +35,17 @@ urn:li:dataset:(urn:li:dataPlatform:hdfs,PageViewEvent,EI)

## Restrictions

There are a few restrictions when creating an urn:
There are a few restrictions when creating a URN:

1. Commas are reserved character in URN fields: `,`
2. Parentheses are reserved characters in URN fields: `(` or `)`
3. Colons are reserved characters in URN fields: `:`
4. Urn separator UTF-8 character ``
The following characters are not allowed anywhere in a URN:

1. Parentheses are reserved characters in URN fields: `(` or `)`
2. The "unit separator" unicode character `` (U+241F)

The following characters are not allowed within a URN tuple:

1. Commas are reserved characters in URN tuples: `,`

Example: `urn:li:dashboard:(looker,dashboards.thelook)` is a valid urn, but `urn:li:dashboard:(looker,dashboards.the,look)` is invalid.

Please do not use these characters when creating or generating urns. One approach is to use URL encoding for the characters.
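
For illustration, a minimal sketch in Python (standard library only; the dashboard name is hypothetical) of percent-encoding a reserved comma before embedding a value in a URN tuple:

```python
from urllib.parse import quote

# Hypothetical dashboard name containing a reserved comma.
raw_name = "dashboards.the,look"

# Percent-encode reserved characters: the comma becomes %2C, so the value
# no longer conflicts with the tuple separator.
safe_name = quote(raw_name, safe="")

urn = f"urn:li:dashboard:(looker,{safe_name})"
print(urn)  # urn:li:dashboard:(looker,dashboards.the%2Clook)
```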
@@ -81,14 +81,13 @@ public static JsonNode populateTopLevelKeys(JsonNode transformedNode, JsonPatch
PatchOperationType.REMOVE.equals(operationPath.getFirst())
? keys.length
: keys.length - 1;

// Skip first as it will always be blank due to path starting with /
for (int i = 1; i < endIdx; i++) {
String decodedKey = decodeValue(keys[i]);
if (parent.get(keys[i]) == null) {
String decodedKey = decodeValue(keys[i]);
((ObjectNode) parent).set(decodedKey, instance.objectNode());
}
parent = parent.get(keys[i]);
parent = parent.get(decodedKey);
}
}

@@ -185,6 +185,29 @@ public void testPatchUpstream() throws Exception {
// New entry in array because of new transformation type
assertEquals(result4.getFineGrainedLineages().get(3), fineGrainedLineage4);

JsonPatchBuilder patchOperations5 = Json.createPatchBuilder();
String urn4 =
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:s3,test-bucket/hive/folder_1/folder_2/my_dataset,DEV),c2)";
UrnArray downstreamUrns5 = new UrnArray();
downstreamUrns5.add(Urn.createFromString(urn4));
patchOperations5.add(
"/fineGrainedLineages/TRANSFORM/urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:s3,test-bucket~1hive~1folder_1~1folder_2~1my_dataset,DEV),c2)/urn:li:query:anotherQuery/urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_1,PROD),c2)",
finegrainedLineageNode5.build());
JsonPatch jsonPatch5 = patchOperations5.build();
UpstreamLineage result5 = upstreamLineageTemplate.applyPatch(result4, jsonPatch5);
// Hack because Jackson parses values to doubles instead of floats
DataMap dataMap5 = new DataMap();
dataMap5.put("confidenceScore", 1.0);
FineGrainedLineage fineGrainedLineage5 = new FineGrainedLineage(dataMap5);
fineGrainedLineage5.setUpstreams(upstreamUrns3);
fineGrainedLineage5.setDownstreams(downstreamUrns5);
fineGrainedLineage5.setTransformOperation("TRANSFORM");
fineGrainedLineage5.setUpstreamType(FineGrainedLineageUpstreamType.FIELD_SET);
fineGrainedLineage5.setDownstreamType(FineGrainedLineageDownstreamType.FIELD);
fineGrainedLineage5.setQuery(UrnUtils.getUrn("urn:li:query:anotherQuery"));
// New entry in array because of new transformation type
assertEquals(result5.getFineGrainedLineages().get(4), fineGrainedLineage5);

// Remove
JsonPatchBuilder removeOperations = Json.createPatchBuilder();
removeOperations.remove(
@@ -8,7 +8,6 @@

OL_SCHEME_TWEAKS = {
"sqlserver": "mssql",
"trino": "presto",
"awsathena": "athena",
}

6 changes: 4 additions & 2 deletions metadata-ingestion/docs/sources/iceberg/iceberg.md
@@ -18,6 +18,8 @@ This ingestion source maps the following Source System Concepts to DataHub Conce

## Troubleshooting

### [Common Issue]
### Exceptions while increasing `processing_threads`

[Provide description of common issues with this integration and steps to resolve]
Each processing thread opens several files/sockets to download manifest files from blob storage. If you see
exceptions after increasing the `processing_threads` configuration parameter, try raising the limit on open
files (e.g. using `ulimit` on Linux).
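
As a rough sketch (not part of the connector itself), the per-process limit can be inspected and raised from Python before starting an ingestion run with a higher `processing_threads` value; the appropriate numbers depend on your catalog size:

```python
import resource

# Inspect the current open-file limits for this process (Linux).
soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
print(f"open files: soft={soft}, hard={hard}")

# Raise the soft limit up to the hard limit; going beyond the hard limit
# requires elevated privileges or a change to system-wide limits.
resource.setrlimit(resource.RLIMIT_NOFILE, (hard, hard))
```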
2 changes: 1 addition & 1 deletion metadata-ingestion/docs/sources/sigma/sigma_pre.md
@@ -16,7 +16,7 @@ This source extracts the following:
| Sigma | Datahub | Notes |
|------------------------|---------------------------------------------------------------|----------------------------------|
| `Workspace` | [Container](../../metamodel/entities/container.md) | SubType `"Sigma Workspace"` |
| `Workbook` | [Container](../../metamodel/entities/container.md) | SubType `"Sigma Workbook"` |
| `Workbook` | [Dashboard](../../metamodel/entities/dashboard.md) | SubType `"Sigma Workbook"` |
| `Page` | [Dashboard](../../metamodel/entities/dashboard.md) | |
| `Element` | [Chart](../../metamodel/entities/chart.md) | |
| `Dataset` | [Dataset](../../metamodel/entities/dataset.md) | SubType `"Sigma Dataset"` |
