
Commit

Merge branch 'master' into fix/ING-761-dremio-ownership
mayurinehate authored Nov 29, 2024
2 parents 573808c + a46de1e commit 2f3d864
Showing 108 changed files with 7,427 additions and 2,647 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/build-and-test.yml
@@ -83,6 +83,7 @@ jobs:
- uses: gradle/actions/setup-gradle@v3
- name: Gradle build (and test) for NOT metadata ingestion
if: ${{ matrix.command == 'except_metadata_ingestion' && needs.setup.outputs.backend_change == 'true' }}
# datahub-schematron:cli excluded due to dependency on metadata-ingestion
run: |
./gradlew build \
-x :metadata-ingestion:build \
@@ -100,6 +101,7 @@ jobs:
-x :metadata-ingestion-modules:gx-plugin:check \
-x :datahub-frontend:build \
-x :datahub-web-react:build \
-x :metadata-integration:java:datahub-schematron:cli:test \
--parallel
- name: Gradle build (and test) for frontend
if: ${{ matrix.command == 'frontend' && needs.setup.outputs.frontend_change == 'true' }}
1 change: 1 addition & 0 deletions .github/workflows/check-datahub-jars.yml
@@ -40,4 +40,5 @@ jobs:
- name: check ${{ matrix.command }} jar
run: |
./gradlew :metadata-integration:java:${{ matrix.command }}:build --info
./gradlew :metadata-integration:java:${{ matrix.command }}:checkShadowJar
./gradlew :metadata-integration:java:${{ matrix.command }}:javadoc
11 changes: 10 additions & 1 deletion build.gradle
@@ -48,6 +48,7 @@ buildscript {
// see also datahub-frontend/play.gradle
ext.playVersion = '2.8.22'
ext.playScalaVersion = '2.13'
ext.akkaVersion = '2.6.21' // 2.7.0+ has incompatible license
ext.log4jVersion = '2.23.1'
ext.slf4jVersion = '1.7.36'
ext.logbackClassic = '1.4.14'
@@ -105,7 +106,14 @@ project.ext.spec = [
]

project.ext.externalDependency = [
'akkaHttp': "com.typesafe.akka:akka-http-core_$playScalaVersion:10.2.10",
'akkaHttp': "com.typesafe.akka:akka-http-core_$playScalaVersion:10.2.10", // max version due to licensing
'akkaActor': "com.typesafe.akka:akka-actor_$playScalaVersion:$akkaVersion",
'akkaStream': "com.typesafe.akka:akka-stream_$playScalaVersion:$akkaVersion",
'akkaActorTyped': "com.typesafe.akka:akka-actor-typed_$playScalaVersion:$akkaVersion",
'akkaSlf4j': "com.typesafe.akka:akka-slf4j_$playScalaVersion:$akkaVersion",
'akkaJackson': "com.typesafe.akka:akka-serialization-jackson_$playScalaVersion:$akkaVersion",
'akkaParsing': "com.typesafe.akka:akka-parsing_$playScalaVersion:$akkaVersion",
'akkaProtobuf': "com.typesafe.akka:akka-protobuf-v3_$playScalaVersion:$akkaVersion",
'antlr4Runtime': 'org.antlr:antlr4-runtime:4.9.3',
'antlr4': 'org.antlr:antlr4:4.9.3',
'assertJ': 'org.assertj:assertj-core:3.11.1',
@@ -350,6 +358,7 @@ allprojects {
}
}
}

}

configure(subprojects.findAll {! it.name.startsWith('spark-lineage')}) {
7 changes: 7 additions & 0 deletions datahub-frontend/play.gradle
@@ -55,6 +55,13 @@ dependencies {
implementation externalDependency.antlr4Runtime
implementation externalDependency.antlr4
implementation externalDependency.akkaHttp
implementation externalDependency.akkaActor
implementation externalDependency.akkaStream
implementation externalDependency.akkaActorTyped
implementation externalDependency.akkaSlf4j
implementation externalDependency.akkaJackson
implementation externalDependency.akkaParsing
implementation externalDependency.akkaProtobuf

implementation externalDependency.jerseyCore
implementation externalDependency.jerseyGuava
@@ -386,7 +386,11 @@ private static EntityClient initMockEntityClient(
Mockito.when(
client.searchAcrossEntities(
any(),
Mockito.eq(entityTypes),
Mockito.argThat(
argument ->
argument != null
&& argument.containsAll(entityTypes)
&& entityTypes.containsAll(argument)),
Mockito.eq(query),
Mockito.eq(filter),
Mockito.eq(start),
@@ -409,7 +413,11 @@ private static void verifyMockEntityClient(
Mockito.verify(mockClient, Mockito.times(1))
.searchAcrossEntities(
any(),
Mockito.eq(entityTypes),
Mockito.argThat(
argument ->
argument != null
&& argument.containsAll(entityTypes)
&& entityTypes.containsAll(argument)),
Mockito.eq(query),
Mockito.eq(filter),
Mockito.eq(start),
@@ -462,7 +462,11 @@ private static EntityClient initMockEntityClient(
Mockito.when(
client.searchAcrossEntities(
any(),
Mockito.eq(entityTypes),
Mockito.argThat(
argument ->
argument != null
&& argument.containsAll(entityTypes)
&& entityTypes.containsAll(argument)),
Mockito.eq(query),
Mockito.eq(filter),
Mockito.eq(start),
@@ -483,7 +487,11 @@ private static void verifyMockEntityClient(
Mockito.verify(mockClient, Mockito.times(1))
.searchAcrossEntities(
any(),
Mockito.eq(entityTypes),
Mockito.argThat(
argument ->
argument != null
&& argument.containsAll(entityTypes)
&& entityTypes.containsAll(argument)),
Mockito.eq(query),
Mockito.eq(filter),
Mockito.eq(start),
@@ -193,7 +193,9 @@ export const IngestionSourceList = () => {

const formatExtraArgs = (extraArgs): StringMapEntryInput[] => {
if (extraArgs === null || extraArgs === undefined) return [];
return extraArgs.map((entry) => ({ key: entry.key, value: entry.value }));
return extraArgs
.filter((entry) => entry.value !== null && entry.value !== undefined && entry.value !== '')
.map((entry) => ({ key: entry.key, value: entry.value }));
};

const createOrUpdateIngestionSource = (
8 changes: 8 additions & 0 deletions datahub-web-react/src/app/ingest/source/builder/sources.json
@@ -317,5 +317,13 @@
"displayName": "CassandraDB",
"docsUrl": "https://datahubproject.io/docs/generated/ingestion/sources/cassandra",
"recipe": "source:\n type: cassandra\n config:\n # Credentials for on prem cassandra\n contact_point: localhost\n port: 9042\n username: admin\n password: password\n\n # Or\n # Credentials Astra Cloud\n #cloud_config:\n # secure_connect_bundle: Path to Secure Connect Bundle (.zip)\n # token: Application Token\n\n # Optional Allow / Deny extraction of particular keyspaces.\n keyspace_pattern:\n allow: [.*]\n\n # Optional Allow / Deny extraction of particular tables.\n table_pattern:\n allow: [.*]"
},
{
"urn": "urn:li:dataPlatform:iceberg",
"name": "iceberg",
"displayName": "Iceberg",
"description": "Ingest databases and tables from any Iceberg catalog implementation",
"docsUrl": "https://datahubproject.io/docs/generated/ingestion/sources/iceberg",
"recipe": "source:\n type: \"iceberg\"\n config:\n env: dev\n # each thread will open internet connections to fetch manifest files independently, \n # this value needs to be adjusted with ulimit\n processing_threads: 1 \n # a single catalog definition with a form of a dictionary\n catalog: \n demo: # name of the catalog\n type: \"rest\" # other types are available\n uri: \"uri\"\n s3.access-key-id: \"access-key\"\n s3.secret-access-key: \"secret-access-key\"\n s3.region: \"aws-region\"\n profiling:\n enabled: false\n"
}
]
1 change: 1 addition & 0 deletions docs-website/sidebars.js
@@ -989,6 +989,7 @@ module.exports = {
// "metadata-ingestion/examples/structured_properties/README"
// "smoke-test/tests/openapi/README"
// "docs/SECURITY_STANCE"
// "metadata-integration/java/datahub-schematron/README"
// ],
],
};
37 changes: 37 additions & 0 deletions docs/automations/snowflake-tag-propagation.md
@@ -4,6 +4,8 @@ import FeatureAvailability from '@site/src/components/FeatureAvailability';

<FeatureAvailability saasOnly />

> Note that this Automation is currently in open **Beta**. If you have any questions or issues, please reach out to your Acryl representative.

## Introduction

Snowflake Tag Propagation is an automation that allows you to sync DataHub Glossary Terms and Tags on
@@ -15,6 +17,41 @@ both columns and tables back to Snowflake. This automation is available in DataH
- Automatically Add DataHub Tags to Snowflake Tables and Columns
- Automatically Remove DataHub Glossary Terms and Tags from Snowflake Tables and Columns when they are removed in DataHub

## Prerequisites

### Permissions Required for Tag Management

- `CREATE TAG`: Required to create new tags in Snowflake.
Ensure the user or role has this privilege on the specific schema or database where tags will be created.
- `APPLY TAG`: Required to assign tags to Snowflake objects such as tables, columns, or other database objects.
This permission must be granted at the database, schema, or object level depending on the scope.


### Permissions Required for Object Access

- `USAGE` on the database and schema: Allows access to the database and schema to view and apply changes.
- `SELECT` on the objects (tables, views, etc.): Enables the automation to read metadata and verify existing tags.

### Example Permission Grant Statements

To grant the necessary permissions for a specific role (DATAHUB_AUTOMATION_ROLE), you can use the following SQL commands:

```sql
-- Tag management permissions
GRANT CREATE TAG ON SCHEMA your_database.your_schema TO ROLE DATAHUB_AUTOMATION_ROLE;
GRANT APPLY TAG ON SCHEMA your_database.your_schema TO ROLE DATAHUB_AUTOMATION_ROLE;

-- Object access for metadata operations
GRANT USAGE ON DATABASE your_database TO ROLE DATAHUB_AUTOMATION_ROLE;
GRANT USAGE ON SCHEMA your_database.your_schema TO ROLE DATAHUB_AUTOMATION_ROLE;
GRANT SELECT ON ALL TABLES IN SCHEMA your_database.your_schema TO ROLE DATAHUB_AUTOMATION_ROLE;

-- Future privileges for tagging
GRANT SELECT ON FUTURE TABLES IN SCHEMA your_database.your_schema TO ROLE DATAHUB_AUTOMATION_ROLE;
GRANT APPLY TAG ON FUTURE TABLES IN SCHEMA your_database.your_schema TO ROLE DATAHUB_AUTOMATION_ROLE;
```


## Enabling Snowflake Tag Sync

1. **Navigate to Automations**: Click on 'Govern' > 'Automations' in the navigation bar.
3 changes: 3 additions & 0 deletions docs/how/updating-datahub.md
@@ -88,6 +88,9 @@ This file documents any backwards-incompatible changes in DataHub and assists pe
### Other Notable Changes

- Downgrade to previous version is not automatically supported.
- Data Product Properties Unset side effect introduced
  - Previously, Data Products could be set as linked to multiple Datasets if modified directly via the REST API rather than linked through the UI or GraphQL. This side effect aligns the REST API behavior with the GraphQL behavior by enforcing the 1-to-1 constraint between Data Products and Datasets.
  - NOTE: There is a pathological write pattern for Data Products that can cause write-processing issues with this side effect. If you constantly change all of the Datasets associated with a Data Product back and forth between multiple Data Products, it will result in a high volume of writes due to the need to unset the previous associations.

## 0.14.0.2

27 changes: 25 additions & 2 deletions docs/managed-datahub/release-notes/v_0_3_7.md
@@ -7,7 +7,7 @@ Release Availability Date

Recommended CLI/SDK
---
- `v0.14.1.11` with release notes at https://github.com/datahub/datahub/releases/tag/v0.14.1.11
- `v0.14.1.12` with release notes at https://github.com/datahub/datahub/releases/tag/v0.14.1.12

If you are using an older CLI/SDK version, then please upgrade it. This applies for all CLI/SDK usages, if you are using it through your terminal, GitHub Actions, Airflow, in Python SDK somewhere, Java SDK, etc. This is a strong recommendation to upgrade, as we keep on pushing fixes in the CLI, and it helps us support you better.

@@ -19,6 +19,26 @@ If you are using an older CLI/SDK version, then please upgrade it. This applies
## Release Changelog
---

### v0.3.7.4

- [#11935](https://github.com/datahub-project/datahub/pull/11935) - Added environment variable for enabling stricter URN validation rules `STRICT_URN_VALIDATION_ENABLED` [[1](https://datahubproject.io/docs/what/urn/#restrictions)].
- [Automations] Filter out self-nodes in glossary term propagation
- [Remote Executor] Allow dashes in executor ids.
- [Search] Fix Nested Filter Counts in Primary Search
- [Search] Fix white screen of death on empty search result
- [Columns Tab] Support searching nested struct columns correctly in V2 UI.
- [Logo] Fix fit of custom logo for V2 UI nav bar.
- [Structured Properties] Better handling for special characters in structured properties
- [Lineage] Improvements to handling lineage cycles
- [Metadata Tests] Improve Reliability of Metadata Tests Action Application
- [Slack Integration] Minor improvement in authentication redirect to integrate with Slack
- [Columns Tab] Properly display nullable status in column sidebar (bug fix)
- [Columns Tab] Fixing merging of sibling schemas between V2 and V1 field paths.
- [Documentation] Support group authors for institutional memory aspect


### v0.3.7

- All changes in https://github.com/datahub-project/datahub/releases/tag/v0.14.1
- Note Breaking Changes: https://datahubproject.io/docs/how/updating-datahub/#0141

@@ -96,7 +116,7 @@ If you are using an older CLI/SDK version, then please upgrade it. This applies
- Improved UX for setting up and managing SSO

- Ingestion changes
- In addition to the improvements listed here: https://github.com/acryldata/datahub/releases/tag/v0.14.1.11
- In addition to the improvements listed here: https://github.com/acryldata/datahub/releases/tag/v0.14.1.12
- PowerBI: Support for PowerBI Apps and cross-workspace lineage
- Fivetran: Major improvements to configurability and improved reliability with large Fivetran setups
- Snowflake & BigQuery: Improved handling of temporary tables and swap statements when generating lineage
@@ -120,3 +140,6 @@ If you are using an older CLI/SDK version, then please upgrade it. This applies
- (system / internal) Exclude form-prompt tests in live Metadata Tests evaluation
- (system / internal) Exclude form-prompt tests in stored Metadata Test results
- Elasticsearch reindex time limit of 8h removed
- Data Product Properties Unset side effect introduced
  - Previously, Data Products could be set as linked to multiple Datasets if modified directly via the REST API rather than linked through the UI or GraphQL. This side effect aligns the REST API behavior with the GraphQL behavior by enforcing the 1-to-1 constraint between Data Products and Datasets.
  - NOTE: There is a pathological write pattern for Data Products that can cause write-processing issues with this side effect. If you constantly change all of the Datasets associated with a Data Product back and forth between multiple Data Products, it will result in a high volume of writes due to the need to unset the previous associations.
16 changes: 11 additions & 5 deletions docs/what/urn.md
@@ -35,11 +35,17 @@ urn:li:dataset:(urn:li:dataPlatform:hdfs,PageViewEvent,EI)

## Restrictions

There are a few restrictions when creating an urn:
There are a few restrictions when creating a URN:

1. Commas are reserved character in URN fields: `,`
2. Parentheses are reserved characters in URN fields: `(` or `)`
3. Colons are reserved characters in URN fields: `:`
4. Urn separator UTF-8 character ``
The following characters are not allowed anywhere in a URN:

1. Parentheses are reserved characters in URN fields: `(` or `)`
2. The "unit separator" unicode character `` (U+241F)

The following characters are not allowed within a URN tuple:

1. Commas are reserved characters in URN tuples: `,`

Example: `urn:li:dashboard:(looker,dashboards.thelook)` is a valid urn, but `urn:li:dashboard:(looker,dashboards.the,look)` is invalid.

Please do not use these characters when creating or generating urns. One approach is to use URL encoding for the characters.
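
For illustration, a minimal sketch in Python (standard library only; the dashboard name is hypothetical) of percent-encoding a reserved comma before embedding a value in a URN tuple:

```python
from urllib.parse import quote

# Hypothetical dashboard name containing a reserved comma.
raw_name = "dashboards.the,look"

# Percent-encode reserved characters: the comma becomes %2C, so the value
# no longer conflicts with the tuple separator.
safe_name = quote(raw_name, safe="")

urn = f"urn:li:dashboard:(looker,{safe_name})"
print(urn)  # urn:li:dashboard:(looker,dashboards.the%2Clook)
```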
@@ -81,14 +81,13 @@ public static JsonNode populateTopLevelKeys(JsonNode transformedNode, JsonPatch
PatchOperationType.REMOVE.equals(operationPath.getFirst())
? keys.length
: keys.length - 1;

// Skip first as it will always be blank due to path starting with /
for (int i = 1; i < endIdx; i++) {
String decodedKey = decodeValue(keys[i]);
if (parent.get(keys[i]) == null) {
String decodedKey = decodeValue(keys[i]);
((ObjectNode) parent).set(decodedKey, instance.objectNode());
}
parent = parent.get(keys[i]);
parent = parent.get(decodedKey);
}
}

@@ -185,6 +185,29 @@ public void testPatchUpstream() throws Exception {
// New entry in array because of new transformation type
assertEquals(result4.getFineGrainedLineages().get(3), fineGrainedLineage4);

JsonPatchBuilder patchOperations5 = Json.createPatchBuilder();
String urn4 =
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:s3,test-bucket/hive/folder_1/folder_2/my_dataset,DEV),c2)";
UrnArray downstreamUrns5 = new UrnArray();
downstreamUrns5.add(Urn.createFromString(urn4));
patchOperations5.add(
"/fineGrainedLineages/TRANSFORM/urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:s3,test-bucket~1hive~1folder_1~1folder_2~1my_dataset,DEV),c2)/urn:li:query:anotherQuery/urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_1,PROD),c2)",
finegrainedLineageNode5.build());
JsonPatch jsonPatch5 = patchOperations5.build();
UpstreamLineage result5 = upstreamLineageTemplate.applyPatch(result4, jsonPatch5);
// Hack because Jackson parses values to doubles instead of floats
DataMap dataMap5 = new DataMap();
dataMap5.put("confidenceScore", 1.0);
FineGrainedLineage fineGrainedLineage5 = new FineGrainedLineage(dataMap5);
fineGrainedLineage5.setUpstreams(upstreamUrns3);
fineGrainedLineage5.setDownstreams(downstreamUrns5);
fineGrainedLineage5.setTransformOperation("TRANSFORM");
fineGrainedLineage5.setUpstreamType(FineGrainedLineageUpstreamType.FIELD_SET);
fineGrainedLineage5.setDownstreamType(FineGrainedLineageDownstreamType.FIELD);
fineGrainedLineage5.setQuery(UrnUtils.getUrn("urn:li:query:anotherQuery"));
// New entry in array because of new transformation type
assertEquals(result5.getFineGrainedLineages().get(4), fineGrainedLineage5);

// Remove
JsonPatchBuilder removeOperations = Json.createPatchBuilder();
removeOperations.remove(
@@ -8,7 +8,6 @@

OL_SCHEME_TWEAKS = {
"sqlserver": "mssql",
"trino": "presto",
"awsathena": "athena",
}

6 changes: 4 additions & 2 deletions metadata-ingestion/docs/sources/iceberg/iceberg.md
@@ -18,6 +18,8 @@ This ingestion source maps the following Source System Concepts to DataHub Conce

## Troubleshooting

### [Common Issue]
### Exceptions while increasing `processing_threads`

[Provide description of common issues with this integration and steps to resolve]
Each processing thread opens several files/sockets to download manifest files from blob storage. If you see
exceptions after increasing the `processing_threads` configuration parameter, try raising the limit on open
files (e.g. using `ulimit` on Linux).
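
As a rough sketch (not part of the connector itself), the per-process limit can be inspected and raised from Python before starting an ingestion run with a higher `processing_threads` value; the appropriate numbers depend on your catalog size:

```python
import resource

# Inspect the current open-file limits for this process (Linux).
soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
print(f"open files: soft={soft}, hard={hard}")

# Raise the soft limit up to the hard limit; going beyond the hard limit
# requires elevated privileges or a change to system-wide limits.
resource.setrlimit(resource.RLIMIT_NOFILE, (hard, hard))
```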
2 changes: 1 addition & 1 deletion metadata-ingestion/docs/sources/sigma/sigma_pre.md
@@ -16,7 +16,7 @@ This source extracts the following:
| Sigma | Datahub | Notes |
|------------------------|---------------------------------------------------------------|----------------------------------|
| `Workspace` | [Container](../../metamodel/entities/container.md) | SubType `"Sigma Workspace"` |
| `Workbook` | [Container](../../metamodel/entities/container.md) | SubType `"Sigma Workbook"` |
| `Workbook` | [Dashboard](../../metamodel/entities/dashboard.md) | SubType `"Sigma Workbook"` |
| `Page` | [Dashboard](../../metamodel/entities/dashboard.md) | |
| `Element` | [Chart](../../metamodel/entities/chart.md) | |
| `Dataset` | [Dataset](../../metamodel/entities/dataset.md) | SubType `"Sigma Dataset"` |
