From cff32e9c742f9bff2db686445e3f9cddaa6caf38 Mon Sep 17 00:00:00 2001
From: Aseem Bansal
Date: Sat, 9 Dec 2023 05:37:00 +0530
Subject: [PATCH 001/540] fix(ingest/transformer): correct registration (#9418)
---
metadata-ingestion/setup.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py
index dac865d2dac37e..e894cbf043338d 100644
--- a/metadata-ingestion/setup.py
+++ b/metadata-ingestion/setup.py
@@ -650,7 +650,7 @@
"simple_add_dataset_properties = datahub.ingestion.transformer.add_dataset_properties:SimpleAddDatasetProperties",
"pattern_add_dataset_schema_terms = datahub.ingestion.transformer.add_dataset_schema_terms:PatternAddDatasetSchemaTerms",
"pattern_add_dataset_schema_tags = datahub.ingestion.transformer.add_dataset_schema_tags:PatternAddDatasetSchemaTags",
- "extract_owners_from_tags = datahub.ingestion.transformer.extract_ownership_from_tags:ExtractOwnersFromTagsTransformer",
+ "extract_ownership_from_tags = datahub.ingestion.transformer.extract_ownership_from_tags:ExtractOwnersFromTagsTransformer",
],
"datahub.ingestion.sink.plugins": [
"file = datahub.ingestion.sink.file:FileSink",
From e4d8dcbc02d2dae73b7054813b900af239795485 Mon Sep 17 00:00:00 2001
From: Andrew Sikowitz
Date: Mon, 11 Dec 2023 09:43:23 -0500
Subject: [PATCH 002/540] docs(ingest/sql-queries): Rearrange sections (#9426)
---
.../sql-queries/{sql-queries.md => sql-queries_pre.md} | 5 +++--
.../src/datahub/ingestion/source/sql_queries.py | 3 ++-
2 files changed, 5 insertions(+), 3 deletions(-)
rename metadata-ingestion/docs/sources/sql-queries/{sql-queries.md => sql-queries_pre.md} (67%)
diff --git a/metadata-ingestion/docs/sources/sql-queries/sql-queries.md b/metadata-ingestion/docs/sources/sql-queries/sql-queries_pre.md
similarity index 67%
rename from metadata-ingestion/docs/sources/sql-queries/sql-queries.md
rename to metadata-ingestion/docs/sources/sql-queries/sql-queries_pre.md
index e829b4366bb847..2d915f0bcf84db 100644
--- a/metadata-ingestion/docs/sources/sql-queries/sql-queries.md
+++ b/metadata-ingestion/docs/sources/sql-queries/sql-queries_pre.md
@@ -1,8 +1,9 @@
-### Example Queries File
+#### Example Queries File
```json
{"query": "SELECT x FROM my_table", "timestamp": 1689232738.051, "user": "user_a", "downstream_tables": [], "upstream_tables": ["my_database.my_schema.my_table"]}
{"query": "INSERT INTO my_table VALUES (1, 'a')", "timestamp": 1689232737.669, "user": "user_b", "downstream_tables": ["my_database.my_schema.my_table"], "upstream_tables": []}
```
-Note that this is not a valid standard JSON file, but rather a file containing one JSON object per line.
+Note that this file does not represent a single JSON object, but instead newline-delimited JSON, in which
+each line is a separate JSON object.
diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql_queries.py b/metadata-ingestion/src/datahub/ingestion/source/sql_queries.py
index 58e9682df935e3..c3d6657c81fa70 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/sql_queries.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/sql_queries.py
@@ -93,8 +93,9 @@ def compute_stats(self) -> None:
@capability(SourceCapability.LINEAGE_FINE, "Parsed from SQL queries")
class SqlQueriesSource(Source):
"""
- This source reads a specifically-formatted JSON file containing SQL queries and parses them to generate lineage.
+ This source reads a newline-delimited JSON file containing SQL queries and parses them to generate lineage.
+ ### Query File Format
This file should contain one JSON object per line, with the following fields:
- query: string - The SQL query to parse.
- timestamp (optional): number - The timestamp of the query, in seconds since the epoch.
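Both the doc page and the docstring above describe the same newline-delimited format. A minimal reader sketch (illustrative only; field names are taken from the documented example, not from the source's actual implementation):
```python
# Sketch: iterate over a newline-delimited JSON query file as described above.
import json
from typing import Iterator


def read_query_file(path: str) -> Iterator[dict]:
    with open(path) as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            entry = json.loads(line)  # one JSON object per line
            assert "query" in entry  # "query" is required; the other fields are optional
            yield entry
```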
From 5ac854dcb1f1516a5325ef5bbac466d08c016fcb Mon Sep 17 00:00:00 2001
From: Salman-Apptware <101426513+Salman-Apptware@users.noreply.github.com>
Date: Mon, 11 Dec 2023 22:43:58 +0530
Subject: [PATCH 003/540] fix(ui): Adjusting the view of the Column Stats
(#9430)
---
.../shared/tabs/Dataset/Stats/snapshot/ColumnStats.tsx | 9 ++++++++-
1 file changed, 8 insertions(+), 1 deletion(-)
diff --git a/datahub-web-react/src/app/entity/shared/tabs/Dataset/Stats/snapshot/ColumnStats.tsx b/datahub-web-react/src/app/entity/shared/tabs/Dataset/Stats/snapshot/ColumnStats.tsx
index 080fba66199774..0cbb79dde49cde 100644
--- a/datahub-web-react/src/app/entity/shared/tabs/Dataset/Stats/snapshot/ColumnStats.tsx
+++ b/datahub-web-react/src/app/entity/shared/tabs/Dataset/Stats/snapshot/ColumnStats.tsx
@@ -14,6 +14,8 @@ type Props = {
const StatSection = styled.div`
padding: 20px 20px;
overflow: auto;
+ display: flex;
+ flex-direction: column;
`;
const NameText = styled(Typography.Text)`
@@ -162,7 +164,12 @@ export default function ColumnStats({ columnStats }: Props) {
return (
Column Stats
-
+
);
}
From 8a1122049c02c4929d8029c25dac517e5fdafc48 Mon Sep 17 00:00:00 2001
From: RyanHolstien
Date: Mon, 11 Dec 2023 14:25:43 -0800
Subject: [PATCH 004/540] feat(patch): support fine grained lineage patches
(#9408)
Co-authored-by: Harshal Sheth
---
.../dataset/UpstreamLineageTemplate.java | 271 ++++++++++++-
.../registry/UpstreamLineageTemplateTest.java | 359 ++++++++++++++++++
.../java/com/linkedin/metadata/Constants.java | 5 +
.../src/datahub/specific/dataset.py | 107 +++++-
.../unit/patch/complex_dataset_patch.json | 45 ++-
.../tests/unit/patch/test_patch_builder.py | 16 +
.../dataset/UpstreamLineagePatchBuilder.java | 231 ++++++++++-
.../java/datahub/client/patch/PatchTest.java | 24 +-
8 files changed, 1023 insertions(+), 35 deletions(-)
create mode 100644 entity-registry/src/test/java/com/linkedin/metadata/models/registry/UpstreamLineageTemplateTest.java
diff --git a/entity-registry/src/main/java/com/linkedin/metadata/models/registry/template/dataset/UpstreamLineageTemplate.java b/entity-registry/src/main/java/com/linkedin/metadata/models/registry/template/dataset/UpstreamLineageTemplate.java
index 35816895669beb..81a4065dedb1a2 100644
--- a/entity-registry/src/main/java/com/linkedin/metadata/models/registry/template/dataset/UpstreamLineageTemplate.java
+++ b/entity-registry/src/main/java/com/linkedin/metadata/models/registry/template/dataset/UpstreamLineageTemplate.java
@@ -1,20 +1,41 @@
package com.linkedin.metadata.models.registry.template.dataset;
+import static com.fasterxml.jackson.databind.node.JsonNodeFactory.*;
+import static com.linkedin.metadata.Constants.*;
+
import com.fasterxml.jackson.databind.JsonNode;
+import com.fasterxml.jackson.databind.node.ArrayNode;
+import com.fasterxml.jackson.databind.node.ObjectNode;
+import com.google.common.collect.Streams;
+import com.linkedin.common.urn.Urn;
+import com.linkedin.common.urn.UrnUtils;
import com.linkedin.data.template.RecordTemplate;
import com.linkedin.dataset.FineGrainedLineageArray;
import com.linkedin.dataset.UpstreamArray;
import com.linkedin.dataset.UpstreamLineage;
-import com.linkedin.metadata.models.registry.template.ArrayMergingTemplate;
+import com.linkedin.metadata.models.registry.template.CompoundKeyTemplate;
import java.util.Collections;
+import java.util.concurrent.atomic.AtomicReference;
+import java.util.stream.Collectors;
import javax.annotation.Nonnull;
+import javax.annotation.Nullable;
-public class UpstreamLineageTemplate implements ArrayMergingTemplate<UpstreamLineage> {
+public class UpstreamLineageTemplate extends CompoundKeyTemplate<UpstreamLineage> {
+ // Fields
private static final String UPSTREAMS_FIELD_NAME = "upstreams";
private static final String DATASET_FIELD_NAME = "dataset";
+ private static final String FINE_GRAINED_LINEAGES_FIELD_NAME = "fineGrainedLineages";
+ private static final String FINE_GRAINED_UPSTREAM_TYPE = "upstreamType";
+ private static final String FINE_GRAINED_UPSTREAMS = "upstreams";
+ private static final String FINE_GRAINED_DOWNSTREAM_TYPE = "downstreamType";
+ private static final String FINE_GRAINED_DOWNSTREAMS = "downstreams";
+ private static final String FINE_GRAINED_TRANSFORMATION_OPERATION = "transformOperation";
+ private static final String FINE_GRAINED_CONFIDENCE_SCORE = "confidenceScore";
- // TODO: Fine Grained Lineages not patchable at this time, they don't have a well established key
+ // Template support
+ private static final String NONE_TRANSFORMATION_TYPE = "NONE";
+ private static final Float DEFAULT_CONFIDENCE_SCORE = 1.0f;
@Override
public UpstreamLineage getSubtype(RecordTemplate recordTemplate) throws ClassCastException {
@@ -42,14 +63,250 @@ public UpstreamLineage getDefault() {
@Nonnull
@Override
public JsonNode transformFields(JsonNode baseNode) {
- return arrayFieldToMap(
- baseNode, UPSTREAMS_FIELD_NAME, Collections.singletonList(DATASET_FIELD_NAME));
+ JsonNode transformedNode =
+ arrayFieldToMap(
+ baseNode, UPSTREAMS_FIELD_NAME, Collections.singletonList(DATASET_FIELD_NAME));
+ ((ObjectNode) transformedNode)
+ .set(
+ FINE_GRAINED_LINEAGES_FIELD_NAME,
+ combineAndTransformFineGrainedLineages(
+ transformedNode.get(FINE_GRAINED_LINEAGES_FIELD_NAME)));
+
+ return transformedNode;
}
@Nonnull
@Override
public JsonNode rebaseFields(JsonNode patched) {
- return transformedMapToArray(
- patched, UPSTREAMS_FIELD_NAME, Collections.singletonList(DATASET_FIELD_NAME));
+ JsonNode rebasedNode =
+ transformedMapToArray(
+ patched, UPSTREAMS_FIELD_NAME, Collections.singletonList(DATASET_FIELD_NAME));
+ ((ObjectNode) rebasedNode)
+ .set(
+ FINE_GRAINED_LINEAGES_FIELD_NAME,
+ reconstructFineGrainedLineages(rebasedNode.get(FINE_GRAINED_LINEAGES_FIELD_NAME)));
+ return rebasedNode;
+ }
+
+ /**
+ * Combines fine grained lineage array into a map using upstream and downstream types as keys,
+ * defaulting when not present. Due to this construction, patches will look like: path:
+ * /fineGrainedLineages/TRANSFORMATION_OPERATION/(upstreamType || downstreamType)/TYPE/FIELD_URN,
+ * op: ADD/REMOVE, value: float (confidenceScore). Due to the way FineGrainedLineage was designed,
+ * it doesn't necessarily have a consistent key we can reference, so this specialized method
+ * mimics the arrayFieldToMap of the super class with the specialization that it does not put the
+ * full value of the aspect at the end of the key, just the particular array. This prevents
+ * unintended overwrites through improper MCP construction that is technically allowed by the
+ * schema when combining under fields that form the natural key.
+ *
+ * @param fineGrainedLineages the fine grained lineage array node
+ * @return the modified {@link JsonNode} with array fields transformed to maps
+ */
+ private JsonNode combineAndTransformFineGrainedLineages(@Nullable JsonNode fineGrainedLineages) {
+ ObjectNode mapNode = instance.objectNode();
+ if (!(fineGrainedLineages instanceof ArrayNode) || fineGrainedLineages.isEmpty()) {
+ return mapNode;
+ }
+ JsonNode lineageCopy = fineGrainedLineages.deepCopy();
+
+ lineageCopy
+ .elements()
+ .forEachRemaining(
+ node -> {
+ JsonNode nodeClone = node.deepCopy();
+ String transformationOperation =
+ nodeClone.has(FINE_GRAINED_TRANSFORMATION_OPERATION)
+ ? nodeClone.get(FINE_GRAINED_TRANSFORMATION_OPERATION).asText()
+ : NONE_TRANSFORMATION_TYPE;
+
+ if (!mapNode.has(transformationOperation)) {
+ mapNode.set(transformationOperation, instance.objectNode());
+ }
+ ObjectNode transformationOperationNode =
+ (ObjectNode) mapNode.get(transformationOperation);
+
+ Float confidenceScore =
+ nodeClone.has(FINE_GRAINED_CONFIDENCE_SCORE)
+ ? nodeClone.get(FINE_GRAINED_CONFIDENCE_SCORE).floatValue()
+ : DEFAULT_CONFIDENCE_SCORE;
+
+ String upstreamType =
+ nodeClone.has(FINE_GRAINED_UPSTREAM_TYPE)
+ ? nodeClone.get(FINE_GRAINED_UPSTREAM_TYPE).asText()
+ : null;
+ String downstreamType =
+ nodeClone.has(FINE_GRAINED_DOWNSTREAM_TYPE)
+ ? nodeClone.get(FINE_GRAINED_DOWNSTREAM_TYPE).asText()
+ : null;
+ ArrayNode upstreams =
+ nodeClone.has(FINE_GRAINED_UPSTREAMS)
+ ? (ArrayNode) nodeClone.get(FINE_GRAINED_UPSTREAMS)
+ : null;
+ ArrayNode downstreams =
+ nodeClone.has(FINE_GRAINED_DOWNSTREAMS)
+ ? (ArrayNode) nodeClone.get(FINE_GRAINED_DOWNSTREAMS)
+ : null;
+
+ // Handle upstreams
+ if (upstreamType == null) {
+ // Determine default type
+ Urn upstreamUrn =
+ upstreams != null ? UrnUtils.getUrn(upstreams.get(0).asText()) : null;
+ if (upstreamUrn != null
+ && SCHEMA_FIELD_ENTITY_NAME.equals(upstreamUrn.getEntityType())) {
+ upstreamType = FINE_GRAINED_LINEAGE_FIELD_SET_TYPE;
+ } else {
+ upstreamType = FINE_GRAINED_LINEAGE_DATASET_TYPE;
+ }
+ }
+ if (!transformationOperationNode.has(FINE_GRAINED_UPSTREAM_TYPE)) {
+ transformationOperationNode.set(FINE_GRAINED_UPSTREAM_TYPE, instance.objectNode());
+ }
+ ObjectNode upstreamTypeNode =
+ (ObjectNode) transformationOperationNode.get(FINE_GRAINED_UPSTREAM_TYPE);
+ if (!upstreamTypeNode.has(upstreamType)) {
+ upstreamTypeNode.set(upstreamType, instance.objectNode());
+ }
+ if (upstreams != null) {
+ addUrnsToSubType(upstreamTypeNode, upstreams, upstreamType, confidenceScore);
+ }
+
+ // Handle downstreams
+ if (downstreamType == null) {
+ // Determine default type
+ if (downstreams != null && downstreams.size() > 1) {
+ downstreamType = FINE_GRAINED_LINEAGE_FIELD_SET_TYPE;
+ } else {
+ downstreamType = FINE_GRAINED_LINEAGE_FIELD_TYPE;
+ }
+ }
+ if (!transformationOperationNode.has(FINE_GRAINED_DOWNSTREAM_TYPE)) {
+ transformationOperationNode.set(
+ FINE_GRAINED_DOWNSTREAM_TYPE, instance.objectNode());
+ }
+ ObjectNode downstreamTypeNode =
+ (ObjectNode) transformationOperationNode.get(FINE_GRAINED_DOWNSTREAM_TYPE);
+ if (!downstreamTypeNode.has(downstreamType)) {
+ downstreamTypeNode.set(downstreamType, instance.objectNode());
+ }
+ if (downstreams != null) {
+ addUrnsToSubType(downstreamTypeNode, downstreams, downstreamType, confidenceScore);
+ }
+ });
+ return mapNode;
+ }
+
+ private void addUrnsToSubType(
+ JsonNode superType, ArrayNode urnsList, String subType, Float confidenceScore) {
+ ObjectNode upstreamSubTypeNode = (ObjectNode) superType.get(subType);
+ // Will overwrite repeat urns with different confidence scores with the most recently seen
+ upstreamSubTypeNode.setAll(
+ Streams.stream(urnsList.elements())
+ .map(JsonNode::asText)
+ .distinct()
+ .collect(Collectors.toMap(urn -> urn, urn -> instance.numberNode(confidenceScore))));
+ }
+
+ /**
+ * Takes the transformed fine grained lineages map from pre-processing and reconstructs an array
+ * of FineGrainedLineages. Avoids producing side effects by copying nodes; use the resulting node
+ * and not the original.
+ *
+ * @param transformedFineGrainedLineages the transformed fine grained lineage map
+ * @return the modified {@link JsonNode} formatted consistent with the original schema
+ */
+ private ArrayNode reconstructFineGrainedLineages(JsonNode transformedFineGrainedLineages) {
+ if (transformedFineGrainedLineages instanceof ArrayNode) {
+ // We already have an ArrayNode, no need to transform. This happens during `replace`
+ // operations
+ return (ArrayNode) transformedFineGrainedLineages;
+ }
+ ObjectNode mapNode = (ObjectNode) transformedFineGrainedLineages;
+ ArrayNode arrayNode = instance.arrayNode();
+
+ mapNode
+ .fieldNames()
+ .forEachRemaining(
+ transformationOperation -> {
+ final ObjectNode transformationOperationNode =
+ (ObjectNode) mapNode.get(transformationOperation);
+ final ObjectNode upstreamType =
+ transformationOperationNode.has(FINE_GRAINED_UPSTREAM_TYPE)
+ ? (ObjectNode) transformationOperationNode.get(FINE_GRAINED_UPSTREAM_TYPE)
+ : instance.objectNode();
+ final ObjectNode downstreamType =
+ transformationOperationNode.has(FINE_GRAINED_DOWNSTREAM_TYPE)
+ ? (ObjectNode) transformationOperationNode.get(FINE_GRAINED_DOWNSTREAM_TYPE)
+ : instance.objectNode();
+
+ // Handle upstreams
+ if (!upstreamType.isEmpty()) {
+ populateTypeNode(
+ upstreamType,
+ transformationOperation,
+ FINE_GRAINED_UPSTREAM_TYPE,
+ FINE_GRAINED_UPSTREAMS,
+ FINE_GRAINED_DOWNSTREAM_TYPE,
+ arrayNode);
+ }
+
+ // Handle downstreams
+ if (!downstreamType.isEmpty()) {
+ populateTypeNode(
+ downstreamType,
+ transformationOperation,
+ FINE_GRAINED_DOWNSTREAM_TYPE,
+ FINE_GRAINED_DOWNSTREAMS,
+ FINE_GRAINED_UPSTREAM_TYPE,
+ arrayNode);
+ }
+ });
+
+ return arrayNode;
+ }
+
+ private void populateTypeNode(
+ JsonNode typeNode,
+ String transformationOperation,
+ String typeName,
+ String arrayTypeName,
+ String defaultTypeName,
+ ArrayNode arrayNode) {
+ typeNode
+ .fieldNames()
+ .forEachRemaining(
+ subTypeName -> {
+ ObjectNode subType = (ObjectNode) typeNode.get(subTypeName);
+ if (!subType.isEmpty()) {
+ ObjectNode fineGrainedLineage = instance.objectNode();
+ AtomicReference<Float> minimumConfidenceScore = new AtomicReference<>(1.0f);
+
+ fineGrainedLineage.put(typeName, subTypeName);
+ fineGrainedLineage.put(
+ FINE_GRAINED_TRANSFORMATION_OPERATION, transformationOperation);
+ // Array to actually be filled out
+ fineGrainedLineage.set(arrayTypeName, instance.arrayNode());
+ // Added to pass model validation: we have no way of consistently pairing upstreams
+ // and downstreams within fine grained lineages, since multiple downstream types can
+ // be paired with a single transform operation, so we just set a default type because
+ // it's a required property
+ fineGrainedLineage.put(defaultTypeName, FINE_GRAINED_LINEAGE_FIELD_SET_TYPE);
+ subType
+ .fieldNames()
+ .forEachRemaining(
+ subTypeKey -> {
+ ((ArrayNode) fineGrainedLineage.get(arrayTypeName)).add(subTypeKey);
+ Float scoreValue = subType.get(subTypeKey).floatValue();
+ if (scoreValue <= minimumConfidenceScore.get()) {
+ minimumConfidenceScore.set(scoreValue);
+ fineGrainedLineage.set(
+ FINE_GRAINED_CONFIDENCE_SCORE,
+ instance.numberNode(minimumConfidenceScore.get()));
+ }
+ });
+ arrayNode.add(fineGrainedLineage);
+ }
+ });
}
}
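For orientation, the Javadoc on combineAndTransformFineGrainedLineages above describes patch paths of the form /fineGrainedLineages/TRANSFORMATION_OPERATION/(upstreamType || downstreamType)/TYPE/URN with the confidence score as the value. A hedged sketch of raw JSON Patch operations in that shape (example URNs and score; the ingestion-side builder later in this patch URL-encodes the URN segment):
```python
# Illustrative JSON Patch operations targeting the fineGrainedLineages key layout
# described above; the URNs and score are made up for the example.
add_op = {
    "op": "add",
    "path": (
        "/fineGrainedLineages/TRANSFORM/upstreamType/FIELD_SET/"
        "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,tbl,PROD),c1)"
    ),
    "value": 1.0,  # confidenceScore
}
remove_op = {
    "op": "remove",
    "path": (
        "/fineGrainedLineages/TRANSFORM/upstreamType/FIELD_SET/"
        "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,tbl,PROD),c1)"
    ),
}
```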
diff --git a/entity-registry/src/test/java/com/linkedin/metadata/models/registry/UpstreamLineageTemplateTest.java b/entity-registry/src/test/java/com/linkedin/metadata/models/registry/UpstreamLineageTemplateTest.java
new file mode 100644
index 00000000000000..07982a87be56cb
--- /dev/null
+++ b/entity-registry/src/test/java/com/linkedin/metadata/models/registry/UpstreamLineageTemplateTest.java
@@ -0,0 +1,359 @@
+package com.linkedin.metadata.models.registry;
+
+import static com.fasterxml.jackson.databind.node.JsonNodeFactory.*;
+
+import com.fasterxml.jackson.databind.node.NumericNode;
+import com.github.fge.jackson.jsonpointer.JsonPointer;
+import com.github.fge.jsonpatch.AddOperation;
+import com.github.fge.jsonpatch.JsonPatch;
+import com.github.fge.jsonpatch.JsonPatchOperation;
+import com.github.fge.jsonpatch.RemoveOperation;
+import com.linkedin.common.UrnArray;
+import com.linkedin.common.urn.Urn;
+import com.linkedin.common.urn.UrnUtils;
+import com.linkedin.data.DataMap;
+import com.linkedin.dataset.FineGrainedLineage;
+import com.linkedin.dataset.FineGrainedLineageDownstreamType;
+import com.linkedin.dataset.FineGrainedLineageUpstreamType;
+import com.linkedin.dataset.UpstreamLineage;
+import com.linkedin.metadata.models.registry.template.dataset.UpstreamLineageTemplate;
+import java.util.ArrayList;
+import java.util.List;
+import org.testng.Assert;
+import org.testng.annotations.Test;
+
+public class UpstreamLineageTemplateTest {
+ @Test
+ public void testPatchUpstream() throws Exception {
+ UpstreamLineageTemplate upstreamLineageTemplate = new UpstreamLineageTemplate();
+ UpstreamLineage upstreamLineage = upstreamLineageTemplate.getDefault();
+ List<JsonPatchOperation> patchOperations = new ArrayList<>();
+ NumericNode upstreamConfidenceScore = instance.numberNode(1.0f);
+ JsonPatchOperation operation =
+ new AddOperation(
+ new JsonPointer(
+ "/fineGrainedLineages/CREATE/upstreamType/FIELD_SET/urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_1,PROD),c1)"),
+ upstreamConfidenceScore);
+ patchOperations.add(operation);
+ JsonPatch jsonPatch = new JsonPatch(patchOperations);
+
+ // Initial population test
+ UpstreamLineage result = upstreamLineageTemplate.applyPatch(upstreamLineage, jsonPatch);
+ // Hack because Jackson parses values to doubles instead of floats
+ DataMap dataMap = new DataMap();
+ dataMap.put("confidenceScore", 1.0);
+ FineGrainedLineage fineGrainedLineage = new FineGrainedLineage(dataMap);
+ UrnArray urns = new UrnArray();
+ Urn urn1 =
+ UrnUtils.getUrn(
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_1,PROD),c1)");
+ urns.add(urn1);
+ fineGrainedLineage.setUpstreams(urns);
+ fineGrainedLineage.setTransformOperation("CREATE");
+ fineGrainedLineage.setUpstreamType(FineGrainedLineageUpstreamType.FIELD_SET);
+ fineGrainedLineage.setDownstreamType(FineGrainedLineageDownstreamType.FIELD_SET);
+ Assert.assertEquals(result.getFineGrainedLineages().get(0), fineGrainedLineage);
+
+ // Test non-overwrite upstreams and correct confidence score
+ JsonPatchOperation operation2 =
+ new AddOperation(
+ new JsonPointer(
+ "/fineGrainedLineages/CREATE/upstreamType/FIELD_SET/urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_1,PROD),c2)"),
+ upstreamConfidenceScore);
+ NumericNode upstreamConfidenceScore2 = instance.numberNode(0.1f);
+ JsonPatchOperation operation3 =
+ new AddOperation(
+ new JsonPointer(
+ "/fineGrainedLineages/CREATE/upstreamType/FIELD_SET/urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_1,PROD),c2)"),
+ upstreamConfidenceScore2);
+ List<JsonPatchOperation> patchOperations2 = new ArrayList<>();
+ patchOperations2.add(operation2);
+ patchOperations2.add(operation3);
+ JsonPatch jsonPatch2 = new JsonPatch(patchOperations2);
+ UpstreamLineage result2 = upstreamLineageTemplate.applyPatch(result, jsonPatch2);
+ // Hack because Jackson parses values to doubles instead of floats
+ DataMap dataMap2 = new DataMap();
+ dataMap2.put("confidenceScore", 0.1);
+ FineGrainedLineage fineGrainedLineage2 = new FineGrainedLineage(dataMap2);
+ UrnArray urns2 = new UrnArray();
+ Urn urn2 =
+ UrnUtils.getUrn(
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_1,PROD),c2)");
+ urns2.add(urn1);
+ urns2.add(urn2);
+ fineGrainedLineage2.setUpstreams(urns2);
+ fineGrainedLineage2.setTransformOperation("CREATE");
+ fineGrainedLineage2.setUpstreamType(FineGrainedLineageUpstreamType.FIELD_SET);
+ fineGrainedLineage2.setDownstreamType(FineGrainedLineageDownstreamType.FIELD_SET);
+ Assert.assertEquals(result2.getFineGrainedLineages().get(0), fineGrainedLineage2);
+
+ // Check different upstream types
+ JsonPatchOperation operation4 =
+ new AddOperation(
+ new JsonPointer(
+ "/fineGrainedLineages/CREATE/upstreamType/DATASET/urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_1,PROD)"),
+ upstreamConfidenceScore);
+ List<JsonPatchOperation> patchOperations3 = new ArrayList<>();
+ patchOperations3.add(operation4);
+ JsonPatch jsonPatch3 = new JsonPatch(patchOperations3);
+ UpstreamLineage result3 = upstreamLineageTemplate.applyPatch(result2, jsonPatch3);
+ // Hack because Jackson parses values to doubles instead of floats
+ DataMap dataMap3 = new DataMap();
+ dataMap3.put("confidenceScore", 1.0);
+ FineGrainedLineage fineGrainedLineage3 = new FineGrainedLineage(dataMap3);
+ UrnArray urns3 = new UrnArray();
+ Urn urn3 =
+ UrnUtils.getUrn("urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_1,PROD)");
+ urns3.add(urn3);
+ fineGrainedLineage3.setUpstreams(urns3);
+ fineGrainedLineage3.setTransformOperation("CREATE");
+ fineGrainedLineage3.setUpstreamType(FineGrainedLineageUpstreamType.DATASET);
+ fineGrainedLineage3.setDownstreamType(FineGrainedLineageDownstreamType.FIELD_SET);
+ // Splits into two for different types
+ Assert.assertEquals(result3.getFineGrainedLineages().get(1), fineGrainedLineage3);
+
+ // Check different transform types
+ JsonPatchOperation operation5 =
+ new AddOperation(
+ new JsonPointer(
+ "/fineGrainedLineages/TRANSFORM/upstreamType/DATASET/urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_2,PROD)"),
+ upstreamConfidenceScore);
+ List<JsonPatchOperation> patchOperations4 = new ArrayList<>();
+ patchOperations4.add(operation5);
+ JsonPatch jsonPatch4 = new JsonPatch(patchOperations4);
+ UpstreamLineage result4 = upstreamLineageTemplate.applyPatch(result3, jsonPatch4);
+ // Hack because Jackson parses values to doubles instead of floats
+ DataMap dataMap4 = new DataMap();
+ dataMap4.put("confidenceScore", 1.0);
+ FineGrainedLineage fineGrainedLineage4 = new FineGrainedLineage(dataMap4);
+ UrnArray urns4 = new UrnArray();
+ Urn urn4 =
+ UrnUtils.getUrn("urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_2,PROD)");
+ urns4.add(urn4);
+ fineGrainedLineage4.setUpstreams(urns4);
+ fineGrainedLineage4.setTransformOperation("TRANSFORM");
+ fineGrainedLineage4.setUpstreamType(FineGrainedLineageUpstreamType.DATASET);
+ fineGrainedLineage4.setDownstreamType(FineGrainedLineageDownstreamType.FIELD_SET);
+ // New entry in array because of new transformation type
+ Assert.assertEquals(result4.getFineGrainedLineages().get(2), fineGrainedLineage4);
+
+ // Remove
+ JsonPatchOperation removeOperation =
+ new RemoveOperation(
+ new JsonPointer(
+ "/fineGrainedLineages/CREATE/upstreamType/FIELD_SET/urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_1,PROD),c1)"));
+ JsonPatchOperation removeOperation2 =
+ new RemoveOperation(
+ new JsonPointer(
+ "/fineGrainedLineages/CREATE/upstreamType/FIELD_SET/urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_1,PROD),c2)"));
+ JsonPatchOperation removeOperation3 =
+ new RemoveOperation(
+ new JsonPointer(
+ "/fineGrainedLineages/CREATE/upstreamType/DATASET/urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_1,PROD)"));
+ JsonPatchOperation removeOperation4 =
+ new RemoveOperation(
+ new JsonPointer(
+ "/fineGrainedLineages/TRANSFORM/upstreamType/DATASET/urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_2,PROD)"));
+
+ List<JsonPatchOperation> removeOperations = new ArrayList<>();
+ removeOperations.add(removeOperation);
+ removeOperations.add(removeOperation2);
+ removeOperations.add(removeOperation3);
+ removeOperations.add(removeOperation4);
+ JsonPatch removePatch = new JsonPatch(removeOperations);
+ UpstreamLineage finalResult = upstreamLineageTemplate.applyPatch(result4, removePatch);
+ Assert.assertEquals(upstreamLineageTemplate.getDefault(), finalResult);
+ }
+
+ @Test
+ public void testPatchDownstream() throws Exception {
+ UpstreamLineageTemplate upstreamLineageTemplate = new UpstreamLineageTemplate();
+ UpstreamLineage upstreamLineage = upstreamLineageTemplate.getDefault();
+ List<JsonPatchOperation> patchOperations = new ArrayList<>();
+ NumericNode downstreamConfidenceScore = instance.numberNode(1.0f);
+ JsonPatchOperation operation =
+ new AddOperation(
+ new JsonPointer(
+ "/fineGrainedLineages/CREATE/downstreamType/FIELD_SET/urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_1,PROD),c1)"),
+ downstreamConfidenceScore);
+ patchOperations.add(operation);
+ JsonPatch jsonPatch = new JsonPatch(patchOperations);
+
+ // Initial population test
+ UpstreamLineage result = upstreamLineageTemplate.applyPatch(upstreamLineage, jsonPatch);
+ // Hack because Jackson parses values to doubles instead of floats
+ DataMap dataMap = new DataMap();
+ dataMap.put("confidenceScore", 1.0);
+ FineGrainedLineage fineGrainedLineage = new FineGrainedLineage(dataMap);
+ UrnArray urns = new UrnArray();
+ Urn urn1 =
+ UrnUtils.getUrn(
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_1,PROD),c1)");
+ urns.add(urn1);
+ fineGrainedLineage.setDownstreams(urns);
+ fineGrainedLineage.setTransformOperation("CREATE");
+ fineGrainedLineage.setDownstreamType(FineGrainedLineageDownstreamType.FIELD_SET);
+ fineGrainedLineage.setUpstreamType(FineGrainedLineageUpstreamType.FIELD_SET);
+ Assert.assertEquals(result.getFineGrainedLineages().get(0), fineGrainedLineage);
+
+ // Test non-overwrite downstreams and correct confidence score
+ JsonPatchOperation operation2 =
+ new AddOperation(
+ new JsonPointer(
+ "/fineGrainedLineages/CREATE/downstreamType/FIELD_SET/urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_1,PROD),c2)"),
+ downstreamConfidenceScore);
+ NumericNode downstreamConfidenceScore2 = instance.numberNode(0.1f);
+ JsonPatchOperation operation3 =
+ new AddOperation(
+ new JsonPointer(
+ "/fineGrainedLineages/CREATE/downstreamType/FIELD_SET/urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_1,PROD),c2)"),
+ downstreamConfidenceScore2);
+ List<JsonPatchOperation> patchOperations2 = new ArrayList<>();
+ patchOperations2.add(operation2);
+ patchOperations2.add(operation3);
+ JsonPatch jsonPatch2 = new JsonPatch(patchOperations2);
+ UpstreamLineage result2 = upstreamLineageTemplate.applyPatch(result, jsonPatch2);
+ // Hack because Jackson parses values to doubles instead of floats
+ DataMap dataMap2 = new DataMap();
+ dataMap2.put("confidenceScore", 0.1);
+ FineGrainedLineage fineGrainedLineage2 = new FineGrainedLineage(dataMap2);
+ UrnArray urns2 = new UrnArray();
+ Urn urn2 =
+ UrnUtils.getUrn(
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_1,PROD),c2)");
+ urns2.add(urn1);
+ urns2.add(urn2);
+ fineGrainedLineage2.setDownstreams(urns2);
+ fineGrainedLineage2.setTransformOperation("CREATE");
+ fineGrainedLineage2.setDownstreamType(FineGrainedLineageDownstreamType.FIELD_SET);
+ fineGrainedLineage2.setUpstreamType(FineGrainedLineageUpstreamType.FIELD_SET);
+ Assert.assertEquals(result2.getFineGrainedLineages().get(0), fineGrainedLineage2);
+
+ // Check different downstream types
+ JsonPatchOperation operation4 =
+ new AddOperation(
+ new JsonPointer(
+ "/fineGrainedLineages/CREATE/downstreamType/FIELD/urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_1,PROD)"),
+ downstreamConfidenceScore);
+ List<JsonPatchOperation> patchOperations3 = new ArrayList<>();
+ patchOperations3.add(operation4);
+ JsonPatch jsonPatch3 = new JsonPatch(patchOperations3);
+ UpstreamLineage result3 = upstreamLineageTemplate.applyPatch(result2, jsonPatch3);
+ // Hack because Jackson parses values to doubles instead of floats
+ DataMap dataMap3 = new DataMap();
+ dataMap3.put("confidenceScore", 1.0);
+ FineGrainedLineage fineGrainedLineage3 = new FineGrainedLineage(dataMap3);
+ UrnArray urns3 = new UrnArray();
+ Urn urn3 =
+ UrnUtils.getUrn("urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_1,PROD)");
+ urns3.add(urn3);
+ fineGrainedLineage3.setDownstreams(urns3);
+ fineGrainedLineage3.setTransformOperation("CREATE");
+ fineGrainedLineage3.setDownstreamType(FineGrainedLineageDownstreamType.FIELD);
+ fineGrainedLineage3.setUpstreamType(FineGrainedLineageUpstreamType.FIELD_SET);
+ // Splits into two for different types
+ Assert.assertEquals(result3.getFineGrainedLineages().get(1), fineGrainedLineage3);
+
+ // Check different transform types
+ JsonPatchOperation operation5 =
+ new AddOperation(
+ new JsonPointer(
+ "/fineGrainedLineages/TRANSFORM/downstreamType/FIELD/urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_2,PROD)"),
+ downstreamConfidenceScore);
+ List<JsonPatchOperation> patchOperations4 = new ArrayList<>();
+ patchOperations4.add(operation5);
+ JsonPatch jsonPatch4 = new JsonPatch(patchOperations4);
+ UpstreamLineage result4 = upstreamLineageTemplate.applyPatch(result3, jsonPatch4);
+ // Hack because Jackson parses values to doubles instead of floats
+ DataMap dataMap4 = new DataMap();
+ dataMap4.put("confidenceScore", 1.0);
+ FineGrainedLineage fineGrainedLineage4 = new FineGrainedLineage(dataMap4);
+ UrnArray urns4 = new UrnArray();
+ Urn urn4 =
+ UrnUtils.getUrn("urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_2,PROD)");
+ urns4.add(urn4);
+ fineGrainedLineage4.setDownstreams(urns4);
+ fineGrainedLineage4.setTransformOperation("TRANSFORM");
+ fineGrainedLineage4.setDownstreamType(FineGrainedLineageDownstreamType.FIELD);
+ fineGrainedLineage4.setUpstreamType(FineGrainedLineageUpstreamType.FIELD_SET);
+ // New entry in array because of new transformation type
+ Assert.assertEquals(result4.getFineGrainedLineages().get(2), fineGrainedLineage4);
+
+ // Remove
+ JsonPatchOperation removeOperation =
+ new RemoveOperation(
+ new JsonPointer(
+ "/fineGrainedLineages/CREATE/downstreamType/FIELD_SET/urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_1,PROD),c1)"));
+ JsonPatchOperation removeOperation2 =
+ new RemoveOperation(
+ new JsonPointer(
+ "/fineGrainedLineages/CREATE/downstreamType/FIELD_SET/urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_1,PROD),c2)"));
+ JsonPatchOperation removeOperation3 =
+ new RemoveOperation(
+ new JsonPointer(
+ "/fineGrainedLineages/CREATE/downstreamType/FIELD/urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_1,PROD)"));
+ JsonPatchOperation removeOperation4 =
+ new RemoveOperation(
+ new JsonPointer(
+ "/fineGrainedLineages/TRANSFORM/downstreamType/FIELD/urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_2,PROD)"));
+
+ List<JsonPatchOperation> removeOperations = new ArrayList<>();
+ removeOperations.add(removeOperation);
+ removeOperations.add(removeOperation2);
+ removeOperations.add(removeOperation3);
+ removeOperations.add(removeOperation4);
+ JsonPatch removePatch = new JsonPatch(removeOperations);
+ UpstreamLineage finalResult = upstreamLineageTemplate.applyPatch(result4, removePatch);
+ Assert.assertEquals(upstreamLineageTemplate.getDefault(), finalResult);
+ }
+
+ @Test
+ public void testUpAndDown() throws Exception {
+ UpstreamLineageTemplate upstreamLineageTemplate = new UpstreamLineageTemplate();
+ UpstreamLineage upstreamLineage = upstreamLineageTemplate.getDefault();
+ List<JsonPatchOperation> patchOperations = new ArrayList<>();
+ NumericNode downstreamConfidenceScore = instance.numberNode(1.0f);
+ JsonPatchOperation operation =
+ new AddOperation(
+ new JsonPointer(
+ "/fineGrainedLineages/CREATE/downstreamType/FIELD_SET/urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_1,PROD),c1)"),
+ downstreamConfidenceScore);
+ patchOperations.add(operation);
+ NumericNode upstreamConfidenceScore = instance.numberNode(1.0f);
+ JsonPatchOperation operation2 =
+ new AddOperation(
+ new JsonPointer(
+ "/fineGrainedLineages/CREATE/upstreamType/FIELD_SET/urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_1,PROD),c1)"),
+ upstreamConfidenceScore);
+ patchOperations.add(operation2);
+ JsonPatch jsonPatch = new JsonPatch(patchOperations);
+
+ // Initial population test
+ UpstreamLineage result = upstreamLineageTemplate.applyPatch(upstreamLineage, jsonPatch);
+ // Hack because Jackson parses values to doubles instead of floats
+ DataMap dataMap = new DataMap();
+ dataMap.put("confidenceScore", 1.0);
+ FineGrainedLineage fineGrainedLineage = new FineGrainedLineage(dataMap);
+ UrnArray urns = new UrnArray();
+ Urn urn1 =
+ UrnUtils.getUrn(
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_1,PROD),c1)");
+ urns.add(urn1);
+ fineGrainedLineage.setTransformOperation("CREATE");
+ fineGrainedLineage.setUpstreams(urns);
+ fineGrainedLineage.setUpstreamType(FineGrainedLineageUpstreamType.FIELD_SET);
+ fineGrainedLineage.setDownstreamType(FineGrainedLineageDownstreamType.FIELD_SET);
+ fineGrainedLineage.setDownstreams(urns);
+
+ // Hack because Jackson parses values to doubles instead of floats
+ DataMap dataMap2 = new DataMap();
+ dataMap2.put("confidenceScore", 1.0);
+ FineGrainedLineage fineGrainedLineage2 = new FineGrainedLineage(dataMap2);
+ fineGrainedLineage2.setTransformOperation("CREATE");
+ fineGrainedLineage2.setUpstreamType(FineGrainedLineageUpstreamType.FIELD_SET);
+ fineGrainedLineage2.setDownstreamType(FineGrainedLineageDownstreamType.FIELD_SET);
+ fineGrainedLineage2.setDownstreams(urns);
+
+ Assert.assertEquals(result.getFineGrainedLineages().get(1), fineGrainedLineage2);
+ }
+}
diff --git a/li-utils/src/main/java/com/linkedin/metadata/Constants.java b/li-utils/src/main/java/com/linkedin/metadata/Constants.java
index f5a3c9c12ff70e..3d9b533dc8f720 100644
--- a/li-utils/src/main/java/com/linkedin/metadata/Constants.java
+++ b/li-utils/src/main/java/com/linkedin/metadata/Constants.java
@@ -125,6 +125,11 @@ public class Constants {
public static final String VIEW_PROPERTIES_ASPECT_NAME = "viewProperties";
public static final String DATASET_PROFILE_ASPECT_NAME = "datasetProfile";
+ // Aspect support
+ public static final String FINE_GRAINED_LINEAGE_DATASET_TYPE = "DATASET";
+ public static final String FINE_GRAINED_LINEAGE_FIELD_SET_TYPE = "FIELD_SET";
+ public static final String FINE_GRAINED_LINEAGE_FIELD_TYPE = "FIELD";
+
// Chart
public static final String CHART_KEY_ASPECT_NAME = "chartKey";
public static final String CHART_INFO_ASPECT_NAME = "chartInfo";
diff --git a/metadata-ingestion/src/datahub/specific/dataset.py b/metadata-ingestion/src/datahub/specific/dataset.py
index fcfe049fb15cf9..294a80572669b8 100644
--- a/metadata-ingestion/src/datahub/specific/dataset.py
+++ b/metadata-ingestion/src/datahub/specific/dataset.py
@@ -1,4 +1,4 @@
-from typing import Dict, Generic, List, Optional, TypeVar, Union
+from typing import Dict, Generic, List, Optional, Tuple, TypeVar, Union
from urllib.parse import quote
from datahub.emitter.mcp_patch_builder import MetadataPatchProposal
@@ -6,6 +6,9 @@
DatasetPropertiesClass as DatasetProperties,
EditableDatasetPropertiesClass as EditableDatasetProperties,
EditableSchemaMetadataClass as EditableSchemaMetadata,
+ FineGrainedLineageClass as FineGrainedLineage,
+ FineGrainedLineageDownstreamTypeClass as FineGrainedLineageDownstreamType,
+ FineGrainedLineageUpstreamTypeClass as FineGrainedLineageUpstreamType,
GlobalTagsClass as GlobalTags,
GlossaryTermAssociationClass as Term,
GlossaryTermsClass as GlossaryTerms,
@@ -144,6 +147,108 @@ def set_upstream_lineages(self, upstreams: List[Upstream]) -> "DatasetPatchBuild
)
return self
+ def add_fine_grained_upstream_lineage(
+ self, fine_grained_lineage: FineGrainedLineage
+ ) -> "DatasetPatchBuilder":
+ (
+ transform_op,
+ upstream_type,
+ downstream_type,
+ ) = DatasetPatchBuilder.get_fine_grained_key(fine_grained_lineage)
+ for upstream_urn in fine_grained_lineage.upstreams or []:
+ self._add_patch(
+ UpstreamLineage.ASPECT_NAME,
+ "add",
+ path=DatasetPatchBuilder.quote_fine_grained_upstream_path(
+ transform_op, upstream_type, upstream_urn
+ ),
+ value=fine_grained_lineage.confidenceScore,
+ )
+ for downstream_urn in fine_grained_lineage.downstreams or []:
+ self._add_patch(
+ UpstreamLineage.ASPECT_NAME,
+ "add",
+ path=DatasetPatchBuilder.quote_fine_grained_downstream_path(
+ transform_op, downstream_type, downstream_urn
+ ),
+ value=fine_grained_lineage.confidenceScore,
+ )
+ return self
+
+ @staticmethod
+ def get_fine_grained_key(
+ fine_grained_lineage: FineGrainedLineage,
+ ) -> Tuple[str, str, str]:
+ transform_op = fine_grained_lineage.transformOperation or "NONE"
+ upstream_type = (
+ fine_grained_lineage.upstreamType
+ if isinstance(fine_grained_lineage.upstreamType, str)
+ else FineGrainedLineageUpstreamType.FIELD_SET
+ )
+ downstream_type = (
+ fine_grained_lineage.downstreamType
+ if isinstance(fine_grained_lineage.downstreamType, str)
+ else FineGrainedLineageDownstreamType.FIELD_SET
+ )
+ return transform_op, upstream_type, downstream_type
+
+ @staticmethod
+ def quote_fine_grained_downstream_path(
+ transform_op: str, downstream_type: str, downstream_urn: str
+ ) -> str:
+ return (
+ f"/fineGrainedLineages/{quote(transform_op, safe='')}/downstreamType/"
+ f"{quote(downstream_type, safe='')}/{quote(downstream_urn, safe='')}"
+ )
+
+ @staticmethod
+ def quote_fine_grained_upstream_path(
+ transform_op: str, upstream_type: str, upstream_urn: str
+ ) -> str:
+ return (
+ f"/fineGrainedLineages/{quote(transform_op, safe='')}/upstreamType/"
+ f"{quote(upstream_type, safe='')}/{quote(upstream_urn, safe='')}"
+ )
+
+ def remove_fine_grained_upstream_lineage(
+ self, fine_grained_lineage: FineGrainedLineage
+ ) -> "DatasetPatchBuilder":
+ (
+ transform_op,
+ upstream_type,
+ downstream_type,
+ ) = DatasetPatchBuilder.get_fine_grained_key(fine_grained_lineage)
+ for upstream_urn in fine_grained_lineage.upstreams or []:
+ self._add_patch(
+ UpstreamLineage.ASPECT_NAME,
+ "remove",
+ path=DatasetPatchBuilder.quote_fine_grained_upstream_path(
+ transform_op, upstream_type, upstream_urn
+ ),
+ value={},
+ )
+ for downstream_urn in fine_grained_lineage.downstreams or []:
+ self._add_patch(
+ UpstreamLineage.ASPECT_NAME,
+ "remove",
+ path=DatasetPatchBuilder.quote_fine_grained_downstream_path(
+ transform_op, downstream_type, downstream_urn
+ ),
+ value={},
+ )
+ return self
+
+ def set_fine_grained_upstream_lineages(
+ self, fine_grained_lineages: List[FineGrainedLineage]
+ ) -> "DatasetPatchBuilder":
+ self._add_patch(
+ UpstreamLineage.ASPECT_NAME,
+ "add",
+ path="/fineGrainedLineages",
+ value=fine_grained_lineages,
+ )
+ return self
+
def add_tag(self, tag: Tag) -> "DatasetPatchBuilder":
self._add_patch(
GlobalTags.ASPECT_NAME, "add", path=f"/tags/{tag.tag}", value=tag
diff --git a/metadata-ingestion/tests/unit/patch/complex_dataset_patch.json b/metadata-ingestion/tests/unit/patch/complex_dataset_patch.json
index d5dfe125942fba..ed5a7723ac2bf1 100644
--- a/metadata-ingestion/tests/unit/patch/complex_dataset_patch.json
+++ b/metadata-ingestion/tests/unit/patch/complex_dataset_patch.json
@@ -42,26 +42,31 @@
}
},
{
- "entityType": "dataset",
- "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:hive,fct_users_created,PROD)",
- "changeType": "PATCH",
- "aspectName": "upstreamLineage",
- "aspect": {
- "json": [
- {
- "op": "add",
- "path": "/upstreams/urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Ahive%2Cfct_users_created_upstream%2CPROD%29",
- "value": {
- "auditStamp": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
- },
- "dataset": "urn:li:dataset:(urn:li:dataPlatform:hive,fct_users_created_upstream,PROD)",
- "type": "TRANSFORMED"
- }
- }
- ]
- }
+ "entityType": "dataset",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:hive,fct_users_created,PROD)",
+ "changeType": "PATCH",
+ "aspectName": "upstreamLineage",
+ "aspect": {
+ "json": [
+ {
+ "op": "add",
+ "path": "/upstreams/urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Ahive%2Cfct_users_created_upstream%2CPROD%29",
+ "value": {
+ "auditStamp": {
+ "time": 0,
+ "actor": "urn:li:corpuser:unknown"
+ },
+ "dataset": "urn:li:dataset:(urn:li:dataPlatform:hive,fct_users_created_upstream,PROD)",
+ "type": "TRANSFORMED"
+ }
+ },
+ {
+ "op": "add",
+ "path": "/fineGrainedLineages/TRANSFORM/upstreamType/DATASET/urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Ahive%2Cfct_users_created_upstream%2CPROD%29",
+ "value": 1.0
+ }
+ ]
+ }
},
{
"entityType": "dataset",
diff --git a/metadata-ingestion/tests/unit/patch/test_patch_builder.py b/metadata-ingestion/tests/unit/patch/test_patch_builder.py
index 0701b3d6968959..f05c4978f8644e 100644
--- a/metadata-ingestion/tests/unit/patch/test_patch_builder.py
+++ b/metadata-ingestion/tests/unit/patch/test_patch_builder.py
@@ -7,6 +7,9 @@
from datahub.ingestion.sink.file import write_metadata_file
from datahub.metadata.schema_classes import (
DatasetLineageTypeClass,
+ FineGrainedLineageClass,
+ FineGrainedLineageDownstreamTypeClass,
+ FineGrainedLineageUpstreamTypeClass,
GenericAspectClass,
MetadataChangeProposalClass,
TagAssociationClass,
@@ -53,6 +56,19 @@ def test_complex_dataset_patch(
type=DatasetLineageTypeClass.TRANSFORMED,
)
)
+ .add_fine_grained_upstream_lineage(
+ fine_grained_lineage=FineGrainedLineageClass(
+ upstreamType=FineGrainedLineageUpstreamTypeClass.DATASET,
+ upstreams=[
+ make_dataset_urn(
+ platform="hive", name="fct_users_created_upstream", env="PROD"
+ )
+ ],
+ downstreamType=FineGrainedLineageDownstreamTypeClass.FIELD_SET,
+ transformOperation="TRANSFORM",
+ confidenceScore=1.0,
+ )
+ )
)
patcher.for_field("field1").add_tag(TagAssociationClass(tag=make_tag_urn("tag1")))
diff --git a/metadata-integration/java/datahub-client/src/main/java/datahub/client/patch/dataset/UpstreamLineagePatchBuilder.java b/metadata-integration/java/datahub-client/src/main/java/datahub/client/patch/dataset/UpstreamLineagePatchBuilder.java
index 6ded8a25b4e22c..9db2ebc522e093 100644
--- a/metadata-integration/java/datahub-client/src/main/java/datahub/client/patch/dataset/UpstreamLineagePatchBuilder.java
+++ b/metadata-integration/java/datahub-client/src/main/java/datahub/client/patch/dataset/UpstreamLineagePatchBuilder.java
@@ -5,10 +5,14 @@
import com.fasterxml.jackson.databind.node.ObjectNode;
import com.linkedin.common.urn.DatasetUrn;
+import com.linkedin.common.urn.Urn;
import com.linkedin.dataset.DatasetLineageType;
+import com.linkedin.dataset.FineGrainedLineageDownstreamType;
+import com.linkedin.dataset.FineGrainedLineageUpstreamType;
import datahub.client.patch.AbstractMultiFieldPatchBuilder;
import datahub.client.patch.PatchOperationType;
import javax.annotation.Nonnull;
+import javax.annotation.Nullable;
import lombok.ToString;
import org.apache.commons.lang3.tuple.ImmutableTriple;
@@ -16,7 +20,8 @@
public class UpstreamLineagePatchBuilder
extends AbstractMultiFieldPatchBuilder<UpstreamLineagePatchBuilder> {
- private static final String PATH_START = "/upstreams/";
+ private static final String UPSTREAMS_PATH_START = "/upstreams/";
+ private static final String FINE_GRAINED_PATH_START = "/fineGrainedLineages/";
private static final String DATASET_KEY = "dataset";
private static final String AUDIT_STAMP_KEY = "auditStamp";
private static final String TIME_KEY = "time";
@@ -34,13 +39,233 @@ public UpstreamLineagePatchBuilder addUpstream(
.set(AUDIT_STAMP_KEY, auditStamp);
pathValues.add(
- ImmutableTriple.of(PatchOperationType.ADD.getValue(), PATH_START + datasetUrn, value));
+ ImmutableTriple.of(
+ PatchOperationType.ADD.getValue(), UPSTREAMS_PATH_START + datasetUrn, value));
return this;
}
public UpstreamLineagePatchBuilder removeUpstream(@Nonnull DatasetUrn datasetUrn) {
pathValues.add(
- ImmutableTriple.of(PatchOperationType.REMOVE.getValue(), PATH_START + datasetUrn, null));
+ ImmutableTriple.of(
+ PatchOperationType.REMOVE.getValue(), UPSTREAMS_PATH_START + datasetUrn, null));
+ return this;
+ }
+
+ /**
+ * Method for adding an upstream FineGrained Dataset
+ *
+ * @param datasetUrn dataset to be set as upstream
+ * @param confidenceScore optional, confidence score for the lineage edge. Defaults to 1.0 for
+ * full confidence
+ * @param transformationOperation string operation type that describes the transformation
+ * operation happening in the lineage edge
+ * @return this builder
+ */
+ public UpstreamLineagePatchBuilder addFineGrainedUpstreamDataset(
+ @Nonnull DatasetUrn datasetUrn,
+ @Nullable Float confidenceScore,
+ @Nonnull String transformationOperation) {
+ Float finalConfidenceScore = getConfidenceScoreOrDefault(confidenceScore);
+
+ pathValues.add(
+ ImmutableTriple.of(
+ PatchOperationType.ADD.getValue(),
+ FINE_GRAINED_PATH_START
+ + transformationOperation
+ + "/"
+ + "upstreamType"
+ + "/"
+ + "DATASET"
+ + "/"
+ + datasetUrn,
+ instance.numberNode(finalConfidenceScore)));
+ return this;
+ }
+
+ /**
+ * Adds a field as a fine grained upstream
+ *
+ * @param schemaFieldUrn a schema field to be marked as upstream, format:
+ * urn:li:schemaField(DATASET_URN, COLUMN NAME)
+ * @param confidenceScore optional, confidence score for the lineage edge. Defaults to 1.0 for
+ * full confidence
+ * @param transformationOperation string operation type that describes the transformation
+ * operation happening in the lineage edge
+ * @param type the upstream lineage type, either Field or Field Set
+ * @return this builder
+ */
+ public UpstreamLineagePatchBuilder addFineGrainedUpstreamField(
+ @Nonnull Urn schemaFieldUrn,
+ @Nullable Float confidenceScore,
+ @Nonnull String transformationOperation,
+ @Nullable FineGrainedLineageUpstreamType type) {
+ Float finalConfidenceScore = getConfidenceScoreOrDefault(confidenceScore);
+ String finalType;
+ if (type == null) {
+ // Default to set of fields if not explicitly a single field
+ finalType = FineGrainedLineageUpstreamType.FIELD_SET.toString();
+ } else {
+ finalType = type.toString();
+ }
+
+ pathValues.add(
+ ImmutableTriple.of(
+ PatchOperationType.ADD.getValue(),
+ FINE_GRAINED_PATH_START
+ + transformationOperation
+ + "/"
+ + "upstreamType"
+ + "/"
+ + finalType
+ + "/"
+ + schemaFieldUrn,
+ instance.numberNode(finalConfidenceScore)));
+
+ return this;
+ }
+
+ /**
+ * Adds a field as a fine grained downstream
+ *
+ * @param schemaFieldUrn a schema field to be marked as downstream, format:
+ * urn:li:schemaField(DATASET_URN, COLUMN NAME)
+ * @param confidenceScore optional, confidence score for the lineage edge. Defaults to 1.0 for
+ * full confidence
+ * @param transformationOperation string operation type that describes the transformation
+ * operation happening in the lineage edge
+ * @param type the downstream lineage type, either Field or Field Set
+ * @return this builder
+ */
+ public UpstreamLineagePatchBuilder addFineGrainedDownstreamField(
+ @Nonnull Urn schemaFieldUrn,
+ @Nullable Float confidenceScore,
+ @Nonnull String transformationOperation,
+ @Nullable FineGrainedLineageDownstreamType type) {
+ Float finalConfidenceScore = getConfidenceScoreOrDefault(confidenceScore);
+ String finalType;
+ if (type == null) {
+ // Default to set of fields if not explicitly a single field
+ finalType = FineGrainedLineageDownstreamType.FIELD_SET.toString();
+ } else {
+ finalType = type.toString();
+ }
+
+ pathValues.add(
+ ImmutableTriple.of(
+ PatchOperationType.ADD.getValue(),
+ FINE_GRAINED_PATH_START
+ + transformationOperation
+ + "/"
+ + "downstreamType"
+ + "/"
+ + finalType
+ + "/"
+ + schemaFieldUrn,
+ instance.numberNode(finalConfidenceScore)));
+ return this;
+ }
+
+ private Float getConfidenceScoreOrDefault(@Nullable Float confidenceScore) {
+ float finalConfidenceScore;
+ if (confidenceScore != null && confidenceScore > 0 && confidenceScore <= 1.0f) {
+ finalConfidenceScore = confidenceScore;
+ } else {
+ finalConfidenceScore = 1.0f;
+ }
+
+ return finalConfidenceScore;
+ }
+
+ /**
+ * Removes a field as a fine grained upstream
+ *
+ * @param schemaFieldUrn a schema field to be marked as upstream, format:
+ * urn:li:schemaField(DATASET_URN, COLUMN NAME)
+ * @param transformationOperation string operation type that describes the transformation
+ * operation happening in the lineage edge
+ * @param type the upstream lineage type, either Field or Field Set
+ * @return this builder
+ */
+ public UpstreamLineagePatchBuilder removeFineGrainedUpstreamField(
+ @Nonnull Urn schemaFieldUrn,
+ @Nonnull String transformationOperation,
+ @Nullable FineGrainedLineageUpstreamType type) {
+ String finalType;
+ if (type == null) {
+ // Default to set of fields if not explicitly a single field
+ finalType = FineGrainedLineageUpstreamType.FIELD_SET.toString();
+ } else {
+ finalType = type.toString();
+ }
+
+ pathValues.add(
+ ImmutableTriple.of(
+ PatchOperationType.REMOVE.getValue(),
+ FINE_GRAINED_PATH_START
+ + transformationOperation
+ + "/"
+ + "upstreamType"
+ + "/"
+ + finalType
+ + "/"
+ + schemaFieldUrn,
+ null));
+
+ return this;
+ }
+
+ public UpstreamLineagePatchBuilder removeFineGrainedUpstreamDataset(
+ @Nonnull DatasetUrn datasetUrn, @Nonnull String transformationOperation) {
+
+ pathValues.add(
+ ImmutableTriple.of(
+ PatchOperationType.REMOVE.getValue(),
+ FINE_GRAINED_PATH_START
+ + transformationOperation
+ + "/"
+ + "upstreamType"
+ + "/"
+ + "DATASET"
+ + "/"
+ + datasetUrn,
+ null));
+ return this;
+ }
+
+ /**
+ * Adds a field as a fine grained downstream
+ *
+ * @param schemaFieldUrn a schema field to be marked as downstream, format:
+ * urn:li:schemaField(DATASET_URN, COLUMN NAME)
+ * @param transformationOperation string operation type that describes the transformation
+ * operation happening in the lineage edge
+ * @param type the downstream lineage type, either Field or Field Set
+ * @return this builder
+ */
+ public UpstreamLineagePatchBuilder removeFineGrainedDownstreamField(
+ @Nonnull Urn schemaFieldUrn,
+ @Nonnull String transformationOperation,
+ @Nullable FineGrainedLineageDownstreamType type) {
+ String finalType;
+ if (type == null) {
+ // Default to set of fields if not explicitly a single field
+ finalType = FineGrainedLineageDownstreamType.FIELD_SET.toString();
+ } else {
+ finalType = type.toString();
+ }
+
+ pathValues.add(
+ ImmutableTriple.of(
+ PatchOperationType.REMOVE.getValue(),
+ FINE_GRAINED_PATH_START
+ + transformationOperation
+ + "/"
+ + "downstreamType"
+ + "/"
+ + finalType
+ + "/"
+ + schemaFieldUrn,
+ null));
return this;
}
diff --git a/metadata-integration/java/datahub-client/src/test/java/datahub/client/patch/PatchTest.java b/metadata-integration/java/datahub-client/src/test/java/datahub/client/patch/PatchTest.java
index 1d387acb0ce12b..563742990f5468 100644
--- a/metadata-integration/java/datahub-client/src/test/java/datahub/client/patch/PatchTest.java
+++ b/metadata-integration/java/datahub-client/src/test/java/datahub/client/patch/PatchTest.java
@@ -14,6 +14,7 @@
import com.linkedin.common.urn.DatasetUrn;
import com.linkedin.common.urn.GlossaryTermUrn;
import com.linkedin.common.urn.TagUrn;
+import com.linkedin.common.urn.Urn;
import com.linkedin.common.urn.UrnUtils;
import com.linkedin.dataset.DatasetLineageType;
import com.linkedin.metadata.graph.LineageDirection;
@@ -49,15 +50,21 @@ public class PatchTest {
public void testLocalUpstream() {
RestEmitter restEmitter = new RestEmitter(RestEmitterConfig.builder().build());
try {
+ DatasetUrn upstreamUrn =
+ DatasetUrn.createFromString(
+ "urn:li:dataset:(urn:li:dataPlatform:kafka,SampleKafkaDataset,PROD)");
+ Urn schemaFieldUrn =
+ UrnUtils.getUrn(
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:kafka,SampleKafkaDataset,PROD), foo)");
MetadataChangeProposal upstreamPatch =
new UpstreamLineagePatchBuilder()
.urn(
UrnUtils.getUrn(
"urn:li:dataset:(urn:li:dataPlatform:hive,SampleHiveDataset,PROD)"))
- .addUpstream(
- DatasetUrn.createFromString(
- "urn:li:dataset:(urn:li:dataPlatform:kafka,SampleKafkaDataset,PROD)"),
- DatasetLineageType.TRANSFORMED)
+ .addUpstream(upstreamUrn, DatasetLineageType.TRANSFORMED)
+ .addFineGrainedUpstreamDataset(upstreamUrn, null, "TRANSFORM")
+ .addFineGrainedUpstreamField(schemaFieldUrn, null, "TRANSFORM", null)
+ .addFineGrainedDownstreamField(schemaFieldUrn, null, "TRANSFORM", null)
.build();
Future<MetadataWriteResponse> response = restEmitter.emit(upstreamPatch);
@@ -73,6 +80,12 @@ public void testLocalUpstream() {
public void testLocalUpstreamRemove() {
RestEmitter restEmitter = new RestEmitter(RestEmitterConfig.builder().build());
try {
+ DatasetUrn upstreamUrn =
+ DatasetUrn.createFromString(
+ "urn:li:dataset:(urn:li:dataPlatform:kafka,SampleKafkaDataset,PROD)");
+ Urn schemaFieldUrn =
+ UrnUtils.getUrn(
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:kafka,SampleKafkaDataset,PROD), foo)");
MetadataChangeProposal upstreamPatch =
new UpstreamLineagePatchBuilder()
.urn(
@@ -81,6 +94,9 @@ public void testLocalUpstreamRemove() {
.removeUpstream(
DatasetUrn.createFromString(
"urn:li:dataset:(urn:li:dataPlatform:kafka,SampleKafkaDataset,PROD)"))
+ .removeFineGrainedUpstreamDataset(upstreamUrn, "TRANSFORM")
+ .removeFineGrainedUpstreamField(schemaFieldUrn, "TRANSFORM", null)
+ .removeFineGrainedDownstreamField(schemaFieldUrn, "TRANSFORM", null)
.build();
Future<MetadataWriteResponse> response = restEmitter.emit(upstreamPatch);
From 79ccbc57d1c3266025c8e52ce18fbfcff550c387 Mon Sep 17 00:00:00 2001
From: RyanHolstien
Date: Mon, 11 Dec 2023 14:41:23 -0800
Subject: [PATCH 005/540] fix(CVE-2023-6378): update logback classic (#9438)
---
build.gradle | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/build.gradle b/build.gradle
index f5e5403e822e77..b16e3ca169c717 100644
--- a/build.gradle
+++ b/build.gradle
@@ -16,7 +16,7 @@ buildscript {
ext.playVersion = '2.8.18'
ext.log4jVersion = '2.19.0'
ext.slf4jVersion = '1.7.36'
- ext.logbackClassic = '1.2.12'
+ ext.logbackClassic = '1.2.13'
ext.hadoop3Version = '3.3.5'
ext.kafkaVersion = '2.3.0'
ext.hazelcastVersion = '5.3.6'
From ee4e8dd74c569d0dfc98e8eb13034c91b0ad61a8 Mon Sep 17 00:00:00 2001
From: Salman-Apptware <101426513+Salman-Apptware@users.noreply.github.com>
Date: Tue, 12 Dec 2023 15:03:30 +0530
Subject: [PATCH 006/540] feat: allow the sidebar size to be draggable (#9401)
---
.../src/app/search/SearchResults.tsx | 2 +-
.../src/app/search/sidebar/BrowseSidebar.tsx | 51 ++++++++++++-------
.../src/app/search/sidebar/EntityNode.tsx | 3 +-
.../cypress/cypress/e2e/browse/browseV2.js | 10 ++--
4 files changed, 41 insertions(+), 25 deletions(-)
diff --git a/datahub-web-react/src/app/search/SearchResults.tsx b/datahub-web-react/src/app/search/SearchResults.tsx
index 56e83e42350270..d7ad6d517d8fed 100644
--- a/datahub-web-react/src/app/search/SearchResults.tsx
+++ b/datahub-web-react/src/app/search/SearchResults.tsx
@@ -197,7 +197,7 @@ export const SearchResults = ({
{showBrowseV2 && (
-
+
)}
diff --git a/datahub-web-react/src/app/search/sidebar/BrowseSidebar.tsx b/datahub-web-react/src/app/search/sidebar/BrowseSidebar.tsx
index 822e75b65febc3..c16bcdcaf6c727 100644
--- a/datahub-web-react/src/app/search/sidebar/BrowseSidebar.tsx
+++ b/datahub-web-react/src/app/search/sidebar/BrowseSidebar.tsx
@@ -1,4 +1,4 @@
-import React from 'react';
+import React, { useState } from 'react';
import styled from 'styled-components';
import { Typography } from 'antd';
import EntityNode from './EntityNode';
@@ -7,10 +7,16 @@ import SidebarLoadingError from './SidebarLoadingError';
import { SEARCH_RESULTS_BROWSE_SIDEBAR_ID } from '../../onboarding/config/SearchOnboardingConfig';
import useSidebarEntities from './useSidebarEntities';
import { ANTD_GRAY_V2 } from '../../entity/shared/constants';
+import { ProfileSidebarResizer } from '../../entity/shared/containers/profile/sidebar/ProfileSidebarResizer';
-const Sidebar = styled.div<{ visible: boolean; width: number }>`
+
+export const MAX_BROWSER_WIDTH = 500;
+export const MIN_BROWSWER_WIDTH = 200;
+
+export const SidebarWrapper = styled.div<{ visible: boolean; width: number }>`
height: 100%;
width: ${(props) => (props.visible ? `${props.width}px` : '0')};
+ min-width: ${(props) => (props.visible ? `${props.width}px` : '0')};
transition: width 250ms ease-in-out;
border-right: 1px solid ${(props) => props.theme.styles['border-color-base']};
background-color: ${ANTD_GRAY_V2[1]};
@@ -37,29 +43,38 @@ const SidebarBody = styled.div<{ visible: boolean }>`
type Props = {
visible: boolean;
- width: number;
};
-const BrowseSidebar = ({ visible, width }: Props) => {
+const BrowseSidebar = ({ visible }: Props) => {
const { error, entityAggregations, retry } = useSidebarEntities({
skip: !visible,
});
+ const [browserWidth, setBrowserWith] = useState(window.innerWidth * 0.2);
return (
-
+ <>
+
+
+ setBrowserWith(Math.min(Math.max(widthProp, MIN_BROWSWER_WIDTH), MAX_BROWSER_WIDTH))
+ }
+ initialSize={browserWidth}
+ isSidebarOnLeft
+ />
+ >
);
};
diff --git a/datahub-web-react/src/app/search/sidebar/EntityNode.tsx b/datahub-web-react/src/app/search/sidebar/EntityNode.tsx
index e04e4253dca134..627d19c4fb10c1 100644
--- a/datahub-web-react/src/app/search/sidebar/EntityNode.tsx
+++ b/datahub-web-react/src/app/search/sidebar/EntityNode.tsx
@@ -38,7 +38,8 @@ const EntityNode = () => {
onToggle: (isNowOpen: boolean) => trackToggleNodeEvent(isNowOpen, 'entity'),
});
- const onClickHeader = () => {
+ const onClickHeader = (e) => {
+ e.preventDefault();
if (count) toggle();
};
diff --git a/smoke-test/tests/cypress/cypress/e2e/browse/browseV2.js b/smoke-test/tests/cypress/cypress/e2e/browse/browseV2.js
index a61b9030b13c6f..f45edc5fa04819 100644
--- a/smoke-test/tests/cypress/cypress/e2e/browse/browseV2.js
+++ b/smoke-test/tests/cypress/cypress/e2e/browse/browseV2.js
@@ -46,31 +46,31 @@ describe("search", () => {
cy.get("[data-testid=browse-v2")
.invoke("css", "width")
- .should("match", /^\d\d\dpx$/);
+ .should("match", /\d\d\dpx$/);
cy.get("[data-testid=browse-v2-toggle").click();
cy.get("[data-testid=browse-v2")
.invoke("css", "width")
- .should("match", /^\dpx$/);
+ .should("match", /\dpx$/);
cy.reload();
cy.get("[data-testid=browse-v2")
.invoke("css", "width")
- .should("match", /^\dpx$/);
+ .should("match", /\dpx$/);
cy.get("[data-testid=browse-v2-toggle").click();
cy.get("[data-testid=browse-v2")
.invoke("css", "width")
- .should("match", /^\d\d\dpx$/);
+ .should("match", /\d\d\dpx$/);
cy.reload();
cy.get("[data-testid=browse-v2")
.invoke("css", "width")
- .should("match", /^\d\d\dpx$/);
+ .should("match", /\d\d\dpx$/);
});
it("should take you to the old browse experience when clicking entity type on home page with the browse flag off", () => {
From abbc4cdc577647d7b97a03117c4317805a3a8ce3 Mon Sep 17 00:00:00 2001
From: Aseem Bansal
Date: Tue, 12 Dec 2023 17:26:29 +0530
Subject: [PATCH 007/540] fix(json-schema): do not send invalid URLs (#9417)
---
.../ingestion/source/schema/json_schema.py | 19 ++++++++++++++++---
1 file changed, 16 insertions(+), 3 deletions(-)
diff --git a/metadata-ingestion/src/datahub/ingestion/source/schema/json_schema.py b/metadata-ingestion/src/datahub/ingestion/source/schema/json_schema.py
index f6e944f4fc3cb3..c7e8a15d8dfa48 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/schema/json_schema.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/schema/json_schema.py
@@ -9,6 +9,7 @@
from os.path import basename, dirname
from pathlib import Path
from typing import Any, Iterable, List, Optional, Union
+from urllib.parse import urlparse
import jsonref
from pydantic import AnyHttpUrl, DirectoryPath, FilePath, validator
@@ -53,6 +54,16 @@
logger = logging.getLogger(__name__)
+def is_url_valid(url: Optional[str]) -> bool:
+ if url is None:
+ return False
+ try:
+ result = urlparse(url)
+ return all([result.scheme, result.netloc])
+ except Exception:
+ return False
+
+
class URIReplacePattern(ConfigModel):
match: str = Field(
description="Pattern to match on uri-s as part of reference resolution. See replace field",
@@ -281,12 +292,14 @@ def _load_one_file(
entityUrn=dataset_urn, aspect=models.StatusClass(removed=False)
).as_workunit()
+ external_url = JsonSchemaTranslator._get_id_from_any_schema(schema_dict)
+ if not is_url_valid(external_url):
+ external_url = None
+
yield MetadataChangeProposalWrapper(
entityUrn=dataset_urn,
aspect=models.DatasetPropertiesClass(
- externalUrl=JsonSchemaTranslator._get_id_from_any_schema(
- schema_dict
- ),
+ externalUrl=external_url,
name=dataset_simple_name,
description=JsonSchemaTranslator._get_description_from_any_schema(
schema_dict
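As a quick illustration of the guard added above, the same urlparse-based check is shown in isolation below with a few example inputs; the sample URLs are made up.

```python
from typing import Optional
from urllib.parse import urlparse


def is_url_valid(url: Optional[str]) -> bool:
    # Mirrors the helper added in this patch: require both a scheme and a host.
    if url is None:
        return False
    try:
        result = urlparse(url)
        return all([result.scheme, result.netloc])
    except Exception:
        return False


assert is_url_valid("https://example.com/schemas/event.json") is True
assert is_url_valid("event.json") is False  # bare file name: no scheme or netloc
assert is_url_valid("not a url") is False
assert is_url_valid(None) is False
```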
From ffccc6556110ea197402ad1de72117ffd5509a8d Mon Sep 17 00:00:00 2001
From: Tamas Nemeth
Date: Tue, 12 Dec 2023 18:31:58 +0100
Subject: [PATCH 008/540] fix(ingest/profiling): Fixing profile eligibility
check (#9446)
---
.../datahub/ingestion/source/sql/sql_generic_profiler.py | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_generic_profiler.py b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_generic_profiler.py
index 844a458d9f1ab6..a2f91e5fae1a98 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_generic_profiler.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_generic_profiler.py
@@ -274,16 +274,16 @@ def is_dataset_eligible_for_profiling(
return False
if self.config.profiling.profile_table_size_limit is not None and (
- size_in_bytes is None
- or size_in_bytes / (2**30)
+ size_in_bytes is not None
+ and size_in_bytes / (2**30)
> self.config.profiling.profile_table_size_limit
):
self.report.profiling_skipped_size_limit[schema_name] += 1
return False
if self.config.profiling.profile_table_row_limit is not None and (
- rows_count is None
- or rows_count > self.config.profiling.profile_table_row_limit
+ rows_count is not None
+ and rows_count > self.config.profiling.profile_table_row_limit
):
self.report.profiling_skipped_row_limit[schema_name] += 1
return False
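The before/after here is subtle, so a condensed sketch of the corrected size check may help: previously an unknown size caused the table to be skipped, whereas with the fix only a known size above the limit does. The limit value below is illustrative.

```python
from typing import Optional


def eligible_by_size(size_in_bytes: Optional[int], limit_gb: Optional[float]) -> bool:
    # Mirrors the fixed condition: skip only when the size is known and too large.
    if limit_gb is not None and (
        size_in_bytes is not None and size_in_bytes / (2**30) > limit_gb
    ):
        return False
    return True


assert eligible_by_size(None, limit_gb=5) is True          # unknown size: still profiled
assert eligible_by_size(10 * 2**30, limit_gb=5) is False   # 10 GiB over a 5 GiB limit
assert eligible_by_size(1 * 2**30, limit_gb=5) is True
```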
From 66f90c7ffd483f397c99dbf494280d3cd9ef10dd Mon Sep 17 00:00:00 2001
From: Harshal Sheth
Date: Tue, 12 Dec 2023 12:32:59 -0500
Subject: [PATCH 009/540] fix(ingest): avoid git dependency in dbt (#9447)
---
metadata-ingestion/src/datahub/configuration/git.py | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/metadata-ingestion/src/datahub/configuration/git.py b/metadata-ingestion/src/datahub/configuration/git.py
index 9ea9007553839b..a5f88744661a4a 100644
--- a/metadata-ingestion/src/datahub/configuration/git.py
+++ b/metadata-ingestion/src/datahub/configuration/git.py
@@ -6,7 +6,6 @@
from datahub.configuration.common import ConfigModel
from datahub.configuration.validate_field_rename import pydantic_renamed_field
-from datahub.ingestion.source.git.git_import import GitClone
_GITHUB_PREFIX = "https://github.com/"
_GITLAB_PREFIX = "https://gitlab.com/"
@@ -151,6 +150,9 @@ def clone(
) -> pathlib.Path:
"""Clones the repo into a temporary directory and returns the path to the checkout."""
+ # We import this here to avoid a hard dependency on gitpython.
+ from datahub.ingestion.source.git.git_import import GitClone
+
assert self.repo_ssh_locator
git_clone = GitClone(str(tmp_path))
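The change above is a standard lazy import: the optional dependency is only loaded when the code path that needs it actually runs. A minimal sketch, assuming only what the hunk shows (GitClone accepting a path string):

```python
import pathlib


def clone_lazily(tmp_path: pathlib.Path):
    # Deferred import: importing datahub.configuration.git no longer pulls in
    # gitpython; the import cost (and failure, if uninstalled) happens only here.
    from datahub.ingestion.source.git.git_import import GitClone

    return GitClone(str(tmp_path))
```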
From 02982ed88600f9b11c2387e540299c437ca21ed6 Mon Sep 17 00:00:00 2001
From: Harshal Sheth
Date: Tue, 12 Dec 2023 12:38:21 -0500
Subject: [PATCH 010/540] feat(ingest): add retries for tableau (#9437)
---
.../src/datahub/ingestion/source/tableau.py | 15 ++++++++++++++-
1 file changed, 14 insertions(+), 1 deletion(-)
diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau.py b/metadata-ingestion/src/datahub/ingestion/source/tableau.py
index da44d09121c6c1..f870e99df27c5f 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/tableau.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/tableau.py
@@ -21,7 +21,7 @@
import tableauserverclient as TSC
from pydantic import root_validator, validator
from pydantic.fields import Field
-from requests.adapters import ConnectionError
+from requests.adapters import ConnectionError, HTTPAdapter
from tableauserverclient import (
PersonalAccessTokenAuth,
Server,
@@ -29,6 +29,7 @@
TableauAuth,
)
from tableauserverclient.server.endpoint.exceptions import NonXMLResponseError
+from urllib3 import Retry
import datahub.emitter.mce_builder as builder
import datahub.utilities.sqlglot_lineage as sqlglot_l
@@ -174,6 +175,7 @@ class TableauConnectionConfig(ConfigModel):
description="Unique relationship between the Tableau Server and site",
)
+ max_retries: int = Field(3, description="Number of retries for failed requests.")
ssl_verify: Union[bool, str] = Field(
default=True,
description="Whether to verify SSL certificates. If using self-signed certificates, set to false or provide the path to the .pem certificate bundle.",
@@ -224,6 +226,17 @@ def make_tableau_client(self) -> Server:
# From https://stackoverflow.com/a/50159273/5004662.
server._session.trust_env = False
+ # Setup request retries.
+ adapter = HTTPAdapter(
+ max_retries=Retry(
+ total=self.max_retries,
+ backoff_factor=1,
+ status_forcelist=[429, 500, 502, 503, 504],
+ )
+ )
+ server._session.mount("http://", adapter)
+ server._session.mount("https://", adapter)
+
server.auth.sign_in(authentication)
return server
except ServerResponseError as e:
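The retry wiring added above is plain requests/urllib3 and works on any requests Session. A standalone sketch with the patch's defaults follows; the Tableau URL is a placeholder.

```python
import requests
from requests.adapters import HTTPAdapter
from urllib3 import Retry

session = requests.Session()
adapter = HTTPAdapter(
    max_retries=Retry(
        total=3,  # matches the new max_retries default
        backoff_factor=1,
        status_forcelist=[429, 500, 502, 503, 504],
    )
)
session.mount("http://", adapter)
session.mount("https://", adapter)

# Requests made through this session now retry transient failures automatically.
# response = session.get("https://tableau.example.com/api/3.19/serverinfo")
```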
From 9899aca4995ec0bd5a7e3ccc6c7e1495b4ee78df Mon Sep 17 00:00:00 2001
From: david-leifker <114954101+david-leifker@users.noreply.github.com>
Date: Tue, 12 Dec 2023 12:16:27 -0600
Subject: [PATCH 011/540] docs(updating-datahub): update docs for v0.12.1
(#9441)
---
docs/how/updating-datahub.md | 17 ++++++++++++++---
1 file changed, 14 insertions(+), 3 deletions(-)
diff --git a/docs/how/updating-datahub.md b/docs/how/updating-datahub.md
index 94ab1b0611c339..36be572f2886e5 100644
--- a/docs/how/updating-datahub.md
+++ b/docs/how/updating-datahub.md
@@ -7,15 +7,26 @@ This file documents any backwards-incompatible changes in DataHub and assists pe
### Breaking Changes
- Updating MySQL version for quickstarts to 8.2, may cause quickstart issues for existing instances.
+
+### Potential Downtime
+
+### Deprecations
+
+### Other Notable Changes
+
+## 0.12.1
+
+### Breaking Changes
+
- #9244: The `redshift-legacy` and `redshift-legacy-usage` sources, which have been deprecated for >6 months, have been removed. The new `redshift` source is a superset of the functionality provided by those legacy sources.
- `database_alias` config is no longer supported in SQL sources namely - Redshift, MySQL, Oracle, Postgres, Trino, Presto-on-hive. The config will automatically be ignored if it's present in your recipe. It has been deprecated since v0.9.6.
- #9257: The Python SDK urn types are now autogenerated. The new classes are largely backwards compatible with the previous, manually written classes, but many older methods are now deprecated in favor of a more uniform interface. The only breaking change is that the signature for the direct constructor e.g. `TagUrn("tag", ["tag_name"])` is no longer supported, and the simpler `TagUrn("tag_name")` should be used instead.
The canonical place to import the urn classes from is `datahub.metadata.urns.*`. Other import paths, like `datahub.utilities.urns.corpuser_urn.CorpuserUrn` are retained for backwards compatibility, but are considered deprecated.
- #9286: The `DataHubRestEmitter.emit` method no longer returns anything. It previously returned a tuple of timestamps.
- #8951: A great expectations based profiler has been added for the Unity Catalog source.
-To use the old profiler, set `method: analyze` under the `profiling` section in your recipe.
-To use the new profiler, set `method: ge`. Profiling is disabled by default, so to enable it,
-one of these methods must be specified.
+ To use the old profiler, set `method: analyze` under the `profiling` section in your recipe.
+ To use the new profiler, set `method: ge`. Profiling is disabled by default, so to enable it,
+ one of these methods must be specified.
### Potential Downtime
From eb8cbd8b4150b31429cf09158cb1113f275ac544 Mon Sep 17 00:00:00 2001
From: Salman-Apptware <101426513+Salman-Apptware@users.noreply.github.com>
Date: Wed, 13 Dec 2023 12:19:49 +0530
Subject: [PATCH 012/540] feat: Allow specifying Data Product URN via UI
(#9386)
Co-authored-by: Aseem Bansal
---
.../DataHubDataFetcherExceptionHandler.java | 40 +++++++----
.../CreateDataProductResolver.java | 1 +
.../src/main/resources/entity.graphql | 4 ++
.../CreateDataProductModal.tsx | 5 +-
.../DataProductAdvancedOption.tsx | 68 +++++++++++++++++++
.../DataProductBuilderForm.tsx | 11 ++-
.../entity/domain/DataProductsTab/types.ts | 6 ++
.../metadata/service/DataProductService.java | 22 +++++-
.../tests/privileges/test_privileges.py | 7 +-
9 files changed, 137 insertions(+), 27 deletions(-)
create mode 100644 datahub-web-react/src/app/entity/domain/DataProductsTab/DataProductAdvancedOption.tsx
diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/exception/DataHubDataFetcherExceptionHandler.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/exception/DataHubDataFetcherExceptionHandler.java
index 7c3ea1d581b6ed..746ce0cdc10fe1 100644
--- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/exception/DataHubDataFetcherExceptionHandler.java
+++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/exception/DataHubDataFetcherExceptionHandler.java
@@ -12,6 +12,8 @@
@Slf4j
public class DataHubDataFetcherExceptionHandler implements DataFetcherExceptionHandler {
+ private static final String DEFAULT_ERROR_MESSAGE = "An unknown error occurred.";
+
@Override
public DataFetcherExceptionHandlerResult onException(
DataFetcherExceptionHandlerParameters handlerParameters) {
@@ -19,28 +21,40 @@ public DataFetcherExceptionHandlerResult onException(
SourceLocation sourceLocation = handlerParameters.getSourceLocation();
ResultPath path = handlerParameters.getPath();
- log.error("Failed to execute DataFetcher", exception);
-
DataHubGraphQLErrorCode errorCode = DataHubGraphQLErrorCode.SERVER_ERROR;
- String message = "An unknown error occurred.";
+ String message = DEFAULT_ERROR_MESSAGE;
- // note: make sure to access the true error message via `getCause()`
- if (exception.getCause() instanceof IllegalArgumentException) {
+ IllegalArgumentException illException =
+ findFirstThrowableCauseOfClass(exception, IllegalArgumentException.class);
+ if (illException != null) {
+ log.error("Failed to execute", illException);
errorCode = DataHubGraphQLErrorCode.BAD_REQUEST;
- message = exception.getCause().getMessage();
+ message = illException.getMessage();
}
- if (exception instanceof DataHubGraphQLException) {
- errorCode = ((DataHubGraphQLException) exception).errorCode();
- message = exception.getMessage();
+ DataHubGraphQLException graphQLException =
+ findFirstThrowableCauseOfClass(exception, DataHubGraphQLException.class);
+ if (graphQLException != null) {
+ log.error("Failed to execute", graphQLException);
+ errorCode = graphQLException.errorCode();
+ message = graphQLException.getMessage();
}
- if (exception.getCause() instanceof DataHubGraphQLException) {
- errorCode = ((DataHubGraphQLException) exception.getCause()).errorCode();
- message = exception.getCause().getMessage();
+ if (illException == null && graphQLException == null) {
+ log.error("Failed to execute", exception);
}
-
DataHubGraphQLError error = new DataHubGraphQLError(message, path, sourceLocation, errorCode);
return DataFetcherExceptionHandlerResult.newResult().error(error).build();
}
+
+  <T extends Throwable> T findFirstThrowableCauseOfClass(Throwable throwable, Class<T> clazz) {
+ while (throwable != null) {
+ if (clazz.isInstance(throwable)) {
+ return (T) throwable;
+ } else {
+ throwable = throwable.getCause();
+ }
+ }
+ return null;
+ }
}
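The helper above generalizes the earlier ad-hoc `getCause()` checks by walking the whole cause chain. For illustration only, a rough Python analog of the same idea (the function name is made up):

```python
from typing import Optional, Type, TypeVar

E = TypeVar("E", bound=BaseException)


def find_first_cause_of_class(
    exc: Optional[BaseException], cls: Type[E]
) -> Optional[E]:
    # Walk exc -> exc.__cause__ -> ... and return the first match, if any.
    while exc is not None:
        if isinstance(exc, cls):
            return exc
        exc = exc.__cause__
    return None
```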
diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/dataproduct/CreateDataProductResolver.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/dataproduct/CreateDataProductResolver.java
index 10c487a839f358..8ac7b2c3ce3754 100644
--- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/dataproduct/CreateDataProductResolver.java
+++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/dataproduct/CreateDataProductResolver.java
@@ -47,6 +47,7 @@ public CompletableFuture get(final DataFetchingEnvironment environm
try {
final Urn dataProductUrn =
_dataProductService.createDataProduct(
+ input.getId(),
input.getProperties().getName(),
input.getProperties().getDescription(),
authentication);
diff --git a/datahub-graphql-core/src/main/resources/entity.graphql b/datahub-graphql-core/src/main/resources/entity.graphql
index feb344154d11e4..307c7f7b383e30 100644
--- a/datahub-graphql-core/src/main/resources/entity.graphql
+++ b/datahub-graphql-core/src/main/resources/entity.graphql
@@ -11055,6 +11055,10 @@ input CreateDataProductInput {
The primary key of the Domain
"""
domainUrn: String!
+ """
+ An optional id for the new data product
+ """
+ id: String
}
"""
diff --git a/datahub-web-react/src/app/entity/domain/DataProductsTab/CreateDataProductModal.tsx b/datahub-web-react/src/app/entity/domain/DataProductsTab/CreateDataProductModal.tsx
index 2d82521a90df58..0610fbfa7a7704 100644
--- a/datahub-web-react/src/app/entity/domain/DataProductsTab/CreateDataProductModal.tsx
+++ b/datahub-web-react/src/app/entity/domain/DataProductsTab/CreateDataProductModal.tsx
@@ -32,6 +32,7 @@ export default function CreateDataProductModal({ domain, onCreateDataProduct, on
variables: {
input: {
domainUrn: domain.urn,
+ id: builderState.id,
properties: {
name: builderState.name,
description: builderState.description || undefined,
@@ -49,10 +50,10 @@ export default function CreateDataProductModal({ domain, onCreateDataProduct, on
onClose();
}
})
- .catch(() => {
+ .catch(( error ) => {
onClose();
message.destroy();
- message.error({ content: 'Failed to create Data Product. An unexpected error occurred' });
+ message.error({ content: `Failed to create Data Product: ${error.message}.` });
});
}
diff --git a/datahub-web-react/src/app/entity/domain/DataProductsTab/DataProductAdvancedOption.tsx b/datahub-web-react/src/app/entity/domain/DataProductsTab/DataProductAdvancedOption.tsx
new file mode 100644
index 00000000000000..a077a0308af1ff
--- /dev/null
+++ b/datahub-web-react/src/app/entity/domain/DataProductsTab/DataProductAdvancedOption.tsx
@@ -0,0 +1,68 @@
+import React from "react";
+import { Collapse, Form, Input, Typography } from "antd";
+import styled from "styled-components";
+import { validateCustomUrnId } from '../../../shared/textUtil';
+import { DataProductBuilderFormProps } from "./types";
+
+
+const FormItem = styled(Form.Item)`
+ .ant-form-item-label {
+ padding-bottom: 2px;
+ }
+`;
+
+const FormItemWithMargin = styled(FormItem)`
+ margin-bottom: 16px;
+`;
+
+const FormItemNoMargin = styled(FormItem)`
+ margin-bottom: 0;
+`;
+
+const AdvancedLabel = styled(Typography.Text)`
+ color: #373d44;
+`;
+
+export function DataProductAdvancedOption({builderState, updateBuilderState }: DataProductBuilderFormProps){
+
+ function updateDataProductId(id: string) {
+ updateBuilderState({
+ ...builderState,
+ id,
+ });
+ }
+
+ return (
+
+ Advanced Options} key="1">
+ Data Product Id}
+ help="By default, a random UUID will be generated to uniquely identify this data product. If
+ you'd like to provide a custom id instead to more easily keep track of this data product,
+ you may provide it here. Be careful, you cannot easily change the data product id after
+ creation."
+ >
+ ({
+ validator(_, value) {
+ if (value && validateCustomUrnId(value)) {
+ return Promise.resolve();
+ }
+ return Promise.reject(new Error('Please enter a valid Data product id'));
+ },
+ }),
+ ]}
+ >
+ updateDataProductId(e.target.value)}
+ />
+
+
+
+
+ )
+}
\ No newline at end of file
diff --git a/datahub-web-react/src/app/entity/domain/DataProductsTab/DataProductBuilderForm.tsx b/datahub-web-react/src/app/entity/domain/DataProductsTab/DataProductBuilderForm.tsx
index b5a27a6e1b8766..98bb09098a36ea 100644
--- a/datahub-web-react/src/app/entity/domain/DataProductsTab/DataProductBuilderForm.tsx
+++ b/datahub-web-react/src/app/entity/domain/DataProductsTab/DataProductBuilderForm.tsx
@@ -3,18 +3,14 @@ import React from 'react';
import styled from 'styled-components';
import { Editor as MarkdownEditor } from '../../shared/tabs/Documentation/components/editor/Editor';
import { ANTD_GRAY } from '../../shared/constants';
-import { DataProductBuilderState } from './types';
+import { DataProductBuilderFormProps } from './types';
+import { DataProductAdvancedOption } from './DataProductAdvancedOption';
const StyledEditor = styled(MarkdownEditor)`
border: 1px solid ${ANTD_GRAY[4]};
`;
-type Props = {
- builderState: DataProductBuilderState;
- updateBuilderState: (newState: DataProductBuilderState) => void;
-};
-
-export default function DataProductBuilderForm({ builderState, updateBuilderState }: Props) {
+export default function DataProductBuilderForm({ builderState, updateBuilderState }: DataProductBuilderFormProps) {
function updateName(name: string) {
updateBuilderState({
...builderState,
@@ -47,6 +43,7 @@ export default function DataProductBuilderForm({ builderState, updateBuilderStat
Description}>
+
);
}
diff --git a/datahub-web-react/src/app/entity/domain/DataProductsTab/types.ts b/datahub-web-react/src/app/entity/domain/DataProductsTab/types.ts
index 1ed3ede39cfbe4..fe22e3ed9a2a4b 100644
--- a/datahub-web-react/src/app/entity/domain/DataProductsTab/types.ts
+++ b/datahub-web-react/src/app/entity/domain/DataProductsTab/types.ts
@@ -1,4 +1,10 @@
export type DataProductBuilderState = {
name: string;
+ id?: string;
description?: string;
};
+
+export type DataProductBuilderFormProps = {
+ builderState: DataProductBuilderState;
+ updateBuilderState: (newState: DataProductBuilderState) => void;
+};
\ No newline at end of file
diff --git a/metadata-service/services/src/main/java/com/linkedin/metadata/service/DataProductService.java b/metadata-service/services/src/main/java/com/linkedin/metadata/service/DataProductService.java
index 10016ee89605b9..d60427a27a5c59 100644
--- a/metadata-service/services/src/main/java/com/linkedin/metadata/service/DataProductService.java
+++ b/metadata-service/services/src/main/java/com/linkedin/metadata/service/DataProductService.java
@@ -1,5 +1,7 @@
package com.linkedin.metadata.service;
+import static com.linkedin.metadata.Constants.DATA_PRODUCT_ENTITY_NAME;
+
import com.datahub.authentication.Authentication;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableSet;
@@ -22,6 +24,7 @@
import com.linkedin.metadata.graph.GraphClient;
import com.linkedin.metadata.query.filter.RelationshipDirection;
import com.linkedin.metadata.utils.EntityKeyUtils;
+import com.linkedin.r2.RemoteInvocationException;
import java.util.List;
import java.util.Objects;
import java.util.UUID;
@@ -58,11 +61,26 @@ public DataProductService(@Nonnull EntityClient entityClient, @Nonnull GraphClie
* @return the urn of the newly created DataProduct
*/
public Urn createDataProduct(
- @Nullable String name, @Nullable String description, @Nonnull Authentication authentication) {
+ @Nullable String id,
+ @Nullable String name,
+ @Nullable String description,
+ @Nonnull Authentication authentication) {
// 1. Generate a unique id for the new DataProduct.
final DataProductKey key = new DataProductKey();
- key.setId(UUID.randomUUID().toString());
+ if (id != null && !id.isBlank()) {
+ key.setId(id);
+ } else {
+ key.setId(UUID.randomUUID().toString());
+ }
+ try {
+ if (_entityClient.exists(
+ EntityKeyUtils.convertEntityKeyToUrn(key, DATA_PRODUCT_ENTITY_NAME), authentication)) {
+ throw new IllegalArgumentException("This Data product already exists!");
+ }
+ } catch (RemoteInvocationException e) {
+ throw new RuntimeException("Unable to check for existence of Data Product!");
+ }
// 2. Create a new instance of DataProductProperties
final DataProductProperties properties = new DataProductProperties();
diff --git a/smoke-test/tests/privileges/test_privileges.py b/smoke-test/tests/privileges/test_privileges.py
index aa54a50b04e7f8..75e2265f1f5551 100644
--- a/smoke-test/tests/privileges/test_privileges.py
+++ b/smoke-test/tests/privileges/test_privileges.py
@@ -63,7 +63,7 @@ def _ensure_cant_perform_action(session, json,assertion_key):
action_response.raise_for_status()
action_data = action_response.json()
- assert action_data["errors"][0]["extensions"]["code"] == 403
+ assert action_data["errors"][0]["extensions"]["code"] == 403, action_data["errors"][0]
assert action_data["errors"][0]["extensions"]["type"] == "UNAUTHORIZED"
assert action_data["data"][assertion_key] == None
@@ -367,8 +367,9 @@ def test_privilege_to_create_and_manage_policies():
# Verify new user can't create a policy
create_policy = {
- "query": """mutation createPolicy($input: PolicyUpdateInput!) {\n
- createPolicy(input: $input) }""",
+ "query": """mutation createPolicy($input: PolicyUpdateInput!) {
+ createPolicy(input: $input)
+ }""",
"variables": {
"input": {
"type": "PLATFORM",
From 5af799ee892a0a1f9655ff569c4da63ffa976e52 Mon Sep 17 00:00:00 2001
From: Salman-Apptware <101426513+Salman-Apptware@users.noreply.github.com>
Date: Wed, 13 Dec 2023 14:31:24 +0530
Subject: [PATCH 013/540] feat(ownership): add button to copy urn of an
Ownership Type (#9452)
---
.../entity/ownership/table/ActionsColumn.tsx | 18 +++++++++++++++++-
1 file changed, 17 insertions(+), 1 deletion(-)
diff --git a/datahub-web-react/src/app/entity/ownership/table/ActionsColumn.tsx b/datahub-web-react/src/app/entity/ownership/table/ActionsColumn.tsx
index 41e07520a0ece5..e08853ad150bfc 100644
--- a/datahub-web-react/src/app/entity/ownership/table/ActionsColumn.tsx
+++ b/datahub-web-react/src/app/entity/ownership/table/ActionsColumn.tsx
@@ -1,6 +1,6 @@
import React from 'react';
import { Dropdown, MenuProps, Popconfirm, Typography, message, notification } from 'antd';
-import { DeleteOutlined, EditOutlined, MoreOutlined } from '@ant-design/icons';
+import { CopyOutlined, DeleteOutlined, EditOutlined, MoreOutlined } from '@ant-design/icons';
import styled from 'styled-components/macro';
import { OwnershipTypeEntity } from '../../../../types.generated';
import { useDeleteOwnershipTypeMutation } from '../../../../graphql/ownership.generated';
@@ -48,6 +48,10 @@ export const ActionsColumn = ({ ownershipType, setIsOpen, setOwnershipType, refe
setOwnershipType(ownershipType);
};
+ const onCopy=() => {
+ navigator.clipboard.writeText(ownershipType.urn);
+ }
+
const [deleteOwnershipTypeMutation] = useDeleteOwnershipTypeMutation();
const onDelete = () => {
@@ -106,6 +110,15 @@ export const ActionsColumn = ({ ownershipType, setIsOpen, setOwnershipType, refe
),
},
+ {
+ key: 'copy',
+ icon: (
+
+
+ Copy Urn
+
+ ),
+ },
];
const onClick: MenuProps['onClick'] = (e) => {
@@ -113,6 +126,9 @@ export const ActionsColumn = ({ ownershipType, setIsOpen, setOwnershipType, refe
if (key === 'edit') {
editOnClick();
}
+ else if( key === 'copy') {
+ onCopy();
+ }
};
const menuProps: MenuProps = {
From a92230b32162dc26776210a3278eadaafaa6e08e Mon Sep 17 00:00:00 2001
From: 김가윤
<60080153+KaYunKIM@users.noreply.github.com>
Date: Thu, 14 Dec 2023 02:30:18 +0900
Subject: [PATCH 014/540] docs(ingest/tableau): add token to sink config in
sample recipe (#9411)
Co-authored-by: KaYunKIM
Co-authored-by: Harshal Sheth
---
metadata-ingestion/examples/recipes/tableau_to_datahub.dhub.yaml | 1 +
1 file changed, 1 insertion(+)
diff --git a/metadata-ingestion/examples/recipes/tableau_to_datahub.dhub.yaml b/metadata-ingestion/examples/recipes/tableau_to_datahub.dhub.yaml
index ed6567b5889df1..a9db27bb52a233 100644
--- a/metadata-ingestion/examples/recipes/tableau_to_datahub.dhub.yaml
+++ b/metadata-ingestion/examples/recipes/tableau_to_datahub.dhub.yaml
@@ -18,3 +18,4 @@ sink:
type: "datahub-rest"
config:
server: "http://localhost:8080"
+ token: token_value # optional
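For reference, a rough programmatic equivalent of this sink config using the Python REST emitter; the server URL and token below are placeholder values, mirroring the recipe above.

```python
from datahub.emitter.rest_emitter import DatahubRestEmitter

emitter = DatahubRestEmitter(
    gms_server="http://localhost:8080",
    token="token_value",  # optional; only needed when the server enforces auth
)
emitter.test_connection()
```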
From 3cde9549a290d2560d9eebaa4fc5a3521266a841 Mon Sep 17 00:00:00 2001
From: allizex <150264485+allizex@users.noreply.github.com>
Date: Wed, 13 Dec 2023 20:26:45 +0100
Subject: [PATCH 015/540] feat(glossary): add ability to clone glossary
term (name and documentation) from term profile menu (#9445)
Co-authored-by: Olga Dimova <38855943+olgadimova@users.noreply.github.com>
---
.../glossaryTerm/GlossaryTermEntity.tsx | 7 +++-
.../CreateGlossaryEntityModal.tsx | 34 ++++++++++++++++---
.../shared/EntityDropdown/EntityDropdown.tsx | 22 ++++++++++++
.../src/app/entity/shared/types.ts | 1 +
4 files changed, 59 insertions(+), 5 deletions(-)
diff --git a/datahub-web-react/src/app/entity/glossaryTerm/GlossaryTermEntity.tsx b/datahub-web-react/src/app/entity/glossaryTerm/GlossaryTermEntity.tsx
index 080ee5889aec92..a6f6d9b0e28671 100644
--- a/datahub-web-react/src/app/entity/glossaryTerm/GlossaryTermEntity.tsx
+++ b/datahub-web-react/src/app/entity/glossaryTerm/GlossaryTermEntity.tsx
@@ -65,7 +65,12 @@ export class GlossaryTermEntity implements Entity {
useEntityQuery={useGetGlossaryTermQuery as any}
headerActionItems={new Set([EntityActionItem.BATCH_ADD_GLOSSARY_TERM])}
headerDropdownItems={
- new Set([EntityMenuItems.UPDATE_DEPRECATION, EntityMenuItems.MOVE, EntityMenuItems.DELETE])
+ new Set([
+ EntityMenuItems.UPDATE_DEPRECATION,
+ EntityMenuItems.CLONE,
+ EntityMenuItems.MOVE,
+ EntityMenuItems.DELETE,
+ ])
}
isNameEditable
hideBrowseBar
diff --git a/datahub-web-react/src/app/entity/shared/EntityDropdown/CreateGlossaryEntityModal.tsx b/datahub-web-react/src/app/entity/shared/EntityDropdown/CreateGlossaryEntityModal.tsx
index 9788d36af2c65a..d60e86b0af8ca4 100644
--- a/datahub-web-react/src/app/entity/shared/EntityDropdown/CreateGlossaryEntityModal.tsx
+++ b/datahub-web-react/src/app/entity/shared/EntityDropdown/CreateGlossaryEntityModal.tsx
@@ -1,8 +1,9 @@
-import React, { useState } from 'react';
+import React, { useEffect, useState } from 'react';
import styled from 'styled-components/macro';
import { EditOutlined } from '@ant-design/icons';
import { message, Button, Input, Modal, Typography, Form, Collapse } from 'antd';
import DOMPurify from 'dompurify';
+import { useHistory } from 'react-router';
import {
useCreateGlossaryTermMutation,
useCreateGlossaryNodeMutation,
@@ -16,6 +17,7 @@ import DescriptionModal from '../components/legacy/DescriptionModal';
import { validateCustomUrnId } from '../../../shared/textUtil';
import { useGlossaryEntityData } from '../GlossaryEntityContext';
import { getGlossaryRootToUpdate, updateGlossarySidebar } from '../../../glossary/utils';
+import { getEntityPath } from '../containers/profile/utils';
const StyledItem = styled(Form.Item)`
margin-bottom: 0;
@@ -33,6 +35,7 @@ interface Props {
entityType: EntityType;
onClose: () => void;
refetchData?: () => void;
+ isCloning?: boolean;
}
function CreateGlossaryEntityModal(props: Props) {
@@ -43,15 +46,31 @@ function CreateGlossaryEntityModal(props: Props) {
const entityRegistry = useEntityRegistry();
const [stagedId, setStagedId] = useState(undefined);
const [stagedName, setStagedName] = useState('');
- const [selectedParentUrn, setSelectedParentUrn] = useState(entityData.urn);
+ const [selectedParentUrn, setSelectedParentUrn] = useState(props.isCloning ? '' : entityData.urn);
const [documentation, setDocumentation] = useState('');
const [isDocumentationModalVisible, setIsDocumentationModalVisible] = useState(false);
const [createButtonDisabled, setCreateButtonDisabled] = useState(true);
const refetch = useRefetch();
+ const history = useHistory();
const [createGlossaryTermMutation] = useCreateGlossaryTermMutation();
const [createGlossaryNodeMutation] = useCreateGlossaryNodeMutation();
+ useEffect(() => {
+ if (props.isCloning && entityData.entityData) {
+ const { properties } = entityData.entityData;
+
+ if (properties?.name) {
+ setStagedName(properties.name);
+ form.setFieldValue('name', properties.name);
+ }
+
+ if (properties?.description) {
+ setDocumentation(properties.description);
+ }
+ }
+ }, [props.isCloning, entityData.entityData, form]);
+
function createGlossaryEntity() {
const mutation =
entityType === EntityType.GlossaryTerm ? createGlossaryTermMutation : createGlossaryNodeMutation;
@@ -67,7 +86,7 @@ function CreateGlossaryEntityModal(props: Props) {
},
},
})
- .then(() => {
+ .then((res) => {
message.loading({ content: 'Updating...', duration: 2 });
setTimeout(() => {
analytics.event({
@@ -82,12 +101,19 @@ function CreateGlossaryEntityModal(props: Props) {
refetch();
if (isInGlossaryContext) {
// either refresh this current glossary node or the root nodes or root terms
- const nodeToUpdate = entityData?.urn || getGlossaryRootToUpdate(entityType);
+ const nodeToUpdate = selectedParentUrn || getGlossaryRootToUpdate(entityType);
updateGlossarySidebar([nodeToUpdate], urnsToUpdate, setUrnsToUpdate);
}
if (refetchData) {
refetchData();
}
+ if (props.isCloning) {
+ const redirectUrn =
+ entityType === EntityType.GlossaryTerm
+ ? res.data?.createGlossaryTerm
+ : res.data?.createGlossaryNode;
+ history.push(getEntityPath(entityType, redirectUrn, entityRegistry, false, false));
+ }
}, 2000);
})
.catch((e) => {
diff --git a/datahub-web-react/src/app/entity/shared/EntityDropdown/EntityDropdown.tsx b/datahub-web-react/src/app/entity/shared/EntityDropdown/EntityDropdown.tsx
index 5d4f9d9f875cfe..8d7f1cca9c1cbd 100644
--- a/datahub-web-react/src/app/entity/shared/EntityDropdown/EntityDropdown.tsx
+++ b/datahub-web-react/src/app/entity/shared/EntityDropdown/EntityDropdown.tsx
@@ -9,6 +9,7 @@ import {
LinkOutlined,
MoreOutlined,
PlusOutlined,
+ CopyOutlined,
} from '@ant-design/icons';
import { Redirect } from 'react-router';
import { EntityType } from '../../../../types.generated';
@@ -32,6 +33,7 @@ export enum EntityMenuItems {
ADD_TERM_GROUP,
DELETE,
MOVE,
+ CLONE,
}
export const MenuIcon = styled(MoreOutlined)<{ fontSize?: number }>`
@@ -107,6 +109,7 @@ function EntityDropdown(props: Props) {
const [isCreateTermModalVisible, setIsCreateTermModalVisible] = useState(false);
const [isCreateNodeModalVisible, setIsCreateNodeModalVisible] = useState(false);
+ const [isCloneEntityModalVisible, setIsCloneEntityModalVisible] = useState(false);
const [isDeprecationModalVisible, setIsDeprecationModalVisible] = useState(false);
const [isMoveModalVisible, setIsMoveModalVisible] = useState(false);
@@ -230,6 +233,17 @@ function EntityDropdown(props: Props) {
)}
+ {menuItems.has(EntityMenuItems.CLONE) && (
+ setIsCloneEntityModalVisible(true)}
+ >
+
+ Clone
+
+
+ )}
}
trigger={['click']}
@@ -250,6 +264,14 @@ function EntityDropdown(props: Props) {
refetchData={refetchForNodes}
/>
)}
+ {isCloneEntityModalVisible && (
+ setIsCloneEntityModalVisible(false)}
+ refetchData={entityType === EntityType.GlossaryTerm ? refetchForTerms : refetchForNodes}
+ isCloning
+ />
+ )}
{isDeprecationModalVisible && (
;
properties?: Maybe<{
+ name?: Maybe;
description?: Maybe;
qualifiedName?: Maybe;
sourceUrl?: Maybe;
From a495d652e0e08885ce35eb3110a27853c2c05071 Mon Sep 17 00:00:00 2001
From: skrydal
Date: Wed, 13 Dec 2023 20:34:20 +0100
Subject: [PATCH 016/540] feat(ingestion): Add typeUrn handling to ownership
transformers (#9370)
---
.../docs/transformer/dataset_transformer.md | 32 +++++++-------
.../src/datahub/emitter/mce_builder.py | 31 ++++++-------
.../transformer/add_dataset_ownership.py | 34 +++++---------
.../tests/unit/test_pipeline.py | 5 ++-
.../tests/unit/test_transform_dataset.py | 44 ++++++++++++++++++-
5 files changed, 86 insertions(+), 60 deletions(-)
diff --git a/metadata-ingestion/docs/transformer/dataset_transformer.md b/metadata-ingestion/docs/transformer/dataset_transformer.md
index d1a1555a3ca022..1c84a2759d23e6 100644
--- a/metadata-ingestion/docs/transformer/dataset_transformer.md
+++ b/metadata-ingestion/docs/transformer/dataset_transformer.md
@@ -55,12 +55,12 @@ transformers:
```
## Simple Add Dataset ownership
### Config Details
-| Field | Required | Type | Default | Description |
-|-----------------------------|----------|--------------|---------------|------------------------------------------------------------------|
-| `owner_urns` | ✅ | list[string] | | List of owner urns. |
-| `ownership_type` | | string | `DATAOWNER` | ownership type of the owners. |
-| `replace_existing` | | boolean | `false` | Whether to remove owners from entity sent by ingestion source. |
-| `semantics` | | enum | `OVERWRITE` | Whether to OVERWRITE or PATCH the entity present on DataHub GMS. |
+| Field | Required | Type | Default | Description |
+|--------------------|----------|--------------|-------------|---------------------------------------------------------------------|
+| `owner_urns` | ✅ | list[string] | | List of owner urns. |
+| `ownership_type` | | string | "DATAOWNER" | ownership type of the owners (either as enum or ownership type urn) |
+| `replace_existing` | | boolean | `false` | Whether to remove owners from entity sent by ingestion source. |
+| `semantics` | | enum | `OVERWRITE` | Whether to OVERWRITE or PATCH the entity present on DataHub GMS. |
For transformer behaviour on `replace_existing` and `semantics`, please refer section [Relationship Between replace_existing And semantics](#relationship-between-replace_existing-and-semantics).
@@ -95,7 +95,7 @@ transformers:
- "urn:li:corpuser:username1"
- "urn:li:corpuser:username2"
- "urn:li:corpGroup:groupname"
- ownership_type: "PRODUCER"
+ ownership_type: "urn:li:ownershipType:__system__producer"
```
- Add owners, however overwrite the owners available for the dataset on DataHub GMS
```yaml
@@ -107,7 +107,7 @@ transformers:
- "urn:li:corpuser:username1"
- "urn:li:corpuser:username2"
- "urn:li:corpGroup:groupname"
- ownership_type: "PRODUCER"
+ ownership_type: "urn:li:ownershipType:__system__producer"
```
- Add owners, however keep the owners available for the dataset on DataHub GMS
```yaml
@@ -124,12 +124,12 @@ transformers:
## Pattern Add Dataset ownership
### Config Details
-| Field | Required | Type | Default | Description |
-|-----------------------------|--------- |-----------------------|------------------|-----------------------------------------------------------------------------------------|
-| `owner_pattern` | ✅ | map[regx, list[urn]] | | entity urn with regular expression and list of owners urn apply to matching entity urn. |
-| `ownership_type` | | string | `DATAOWNER` | ownership type of the owners. |
-| `replace_existing` | | boolean | `false` | Whether to remove owners from entity sent by ingestion source. |
-| `semantics` | | enum | `OVERWRITE` | Whether to OVERWRITE or PATCH the entity present on DataHub GMS. |
+| Field | Required | Type | Default | Description |
+|--------------------|----------|----------------------|-------------|-----------------------------------------------------------------------------------------|
+| `owner_pattern` | ✅ | map[regx, list[urn]] | | entity urn with regular expression and list of owners urn apply to matching entity urn. |
+| `ownership_type` | | string | "DATAOWNER" | ownership type of the owners (either as enum or ownership type urn) |
+| `replace_existing` | | boolean | `false` | Whether to remove owners from entity sent by ingestion source. |
+| `semantics` | | enum | `OVERWRITE` | Whether to OVERWRITE or PATCH the entity present on DataHub GMS. |
let’s suppose we’d like to append a series of users who we know to own a different dataset from a data source but aren't detected during normal ingestion. To do so, we can use the `pattern_add_dataset_ownership` module that’s included in the ingestion framework. This will match the pattern to `urn` of the dataset and assign the respective owners.
@@ -158,7 +158,7 @@ The config, which we’d append to our ingestion recipe YAML, would look like th
rules:
".*example1.*": ["urn:li:corpuser:username1"]
".*example2.*": ["urn:li:corpuser:username2"]
- ownership_type: "PRODUCER"
+ ownership_type: "urn:li:ownershipType:__system__producer"
```
- Add owner, however overwrite the owners available for the dataset on DataHub GMS
```yaml
@@ -170,7 +170,7 @@ The config, which we’d append to our ingestion recipe YAML, would look like th
rules:
".*example1.*": ["urn:li:corpuser:username1"]
".*example2.*": ["urn:li:corpuser:username2"]
- ownership_type: "PRODUCER"
+ ownership_type: "urn:li:ownershipType:__system__producer"
```
- Add owner, however keep the owners available for the dataset on DataHub GMS
```yaml
diff --git a/metadata-ingestion/src/datahub/emitter/mce_builder.py b/metadata-ingestion/src/datahub/emitter/mce_builder.py
index 64c9ec1bb5704d..3b2c87ea25a314 100644
--- a/metadata-ingestion/src/datahub/emitter/mce_builder.py
+++ b/metadata-ingestion/src/datahub/emitter/mce_builder.py
@@ -9,12 +9,13 @@
from typing import (
TYPE_CHECKING,
Any,
+ Iterable,
List,
Optional,
+ Tuple,
Type,
TypeVar,
Union,
- cast,
get_type_hints,
)
@@ -342,26 +343,20 @@ def make_ml_model_group_urn(platform: str, group_name: str, env: str) -> str:
)
-def is_valid_ownership_type(ownership_type: Optional[str]) -> bool:
- return ownership_type is not None and ownership_type in [
- OwnershipTypeClass.TECHNICAL_OWNER,
- OwnershipTypeClass.BUSINESS_OWNER,
- OwnershipTypeClass.DATA_STEWARD,
- OwnershipTypeClass.NONE,
- OwnershipTypeClass.DEVELOPER,
- OwnershipTypeClass.DATAOWNER,
- OwnershipTypeClass.DELEGATE,
- OwnershipTypeClass.PRODUCER,
- OwnershipTypeClass.CONSUMER,
- OwnershipTypeClass.STAKEHOLDER,
+def get_class_fields(_class: Type[object]) -> Iterable[str]:
+ return [
+ f
+ for f in dir(_class)
+ if not callable(getattr(_class, f)) and not f.startswith("_")
]
-def validate_ownership_type(ownership_type: Optional[str]) -> str:
- if is_valid_ownership_type(ownership_type):
- return cast(str, ownership_type)
- else:
- raise ValueError(f"Unexpected ownership type: {ownership_type}")
+def validate_ownership_type(ownership_type: str) -> Tuple[str, Optional[str]]:
+ if ownership_type.startswith("urn:li:"):
+ return OwnershipTypeClass.CUSTOM, ownership_type
+ if ownership_type in get_class_fields(OwnershipTypeClass):
+ return ownership_type, None
+ raise ValueError(f"Unexpected ownership type: {ownership_type}")
def make_lineage_mce(
diff --git a/metadata-ingestion/src/datahub/ingestion/transformer/add_dataset_ownership.py b/metadata-ingestion/src/datahub/ingestion/transformer/add_dataset_ownership.py
index 71cf6cfa7e92bf..73cb8e4d6739bd 100644
--- a/metadata-ingestion/src/datahub/ingestion/transformer/add_dataset_ownership.py
+++ b/metadata-ingestion/src/datahub/ingestion/transformer/add_dataset_ownership.py
@@ -14,11 +14,8 @@
from datahub.ingestion.transformer.dataset_transformer import (
DatasetOwnershipTransformer,
)
-from datahub.metadata.schema_classes import (
- OwnerClass,
- OwnershipClass,
- OwnershipTypeClass,
-)
+from datahub.metadata._schema_classes import OwnershipTypeClass
+from datahub.metadata.schema_classes import OwnerClass, OwnershipClass
class AddDatasetOwnershipConfig(TransformerSemanticsConfigModel):
@@ -102,7 +99,7 @@ def transform_aspect(
class DatasetOwnershipBaseConfig(TransformerSemanticsConfigModel):
- ownership_type: Optional[str] = OwnershipTypeClass.DATAOWNER
+ ownership_type: str = OwnershipTypeClass.DATAOWNER
class SimpleDatasetOwnershipConfig(DatasetOwnershipBaseConfig):
@@ -114,11 +111,14 @@ class SimpleAddDatasetOwnership(AddDatasetOwnership):
"""Transformer that adds a specified set of owners to each dataset."""
def __init__(self, config: SimpleDatasetOwnershipConfig, ctx: PipelineContext):
- ownership_type = builder.validate_ownership_type(config.ownership_type)
+ ownership_type, ownership_type_urn = builder.validate_ownership_type(
+ config.ownership_type
+ )
owners = [
OwnerClass(
owner=owner,
type=ownership_type,
+ typeUrn=ownership_type_urn,
)
for owner in config.owner_urns
]
@@ -147,29 +147,17 @@ class PatternDatasetOwnershipConfig(DatasetOwnershipBaseConfig):
class PatternAddDatasetOwnership(AddDatasetOwnership):
"""Transformer that adds a specified set of owners to each dataset."""
- def getOwners(
- self,
- key: str,
- owner_pattern: KeyValuePattern,
- ownership_type: Optional[str] = None,
- ) -> List[OwnerClass]:
- owners = [
- OwnerClass(
- owner=owner,
- type=builder.validate_ownership_type(ownership_type),
- )
- for owner in owner_pattern.value(key)
- ]
- return owners
-
def __init__(self, config: PatternDatasetOwnershipConfig, ctx: PipelineContext):
- ownership_type = builder.validate_ownership_type(config.ownership_type)
owner_pattern = config.owner_pattern
+ ownership_type, ownership_type_urn = builder.validate_ownership_type(
+ config.ownership_type
+ )
generic_config = AddDatasetOwnershipConfig(
get_owners_to_add=lambda urn: [
OwnerClass(
owner=owner,
type=ownership_type,
+ typeUrn=ownership_type_urn,
)
for owner in owner_pattern.value(urn)
],
diff --git a/metadata-ingestion/tests/unit/test_pipeline.py b/metadata-ingestion/tests/unit/test_pipeline.py
index 7ce78f0ab3e13a..0f3c984196a784 100644
--- a/metadata-ingestion/tests/unit/test_pipeline.py
+++ b/metadata-ingestion/tests/unit/test_pipeline.py
@@ -214,7 +214,10 @@ def test_run_including_registered_transformation(self):
"transformers": [
{
"type": "simple_add_dataset_ownership",
- "config": {"owner_urns": ["urn:li:corpuser:foo"]},
+ "config": {
+ "owner_urns": ["urn:li:corpuser:foo"],
+ "ownership_type": "urn:li:ownershipType:__system__technical_owner",
+ },
}
],
"sink": {"type": "tests.test_helpers.sink_helpers.RecordingSink"},
diff --git a/metadata-ingestion/tests/unit/test_transform_dataset.py b/metadata-ingestion/tests/unit/test_transform_dataset.py
index bc95451620d22f..8014df2f5c519d 100644
--- a/metadata-ingestion/tests/unit/test_transform_dataset.py
+++ b/metadata-ingestion/tests/unit/test_transform_dataset.py
@@ -234,7 +234,7 @@ def test_simple_dataset_ownership_transformation(mock_time):
assert last_event.entityUrn == outputs[0].record.proposedSnapshot.urn
assert all(
[
- owner.type == models.OwnershipTypeClass.DATAOWNER
+ owner.type == models.OwnershipTypeClass.DATAOWNER and owner.typeUrn is None
for owner in last_event.aspect.owners
]
)
@@ -247,7 +247,7 @@ def test_simple_dataset_ownership_transformation(mock_time):
assert len(second_ownership_aspect.owners) == 3
assert all(
[
- owner.type == models.OwnershipTypeClass.DATAOWNER
+ owner.type == models.OwnershipTypeClass.DATAOWNER and owner.typeUrn is None
for owner in second_ownership_aspect.owners
]
)
@@ -293,6 +293,44 @@ def test_simple_dataset_ownership_with_type_transformation(mock_time):
assert ownership_aspect.owners[0].type == models.OwnershipTypeClass.PRODUCER
+def test_simple_dataset_ownership_with_type_urn_transformation(mock_time):
+ input = make_generic_dataset()
+
+ transformer = SimpleAddDatasetOwnership.create(
+ {
+ "owner_urns": [
+ builder.make_user_urn("person1"),
+ ],
+ "ownership_type": "urn:li:ownershipType:__system__technical_owner",
+ },
+ PipelineContext(run_id="test"),
+ )
+
+ output = list(
+ transformer.transform(
+ [
+ RecordEnvelope(input, metadata={}),
+ RecordEnvelope(EndOfStream(), metadata={}),
+ ]
+ )
+ )
+
+ assert len(output) == 3
+
+ # original MCE is unchanged
+ assert input == output[0].record
+
+ ownership_aspect = output[1].record.aspect
+
+ assert isinstance(ownership_aspect, OwnershipClass)
+ assert len(ownership_aspect.owners) == 1
+ assert ownership_aspect.owners[0].type == OwnershipTypeClass.CUSTOM
+ assert (
+ ownership_aspect.owners[0].typeUrn
+ == "urn:li:ownershipType:__system__technical_owner"
+ )
+
+
def _test_extract_tags(in_urn: str, regex_str: str, out_tag: str) -> None:
input = make_generic_dataset(entity_urn=in_urn)
transformer = ExtractDatasetTags.create(
@@ -883,6 +921,7 @@ def test_pattern_dataset_ownership_transformation(mock_time):
".*example2.*": [builder.make_user_urn("person2")],
}
},
+ "ownership_type": "DATAOWNER",
},
PipelineContext(run_id="test"),
)
@@ -2233,6 +2272,7 @@ def fake_ownership_class(entity_urn: str) -> models.OwnershipClass:
"replace_existing": False,
"semantics": TransformerSemantics.PATCH,
"owner_urns": [owner2],
+ "ownership_type": "DATAOWNER",
},
pipeline_context=pipeline_context,
)
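To make the new behaviour concrete, here is a short sketch of how `validate_ownership_type` (as shown in the `mce_builder.py` hunk above) feeds `OwnerClass`: plain enum names keep the old behaviour, while ownership-type urns are carried in `typeUrn` with `type` set to `CUSTOM`.

```python
from datahub.emitter.mce_builder import make_user_urn, validate_ownership_type
from datahub.metadata.schema_classes import OwnerClass, OwnershipTypeClass

assert validate_ownership_type("DATAOWNER") == (OwnershipTypeClass.DATAOWNER, None)
assert validate_ownership_type("urn:li:ownershipType:__system__producer") == (
    OwnershipTypeClass.CUSTOM,
    "urn:li:ownershipType:__system__producer",
)

owner_type, owner_type_urn = validate_ownership_type(
    "urn:li:ownershipType:__system__technical_owner"
)
owner = OwnerClass(
    owner=make_user_urn("person1"),
    type=owner_type,
    typeUrn=owner_type_urn,  # None for plain enum values, the urn otherwise
)
```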
From 32d237b56f54c83bd7b8d343b04d36f53ae72d0a Mon Sep 17 00:00:00 2001
From: Arun Vasudevan <12974850+arunvasudevan@users.noreply.github.com>
Date: Wed, 13 Dec 2023 16:02:21 -0600
Subject: [PATCH 017/540] fix(ingest): reduce GraphQL Logs to warning for
circuit breaker (#9436)
---
.../src/datahub/api/circuit_breaker/__init__.py | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/metadata-ingestion/src/datahub/api/circuit_breaker/__init__.py b/metadata-ingestion/src/datahub/api/circuit_breaker/__init__.py
index 4dcf40454736b9..27317826264b85 100644
--- a/metadata-ingestion/src/datahub/api/circuit_breaker/__init__.py
+++ b/metadata-ingestion/src/datahub/api/circuit_breaker/__init__.py
@@ -1,3 +1,7 @@
+import logging
+
+from gql.transport.requests import log as requests_logger
+
from datahub.api.circuit_breaker.assertion_circuit_breaker import (
AssertionCircuitBreaker,
AssertionCircuitBreakerConfig,
@@ -6,3 +10,5 @@
OperationCircuitBreaker,
OperationCircuitBreakerConfig,
)
+
+requests_logger.setLevel(logging.WARNING)
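The same technique works for any overly chatty third-party logger; the logger name below is the one implied by the `gql.transport.requests` import in the hunk above.

```python
import logging

# Equivalent effect without importing the logger object directly.
logging.getLogger("gql.transport.requests").setLevel(logging.WARNING)
```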
From 288e458739ec15e0d294ed5c0eb54963fee01071 Mon Sep 17 00:00:00 2001
From: Salman-Apptware <101426513+Salman-Apptware@users.noreply.github.com>
Date: Thu, 14 Dec 2023 06:19:05 +0530
Subject: [PATCH 018/540] refactor(ui): support Apollo caching for settings /
Policies (#9442)
---
.../app/permissions/policy/ManagePolicies.tsx | 194 ++-------------
.../policy/_tests_/policyUtils.test.tsx | 110 +++++++++
.../src/app/permissions/policy/policyUtils.ts | 98 ++++++++
.../src/app/permissions/policy/usePolicy.ts | 227 ++++++++++++++++++
4 files changed, 460 insertions(+), 169 deletions(-)
create mode 100644 datahub-web-react/src/app/permissions/policy/_tests_/policyUtils.test.tsx
create mode 100644 datahub-web-react/src/app/permissions/policy/usePolicy.ts
diff --git a/datahub-web-react/src/app/permissions/policy/ManagePolicies.tsx b/datahub-web-react/src/app/permissions/policy/ManagePolicies.tsx
index 2f0c284fc4e8f3..72c22f3bddc2cd 100644
--- a/datahub-web-react/src/app/permissions/policy/ManagePolicies.tsx
+++ b/datahub-web-react/src/app/permissions/policy/ManagePolicies.tsx
@@ -1,5 +1,5 @@
import React, { useEffect, useMemo, useState } from 'react';
-import { Button, Empty, message, Modal, Pagination, Tag } from 'antd';
+import { Button, Empty, message, Pagination, Tag } from 'antd';
import styled from 'styled-components/macro';
import * as QueryString from 'query-string';
import { DeleteOutlined, PlusOutlined } from '@ant-design/icons';
@@ -7,26 +7,15 @@ import { useLocation } from 'react-router';
import PolicyBuilderModal from './PolicyBuilderModal';
import {
Policy,
- PolicyUpdateInput,
PolicyState,
- PolicyType,
- Maybe,
- ResourceFilterInput,
- PolicyMatchFilter,
- PolicyMatchFilterInput,
- PolicyMatchCriterionInput,
- EntityType,
} from '../../../types.generated';
import { useAppConfig } from '../../useAppConfig';
import PolicyDetailsModal from './PolicyDetailsModal';
import {
- useCreatePolicyMutation,
- useDeletePolicyMutation,
useListPoliciesQuery,
- useUpdatePolicyMutation,
} from '../../../graphql/policy.generated';
import { Message } from '../../shared/Message';
-import { EMPTY_POLICY } from './policyUtils';
+import { DEFAULT_PAGE_SIZE, EMPTY_POLICY } from './policyUtils';
import TabToolbar from '../../entity/shared/components/styled/TabToolbar';
import { StyledTable } from '../../entity/shared/components/styled/StyledTable';
import AvatarsGroup from '../AvatarsGroup';
@@ -37,6 +26,7 @@ import { scrollToTop } from '../../shared/searchUtils';
import analytics, { EventType } from '../../analytics';
import { POLICIES_CREATE_POLICY_ID, POLICIES_INTRO_ID } from '../../onboarding/config/PoliciesOnboardingConfig';
import { OnboardingTour } from '../../onboarding/OnboardingTour';
+import { usePolicy } from './usePolicy';
const SourceContainer = styled.div`
overflow: auto;
@@ -84,58 +74,6 @@ const PageContainer = styled.span`
overflow: auto;
`;
-const DEFAULT_PAGE_SIZE = 10;
-
-type PrivilegeOptionType = {
- type?: string;
- name?: Maybe;
-};
-
-const toFilterInput = (filter: PolicyMatchFilter): PolicyMatchFilterInput => {
- return {
- criteria: filter.criteria?.map((criterion): PolicyMatchCriterionInput => {
- return {
- field: criterion.field,
- values: criterion.values.map((criterionValue) => criterionValue.value),
- condition: criterion.condition,
- };
- }),
- };
-};
-
-const toPolicyInput = (policy: Omit): PolicyUpdateInput => {
- let policyInput: PolicyUpdateInput = {
- type: policy.type,
- name: policy.name,
- state: policy.state,
- description: policy.description,
- privileges: policy.privileges,
- actors: {
- users: policy.actors.users,
- groups: policy.actors.groups,
- allUsers: policy.actors.allUsers,
- allGroups: policy.actors.allGroups,
- resourceOwners: policy.actors.resourceOwners,
- resourceOwnersTypes: policy.actors.resourceOwnersTypes,
- },
- };
- if (policy.resources !== null && policy.resources !== undefined) {
- let resourceFilter: ResourceFilterInput = {
- type: policy.resources.type,
- resources: policy.resources.resources,
- allResources: policy.resources.allResources,
- };
- if (policy.resources.filter) {
- resourceFilter = { ...resourceFilter, filter: toFilterInput(policy.resources.filter) };
- }
- // Add the resource filters.
- policyInput = {
- ...policyInput,
- resources: resourceFilter,
- };
- }
- return policyInput;
-};
// TODO: Cleanup the styling.
export const ManagePolicies = () => {
@@ -163,9 +101,7 @@ export const ManagePolicies = () => {
const [focusPolicyUrn, setFocusPolicyUrn] = useState(undefined);
const [focusPolicy, setFocusPolicy] = useState>(EMPTY_POLICY);
- // Construct privileges
- const platformPrivileges = policiesConfig?.platformPrivileges || [];
- const resourcePrivileges = policiesConfig?.resourcePrivileges || [];
+
const {
loading: policiesLoading,
@@ -183,15 +119,6 @@ export const ManagePolicies = () => {
fetchPolicy: (query?.length || 0) > 0 ? 'no-cache' : 'cache-first',
});
- // Any time a policy is removed, edited, or created, refetch the list.
- const [createPolicy, { error: createPolicyError }] = useCreatePolicyMutation();
-
- const [updatePolicy, { error: updatePolicyError }] = useUpdatePolicyMutation();
-
- const [deletePolicy, { error: deletePolicyError }] = useDeletePolicyMutation();
-
- const updateError = createPolicyError || updatePolicyError || deletePolicyError;
-
const totalPolicies = policiesData?.listPolicies?.total || 0;
const policies = useMemo(() => policiesData?.listPolicies?.policies || [], [policiesData]);
@@ -212,28 +139,6 @@ export const ManagePolicies = () => {
setShowPolicyBuilderModal(false);
};
- const getPrivilegeNames = (policy: Omit) => {
- let privileges: PrivilegeOptionType[] = [];
- if (policy?.type === PolicyType.Platform) {
- privileges = platformPrivileges
- .filter((platformPrivilege) => policy.privileges.includes(platformPrivilege.type))
- .map((platformPrivilege) => {
- return { type: platformPrivilege.type, name: platformPrivilege.displayName };
- });
- } else {
- const allResourcePriviliges = resourcePrivileges.find(
- (resourcePrivilege) => resourcePrivilege.resourceType === 'all',
- );
- privileges =
- allResourcePriviliges?.privileges
- .filter((resourcePrivilege) => policy.privileges.includes(resourcePrivilege.type))
- .map((b) => {
- return { type: b.type, name: b.displayName };
- }) || [];
- }
- return privileges;
- };
-
const onViewPolicy = (policy: Policy) => {
setShowViewPolicyModal(true);
setFocusPolicyUrn(policy?.urn);
@@ -247,79 +152,30 @@ export const ManagePolicies = () => {
};
const onEditPolicy = (policy: Policy) => {
- setShowPolicyBuilderModal(true);
- setFocusPolicyUrn(policy?.urn);
- setFocusPolicy({ ...policy });
- };
-
- // On Delete Policy handler
- const onRemovePolicy = (policy: Policy) => {
- Modal.confirm({
- title: `Delete ${policy?.name}`,
- content: `Are you sure you want to remove policy?`,
- onOk() {
- deletePolicy({ variables: { urn: policy?.urn as string } }); // There must be a focus policy urn.
- analytics.event({
- type: EventType.DeleteEntityEvent,
- entityUrn: policy?.urn,
- entityType: EntityType.DatahubPolicy,
- });
- message.success('Successfully removed policy.');
- setTimeout(() => {
- policiesRefetch();
- }, 3000);
- onCancelViewPolicy();
- },
- onCancel() {},
- okText: 'Yes',
- maskClosable: true,
- closable: true,
- });
+ setShowPolicyBuilderModal(true);
+ setFocusPolicyUrn(policy?.urn);
+ setFocusPolicy({ ...policy });
};
- // On Activate and deactivate Policy handler
- const onToggleActiveDuplicate = (policy: Policy) => {
- const newState = policy?.state === PolicyState.Active ? PolicyState.Inactive : PolicyState.Active;
- const newPolicy = {
- ...policy,
- state: newState,
- };
- updatePolicy({
- variables: {
- urn: policy?.urn as string, // There must be a focus policy urn.
- input: toPolicyInput(newPolicy),
- },
- });
- message.success(`Successfully ${newState === PolicyState.Active ? 'activated' : 'deactivated'} policy.`);
- setTimeout(() => {
- policiesRefetch();
- }, 3000);
- setShowViewPolicyModal(false);
- };
-
- // On Add/Update Policy handler
- const onSavePolicy = (savePolicy: Omit<Policy, 'urn'>) => {
- if (focusPolicyUrn) {
- // If there's an URN associated with the focused policy, then we are editing an existing policy.
- updatePolicy({ variables: { urn: focusPolicyUrn, input: toPolicyInput(savePolicy) } });
- analytics.event({
- type: EventType.UpdatePolicyEvent,
- policyUrn: focusPolicyUrn,
- });
- } else {
- // If there's no URN associated with the focused policy, then we are creating.
- createPolicy({ variables: { input: toPolicyInput(savePolicy) } });
- analytics.event({
- type: EventType.CreatePolicyEvent,
- });
- }
- message.success('Successfully saved policy.');
- setTimeout(() => {
- policiesRefetch();
- }, 3000);
- onClosePolicyBuilder();
- };
+ const {
+ createPolicyError,
+ updatePolicyError,
+ deletePolicyError,
+ onSavePolicy,
+ onToggleActiveDuplicate,
+ onRemovePolicy,
+ getPrivilegeNames
+ } = usePolicy(
+ policiesConfig,
+ focusPolicyUrn,
+ policiesRefetch,
+ setShowViewPolicyModal,
+ onCancelViewPolicy,
+ onClosePolicyBuilder
+ );
+ const updateError = createPolicyError || updatePolicyError || deletePolicyError;
+
const tableColumns = [
{
title: 'Name',
diff --git a/datahub-web-react/src/app/permissions/policy/_tests_/policyUtils.test.tsx b/datahub-web-react/src/app/permissions/policy/_tests_/policyUtils.test.tsx
new file mode 100644
index 00000000000000..06d2e97255139e
--- /dev/null
+++ b/datahub-web-react/src/app/permissions/policy/_tests_/policyUtils.test.tsx
@@ -0,0 +1,110 @@
+import {
+ addOrUpdatePoliciesInList,
+ updateListPoliciesCache,
+ removeFromListPoliciesCache,
+ } from '../policyUtils';
+
+ // Mock the Apollo Client readQuery and writeQuery methods
+ const mockReadQuery = jest.fn();
+ const mockWriteQuery = jest.fn();
+
+ jest.mock('@apollo/client', () => ({
+ ...jest.requireActual('@apollo/client'),
+ useApolloClient: () => ({
+ readQuery: mockReadQuery,
+ writeQuery: mockWriteQuery,
+ }),
+ }));
+
+ describe('addOrUpdatePoliciesInList', () => {
+ it('should add a new policy to the list', () => {
+ const existingPolicies = [{ urn: 'existing-urn' }];
+ const newPolicies = { urn: 'new-urn' };
+
+ const result = addOrUpdatePoliciesInList(existingPolicies, newPolicies);
+
+ expect(result.length).toBe(existingPolicies.length + 1);
+ expect(result).toContain(newPolicies);
+ });
+
+ it('should update an existing policy in the list', () => {
+ const existingPolicies = [{ urn: 'existing-urn' }];
+ const newPolicies = { urn: 'existing-urn', updatedField: 'new-value' };
+
+ const result = addOrUpdatePoliciesInList(existingPolicies, newPolicies);
+
+ expect(result.length).toBe(existingPolicies.length);
+ expect(result).toContainEqual(newPolicies);
+ });
+ });
+
+ describe('updateListPoliciesCache', () => {
+ // Mock client.readQuery response
+ const mockReadQueryResponse = {
+ listPolicies: {
+ start: 0,
+ count: 1,
+ total: 1,
+ policies: [{ urn: 'existing-urn' }],
+ },
+ };
+
+ beforeEach(() => {
+ mockReadQuery.mockReturnValueOnce(mockReadQueryResponse);
+ });
+
+ it('should update the list policies cache with a new policy', () => {
+ const mockClient = {
+ readQuery: mockReadQuery,
+ writeQuery: mockWriteQuery,
+ };
+
+ const policiesToAdd = [{ urn: 'new-urn' }];
+ const pageSize = 10;
+
+ updateListPoliciesCache(mockClient, policiesToAdd, pageSize);
+
+ // Ensure writeQuery is called with the expected data
+ expect(mockWriteQuery).toHaveBeenCalledWith({
+ query: expect.any(Object),
+ variables: { input: { start: 0, count: pageSize, query: undefined } },
+ data: expect.any(Object),
+ });
+ });
+ });
+
+ describe('removeFromListPoliciesCache', () => {
+ // Mock client.readQuery response
+ const mockReadQueryResponse = {
+ listPolicies: {
+ start: 0,
+ count: 1,
+ total: 1,
+ policies: [{ urn: 'existing-urn' }],
+ },
+ };
+
+ beforeEach(() => {
+ mockReadQuery.mockReturnValueOnce(mockReadQueryResponse);
+ });
+
+ it('should remove a policy from the list policies cache', () => {
+ const mockClient = {
+ readQuery: mockReadQuery,
+ writeQuery: mockWriteQuery,
+ };
+
+ const urnToRemove = 'existing-urn';
+ const pageSize = 10;
+
+ removeFromListPoliciesCache(mockClient, urnToRemove, pageSize);
+
+ // Ensure writeQuery is called with the expected data
+ expect(mockWriteQuery).toHaveBeenCalledWith({
+ query: expect.any(Object),
+ variables: { input: { start: 0, count: pageSize } },
+ data: expect.any(Object),
+ });
+ });
+ });
+
\ No newline at end of file
diff --git a/datahub-web-react/src/app/permissions/policy/policyUtils.ts b/datahub-web-react/src/app/permissions/policy/policyUtils.ts
index 2f178fcdeb5c34..27aa8fcd351e9b 100644
--- a/datahub-web-react/src/app/permissions/policy/policyUtils.ts
+++ b/datahub-web-react/src/app/permissions/policy/policyUtils.ts
@@ -10,6 +10,9 @@ import {
ResourceFilter,
ResourcePrivileges,
} from '../../../types.generated';
+import { ListPoliciesDocument, ListPoliciesQuery } from '../../../graphql/policy.generated';
+
+export const DEFAULT_PAGE_SIZE = 10;
export const EMPTY_POLICY = {
type: PolicyType.Metadata,
@@ -126,3 +129,98 @@ export const setFieldValues = (
}
return { ...filter, criteria: [...restCriteria, createCriterion(resourceFieldType, fieldValues)] };
};
+
+export const addOrUpdatePoliciesInList = (existingPolicies, newPolicies) => {
+ const policies = [...existingPolicies];
+ let didUpdate = false;
+ const updatedPolicies = policies.map((policy) => {
+ if (policy.urn === newPolicies.urn) {
+ didUpdate = true;
+ return newPolicies;
+ }
+ return policy;
+ });
+ return didUpdate ? updatedPolicies : [newPolicies, ...existingPolicies];
+};
+
+/**
+ * Add an entry to the ListPolicies cache.
+ */
+export const updateListPoliciesCache = (client, policies, pageSize) => {
+ // Read the data from our cache for this query.
+ const currData: ListPoliciesQuery | null = client.readQuery({
+ query: ListPoliciesDocument,
+ variables: {
+ input: {
+ start: 0,
+ count: pageSize,
+ query: undefined,
+ },
+ },
+ });
+
+ // Add our new policy into the existing list.
+ const existingPolicies = [...(currData?.listPolicies?.policies || [])];
+ const newPolicies = addOrUpdatePoliciesInList(existingPolicies, policies);
+ const didAddTest = newPolicies.length > existingPolicies.length;
+
+ // Write our data back to the cache.
+ client.writeQuery({
+ query: ListPoliciesDocument,
+ variables: {
+ input: {
+ start: 0,
+ count: pageSize,
+ query: undefined,
+ },
+ },
+ data: {
+
+ listPolicies: {
+ __typename: 'ListPoliciesResult',
+ start: 0,
+ count: didAddTest ? (currData?.listPolicies?.count || 0) + 1 : currData?.listPolicies?.count,
+ total: didAddTest ? (currData?.listPolicies?.total || 0) + 1 : currData?.listPolicies?.total,
+ policies: newPolicies,
+ },
+ },
+ });
+};
+
+/**
+ * Remove an entry from the ListTests cache.
+ */
+export const removeFromListPoliciesCache = (client, urn, pageSize) => {
+ // Read the data from our cache for this query.
+ const currData: ListPoliciesQuery | null = client.readQuery({
+ query: ListPoliciesDocument,
+ variables: {
+ input: {
+ start: 0,
+ count: pageSize,
+ },
+ },
+ });
+
+ // Remove the policy from the existing tests set.
+ const newPolicies = [...(currData?.listPolicies?.policies || []).filter((policy) => policy.urn !== urn)];
+
+ // Write our data back to the cache.
+ client.writeQuery({
+ query: ListPoliciesDocument,
+ variables: {
+ input: {
+ start: 0,
+ count: pageSize,
+ },
+ },
+ data: {
+ listPolicies: {
+ start: currData?.listPolicies?.start || 0,
+ count: (currData?.listPolicies?.count || 1) - 1,
+ total: (currData?.listPolicies?.total || 1) - 1,
+ policies: newPolicies,
+ },
+ },
+ });
+};
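
The two cache helpers above are deliberately untyped: they only need an Apollo client plus the page size used by the list query. A minimal sketch of how a caller could wire them together follows; `useApolloClient`, `DEFAULT_PAGE_SIZE`, and the two helper signatures come from this patch, while the import path and the `PolicyLike` shape are illustrative assumptions.

```ts
import { useApolloClient } from '@apollo/client';
import {
    DEFAULT_PAGE_SIZE,
    removeFromListPoliciesCache,
    updateListPoliciesCache,
} from './policyUtils'; // path assumes the caller lives next to policyUtils.ts

// Illustrative shape only: the helpers match cached entries by `urn`.
type PolicyLike = { urn: string; [key: string]: unknown };

export function usePolicyListCache() {
    const client = useApolloClient();
    return {
        // Insert a new policy, or overwrite the cached copy that shares its urn.
        upsert: (policy: PolicyLike) => updateListPoliciesCache(client, policy, DEFAULT_PAGE_SIZE),
        // Drop a deleted policy from the cached first page.
        remove: (urn: string) => removeFromListPoliciesCache(client, urn, DEFAULT_PAGE_SIZE),
    };
}
```

The new `usePolicy` hook in the next file does exactly this after each mutation resolves, so the policy list reflects the change immediately instead of waiting for the delayed `policiesRefetch`.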
diff --git a/datahub-web-react/src/app/permissions/policy/usePolicy.ts b/datahub-web-react/src/app/permissions/policy/usePolicy.ts
new file mode 100644
index 00000000000000..6f359805e42db1
--- /dev/null
+++ b/datahub-web-react/src/app/permissions/policy/usePolicy.ts
@@ -0,0 +1,227 @@
+import { Modal, message } from 'antd';
+import { useApolloClient } from '@apollo/client';
+import {
+ EntityType,
+ Policy,
+ PolicyMatchCriterionInput,
+ PolicyMatchFilter,
+ PolicyMatchFilterInput,
+ PolicyState,
+ PolicyType,
+ Maybe,
+ PolicyUpdateInput,
+ ResourceFilterInput,
+} from '../../../types.generated';
+import { useCreatePolicyMutation, useDeletePolicyMutation, useUpdatePolicyMutation } from '../../../graphql/policy.generated';
+import analytics, { EventType } from '../../analytics';
+import { DEFAULT_PAGE_SIZE, removeFromListPoliciesCache, updateListPoliciesCache } from './policyUtils';
+
+
+type PrivilegeOptionType = {
+ type?: string;
+ name?: Maybe<string>;
+};
+
+export function usePolicy(
+ policiesConfig,
+ focusPolicyUrn,
+ policiesRefetch,
+ setShowViewPolicyModal,
+ onCancelViewPolicy,
+ onClosePolicyBuilder
+){
+
+ const client = useApolloClient();
+
+ // Construct privileges
+ const platformPrivileges = policiesConfig?.platformPrivileges || [];
+ const resourcePrivileges = policiesConfig?.resourcePrivileges || [];
+
+ // Any time a policy is removed, edited, or created, refetch the list.
+ const [createPolicy, { error: createPolicyError }] = useCreatePolicyMutation();
+
+ const [updatePolicy, { error: updatePolicyError }] = useUpdatePolicyMutation();
+
+ const [deletePolicy, { error: deletePolicyError }] = useDeletePolicyMutation();
+
+ const toFilterInput = (filter: PolicyMatchFilter): PolicyMatchFilterInput => {
+ return {
+ criteria: filter.criteria?.map((criterion): PolicyMatchCriterionInput => {
+ return {
+ field: criterion.field,
+ values: criterion.values.map((criterionValue) => criterionValue.value),
+ condition: criterion.condition,
+ };
+ }),
+ };
+ };
+
+ const toPolicyInput = (policy: Omit<Policy, 'urn'>): PolicyUpdateInput => {
+ let policyInput: PolicyUpdateInput = {
+ type: policy.type,
+ name: policy.name,
+ state: policy.state,
+ description: policy.description,
+ privileges: policy.privileges,
+ actors: {
+ users: policy.actors.users,
+ groups: policy.actors.groups,
+ allUsers: policy.actors.allUsers,
+ allGroups: policy.actors.allGroups,
+ resourceOwners: policy.actors.resourceOwners,
+ resourceOwnersTypes: policy.actors.resourceOwnersTypes,
+ },
+ };
+ if (policy.resources !== null && policy.resources !== undefined) {
+ let resourceFilter: ResourceFilterInput = {
+ type: policy.resources.type,
+ resources: policy.resources.resources,
+ allResources: policy.resources.allResources,
+ };
+ if (policy.resources.filter) {
+ resourceFilter = { ...resourceFilter, filter: toFilterInput(policy.resources.filter) };
+ }
+ // Add the resource filters.
+ policyInput = {
+ ...policyInput,
+ resources: resourceFilter,
+ };
+ }
+ return policyInput;
+ };
+
+ const getPrivilegeNames = (policy: Omit<Policy, 'urn'>) => {
+ let privileges: PrivilegeOptionType[] = [];
+ if (policy?.type === PolicyType.Platform) {
+ privileges = platformPrivileges
+ .filter((platformPrivilege) => policy.privileges.includes(platformPrivilege.type))
+ .map((platformPrivilege) => {
+ return { type: platformPrivilege.type, name: platformPrivilege.displayName };
+ });
+ } else {
+ const allResourcePriviliges = resourcePrivileges.find(
+ (resourcePrivilege) => resourcePrivilege.resourceType === 'all',
+ );
+ privileges =
+ allResourcePriviliges?.privileges
+ .filter((resourcePrivilege) => policy.privileges.includes(resourcePrivilege.type))
+ .map((b) => {
+ return { type: b.type, name: b.displayName };
+ }) || [];
+ }
+ return privileges;
+ };
+
+ // On Delete Policy handler
+ const onRemovePolicy = (policy: Policy) => {
+ Modal.confirm({
+ title: `Delete ${policy?.name}`,
+ content: `Are you sure you want to remove this policy?`,
+ onOk() {
+ deletePolicy({ variables: { urn: policy?.urn as string } })
+ .then(()=>{
+ // There must be a focus policy urn.
+ analytics.event({
+ type: EventType.DeleteEntityEvent,
+ entityUrn: policy?.urn,
+ entityType: EntityType.DatahubPolicy,
+ });
+ message.success('Successfully removed policy.');
+ removeFromListPoliciesCache(client,policy?.urn, DEFAULT_PAGE_SIZE);
+ setTimeout(() => {
+ policiesRefetch();
+ }, 3000);
+ onCancelViewPolicy();
+ })
+ },
+ onCancel() {},
+ okText: 'Yes',
+ maskClosable: true,
+ closable: true,
+ });
+ };
+
+ // On Activate and deactivate Policy handler
+ const onToggleActiveDuplicate = (policy: Policy) => {
+ const newState = policy?.state === PolicyState.Active ? PolicyState.Inactive : PolicyState.Active;
+ const newPolicy = {
+ ...policy,
+ state: newState,
+ };
+ updatePolicy({
+ variables: {
+ urn: policy?.urn as string, // There must be a focus policy urn.
+ input: toPolicyInput(newPolicy),
+ },
+ }).then(()=>{
+ const updatePolicies= {
+ ...newPolicy,
+ __typename: 'ListPoliciesResult',
+ }
+ updateListPoliciesCache(client,updatePolicies,DEFAULT_PAGE_SIZE);
+ message.success(`Successfully ${newState === PolicyState.Active ? 'activated' : 'deactivated'} policy.`);
+ setTimeout(() => {
+ policiesRefetch();
+ }, 3000);
+ })
+
+ setShowViewPolicyModal(false);
+ };
+
+ // On Add/Update Policy handler
+ const onSavePolicy = (savePolicy: Omit<Policy, 'urn'>) => {
+ if (focusPolicyUrn) {
+ // If there's an URN associated with the focused policy, then we are editing an existing policy.
+ updatePolicy({ variables: { urn: focusPolicyUrn, input: toPolicyInput(savePolicy) } })
+ .then(()=>{
+ const newPolicy = {
+ __typename: 'ListPoliciesResult',
+ urn: focusPolicyUrn,
+ ...savePolicy,
+ };
+ analytics.event({
+ type: EventType.UpdatePolicyEvent,
+ policyUrn: focusPolicyUrn,
+ });
+ message.success('Successfully saved policy.');
+ updateListPoliciesCache(client,newPolicy,DEFAULT_PAGE_SIZE);
+ setTimeout(() => {
+ policiesRefetch();
+ }, 1000);
+ onClosePolicyBuilder();
+ })
+ } else {
+ // If there's no URN associated with the focused policy, then we are creating.
+ createPolicy({ variables: { input: toPolicyInput(savePolicy) } })
+ .then((result)=>{
+ const newPolicy = {
+ __typename: 'ListPoliciesResult',
+ urn: result?.data?.createPolicy,
+ ...savePolicy,
+ type: null,
+ actors: null,
+ resources: null,
+ };
+ analytics.event({
+ type: EventType.CreatePolicyEvent,
+ });
+ message.success('Successfully saved policy.');
+ setTimeout(() => {
+ policiesRefetch();
+ }, 1000);
+ updateListPoliciesCache(client,newPolicy,DEFAULT_PAGE_SIZE);
+ onClosePolicyBuilder();
+ })
+ }
+ };
+
+ return{
+ createPolicyError,
+ updatePolicyError,
+ deletePolicyError,
+ onSavePolicy,
+ onToggleActiveDuplicate,
+ onRemovePolicy,
+ getPrivilegeNames,
+ }
+}
\ No newline at end of file
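
For reference, a rough shape of what `usePolicy` hands back to `ManagePolicies`, inferred from the return statement above. The hook itself is untyped, so the `ApolloError` annotations are an assumption based on what Apollo's mutation hooks normally expose; this interface is not part of the patch.

```ts
import type { ApolloError } from '@apollo/client';
import { Policy } from '../../../types.generated';

// Sketch of the hook's return contract.
interface UsePolicyResult {
    createPolicyError?: ApolloError;
    updatePolicyError?: ApolloError;
    deletePolicyError?: ApolloError;
    onSavePolicy: (policy: Omit<Policy, 'urn'>) => void;
    onToggleActiveDuplicate: (policy: Policy) => void;
    onRemovePolicy: (policy: Policy) => void;
    getPrivilegeNames: (policy: Omit<Policy, 'urn'>) => { type?: string; name?: string | null }[];
}
```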
From b87f9774ae646180675023196871f5965a5d97c3 Mon Sep 17 00:00:00 2001
From: Sumit Patil <91715217+sumitappt@users.noreply.github.com>
Date: Thu, 14 Dec 2023 06:41:30 +0530
Subject: [PATCH 019/540] =?UTF-8?q?refactor=20|=20PRD-785=20|=20datahub=20?=
=?UTF-8?q?oss:=20migrate=20use=20of=20useGetAuthenticatedU=E2=80=A6=20(#9?=
=?UTF-8?q?456)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Co-authored-by: John Joyce
---
datahub-web-react/src/app/AdminConsole.tsx | 8 ++++----
datahub-web-react/src/app/embed/EmbeddedPage.tsx | 6 +++---
2 files changed, 7 insertions(+), 7 deletions(-)
diff --git a/datahub-web-react/src/app/AdminConsole.tsx b/datahub-web-react/src/app/AdminConsole.tsx
index 8b14ca35763d10..f6395a3bd3cb8a 100644
--- a/datahub-web-react/src/app/AdminConsole.tsx
+++ b/datahub-web-react/src/app/AdminConsole.tsx
@@ -4,9 +4,9 @@ import { Menu } from 'antd';
import styled from 'styled-components';
import { BankOutlined, BarChartOutlined, MenuOutlined } from '@ant-design/icons';
import Sider from 'antd/lib/layout/Sider';
-import { useGetAuthenticatedUser } from './useGetAuthenticatedUser';
import { useAppConfig } from './useAppConfig';
import { ANTD_GRAY } from './entity/shared/constants';
+import { useUserContext } from './context/useUserContext';
const ToggleContainer = styled.div`
background-color: ${ANTD_GRAY[4]};
@@ -32,7 +32,7 @@ const ControlSlideOut = styled(Sider)`
* Container for all views behind an authentication wall.
*/
export const AdminConsole = (): JSX.Element => {
- const me = useGetAuthenticatedUser();
+ const me = useUserContext();
const [adminConsoleOpen, setAdminConsoleOpen] = useState(false);
const { config } = useAppConfig();
@@ -40,8 +40,8 @@ export const AdminConsole = (): JSX.Element => {
const isAnalyticsEnabled = config?.analyticsConfig.enabled;
const isPoliciesEnabled = config?.policiesConfig.enabled;
- const showAnalytics = (isAnalyticsEnabled && me && me.platformPrivileges.viewAnalytics) || false;
- const showPolicyBuilder = (isPoliciesEnabled && me && me.platformPrivileges.managePolicies) || false;
+ const showAnalytics = (isAnalyticsEnabled && me && me?.platformPrivileges?.viewAnalytics) || false;
+ const showPolicyBuilder = (isPoliciesEnabled && me && me?.platformPrivileges?.managePolicies) || false;
const showAdminConsole = showAnalytics || showPolicyBuilder;
const onMenuItemClick = () => {
diff --git a/datahub-web-react/src/app/embed/EmbeddedPage.tsx b/datahub-web-react/src/app/embed/EmbeddedPage.tsx
index 429f83f34af6e8..603a72675c4337 100644
--- a/datahub-web-react/src/app/embed/EmbeddedPage.tsx
+++ b/datahub-web-react/src/app/embed/EmbeddedPage.tsx
@@ -8,9 +8,9 @@ import { VIEW_ENTITY_PAGE } from '../entity/shared/constants';
import { decodeUrn } from '../entity/shared/utils';
import CompactContext from '../shared/CompactContext';
import { useEntityRegistry } from '../useEntityRegistry';
-import { useGetAuthenticatedUserUrn } from '../useGetAuthenticatedUser';
import analytics from '../analytics/analytics';
import { EventType } from '../analytics';
+import { useUserContext } from '../context/useUserContext';
const EmbeddedPageWrapper = styled.div`
max-height: 100%;
@@ -39,11 +39,11 @@ export default function EmbeddedPage({ entityType }: Props) {
});
}, [entityType, urn]);
- const authenticatedUserUrn = useGetAuthenticatedUserUrn();
+ const { urn : authenticatedUserUrn } = useUserContext();
const { data } = useGetGrantedPrivilegesQuery({
variables: {
input: {
- actorUrn: authenticatedUserUrn,
+ actorUrn: authenticatedUserUrn as string,
resourceSpec: { resourceType: entityType, resourceUrn: urn },
},
},
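
Both call sites now read the logged-in user from React context instead of issuing a separate query. A condensed sketch of the two access patterns the patch switches to; the context's full shape beyond `urn` and `platformPrivileges` is not visible here, so optional chaining is kept as the safe default.

```ts
import { useUserContext } from './context/useUserContext'; // path relative to src/app, as in AdminConsole

export function useShowAnalytics(isAnalyticsEnabled?: boolean): boolean {
    const me = useUserContext();
    return (isAnalyticsEnabled && me?.platformPrivileges?.viewAnalytics) || false;
}

export function useActorUrn(): string | undefined {
    const { urn } = useUserContext();
    return urn; // EmbeddedPage casts this to string where the API requires a non-null actorUrn
}
```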
From ff0570edacdd967d8fef23ac3333ccc93e50e406 Mon Sep 17 00:00:00 2001
From: John Joyce
Date: Wed, 13 Dec 2023 17:12:48 -0800
Subject: [PATCH 020/540] refactor(ui): Minor improvements & refactoring
(#9420)
---
.../search/EmbeddedListSearchResults.tsx | 6 +-
.../src/app/lineage/LineageLoadingSection.tsx | 5 +-
datahub-web-react/src/graphql/domain.graphql | 4 +-
datahub-web-react/src/graphql/lineage.graphql | 167 ++++++++++++------
datahub-web-react/src/graphql/query.graphql | 10 ++
.../com/linkedin/query/QueryProperties.pdl | 7 +-
6 files changed, 139 insertions(+), 60 deletions(-)
diff --git a/datahub-web-react/src/app/entity/shared/components/styled/search/EmbeddedListSearchResults.tsx b/datahub-web-react/src/app/entity/shared/components/styled/search/EmbeddedListSearchResults.tsx
index 1daf2a4c59b70f..80fc2aa223fdf5 100644
--- a/datahub-web-react/src/app/entity/shared/components/styled/search/EmbeddedListSearchResults.tsx
+++ b/datahub-web-react/src/app/entity/shared/components/styled/search/EmbeddedListSearchResults.tsx
@@ -1,5 +1,5 @@
import React from 'react';
-import { Pagination, Typography } from 'antd';
+import { Pagination, Spin, Typography } from 'antd';
import { LoadingOutlined } from '@ant-design/icons';
import styled from 'styled-components';
import { FacetFilterInput, FacetMetadata, SearchResults as SearchResultType } from '../../../../../../types.generated';
@@ -61,7 +61,7 @@ const LoadingContainer = styled.div`
`;
const StyledLoading = styled(LoadingOutlined)`
- font-size: 36px;
+ font-size: 32px;
color: ${ANTD_GRAY[7]};
padding-bottom: 18px;
]`;
@@ -128,7 +128,7 @@ export const EmbeddedListSearchResults = ({
{loading && (
- <StyledLoading />
+ <Spin indicator={<StyledLoading />} />
)}
{!loading && (
diff --git a/datahub-web-react/src/app/lineage/LineageLoadingSection.tsx b/datahub-web-react/src/app/lineage/LineageLoadingSection.tsx
index 9d84de0c211729..3b7f0e48ecdf4c 100644
--- a/datahub-web-react/src/app/lineage/LineageLoadingSection.tsx
+++ b/datahub-web-react/src/app/lineage/LineageLoadingSection.tsx
@@ -1,5 +1,6 @@
import * as React from 'react';
import styled from 'styled-components';
+import { Spin } from 'antd';
import { LoadingOutlined } from '@ant-design/icons';
import { ANTD_GRAY } from '../entity/shared/constants';
@@ -13,7 +14,7 @@ const Container = styled.div`
`;
const StyledLoading = styled(LoadingOutlined)`
- font-size: 36px;
+ font-size: 32px;
color: ${ANTD_GRAY[7]};
padding-bottom: 18px;
]`;
@@ -21,7 +22,7 @@ const StyledLoading = styled(LoadingOutlined)`
export default function LineageLoadingSection() {
return (
- <StyledLoading />
+ <Spin indicator={<StyledLoading />} />
);
}
diff --git a/datahub-web-react/src/graphql/domain.graphql b/datahub-web-react/src/graphql/domain.graphql
index 951b93fcba9af1..170a5b5df476ba 100644
--- a/datahub-web-react/src/graphql/domain.graphql
+++ b/datahub-web-react/src/graphql/domain.graphql
@@ -27,9 +27,7 @@ query getDomain($urn: String!) {
}
}
}
- children: relationships(input: { types: ["IsPartOf"], direction: INCOMING, start: 0, count: 0 }) {
- total
- }
+ ...domainEntitiesFields
}
}
diff --git a/datahub-web-react/src/graphql/lineage.graphql b/datahub-web-react/src/graphql/lineage.graphql
index dc511ca411e8db..4e9b8aacfcfa15 100644
--- a/datahub-web-react/src/graphql/lineage.graphql
+++ b/datahub-web-react/src/graphql/lineage.graphql
@@ -164,6 +164,9 @@ fragment lineageNodeProperties on EntityWithRelationships {
domain {
...entityDomain
}
+ parentContainers {
+ ...parentContainersFields
+ }
...entityDataProduct
status {
removed
@@ -188,6 +191,9 @@ fragment lineageNodeProperties on EntityWithRelationships {
ownership {
...ownershipFields
}
+ parentContainers {
+ ...parentContainersFields
+ }
subTypes {
typeNames
}
@@ -361,6 +367,60 @@ fragment partialLineageResults on EntityLineageResult {
filtered
}
+fragment entityLineage on Entity {
+ urn
+ type
+ ...lineageNodeProperties
+ ...canEditLineageFragment
+ ... on Dataset {
+ schemaMetadata(version: 0) @include(if: $showColumns) {
+ ...schemaMetadataFields
+ }
+ siblings {
+ isPrimary
+ siblings {
+ urn
+ type
+ ... on Dataset {
+ exists
+ }
+ ...lineageNodeProperties
+ }
+ }
+ }
+ ... on Chart {
+ inputFields @include(if: $showColumns) {
+ ...inputFieldsFields
+ }
+ }
+ ... on EntityWithRelationships {
+ upstream: lineage(
+ input: {
+ direction: UPSTREAM
+ start: 0
+ count: 100
+ separateSiblings: $separateSiblings
+ startTimeMillis: $startTimeMillis
+ endTimeMillis: $endTimeMillis
+ }
+ ) @skip(if: $excludeUpstream) {
+ ...fullLineageResults
+ }
+ downstream: lineage(
+ input: {
+ direction: DOWNSTREAM
+ start: 0
+ count: 100
+ separateSiblings: $separateSiblings
+ startTimeMillis: $startTimeMillis
+ endTimeMillis: $endTimeMillis
+ }
+ ) @skip(if: $excludeDownstream) {
+ ...fullLineageResults
+ }
+ }
+}
+
query getEntityLineage(
$urn: String!
$separateSiblings: Boolean
@@ -371,57 +431,21 @@ query getEntityLineage(
$excludeDownstream: Boolean = false
) {
entity(urn: $urn) {
- urn
- type
- ...lineageNodeProperties
- ...canEditLineageFragment
- ... on Dataset {
- schemaMetadata(version: 0) @include(if: $showColumns) {
- ...schemaMetadataFields
- }
- siblings {
- isPrimary
- siblings {
- urn
- type
- ... on Dataset {
- exists
- }
- ...lineageNodeProperties
- }
- }
- }
- ... on Chart {
- inputFields @include(if: $showColumns) {
- ...inputFieldsFields
- }
- }
- ... on EntityWithRelationships {
- upstream: lineage(
- input: {
- direction: UPSTREAM
- start: 0
- count: 100
- separateSiblings: $separateSiblings
- startTimeMillis: $startTimeMillis
- endTimeMillis: $endTimeMillis
- }
- ) @skip(if: $excludeUpstream) {
- ...fullLineageResults
- }
- downstream: lineage(
- input: {
- direction: DOWNSTREAM
- start: 0
- count: 100
- separateSiblings: $separateSiblings
- startTimeMillis: $startTimeMillis
- endTimeMillis: $endTimeMillis
- }
- ) @skip(if: $excludeDownstream) {
- ...fullLineageResults
- }
- }
+ ...entityLineage
+ }
+}
+
+query getBulkEntityLineage(
+ $urns: [String!]!,
+ $separateSiblings: Boolean
+ $showColumns: Boolean!
+ $startTimeMillis: Long
+ $endTimeMillis: Long
+ $excludeUpstream: Boolean = false
+ $excludeDownstream: Boolean = false
+) {
+ entities(urns: $urns) {
+ ...entityLineage
}
}
@@ -489,3 +513,44 @@ query getLineageCounts(
}
}
}
+
+query getSearchAcrossLineageCounts(
+ $urn: String!
+ $excludeUpstream: Boolean = false
+ $excludeDownstream: Boolean = false
+) {
+ upstreams: searchAcrossLineage(
+ input: {
+ urn: $urn
+ query: "*"
+ start: 0
+ count: 10000
+ filters: [{ field: "degree", value: "1", values: ["1"] }]
+ direction: UPSTREAM
+ }
+ ) @skip(if: $excludeUpstream) {
+ start
+ count
+ total
+ facets {
+ ...facetFields
+ }
+ }
+ downstreams: searchAcrossLineage(
+ input: {
+ urn: $urn
+ query: "*"
+ start: 0
+ count: 10000
+ filters: [{ field: "degree", value: "1", values: ["1"] }]
+ direction: DOWNSTREAM
+ }
+ ) @skip(if: $excludeDownstream) {
+ start
+ count
+ total
+ facets {
+ ...facetFields
+ }
+ }
+}
\ No newline at end of file
diff --git a/datahub-web-react/src/graphql/query.graphql b/datahub-web-react/src/graphql/query.graphql
index 84908b24f9ae7f..e24c12a4448b11 100644
--- a/datahub-web-react/src/graphql/query.graphql
+++ b/datahub-web-react/src/graphql/query.graphql
@@ -1,3 +1,13 @@
+query getQuery($urn: String!) {
+ entity(urn: $urn) {
+ urn
+ type
+ ... on QueryEntity {
+ ...query
+ }
+ }
+}
+
fragment query on QueryEntity {
urn
properties {
diff --git a/metadata-models/src/main/pegasus/com/linkedin/query/QueryProperties.pdl b/metadata-models/src/main/pegasus/com/linkedin/query/QueryProperties.pdl
index 3ba19d348913bf..9587775dbed3a8 100644
--- a/metadata-models/src/main/pegasus/com/linkedin/query/QueryProperties.pdl
+++ b/metadata-models/src/main/pegasus/com/linkedin/query/QueryProperties.pdl
@@ -1,6 +1,7 @@
namespace com.linkedin.query
import com.linkedin.common.AuditStamp
+import com.linkedin.common.Urn
/**
* Information about a Query against one or more data assets (e.g. Tables or Views).
@@ -22,7 +23,11 @@ record QueryProperties {
/**
* The query was entered manually by a user (via the UI).
*/
- MANUAL
+ MANUAL,
+ /**
+ * The query was discovered by a crawler.
+ */
+ SYSTEM
}
/**
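
The lineage refactor above folds the per-entity selection into a shared `entityLineage` fragment and adds a bulk variant, `getBulkEntityLineage`, that accepts a list of urns. Assuming the same graphql-codegen conventions used elsewhere in this codebase (e.g. `useGetGrantedPrivilegesQuery`), the generated hook could be consumed roughly as below; the hook name and import path are assumptions and should be checked against `lineage.generated`.

```ts
import { useGetBulkEntityLineageQuery } from '../../graphql/lineage.generated'; // assumed generated name/path

export function useLineageForUrns(urns: string[]) {
    return useGetBulkEntityLineageQuery({
        variables: {
            urns,
            showColumns: false, // $urns and $showColumns are the only required variables
            separateSiblings: true,
        },
        fetchPolicy: 'cache-first',
    });
}
```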
From 70e64e80786a2112b3c77d790d9634ee17dd1d34 Mon Sep 17 00:00:00 2001
From: Seokyun Ha
Date: Thu, 14 Dec 2023 18:02:37 +0900
Subject: [PATCH 021/540] feat(ingest): add ingest `--no-progress` option
(#9300)
---
docs/cli.md | 1 +
metadata-ingestion/src/datahub/cli/ingest_cli.py | 10 ++++++++++
.../src/datahub/ingestion/run/pipeline.py | 6 +++++-
3 files changed, 16 insertions(+), 1 deletion(-)
diff --git a/docs/cli.md b/docs/cli.md
index 8845ed5a6dac78..cb5077db429061 100644
--- a/docs/cli.md
+++ b/docs/cli.md
@@ -98,6 +98,7 @@ Command Options:
--preview-workunits The number of workunits to produce for preview
--strict-warnings If enabled, ingestion runs with warnings will yield a non-zero error code
--test-source-connection When set, ingestion will only test the source connection details from the recipe
+ --no-progress If enabled, mute intermediate ingestion progress reports
```
#### ingest --dry-run
diff --git a/metadata-ingestion/src/datahub/cli/ingest_cli.py b/metadata-ingestion/src/datahub/cli/ingest_cli.py
index b7827ec9f050b4..569a836f3ef5c2 100644
--- a/metadata-ingestion/src/datahub/cli/ingest_cli.py
+++ b/metadata-ingestion/src/datahub/cli/ingest_cli.py
@@ -97,6 +97,13 @@ def ingest() -> None:
@click.option(
"--no-spinner", type=bool, is_flag=True, default=False, help="Turn off spinner"
)
+@click.option(
+ "--no-progress",
+ type=bool,
+ is_flag=True,
+ default=False,
+ help="If enabled, mute intermediate progress ingestion reports",
+)
@telemetry.with_telemetry(
capture_kwargs=[
"dry_run",
@@ -105,6 +112,7 @@ def ingest() -> None:
"test_source_connection",
"no_default_report",
"no_spinner",
+ "no_progress",
]
)
def run(
@@ -117,6 +125,7 @@ def run(
report_to: str,
no_default_report: bool,
no_spinner: bool,
+ no_progress: bool,
) -> None:
"""Ingest metadata into DataHub."""
@@ -170,6 +179,7 @@ async def run_ingestion_and_check_upgrade() -> int:
preview_workunits,
report_to,
no_default_report,
+ no_progress,
raw_pipeline_config,
)
diff --git a/metadata-ingestion/src/datahub/ingestion/run/pipeline.py b/metadata-ingestion/src/datahub/ingestion/run/pipeline.py
index f2735c24ca19dc..25e17d692109a5 100644
--- a/metadata-ingestion/src/datahub/ingestion/run/pipeline.py
+++ b/metadata-ingestion/src/datahub/ingestion/run/pipeline.py
@@ -173,6 +173,7 @@ def __init__(
preview_workunits: int = 10,
report_to: Optional[str] = None,
no_default_report: bool = False,
+ no_progress: bool = False,
):
self.config = config
self.dry_run = dry_run
@@ -180,6 +181,7 @@ def __init__(
self.preview_workunits = preview_workunits
self.report_to = report_to
self.reporters: List[PipelineRunListener] = []
+ self.no_progress = no_progress
self.num_intermediate_workunits = 0
self.last_time_printed = int(time.time())
self.cli_report = CliReport()
@@ -330,6 +332,7 @@ def create(
preview_workunits: int = 10,
report_to: Optional[str] = "datahub",
no_default_report: bool = False,
+ no_progress: bool = False,
raw_config: Optional[dict] = None,
) -> "Pipeline":
config = PipelineConfig.from_dict(config_dict, raw_config)
@@ -340,6 +343,7 @@ def create(
preview_workunits=preview_workunits,
report_to=report_to,
no_default_report=no_default_report,
+ no_progress=no_progress,
)
def _time_to_print(self) -> bool:
@@ -379,7 +383,7 @@ def run(self) -> None:
self.preview_workunits if self.preview_mode else None,
):
try:
- if self._time_to_print():
+ if self._time_to_print() and not self.no_progress:
self.pretty_print_summary(currently_running=True)
except Exception as e:
logger.warning(f"Failed to print summary {e}")
From b0de1dc0ce7a2de221a27f12dfecea9924380ab2 Mon Sep 17 00:00:00 2001
From: Aseem Bansal
Date: Thu, 14 Dec 2023 18:41:50 +0530
Subject: [PATCH 022/540] fix(powerbi): add access token refresh (#9405)
Co-authored-by: elish7lapid
Co-authored-by: treff7es
---
.../ingestion/source/powerbi/config.py | 1 +
.../powerbi/rest_api_wrapper/data_resolver.py | 15 +-
.../tests/integration/powerbi/test_powerbi.py | 235 +++++++++++++++---
3 files changed, 212 insertions(+), 39 deletions(-)
diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py
index f71afac737ca61..70786efff79a4d 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py
@@ -95,6 +95,7 @@ class Constant:
TITLE = "title"
EMBED_URL = "embedUrl"
ACCESS_TOKEN = "access_token"
+ ACCESS_TOKEN_EXPIRY = "expires_in"
IS_READ_ONLY = "isReadOnly"
WEB_URL = "webUrl"
ODATA_COUNT = "@odata.count"
diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py
index c6314c212d104d..3aeffa60bc28e0 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py
@@ -1,6 +1,7 @@
import logging
import math
from abc import ABC, abstractmethod
+from datetime import datetime, timedelta
from time import sleep
from typing import Any, Dict, List, Optional
@@ -59,6 +60,7 @@ def __init__(
tenant_id: str,
):
self.__access_token: Optional[str] = None
+ self.__access_token_expiry_time: Optional[datetime] = None
self.__tenant_id = tenant_id
# Test connection by generating access token
logger.info("Trying to connect to {}".format(self._get_authority_url()))
@@ -128,7 +130,7 @@ def get_authorization_header(self):
return {Constant.Authorization: self.get_access_token()}
def get_access_token(self):
- if self.__access_token is not None:
+ if self.__access_token is not None and not self._is_access_token_expired():
return self.__access_token
logger.info("Generating PowerBi access token")
@@ -150,11 +152,22 @@ def get_access_token(self):
self.__access_token = "Bearer {}".format(
auth_response.get(Constant.ACCESS_TOKEN)
)
+ safety_gap = 300
+ self.__access_token_expiry_time = datetime.now() + timedelta(
+ seconds=(
+ max(auth_response.get(Constant.ACCESS_TOKEN_EXPIRY, 0) - safety_gap, 0)
+ )
+ )
logger.debug(f"{Constant.PBIAccessToken}={self.__access_token}")
return self.__access_token
+ def _is_access_token_expired(self) -> bool:
+ if not self.__access_token_expiry_time:
+ return True
+ return self.__access_token_expiry_time < datetime.now()
+
def get_dashboards(self, workspace: Workspace) -> List[Dashboard]:
"""
Get the list of dashboard from PowerBi for the given workspace identifier
diff --git a/metadata-ingestion/tests/integration/powerbi/test_powerbi.py b/metadata-ingestion/tests/integration/powerbi/test_powerbi.py
index c9b0ded4337491..b2cbccf983eb0c 100644
--- a/metadata-ingestion/tests/integration/powerbi/test_powerbi.py
+++ b/metadata-ingestion/tests/integration/powerbi/test_powerbi.py
@@ -1,8 +1,10 @@
+import datetime
import logging
import re
import sys
from typing import Any, Dict, List, cast
from unittest import mock
+from unittest.mock import MagicMock
import pytest
from freezegun import freeze_time
@@ -31,13 +33,23 @@ def enable_logging():
logging.getLogger().setLevel(logging.DEBUG)
-def mock_msal_cca(*args, **kwargs):
- class MsalClient:
- def acquire_token_for_client(self, *args, **kwargs):
- return {
- "access_token": "dummy",
- }
+class MsalClient:
+ call_num = 0
+ token: Dict[str, Any] = {
+ "access_token": "dummy",
+ }
+
+ @staticmethod
+ def acquire_token_for_client(*args, **kwargs):
+ MsalClient.call_num += 1
+ return MsalClient.token
+
+ @staticmethod
+ def reset():
+ MsalClient.call_num = 0
+
+def mock_msal_cca(*args, **kwargs):
return MsalClient()
@@ -627,7 +639,13 @@ def default_source_config():
@freeze_time(FROZEN_TIME)
@mock.patch("msal.ConfidentialClientApplication", side_effect=mock_msal_cca)
@pytest.mark.integration
-def test_powerbi_ingest(mock_msal, pytestconfig, tmp_path, mock_time, requests_mock):
+def test_powerbi_ingest(
+ mock_msal: MagicMock,
+ pytestconfig: pytest.Config,
+ tmp_path: str,
+ mock_time: datetime.datetime,
+ requests_mock: Any,
+) -> None:
enable_logging()
test_resources_dir = pytestconfig.rootpath / "tests/integration/powerbi"
@@ -658,7 +676,7 @@ def test_powerbi_ingest(mock_msal, pytestconfig, tmp_path, mock_time, requests_m
mce_helpers.check_golden_file(
pytestconfig,
- output_path=tmp_path / "powerbi_mces.json",
+ output_path=f"{tmp_path}/powerbi_mces.json",
golden_path=f"{test_resources_dir}/{golden_file}",
)
@@ -667,8 +685,12 @@ def test_powerbi_ingest(mock_msal, pytestconfig, tmp_path, mock_time, requests_m
@mock.patch("msal.ConfidentialClientApplication", side_effect=mock_msal_cca)
@pytest.mark.integration
def test_powerbi_platform_instance_ingest(
- mock_msal, pytestconfig, tmp_path, mock_time, requests_mock
-):
+ mock_msal: MagicMock,
+ pytestconfig: pytest.Config,
+ tmp_path: str,
+ mock_time: datetime.datetime,
+ requests_mock: Any,
+) -> None:
enable_logging()
test_resources_dir = pytestconfig.rootpath / "tests/integration/powerbi"
@@ -711,8 +733,12 @@ def test_powerbi_platform_instance_ingest(
@mock.patch("msal.ConfidentialClientApplication", side_effect=mock_msal_cca)
@pytest.mark.integration
def test_powerbi_ingest_urn_lower_case(
- mock_msal, pytestconfig, tmp_path, mock_time, requests_mock
-):
+ mock_msal: MagicMock,
+ pytestconfig: pytest.Config,
+ tmp_path: str,
+ mock_time: datetime.datetime,
+ requests_mock: Any,
+) -> None:
test_resources_dir = pytestconfig.rootpath / "tests/integration/powerbi"
register_mock_api(request_mock=requests_mock)
@@ -752,8 +778,12 @@ def test_powerbi_ingest_urn_lower_case(
@mock.patch("msal.ConfidentialClientApplication", side_effect=mock_msal_cca)
@pytest.mark.integration
def test_override_ownership(
- mock_msal, pytestconfig, tmp_path, mock_time, requests_mock
-):
+ mock_msal: MagicMock,
+ pytestconfig: pytest.Config,
+ tmp_path: str,
+ mock_time: datetime.datetime,
+ requests_mock: Any,
+) -> None:
test_resources_dir = pytestconfig.rootpath / "tests/integration/powerbi"
register_mock_api(request_mock=requests_mock)
@@ -783,7 +813,7 @@ def test_override_ownership(
mce_helpers.check_golden_file(
pytestconfig,
- output_path=tmp_path / "powerbi_mces_disabled_ownership.json",
+ output_path=f"{tmp_path}/powerbi_mces_disabled_ownership.json",
golden_path=f"{test_resources_dir}/{mce_out_file}",
)
@@ -792,8 +822,13 @@ def test_override_ownership(
@mock.patch("msal.ConfidentialClientApplication", side_effect=mock_msal_cca)
@pytest.mark.integration
def test_scan_all_workspaces(
- mock_msal, pytestconfig, tmp_path, mock_time, requests_mock
-):
+ mock_msal: MagicMock,
+ pytestconfig: pytest.Config,
+ tmp_path: str,
+ mock_time: datetime.datetime,
+ requests_mock: Any,
+) -> None:
+
test_resources_dir = pytestconfig.rootpath / "tests/integration/powerbi"
register_mock_api(request_mock=requests_mock)
@@ -828,7 +863,7 @@ def test_scan_all_workspaces(
mce_helpers.check_golden_file(
pytestconfig,
- output_path=tmp_path / "powerbi_mces_scan_all_workspaces.json",
+ output_path=f"{tmp_path}/powerbi_mces_scan_all_workspaces.json",
golden_path=f"{test_resources_dir}/{golden_file}",
)
@@ -836,7 +871,14 @@ def test_scan_all_workspaces(
@freeze_time(FROZEN_TIME)
@mock.patch("msal.ConfidentialClientApplication", side_effect=mock_msal_cca)
@pytest.mark.integration
-def test_extract_reports(mock_msal, pytestconfig, tmp_path, mock_time, requests_mock):
+def test_extract_reports(
+ mock_msal: MagicMock,
+ pytestconfig: pytest.Config,
+ tmp_path: str,
+ mock_time: datetime.datetime,
+ requests_mock: Any,
+) -> None:
+
enable_logging()
test_resources_dir = pytestconfig.rootpath / "tests/integration/powerbi"
@@ -868,7 +910,7 @@ def test_extract_reports(mock_msal, pytestconfig, tmp_path, mock_time, requests_
mce_helpers.check_golden_file(
pytestconfig,
- output_path=tmp_path / "powerbi_report_mces.json",
+ output_path=f"{tmp_path}/powerbi_report_mces.json",
golden_path=f"{test_resources_dir}/{golden_file}",
)
@@ -876,7 +918,13 @@ def test_extract_reports(mock_msal, pytestconfig, tmp_path, mock_time, requests_
@freeze_time(FROZEN_TIME)
@mock.patch("msal.ConfidentialClientApplication", side_effect=mock_msal_cca)
@pytest.mark.integration
-def test_extract_lineage(mock_msal, pytestconfig, tmp_path, mock_time, requests_mock):
+def test_extract_lineage(
+ mock_msal: MagicMock,
+ pytestconfig: pytest.Config,
+ tmp_path: str,
+ mock_time: datetime.datetime,
+ requests_mock: Any,
+) -> None:
enable_logging()
test_resources_dir = pytestconfig.rootpath / "tests/integration/powerbi"
@@ -925,8 +973,12 @@ def test_extract_lineage(mock_msal, pytestconfig, tmp_path, mock_time, requests_
@mock.patch("msal.ConfidentialClientApplication", side_effect=mock_msal_cca)
@pytest.mark.integration
def test_extract_endorsements(
- mock_msal, pytestconfig, tmp_path, mock_time, requests_mock
-):
+ mock_msal: MagicMock,
+ pytestconfig: pytest.Config,
+ tmp_path: str,
+ mock_time: datetime.datetime,
+ requests_mock: Any,
+) -> None:
test_resources_dir = pytestconfig.rootpath / "tests/integration/powerbi"
register_mock_api(request_mock=requests_mock)
@@ -957,7 +1009,7 @@ def test_extract_endorsements(
mce_helpers.check_golden_file(
pytestconfig,
- output_path=tmp_path / "powerbi_endorsement_mces.json",
+ output_path=f"{tmp_path}/powerbi_endorsement_mces.json",
golden_path=f"{test_resources_dir}/{mce_out_file}",
)
@@ -966,8 +1018,12 @@ def test_extract_endorsements(
@mock.patch("msal.ConfidentialClientApplication", side_effect=mock_msal_cca)
@pytest.mark.integration
def test_admin_access_is_not_allowed(
- mock_msal, pytestconfig, tmp_path, mock_time, requests_mock
-):
+ mock_msal: MagicMock,
+ pytestconfig: pytest.Config,
+ tmp_path: str,
+ mock_time: datetime.datetime,
+ requests_mock: Any,
+) -> None:
enable_logging()
test_resources_dir = pytestconfig.rootpath / "tests/integration/powerbi"
@@ -1024,8 +1080,12 @@ def test_admin_access_is_not_allowed(
@freeze_time(FROZEN_TIME)
@mock.patch("msal.ConfidentialClientApplication", side_effect=mock_msal_cca)
def test_workspace_container(
- mock_msal, pytestconfig, tmp_path, mock_time, requests_mock
-):
+ mock_msal: MagicMock,
+ pytestconfig: pytest.Config,
+ tmp_path: str,
+ mock_time: datetime.datetime,
+ requests_mock: Any,
+) -> None:
enable_logging()
test_resources_dir = pytestconfig.rootpath / "tests/integration/powerbi"
@@ -1062,11 +1122,92 @@ def test_workspace_container(
mce_helpers.check_golden_file(
pytestconfig,
- output_path=tmp_path / "powerbi_container_mces.json",
+ output_path=f"{tmp_path}/powerbi_container_mces.json",
golden_path=f"{test_resources_dir}/{mce_out_file}",
)
+@mock.patch("msal.ConfidentialClientApplication", side_effect=mock_msal_cca)
+def test_access_token_expiry_with_long_expiry(
+ mock_msal: MagicMock,
+ pytestconfig: pytest.Config,
+ tmp_path: str,
+ mock_time: datetime.datetime,
+ requests_mock: Any,
+) -> None:
+ enable_logging()
+
+ register_mock_api(request_mock=requests_mock)
+
+ pipeline = Pipeline.create(
+ {
+ "run_id": "powerbi-test",
+ "source": {
+ "type": "powerbi",
+ "config": {
+ **default_source_config(),
+ },
+ },
+ "sink": {
+ "type": "file",
+ "config": {
+ "filename": f"{tmp_path}/powerbi_access_token_mces.json",
+ },
+ },
+ }
+ )
+
+ # for long expiry, the token should only be requested once per API resolver.
+ MsalClient.token = {
+ "access_token": "dummy2",
+ "expires_in": 3600,
+ }
+
+ MsalClient.reset()
+ pipeline.run()
+ # We expect the token to be requested twice (once for the AdminApiResolver and once for the RegularApiResolver)
+ assert MsalClient.call_num == 2
+
+
+@mock.patch("msal.ConfidentialClientApplication", side_effect=mock_msal_cca)
+def test_access_token_expiry_with_short_expiry(
+ mock_msal: MagicMock,
+ pytestconfig: pytest.Config,
+ tmp_path: str,
+ mock_time: datetime.datetime,
+ requests_mock: Any,
+) -> None:
+ enable_logging()
+
+ register_mock_api(request_mock=requests_mock)
+
+ pipeline = Pipeline.create(
+ {
+ "run_id": "powerbi-test",
+ "source": {
+ "type": "powerbi",
+ "config": {
+ **default_source_config(),
+ },
+ },
+ "sink": {
+ "type": "file",
+ "config": {
+ "filename": f"{tmp_path}/powerbi_access_token_mces.json",
+ },
+ },
+ }
+ )
+
+ # for short expiry, the token should be re-requested when it expires.
+ MsalClient.token = {
+ "access_token": "dummy",
+ "expires_in": 0,
+ }
+ pipeline.run()
+ assert MsalClient.call_num > 2
+
+
def dataset_type_mapping_set_to_all_platform(pipeline: Pipeline) -> None:
source_config: PowerBiDashboardSourceConfig = cast(
PowerBiDashboardSource, pipeline.source
@@ -1306,8 +1447,12 @@ def validate_pipeline(pipeline: Pipeline) -> None:
@mock.patch("msal.ConfidentialClientApplication", side_effect=mock_msal_cca)
@pytest.mark.integration
def test_reports_with_failed_page_request(
- mock_msal, pytestconfig, tmp_path, mock_time, requests_mock
-):
+ mock_msal: MagicMock,
+ pytestconfig: pytest.Config,
+ tmp_path: str,
+ mock_time: datetime.datetime,
+ requests_mock: Any,
+) -> None:
"""
Test that all reports are fetched even if a single page request fails
"""
@@ -1419,8 +1564,12 @@ def test_reports_with_failed_page_request(
@freeze_time(FROZEN_TIME)
@mock.patch("msal.ConfidentialClientApplication", side_effect=mock_msal_cca)
def test_independent_datasets_extraction(
- mock_msal, pytestconfig, tmp_path, mock_time, requests_mock
-):
+ mock_msal: MagicMock,
+ pytestconfig: pytest.Config,
+ tmp_path: str,
+ mock_time: datetime.datetime,
+ requests_mock: Any,
+) -> None:
test_resources_dir = pytestconfig.rootpath / "tests/integration/powerbi"
@@ -1503,14 +1652,20 @@ def test_independent_datasets_extraction(
mce_helpers.check_golden_file(
pytestconfig,
- output_path=tmp_path / "powerbi_independent_mces.json",
+ output_path=f"{tmp_path}/powerbi_independent_mces.json",
golden_path=f"{test_resources_dir}/{golden_file}",
)
@freeze_time(FROZEN_TIME)
@mock.patch("msal.ConfidentialClientApplication", side_effect=mock_msal_cca)
-def test_cll_extraction(mock_msal, pytestconfig, tmp_path, mock_time, requests_mock):
+def test_cll_extraction(
+ mock_msal: MagicMock,
+ pytestconfig: pytest.Config,
+ tmp_path: str,
+ mock_time: datetime.datetime,
+ requests_mock: Any,
+) -> None:
test_resources_dir = pytestconfig.rootpath / "tests/integration/powerbi"
@@ -1553,7 +1708,7 @@ def test_cll_extraction(mock_msal, pytestconfig, tmp_path, mock_time, requests_m
mce_helpers.check_golden_file(
pytestconfig,
- output_path=tmp_path / "powerbi_cll_mces.json",
+ output_path=f"{tmp_path}/powerbi_cll_mces.json",
golden_path=f"{test_resources_dir}/{golden_file}",
)
@@ -1561,8 +1716,12 @@ def test_cll_extraction(mock_msal, pytestconfig, tmp_path, mock_time, requests_m
@freeze_time(FROZEN_TIME)
@mock.patch("msal.ConfidentialClientApplication", side_effect=mock_msal_cca)
def test_cll_extraction_flags(
- mock_msal, pytestconfig, tmp_path, mock_time, requests_mock
-):
+ mock_msal: MagicMock,
+ pytestconfig: pytest.Config,
+ tmp_path: str,
+ mock_time: datetime.datetime,
+ requests_mock: Any,
+) -> None:
register_mock_api(
request_mock=requests_mock,
From 9ecda6485202ce89291bd1485c861cf7be1b8741 Mon Sep 17 00:00:00 2001
From: Sumit Patil <91715217+sumitappt@users.noreply.github.com>
Date: Thu, 14 Dec 2023 19:07:48 +0530
Subject: [PATCH 023/540] fix(analytics): do not ping the track endpoint before
login (#9462)
---
datahub-web-react/src/app/analytics/analytics.ts | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/datahub-web-react/src/app/analytics/analytics.ts b/datahub-web-react/src/app/analytics/analytics.ts
index a66d76a09cf4de..468164069cfd03 100644
--- a/datahub-web-react/src/app/analytics/analytics.ts
+++ b/datahub-web-react/src/app/analytics/analytics.ts
@@ -30,16 +30,17 @@ export function getMergedTrackingOptions(options?: any) {
export default {
page: (data?: PageData, options?: any, callback?: (...params: any[]) => any) => {
+ const actorUrn = Cookies.get(CLIENT_AUTH_COOKIE) || undefined;
const modifiedData = {
...data,
type: EventType[EventType.PageViewEvent],
- actorUrn: Cookies.get(CLIENT_AUTH_COOKIE) || undefined,
+ actorUrn,
timestamp: Date.now(),
date: new Date().toString(),
userAgent: navigator.userAgent,
browserId: getBrowserId(),
};
- if (NODE_ENV === 'test') {
+ if (NODE_ENV === 'test' || !actorUrn) {
return null;
}
const trackingOptions = getMergedTrackingOptions(options);
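
The fix resolves the actor urn from the auth cookie once, then returns early so anonymous visitors never hit the track endpoint. The same guard, pulled out as a helper, might look like the sketch below; the `js-cookie` usage mirrors the `Cookies.get` call above, and the cookie name is passed in rather than assumed.

```ts
import Cookies from 'js-cookie';

/** Returns the logged-in actor urn, or undefined when nobody is logged in. */
export function getLoggedInActorUrn(authCookieName: string): string | undefined {
    return Cookies.get(authCookieName) || undefined;
}

// Usage inside a tracking call:
//   const actorUrn = getLoggedInActorUrn(CLIENT_AUTH_COOKIE);
//   if (!actorUrn) return null; // nothing is sent before login
```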
From aac1c55a14fdf65cb51f1fd0f441d93eb7757098 Mon Sep 17 00:00:00 2001
From: Mayuri Nehate <33225191+mayurinehate@users.noreply.github.com>
Date: Thu, 14 Dec 2023 21:05:06 +0530
Subject: [PATCH 024/540] feat(ingest/unity): enable hive metastore ingestion
(#9416)
---
metadata-ingestion/setup.py | 5 +-
.../ingestion/source/bigquery_v2/bigquery.py | 4 +
.../ingestion/source/source_registry.py | 9 +
.../datahub/ingestion/source/unity/config.py | 51 +-
.../source/unity/hive_metastore_proxy.py | 242 ++
.../datahub/ingestion/source/unity/proxy.py | 22 +
.../ingestion/source/unity/proxy_types.py | 38 +-
.../datahub/ingestion/source/unity/report.py | 4 +-
.../datahub/ingestion/source/unity/source.py | 64 +-
.../unity/test_unity_catalog_ingest.py | 77 +-
.../unity/unity_catalog_mces_golden.json | 2509 +++++++++--------
.../tests/unit/test_unity_catalog_config.py | 65 +-
12 files changed, 1958 insertions(+), 1132 deletions(-)
create mode 100644 metadata-ingestion/src/datahub/ingestion/source/unity/hive_metastore_proxy.py
diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py
index e894cbf043338d..5d15d7167b63e8 100644
--- a/metadata-ingestion/setup.py
+++ b/metadata-ingestion/setup.py
@@ -263,7 +263,8 @@
"pyspark~=3.3.0",
"requests",
# Version 2.4.0 includes sqlalchemy dialect, 2.8.0 includes some bug fixes
- "databricks-sql-connector>=2.8.0",
+ # Version 3.0.0 required SQLAlchemy > 2.0.21
+ "databricks-sql-connector>=2.8.0,<3.0.0",
}
mysql = sql_common | {"pymysql>=1.0.2"}
@@ -395,6 +396,8 @@
"powerbi-report-server": powerbi_report_server,
"vertica": sql_common | {"vertica-sqlalchemy-dialect[vertica-python]==0.0.8.1"},
"unity-catalog": databricks | sql_common | sqllineage_lib,
+ # databricks is alias for unity-catalog and needs to be kept in sync
+ "databricks": databricks | sql_common | sqllineage_lib,
"fivetran": snowflake_common,
}
diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py
index 6959a483130106..9813945683289c 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py
@@ -1031,6 +1031,10 @@ def gen_dataset_urn_from_ref(self, ref: BigQueryTableRef) -> str:
def gen_schema_fields(self, columns: List[BigqueryColumn]) -> List[SchemaField]:
schema_fields: List[SchemaField] = []
+ # Below line affects HiveColumnToAvroConverter._STRUCT_TYPE_SEPARATOR in global scope
+ # TODO: Refactor this such that
+ # converter = HiveColumnToAvroConverter(struct_type_separator=" ");
+ # converter.get_schema_fields_for_hive_column(...)
HiveColumnToAvroConverter._STRUCT_TYPE_SEPARATOR = " "
_COMPLEX_TYPE = re.compile("^(struct|array)")
last_id = -1
diff --git a/metadata-ingestion/src/datahub/ingestion/source/source_registry.py b/metadata-ingestion/src/datahub/ingestion/source/source_registry.py
index c3fbab3f9a0122..e003c658f45e8d 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/source_registry.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/source_registry.py
@@ -14,3 +14,12 @@
"mssql-odbc",
"mssql",
)
+
+# Use databricks as alias for unity-catalog ingestion source.
+# As mentioned here - https://docs.databricks.com/en/data-governance/unity-catalog/enable-workspaces.html,
+# Databricks is rolling out Unity Catalog gradually across accounts.
+# TODO: Rename unity-catalog source to databricks source, once it is rolled out for all accounts
+source_registry.register_alias(
+ "databricks",
+ "unity-catalog",
+)
diff --git a/metadata-ingestion/src/datahub/ingestion/source/unity/config.py b/metadata-ingestion/src/datahub/ingestion/source/unity/config.py
index 2c567120b4850e..96971faeea69f4 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/unity/config.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/unity/config.py
@@ -129,6 +129,14 @@ class UnityCatalogSourceConfig(
workspace_url: str = pydantic.Field(
description="Databricks workspace url. e.g. https://my-workspace.cloud.databricks.com"
)
+ warehouse_id: Optional[str] = pydantic.Field(
+ default=None,
+ description="SQL Warehouse id, for running queries. If not set, will use the default warehouse.",
+ )
+ include_hive_metastore: bool = pydantic.Field(
+ default=False,
+ description="Whether to ingest legacy `hive_metastore` catalog. This requires executing queries on SQL warehouse.",
+ )
workspace_name: Optional[str] = pydantic.Field(
default=None,
description="Name of the workspace. Default to deployment name present in workspace_url",
@@ -254,16 +262,17 @@ class UnityCatalogSourceConfig(
scheme: str = DATABRICKS
- def get_sql_alchemy_url(self):
+ def get_sql_alchemy_url(self, database: Optional[str] = None) -> str:
+ uri_opts = {"http_path": f"/sql/1.0/warehouses/{self.warehouse_id}"}
+ if database:
+ uri_opts["catalog"] = database
return make_sqlalchemy_uri(
scheme=self.scheme,
username="token",
password=self.token,
at=urlparse(self.workspace_url).netloc,
- db=None,
- uri_opts={
- "http_path": f"/sql/1.0/warehouses/{self.profiling.warehouse_id}"
- },
+ db=database,
+ uri_opts=uri_opts,
)
def is_profiling_enabled(self) -> bool:
@@ -304,3 +313,35 @@ def include_metastore_warning(cls, v: bool) -> bool:
logger.warning(msg)
add_global_warning(msg)
return v
+
+ @pydantic.root_validator(skip_on_failure=True)
+ def set_warehouse_id_from_profiling(cls, values: Dict[str, Any]) -> Dict[str, Any]:
+ profiling: Optional[UnityCatalogProfilerConfig] = values.get("profiling")
+ if not values.get("warehouse_id") and profiling and profiling.warehouse_id:
+ values["warehouse_id"] = profiling.warehouse_id
+ if (
+ values.get("warehouse_id")
+ and profiling
+ and profiling.warehouse_id
+ and values["warehouse_id"] != profiling.warehouse_id
+ ):
+ raise ValueError(
+ "When `warehouse_id` is set, it must match the `warehouse_id` in `profiling`."
+ )
+
+ if values.get("include_hive_metastore") and not values.get("warehouse_id"):
+ raise ValueError(
+ "When `include_hive_metastore` is set, `warehouse_id` must be set."
+ )
+
+ if values.get("warehouse_id") and profiling and not profiling.warehouse_id:
+ profiling.warehouse_id = values["warehouse_id"]
+
+ return values
+
+ @pydantic.validator("schema_pattern", always=True)
+ def schema_pattern_should__always_deny_information_schema(
+ cls, v: AllowDenyPattern
+ ) -> AllowDenyPattern:
+ v.deny.append(".*\\.information_schema")
+ return v
diff --git a/metadata-ingestion/src/datahub/ingestion/source/unity/hive_metastore_proxy.py b/metadata-ingestion/src/datahub/ingestion/source/unity/hive_metastore_proxy.py
new file mode 100644
index 00000000000000..99b2ff998662cb
--- /dev/null
+++ b/metadata-ingestion/src/datahub/ingestion/source/unity/hive_metastore_proxy.py
@@ -0,0 +1,242 @@
+import logging
+from datetime import datetime
+from functools import lru_cache
+from typing import Iterable, List, Optional
+
+from databricks.sdk.service.catalog import ColumnTypeName, DataSourceFormat
+from databricks.sql.types import Row
+from sqlalchemy import create_engine, inspect
+from sqlalchemy.engine.reflection import Inspector
+
+from datahub.ingestion.api.closeable import Closeable
+from datahub.ingestion.source.unity.proxy_types import (
+ Catalog,
+ Column,
+ CustomCatalogType,
+ HiveTableType,
+ Metastore,
+ Schema,
+ Table,
+)
+
+logger = logging.getLogger(__name__)
+HIVE_METASTORE = "hive_metastore"
+
+type_map = {
+ "boolean": ColumnTypeName.BOOLEAN,
+ "tinyint": ColumnTypeName.INT,
+ "smallint": ColumnTypeName.INT,
+ "int": ColumnTypeName.INT,
+ "bigint": ColumnTypeName.LONG,
+ "float": ColumnTypeName.FLOAT,
+ "double": ColumnTypeName.DOUBLE,
+ "decimal": ColumnTypeName.DECIMAL,
+ "string": ColumnTypeName.STRING,
+ "varchar": ColumnTypeName.STRING,
+ "timestamp": ColumnTypeName.TIMESTAMP,
+ "date": ColumnTypeName.DATE,
+ "binary": ColumnTypeName.BINARY,
+}
+
+
+class HiveMetastoreProxy(Closeable):
+ # TODO: Support for view lineage using SQL parsing
+ # Why not use hive ingestion source directly here ?
+ # 1. hive ingestion source assumes 2-level namespace hierarchy and currently
+ # there is no other intermediate interface except sqlalchemy inspector
+ # that can be used to fetch hive metadata.
+ # 2. hive recipe for databricks (databricks+pyhive dialect) does not
+ # readily support SQL warehouse. Also this dialect is not actively maintained.
+ """
+ Proxy to read metadata from hive_metastore databricks catalog. This is required
+ as unity catalog apis do not return details about this legacy metastore.
+ """
+
+ def __init__(self, sqlalchemy_url: str, options: dict) -> None:
+ try:
+ self.inspector = HiveMetastoreProxy.get_inspector(sqlalchemy_url, options)
+ except Exception:
+ # This means that there is no `hive_metastore` catalog in databricks workspace
+ # Not tested but seems like the logical conclusion.
+ raise
+
+ @staticmethod
+ def get_inspector(sqlalchemy_url: str, options: dict) -> Inspector:
+ engine = create_engine(sqlalchemy_url, **options)
+ return inspect(engine.connect())
+
+ def hive_metastore_catalog(self, metastore: Optional[Metastore]) -> Catalog:
+ return Catalog(
+ id=HIVE_METASTORE,
+ name=HIVE_METASTORE,
+ comment=None,
+ metastore=metastore,
+ owner=None,
+ type=CustomCatalogType.HIVE_METASTORE_CATALOG,
+ )
+
+ def hive_metastore_schemas(self, catalog: Catalog) -> Iterable[Schema]:
+ for schema_name in self.inspector.get_schema_names():
+ yield Schema(
+ name=schema_name,
+ id=f"{catalog.id}.{schema_name}",
+ catalog=catalog,
+ comment=None,
+ owner=None,
+ )
+
+ def hive_metastore_tables(self, schema: Schema) -> Iterable[Table]:
+ views = self.inspector.get_view_names(schema.name)
+ for table_name in views:
+ yield self._get_table(schema, table_name, True)
+
+ for table_name in self.inspector.get_table_names(schema.name):
+ if table_name in views:
+ continue
+ yield self._get_table(schema, table_name, False)
+
+ def _get_table(self, schema: Schema, table_name: str, is_view: bool) -> Table:
+ columns = self._get_columns(schema, table_name)
+ detailed_info = self._get_table_info(schema, table_name)
+
+ comment = detailed_info.pop("Comment", None)
+ storage_location = detailed_info.pop("Location", None)
+ datasource_format = self._get_datasource_format(
+ detailed_info.pop("Provider", None)
+ )
+
+ created_at = self._get_created_at(detailed_info.pop("Created Time", None))
+
+ return Table(
+ name=table_name,
+ id=f"{schema.id}.{table_name}",
+ table_type=self._get_table_type(detailed_info.pop("Type", None)),
+ schema=schema,
+ columns=columns,
+ storage_location=storage_location,
+ data_source_format=datasource_format,
+ view_definition=self._get_view_definition(schema.name, table_name)
+ if is_view
+ else None,
+ properties=detailed_info,
+ owner=None,
+ generation=None,
+ created_at=created_at,
+ created_by=None,
+ updated_at=None,
+ updated_by=None,
+ table_id=f"{schema.id}.{table_name}",
+ comment=comment,
+ )
+
+ def _get_created_at(self, created_at: Optional[str]) -> Optional[datetime]:
+ return (
+ datetime.strptime(created_at, "%a %b %d %H:%M:%S %Z %Y")
+ if created_at
+ else None
+ )
+
+ def _get_datasource_format(
+ self, provider: Optional[str]
+ ) -> Optional[DataSourceFormat]:
+ raw_format = provider
+ if raw_format:
+ try:
+ return DataSourceFormat(raw_format.upper())
+ except Exception:
+                logger.debug(f"Unknown datasource format: {raw_format}")
+ return None
+
+ def _get_view_definition(self, schema_name: str, table_name: str) -> Optional[str]:
+ try:
+ rows = self._execute_sql(
+ f"SHOW CREATE TABLE `{schema_name}`.`{table_name}`"
+ )
+ for row in rows:
+ return row[0]
+ except Exception:
+ logger.debug(
+ f"Failed to get view definition for {schema_name}.{table_name}"
+ )
+ return None
+
+ def _get_table_type(self, type: Optional[str]) -> HiveTableType:
+ if type == "EXTERNAL":
+ return HiveTableType.HIVE_EXTERNAL_TABLE
+ elif type == "MANAGED":
+ return HiveTableType.HIVE_MANAGED_TABLE
+ elif type == "VIEW":
+ return HiveTableType.HIVE_VIEW
+ else:
+ return HiveTableType.UNKNOWN
+
+ def _get_table_info(self, schema: Schema, table_name: str) -> dict:
+ rows = self._describe_extended(schema.name, table_name)
+
+ index = rows.index(("# Detailed Table Information", "", ""))
+ rows = rows[index + 1 :]
+ # Copied from https://github.com/acryldata/PyHive/blob/master/pyhive/sqlalchemy_hive.py#L375
+ # Generate properties dictionary.
+ properties = {}
+ active_heading = None
+ for col_name, data_type, value in rows:
+ col_name = col_name.rstrip()
+ if col_name.startswith("# "):
+ continue
+ elif col_name == "" and data_type is None:
+ active_heading = None
+ continue
+ elif col_name != "" and data_type is None:
+ active_heading = col_name
+ elif col_name != "" and data_type is not None:
+ properties[col_name] = data_type.strip()
+ else:
+ # col_name == "", data_type is not None
+ prop_name = "{} {}".format(active_heading, data_type.rstrip())
+ properties[prop_name] = value.rstrip()
+
+ return properties
+
+ def _get_columns(self, schema: Schema, table_name: str) -> List[Column]:
+ rows = self._describe_extended(schema.name, table_name)
+
+ columns: List[Column] = []
+ for i, row in enumerate(rows):
+ if i == 0 and row[0].strip() == "col_name":
+ continue # first row
+ if row[0].strip() in (
+ "",
+ "# Partition Information",
+ "# Detailed Table Information",
+ ):
+ break
+ columns.append(
+ Column(
+ name=row[0].strip(),
+ id=f"{schema.id}.{table_name}.{row[0].strip()}",
+ type_text=row[1].strip(),
+ type_name=type_map.get(row[1].strip().lower()),
+ type_scale=None,
+ type_precision=None,
+ position=None,
+ nullable=None,
+ comment=row[2],
+ )
+ )
+
+ return columns
+
+ @lru_cache(maxsize=1)
+ def _describe_extended(self, schema_name: str, table_name: str) -> List[Row]:
+ """
+ Rows are structured as shown in examples here
+ https://docs.databricks.com/en/sql/language-manual/sql-ref-syntax-aux-describe-table.html#examples
+ """
+ return self._execute_sql(f"DESCRIBE EXTENDED `{schema_name}`.`{table_name}`")
+
+ def _execute_sql(self, sql: str) -> List[Row]:
+ return self.inspector.bind.execute(sql).fetchall()
+
+ def close(self):
+ self.inspector.bind.close() # type:ignore
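
A note on the parsing strategy in `_get_columns` and `_get_table_info`: both read the same cached `DESCRIBE EXTENDED` output, taking column rows from the top of the result and key/value table details from everything after the `# Detailed Table Information` marker. A reduced, self-contained sketch of that split (sample rows only; the real rows come from the SQL warehouse, and the heading-continuation handling of `_get_table_info` is omitted):

```python
from typing import Dict, List, Optional, Tuple

# Rows in the shape returned by `DESCRIBE EXTENDED` (compare the mock data in the
# integration test later in this patch); the real proxy obtains them via sqlalchemy.
ROWS: List[Tuple[str, str, Optional[str]]] = [
    ("betStatusId", "bigint", None),
    ("channelId", "bigint", None),
    ("", "", ""),
    ("# Detailed Table Information", "", ""),
    ("Type", "MANAGED", ""),
    ("Provider", "delta", ""),
]


def split_describe_extended(rows):
    """Column rows come before the first blank or '#' row; key/value properties
    follow the '# Detailed Table Information' marker."""
    columns = []
    for name, data_type, comment in rows:
        if name.strip() in ("", "# Partition Information", "# Detailed Table Information"):
            break
        columns.append((name.strip(), data_type.strip(), comment))
    start = rows.index(("# Detailed Table Information", "", "")) + 1
    properties: Dict[str, str] = {
        name.strip(): data_type.strip() for name, data_type, _ in rows[start:] if data_type
    }
    return columns, properties


print(split_describe_extended(ROWS))
```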
diff --git a/metadata-ingestion/src/datahub/ingestion/source/unity/proxy.py b/metadata-ingestion/src/datahub/ingestion/source/unity/proxy.py
index 375c76db8e9719..13baa8b57a639d 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/unity/proxy.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/unity/proxy.py
@@ -26,6 +26,7 @@
from databricks.sdk.service.workspace import ObjectType
import datahub
+from datahub.ingestion.source.unity.hive_metastore_proxy import HiveMetastoreProxy
from datahub.ingestion.source.unity.proxy_profiling import (
UnityCatalogProxyProfilingMixin,
)
@@ -33,6 +34,7 @@
ALLOWED_STATEMENT_TYPES,
Catalog,
Column,
+ CustomCatalogType,
ExternalTableReference,
Metastore,
Notebook,
@@ -87,6 +89,7 @@ def __init__(
personal_access_token: str,
warehouse_id: Optional[str],
report: UnityCatalogReport,
+ hive_metastore_proxy: Optional[HiveMetastoreProxy] = None,
):
self._workspace_client = WorkspaceClient(
host=workspace_url,
@@ -96,6 +99,7 @@ def __init__(
)
self.warehouse_id = warehouse_id or ""
self.report = report
+ self.hive_metastore_proxy = hive_metastore_proxy
def check_basic_connectivity(self) -> bool:
return bool(self._workspace_client.catalogs.list())
@@ -105,6 +109,9 @@ def assigned_metastore(self) -> Metastore:
return self._create_metastore(response)
def catalogs(self, metastore: Optional[Metastore]) -> Iterable[Catalog]:
+ if self.hive_metastore_proxy:
+ yield self.hive_metastore_proxy.hive_metastore_catalog(metastore)
+
response = self._workspace_client.catalogs.list()
if not response:
logger.info("Catalogs not found")
@@ -122,6 +129,12 @@ def catalog(
return self._create_catalog(metastore, response)
def schemas(self, catalog: Catalog) -> Iterable[Schema]:
+ if (
+ self.hive_metastore_proxy
+ and catalog.type == CustomCatalogType.HIVE_METASTORE_CATALOG
+ ):
+ yield from self.hive_metastore_proxy.hive_metastore_schemas(catalog)
+ return
response = self._workspace_client.schemas.list(catalog_name=catalog.name)
if not response:
logger.info(f"Schemas not found for catalog {catalog.id}")
@@ -130,6 +143,12 @@ def schemas(self, catalog: Catalog) -> Iterable[Schema]:
yield self._create_schema(catalog, schema)
def tables(self, schema: Schema) -> Iterable[Table]:
+ if (
+ self.hive_metastore_proxy
+ and schema.catalog.type == CustomCatalogType.HIVE_METASTORE_CATALOG
+ ):
+ yield from self.hive_metastore_proxy.hive_metastore_tables(schema)
+ return
with patch("databricks.sdk.service.catalog.TableInfo", TableInfoWithGeneration):
response = self._workspace_client.tables.list(
catalog_name=schema.catalog.name, schema_name=schema.name
@@ -244,6 +263,9 @@ def list_lineages_by_column(self, table_name: str, column_name: str) -> dict:
)
def table_lineage(self, table: Table, include_entity_lineage: bool) -> None:
+ if table.schema.catalog.type == CustomCatalogType.HIVE_METASTORE_CATALOG:
+ # Lineage is not available for Hive Metastore Tables.
+ return None
# Lineage endpoint doesn't exists on 2.1 version
try:
response: dict = self.list_lineages_by_table(
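
The proxy changes follow one pattern throughout: when the optional hive metastore proxy is configured and the catalog or schema belongs to the synthetic `hive_metastore` catalog, listing is delegated to SQL-based introspection instead of the Unity Catalog REST API, and table lineage is skipped because the lineage endpoints do not cover the legacy metastore. A reduced sketch of that dispatch (the class names here are illustrative stand-ins, not the real proxy types):

```python
from enum import Enum
from typing import Iterable, List, Optional


class CatalogKind(Enum):
    UNITY = "UNITY"
    HIVE_METASTORE = "HIVE_METASTORE"


class HiveProxyStub:
    """Stand-in for HiveMetastoreProxy: answers schema queries via SQL introspection."""

    def schema_names(self) -> List[str]:
        return ["bronze_kambi"]


class CatalogDispatcher:
    """Illustrative version of the delegation above: REST API by default,
    hive metastore proxy for the synthetic hive_metastore catalog."""

    def __init__(self, hive_proxy: Optional[HiveProxyStub] = None) -> None:
        self.hive_proxy = hive_proxy

    def schemas(self, kind: CatalogKind, catalog_name: str) -> Iterable[str]:
        if self.hive_proxy and kind is CatalogKind.HIVE_METASTORE:
            # Legacy metastore: no Unity Catalog API coverage, so introspect over SQL.
            yield from self.hive_proxy.schema_names()
            return
        yield from self._unity_api_schemas(catalog_name)

    def _unity_api_schemas(self, catalog_name: str) -> List[str]:
        return [f"{catalog_name}.default"]  # stand-in for the real REST call


print(list(CatalogDispatcher(HiveProxyStub()).schemas(CatalogKind.HIVE_METASTORE, "hive_metastore")))
```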
diff --git a/metadata-ingestion/src/datahub/ingestion/source/unity/proxy_types.py b/metadata-ingestion/src/datahub/ingestion/source/unity/proxy_types.py
index 315c1c0d20186f..e5951cb0fa4ffc 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/unity/proxy_types.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/unity/proxy_types.py
@@ -4,7 +4,8 @@
import logging
from dataclasses import dataclass, field
from datetime import datetime
-from typing import Dict, FrozenSet, List, Optional, Set
+from enum import Enum
+from typing import Dict, FrozenSet, List, Optional, Set, Union
from databricks.sdk.service.catalog import (
CatalogType,
@@ -75,6 +76,17 @@
NotebookId = int
+class CustomCatalogType(Enum):
+ HIVE_METASTORE_CATALOG = "HIVE_METASTORE_CATALOG"
+
+
+class HiveTableType(Enum):
+ HIVE_MANAGED_TABLE = "HIVE_MANAGED_TABLE"
+ HIVE_EXTERNAL_TABLE = "HIVE_EXTERNAL_TABLE"
+ HIVE_VIEW = "HIVE_VIEW"
+ UNKNOWN = "UNKNOWN"
+
+
@dataclass
class CommonProperty:
id: str
@@ -95,7 +107,7 @@ class Metastore(CommonProperty):
class Catalog(CommonProperty):
metastore: Optional[Metastore]
owner: Optional[str]
- type: CatalogType
+ type: Union[CatalogType, CustomCatalogType]
@dataclass
@@ -107,11 +119,11 @@ class Schema(CommonProperty):
@dataclass
class Column(CommonProperty):
type_text: str
- type_name: ColumnTypeName
- type_precision: int
- type_scale: int
- position: int
- nullable: bool
+ type_name: Optional[ColumnTypeName]
+ type_precision: Optional[int]
+ type_scale: Optional[int]
+ position: Optional[int]
+ nullable: Optional[bool]
comment: Optional[str]
@@ -212,11 +224,11 @@ class Table(CommonProperty):
columns: List[Column]
storage_location: Optional[str]
data_source_format: Optional[DataSourceFormat]
- table_type: TableType
+ table_type: Union[TableType, HiveTableType]
owner: Optional[str]
generation: Optional[int]
- created_at: datetime
- created_by: str
+ created_at: Optional[datetime]
+ created_by: Optional[str]
updated_at: Optional[datetime]
updated_by: Optional[str]
table_id: str
@@ -231,7 +243,11 @@ class Table(CommonProperty):
def __post_init__(self):
self.ref = TableReference.create(self)
- self.is_view = self.table_type in [TableType.VIEW, TableType.MATERIALIZED_VIEW]
+ self.is_view = self.table_type in [
+ TableType.VIEW,
+ TableType.MATERIALIZED_VIEW,
+ HiveTableType.HIVE_VIEW,
+ ]
@dataclass
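
Two consequences of these type changes are worth calling out: `table_type` is now a Union over two enums, so any membership check must list members from both, and the previously required audit fields (`created_at`, `created_by`, column position and nullability) become Optional because the hive metastore does not always provide them. A tiny self-contained illustration of the widened view check (local enum copies matching the values shown above, not the real SDK classes):

```python
from enum import Enum
from typing import Union


class TableType(Enum):
    # Subset of databricks.sdk.service.catalog.TableType, copied here for illustration.
    MANAGED = "MANAGED"
    VIEW = "VIEW"
    MATERIALIZED_VIEW = "MATERIALIZED_VIEW"


class HiveTableType(Enum):
    HIVE_MANAGED_TABLE = "HIVE_MANAGED_TABLE"
    HIVE_VIEW = "HIVE_VIEW"


def is_view(table_type: Union[TableType, HiveTableType]) -> bool:
    # Mirrors the widened __post_init__ check: hive views count as views too.
    return table_type in (TableType.VIEW, TableType.MATERIALIZED_VIEW, HiveTableType.HIVE_VIEW)


assert is_view(HiveTableType.HIVE_VIEW)
assert not is_view(HiveTableType.HIVE_MANAGED_TABLE)
```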
diff --git a/metadata-ingestion/src/datahub/ingestion/source/unity/report.py b/metadata-ingestion/src/datahub/ingestion/source/unity/report.py
index 7f19b6e2103ea9..0770d9d27055c0 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/unity/report.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/unity/report.py
@@ -1,5 +1,5 @@
from dataclasses import dataclass, field
-from typing import Tuple
+from typing import Optional, Tuple
from datahub.ingestion.api.report import EntityFilterReport
from datahub.ingestion.source.sql.sql_generic_profiler import ProfilingSqlReport
@@ -16,6 +16,8 @@ class UnityCatalogReport(IngestionStageReport, ProfilingSqlReport):
table_profiles: EntityFilterReport = EntityFilterReport.field(type="table profile")
notebooks: EntityFilterReport = EntityFilterReport.field(type="notebook")
+ hive_metastore_catalog_found: Optional[bool] = None
+
num_column_lineage_skipped_column_count: int = 0
num_external_upstreams_lacking_permissions: int = 0
num_external_upstreams_unsupported: int = 0
diff --git a/metadata-ingestion/src/datahub/ingestion/source/unity/source.py b/metadata-ingestion/src/datahub/ingestion/source/unity/source.py
index d1940c1d576073..43c5e244393772 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/unity/source.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/unity/source.py
@@ -58,6 +58,10 @@
)
from datahub.ingestion.source.unity.connection_test import UnityCatalogConnectionTest
from datahub.ingestion.source.unity.ge_profiler import UnityCatalogGEProfiler
+from datahub.ingestion.source.unity.hive_metastore_proxy import (
+ HIVE_METASTORE,
+ HiveMetastoreProxy,
+)
from datahub.ingestion.source.unity.proxy import UnityCatalogApiProxy
from datahub.ingestion.source.unity.proxy_types import (
DATA_TYPE_REGISTRY,
@@ -142,12 +146,17 @@ def __init__(self, ctx: PipelineContext, config: UnityCatalogSourceConfig):
self.config = config
self.report: UnityCatalogReport = UnityCatalogReport()
+
+ self.init_hive_metastore_proxy()
+
self.unity_catalog_api_proxy = UnityCatalogApiProxy(
config.workspace_url,
config.token,
- config.profiling.warehouse_id,
+ config.warehouse_id,
report=self.report,
+ hive_metastore_proxy=self.hive_metastore_proxy,
)
+
self.external_url_base = urljoin(self.config.workspace_url, "/explore/data")
# Determine the platform_instance_name
@@ -174,6 +183,23 @@ def __init__(self, ctx: PipelineContext, config: UnityCatalogSourceConfig):
# Global map of tables, for profiling
self.tables: FileBackedDict[Table] = FileBackedDict()
+ def init_hive_metastore_proxy(self):
+ self.hive_metastore_proxy: Optional[HiveMetastoreProxy] = None
+ if self.config.include_hive_metastore:
+ try:
+ self.hive_metastore_proxy = HiveMetastoreProxy(
+ self.config.get_sql_alchemy_url(HIVE_METASTORE), self.config.options
+ )
+ self.report.hive_metastore_catalog_found = True
+ except Exception as e:
+ logger.debug("Exception", exc_info=True)
+ self.warn(
+ logger,
+ HIVE_METASTORE,
+ f"Failed to connect to hive_metastore due to {e}",
+ )
+ self.report.hive_metastore_catalog_found = False
+
@staticmethod
def test_connection(config_dict: dict) -> TestConnectionReport:
return UnityCatalogConnectionTest(config_dict).get_connection_test()
@@ -194,7 +220,7 @@ def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
self.report.report_ingestion_stage_start("Ingestion Setup")
wait_on_warehouse = None
- if self.config.is_profiling_enabled():
+ if self.config.is_profiling_enabled() or self.config.include_hive_metastore:
self.report.report_ingestion_stage_start("Start warehouse")
# Can take several minutes, so start now and wait later
wait_on_warehouse = self.unity_catalog_api_proxy.start_warehouse()
@@ -204,6 +230,9 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
f"SQL warehouse {self.config.profiling.warehouse_id} not found",
)
return
+ else:
+ # wait until warehouse is started
+ wait_on_warehouse.result()
if self.config.include_ownership:
self.report.report_ingestion_stage_start("Ingest service principals")
@@ -678,18 +707,25 @@ def _create_table_property_aspect(self, table: Table) -> DatasetPropertiesClass:
custom_properties["table_type"] = table.table_type.value
- custom_properties["created_by"] = table.created_by
- custom_properties["created_at"] = str(table.created_at)
+ if table.created_by:
+ custom_properties["created_by"] = table.created_by
if table.properties:
custom_properties.update({k: str(v) for k, v in table.properties.items()})
custom_properties["table_id"] = table.table_id
- custom_properties["owner"] = table.owner
- custom_properties["updated_by"] = table.updated_by
- custom_properties["updated_at"] = str(table.updated_at)
-
- created = TimeStampClass(
- int(table.created_at.timestamp() * 1000), make_user_urn(table.created_by)
- )
+ if table.owner:
+ custom_properties["owner"] = table.owner
+ if table.updated_by:
+ custom_properties["updated_by"] = table.updated_by
+ if table.updated_at:
+ custom_properties["updated_at"] = str(table.updated_at)
+
+ created: Optional[TimeStampClass] = None
+ if table.created_at:
+ custom_properties["created_at"] = str(table.created_at)
+ created = TimeStampClass(
+ int(table.created_at.timestamp() * 1000),
+ make_user_urn(table.created_by) if table.created_by else None,
+ )
last_modified = created
if table.updated_at:
last_modified = TimeStampClass(
@@ -780,3 +816,9 @@ def _create_schema_field(column: Column) -> List[SchemaFieldClass]:
description=column.comment,
)
]
+
+ def close(self):
+ if self.hive_metastore_proxy:
+ self.hive_metastore_proxy.close()
+
+ super().close()
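
Because hive metastore tables may be missing owner and audit metadata, `_create_table_property_aspect` now only writes the optional custom properties that are present, and only builds `created`/`lastModified` stamps when a timestamp exists, attaching an actor only when the user is known. A reduced sketch of that guard, using a plain dict in place of `TimeStampClass` (the urn format follows `make_user_urn`; the function name is illustrative):

```python
from datetime import datetime
from typing import Optional


def build_created_stamp(
    created_at: Optional[datetime], created_by: Optional[str]
) -> Optional[dict]:
    """Skip the stamp entirely when no timestamp is available, and omit the actor
    when the creating user is unknown."""
    if not created_at:
        return None
    return {
        "time": int(created_at.timestamp() * 1000),
        "actor": f"urn:li:corpuser:{created_by}" if created_by else None,
    }


print(build_created_stamp(datetime(2022, 6, 22, 5, 14, 56), None))
print(build_created_stamp(None, "abc@acryl.io"))
```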
diff --git a/metadata-ingestion/tests/integration/unity/test_unity_catalog_ingest.py b/metadata-ingestion/tests/integration/unity/test_unity_catalog_ingest.py
index c43ba7eee58478..aab7630d57f460 100644
--- a/metadata-ingestion/tests/integration/unity/test_unity_catalog_ingest.py
+++ b/metadata-ingestion/tests/integration/unity/test_unity_catalog_ingest.py
@@ -3,6 +3,7 @@
from unittest.mock import patch
import databricks
+import pytest
from databricks.sdk.service.catalog import (
CatalogInfo,
GetMetastoreSummaryResponse,
@@ -12,12 +13,15 @@
from freezegun import freeze_time
from datahub.ingestion.run.pipeline import Pipeline
+from datahub.ingestion.source.unity.hive_metastore_proxy import HiveMetastoreProxy
from tests.test_helpers import mce_helpers
FROZEN_TIME = "2021-12-07 07:00:00"
SERVICE_PRINCIPAL_ID_1 = str(uuid.uuid4())
SERVICE_PRINCIPAL_ID_2 = str(uuid.uuid4())
+pytestmark = pytest.mark.integration_batch_1
+
def register_mock_api(request_mock):
api_vs_response = {
@@ -215,6 +219,65 @@ def register_mock_data(workspace_client):
]
+def mock_hive_sql(query):
+ if query == "DESCRIBE EXTENDED `bronze_kambi`.`bet`":
+ return [
+ ("betStatusId", "bigint", None),
+ ("channelId", "bigint", None),
+ (
+ "combination",
+ "struct>,eventId:bigint,eventName:string,eventStartDate:string,live:boolean,odds:double,outcomeIds:array,outcomeLabel:string,sportId:string,status:string,voidReason:string>>,payout:double,rewardExtraPayout:double,stake:double>",
+ None,
+ ),
+ ("", "", ""),
+ ("# Detailed Table Information", "", ""),
+ ("Catalog", "hive_metastore", ""),
+ ("Database", "bronze_kambi", ""),
+ ("Table", "bet", ""),
+ ("Created Time", "Wed Jun 22 05:14:56 UTC 2022", ""),
+ ("Last Access", "UNKNOWN", ""),
+ ("Created By", "Spark 3.2.1", ""),
+ ("Type", "MANAGED", ""),
+ ("Location", "dbfs:/user/hive/warehouse/bronze_kambi.db/bet", ""),
+ ("Provider", "delta", ""),
+ ("Owner", "root", ""),
+ ("Is_managed_location", "true", ""),
+ (
+ "Table Properties",
+ "[delta.autoOptimize.autoCompact=true,delta.autoOptimize.optimizeWrite=true,delta.minReaderVersion=1,delta.minWriterVersion=2]",
+ "",
+ ),
+ ]
+ elif query == "DESCRIBE EXTENDED `bronze_kambi`.`view1`":
+ return [
+ ("betStatusId", "bigint", None),
+ ("channelId", "bigint", None),
+ (
+ "combination",
+ "struct>,eventId:bigint,eventName:string,eventStartDate:string,live:boolean,odds:double,outcomeIds:array,outcomeLabel:string,sportId:string,status:string,voidReason:string>>,payout:double,rewardExtraPayout:double,stake:double>",
+ None,
+ ),
+ ("", "", ""),
+ ("# Detailed Table Information", "", ""),
+ ("Catalog", "hive_metastore", ""),
+ ("Database", "bronze_kambi", ""),
+ ("Table", "view1", ""),
+ ("Created Time", "Wed Jun 22 05:14:56 UTC 2022", ""),
+ ("Last Access", "UNKNOWN", ""),
+ ("Created By", "Spark 3.2.1", ""),
+ ("Type", "VIEW", ""),
+ ("Owner", "root", ""),
+ ]
+ elif query == "SHOW CREATE TABLE `bronze_kambi`.`view1`":
+ return [
+ (
+ "CREATE VIEW `hive_metastore`.`bronze_kambi`.`view1` AS SELECT * FROM `hive_metastore`.`bronze_kambi`.`bet`",
+ )
+ ]
+
+ return []
+
+
@freeze_time(FROZEN_TIME)
def test_ingestion(pytestconfig, tmp_path, requests_mock):
test_resources_dir = pytestconfig.rootpath / "tests/integration/unity"
@@ -223,11 +286,21 @@ def test_ingestion(pytestconfig, tmp_path, requests_mock):
output_file_name = "unity_catalog_mcps.json"
- with patch("databricks.sdk.WorkspaceClient") as WorkspaceClient:
+ with patch("databricks.sdk.WorkspaceClient") as WorkspaceClient, patch.object(
+ HiveMetastoreProxy, "get_inspector"
+ ) as get_inspector, patch.object(HiveMetastoreProxy, "_execute_sql") as execute_sql:
workspace_client: mock.MagicMock = mock.MagicMock()
WorkspaceClient.return_value = workspace_client
register_mock_data(workspace_client)
+ inspector = mock.MagicMock()
+ inspector.get_schema_names.return_value = ["bronze_kambi"]
+ inspector.get_view_names.return_value = ["view1"]
+ inspector.get_table_names.return_value = ["bet", "view1"]
+ get_inspector.return_value = inspector
+
+ execute_sql.side_effect = mock_hive_sql
+
config_dict: dict = {
"run_id": "unity-catalog-test",
"pipeline_name": "unity-catalog-test-pipeline",
@@ -237,6 +310,8 @@ def test_ingestion(pytestconfig, tmp_path, requests_mock):
"workspace_url": "https://dummy.cloud.databricks.com",
"token": "fake",
"include_ownership": True,
+ "include_hive_metastore": True,
+ "warehouse_id": "test",
},
},
"sink": {
diff --git a/metadata-ingestion/tests/integration/unity/unity_catalog_mces_golden.json b/metadata-ingestion/tests/integration/unity/unity_catalog_mces_golden.json
index d25c86a3a1f9a3..98a6615dd2b52c 100644
--- a/metadata-ingestion/tests/integration/unity/unity_catalog_mces_golden.json
+++ b/metadata-ingestion/tests/integration/unity/unity_catalog_mces_golden.json
@@ -114,7 +114,7 @@
},
{
"entityType": "container",
- "entityUrn": "urn:li:container:83d98e62e36bddc3596c2b738e23b596",
+ "entityUrn": "urn:li:container:6d6f608f9f945f2862d99b855bdd3202",
"changeType": "UPSERT",
"aspectName": "containerProperties",
"aspect": {
@@ -123,11 +123,10 @@
"platform": "databricks",
"env": "PROD",
"metastore": "acryl metastore",
- "catalog": "main"
+ "catalog": "hive_metastore"
},
- "externalUrl": "https://dummy.cloud.databricks.com/explore/data/main",
- "name": "main",
- "description": "Main catalog (auto-created)"
+ "externalUrl": "https://dummy.cloud.databricks.com/explore/data/hive_metastore",
+ "name": "hive_metastore"
}
},
"systemMetadata": {
@@ -138,7 +137,7 @@
},
{
"entityType": "container",
- "entityUrn": "urn:li:container:83d98e62e36bddc3596c2b738e23b596",
+ "entityUrn": "urn:li:container:6d6f608f9f945f2862d99b855bdd3202",
"changeType": "UPSERT",
"aspectName": "status",
"aspect": {
@@ -156,10 +155,18 @@
"entityType": "container",
"entityUrn": "urn:li:container:83d98e62e36bddc3596c2b738e23b596",
"changeType": "UPSERT",
- "aspectName": "dataPlatformInstance",
+ "aspectName": "containerProperties",
"aspect": {
"json": {
- "platform": "urn:li:dataPlatform:databricks"
+ "customProperties": {
+ "platform": "databricks",
+ "env": "PROD",
+ "metastore": "acryl metastore",
+ "catalog": "main"
+ },
+ "externalUrl": "https://dummy.cloud.databricks.com/explore/data/main",
+ "name": "main",
+ "description": "Main catalog (auto-created)"
}
},
"systemMetadata": {
@@ -170,7 +177,7 @@
},
{
"entityType": "container",
- "entityUrn": "urn:li:container:83d98e62e36bddc3596c2b738e23b596",
+ "entityUrn": "urn:li:container:6d6f608f9f945f2862d99b855bdd3202",
"changeType": "UPSERT",
"aspectName": "subTypes",
"aspect": {
@@ -188,21 +195,12 @@
},
{
"entityType": "container",
- "entityUrn": "urn:li:container:83d98e62e36bddc3596c2b738e23b596",
+ "entityUrn": "urn:li:container:6d6f608f9f945f2862d99b855bdd3202",
"changeType": "UPSERT",
- "aspectName": "ownership",
+ "aspectName": "container",
"aspect": {
"json": {
- "owners": [
- {
- "owner": "urn:li:corpuser:account users",
- "type": "DATAOWNER"
- }
- ],
- "lastModified": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
- }
+ "container": "urn:li:container:22ec33be0e53ba3e61bb6c4ad58f6ffb"
}
},
"systemMetadata": {
@@ -213,12 +211,12 @@
},
{
"entityType": "container",
- "entityUrn": "urn:li:container:83d98e62e36bddc3596c2b738e23b596",
+ "entityUrn": "urn:li:container:6d6f608f9f945f2862d99b855bdd3202",
"changeType": "UPSERT",
- "aspectName": "container",
+ "aspectName": "dataPlatformInstance",
"aspect": {
"json": {
- "container": "urn:li:container:22ec33be0e53ba3e61bb6c4ad58f6ffb"
+ "platform": "urn:li:dataPlatform:databricks"
}
},
"systemMetadata": {
@@ -229,7 +227,7 @@
},
{
"entityType": "container",
- "entityUrn": "urn:li:container:83d98e62e36bddc3596c2b738e23b596",
+ "entityUrn": "urn:li:container:6d6f608f9f945f2862d99b855bdd3202",
"changeType": "UPSERT",
"aspectName": "browsePathsV2",
"aspect": {
@@ -250,32 +248,7 @@
},
{
"entityType": "container",
- "entityUrn": "urn:li:container:5ada0a9773235325e506410c512feabb",
- "changeType": "UPSERT",
- "aspectName": "containerProperties",
- "aspect": {
- "json": {
- "customProperties": {
- "platform": "databricks",
- "env": "PROD",
- "metastore": "acryl metastore",
- "catalog": "main",
- "unity_schema": "default"
- },
- "externalUrl": "https://dummy.cloud.databricks.com/explore/data/main/default",
- "name": "default",
- "description": "Default schema (auto-created)"
- }
- },
- "systemMetadata": {
- "lastObserved": 1638860400000,
- "runId": "unity-catalog-test",
- "lastRunId": "no-run-id-provided"
- }
-},
-{
- "entityType": "container",
- "entityUrn": "urn:li:container:5ada0a9773235325e506410c512feabb",
+ "entityUrn": "urn:li:container:83d98e62e36bddc3596c2b738e23b596",
"changeType": "UPSERT",
"aspectName": "status",
"aspect": {
@@ -291,7 +264,7 @@
},
{
"entityType": "container",
- "entityUrn": "urn:li:container:5ada0a9773235325e506410c512feabb",
+ "entityUrn": "urn:li:container:83d98e62e36bddc3596c2b738e23b596",
"changeType": "UPSERT",
"aspectName": "dataPlatformInstance",
"aspect": {
@@ -307,13 +280,13 @@
},
{
"entityType": "container",
- "entityUrn": "urn:li:container:5ada0a9773235325e506410c512feabb",
+ "entityUrn": "urn:li:container:83d98e62e36bddc3596c2b738e23b596",
"changeType": "UPSERT",
"aspectName": "subTypes",
"aspect": {
"json": {
"typeNames": [
- "Schema"
+ "Catalog"
]
}
},
@@ -325,14 +298,14 @@
},
{
"entityType": "container",
- "entityUrn": "urn:li:container:5ada0a9773235325e506410c512feabb",
+ "entityUrn": "urn:li:container:83d98e62e36bddc3596c2b738e23b596",
"changeType": "UPSERT",
"aspectName": "ownership",
"aspect": {
"json": {
"owners": [
{
- "owner": "urn:li:corpuser:abc@acryl.io",
+ "owner": "urn:li:corpuser:account users",
"type": "DATAOWNER"
}
],
@@ -350,12 +323,12 @@
},
{
"entityType": "container",
- "entityUrn": "urn:li:container:5ada0a9773235325e506410c512feabb",
+ "entityUrn": "urn:li:container:83d98e62e36bddc3596c2b738e23b596",
"changeType": "UPSERT",
"aspectName": "container",
"aspect": {
"json": {
- "container": "urn:li:container:83d98e62e36bddc3596c2b738e23b596"
+ "container": "urn:li:container:22ec33be0e53ba3e61bb6c4ad58f6ffb"
}
},
"systemMetadata": {
@@ -366,21 +339,20 @@
},
{
"entityType": "container",
- "entityUrn": "urn:li:container:5ada0a9773235325e506410c512feabb",
+ "entityUrn": "urn:li:container:1b3927f927ada651ce5fe3fb84227f8e",
"changeType": "UPSERT",
- "aspectName": "browsePathsV2",
+ "aspectName": "containerProperties",
"aspect": {
"json": {
- "path": [
- {
- "id": "urn:li:container:22ec33be0e53ba3e61bb6c4ad58f6ffb",
- "urn": "urn:li:container:22ec33be0e53ba3e61bb6c4ad58f6ffb"
- },
- {
- "id": "urn:li:container:83d98e62e36bddc3596c2b738e23b596",
- "urn": "urn:li:container:83d98e62e36bddc3596c2b738e23b596"
- }
- ]
+ "customProperties": {
+ "platform": "databricks",
+ "env": "PROD",
+ "metastore": "acryl metastore",
+ "catalog": "hive_metastore",
+ "unity_schema": "bronze_kambi"
+ },
+ "externalUrl": "https://dummy.cloud.databricks.com/explore/data/hive_metastore/bronze_kambi",
+ "name": "bronze_kambi"
}
},
"systemMetadata": {
@@ -390,13 +362,13 @@
}
},
{
- "entityType": "dataset",
- "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.main.default.quickstart_table,PROD)",
+ "entityType": "container",
+ "entityUrn": "urn:li:container:1b3927f927ada651ce5fe3fb84227f8e",
"changeType": "UPSERT",
- "aspectName": "container",
+ "aspectName": "status",
"aspect": {
"json": {
- "container": "urn:li:container:5ada0a9773235325e506410c512feabb"
+ "removed": false
}
},
"systemMetadata": {
@@ -406,40 +378,18 @@
}
},
{
- "entityType": "dataset",
- "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.main.default.quickstart_table,PROD)",
+ "entityType": "container",
+ "entityUrn": "urn:li:container:83d98e62e36bddc3596c2b738e23b596",
"changeType": "UPSERT",
- "aspectName": "datasetProperties",
+ "aspectName": "browsePathsV2",
"aspect": {
"json": {
- "customProperties": {
- "storage_location": "s3://db-02eec1f70bfe4115445be9fdb1aac6ac-s3-root-bucket/metastore/2c983545-d403-4f87-9063-5b7e3b6d3736/tables/cff27aa1-1c6a-4d78-b713-562c660c2896",
- "data_source_format": "DELTA",
- "generation": "2",
- "table_type": "MANAGED",
- "created_by": "abc@acryl.io",
- "created_at": "2022-10-19 13:21:38.688000+00:00",
- "delta.lastCommitTimestamp": "1666185711000",
- "delta.lastUpdateVersion": "1",
- "delta.minReaderVersion": "1",
- "delta.minWriterVersion": "2",
- "table_id": "cff27aa1-1c6a-4d78-b713-562c660c2896",
- "owner": "account users",
- "updated_by": "abc@acryl.io",
- "updated_at": "2022-10-19 13:27:29.633000+00:00"
- },
- "externalUrl": "https://dummy.cloud.databricks.com/explore/data/main/default/quickstart_table",
- "name": "quickstart_table",
- "qualifiedName": "main.default.quickstart_table",
- "created": {
- "time": 1666185698688,
- "actor": "urn:li:corpuser:abc@acryl.io"
- },
- "lastModified": {
- "time": 1666186049633,
- "actor": "urn:li:corpuser:abc@acryl.io"
- },
- "tags": []
+ "path": [
+ {
+ "id": "urn:li:container:22ec33be0e53ba3e61bb6c4ad58f6ffb",
+ "urn": "urn:li:container:22ec33be0e53ba3e61bb6c4ad58f6ffb"
+ }
+ ]
}
},
"systemMetadata": {
@@ -449,14 +399,14 @@
}
},
{
- "entityType": "dataset",
- "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.main.default.quickstart_table,PROD)",
+ "entityType": "container",
+ "entityUrn": "urn:li:container:1b3927f927ada651ce5fe3fb84227f8e",
"changeType": "UPSERT",
"aspectName": "subTypes",
"aspect": {
"json": {
"typeNames": [
- "Table"
+ "Schema"
]
}
},
@@ -467,55 +417,13 @@
}
},
{
- "entityType": "dataset",
- "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.main.default.quickstart_table,PROD)",
+ "entityType": "container",
+ "entityUrn": "urn:li:container:1b3927f927ada651ce5fe3fb84227f8e",
"changeType": "UPSERT",
- "aspectName": "schemaMetadata",
+ "aspectName": "container",
"aspect": {
"json": {
- "schemaName": "acryl_metastore.main.default.quickstart_table",
- "platform": "urn:li:dataPlatform:databricks",
- "version": 0,
- "created": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
- },
- "lastModified": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
- },
- "hash": "",
- "platformSchema": {
- "com.linkedin.schema.MySqlDDL": {
- "tableSchema": ""
- }
- },
- "fields": [
- {
- "fieldPath": "columnA",
- "nullable": true,
- "type": {
- "type": {
- "com.linkedin.schema.NumberType": {}
- }
- },
- "nativeDataType": "int",
- "recursive": false,
- "isPartOfKey": false
- },
- {
- "fieldPath": "columnB",
- "nullable": true,
- "type": {
- "type": {
- "com.linkedin.schema.StringType": {}
- }
- },
- "nativeDataType": "string",
- "recursive": false,
- "isPartOfKey": false
- }
- ]
+ "container": "urn:li:container:6d6f608f9f945f2862d99b855bdd3202"
}
},
"systemMetadata": {
@@ -525,22 +433,13 @@
}
},
{
- "entityType": "dataset",
- "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.main.default.quickstart_table,PROD)",
+ "entityType": "container",
+ "entityUrn": "urn:li:container:1b3927f927ada651ce5fe3fb84227f8e",
"changeType": "UPSERT",
- "aspectName": "ownership",
+ "aspectName": "dataPlatformInstance",
"aspect": {
"json": {
- "owners": [
- {
- "owner": "urn:li:corpuser:account users",
- "type": "DATAOWNER"
- }
- ],
- "lastModified": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
- }
+ "platform": "urn:li:dataPlatform:databricks"
}
},
"systemMetadata": {
@@ -551,7 +450,23 @@
},
{
"entityType": "dataset",
- "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.main.default.quickstart_table,PROD)",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.hive_metastore.bronze_kambi.view1,PROD)",
+ "changeType": "UPSERT",
+ "aspectName": "container",
+ "aspect": {
+ "json": {
+ "container": "urn:li:container:1b3927f927ada651ce5fe3fb84227f8e"
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1638860400000,
+ "runId": "unity-catalog-test",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "container",
+ "entityUrn": "urn:li:container:1b3927f927ada651ce5fe3fb84227f8e",
"changeType": "UPSERT",
"aspectName": "browsePathsV2",
"aspect": {
@@ -562,12 +477,8 @@
"urn": "urn:li:container:22ec33be0e53ba3e61bb6c4ad58f6ffb"
},
{
- "id": "urn:li:container:83d98e62e36bddc3596c2b738e23b596",
- "urn": "urn:li:container:83d98e62e36bddc3596c2b738e23b596"
- },
- {
- "id": "urn:li:container:5ada0a9773235325e506410c512feabb",
- "urn": "urn:li:container:5ada0a9773235325e506410c512feabb"
+ "id": "urn:li:container:6d6f608f9f945f2862d99b855bdd3202",
+ "urn": "urn:li:container:6d6f608f9f945f2862d99b855bdd3202"
}
]
}
@@ -579,22 +490,33 @@
}
},
{
- "entityType": "container",
- "entityUrn": "urn:li:container:0e09e6ec299ef004941e25221d3ef6b6",
+ "entityType": "dataset",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.hive_metastore.bronze_kambi.view1,PROD)",
"changeType": "UPSERT",
- "aspectName": "containerProperties",
+ "aspectName": "datasetProperties",
"aspect": {
"json": {
"customProperties": {
- "platform": "databricks",
- "env": "PROD",
- "metastore": "acryl metastore",
- "catalog": "main",
- "unity_schema": "information_schema"
+ "table_type": "HIVE_VIEW",
+ "Catalog": "hive_metastore",
+ "Database": "bronze_kambi",
+ "Table": "view1",
+ "Last Access": "UNKNOWN",
+ "Created By": "Spark 3.2.1",
+ "Owner": "root",
+ "table_id": "hive_metastore.bronze_kambi.view1",
+ "created_at": "2022-06-22 05:14:56"
+ },
+ "externalUrl": "https://dummy.cloud.databricks.com/explore/data/hive_metastore/bronze_kambi/view1",
+ "name": "view1",
+ "qualifiedName": "hive_metastore.bronze_kambi.view1",
+ "created": {
+ "time": 1655874896000
},
- "externalUrl": "https://dummy.cloud.databricks.com/explore/data/main/information_schema",
- "name": "information_schema",
- "description": "Information schema (auto-created)"
+ "lastModified": {
+ "time": 1655874896000
+ },
+ "tags": []
}
},
"systemMetadata": {
@@ -604,13 +526,15 @@
}
},
{
- "entityType": "container",
- "entityUrn": "urn:li:container:0e09e6ec299ef004941e25221d3ef6b6",
+ "entityType": "dataset",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.hive_metastore.bronze_kambi.view1,PROD)",
"changeType": "UPSERT",
- "aspectName": "status",
+ "aspectName": "viewProperties",
"aspect": {
"json": {
- "removed": false
+ "materialized": false,
+ "viewLogic": "CREATE VIEW `hive_metastore`.`bronze_kambi`.`view1` AS SELECT * FROM `hive_metastore`.`bronze_kambi`.`bet`",
+ "viewLanguage": "SQL"
}
},
"systemMetadata": {
@@ -621,13 +545,22 @@
},
{
"entityType": "container",
- "entityUrn": "urn:li:container:0e09e6ec299ef004941e25221d3ef6b6",
+ "entityUrn": "urn:li:container:5ada0a9773235325e506410c512feabb",
"changeType": "UPSERT",
- "aspectName": "dataPlatformInstance",
+ "aspectName": "containerProperties",
"aspect": {
"json": {
- "platform": "urn:li:dataPlatform:databricks"
- }
+ "customProperties": {
+ "platform": "databricks",
+ "env": "PROD",
+ "metastore": "acryl metastore",
+ "catalog": "main",
+ "unity_schema": "default"
+ },
+ "externalUrl": "https://dummy.cloud.databricks.com/explore/data/main/default",
+ "name": "default",
+ "description": "Default schema (auto-created)"
+ }
},
"systemMetadata": {
"lastObserved": 1638860400000,
@@ -636,14 +569,14 @@
}
},
{
- "entityType": "container",
- "entityUrn": "urn:li:container:0e09e6ec299ef004941e25221d3ef6b6",
+ "entityType": "dataset",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.hive_metastore.bronze_kambi.view1,PROD)",
"changeType": "UPSERT",
"aspectName": "subTypes",
"aspect": {
"json": {
"typeNames": [
- "Schema"
+ "View"
]
}
},
@@ -654,49 +587,8 @@
}
},
{
- "entityType": "container",
- "entityUrn": "urn:li:container:0e09e6ec299ef004941e25221d3ef6b6",
- "changeType": "UPSERT",
- "aspectName": "ownership",
- "aspect": {
- "json": {
- "owners": [
- {
- "owner": "urn:li:corpuser:Service Principal 1",
- "type": "DATAOWNER"
- }
- ],
- "lastModified": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
- }
- }
- },
- "systemMetadata": {
- "lastObserved": 1638860400000,
- "runId": "unity-catalog-test",
- "lastRunId": "no-run-id-provided"
- }
-},
-{
- "entityType": "container",
- "entityUrn": "urn:li:container:0e09e6ec299ef004941e25221d3ef6b6",
- "changeType": "UPSERT",
- "aspectName": "container",
- "aspect": {
- "json": {
- "container": "urn:li:container:83d98e62e36bddc3596c2b738e23b596"
- }
- },
- "systemMetadata": {
- "lastObserved": 1638860400000,
- "runId": "unity-catalog-test",
- "lastRunId": "no-run-id-provided"
- }
-},
-{
- "entityType": "container",
- "entityUrn": "urn:li:container:0e09e6ec299ef004941e25221d3ef6b6",
+ "entityType": "dataset",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.hive_metastore.bronze_kambi.view1,PROD)",
"changeType": "UPSERT",
"aspectName": "browsePathsV2",
"aspect": {
@@ -707,8 +599,12 @@
"urn": "urn:li:container:22ec33be0e53ba3e61bb6c4ad58f6ffb"
},
{
- "id": "urn:li:container:83d98e62e36bddc3596c2b738e23b596",
- "urn": "urn:li:container:83d98e62e36bddc3596c2b738e23b596"
+ "id": "urn:li:container:6d6f608f9f945f2862d99b855bdd3202",
+ "urn": "urn:li:container:6d6f608f9f945f2862d99b855bdd3202"
+ },
+ {
+ "id": "urn:li:container:1b3927f927ada651ce5fe3fb84227f8e",
+ "urn": "urn:li:container:1b3927f927ada651ce5fe3fb84227f8e"
}
]
}
@@ -720,74 +616,13 @@
}
},
{
- "entityType": "dataset",
- "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.main.information_schema.quickstart_table,PROD)",
- "changeType": "UPSERT",
- "aspectName": "container",
- "aspect": {
- "json": {
- "container": "urn:li:container:0e09e6ec299ef004941e25221d3ef6b6"
- }
- },
- "systemMetadata": {
- "lastObserved": 1638860400000,
- "runId": "unity-catalog-test",
- "lastRunId": "no-run-id-provided"
- }
-},
-{
- "entityType": "dataset",
- "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.main.information_schema.quickstart_table,PROD)",
- "changeType": "UPSERT",
- "aspectName": "datasetProperties",
- "aspect": {
- "json": {
- "customProperties": {
- "storage_location": "s3://db-02eec1f70bfe4115445be9fdb1aac6ac-s3-root-bucket/metastore/2c983545-d403-4f87-9063-5b7e3b6d3736/tables/cff27aa1-1c6a-4d78-b713-562c660c2896",
- "data_source_format": "DELTA",
- "generation": "2",
- "table_type": "MANAGED",
- "created_by": "abc@acryl.io",
- "created_at": "2022-10-19 13:21:38.688000+00:00",
- "delta.lastCommitTimestamp": "1666185711000",
- "delta.lastUpdateVersion": "1",
- "delta.minReaderVersion": "1",
- "delta.minWriterVersion": "2",
- "table_id": "cff27aa1-1c6a-4d78-b713-562c660c2896",
- "owner": "account users",
- "updated_by": "abc@acryl.io",
- "updated_at": "2022-10-19 13:27:29.633000+00:00"
- },
- "externalUrl": "https://dummy.cloud.databricks.com/explore/data/main/information_schema/quickstart_table",
- "name": "quickstart_table",
- "qualifiedName": "main.information_schema.quickstart_table",
- "created": {
- "time": 1666185698688,
- "actor": "urn:li:corpuser:abc@acryl.io"
- },
- "lastModified": {
- "time": 1666186049633,
- "actor": "urn:li:corpuser:abc@acryl.io"
- },
- "tags": []
- }
- },
- "systemMetadata": {
- "lastObserved": 1638860400000,
- "runId": "unity-catalog-test",
- "lastRunId": "no-run-id-provided"
- }
-},
-{
- "entityType": "dataset",
- "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.main.information_schema.quickstart_table,PROD)",
+ "entityType": "container",
+ "entityUrn": "urn:li:container:5ada0a9773235325e506410c512feabb",
"changeType": "UPSERT",
- "aspectName": "subTypes",
+ "aspectName": "status",
"aspect": {
"json": {
- "typeNames": [
- "Table"
- ]
+ "removed": false
}
},
"systemMetadata": {
@@ -798,12 +633,12 @@
},
{
"entityType": "dataset",
- "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.main.information_schema.quickstart_table,PROD)",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.hive_metastore.bronze_kambi.view1,PROD)",
"changeType": "UPSERT",
"aspectName": "schemaMetadata",
"aspect": {
"json": {
- "schemaName": "acryl_metastore.main.information_schema.quickstart_table",
+ "schemaName": "hive_metastore.bronze_kambi.view1",
"platform": "urn:li:dataPlatform:databricks",
"version": 0,
"created": {
@@ -822,144 +657,409 @@
},
"fields": [
{
- "fieldPath": "columnA",
- "nullable": true,
+ "fieldPath": "betStatusId",
+ "nullable": false,
"type": {
"type": {
"com.linkedin.schema.NumberType": {}
}
},
- "nativeDataType": "int",
+ "nativeDataType": "bigint",
"recursive": false,
"isPartOfKey": false
},
{
- "fieldPath": "columnB",
- "nullable": true,
+ "fieldPath": "channelId",
+ "nullable": false,
"type": {
"type": {
- "com.linkedin.schema.StringType": {}
+ "com.linkedin.schema.NumberType": {}
}
},
- "nativeDataType": "string",
+ "nativeDataType": "bigint",
"recursive": false,
"isPartOfKey": false
- }
- ]
- }
- },
- "systemMetadata": {
- "lastObserved": 1638860400000,
- "runId": "unity-catalog-test",
- "lastRunId": "no-run-id-provided"
- }
-},
-{
- "entityType": "dataset",
- "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.main.information_schema.quickstart_table,PROD)",
- "changeType": "UPSERT",
- "aspectName": "ownership",
- "aspect": {
- "json": {
- "owners": [
+ },
{
- "owner": "urn:li:corpuser:account users",
- "type": "DATAOWNER"
- }
- ],
- "lastModified": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
- }
- }
- },
- "systemMetadata": {
- "lastObserved": 1638860400000,
- "runId": "unity-catalog-test",
- "lastRunId": "no-run-id-provided"
- }
-},
-{
- "entityType": "dataset",
- "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.main.information_schema.quickstart_table,PROD)",
- "changeType": "UPSERT",
- "aspectName": "browsePathsV2",
- "aspect": {
- "json": {
- "path": [
+ "fieldPath": "[version=2.0].[type=struct].[type=struct].combination",
+ "nullable": false,
+ "type": {
+ "type": {
+ "com.linkedin.schema.RecordType": {}
+ }
+ },
+ "nativeDataType": "struct>,eventid:bigint,eventname:string,eventstartdate:string,live:boolean,odds:double,outcomeids:array,outcomelabel:string,sportid:string,status:string,voidreason:string>>,payout:double,rewardextrapayout:double,stake:double>",
+ "recursive": false,
+ "isPartOfKey": false,
+ "jsonProps": "{\"native_data_type\": \"struct>,eventid:bigint,eventname:string,eventstartdate:string,live:boolean,odds:double,outcomeids:array,outcomelabel:string,sportid:string,status:string,voidreason:string>>,payout:double,rewardextrapayout:double,stake:double>\"}"
+ },
{
- "id": "urn:li:container:22ec33be0e53ba3e61bb6c4ad58f6ffb",
- "urn": "urn:li:container:22ec33be0e53ba3e61bb6c4ad58f6ffb"
+ "fieldPath": "[version=2.0].[type=struct].[type=struct].combination.[type=long].combinationref",
+ "nullable": true,
+ "type": {
+ "type": {
+ "com.linkedin.schema.NumberType": {}
+ }
+ },
+ "nativeDataType": "bigint",
+ "recursive": false,
+ "isPartOfKey": false,
+ "jsonProps": "{\"native_data_type\": \"bigint\", \"_nullable\": true}"
},
{
- "id": "urn:li:container:83d98e62e36bddc3596c2b738e23b596",
- "urn": "urn:li:container:83d98e62e36bddc3596c2b738e23b596"
+ "fieldPath": "[version=2.0].[type=struct].[type=struct].combination.[type=double].currentodds",
+ "nullable": true,
+ "type": {
+ "type": {
+ "com.linkedin.schema.NumberType": {}
+ }
+ },
+ "nativeDataType": "double",
+ "recursive": false,
+ "isPartOfKey": false,
+ "jsonProps": "{\"native_data_type\": \"double\", \"_nullable\": true}"
},
{
- "id": "urn:li:container:0e09e6ec299ef004941e25221d3ef6b6",
- "urn": "urn:li:container:0e09e6ec299ef004941e25221d3ef6b6"
- }
- ]
- }
- },
- "systemMetadata": {
- "lastObserved": 1638860400000,
- "runId": "unity-catalog-test",
- "lastRunId": "no-run-id-provided"
- }
-},
-{
- "entityType": "container",
- "entityUrn": "urn:li:container:481380c5a355638fc626eca8380cdda9",
- "changeType": "UPSERT",
- "aspectName": "containerProperties",
- "aspect": {
- "json": {
- "customProperties": {
- "platform": "databricks",
- "env": "PROD",
- "metastore": "acryl metastore",
- "catalog": "main",
- "unity_schema": "quickstart_schema"
- },
- "externalUrl": "https://dummy.cloud.databricks.com/explore/data/main/quickstart_schema",
- "name": "quickstart_schema",
- "description": "A new Unity Catalog schema called quickstart_schema"
- }
- },
- "systemMetadata": {
- "lastObserved": 1638860400000,
- "runId": "unity-catalog-test",
- "lastRunId": "no-run-id-provided"
- }
-},
-{
- "entityType": "container",
- "entityUrn": "urn:li:container:481380c5a355638fc626eca8380cdda9",
- "changeType": "UPSERT",
- "aspectName": "status",
- "aspect": {
- "json": {
- "removed": false
- }
- },
- "systemMetadata": {
- "lastObserved": 1638860400000,
- "runId": "unity-catalog-test",
- "lastRunId": "no-run-id-provided"
- }
-},
-{
- "entityType": "container",
- "entityUrn": "urn:li:container:481380c5a355638fc626eca8380cdda9",
- "changeType": "UPSERT",
- "aspectName": "dataPlatformInstance",
- "aspect": {
- "json": {
- "platform": "urn:li:dataPlatform:databricks"
- }
- },
- "systemMetadata": {
+ "fieldPath": "[version=2.0].[type=struct].[type=struct].combination.[type=boolean].eachway",
+ "nullable": true,
+ "type": {
+ "type": {
+ "com.linkedin.schema.BooleanType": {}
+ }
+ },
+ "nativeDataType": "boolean",
+ "recursive": false,
+ "isPartOfKey": false,
+ "jsonProps": "{\"native_data_type\": \"boolean\", \"_nullable\": true}"
+ },
+ {
+ "fieldPath": "[version=2.0].[type=struct].[type=struct].combination.[type=boolean].livebetting",
+ "nullable": true,
+ "type": {
+ "type": {
+ "com.linkedin.schema.BooleanType": {}
+ }
+ },
+ "nativeDataType": "boolean",
+ "recursive": false,
+ "isPartOfKey": false,
+ "jsonProps": "{\"native_data_type\": \"boolean\", \"_nullable\": true}"
+ },
+ {
+ "fieldPath": "[version=2.0].[type=struct].[type=struct].combination.[type=double].odds",
+ "nullable": true,
+ "type": {
+ "type": {
+ "com.linkedin.schema.NumberType": {}
+ }
+ },
+ "nativeDataType": "double",
+ "recursive": false,
+ "isPartOfKey": false,
+ "jsonProps": "{\"native_data_type\": \"double\", \"_nullable\": true}"
+ },
+ {
+ "fieldPath": "[version=2.0].[type=struct].[type=struct].combination.[type=array].[type=struct].outcomes",
+ "nullable": false,
+ "type": {
+ "type": {
+ "com.linkedin.schema.ArrayType": {
+ "nestedType": [
+ "record"
+ ]
+ }
+ }
+ },
+ "nativeDataType": "array>,eventid:bigint,eventname:string,eventstartdate:string,live:boolean,odds:double,outcomeids:array,outcomelabel:string,sportid:string,status:string,voidreason:string>>",
+ "recursive": false,
+ "isPartOfKey": false,
+ "jsonProps": "{\"native_data_type\": \"array>,eventid:bigint,eventname:string,eventstartdate:string,live:boolean,odds:double,outcomeids:array,outcomelabel:string,sportid:string,status:string,voidreason:string>>\"}"
+ },
+ {
+ "fieldPath": "[version=2.0].[type=struct].[type=struct].combination.[type=array].[type=struct].outcomes.[type=long].betoffertypeid",
+ "nullable": true,
+ "type": {
+ "type": {
+ "com.linkedin.schema.NumberType": {}
+ }
+ },
+ "nativeDataType": "bigint",
+ "recursive": false,
+ "isPartOfKey": false,
+ "jsonProps": "{\"native_data_type\": \"bigint\", \"_nullable\": true}"
+ },
+ {
+ "fieldPath": "[version=2.0].[type=struct].[type=struct].combination.[type=array].[type=struct].outcomes.[type=long].criterionid",
+ "nullable": true,
+ "type": {
+ "type": {
+ "com.linkedin.schema.NumberType": {}
+ }
+ },
+ "nativeDataType": "bigint",
+ "recursive": false,
+ "isPartOfKey": false,
+ "jsonProps": "{\"native_data_type\": \"bigint\", \"_nullable\": true}"
+ },
+ {
+ "fieldPath": "[version=2.0].[type=struct].[type=struct].combination.[type=array].[type=struct].outcomes.[type=string].criterionname",
+ "nullable": true,
+ "type": {
+ "type": {
+ "com.linkedin.schema.StringType": {}
+ }
+ },
+ "nativeDataType": "string",
+ "recursive": false,
+ "isPartOfKey": false,
+ "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}"
+ },
+ {
+ "fieldPath": "[version=2.0].[type=struct].[type=struct].combination.[type=array].[type=struct].outcomes.[type=double].currentodds",
+ "nullable": true,
+ "type": {
+ "type": {
+ "com.linkedin.schema.NumberType": {}
+ }
+ },
+ "nativeDataType": "double",
+ "recursive": false,
+ "isPartOfKey": false,
+ "jsonProps": "{\"native_data_type\": \"double\", \"_nullable\": true}"
+ },
+ {
+ "fieldPath": "[version=2.0].[type=struct].[type=struct].combination.[type=array].[type=struct].outcomes.[type=long].eventgroupid",
+ "nullable": true,
+ "type": {
+ "type": {
+ "com.linkedin.schema.NumberType": {}
+ }
+ },
+ "nativeDataType": "bigint",
+ "recursive": false,
+ "isPartOfKey": false,
+ "jsonProps": "{\"native_data_type\": \"bigint\", \"_nullable\": true}"
+ },
+ {
+ "fieldPath": "[version=2.0].[type=struct].[type=struct].combination.[type=array].[type=struct].outcomes.[type=array].[type=struct].eventgrouppath",
+ "nullable": false,
+ "type": {
+ "type": {
+ "com.linkedin.schema.ArrayType": {
+ "nestedType": [
+ "record"
+ ]
+ }
+ }
+ },
+ "nativeDataType": "array>",
+ "recursive": false,
+ "isPartOfKey": false,
+ "jsonProps": "{\"native_data_type\": \"array>\"}"
+ },
+ {
+ "fieldPath": "[version=2.0].[type=struct].[type=struct].combination.[type=array].[type=struct].outcomes.[type=array].[type=struct].eventgrouppath.[type=long].id",
+ "nullable": true,
+ "type": {
+ "type": {
+ "com.linkedin.schema.NumberType": {}
+ }
+ },
+ "nativeDataType": "bigint",
+ "recursive": false,
+ "isPartOfKey": false,
+ "jsonProps": "{\"native_data_type\": \"bigint\", \"_nullable\": true}"
+ },
+ {
+ "fieldPath": "[version=2.0].[type=struct].[type=struct].combination.[type=array].[type=struct].outcomes.[type=array].[type=struct].eventgrouppath.[type=string].name",
+ "nullable": true,
+ "type": {
+ "type": {
+ "com.linkedin.schema.StringType": {}
+ }
+ },
+ "nativeDataType": "string",
+ "recursive": false,
+ "isPartOfKey": false,
+ "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}"
+ },
+ {
+ "fieldPath": "[version=2.0].[type=struct].[type=struct].combination.[type=array].[type=struct].outcomes.[type=long].eventid",
+ "nullable": true,
+ "type": {
+ "type": {
+ "com.linkedin.schema.NumberType": {}
+ }
+ },
+ "nativeDataType": "bigint",
+ "recursive": false,
+ "isPartOfKey": false,
+ "jsonProps": "{\"native_data_type\": \"bigint\", \"_nullable\": true}"
+ },
+ {
+ "fieldPath": "[version=2.0].[type=struct].[type=struct].combination.[type=array].[type=struct].outcomes.[type=string].eventname",
+ "nullable": true,
+ "type": {
+ "type": {
+ "com.linkedin.schema.StringType": {}
+ }
+ },
+ "nativeDataType": "string",
+ "recursive": false,
+ "isPartOfKey": false,
+ "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}"
+ },
+ {
+ "fieldPath": "[version=2.0].[type=struct].[type=struct].combination.[type=array].[type=struct].outcomes.[type=string].eventstartdate",
+ "nullable": true,
+ "type": {
+ "type": {
+ "com.linkedin.schema.StringType": {}
+ }
+ },
+ "nativeDataType": "string",
+ "recursive": false,
+ "isPartOfKey": false,
+ "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}"
+ },
+ {
+ "fieldPath": "[version=2.0].[type=struct].[type=struct].combination.[type=array].[type=struct].outcomes.[type=boolean].live",
+ "nullable": true,
+ "type": {
+ "type": {
+ "com.linkedin.schema.BooleanType": {}
+ }
+ },
+ "nativeDataType": "boolean",
+ "recursive": false,
+ "isPartOfKey": false,
+ "jsonProps": "{\"native_data_type\": \"boolean\", \"_nullable\": true}"
+ },
+ {
+ "fieldPath": "[version=2.0].[type=struct].[type=struct].combination.[type=array].[type=struct].outcomes.[type=double].odds",
+ "nullable": true,
+ "type": {
+ "type": {
+ "com.linkedin.schema.NumberType": {}
+ }
+ },
+ "nativeDataType": "double",
+ "recursive": false,
+ "isPartOfKey": false,
+ "jsonProps": "{\"native_data_type\": \"double\", \"_nullable\": true}"
+ },
+ {
+ "fieldPath": "[version=2.0].[type=struct].[type=struct].combination.[type=array].[type=struct].outcomes.[type=array].[type=long].outcomeids",
+ "nullable": false,
+ "type": {
+ "type": {
+ "com.linkedin.schema.ArrayType": {
+ "nestedType": [
+ "long"
+ ]
+ }
+ }
+ },
+ "nativeDataType": "array",
+ "recursive": false,
+ "isPartOfKey": false,
+ "jsonProps": "{\"native_data_type\": \"array\"}"
+ },
+ {
+ "fieldPath": "[version=2.0].[type=struct].[type=struct].combination.[type=array].[type=struct].outcomes.[type=string].outcomelabel",
+ "nullable": true,
+ "type": {
+ "type": {
+ "com.linkedin.schema.StringType": {}
+ }
+ },
+ "nativeDataType": "string",
+ "recursive": false,
+ "isPartOfKey": false,
+ "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}"
+ },
+ {
+ "fieldPath": "[version=2.0].[type=struct].[type=struct].combination.[type=array].[type=struct].outcomes.[type=string].sportid",
+ "nullable": true,
+ "type": {
+ "type": {
+ "com.linkedin.schema.StringType": {}
+ }
+ },
+ "nativeDataType": "string",
+ "recursive": false,
+ "isPartOfKey": false,
+ "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}"
+ },
+ {
+ "fieldPath": "[version=2.0].[type=struct].[type=struct].combination.[type=array].[type=struct].outcomes.[type=string].status",
+ "nullable": true,
+ "type": {
+ "type": {
+ "com.linkedin.schema.StringType": {}
+ }
+ },
+ "nativeDataType": "string",
+ "recursive": false,
+ "isPartOfKey": false,
+ "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}"
+ },
+ {
+ "fieldPath": "[version=2.0].[type=struct].[type=struct].combination.[type=array].[type=struct].outcomes.[type=string].voidreason",
+ "nullable": true,
+ "type": {
+ "type": {
+ "com.linkedin.schema.StringType": {}
+ }
+ },
+ "nativeDataType": "string",
+ "recursive": false,
+ "isPartOfKey": false,
+ "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}"
+ },
+ {
+ "fieldPath": "[version=2.0].[type=struct].[type=struct].combination.[type=double].payout",
+ "nullable": true,
+ "type": {
+ "type": {
+ "com.linkedin.schema.NumberType": {}
+ }
+ },
+ "nativeDataType": "double",
+ "recursive": false,
+ "isPartOfKey": false,
+ "jsonProps": "{\"native_data_type\": \"double\", \"_nullable\": true}"
+ },
+ {
+ "fieldPath": "[version=2.0].[type=struct].[type=struct].combination.[type=double].rewardextrapayout",
+ "nullable": true,
+ "type": {
+ "type": {
+ "com.linkedin.schema.NumberType": {}
+ }
+ },
+ "nativeDataType": "double",
+ "recursive": false,
+ "isPartOfKey": false,
+ "jsonProps": "{\"native_data_type\": \"double\", \"_nullable\": true}"
+ },
+ {
+ "fieldPath": "[version=2.0].[type=struct].[type=struct].combination.[type=double].stake",
+ "nullable": true,
+ "type": {
+ "type": {
+ "com.linkedin.schema.NumberType": {}
+ }
+ },
+ "nativeDataType": "double",
+ "recursive": false,
+ "isPartOfKey": false,
+ "jsonProps": "{\"native_data_type\": \"double\", \"_nullable\": true}"
+ }
+ ]
+ }
+ },
+ "systemMetadata": {
"lastObserved": 1638860400000,
"runId": "unity-catalog-test",
"lastRunId": "no-run-id-provided"
@@ -967,7 +1067,23 @@
},
{
"entityType": "container",
- "entityUrn": "urn:li:container:481380c5a355638fc626eca8380cdda9",
+ "entityUrn": "urn:li:container:5ada0a9773235325e506410c512feabb",
+ "changeType": "UPSERT",
+ "aspectName": "dataPlatformInstance",
+ "aspect": {
+ "json": {
+ "platform": "urn:li:dataPlatform:databricks"
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1638860400000,
+ "runId": "unity-catalog-test",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "container",
+ "entityUrn": "urn:li:container:5ada0a9773235325e506410c512feabb",
"changeType": "UPSERT",
"aspectName": "subTypes",
"aspect": {
@@ -985,14 +1101,14 @@
},
{
"entityType": "container",
- "entityUrn": "urn:li:container:481380c5a355638fc626eca8380cdda9",
+ "entityUrn": "urn:li:container:5ada0a9773235325e506410c512feabb",
"changeType": "UPSERT",
"aspectName": "ownership",
"aspect": {
"json": {
"owners": [
{
- "owner": "urn:li:corpuser:account users",
+ "owner": "urn:li:corpuser:abc@acryl.io",
"type": "DATAOWNER"
}
],
@@ -1009,13 +1125,13 @@
}
},
{
- "entityType": "container",
- "entityUrn": "urn:li:container:481380c5a355638fc626eca8380cdda9",
+ "entityType": "dataset",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.hive_metastore.bronze_kambi.bet,PROD)",
"changeType": "UPSERT",
"aspectName": "container",
"aspect": {
"json": {
- "container": "urn:li:container:83d98e62e36bddc3596c2b738e23b596"
+ "container": "urn:li:container:1b3927f927ada651ce5fe3fb84227f8e"
}
},
"systemMetadata": {
@@ -1026,37 +1142,12 @@
},
{
"entityType": "container",
- "entityUrn": "urn:li:container:481380c5a355638fc626eca8380cdda9",
- "changeType": "UPSERT",
- "aspectName": "browsePathsV2",
- "aspect": {
- "json": {
- "path": [
- {
- "id": "urn:li:container:22ec33be0e53ba3e61bb6c4ad58f6ffb",
- "urn": "urn:li:container:22ec33be0e53ba3e61bb6c4ad58f6ffb"
- },
- {
- "id": "urn:li:container:83d98e62e36bddc3596c2b738e23b596",
- "urn": "urn:li:container:83d98e62e36bddc3596c2b738e23b596"
- }
- ]
- }
- },
- "systemMetadata": {
- "lastObserved": 1638860400000,
- "runId": "unity-catalog-test",
- "lastRunId": "no-run-id-provided"
- }
-},
-{
- "entityType": "dataset",
- "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.main.quickstart_schema.quickstart_table,PROD)",
+ "entityUrn": "urn:li:container:5ada0a9773235325e506410c512feabb",
"changeType": "UPSERT",
"aspectName": "container",
"aspect": {
"json": {
- "container": "urn:li:container:481380c5a355638fc626eca8380cdda9"
+ "container": "urn:li:container:83d98e62e36bddc3596c2b738e23b596"
}
},
"systemMetadata": {
@@ -1067,37 +1158,34 @@
},
{
"entityType": "dataset",
- "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.main.quickstart_schema.quickstart_table,PROD)",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.hive_metastore.bronze_kambi.bet,PROD)",
"changeType": "UPSERT",
"aspectName": "datasetProperties",
"aspect": {
"json": {
"customProperties": {
- "storage_location": "s3://db-02eec1f70bfe4115445be9fdb1aac6ac-s3-root-bucket/metastore/2c983545-d403-4f87-9063-5b7e3b6d3736/tables/cff27aa1-1c6a-4d78-b713-562c660c2896",
+ "storage_location": "dbfs:/user/hive/warehouse/bronze_kambi.db/bet",
"data_source_format": "DELTA",
- "generation": "2",
- "table_type": "MANAGED",
- "created_by": "abc@acryl.io",
- "created_at": "2022-10-19 13:21:38.688000+00:00",
- "delta.lastCommitTimestamp": "1666185711000",
- "delta.lastUpdateVersion": "1",
- "delta.minReaderVersion": "1",
- "delta.minWriterVersion": "2",
- "table_id": "cff27aa1-1c6a-4d78-b713-562c660c2896",
- "owner": "account users",
- "updated_by": "abc@acryl.io",
- "updated_at": "2022-10-19 13:27:29.633000+00:00"
- },
- "externalUrl": "https://dummy.cloud.databricks.com/explore/data/main/quickstart_schema/quickstart_table",
- "name": "quickstart_table",
- "qualifiedName": "main.quickstart_schema.quickstart_table",
+ "table_type": "HIVE_MANAGED_TABLE",
+ "Catalog": "hive_metastore",
+ "Database": "bronze_kambi",
+ "Table": "bet",
+ "Last Access": "UNKNOWN",
+ "Created By": "Spark 3.2.1",
+ "Owner": "root",
+ "Is_managed_location": "true",
+ "Table Properties": "[delta.autoOptimize.autoCompact=true,delta.autoOptimize.optimizeWrite=true,delta.minReaderVersion=1,delta.minWriterVersion=2]",
+ "table_id": "hive_metastore.bronze_kambi.bet",
+ "created_at": "2022-06-22 05:14:56"
+ },
+ "externalUrl": "https://dummy.cloud.databricks.com/explore/data/hive_metastore/bronze_kambi/bet",
+ "name": "bet",
+ "qualifiedName": "hive_metastore.bronze_kambi.bet",
"created": {
- "time": 1666185698688,
- "actor": "urn:li:corpuser:abc@acryl.io"
+ "time": 1655874896000
},
"lastModified": {
- "time": 1666186049633,
- "actor": "urn:li:corpuser:abc@acryl.io"
+ "time": 1655874896000
},
"tags": []
}
@@ -1110,7 +1198,7 @@
},
{
"entityType": "dataset",
- "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.main.quickstart_schema.quickstart_table,PROD)",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.hive_metastore.bronze_kambi.bet,PROD)",
"changeType": "UPSERT",
"aspectName": "subTypes",
"aspect": {
@@ -1127,53 +1215,20 @@
}
},
{
- "entityType": "dataset",
- "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.main.quickstart_schema.quickstart_table,PROD)",
+ "entityType": "container",
+ "entityUrn": "urn:li:container:5ada0a9773235325e506410c512feabb",
"changeType": "UPSERT",
- "aspectName": "schemaMetadata",
+ "aspectName": "browsePathsV2",
"aspect": {
"json": {
- "schemaName": "acryl_metastore.main.quickstart_schema.quickstart_table",
- "platform": "urn:li:dataPlatform:databricks",
- "version": 0,
- "created": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
- },
- "lastModified": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
- },
- "hash": "",
- "platformSchema": {
- "com.linkedin.schema.MySqlDDL": {
- "tableSchema": ""
- }
- },
- "fields": [
+ "path": [
{
- "fieldPath": "columnA",
- "nullable": true,
- "type": {
- "type": {
- "com.linkedin.schema.NumberType": {}
- }
- },
- "nativeDataType": "int",
- "recursive": false,
- "isPartOfKey": false
+ "id": "urn:li:container:22ec33be0e53ba3e61bb6c4ad58f6ffb",
+ "urn": "urn:li:container:22ec33be0e53ba3e61bb6c4ad58f6ffb"
},
{
- "fieldPath": "columnB",
- "nullable": true,
- "type": {
- "type": {
- "com.linkedin.schema.StringType": {}
- }
- },
- "nativeDataType": "string",
- "recursive": false,
- "isPartOfKey": false
+ "id": "urn:li:container:83d98e62e36bddc3596c2b738e23b596",
+ "urn": "urn:li:container:83d98e62e36bddc3596c2b738e23b596"
}
]
}
@@ -1186,32 +1241,7 @@
},
{
"entityType": "dataset",
- "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.main.quickstart_schema.quickstart_table,PROD)",
- "changeType": "UPSERT",
- "aspectName": "ownership",
- "aspect": {
- "json": {
- "owners": [
- {
- "owner": "urn:li:corpuser:account users",
- "type": "DATAOWNER"
- }
- ],
- "lastModified": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
- }
- }
- },
- "systemMetadata": {
- "lastObserved": 1638860400000,
- "runId": "unity-catalog-test",
- "lastRunId": "no-run-id-provided"
- }
-},
-{
- "entityType": "dataset",
- "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.main.quickstart_schema.quickstart_table,PROD)",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.hive_metastore.bronze_kambi.bet,PROD)",
"changeType": "UPSERT",
"aspectName": "browsePathsV2",
"aspect": {
@@ -1222,12 +1252,12 @@
"urn": "urn:li:container:22ec33be0e53ba3e61bb6c4ad58f6ffb"
},
{
- "id": "urn:li:container:83d98e62e36bddc3596c2b738e23b596",
- "urn": "urn:li:container:83d98e62e36bddc3596c2b738e23b596"
+ "id": "urn:li:container:6d6f608f9f945f2862d99b855bdd3202",
+ "urn": "urn:li:container:6d6f608f9f945f2862d99b855bdd3202"
},
{
- "id": "urn:li:container:481380c5a355638fc626eca8380cdda9",
- "urn": "urn:li:container:481380c5a355638fc626eca8380cdda9"
+ "id": "urn:li:container:1b3927f927ada651ce5fe3fb84227f8e",
+ "urn": "urn:li:container:1b3927f927ada651ce5fe3fb84227f8e"
}
]
}
@@ -1239,272 +1269,429 @@
}
},
{
- "entityType": "container",
- "entityUrn": "urn:li:container:079ede9d4f0640985a8ccf8eb180e965",
+ "entityType": "dataset",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.hive_metastore.bronze_kambi.bet,PROD)",
"changeType": "UPSERT",
- "aspectName": "containerProperties",
+ "aspectName": "schemaMetadata",
"aspect": {
"json": {
- "customProperties": {
- "platform": "databricks",
- "env": "PROD",
- "metastore": "acryl metastore",
- "catalog": "quickstart_catalog"
+ "schemaName": "hive_metastore.bronze_kambi.bet",
+ "platform": "urn:li:dataPlatform:databricks",
+ "version": 0,
+ "created": {
+ "time": 0,
+ "actor": "urn:li:corpuser:unknown"
},
- "externalUrl": "https://dummy.cloud.databricks.com/explore/data/quickstart_catalog",
- "name": "quickstart_catalog",
- "description": ""
- }
- },
- "systemMetadata": {
- "lastObserved": 1638860400000,
- "runId": "unity-catalog-test",
- "lastRunId": "no-run-id-provided"
- }
-},
-{
- "entityType": "container",
- "entityUrn": "urn:li:container:079ede9d4f0640985a8ccf8eb180e965",
- "changeType": "UPSERT",
- "aspectName": "status",
- "aspect": {
- "json": {
- "removed": false
- }
- },
- "systemMetadata": {
- "lastObserved": 1638860400000,
- "runId": "unity-catalog-test",
- "lastRunId": "no-run-id-provided"
- }
-},
-{
- "entityType": "container",
- "entityUrn": "urn:li:container:079ede9d4f0640985a8ccf8eb180e965",
- "changeType": "UPSERT",
- "aspectName": "dataPlatformInstance",
- "aspect": {
- "json": {
- "platform": "urn:li:dataPlatform:databricks"
- }
- },
- "systemMetadata": {
- "lastObserved": 1638860400000,
- "runId": "unity-catalog-test",
- "lastRunId": "no-run-id-provided"
- }
-},
-{
- "entityType": "container",
- "entityUrn": "urn:li:container:079ede9d4f0640985a8ccf8eb180e965",
- "changeType": "UPSERT",
- "aspectName": "subTypes",
- "aspect": {
- "json": {
- "typeNames": [
- "Catalog"
- ]
- }
- },
- "systemMetadata": {
- "lastObserved": 1638860400000,
- "runId": "unity-catalog-test",
- "lastRunId": "no-run-id-provided"
- }
-},
-{
- "entityType": "container",
- "entityUrn": "urn:li:container:079ede9d4f0640985a8ccf8eb180e965",
- "changeType": "UPSERT",
- "aspectName": "ownership",
- "aspect": {
- "json": {
- "owners": [
- {
- "owner": "urn:li:corpuser:account users",
- "type": "DATAOWNER"
- }
- ],
"lastModified": {
"time": 0,
"actor": "urn:li:corpuser:unknown"
- }
- }
- },
- "systemMetadata": {
- "lastObserved": 1638860400000,
- "runId": "unity-catalog-test",
- "lastRunId": "no-run-id-provided"
- }
-},
-{
- "entityType": "container",
- "entityUrn": "urn:li:container:079ede9d4f0640985a8ccf8eb180e965",
- "changeType": "UPSERT",
- "aspectName": "container",
- "aspect": {
- "json": {
- "container": "urn:li:container:22ec33be0e53ba3e61bb6c4ad58f6ffb"
- }
- },
- "systemMetadata": {
- "lastObserved": 1638860400000,
- "runId": "unity-catalog-test",
- "lastRunId": "no-run-id-provided"
- }
-},
-{
- "entityType": "container",
- "entityUrn": "urn:li:container:079ede9d4f0640985a8ccf8eb180e965",
- "changeType": "UPSERT",
- "aspectName": "browsePathsV2",
- "aspect": {
- "json": {
- "path": [
- {
- "id": "urn:li:container:22ec33be0e53ba3e61bb6c4ad58f6ffb",
- "urn": "urn:li:container:22ec33be0e53ba3e61bb6c4ad58f6ffb"
- }
- ]
- }
- },
- "systemMetadata": {
- "lastObserved": 1638860400000,
- "runId": "unity-catalog-test",
- "lastRunId": "no-run-id-provided"
- }
-},
-{
- "entityType": "container",
- "entityUrn": "urn:li:container:ce568b660cba2e1a4e811b010ac27f90",
- "changeType": "UPSERT",
- "aspectName": "containerProperties",
- "aspect": {
- "json": {
- "customProperties": {
- "platform": "databricks",
- "env": "PROD",
- "metastore": "acryl metastore",
- "catalog": "quickstart_catalog",
- "unity_schema": "default"
},
- "externalUrl": "https://dummy.cloud.databricks.com/explore/data/quickstart_catalog/default",
- "name": "default",
- "description": "Default schema (auto-created)"
- }
- },
- "systemMetadata": {
- "lastObserved": 1638860400000,
- "runId": "unity-catalog-test",
- "lastRunId": "no-run-id-provided"
- }
-},
-{
- "entityType": "container",
- "entityUrn": "urn:li:container:ce568b660cba2e1a4e811b010ac27f90",
- "changeType": "UPSERT",
- "aspectName": "status",
- "aspect": {
- "json": {
- "removed": false
- }
- },
- "systemMetadata": {
- "lastObserved": 1638860400000,
- "runId": "unity-catalog-test",
- "lastRunId": "no-run-id-provided"
- }
-},
-{
- "entityType": "container",
- "entityUrn": "urn:li:container:ce568b660cba2e1a4e811b010ac27f90",
- "changeType": "UPSERT",
- "aspectName": "dataPlatformInstance",
- "aspect": {
- "json": {
- "platform": "urn:li:dataPlatform:databricks"
- }
- },
- "systemMetadata": {
- "lastObserved": 1638860400000,
- "runId": "unity-catalog-test",
- "lastRunId": "no-run-id-provided"
- }
-},
-{
- "entityType": "container",
- "entityUrn": "urn:li:container:ce568b660cba2e1a4e811b010ac27f90",
- "changeType": "UPSERT",
- "aspectName": "subTypes",
- "aspect": {
- "json": {
- "typeNames": [
- "Schema"
- ]
- }
- },
- "systemMetadata": {
- "lastObserved": 1638860400000,
- "runId": "unity-catalog-test",
- "lastRunId": "no-run-id-provided"
- }
-},
-{
- "entityType": "container",
- "entityUrn": "urn:li:container:ce568b660cba2e1a4e811b010ac27f90",
- "changeType": "UPSERT",
- "aspectName": "ownership",
- "aspect": {
- "json": {
- "owners": [
+ "hash": "",
+ "platformSchema": {
+ "com.linkedin.schema.MySqlDDL": {
+ "tableSchema": ""
+ }
+ },
+ "fields": [
+ {
+ "fieldPath": "betStatusId",
+ "nullable": false,
+ "type": {
+ "type": {
+ "com.linkedin.schema.NumberType": {}
+ }
+ },
+ "nativeDataType": "bigint",
+ "recursive": false,
+ "isPartOfKey": false
+ },
+ {
+ "fieldPath": "channelId",
+ "nullable": false,
+ "type": {
+ "type": {
+ "com.linkedin.schema.NumberType": {}
+ }
+ },
+ "nativeDataType": "bigint",
+ "recursive": false,
+ "isPartOfKey": false
+ },
+ {
+ "fieldPath": "[version=2.0].[type=struct].[type=struct].combination",
+ "nullable": false,
+ "type": {
+ "type": {
+ "com.linkedin.schema.RecordType": {}
+ }
+ },
+ "nativeDataType": "struct>,eventid:bigint,eventname:string,eventstartdate:string,live:boolean,odds:double,outcomeids:array,outcomelabel:string,sportid:string,status:string,voidreason:string>>,payout:double,rewardextrapayout:double,stake:double>",
+ "recursive": false,
+ "isPartOfKey": false,
+ "jsonProps": "{\"native_data_type\": \"struct>,eventid:bigint,eventname:string,eventstartdate:string,live:boolean,odds:double,outcomeids:array,outcomelabel:string,sportid:string,status:string,voidreason:string>>,payout:double,rewardextrapayout:double,stake:double>\"}"
+ },
+ {
+ "fieldPath": "[version=2.0].[type=struct].[type=struct].combination.[type=long].combinationref",
+ "nullable": true,
+ "type": {
+ "type": {
+ "com.linkedin.schema.NumberType": {}
+ }
+ },
+ "nativeDataType": "bigint",
+ "recursive": false,
+ "isPartOfKey": false,
+ "jsonProps": "{\"native_data_type\": \"bigint\", \"_nullable\": true}"
+ },
+ {
+ "fieldPath": "[version=2.0].[type=struct].[type=struct].combination.[type=double].currentodds",
+ "nullable": true,
+ "type": {
+ "type": {
+ "com.linkedin.schema.NumberType": {}
+ }
+ },
+ "nativeDataType": "double",
+ "recursive": false,
+ "isPartOfKey": false,
+ "jsonProps": "{\"native_data_type\": \"double\", \"_nullable\": true}"
+ },
+ {
+ "fieldPath": "[version=2.0].[type=struct].[type=struct].combination.[type=boolean].eachway",
+ "nullable": true,
+ "type": {
+ "type": {
+ "com.linkedin.schema.BooleanType": {}
+ }
+ },
+ "nativeDataType": "boolean",
+ "recursive": false,
+ "isPartOfKey": false,
+ "jsonProps": "{\"native_data_type\": \"boolean\", \"_nullable\": true}"
+ },
+ {
+ "fieldPath": "[version=2.0].[type=struct].[type=struct].combination.[type=boolean].livebetting",
+ "nullable": true,
+ "type": {
+ "type": {
+ "com.linkedin.schema.BooleanType": {}
+ }
+ },
+ "nativeDataType": "boolean",
+ "recursive": false,
+ "isPartOfKey": false,
+ "jsonProps": "{\"native_data_type\": \"boolean\", \"_nullable\": true}"
+ },
+ {
+ "fieldPath": "[version=2.0].[type=struct].[type=struct].combination.[type=double].odds",
+ "nullable": true,
+ "type": {
+ "type": {
+ "com.linkedin.schema.NumberType": {}
+ }
+ },
+ "nativeDataType": "double",
+ "recursive": false,
+ "isPartOfKey": false,
+ "jsonProps": "{\"native_data_type\": \"double\", \"_nullable\": true}"
+ },
+ {
+ "fieldPath": "[version=2.0].[type=struct].[type=struct].combination.[type=array].[type=struct].outcomes",
+ "nullable": false,
+ "type": {
+ "type": {
+ "com.linkedin.schema.ArrayType": {
+ "nestedType": [
+ "record"
+ ]
+ }
+ }
+ },
+ "nativeDataType": "array>,eventid:bigint,eventname:string,eventstartdate:string,live:boolean,odds:double,outcomeids:array,outcomelabel:string,sportid:string,status:string,voidreason:string>>",
+ "recursive": false,
+ "isPartOfKey": false,
+ "jsonProps": "{\"native_data_type\": \"array>,eventid:bigint,eventname:string,eventstartdate:string,live:boolean,odds:double,outcomeids:array,outcomelabel:string,sportid:string,status:string,voidreason:string>>\"}"
+ },
+ {
+ "fieldPath": "[version=2.0].[type=struct].[type=struct].combination.[type=array].[type=struct].outcomes.[type=long].betoffertypeid",
+ "nullable": true,
+ "type": {
+ "type": {
+ "com.linkedin.schema.NumberType": {}
+ }
+ },
+ "nativeDataType": "bigint",
+ "recursive": false,
+ "isPartOfKey": false,
+ "jsonProps": "{\"native_data_type\": \"bigint\", \"_nullable\": true}"
+ },
+ {
+ "fieldPath": "[version=2.0].[type=struct].[type=struct].combination.[type=array].[type=struct].outcomes.[type=long].criterionid",
+ "nullable": true,
+ "type": {
+ "type": {
+ "com.linkedin.schema.NumberType": {}
+ }
+ },
+ "nativeDataType": "bigint",
+ "recursive": false,
+ "isPartOfKey": false,
+ "jsonProps": "{\"native_data_type\": \"bigint\", \"_nullable\": true}"
+ },
+ {
+ "fieldPath": "[version=2.0].[type=struct].[type=struct].combination.[type=array].[type=struct].outcomes.[type=string].criterionname",
+ "nullable": true,
+ "type": {
+ "type": {
+ "com.linkedin.schema.StringType": {}
+ }
+ },
+ "nativeDataType": "string",
+ "recursive": false,
+ "isPartOfKey": false,
+ "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}"
+ },
+ {
+ "fieldPath": "[version=2.0].[type=struct].[type=struct].combination.[type=array].[type=struct].outcomes.[type=double].currentodds",
+ "nullable": true,
+ "type": {
+ "type": {
+ "com.linkedin.schema.NumberType": {}
+ }
+ },
+ "nativeDataType": "double",
+ "recursive": false,
+ "isPartOfKey": false,
+ "jsonProps": "{\"native_data_type\": \"double\", \"_nullable\": true}"
+ },
+ {
+ "fieldPath": "[version=2.0].[type=struct].[type=struct].combination.[type=array].[type=struct].outcomes.[type=long].eventgroupid",
+ "nullable": true,
+ "type": {
+ "type": {
+ "com.linkedin.schema.NumberType": {}
+ }
+ },
+ "nativeDataType": "bigint",
+ "recursive": false,
+ "isPartOfKey": false,
+ "jsonProps": "{\"native_data_type\": \"bigint\", \"_nullable\": true}"
+ },
+ {
+ "fieldPath": "[version=2.0].[type=struct].[type=struct].combination.[type=array].[type=struct].outcomes.[type=array].[type=struct].eventgrouppath",
+ "nullable": false,
+ "type": {
+ "type": {
+ "com.linkedin.schema.ArrayType": {
+ "nestedType": [
+ "record"
+ ]
+ }
+ }
+ },
+ "nativeDataType": "array>",
+ "recursive": false,
+ "isPartOfKey": false,
+ "jsonProps": "{\"native_data_type\": \"array>\"}"
+ },
+ {
+ "fieldPath": "[version=2.0].[type=struct].[type=struct].combination.[type=array].[type=struct].outcomes.[type=array].[type=struct].eventgrouppath.[type=long].id",
+ "nullable": true,
+ "type": {
+ "type": {
+ "com.linkedin.schema.NumberType": {}
+ }
+ },
+ "nativeDataType": "bigint",
+ "recursive": false,
+ "isPartOfKey": false,
+ "jsonProps": "{\"native_data_type\": \"bigint\", \"_nullable\": true}"
+ },
+ {
+ "fieldPath": "[version=2.0].[type=struct].[type=struct].combination.[type=array].[type=struct].outcomes.[type=array].[type=struct].eventgrouppath.[type=string].name",
+ "nullable": true,
+ "type": {
+ "type": {
+ "com.linkedin.schema.StringType": {}
+ }
+ },
+ "nativeDataType": "string",
+ "recursive": false,
+ "isPartOfKey": false,
+ "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}"
+ },
+ {
+ "fieldPath": "[version=2.0].[type=struct].[type=struct].combination.[type=array].[type=struct].outcomes.[type=long].eventid",
+ "nullable": true,
+ "type": {
+ "type": {
+ "com.linkedin.schema.NumberType": {}
+ }
+ },
+ "nativeDataType": "bigint",
+ "recursive": false,
+ "isPartOfKey": false,
+ "jsonProps": "{\"native_data_type\": \"bigint\", \"_nullable\": true}"
+ },
+ {
+ "fieldPath": "[version=2.0].[type=struct].[type=struct].combination.[type=array].[type=struct].outcomes.[type=string].eventname",
+ "nullable": true,
+ "type": {
+ "type": {
+ "com.linkedin.schema.StringType": {}
+ }
+ },
+ "nativeDataType": "string",
+ "recursive": false,
+ "isPartOfKey": false,
+ "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}"
+ },
+ {
+ "fieldPath": "[version=2.0].[type=struct].[type=struct].combination.[type=array].[type=struct].outcomes.[type=string].eventstartdate",
+ "nullable": true,
+ "type": {
+ "type": {
+ "com.linkedin.schema.StringType": {}
+ }
+ },
+ "nativeDataType": "string",
+ "recursive": false,
+ "isPartOfKey": false,
+ "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}"
+ },
+ {
+ "fieldPath": "[version=2.0].[type=struct].[type=struct].combination.[type=array].[type=struct].outcomes.[type=boolean].live",
+ "nullable": true,
+ "type": {
+ "type": {
+ "com.linkedin.schema.BooleanType": {}
+ }
+ },
+ "nativeDataType": "boolean",
+ "recursive": false,
+ "isPartOfKey": false,
+ "jsonProps": "{\"native_data_type\": \"boolean\", \"_nullable\": true}"
+ },
+ {
+ "fieldPath": "[version=2.0].[type=struct].[type=struct].combination.[type=array].[type=struct].outcomes.[type=double].odds",
+ "nullable": true,
+ "type": {
+ "type": {
+ "com.linkedin.schema.NumberType": {}
+ }
+ },
+ "nativeDataType": "double",
+ "recursive": false,
+ "isPartOfKey": false,
+ "jsonProps": "{\"native_data_type\": \"double\", \"_nullable\": true}"
+ },
+ {
+ "fieldPath": "[version=2.0].[type=struct].[type=struct].combination.[type=array].[type=struct].outcomes.[type=array].[type=long].outcomeids",
+ "nullable": false,
+ "type": {
+ "type": {
+ "com.linkedin.schema.ArrayType": {
+ "nestedType": [
+ "long"
+ ]
+ }
+ }
+ },
+ "nativeDataType": "array",
+ "recursive": false,
+ "isPartOfKey": false,
+ "jsonProps": "{\"native_data_type\": \"array\"}"
+ },
{
- "owner": "urn:li:corpuser:abc@acryl.io",
- "type": "DATAOWNER"
- }
- ],
- "lastModified": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
- }
- }
- },
- "systemMetadata": {
- "lastObserved": 1638860400000,
- "runId": "unity-catalog-test",
- "lastRunId": "no-run-id-provided"
- }
-},
-{
- "entityType": "container",
- "entityUrn": "urn:li:container:ce568b660cba2e1a4e811b010ac27f90",
- "changeType": "UPSERT",
- "aspectName": "container",
- "aspect": {
- "json": {
- "container": "urn:li:container:079ede9d4f0640985a8ccf8eb180e965"
- }
- },
- "systemMetadata": {
- "lastObserved": 1638860400000,
- "runId": "unity-catalog-test",
- "lastRunId": "no-run-id-provided"
- }
-},
-{
- "entityType": "container",
- "entityUrn": "urn:li:container:ce568b660cba2e1a4e811b010ac27f90",
- "changeType": "UPSERT",
- "aspectName": "browsePathsV2",
- "aspect": {
- "json": {
- "path": [
+ "fieldPath": "[version=2.0].[type=struct].[type=struct].combination.[type=array].[type=struct].outcomes.[type=string].outcomelabel",
+ "nullable": true,
+ "type": {
+ "type": {
+ "com.linkedin.schema.StringType": {}
+ }
+ },
+ "nativeDataType": "string",
+ "recursive": false,
+ "isPartOfKey": false,
+ "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}"
+ },
{
- "id": "urn:li:container:22ec33be0e53ba3e61bb6c4ad58f6ffb",
- "urn": "urn:li:container:22ec33be0e53ba3e61bb6c4ad58f6ffb"
+ "fieldPath": "[version=2.0].[type=struct].[type=struct].combination.[type=array].[type=struct].outcomes.[type=string].sportid",
+ "nullable": true,
+ "type": {
+ "type": {
+ "com.linkedin.schema.StringType": {}
+ }
+ },
+ "nativeDataType": "string",
+ "recursive": false,
+ "isPartOfKey": false,
+ "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}"
},
{
- "id": "urn:li:container:079ede9d4f0640985a8ccf8eb180e965",
- "urn": "urn:li:container:079ede9d4f0640985a8ccf8eb180e965"
+ "fieldPath": "[version=2.0].[type=struct].[type=struct].combination.[type=array].[type=struct].outcomes.[type=string].status",
+ "nullable": true,
+ "type": {
+ "type": {
+ "com.linkedin.schema.StringType": {}
+ }
+ },
+ "nativeDataType": "string",
+ "recursive": false,
+ "isPartOfKey": false,
+ "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}"
+ },
+ {
+ "fieldPath": "[version=2.0].[type=struct].[type=struct].combination.[type=array].[type=struct].outcomes.[type=string].voidreason",
+ "nullable": true,
+ "type": {
+ "type": {
+ "com.linkedin.schema.StringType": {}
+ }
+ },
+ "nativeDataType": "string",
+ "recursive": false,
+ "isPartOfKey": false,
+ "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}"
+ },
+ {
+ "fieldPath": "[version=2.0].[type=struct].[type=struct].combination.[type=double].payout",
+ "nullable": true,
+ "type": {
+ "type": {
+ "com.linkedin.schema.NumberType": {}
+ }
+ },
+ "nativeDataType": "double",
+ "recursive": false,
+ "isPartOfKey": false,
+ "jsonProps": "{\"native_data_type\": \"double\", \"_nullable\": true}"
+ },
+ {
+ "fieldPath": "[version=2.0].[type=struct].[type=struct].combination.[type=double].rewardextrapayout",
+ "nullable": true,
+ "type": {
+ "type": {
+ "com.linkedin.schema.NumberType": {}
+ }
+ },
+ "nativeDataType": "double",
+ "recursive": false,
+ "isPartOfKey": false,
+ "jsonProps": "{\"native_data_type\": \"double\", \"_nullable\": true}"
+ },
+ {
+ "fieldPath": "[version=2.0].[type=struct].[type=struct].combination.[type=double].stake",
+ "nullable": true,
+ "type": {
+ "type": {
+ "com.linkedin.schema.NumberType": {}
+ }
+ },
+ "nativeDataType": "double",
+ "recursive": false,
+ "isPartOfKey": false,
+ "jsonProps": "{\"native_data_type\": \"double\", \"_nullable\": true}"
}
]
}
@@ -1517,12 +1704,12 @@
},
{
"entityType": "dataset",
- "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.quickstart_catalog.default.quickstart_table,PROD)",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.main.default.quickstart_table,PROD)",
"changeType": "UPSERT",
"aspectName": "container",
"aspect": {
"json": {
- "container": "urn:li:container:ce568b660cba2e1a4e811b010ac27f90"
+ "container": "urn:li:container:5ada0a9773235325e506410c512feabb"
}
},
"systemMetadata": {
@@ -1533,7 +1720,7 @@
},
{
"entityType": "dataset",
- "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.quickstart_catalog.default.quickstart_table,PROD)",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.main.default.quickstart_table,PROD)",
"changeType": "UPSERT",
"aspectName": "datasetProperties",
"aspect": {
@@ -1554,9 +1741,9 @@
"updated_by": "abc@acryl.io",
"updated_at": "2022-10-19 13:27:29.633000+00:00"
},
- "externalUrl": "https://dummy.cloud.databricks.com/explore/data/quickstart_catalog/default/quickstart_table",
+ "externalUrl": "https://dummy.cloud.databricks.com/explore/data/main/default/quickstart_table",
"name": "quickstart_table",
- "qualifiedName": "quickstart_catalog.default.quickstart_table",
+ "qualifiedName": "main.default.quickstart_table",
"created": {
"time": 1666185698688,
"actor": "urn:li:corpuser:abc@acryl.io"
@@ -1576,7 +1763,7 @@
},
{
"entityType": "dataset",
- "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.quickstart_catalog.default.quickstart_table,PROD)",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.main.default.quickstart_table,PROD)",
"changeType": "UPSERT",
"aspectName": "subTypes",
"aspect": {
@@ -1594,12 +1781,12 @@
},
{
"entityType": "dataset",
- "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.quickstart_catalog.default.quickstart_table,PROD)",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.main.default.quickstart_table,PROD)",
"changeType": "UPSERT",
"aspectName": "schemaMetadata",
"aspect": {
"json": {
- "schemaName": "acryl_metastore.quickstart_catalog.default.quickstart_table",
+ "schemaName": "acryl_metastore.main.default.quickstart_table",
"platform": "urn:li:dataPlatform:databricks",
"version": 0,
"created": {
@@ -1652,7 +1839,7 @@
},
{
"entityType": "dataset",
- "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.quickstart_catalog.default.quickstart_table,PROD)",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.main.default.quickstart_table,PROD)",
"changeType": "UPSERT",
"aspectName": "ownership",
"aspect": {
@@ -1677,7 +1864,7 @@
},
{
"entityType": "dataset",
- "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.quickstart_catalog.default.quickstart_table,PROD)",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.main.default.quickstart_table,PROD)",
"changeType": "UPSERT",
"aspectName": "browsePathsV2",
"aspect": {
@@ -1688,12 +1875,12 @@
"urn": "urn:li:container:22ec33be0e53ba3e61bb6c4ad58f6ffb"
},
{
- "id": "urn:li:container:079ede9d4f0640985a8ccf8eb180e965",
- "urn": "urn:li:container:079ede9d4f0640985a8ccf8eb180e965"
+ "id": "urn:li:container:83d98e62e36bddc3596c2b738e23b596",
+ "urn": "urn:li:container:83d98e62e36bddc3596c2b738e23b596"
},
{
- "id": "urn:li:container:ce568b660cba2e1a4e811b010ac27f90",
- "urn": "urn:li:container:ce568b660cba2e1a4e811b010ac27f90"
+ "id": "urn:li:container:5ada0a9773235325e506410c512feabb",
+ "urn": "urn:li:container:5ada0a9773235325e506410c512feabb"
}
]
}
@@ -1706,7 +1893,7 @@
},
{
"entityType": "container",
- "entityUrn": "urn:li:container:29f99476d533719be0cebc374d5265dc",
+ "entityUrn": "urn:li:container:481380c5a355638fc626eca8380cdda9",
"changeType": "UPSERT",
"aspectName": "containerProperties",
"aspect": {
@@ -1715,12 +1902,12 @@
"platform": "databricks",
"env": "PROD",
"metastore": "acryl metastore",
- "catalog": "quickstart_catalog",
- "unity_schema": "information_schema"
+ "catalog": "main",
+ "unity_schema": "quickstart_schema"
},
- "externalUrl": "https://dummy.cloud.databricks.com/explore/data/quickstart_catalog/information_schema",
- "name": "information_schema",
- "description": "Information schema (auto-created)"
+ "externalUrl": "https://dummy.cloud.databricks.com/explore/data/main/quickstart_schema",
+ "name": "quickstart_schema",
+ "description": "A new Unity Catalog schema called quickstart_schema"
}
},
"systemMetadata": {
@@ -1731,7 +1918,7 @@
},
{
"entityType": "container",
- "entityUrn": "urn:li:container:29f99476d533719be0cebc374d5265dc",
+ "entityUrn": "urn:li:container:481380c5a355638fc626eca8380cdda9",
"changeType": "UPSERT",
"aspectName": "status",
"aspect": {
@@ -1747,7 +1934,7 @@
},
{
"entityType": "container",
- "entityUrn": "urn:li:container:29f99476d533719be0cebc374d5265dc",
+ "entityUrn": "urn:li:container:481380c5a355638fc626eca8380cdda9",
"changeType": "UPSERT",
"aspectName": "dataPlatformInstance",
"aspect": {
@@ -1763,7 +1950,7 @@
},
{
"entityType": "container",
- "entityUrn": "urn:li:container:29f99476d533719be0cebc374d5265dc",
+ "entityUrn": "urn:li:container:481380c5a355638fc626eca8380cdda9",
"changeType": "UPSERT",
"aspectName": "subTypes",
"aspect": {
@@ -1781,14 +1968,14 @@
},
{
"entityType": "container",
- "entityUrn": "urn:li:container:29f99476d533719be0cebc374d5265dc",
+ "entityUrn": "urn:li:container:481380c5a355638fc626eca8380cdda9",
"changeType": "UPSERT",
"aspectName": "ownership",
"aspect": {
"json": {
"owners": [
{
- "owner": "urn:li:corpuser:Service Principal 1",
+ "owner": "urn:li:corpuser:account users",
"type": "DATAOWNER"
}
],
@@ -1806,12 +1993,12 @@
},
{
"entityType": "container",
- "entityUrn": "urn:li:container:29f99476d533719be0cebc374d5265dc",
+ "entityUrn": "urn:li:container:481380c5a355638fc626eca8380cdda9",
"changeType": "UPSERT",
"aspectName": "container",
"aspect": {
"json": {
- "container": "urn:li:container:079ede9d4f0640985a8ccf8eb180e965"
+ "container": "urn:li:container:83d98e62e36bddc3596c2b738e23b596"
}
},
"systemMetadata": {
@@ -1822,7 +2009,7 @@
},
{
"entityType": "container",
- "entityUrn": "urn:li:container:29f99476d533719be0cebc374d5265dc",
+ "entityUrn": "urn:li:container:481380c5a355638fc626eca8380cdda9",
"changeType": "UPSERT",
"aspectName": "browsePathsV2",
"aspect": {
@@ -1833,8 +2020,8 @@
"urn": "urn:li:container:22ec33be0e53ba3e61bb6c4ad58f6ffb"
},
{
- "id": "urn:li:container:079ede9d4f0640985a8ccf8eb180e965",
- "urn": "urn:li:container:079ede9d4f0640985a8ccf8eb180e965"
+ "id": "urn:li:container:83d98e62e36bddc3596c2b738e23b596",
+ "urn": "urn:li:container:83d98e62e36bddc3596c2b738e23b596"
}
]
}
@@ -1847,12 +2034,12 @@
},
{
"entityType": "dataset",
- "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.quickstart_catalog.information_schema.quickstart_table,PROD)",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.main.quickstart_schema.quickstart_table,PROD)",
"changeType": "UPSERT",
"aspectName": "container",
"aspect": {
"json": {
- "container": "urn:li:container:29f99476d533719be0cebc374d5265dc"
+ "container": "urn:li:container:481380c5a355638fc626eca8380cdda9"
}
},
"systemMetadata": {
@@ -1863,7 +2050,7 @@
},
{
"entityType": "dataset",
- "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.quickstart_catalog.information_schema.quickstart_table,PROD)",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.main.quickstart_schema.quickstart_table,PROD)",
"changeType": "UPSERT",
"aspectName": "datasetProperties",
"aspect": {
@@ -1884,9 +2071,9 @@
"updated_by": "abc@acryl.io",
"updated_at": "2022-10-19 13:27:29.633000+00:00"
},
- "externalUrl": "https://dummy.cloud.databricks.com/explore/data/quickstart_catalog/information_schema/quickstart_table",
+ "externalUrl": "https://dummy.cloud.databricks.com/explore/data/main/quickstart_schema/quickstart_table",
"name": "quickstart_table",
- "qualifiedName": "quickstart_catalog.information_schema.quickstart_table",
+ "qualifiedName": "main.quickstart_schema.quickstart_table",
"created": {
"time": 1666185698688,
"actor": "urn:li:corpuser:abc@acryl.io"
@@ -1906,7 +2093,7 @@
},
{
"entityType": "dataset",
- "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.quickstart_catalog.information_schema.quickstart_table,PROD)",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.main.quickstart_schema.quickstart_table,PROD)",
"changeType": "UPSERT",
"aspectName": "subTypes",
"aspect": {
@@ -1924,12 +2111,12 @@
},
{
"entityType": "dataset",
- "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.quickstart_catalog.information_schema.quickstart_table,PROD)",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.main.quickstart_schema.quickstart_table,PROD)",
"changeType": "UPSERT",
"aspectName": "schemaMetadata",
"aspect": {
"json": {
- "schemaName": "acryl_metastore.quickstart_catalog.information_schema.quickstart_table",
+ "schemaName": "acryl_metastore.main.quickstart_schema.quickstart_table",
"platform": "urn:li:dataPlatform:databricks",
"version": 0,
"created": {
@@ -1981,8 +2168,136 @@
}
},
{
- "entityType": "dataset",
- "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.quickstart_catalog.information_schema.quickstart_table,PROD)",
+ "entityType": "dataset",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.main.quickstart_schema.quickstart_table,PROD)",
+ "changeType": "UPSERT",
+ "aspectName": "ownership",
+ "aspect": {
+ "json": {
+ "owners": [
+ {
+ "owner": "urn:li:corpuser:account users",
+ "type": "DATAOWNER"
+ }
+ ],
+ "lastModified": {
+ "time": 0,
+ "actor": "urn:li:corpuser:unknown"
+ }
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1638860400000,
+ "runId": "unity-catalog-test",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "dataset",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.main.quickstart_schema.quickstart_table,PROD)",
+ "changeType": "UPSERT",
+ "aspectName": "browsePathsV2",
+ "aspect": {
+ "json": {
+ "path": [
+ {
+ "id": "urn:li:container:22ec33be0e53ba3e61bb6c4ad58f6ffb",
+ "urn": "urn:li:container:22ec33be0e53ba3e61bb6c4ad58f6ffb"
+ },
+ {
+ "id": "urn:li:container:83d98e62e36bddc3596c2b738e23b596",
+ "urn": "urn:li:container:83d98e62e36bddc3596c2b738e23b596"
+ },
+ {
+ "id": "urn:li:container:481380c5a355638fc626eca8380cdda9",
+ "urn": "urn:li:container:481380c5a355638fc626eca8380cdda9"
+ }
+ ]
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1638860400000,
+ "runId": "unity-catalog-test",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "container",
+ "entityUrn": "urn:li:container:079ede9d4f0640985a8ccf8eb180e965",
+ "changeType": "UPSERT",
+ "aspectName": "containerProperties",
+ "aspect": {
+ "json": {
+ "customProperties": {
+ "platform": "databricks",
+ "env": "PROD",
+ "metastore": "acryl metastore",
+ "catalog": "quickstart_catalog"
+ },
+ "externalUrl": "https://dummy.cloud.databricks.com/explore/data/quickstart_catalog",
+ "name": "quickstart_catalog",
+ "description": ""
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1638860400000,
+ "runId": "unity-catalog-test",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "container",
+ "entityUrn": "urn:li:container:079ede9d4f0640985a8ccf8eb180e965",
+ "changeType": "UPSERT",
+ "aspectName": "status",
+ "aspect": {
+ "json": {
+ "removed": false
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1638860400000,
+ "runId": "unity-catalog-test",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "container",
+ "entityUrn": "urn:li:container:079ede9d4f0640985a8ccf8eb180e965",
+ "changeType": "UPSERT",
+ "aspectName": "dataPlatformInstance",
+ "aspect": {
+ "json": {
+ "platform": "urn:li:dataPlatform:databricks"
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1638860400000,
+ "runId": "unity-catalog-test",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "container",
+ "entityUrn": "urn:li:container:079ede9d4f0640985a8ccf8eb180e965",
+ "changeType": "UPSERT",
+ "aspectName": "subTypes",
+ "aspect": {
+ "json": {
+ "typeNames": [
+ "Catalog"
+ ]
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1638860400000,
+ "runId": "unity-catalog-test",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "container",
+ "entityUrn": "urn:li:container:079ede9d4f0640985a8ccf8eb180e965",
"changeType": "UPSERT",
"aspectName": "ownership",
"aspect": {
@@ -2006,8 +2321,24 @@
}
},
{
- "entityType": "dataset",
- "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.quickstart_catalog.information_schema.quickstart_table,PROD)",
+ "entityType": "container",
+ "entityUrn": "urn:li:container:079ede9d4f0640985a8ccf8eb180e965",
+ "changeType": "UPSERT",
+ "aspectName": "container",
+ "aspect": {
+ "json": {
+ "container": "urn:li:container:22ec33be0e53ba3e61bb6c4ad58f6ffb"
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1638860400000,
+ "runId": "unity-catalog-test",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "container",
+ "entityUrn": "urn:li:container:079ede9d4f0640985a8ccf8eb180e965",
"changeType": "UPSERT",
"aspectName": "browsePathsV2",
"aspect": {
@@ -2016,14 +2347,6 @@
{
"id": "urn:li:container:22ec33be0e53ba3e61bb6c4ad58f6ffb",
"urn": "urn:li:container:22ec33be0e53ba3e61bb6c4ad58f6ffb"
- },
- {
- "id": "urn:li:container:079ede9d4f0640985a8ccf8eb180e965",
- "urn": "urn:li:container:079ede9d4f0640985a8ccf8eb180e965"
- },
- {
- "id": "urn:li:container:29f99476d533719be0cebc374d5265dc",
- "urn": "urn:li:container:29f99476d533719be0cebc374d5265dc"
}
]
}
@@ -2036,7 +2359,7 @@
},
{
"entityType": "container",
- "entityUrn": "urn:li:container:47a033e31b92a120f08f297c05d286f1",
+ "entityUrn": "urn:li:container:ce568b660cba2e1a4e811b010ac27f90",
"changeType": "UPSERT",
"aspectName": "containerProperties",
"aspect": {
@@ -2046,11 +2369,11 @@
"env": "PROD",
"metastore": "acryl metastore",
"catalog": "quickstart_catalog",
- "unity_schema": "quickstart_schema"
+ "unity_schema": "default"
},
- "externalUrl": "https://dummy.cloud.databricks.com/explore/data/quickstart_catalog/quickstart_schema",
- "name": "quickstart_schema",
- "description": "A new Unity Catalog schema called quickstart_schema"
+ "externalUrl": "https://dummy.cloud.databricks.com/explore/data/quickstart_catalog/default",
+ "name": "default",
+ "description": "Default schema (auto-created)"
}
},
"systemMetadata": {
@@ -2061,7 +2384,7 @@
},
{
"entityType": "container",
- "entityUrn": "urn:li:container:47a033e31b92a120f08f297c05d286f1",
+ "entityUrn": "urn:li:container:ce568b660cba2e1a4e811b010ac27f90",
"changeType": "UPSERT",
"aspectName": "status",
"aspect": {
@@ -2077,7 +2400,7 @@
},
{
"entityType": "container",
- "entityUrn": "urn:li:container:47a033e31b92a120f08f297c05d286f1",
+ "entityUrn": "urn:li:container:ce568b660cba2e1a4e811b010ac27f90",
"changeType": "UPSERT",
"aspectName": "dataPlatformInstance",
"aspect": {
@@ -2093,7 +2416,7 @@
},
{
"entityType": "container",
- "entityUrn": "urn:li:container:47a033e31b92a120f08f297c05d286f1",
+ "entityUrn": "urn:li:container:ce568b660cba2e1a4e811b010ac27f90",
"changeType": "UPSERT",
"aspectName": "subTypes",
"aspect": {
@@ -2111,14 +2434,14 @@
},
{
"entityType": "container",
- "entityUrn": "urn:li:container:47a033e31b92a120f08f297c05d286f1",
+ "entityUrn": "urn:li:container:ce568b660cba2e1a4e811b010ac27f90",
"changeType": "UPSERT",
"aspectName": "ownership",
"aspect": {
"json": {
"owners": [
{
- "owner": "urn:li:corpuser:account users",
+ "owner": "urn:li:corpuser:abc@acryl.io",
"type": "DATAOWNER"
}
],
@@ -2136,7 +2459,7 @@
},
{
"entityType": "container",
- "entityUrn": "urn:li:container:47a033e31b92a120f08f297c05d286f1",
+ "entityUrn": "urn:li:container:ce568b660cba2e1a4e811b010ac27f90",
"changeType": "UPSERT",
"aspectName": "container",
"aspect": {
@@ -2152,7 +2475,7 @@
},
{
"entityType": "container",
- "entityUrn": "urn:li:container:47a033e31b92a120f08f297c05d286f1",
+ "entityUrn": "urn:li:container:ce568b660cba2e1a4e811b010ac27f90",
"changeType": "UPSERT",
"aspectName": "browsePathsV2",
"aspect": {
@@ -2177,12 +2500,12 @@
},
{
"entityType": "dataset",
- "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.quickstart_catalog.quickstart_schema.quickstart_table,PROD)",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.quickstart_catalog.default.quickstart_table,PROD)",
"changeType": "UPSERT",
"aspectName": "container",
"aspect": {
"json": {
- "container": "urn:li:container:47a033e31b92a120f08f297c05d286f1"
+ "container": "urn:li:container:ce568b660cba2e1a4e811b010ac27f90"
}
},
"systemMetadata": {
@@ -2193,7 +2516,7 @@
},
{
"entityType": "dataset",
- "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.quickstart_catalog.quickstart_schema.quickstart_table,PROD)",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.quickstart_catalog.default.quickstart_table,PROD)",
"changeType": "UPSERT",
"aspectName": "datasetProperties",
"aspect": {
@@ -2214,9 +2537,9 @@
"updated_by": "abc@acryl.io",
"updated_at": "2022-10-19 13:27:29.633000+00:00"
},
- "externalUrl": "https://dummy.cloud.databricks.com/explore/data/quickstart_catalog/quickstart_schema/quickstart_table",
+ "externalUrl": "https://dummy.cloud.databricks.com/explore/data/quickstart_catalog/default/quickstart_table",
"name": "quickstart_table",
- "qualifiedName": "quickstart_catalog.quickstart_schema.quickstart_table",
+ "qualifiedName": "quickstart_catalog.default.quickstart_table",
"created": {
"time": 1666185698688,
"actor": "urn:li:corpuser:abc@acryl.io"
@@ -2236,7 +2559,7 @@
},
{
"entityType": "dataset",
- "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.quickstart_catalog.quickstart_schema.quickstart_table,PROD)",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.quickstart_catalog.default.quickstart_table,PROD)",
"changeType": "UPSERT",
"aspectName": "subTypes",
"aspect": {
@@ -2254,12 +2577,12 @@
},
{
"entityType": "dataset",
- "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.quickstart_catalog.quickstart_schema.quickstart_table,PROD)",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.quickstart_catalog.default.quickstart_table,PROD)",
"changeType": "UPSERT",
"aspectName": "schemaMetadata",
"aspect": {
"json": {
- "schemaName": "acryl_metastore.quickstart_catalog.quickstart_schema.quickstart_table",
+ "schemaName": "acryl_metastore.quickstart_catalog.default.quickstart_table",
"platform": "urn:li:dataPlatform:databricks",
"version": 0,
"created": {
@@ -2312,7 +2635,7 @@
},
{
"entityType": "dataset",
- "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.quickstart_catalog.quickstart_schema.quickstart_table,PROD)",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.quickstart_catalog.default.quickstart_table,PROD)",
"changeType": "UPSERT",
"aspectName": "ownership",
"aspect": {
@@ -2324,153 +2647,9 @@
}
],
"lastModified": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
- }
- }
- },
- "systemMetadata": {
- "lastObserved": 1638860400000,
- "runId": "unity-catalog-test",
- "lastRunId": "no-run-id-provided"
- }
-},
-{
- "entityType": "dataset",
- "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.quickstart_catalog.quickstart_schema.quickstart_table,PROD)",
- "changeType": "UPSERT",
- "aspectName": "browsePathsV2",
- "aspect": {
- "json": {
- "path": [
- {
- "id": "urn:li:container:22ec33be0e53ba3e61bb6c4ad58f6ffb",
- "urn": "urn:li:container:22ec33be0e53ba3e61bb6c4ad58f6ffb"
- },
- {
- "id": "urn:li:container:079ede9d4f0640985a8ccf8eb180e965",
- "urn": "urn:li:container:079ede9d4f0640985a8ccf8eb180e965"
- },
- {
- "id": "urn:li:container:47a033e31b92a120f08f297c05d286f1",
- "urn": "urn:li:container:47a033e31b92a120f08f297c05d286f1"
- }
- ]
- }
- },
- "systemMetadata": {
- "lastObserved": 1638860400000,
- "runId": "unity-catalog-test",
- "lastRunId": "no-run-id-provided"
- }
-},
-{
- "entityType": "container",
- "entityUrn": "urn:li:container:5f7e6ee26826ba56e6d1d0b94f291fa9",
- "changeType": "UPSERT",
- "aspectName": "containerProperties",
- "aspect": {
- "json": {
- "customProperties": {
- "platform": "databricks",
- "env": "PROD",
- "metastore": "acryl metastore",
- "catalog": "system"
- },
- "externalUrl": "https://dummy.cloud.databricks.com/explore/data/system",
- "name": "system",
- "description": "System catalog (auto-created)"
- }
- },
- "systemMetadata": {
- "lastObserved": 1638860400000,
- "runId": "unity-catalog-test",
- "lastRunId": "no-run-id-provided"
- }
-},
-{
- "entityType": "container",
- "entityUrn": "urn:li:container:5f7e6ee26826ba56e6d1d0b94f291fa9",
- "changeType": "UPSERT",
- "aspectName": "status",
- "aspect": {
- "json": {
- "removed": false
- }
- },
- "systemMetadata": {
- "lastObserved": 1638860400000,
- "runId": "unity-catalog-test",
- "lastRunId": "no-run-id-provided"
- }
-},
-{
- "entityType": "container",
- "entityUrn": "urn:li:container:5f7e6ee26826ba56e6d1d0b94f291fa9",
- "changeType": "UPSERT",
- "aspectName": "dataPlatformInstance",
- "aspect": {
- "json": {
- "platform": "urn:li:dataPlatform:databricks"
- }
- },
- "systemMetadata": {
- "lastObserved": 1638860400000,
- "runId": "unity-catalog-test",
- "lastRunId": "no-run-id-provided"
- }
-},
-{
- "entityType": "container",
- "entityUrn": "urn:li:container:5f7e6ee26826ba56e6d1d0b94f291fa9",
- "changeType": "UPSERT",
- "aspectName": "subTypes",
- "aspect": {
- "json": {
- "typeNames": [
- "Catalog"
- ]
- }
- },
- "systemMetadata": {
- "lastObserved": 1638860400000,
- "runId": "unity-catalog-test",
- "lastRunId": "no-run-id-provided"
- }
-},
-{
- "entityType": "container",
- "entityUrn": "urn:li:container:5f7e6ee26826ba56e6d1d0b94f291fa9",
- "changeType": "UPSERT",
- "aspectName": "ownership",
- "aspect": {
- "json": {
- "owners": [
- {
- "owner": "urn:li:corpuser:Service Principal 2",
- "type": "DATAOWNER"
- }
- ],
- "lastModified": {
- "time": 0,
- "actor": "urn:li:corpuser:unknown"
- }
- }
- },
- "systemMetadata": {
- "lastObserved": 1638860400000,
- "runId": "unity-catalog-test",
- "lastRunId": "no-run-id-provided"
- }
-},
-{
- "entityType": "container",
- "entityUrn": "urn:li:container:5f7e6ee26826ba56e6d1d0b94f291fa9",
- "changeType": "UPSERT",
- "aspectName": "container",
- "aspect": {
- "json": {
- "container": "urn:li:container:22ec33be0e53ba3e61bb6c4ad58f6ffb"
+ "time": 0,
+ "actor": "urn:li:corpuser:unknown"
+ }
}
},
"systemMetadata": {
@@ -2480,8 +2659,8 @@
}
},
{
- "entityType": "container",
- "entityUrn": "urn:li:container:5f7e6ee26826ba56e6d1d0b94f291fa9",
+ "entityType": "dataset",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.quickstart_catalog.default.quickstart_table,PROD)",
"changeType": "UPSERT",
"aspectName": "browsePathsV2",
"aspect": {
@@ -2490,6 +2669,14 @@
{
"id": "urn:li:container:22ec33be0e53ba3e61bb6c4ad58f6ffb",
"urn": "urn:li:container:22ec33be0e53ba3e61bb6c4ad58f6ffb"
+ },
+ {
+ "id": "urn:li:container:079ede9d4f0640985a8ccf8eb180e965",
+ "urn": "urn:li:container:079ede9d4f0640985a8ccf8eb180e965"
+ },
+ {
+ "id": "urn:li:container:ce568b660cba2e1a4e811b010ac27f90",
+ "urn": "urn:li:container:ce568b660cba2e1a4e811b010ac27f90"
}
]
}
@@ -2502,7 +2689,7 @@
},
{
"entityType": "container",
- "entityUrn": "urn:li:container:b330768923270ff5450695bee1c94247",
+ "entityUrn": "urn:li:container:47a033e31b92a120f08f297c05d286f1",
"changeType": "UPSERT",
"aspectName": "containerProperties",
"aspect": {
@@ -2511,12 +2698,12 @@
"platform": "databricks",
"env": "PROD",
"metastore": "acryl metastore",
- "catalog": "system",
- "unity_schema": "default"
+ "catalog": "quickstart_catalog",
+ "unity_schema": "quickstart_schema"
},
- "externalUrl": "https://dummy.cloud.databricks.com/explore/data/system/default",
- "name": "default",
- "description": "Default schema (auto-created)"
+ "externalUrl": "https://dummy.cloud.databricks.com/explore/data/quickstart_catalog/quickstart_schema",
+ "name": "quickstart_schema",
+ "description": "A new Unity Catalog schema called quickstart_schema"
}
},
"systemMetadata": {
@@ -2527,7 +2714,7 @@
},
{
"entityType": "container",
- "entityUrn": "urn:li:container:b330768923270ff5450695bee1c94247",
+ "entityUrn": "urn:li:container:47a033e31b92a120f08f297c05d286f1",
"changeType": "UPSERT",
"aspectName": "status",
"aspect": {
@@ -2543,7 +2730,7 @@
},
{
"entityType": "container",
- "entityUrn": "urn:li:container:b330768923270ff5450695bee1c94247",
+ "entityUrn": "urn:li:container:47a033e31b92a120f08f297c05d286f1",
"changeType": "UPSERT",
"aspectName": "dataPlatformInstance",
"aspect": {
@@ -2559,7 +2746,7 @@
},
{
"entityType": "container",
- "entityUrn": "urn:li:container:b330768923270ff5450695bee1c94247",
+ "entityUrn": "urn:li:container:47a033e31b92a120f08f297c05d286f1",
"changeType": "UPSERT",
"aspectName": "subTypes",
"aspect": {
@@ -2577,14 +2764,14 @@
},
{
"entityType": "container",
- "entityUrn": "urn:li:container:b330768923270ff5450695bee1c94247",
+ "entityUrn": "urn:li:container:47a033e31b92a120f08f297c05d286f1",
"changeType": "UPSERT",
"aspectName": "ownership",
"aspect": {
"json": {
"owners": [
{
- "owner": "urn:li:corpuser:abc@acryl.io",
+ "owner": "urn:li:corpuser:account users",
"type": "DATAOWNER"
}
],
@@ -2602,12 +2789,12 @@
},
{
"entityType": "container",
- "entityUrn": "urn:li:container:b330768923270ff5450695bee1c94247",
+ "entityUrn": "urn:li:container:47a033e31b92a120f08f297c05d286f1",
"changeType": "UPSERT",
"aspectName": "container",
"aspect": {
"json": {
- "container": "urn:li:container:5f7e6ee26826ba56e6d1d0b94f291fa9"
+ "container": "urn:li:container:079ede9d4f0640985a8ccf8eb180e965"
}
},
"systemMetadata": {
@@ -2618,7 +2805,7 @@
},
{
"entityType": "container",
- "entityUrn": "urn:li:container:b330768923270ff5450695bee1c94247",
+ "entityUrn": "urn:li:container:47a033e31b92a120f08f297c05d286f1",
"changeType": "UPSERT",
"aspectName": "browsePathsV2",
"aspect": {
@@ -2629,8 +2816,8 @@
"urn": "urn:li:container:22ec33be0e53ba3e61bb6c4ad58f6ffb"
},
{
- "id": "urn:li:container:5f7e6ee26826ba56e6d1d0b94f291fa9",
- "urn": "urn:li:container:5f7e6ee26826ba56e6d1d0b94f291fa9"
+ "id": "urn:li:container:079ede9d4f0640985a8ccf8eb180e965",
+ "urn": "urn:li:container:079ede9d4f0640985a8ccf8eb180e965"
}
]
}
@@ -2643,12 +2830,12 @@
},
{
"entityType": "dataset",
- "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.system.default.quickstart_table,PROD)",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.quickstart_catalog.quickstart_schema.quickstart_table,PROD)",
"changeType": "UPSERT",
"aspectName": "container",
"aspect": {
"json": {
- "container": "urn:li:container:b330768923270ff5450695bee1c94247"
+ "container": "urn:li:container:47a033e31b92a120f08f297c05d286f1"
}
},
"systemMetadata": {
@@ -2659,7 +2846,7 @@
},
{
"entityType": "dataset",
- "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.system.default.quickstart_table,PROD)",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.quickstart_catalog.quickstart_schema.quickstart_table,PROD)",
"changeType": "UPSERT",
"aspectName": "datasetProperties",
"aspect": {
@@ -2680,9 +2867,9 @@
"updated_by": "abc@acryl.io",
"updated_at": "2022-10-19 13:27:29.633000+00:00"
},
- "externalUrl": "https://dummy.cloud.databricks.com/explore/data/system/default/quickstart_table",
+ "externalUrl": "https://dummy.cloud.databricks.com/explore/data/quickstart_catalog/quickstart_schema/quickstart_table",
"name": "quickstart_table",
- "qualifiedName": "system.default.quickstart_table",
+ "qualifiedName": "quickstart_catalog.quickstart_schema.quickstart_table",
"created": {
"time": 1666185698688,
"actor": "urn:li:corpuser:abc@acryl.io"
@@ -2702,7 +2889,7 @@
},
{
"entityType": "dataset",
- "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.system.default.quickstart_table,PROD)",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.quickstart_catalog.quickstart_schema.quickstart_table,PROD)",
"changeType": "UPSERT",
"aspectName": "subTypes",
"aspect": {
@@ -2720,12 +2907,12 @@
},
{
"entityType": "dataset",
- "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.system.default.quickstart_table,PROD)",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.quickstart_catalog.quickstart_schema.quickstart_table,PROD)",
"changeType": "UPSERT",
"aspectName": "schemaMetadata",
"aspect": {
"json": {
- "schemaName": "acryl_metastore.system.default.quickstart_table",
+ "schemaName": "acryl_metastore.quickstart_catalog.quickstart_schema.quickstart_table",
"platform": "urn:li:dataPlatform:databricks",
"version": 0,
"created": {
@@ -2778,7 +2965,7 @@
},
{
"entityType": "dataset",
- "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.system.default.quickstart_table,PROD)",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.quickstart_catalog.quickstart_schema.quickstart_table,PROD)",
"changeType": "UPSERT",
"aspectName": "ownership",
"aspect": {
@@ -2803,7 +2990,7 @@
},
{
"entityType": "dataset",
- "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.system.default.quickstart_table,PROD)",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.quickstart_catalog.quickstart_schema.quickstart_table,PROD)",
"changeType": "UPSERT",
"aspectName": "browsePathsV2",
"aspect": {
@@ -2814,12 +3001,148 @@
"urn": "urn:li:container:22ec33be0e53ba3e61bb6c4ad58f6ffb"
},
{
- "id": "urn:li:container:5f7e6ee26826ba56e6d1d0b94f291fa9",
- "urn": "urn:li:container:5f7e6ee26826ba56e6d1d0b94f291fa9"
+ "id": "urn:li:container:079ede9d4f0640985a8ccf8eb180e965",
+ "urn": "urn:li:container:079ede9d4f0640985a8ccf8eb180e965"
},
{
- "id": "urn:li:container:b330768923270ff5450695bee1c94247",
- "urn": "urn:li:container:b330768923270ff5450695bee1c94247"
+ "id": "urn:li:container:47a033e31b92a120f08f297c05d286f1",
+ "urn": "urn:li:container:47a033e31b92a120f08f297c05d286f1"
+ }
+ ]
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1638860400000,
+ "runId": "unity-catalog-test",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "container",
+ "entityUrn": "urn:li:container:5f7e6ee26826ba56e6d1d0b94f291fa9",
+ "changeType": "UPSERT",
+ "aspectName": "containerProperties",
+ "aspect": {
+ "json": {
+ "customProperties": {
+ "platform": "databricks",
+ "env": "PROD",
+ "metastore": "acryl metastore",
+ "catalog": "system"
+ },
+ "externalUrl": "https://dummy.cloud.databricks.com/explore/data/system",
+ "name": "system",
+ "description": "System catalog (auto-created)"
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1638860400000,
+ "runId": "unity-catalog-test",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "container",
+ "entityUrn": "urn:li:container:5f7e6ee26826ba56e6d1d0b94f291fa9",
+ "changeType": "UPSERT",
+ "aspectName": "status",
+ "aspect": {
+ "json": {
+ "removed": false
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1638860400000,
+ "runId": "unity-catalog-test",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "container",
+ "entityUrn": "urn:li:container:5f7e6ee26826ba56e6d1d0b94f291fa9",
+ "changeType": "UPSERT",
+ "aspectName": "dataPlatformInstance",
+ "aspect": {
+ "json": {
+ "platform": "urn:li:dataPlatform:databricks"
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1638860400000,
+ "runId": "unity-catalog-test",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "container",
+ "entityUrn": "urn:li:container:5f7e6ee26826ba56e6d1d0b94f291fa9",
+ "changeType": "UPSERT",
+ "aspectName": "subTypes",
+ "aspect": {
+ "json": {
+ "typeNames": [
+ "Catalog"
+ ]
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1638860400000,
+ "runId": "unity-catalog-test",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "container",
+ "entityUrn": "urn:li:container:5f7e6ee26826ba56e6d1d0b94f291fa9",
+ "changeType": "UPSERT",
+ "aspectName": "ownership",
+ "aspect": {
+ "json": {
+ "owners": [
+ {
+ "owner": "urn:li:corpuser:Service Principal 2",
+ "type": "DATAOWNER"
+ }
+ ],
+ "lastModified": {
+ "time": 0,
+ "actor": "urn:li:corpuser:unknown"
+ }
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1638860400000,
+ "runId": "unity-catalog-test",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "container",
+ "entityUrn": "urn:li:container:5f7e6ee26826ba56e6d1d0b94f291fa9",
+ "changeType": "UPSERT",
+ "aspectName": "container",
+ "aspect": {
+ "json": {
+ "container": "urn:li:container:22ec33be0e53ba3e61bb6c4ad58f6ffb"
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1638860400000,
+ "runId": "unity-catalog-test",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "container",
+ "entityUrn": "urn:li:container:5f7e6ee26826ba56e6d1d0b94f291fa9",
+ "changeType": "UPSERT",
+ "aspectName": "browsePathsV2",
+ "aspect": {
+ "json": {
+ "path": [
+ {
+ "id": "urn:li:container:22ec33be0e53ba3e61bb6c4ad58f6ffb",
+ "urn": "urn:li:container:22ec33be0e53ba3e61bb6c4ad58f6ffb"
}
]
}
@@ -2832,7 +3155,7 @@
},
{
"entityType": "container",
- "entityUrn": "urn:li:container:cb26af5fb7ba2e1c6f2cd804101a5a59",
+ "entityUrn": "urn:li:container:b330768923270ff5450695bee1c94247",
"changeType": "UPSERT",
"aspectName": "containerProperties",
"aspect": {
@@ -2842,11 +3165,11 @@
"env": "PROD",
"metastore": "acryl metastore",
"catalog": "system",
- "unity_schema": "information_schema"
+ "unity_schema": "default"
},
- "externalUrl": "https://dummy.cloud.databricks.com/explore/data/system/information_schema",
- "name": "information_schema",
- "description": "Information schema (auto-created)"
+ "externalUrl": "https://dummy.cloud.databricks.com/explore/data/system/default",
+ "name": "default",
+ "description": "Default schema (auto-created)"
}
},
"systemMetadata": {
@@ -2857,7 +3180,7 @@
},
{
"entityType": "container",
- "entityUrn": "urn:li:container:cb26af5fb7ba2e1c6f2cd804101a5a59",
+ "entityUrn": "urn:li:container:b330768923270ff5450695bee1c94247",
"changeType": "UPSERT",
"aspectName": "status",
"aspect": {
@@ -2873,7 +3196,7 @@
},
{
"entityType": "container",
- "entityUrn": "urn:li:container:cb26af5fb7ba2e1c6f2cd804101a5a59",
+ "entityUrn": "urn:li:container:b330768923270ff5450695bee1c94247",
"changeType": "UPSERT",
"aspectName": "dataPlatformInstance",
"aspect": {
@@ -2889,7 +3212,7 @@
},
{
"entityType": "container",
- "entityUrn": "urn:li:container:cb26af5fb7ba2e1c6f2cd804101a5a59",
+ "entityUrn": "urn:li:container:b330768923270ff5450695bee1c94247",
"changeType": "UPSERT",
"aspectName": "subTypes",
"aspect": {
@@ -2907,14 +3230,14 @@
},
{
"entityType": "container",
- "entityUrn": "urn:li:container:cb26af5fb7ba2e1c6f2cd804101a5a59",
+ "entityUrn": "urn:li:container:b330768923270ff5450695bee1c94247",
"changeType": "UPSERT",
"aspectName": "ownership",
"aspect": {
"json": {
"owners": [
{
- "owner": "urn:li:corpuser:Service Principal 1",
+ "owner": "urn:li:corpuser:abc@acryl.io",
"type": "DATAOWNER"
}
],
@@ -2932,7 +3255,7 @@
},
{
"entityType": "container",
- "entityUrn": "urn:li:container:cb26af5fb7ba2e1c6f2cd804101a5a59",
+ "entityUrn": "urn:li:container:b330768923270ff5450695bee1c94247",
"changeType": "UPSERT",
"aspectName": "container",
"aspect": {
@@ -2948,7 +3271,7 @@
},
{
"entityType": "container",
- "entityUrn": "urn:li:container:cb26af5fb7ba2e1c6f2cd804101a5a59",
+ "entityUrn": "urn:li:container:b330768923270ff5450695bee1c94247",
"changeType": "UPSERT",
"aspectName": "browsePathsV2",
"aspect": {
@@ -2973,12 +3296,12 @@
},
{
"entityType": "dataset",
- "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.system.information_schema.quickstart_table,PROD)",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.system.default.quickstart_table,PROD)",
"changeType": "UPSERT",
"aspectName": "container",
"aspect": {
"json": {
- "container": "urn:li:container:cb26af5fb7ba2e1c6f2cd804101a5a59"
+ "container": "urn:li:container:b330768923270ff5450695bee1c94247"
}
},
"systemMetadata": {
@@ -2989,7 +3312,7 @@
},
{
"entityType": "dataset",
- "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.system.information_schema.quickstart_table,PROD)",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.system.default.quickstart_table,PROD)",
"changeType": "UPSERT",
"aspectName": "datasetProperties",
"aspect": {
@@ -3010,9 +3333,9 @@
"updated_by": "abc@acryl.io",
"updated_at": "2022-10-19 13:27:29.633000+00:00"
},
- "externalUrl": "https://dummy.cloud.databricks.com/explore/data/system/information_schema/quickstart_table",
+ "externalUrl": "https://dummy.cloud.databricks.com/explore/data/system/default/quickstart_table",
"name": "quickstart_table",
- "qualifiedName": "system.information_schema.quickstart_table",
+ "qualifiedName": "system.default.quickstart_table",
"created": {
"time": 1666185698688,
"actor": "urn:li:corpuser:abc@acryl.io"
@@ -3032,7 +3355,7 @@
},
{
"entityType": "dataset",
- "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.system.information_schema.quickstart_table,PROD)",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.system.default.quickstart_table,PROD)",
"changeType": "UPSERT",
"aspectName": "subTypes",
"aspect": {
@@ -3050,12 +3373,12 @@
},
{
"entityType": "dataset",
- "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.system.information_schema.quickstart_table,PROD)",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.system.default.quickstart_table,PROD)",
"changeType": "UPSERT",
"aspectName": "schemaMetadata",
"aspect": {
"json": {
- "schemaName": "acryl_metastore.system.information_schema.quickstart_table",
+ "schemaName": "acryl_metastore.system.default.quickstart_table",
"platform": "urn:li:dataPlatform:databricks",
"version": 0,
"created": {
@@ -3108,7 +3431,7 @@
},
{
"entityType": "dataset",
- "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.system.information_schema.quickstart_table,PROD)",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.system.default.quickstart_table,PROD)",
"changeType": "UPSERT",
"aspectName": "ownership",
"aspect": {
@@ -3133,7 +3456,7 @@
},
{
"entityType": "dataset",
- "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.system.information_schema.quickstart_table,PROD)",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.system.default.quickstart_table,PROD)",
"changeType": "UPSERT",
"aspectName": "browsePathsV2",
"aspect": {
@@ -3148,8 +3471,8 @@
"urn": "urn:li:container:5f7e6ee26826ba56e6d1d0b94f291fa9"
},
{
- "id": "urn:li:container:cb26af5fb7ba2e1c6f2cd804101a5a59",
- "urn": "urn:li:container:cb26af5fb7ba2e1c6f2cd804101a5a59"
+ "id": "urn:li:container:b330768923270ff5450695bee1c94247",
+ "urn": "urn:li:container:b330768923270ff5450695bee1c94247"
}
]
}
@@ -3506,22 +3829,6 @@
"lastRunId": "no-run-id-provided"
}
},
-{
- "entityType": "dataset",
- "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.main.information_schema.quickstart_table,PROD)",
- "changeType": "UPSERT",
- "aspectName": "status",
- "aspect": {
- "json": {
- "removed": false
- }
- },
- "systemMetadata": {
- "lastObserved": 1638860400000,
- "runId": "unity-catalog-test",
- "lastRunId": "no-run-id-provided"
- }
-},
{
"entityType": "dataset",
"entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.main.quickstart_schema.quickstart_table,PROD)",
@@ -3556,7 +3863,7 @@
},
{
"entityType": "dataset",
- "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.quickstart_catalog.information_schema.quickstart_table,PROD)",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.quickstart_catalog.quickstart_schema.quickstart_table,PROD)",
"changeType": "UPSERT",
"aspectName": "status",
"aspect": {
@@ -3572,7 +3879,7 @@
},
{
"entityType": "dataset",
- "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.quickstart_catalog.quickstart_schema.quickstart_table,PROD)",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.system.default.quickstart_table,PROD)",
"changeType": "UPSERT",
"aspectName": "status",
"aspect": {
@@ -3588,7 +3895,7 @@
},
{
"entityType": "dataset",
- "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.system.default.quickstart_table,PROD)",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.system.quickstart_schema.quickstart_table,PROD)",
"changeType": "UPSERT",
"aspectName": "status",
"aspect": {
@@ -3604,7 +3911,7 @@
},
{
"entityType": "dataset",
- "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.system.information_schema.quickstart_table,PROD)",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.hive_metastore.bronze_kambi.bet,PROD)",
"changeType": "UPSERT",
"aspectName": "status",
"aspect": {
@@ -3620,7 +3927,7 @@
},
{
"entityType": "dataset",
- "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.system.quickstart_schema.quickstart_table,PROD)",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.hive_metastore.bronze_kambi.view1,PROD)",
"changeType": "UPSERT",
"aspectName": "status",
"aspect": {
diff --git a/metadata-ingestion/tests/unit/test_unity_catalog_config.py b/metadata-ingestion/tests/unit/test_unity_catalog_config.py
index 4098ed4074de2f..3c0994cde7889f 100644
--- a/metadata-ingestion/tests/unit/test_unity_catalog_config.py
+++ b/metadata-ingestion/tests/unit/test_unity_catalog_config.py
@@ -67,7 +67,6 @@ def test_profiling_requires_warehouses_id():
@freeze_time(FROZEN_TIME)
def test_workspace_url_should_start_with_https():
-
with pytest.raises(ValueError, match="Workspace URL must start with http scheme"):
UnityCatalogSourceConfig.parse_obj(
{
@@ -76,3 +75,67 @@ def test_workspace_url_should_start_with_https():
"profiling": {"enabled": True},
}
)
+
+
+def test_global_warehouse_id_is_set_from_profiling():
+ config = UnityCatalogSourceConfig.parse_obj(
+ {
+ "token": "token",
+ "workspace_url": "https://XXXXXXXXXXXXXXXXXXXXX",
+ "profiling": {
+ "method": "ge",
+ "enabled": True,
+ "warehouse_id": "my_warehouse_id",
+ },
+ }
+ )
+ assert config.profiling.warehouse_id == "my_warehouse_id"
+ assert config.warehouse_id == "my_warehouse_id"
+
+
+def test_set_different_warehouse_id_from_profiling():
+ with pytest.raises(
+ ValueError,
+ match="When `warehouse_id` is set, it must match the `warehouse_id` in `profiling`.",
+ ):
+ UnityCatalogSourceConfig.parse_obj(
+ {
+ "token": "token",
+ "workspace_url": "https://XXXXXXXXXXXXXXXXXXXXX",
+ "warehouse_id": "my_global_warehouse_id",
+ "profiling": {
+ "method": "ge",
+ "enabled": True,
+ "warehouse_id": "my_warehouse_id",
+ },
+ }
+ )
+
+
+def test_warehouse_id_must_be_set_if_include_hive_metastore_is_true():
+ with pytest.raises(
+ ValueError,
+ match="When `include_hive_metastore` is set, `warehouse_id` must be set.",
+ ):
+ UnityCatalogSourceConfig.parse_obj(
+ {
+ "token": "token",
+ "workspace_url": "https://XXXXXXXXXXXXXXXXXXXXX",
+ "include_hive_metastore": True,
+ }
+ )
+
+
+def test_set_profiling_warehouse_id_from_global():
+ config = UnityCatalogSourceConfig.parse_obj(
+ {
+ "token": "token",
+ "workspace_url": "https://XXXXXXXXXXXXXXXXXXXXX",
+ "warehouse_id": "my_global_warehouse_id",
+ "profiling": {
+ "method": "ge",
+ "enabled": True,
+ },
+ }
+ )
+ assert config.profiling.warehouse_id == "my_global_warehouse_id"
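The four tests above pin down how the top-level `warehouse_id` and `profiling.warehouse_id` are reconciled. A minimal sketch of a pydantic root validator that satisfies the same expectations (field names taken from the tests; the real `UnityCatalogSourceConfig` is more involved and may differ) could look like:

```python
from typing import Any, Dict, Optional

from pydantic import BaseModel, Field, root_validator


class ProfilingConfig(BaseModel):
    enabled: bool = False
    warehouse_id: Optional[str] = None


class UnityCatalogConfigSketch(BaseModel):
    """Hypothetical, trimmed-down stand-in for UnityCatalogSourceConfig."""

    warehouse_id: Optional[str] = None
    include_hive_metastore: bool = False
    profiling: ProfilingConfig = Field(default_factory=ProfilingConfig)

    @root_validator(skip_on_failure=True)
    def _reconcile_warehouse_id(cls, values: Dict[str, Any]) -> Dict[str, Any]:
        global_id = values.get("warehouse_id")
        profiling = values.get("profiling")
        profiling_id = profiling.warehouse_id if profiling else None

        if global_id and profiling_id and global_id != profiling_id:
            raise ValueError(
                "When `warehouse_id` is set, it must match the `warehouse_id` in `profiling`."
            )
        if values.get("include_hive_metastore") and not (global_id or profiling_id):
            raise ValueError(
                "When `include_hive_metastore` is set, `warehouse_id` must be set."
            )
        # Copy whichever side is set so both attributes agree after parsing.
        if profiling_id and not global_id:
            values["warehouse_id"] = profiling_id
        if profiling and global_id and not profiling_id:
            profiling.warehouse_id = global_id
        return values
```

Whichever side is provided gets copied to the other, so callers can read either attribute after parsing, which is exactly what the assertions above exercise.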
From 0d6a5e5df25b58af0a434d5d2f83f6ef463ba99b Mon Sep 17 00:00:00 2001
From: siddiquebagwan-gslab
Date: Thu, 14 Dec 2023 21:06:28 +0530
Subject: [PATCH 025/540] feat(ingestion/transformer): create tag if it does not exist (#9076)
---
.../src/datahub/ingestion/graph/client.py | 24 ++++++
.../ingestion/transformer/add_dataset_tags.py | 42 ++++++++++-
.../ingestion/transformer/base_transformer.py | 75 +++++++++++++++----
.../tests/unit/test_transform_dataset.py | 32 ++++++--
4 files changed, 154 insertions(+), 19 deletions(-)
diff --git a/metadata-ingestion/src/datahub/ingestion/graph/client.py b/metadata-ingestion/src/datahub/ingestion/graph/client.py
index d91165ac9777ca..5c24b06dde9998 100644
--- a/metadata-ingestion/src/datahub/ingestion/graph/client.py
+++ b/metadata-ingestion/src/datahub/ingestion/graph/client.py
@@ -787,9 +787,11 @@ def get_aspect_counts(self, aspect: str, urn_like: Optional[str] = None) -> int:
def execute_graphql(self, query: str, variables: Optional[Dict] = None) -> Dict:
url = f"{self.config.server}/api/graphql"
+
body: Dict = {
"query": query,
}
+
if variables:
body["variables"] = variables
@@ -1065,6 +1067,28 @@ def parse_sql_lineage(
default_schema=default_schema,
)
+ def create_tag(self, tag_name: str) -> str:
+ graph_query: str = """
+ mutation($tag_detail: CreateTagInput!) {
+ createTag(input: $tag_detail)
+ }
+ """
+
+ variables = {
+ "tag_detail": {
+ "name": tag_name,
+ "id": tag_name,
+ },
+ }
+
+ res = self.execute_graphql(
+ query=graph_query,
+ variables=variables,
+ )
+
+        # return the urn of the created tag
+ return res["createTag"]
+
def close(self) -> None:
self._make_schema_resolver.cache_clear()
super().close()
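The new `create_tag` helper is a thin wrapper over the `createTag` GraphQL mutation and returns the urn of the created tag. A hedged usage sketch (server URL and token are placeholders, not part of this patch):

```python
from datahub.ingestion.graph.client import DatahubClientConfig, DataHubGraph

# Placeholder connection details; point this at a reachable GMS instance.
graph = DataHubGraph(DatahubClientConfig(server="http://localhost:8080", token=None))

# Executes the createTag mutation shown above and returns the new tag urn,
# e.g. "urn:li:tag:NeedsDocumentation".
tag_urn = graph.create_tag("NeedsDocumentation")
print(tag_urn)
```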
diff --git a/metadata-ingestion/src/datahub/ingestion/transformer/add_dataset_tags.py b/metadata-ingestion/src/datahub/ingestion/transformer/add_dataset_tags.py
index 5a276ad899c482..72a8c226e491ed 100644
--- a/metadata-ingestion/src/datahub/ingestion/transformer/add_dataset_tags.py
+++ b/metadata-ingestion/src/datahub/ingestion/transformer/add_dataset_tags.py
@@ -1,14 +1,24 @@
+import logging
from typing import Callable, List, Optional, cast
+import datahub.emitter.mce_builder as builder
from datahub.configuration.common import (
KeyValuePattern,
TransformerSemanticsConfigModel,
)
from datahub.configuration.import_resolver import pydantic_resolve_key
from datahub.emitter.mce_builder import Aspect
+from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.ingestion.api.common import PipelineContext
from datahub.ingestion.transformer.dataset_transformer import DatasetTagsTransformer
-from datahub.metadata.schema_classes import GlobalTagsClass, TagAssociationClass
+from datahub.metadata.schema_classes import (
+ GlobalTagsClass,
+ TagAssociationClass,
+ TagKeyClass,
+)
+from datahub.utilities.urns.tag_urn import TagUrn
+
+logger = logging.getLogger(__name__)
class AddDatasetTagsConfig(TransformerSemanticsConfigModel):
@@ -22,11 +32,13 @@ class AddDatasetTags(DatasetTagsTransformer):
ctx: PipelineContext
config: AddDatasetTagsConfig
+ processed_tags: List[TagAssociationClass]
def __init__(self, config: AddDatasetTagsConfig, ctx: PipelineContext):
super().__init__()
self.ctx = ctx
self.config = config
+ self.processed_tags = []
@classmethod
def create(cls, config_dict: dict, ctx: PipelineContext) -> "AddDatasetTags":
@@ -45,11 +57,38 @@ def transform_aspect(
tags_to_add = self.config.get_tags_to_add(entity_urn)
if tags_to_add is not None:
out_global_tags_aspect.tags.extend(tags_to_add)
+ self.processed_tags.extend(
+ tags_to_add
+ ) # Keep track of tags added so that we can create them in handle_end_of_stream
return self.get_result_semantics(
self.config, self.ctx.graph, entity_urn, out_global_tags_aspect
)
+ def handle_end_of_stream(self) -> List[MetadataChangeProposalWrapper]:
+
+ mcps: List[MetadataChangeProposalWrapper] = []
+
+ logger.debug("Generating tags")
+
+ for tag_association in self.processed_tags:
+ ids: List[str] = TagUrn.create_from_string(
+ tag_association.tag
+ ).get_entity_id()
+
+ assert len(ids) == 1, "Invalid Tag Urn"
+
+ tag_name: str = ids[0]
+
+ mcps.append(
+ MetadataChangeProposalWrapper(
+ entityUrn=builder.make_tag_urn(tag=tag_name),
+ aspect=TagKeyClass(name=tag_name),
+ )
+ )
+
+ return mcps
+
class SimpleDatasetTagConfig(TransformerSemanticsConfigModel):
tag_urns: List[str]
@@ -82,6 +121,7 @@ class PatternAddDatasetTags(AddDatasetTags):
"""Transformer that adds a specified set of tags to each dataset."""
def __init__(self, config: PatternDatasetTagsConfig, ctx: PipelineContext):
+ config.tag_pattern.all
tag_pattern = config.tag_pattern
generic_config = AddDatasetTagsConfig(
get_tags_to_add=lambda _: [
diff --git a/metadata-ingestion/src/datahub/ingestion/transformer/base_transformer.py b/metadata-ingestion/src/datahub/ingestion/transformer/base_transformer.py
index e0d6ae720c9a18..8b6f42dcfba4b8 100644
--- a/metadata-ingestion/src/datahub/ingestion/transformer/base_transformer.py
+++ b/metadata-ingestion/src/datahub/ingestion/transformer/base_transformer.py
@@ -17,13 +17,30 @@
log = logging.getLogger(__name__)
-class LegacyMCETransformer(Transformer, metaclass=ABCMeta):
+def _update_work_unit_id(
+ envelope: RecordEnvelope, urn: str, aspect_name: str
+) -> Dict[Any, Any]:
+ structured_urn = Urn.create_from_string(urn)
+ simple_name = "-".join(structured_urn.get_entity_id())
+ record_metadata = envelope.metadata.copy()
+ record_metadata.update({"workunit_id": f"txform-{simple_name}-{aspect_name}"})
+ return record_metadata
+
+
+class HandleEndOfStreamTransformer:
+ def handle_end_of_stream(self) -> List[MetadataChangeProposalWrapper]:
+ return []
+
+
+class LegacyMCETransformer(
+ Transformer, HandleEndOfStreamTransformer, metaclass=ABCMeta
+):
@abstractmethod
def transform_one(self, mce: MetadataChangeEventClass) -> MetadataChangeEventClass:
pass
-class SingleAspectTransformer(metaclass=ABCMeta):
+class SingleAspectTransformer(HandleEndOfStreamTransformer, metaclass=ABCMeta):
@abstractmethod
def aspect_name(self) -> str:
"""Implement this method to specify a single aspect that the transformer is interested in subscribing to. No default provided."""
@@ -180,6 +197,32 @@ def _transform_or_record_mcpw(
self._record_mcp(envelope.record)
return envelope if envelope.record.aspect is not None else None
+ def _handle_end_of_stream(
+ self, envelope: RecordEnvelope
+ ) -> Iterable[RecordEnvelope]:
+
+ if not isinstance(self, SingleAspectTransformer) and not isinstance(
+ self, LegacyMCETransformer
+ ):
+ return
+
+ mcps: List[MetadataChangeProposalWrapper] = self.handle_end_of_stream()
+
+ for mcp in mcps:
+            if mcp.aspect is None or mcp.entityUrn is None: # to silence the lint error
+ continue
+
+ record_metadata = _update_work_unit_id(
+ envelope=envelope,
+ aspect_name=mcp.aspect.get_aspect_name(), # type: ignore
+ urn=mcp.entityUrn,
+ )
+
+ yield RecordEnvelope(
+ record=mcp,
+ metadata=record_metadata,
+ )
+
def transform(
self, record_envelopes: Iterable[RecordEnvelope]
) -> Iterable[RecordEnvelope]:
@@ -216,17 +259,10 @@ def transform(
else None,
)
if transformed_aspect:
- # for end of stream records, we modify the workunit-id
structured_urn = Urn.create_from_string(urn)
- simple_name = "-".join(structured_urn.get_entity_id())
- record_metadata = envelope.metadata.copy()
- record_metadata.update(
- {
- "workunit_id": f"txform-{simple_name}-{self.aspect_name()}"
- }
- )
- yield RecordEnvelope(
- record=MetadataChangeProposalWrapper(
+
+ mcp: MetadataChangeProposalWrapper = (
+ MetadataChangeProposalWrapper(
entityUrn=urn,
entityType=structured_urn.get_type(),
systemMetadata=last_seen_mcp.systemMetadata
@@ -234,8 +270,21 @@ def transform(
else last_seen_mce_system_metadata,
aspectName=self.aspect_name(),
aspect=transformed_aspect,
- ),
+ )
+ )
+
+ record_metadata = _update_work_unit_id(
+ envelope=envelope,
+ aspect_name=mcp.aspect.get_aspect_name(), # type: ignore
+ urn=mcp.entityUrn,
+ )
+
+ yield RecordEnvelope(
+ record=mcp,
metadata=record_metadata,
)
+
self._mark_processed(urn)
+ yield from self._handle_end_of_stream(envelope=envelope)
+
yield envelope
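Together with the `HandleEndOfStreamTransformer` mixin, this gives any aspect transformer a hook to append extra MCPs once the regular stream is exhausted, with `_update_work_unit_id` keeping the generated workunit ids unique. A minimal sketch of a custom transformer that uses the hook (the class and tag name are illustrative, not part of this patch):

```python
from typing import List, Optional, cast

import datahub.emitter.mce_builder as builder
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.ingestion.api.common import PipelineContext
from datahub.ingestion.transformer.dataset_transformer import DatasetTagsTransformer
from datahub.metadata.schema_classes import (
    GlobalTagsClass,
    TagAssociationClass,
    TagKeyClass,
)


class AddFixedTag(DatasetTagsTransformer):
    """Illustrative transformer: tags every dataset and emits the tag entity at end of stream."""

    def __init__(self, ctx: PipelineContext, tag_name: str = "Example"):
        super().__init__()
        self.ctx = ctx
        self.tag_name = tag_name

    @classmethod
    def create(cls, config_dict: dict, ctx: PipelineContext) -> "AddFixedTag":
        return cls(ctx, **(config_dict or {}))

    def transform_aspect(
        self, entity_urn: str, aspect_name: str, aspect: Optional[builder.Aspect]
    ) -> Optional[builder.Aspect]:
        tags = cast(GlobalTagsClass, aspect) or GlobalTagsClass(tags=[])
        tags.tags.append(TagAssociationClass(tag=builder.make_tag_urn(self.tag_name)))
        return cast(Optional[builder.Aspect], tags)

    def handle_end_of_stream(self) -> List[MetadataChangeProposalWrapper]:
        # Emit the tagKey aspect so the tag entity exists even if it was never created elsewhere.
        return [
            MetadataChangeProposalWrapper(
                entityUrn=builder.make_tag_urn(self.tag_name),
                aspect=TagKeyClass(name=self.tag_name),
            )
        ]
```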
diff --git a/metadata-ingestion/tests/unit/test_transform_dataset.py b/metadata-ingestion/tests/unit/test_transform_dataset.py
index 8014df2f5c519d..546549dcf37a4a 100644
--- a/metadata-ingestion/tests/unit/test_transform_dataset.py
+++ b/metadata-ingestion/tests/unit/test_transform_dataset.py
@@ -813,13 +813,25 @@ def test_simple_dataset_tags_transformation(mock_time):
]
)
)
- assert len(outputs) == 3
+
+ assert len(outputs) == 5
# Check that tags were added.
tags_aspect = outputs[1].record.aspect
+ assert tags_aspect.tags[0].tag == builder.make_tag_urn("NeedsDocumentation")
assert tags_aspect
assert len(tags_aspect.tags) == 2
- assert tags_aspect.tags[0].tag == builder.make_tag_urn("NeedsDocumentation")
+
+ # Check new tag entity should be there
+ assert outputs[2].record.aspectName == "tagKey"
+ assert outputs[2].record.aspect.name == "NeedsDocumentation"
+ assert outputs[2].record.entityUrn == builder.make_tag_urn("NeedsDocumentation")
+
+ assert outputs[3].record.aspectName == "tagKey"
+ assert outputs[3].record.aspect.name == "Legacy"
+ assert outputs[3].record.entityUrn == builder.make_tag_urn("Legacy")
+
+ assert isinstance(outputs[4].record, EndOfStream)
def dummy_tag_resolver_method(dataset_snapshot):
@@ -853,7 +865,7 @@ def test_pattern_dataset_tags_transformation(mock_time):
)
)
- assert len(outputs) == 3
+ assert len(outputs) == 5
tags_aspect = outputs[1].record.aspect
assert tags_aspect
assert len(tags_aspect.tags) == 2
@@ -1363,7 +1375,7 @@ def test_mcp_add_tags_missing(mock_time):
]
input_stream.append(RecordEnvelope(record=EndOfStream(), metadata={}))
outputs = list(transformer.transform(input_stream))
- assert len(outputs) == 3
+ assert len(outputs) == 5
assert outputs[0].record == dataset_mcp
# Check that tags were added, this will be the second result
tags_aspect = outputs[1].record.aspect
@@ -1395,13 +1407,23 @@ def test_mcp_add_tags_existing(mock_time):
]
input_stream.append(RecordEnvelope(record=EndOfStream(), metadata={}))
outputs = list(transformer.transform(input_stream))
- assert len(outputs) == 2
+
+ assert len(outputs) == 4
+
# Check that tags were added, this will be the second result
tags_aspect = outputs[0].record.aspect
assert tags_aspect
assert len(tags_aspect.tags) == 3
assert tags_aspect.tags[0].tag == builder.make_tag_urn("Test")
assert tags_aspect.tags[1].tag == builder.make_tag_urn("NeedsDocumentation")
+ assert tags_aspect.tags[2].tag == builder.make_tag_urn("Legacy")
+
+ # Check tag entities got added
+ assert outputs[1].record.entityType == "tag"
+ assert outputs[1].record.entityUrn == builder.make_tag_urn("NeedsDocumentation")
+ assert outputs[2].record.entityType == "tag"
+ assert outputs[2].record.entityUrn == builder.make_tag_urn("Legacy")
+
assert isinstance(outputs[-1].record, EndOfStream)
From ecef50f8fc75309562cf2729380ed18d5020ae8b Mon Sep 17 00:00:00 2001
From: Shirshanka Das
Date: Thu, 14 Dec 2023 08:03:36 -0800
Subject: [PATCH 026/540] fix(ingest): make user_urn and group_urn generation consider user and… (#9026)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Co-authored-by: Aseem Bansal
---
.../src/datahub/emitter/mce_builder.py | 8 +++----
.../tests/unit/test_mce_builder.py | 22 +++++++++++++++++++
2 files changed, 26 insertions(+), 4 deletions(-)
diff --git a/metadata-ingestion/src/datahub/emitter/mce_builder.py b/metadata-ingestion/src/datahub/emitter/mce_builder.py
index 3b2c87ea25a314..9da1b0ab56f890 100644
--- a/metadata-ingestion/src/datahub/emitter/mce_builder.py
+++ b/metadata-ingestion/src/datahub/emitter/mce_builder.py
@@ -193,20 +193,20 @@ def assertion_urn_to_key(assertion_urn: str) -> Optional[AssertionKeyClass]:
def make_user_urn(username: str) -> str:
"""
- Makes a user urn if the input is not a user urn already
+ Makes a user urn if the input is not a user or group urn already
"""
return (
f"urn:li:corpuser:{username}"
- if not username.startswith("urn:li:corpuser:")
+ if not username.startswith(("urn:li:corpuser:", "urn:li:corpGroup:"))
else username
)
def make_group_urn(groupname: str) -> str:
"""
- Makes a group urn if the input is not a group urn already
+ Makes a group urn if the input is not a user or group urn already
"""
- if groupname and groupname.startswith("urn:li:corpGroup:"):
+ if groupname and groupname.startswith(("urn:li:corpGroup:", "urn:li:corpuser:")):
return groupname
else:
return f"urn:li:corpGroup:{groupname}"
diff --git a/metadata-ingestion/tests/unit/test_mce_builder.py b/metadata-ingestion/tests/unit/test_mce_builder.py
index b9025d76a3a1d1..d7c84f7863b407 100644
--- a/metadata-ingestion/tests/unit/test_mce_builder.py
+++ b/metadata-ingestion/tests/unit/test_mce_builder.py
@@ -33,3 +33,25 @@ def test_create_dataset_urn_with_reserved_chars() -> None:
)
== "urn:li:dataset:(urn:li:dataPlatform:platform%29,platform%2Cinstance.table_%28name%29,PROD)"
)
+
+
+def test_make_user_urn() -> None:
+ assert builder.make_user_urn("someUser") == "urn:li:corpuser:someUser"
+ assert (
+ builder.make_user_urn("urn:li:corpuser:someUser") == "urn:li:corpuser:someUser"
+ )
+ assert (
+ builder.make_user_urn("urn:li:corpGroup:someGroup")
+ == "urn:li:corpGroup:someGroup"
+ )
+
+
+def test_make_group_urn() -> None:
+ assert builder.make_group_urn("someGroup") == "urn:li:corpGroup:someGroup"
+ assert (
+ builder.make_group_urn("urn:li:corpGroup:someGroup")
+ == "urn:li:corpGroup:someGroup"
+ )
+ assert (
+ builder.make_group_urn("urn:li:corpuser:someUser") == "urn:li:corpuser:someUser"
+ )
From 1741c07d769f56a9cf066172725384b4e8780839 Mon Sep 17 00:00:00 2001
From: Shubham Jagtap <132359390+shubhamjagtap639@users.noreply.github.com>
Date: Thu, 14 Dec 2023 23:01:51 +0530
Subject: [PATCH 027/540] feat(ingestion): Add test_connection methods for
important sources (#9334)
---
.../datahub/ingestion/source/dbt/dbt_cloud.py | 89 ++--
.../datahub/ingestion/source/dbt/dbt_core.py | 56 ++-
.../src/datahub/ingestion/source/kafka.py | 74 ++-
.../ingestion/source/powerbi/powerbi.py | 22 +-
.../ingestion/source/sql/sql_common.py | 26 +-
.../src/datahub/ingestion/source/tableau.py | 23 +-
.../ingestion/source_config/sql/snowflake.py | 2 +-
.../tests/integration/dbt/test_dbt.py | 69 ++-
.../tests/integration/kafka/test_kafka.py | 85 +++-
.../tests/integration/mysql/test_mysql.py | 38 +-
.../tests/integration/powerbi/test_powerbi.py | 23 +-
.../tableau/test_tableau_ingest.py | 21 +-
.../test_helpers/test_connection_helpers.py | 47 ++
.../tests/unit/test_snowflake_source.py | 428 +++++++-----------
.../tests/unit/test_sql_common.py | 62 ++-
15 files changed, 684 insertions(+), 381 deletions(-)
create mode 100644 metadata-ingestion/tests/test_helpers/test_connection_helpers.py
diff --git a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_cloud.py b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_cloud.py
index a9685b2554553d..069c1f2781460a 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_cloud.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_cloud.py
@@ -14,7 +14,12 @@
platform_name,
support_status,
)
-from datahub.ingestion.api.source import SourceCapability
+from datahub.ingestion.api.source import (
+ CapabilityReport,
+ SourceCapability,
+ TestableSource,
+ TestConnectionReport,
+)
from datahub.ingestion.source.dbt.dbt_common import (
DBTColumn,
DBTCommonConfig,
@@ -177,7 +182,7 @@ class DBTCloudConfig(DBTCommonConfig):
@support_status(SupportStatus.INCUBATING)
@capability(SourceCapability.DELETION_DETECTION, "Enabled via stateful ingestion")
@capability(SourceCapability.LINEAGE_COARSE, "Enabled by default")
-class DBTCloudSource(DBTSourceBase):
+class DBTCloudSource(DBTSourceBase, TestableSource):
"""
This source pulls dbt metadata directly from the dbt Cloud APIs.
@@ -199,6 +204,57 @@ def create(cls, config_dict, ctx):
config = DBTCloudConfig.parse_obj(config_dict)
return cls(config, ctx, "dbt")
+ @staticmethod
+ def test_connection(config_dict: dict) -> TestConnectionReport:
+ test_report = TestConnectionReport()
+ try:
+ source_config = DBTCloudConfig.parse_obj_allow_extras(config_dict)
+ DBTCloudSource._send_graphql_query(
+ metadata_endpoint=source_config.metadata_endpoint,
+ token=source_config.token,
+ query=_DBT_GRAPHQL_QUERY.format(type="tests", fields="jobId"),
+ variables={
+ "jobId": source_config.job_id,
+ "runId": source_config.run_id,
+ },
+ )
+ test_report.basic_connectivity = CapabilityReport(capable=True)
+ except Exception as e:
+ test_report.basic_connectivity = CapabilityReport(
+ capable=False, failure_reason=str(e)
+ )
+ return test_report
+
+ @staticmethod
+ def _send_graphql_query(
+ metadata_endpoint: str, token: str, query: str, variables: Dict
+ ) -> Dict:
+ logger.debug(f"Sending GraphQL query to dbt Cloud: {query}")
+ response = requests.post(
+ metadata_endpoint,
+ json={
+ "query": query,
+ "variables": variables,
+ },
+ headers={
+ "Authorization": f"Bearer {token}",
+ "X-dbt-partner-source": "acryldatahub",
+ },
+ )
+
+ try:
+ res = response.json()
+ if "errors" in res:
+ raise ValueError(
+ f'Unable to fetch metadata from dbt Cloud: {res["errors"]}'
+ )
+ data = res["data"]
+ except JSONDecodeError as e:
+ response.raise_for_status()
+ raise e
+
+ return data
+
def load_nodes(self) -> Tuple[List[DBTNode], Dict[str, Optional[str]]]:
# TODO: In dbt Cloud, commands are scheduled as part of jobs, where
# each job can have multiple runs. We currently only fully support
@@ -213,6 +269,8 @@ def load_nodes(self) -> Tuple[List[DBTNode], Dict[str, Optional[str]]]:
for node_type, fields in _DBT_FIELDS_BY_TYPE.items():
logger.info(f"Fetching {node_type} from dbt Cloud")
data = self._send_graphql_query(
+ metadata_endpoint=self.config.metadata_endpoint,
+ token=self.config.token,
query=_DBT_GRAPHQL_QUERY.format(type=node_type, fields=fields),
variables={
"jobId": self.config.job_id,
@@ -232,33 +290,6 @@ def load_nodes(self) -> Tuple[List[DBTNode], Dict[str, Optional[str]]]:
return nodes, additional_metadata
- def _send_graphql_query(self, query: str, variables: Dict) -> Dict:
- logger.debug(f"Sending GraphQL query to dbt Cloud: {query}")
- response = requests.post(
- self.config.metadata_endpoint,
- json={
- "query": query,
- "variables": variables,
- },
- headers={
- "Authorization": f"Bearer {self.config.token}",
- "X-dbt-partner-source": "acryldatahub",
- },
- )
-
- try:
- res = response.json()
- if "errors" in res:
- raise ValueError(
- f'Unable to fetch metadata from dbt Cloud: {res["errors"]}'
- )
- data = res["data"]
- except JSONDecodeError as e:
- response.raise_for_status()
- raise e
-
- return data
-
def _parse_into_dbt_node(self, node: Dict) -> DBTNode:
key = node["uniqueId"]
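Since `test_connection` is now a static method, a connectivity check against dbt Cloud can be run without building the full source. A hedged sketch (the token and ids below are placeholders):

```python
from datahub.ingestion.source.dbt.dbt_cloud import DBTCloudSource

# Placeholder values; a real check needs a valid dbt Cloud token and job/run ids.
report = DBTCloudSource.test_connection(
    {
        "token": "dbt-cloud-service-token",
        "account_id": 111111,
        "project_id": 222222,
        "job_id": 333333,
        "target_platform": "snowflake",
    }
)
print(report.basic_connectivity)
```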
diff --git a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_core.py b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_core.py
index ac2b2815f3caaa..563b005d7a88d2 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_core.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_core.py
@@ -18,7 +18,12 @@
platform_name,
support_status,
)
-from datahub.ingestion.api.source import SourceCapability
+from datahub.ingestion.api.source import (
+ CapabilityReport,
+ SourceCapability,
+ TestableSource,
+ TestConnectionReport,
+)
from datahub.ingestion.source.aws.aws_common import AwsConnectionConfig
from datahub.ingestion.source.dbt.dbt_common import (
DBTColumn,
@@ -60,11 +65,6 @@ class DBTCoreConfig(DBTCommonConfig):
_github_info_deprecated = pydantic_renamed_field("github_info", "git_info")
- @property
- def s3_client(self):
- assert self.aws_connection
- return self.aws_connection.get_s3_client()
-
@validator("aws_connection")
def aws_connection_needed_if_s3_uris_present(
cls, aws_connection: Optional[AwsConnectionConfig], values: Dict, **kwargs: Any
@@ -363,7 +363,7 @@ def load_test_results(
@support_status(SupportStatus.CERTIFIED)
@capability(SourceCapability.DELETION_DETECTION, "Enabled via stateful ingestion")
@capability(SourceCapability.LINEAGE_COARSE, "Enabled by default")
-class DBTCoreSource(DBTSourceBase):
+class DBTCoreSource(DBTSourceBase, TestableSource):
"""
The artifacts used by this source are:
- [dbt manifest file](https://docs.getdbt.com/reference/artifacts/manifest-json)
@@ -387,12 +387,34 @@ def create(cls, config_dict, ctx):
config = DBTCoreConfig.parse_obj(config_dict)
return cls(config, ctx, "dbt")
- def load_file_as_json(self, uri: str) -> Any:
+ @staticmethod
+ def test_connection(config_dict: dict) -> TestConnectionReport:
+ test_report = TestConnectionReport()
+ try:
+ source_config = DBTCoreConfig.parse_obj_allow_extras(config_dict)
+ DBTCoreSource.load_file_as_json(
+ source_config.manifest_path, source_config.aws_connection
+ )
+ DBTCoreSource.load_file_as_json(
+ source_config.catalog_path, source_config.aws_connection
+ )
+ test_report.basic_connectivity = CapabilityReport(capable=True)
+ except Exception as e:
+ test_report.basic_connectivity = CapabilityReport(
+ capable=False, failure_reason=str(e)
+ )
+ return test_report
+
+ @staticmethod
+ def load_file_as_json(
+ uri: str, aws_connection: Optional[AwsConnectionConfig]
+ ) -> Dict:
if re.match("^https?://", uri):
return json.loads(requests.get(uri).text)
elif re.match("^s3://", uri):
u = urlparse(uri)
- response = self.config.s3_client.get_object(
+ assert aws_connection
+ response = aws_connection.get_s3_client().get_object(
Bucket=u.netloc, Key=u.path.lstrip("/")
)
return json.loads(response["Body"].read().decode("utf-8"))
@@ -410,12 +432,18 @@ def loadManifestAndCatalog(
Optional[str],
Optional[str],
]:
- dbt_manifest_json = self.load_file_as_json(self.config.manifest_path)
+ dbt_manifest_json = self.load_file_as_json(
+ self.config.manifest_path, self.config.aws_connection
+ )
- dbt_catalog_json = self.load_file_as_json(self.config.catalog_path)
+ dbt_catalog_json = self.load_file_as_json(
+ self.config.catalog_path, self.config.aws_connection
+ )
if self.config.sources_path is not None:
- dbt_sources_json = self.load_file_as_json(self.config.sources_path)
+ dbt_sources_json = self.load_file_as_json(
+ self.config.sources_path, self.config.aws_connection
+ )
sources_results = dbt_sources_json["results"]
else:
sources_results = {}
@@ -491,7 +519,9 @@ def load_nodes(self) -> Tuple[List[DBTNode], Dict[str, Optional[str]]]:
# This will populate the test_results field on each test node.
all_nodes = load_test_results(
self.config,
- self.load_file_as_json(self.config.test_results_path),
+ self.load_file_as_json(
+ self.config.test_results_path, self.config.aws_connection
+ ),
all_nodes,
)
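Making `load_file_as_json` static, with the AWS connection passed in explicitly, is what lets `test_connection` reuse it without a source instance. A brief sketch of the two call shapes (paths, bucket, and region are placeholders):

```python
from datahub.ingestion.source.aws.aws_common import AwsConnectionConfig
from datahub.ingestion.source.dbt.dbt_core import DBTCoreSource

# Local or http(s) artifacts need no AWS connection.
manifest = DBTCoreSource.load_file_as_json("./target/manifest.json", None)

# s3:// URIs require an AwsConnectionConfig so an S3 client can be built.
aws = AwsConnectionConfig(aws_region="us-east-1")
catalog = DBTCoreSource.load_file_as_json("s3://my-bucket/target/catalog.json", aws)
```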
diff --git a/metadata-ingestion/src/datahub/ingestion/source/kafka.py b/metadata-ingestion/src/datahub/ingestion/source/kafka.py
index 25520e7aa66fff..99ef737206ab0c 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/kafka.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/kafka.py
@@ -15,6 +15,7 @@
ConfigResource,
TopicMetadata,
)
+from confluent_kafka.schema_registry.schema_registry_client import SchemaRegistryClient
from datahub.configuration.common import AllowDenyPattern
from datahub.configuration.kafka import KafkaConsumerConnectionConfig
@@ -40,7 +41,13 @@
support_status,
)
from datahub.ingestion.api.registry import import_path
-from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceCapability
+from datahub.ingestion.api.source import (
+ CapabilityReport,
+ MetadataWorkUnitProcessor,
+ SourceCapability,
+ TestableSource,
+ TestConnectionReport,
+)
from datahub.ingestion.api.workunit import MetadataWorkUnit
from datahub.ingestion.source.common.subtypes import DatasetSubTypes
from datahub.ingestion.source.kafka_schema_registry_base import KafkaSchemaRegistryBase
@@ -133,6 +140,18 @@ class KafkaSourceConfig(
)
+def get_kafka_consumer(
+ connection: KafkaConsumerConnectionConfig,
+) -> confluent_kafka.Consumer:
+ return confluent_kafka.Consumer(
+ {
+ "group.id": "test",
+ "bootstrap.servers": connection.bootstrap,
+ **connection.consumer_config,
+ }
+ )
+
+
@dataclass
class KafkaSourceReport(StaleEntityRemovalSourceReport):
topics_scanned: int = 0
@@ -145,6 +164,45 @@ def report_dropped(self, topic: str) -> None:
self.filtered.append(topic)
+class KafkaConnectionTest:
+ def __init__(self, config_dict: dict):
+ self.config = KafkaSourceConfig.parse_obj_allow_extras(config_dict)
+ self.report = KafkaSourceReport()
+ self.consumer: confluent_kafka.Consumer = get_kafka_consumer(
+ self.config.connection
+ )
+
+ def get_connection_test(self) -> TestConnectionReport:
+ capability_report = {
+ SourceCapability.SCHEMA_METADATA: self.schema_registry_connectivity(),
+ }
+ return TestConnectionReport(
+ basic_connectivity=self.basic_connectivity(),
+ capability_report={
+ k: v for k, v in capability_report.items() if v is not None
+ },
+ )
+
+ def basic_connectivity(self) -> CapabilityReport:
+ try:
+ self.consumer.list_topics(timeout=10)
+ return CapabilityReport(capable=True)
+ except Exception as e:
+ return CapabilityReport(capable=False, failure_reason=str(e))
+
+ def schema_registry_connectivity(self) -> CapabilityReport:
+ try:
+ SchemaRegistryClient(
+ {
+ "url": self.config.connection.schema_registry_url,
+ **self.config.connection.schema_registry_config,
+ }
+ ).get_subjects()
+ return CapabilityReport(capable=True)
+ except Exception as e:
+ return CapabilityReport(capable=False, failure_reason=str(e))
+
+
@platform_name("Kafka")
@config_class(KafkaSourceConfig)
@support_status(SupportStatus.CERTIFIED)
@@ -160,7 +218,7 @@ def report_dropped(self, topic: str) -> None:
SourceCapability.SCHEMA_METADATA,
"Schemas associated with each topic are extracted from the schema registry. Avro and Protobuf (certified), JSON (incubating). Schema references are supported.",
)
-class KafkaSource(StatefulIngestionSourceBase):
+class KafkaSource(StatefulIngestionSourceBase, TestableSource):
"""
This plugin extracts the following:
- Topics from the Kafka broker
@@ -183,12 +241,8 @@ def create_schema_registry(
def __init__(self, config: KafkaSourceConfig, ctx: PipelineContext):
super().__init__(config, ctx)
self.source_config: KafkaSourceConfig = config
- self.consumer: confluent_kafka.Consumer = confluent_kafka.Consumer(
- {
- "group.id": "test",
- "bootstrap.servers": self.source_config.connection.bootstrap,
- **self.source_config.connection.consumer_config,
- }
+ self.consumer: confluent_kafka.Consumer = get_kafka_consumer(
+ self.source_config.connection
)
self.init_kafka_admin_client()
self.report: KafkaSourceReport = KafkaSourceReport()
@@ -226,6 +280,10 @@ def init_kafka_admin_client(self) -> None:
f"Failed to create Kafka Admin Client due to error {e}.",
)
+ @staticmethod
+ def test_connection(config_dict: dict) -> TestConnectionReport:
+ return KafkaConnectionTest(config_dict).get_connection_test()
+
@classmethod
def create(cls, config_dict: Dict, ctx: PipelineContext) -> "KafkaSource":
config: KafkaSourceConfig = KafkaSourceConfig.parse_obj(config_dict)
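`KafkaConnectionTest` probes both the broker (via `list_topics`) and the schema registry, reporting the latter under the `SCHEMA_METADATA` capability. A hedged sketch of invoking the check (endpoints are placeholders):

```python
from datahub.ingestion.source.kafka import KafkaSource

# Placeholder endpoints; point these at a reachable broker and schema registry.
report = KafkaSource.test_connection(
    {
        "connection": {
            "bootstrap": "localhost:9092",
            "schema_registry_url": "http://localhost:8081",
        }
    }
)
print(report.basic_connectivity)
print(report.capability_report)
```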
diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py
index 4b1d0403ac7760..cdf7c975c0614f 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py
@@ -19,7 +19,13 @@
platform_name,
support_status,
)
-from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceReport
+from datahub.ingestion.api.source import (
+ CapabilityReport,
+ MetadataWorkUnitProcessor,
+ SourceReport,
+ TestableSource,
+ TestConnectionReport,
+)
from datahub.ingestion.api.source_helpers import auto_workunit
from datahub.ingestion.api.workunit import MetadataWorkUnit
from datahub.ingestion.source.common.subtypes import (
@@ -1147,7 +1153,7 @@ def report_to_datahub_work_units(
SourceCapability.LINEAGE_FINE,
"Disabled by default, configured using `extract_column_level_lineage`. ",
)
-class PowerBiDashboardSource(StatefulIngestionSourceBase):
+class PowerBiDashboardSource(StatefulIngestionSourceBase, TestableSource):
"""
This plugin extracts the following:
- Power BI dashboards, tiles and datasets
@@ -1186,6 +1192,18 @@ def __init__(self, config: PowerBiDashboardSourceConfig, ctx: PipelineContext):
self, self.source_config, self.ctx
)
+ @staticmethod
+ def test_connection(config_dict: dict) -> TestConnectionReport:
+ test_report = TestConnectionReport()
+ try:
+ PowerBiAPI(PowerBiDashboardSourceConfig.parse_obj_allow_extras(config_dict))
+ test_report.basic_connectivity = CapabilityReport(capable=True)
+ except Exception as e:
+ test_report.basic_connectivity = CapabilityReport(
+ capable=False, failure_reason=str(e)
+ )
+ return test_report
+
@classmethod
def create(cls, config_dict, ctx):
config = PowerBiDashboardSourceConfig.parse_obj(config_dict)
diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py
index 590bc7f696784e..a831dfa50342d7 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py
@@ -15,6 +15,7 @@
Tuple,
Type,
Union,
+ cast,
)
import sqlalchemy.dialects.postgresql.base
@@ -35,7 +36,12 @@
from datahub.emitter.sql_parsing_builder import SqlParsingBuilder
from datahub.ingestion.api.common import PipelineContext
from datahub.ingestion.api.incremental_lineage_helper import auto_incremental_lineage
-from datahub.ingestion.api.source import MetadataWorkUnitProcessor
+from datahub.ingestion.api.source import (
+ CapabilityReport,
+ MetadataWorkUnitProcessor,
+ TestableSource,
+ TestConnectionReport,
+)
from datahub.ingestion.api.workunit import MetadataWorkUnit
from datahub.ingestion.source.common.subtypes import (
DatasetContainerSubTypes,
@@ -298,7 +304,7 @@ class ProfileMetadata:
dataset_name_to_storage_bytes: Dict[str, int] = field(default_factory=dict)
-class SQLAlchemySource(StatefulIngestionSourceBase):
+class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
"""A Base class for all SQL Sources that use SQLAlchemy to extend"""
def __init__(self, config: SQLCommonConfig, ctx: PipelineContext, platform: str):
@@ -348,6 +354,22 @@ def __init__(self, config: SQLCommonConfig, ctx: PipelineContext, platform: str)
else:
self._view_definition_cache = {}
+ @classmethod
+ def test_connection(cls, config_dict: dict) -> TestConnectionReport:
+ test_report = TestConnectionReport()
+ try:
+ source = cast(
+ SQLAlchemySource,
+ cls.create(config_dict, PipelineContext(run_id="test_connection")),
+ )
+ list(source.get_inspectors())
+ test_report.basic_connectivity = CapabilityReport(capable=True)
+ except Exception as e:
+ test_report.basic_connectivity = CapabilityReport(
+ capable=False, failure_reason=str(e)
+ )
+ return test_report
+
def warn(self, log: logging.Logger, key: str, reason: str) -> None:
self.report.report_warning(key, reason[:100])
log.warning(f"{key} => {reason}")
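Because the check lives on `SQLAlchemySource` as a classmethod, every SQLAlchemy-backed source inherits it for free. A hedged sketch using the MySQL source (credentials are placeholders):

```python
from datahub.ingestion.source.sql.mysql import MySQLSource

# Placeholder credentials; the report captures either success or the failure reason.
report = MySQLSource.test_connection(
    {
        "host_port": "localhost:3306",
        "database": "northwind",
        "username": "root",
        "password": "example",
    }
)
print(report.basic_connectivity)
```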
diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau.py b/metadata-ingestion/src/datahub/ingestion/source/tableau.py
index f870e99df27c5f..ed5fe543310b8f 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/tableau.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/tableau.py
@@ -58,7 +58,13 @@
platform_name,
support_status,
)
-from datahub.ingestion.api.source import MetadataWorkUnitProcessor, Source
+from datahub.ingestion.api.source import (
+ CapabilityReport,
+ MetadataWorkUnitProcessor,
+ Source,
+ TestableSource,
+ TestConnectionReport,
+)
from datahub.ingestion.api.workunit import MetadataWorkUnit
from datahub.ingestion.source import tableau_constant as c
from datahub.ingestion.source.common.subtypes import (
@@ -469,7 +475,7 @@ class TableauSourceReport(StaleEntityRemovalSourceReport):
SourceCapability.LINEAGE_FINE,
"Enabled by default, configure using `extract_column_level_lineage`",
)
-class TableauSource(StatefulIngestionSourceBase):
+class TableauSource(StatefulIngestionSourceBase, TestableSource):
platform = "tableau"
def __hash__(self):
@@ -509,6 +515,19 @@ def __init__(
self._authenticate()
+ @staticmethod
+ def test_connection(config_dict: dict) -> TestConnectionReport:
+ test_report = TestConnectionReport()
+ try:
+ source_config = TableauConfig.parse_obj_allow_extras(config_dict)
+ source_config.make_tableau_client()
+ test_report.basic_connectivity = CapabilityReport(capable=True)
+ except Exception as e:
+ test_report.basic_connectivity = CapabilityReport(
+ capable=False, failure_reason=str(e)
+ )
+ return test_report
+
def close(self) -> None:
try:
if self.server is not None:
diff --git a/metadata-ingestion/src/datahub/ingestion/source_config/sql/snowflake.py b/metadata-ingestion/src/datahub/ingestion/source_config/sql/snowflake.py
index ccc4e115729a2c..46bd24c7e1f4c3 100644
--- a/metadata-ingestion/src/datahub/ingestion/source_config/sql/snowflake.py
+++ b/metadata-ingestion/src/datahub/ingestion/source_config/sql/snowflake.py
@@ -143,7 +143,7 @@ def _check_oauth_config(oauth_config: Optional[OAuthConfiguration]) -> None:
"'oauth_config' is none but should be set when using OAUTH_AUTHENTICATOR authentication"
)
if oauth_config.use_certificate is True:
- if oauth_config.provider == OAuthIdentityProvider.OKTA.value:
+ if oauth_config.provider == OAuthIdentityProvider.OKTA:
raise ValueError(
"Certificate authentication is not supported for Okta."
)
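The Snowflake one-liner fixes a comparison of an enum member against its `.value`, which never matches once `provider` has been parsed into an `OAuthIdentityProvider` member. A small self-contained illustration of the pitfall (generic enum, not the actual Snowflake config classes):

```python
from enum import Enum


class Provider(Enum):
    OKTA = "okta"
    MICROSOFT = "microsoft"


provider = Provider.OKTA

# Comparing an enum member to its raw string value is always False for a plain Enum...
assert (provider == Provider.OKTA.value) is False
# ...so the guard must compare member to member, as the fixed code does.
assert provider == Provider.OKTA
```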
diff --git a/metadata-ingestion/tests/integration/dbt/test_dbt.py b/metadata-ingestion/tests/integration/dbt/test_dbt.py
index 95b5374bbb41df..587831495c1ea7 100644
--- a/metadata-ingestion/tests/integration/dbt/test_dbt.py
+++ b/metadata-ingestion/tests/integration/dbt/test_dbt.py
@@ -10,20 +10,25 @@
from datahub.ingestion.run.pipeline import Pipeline
from datahub.ingestion.run.pipeline_config import PipelineConfig, SourceConfig
from datahub.ingestion.source.dbt.dbt_common import DBTEntitiesEnabled, EmitDirective
-from datahub.ingestion.source.dbt.dbt_core import DBTCoreConfig
+from datahub.ingestion.source.dbt.dbt_core import DBTCoreConfig, DBTCoreSource
from datahub.ingestion.source.sql.sql_types import (
ATHENA_SQL_TYPES_MAP,
TRINO_SQL_TYPES_MAP,
resolve_athena_modified_type,
resolve_trino_modified_type,
)
-from tests.test_helpers import mce_helpers
+from tests.test_helpers import mce_helpers, test_connection_helpers
FROZEN_TIME = "2022-02-03 07:00:00"
GMS_PORT = 8080
GMS_SERVER = f"http://localhost:{GMS_PORT}"
+@pytest.fixture(scope="module")
+def test_resources_dir(pytestconfig):
+ return pytestconfig.rootpath / "tests/integration/dbt"
+
+
@dataclass
class DbtTestConfig:
run_id: str
@@ -195,7 +200,14 @@ def set_paths(
)
@pytest.mark.integration
@freeze_time(FROZEN_TIME)
-def test_dbt_ingest(dbt_test_config, pytestconfig, tmp_path, mock_time, requests_mock):
+def test_dbt_ingest(
+ dbt_test_config,
+ test_resources_dir,
+ pytestconfig,
+ tmp_path,
+ mock_time,
+ requests_mock,
+):
config: DbtTestConfig = dbt_test_config
test_resources_dir = pytestconfig.rootpath / "tests/integration/dbt"
@@ -233,11 +245,48 @@ def test_dbt_ingest(dbt_test_config, pytestconfig, tmp_path, mock_time, requests
)
+@pytest.mark.parametrize(
+ "config_dict, is_success",
+ [
+ (
+ {
+ "manifest_path": "dbt_manifest.json",
+ "catalog_path": "dbt_catalog.json",
+ "target_platform": "postgres",
+ },
+ True,
+ ),
+ (
+ {
+ "manifest_path": "dbt_manifest.json",
+ "catalog_path": "dbt_catalog-this-file-does-not-exist.json",
+ "target_platform": "postgres",
+ },
+ False,
+ ),
+ ],
+)
@pytest.mark.integration
@freeze_time(FROZEN_TIME)
-def test_dbt_tests(pytestconfig, tmp_path, mock_time, **kwargs):
- test_resources_dir = pytestconfig.rootpath / "tests/integration/dbt"
+def test_dbt_test_connection(test_resources_dir, config_dict, is_success):
+ config_dict["manifest_path"] = str(
+ (test_resources_dir / config_dict["manifest_path"]).resolve()
+ )
+ config_dict["catalog_path"] = str(
+ (test_resources_dir / config_dict["catalog_path"]).resolve()
+ )
+ report = test_connection_helpers.run_test_connection(DBTCoreSource, config_dict)
+ if is_success:
+ test_connection_helpers.assert_basic_connectivity_success(report)
+ else:
+ test_connection_helpers.assert_basic_connectivity_failure(
+ report, "No such file or directory"
+ )
+
+@pytest.mark.integration
+@freeze_time(FROZEN_TIME)
+def test_dbt_tests(test_resources_dir, pytestconfig, tmp_path, mock_time, **kwargs):
# Run the metadata ingestion pipeline.
output_file = tmp_path / "dbt_test_events.json"
golden_path = test_resources_dir / "dbt_test_events_golden.json"
@@ -340,9 +389,9 @@ def test_resolve_athena_modified_type(data_type, expected_data_type):
@pytest.mark.integration
@freeze_time(FROZEN_TIME)
-def test_dbt_tests_only_assertions(pytestconfig, tmp_path, mock_time, **kwargs):
- test_resources_dir = pytestconfig.rootpath / "tests/integration/dbt"
-
+def test_dbt_tests_only_assertions(
+ test_resources_dir, pytestconfig, tmp_path, mock_time, **kwargs
+):
# Run the metadata ingestion pipeline.
output_file = tmp_path / "test_only_assertions.json"
@@ -418,10 +467,8 @@ def test_dbt_tests_only_assertions(pytestconfig, tmp_path, mock_time, **kwargs):
@pytest.mark.integration
@freeze_time(FROZEN_TIME)
def test_dbt_only_test_definitions_and_results(
- pytestconfig, tmp_path, mock_time, **kwargs
+ test_resources_dir, pytestconfig, tmp_path, mock_time, **kwargs
):
- test_resources_dir = pytestconfig.rootpath / "tests/integration/dbt"
-
# Run the metadata ingestion pipeline.
output_file = tmp_path / "test_only_definitions_and_assertions.json"
diff --git a/metadata-ingestion/tests/integration/kafka/test_kafka.py b/metadata-ingestion/tests/integration/kafka/test_kafka.py
index 63d284801c94cd..dfdbea5de5cbfd 100644
--- a/metadata-ingestion/tests/integration/kafka/test_kafka.py
+++ b/metadata-ingestion/tests/integration/kafka/test_kafka.py
@@ -3,18 +3,22 @@
import pytest
from freezegun import freeze_time
-from tests.test_helpers import mce_helpers
+from datahub.ingestion.api.source import SourceCapability
+from datahub.ingestion.source.kafka import KafkaSource
+from tests.test_helpers import mce_helpers, test_connection_helpers
from tests.test_helpers.click_helpers import run_datahub_cmd
from tests.test_helpers.docker_helpers import wait_for_port
FROZEN_TIME = "2020-04-14 07:00:00"
-@freeze_time(FROZEN_TIME)
-@pytest.mark.integration
-def test_kafka_ingest(docker_compose_runner, pytestconfig, tmp_path, mock_time):
- test_resources_dir = pytestconfig.rootpath / "tests/integration/kafka"
+@pytest.fixture(scope="module")
+def test_resources_dir(pytestconfig):
+ return pytestconfig.rootpath / "tests/integration/kafka"
+
+@pytest.fixture(scope="module")
+def mock_kafka_service(docker_compose_runner, test_resources_dir):
with docker_compose_runner(
test_resources_dir / "docker-compose.yml", "kafka", cleanup=False
) as docker_services:
@@ -31,14 +35,67 @@ def test_kafka_ingest(docker_compose_runner, pytestconfig, tmp_path, mock_time):
command = f"{test_resources_dir}/send_records.sh {test_resources_dir}"
subprocess.run(command, shell=True, check=True)
- # Run the metadata ingestion pipeline.
- config_file = (test_resources_dir / "kafka_to_file.yml").resolve()
- run_datahub_cmd(["ingest", "-c", f"{config_file}"], tmp_path=tmp_path)
+ yield docker_compose_runner
+
+
+@freeze_time(FROZEN_TIME)
+@pytest.mark.integration
+def test_kafka_ingest(
+ mock_kafka_service, test_resources_dir, pytestconfig, tmp_path, mock_time
+):
+ # Run the metadata ingestion pipeline.
+ config_file = (test_resources_dir / "kafka_to_file.yml").resolve()
+ run_datahub_cmd(["ingest", "-c", f"{config_file}"], tmp_path=tmp_path)
- # Verify the output.
- mce_helpers.check_golden_file(
- pytestconfig,
- output_path=tmp_path / "kafka_mces.json",
- golden_path=test_resources_dir / "kafka_mces_golden.json",
- ignore_paths=[],
+ # Verify the output.
+ mce_helpers.check_golden_file(
+ pytestconfig,
+ output_path=tmp_path / "kafka_mces.json",
+ golden_path=test_resources_dir / "kafka_mces_golden.json",
+ ignore_paths=[],
+ )
+
+
+@pytest.mark.parametrize(
+ "config_dict, is_success",
+ [
+ (
+ {
+ "connection": {
+ "bootstrap": "localhost:29092",
+ "schema_registry_url": "http://localhost:28081",
+ },
+ },
+ True,
+ ),
+ (
+ {
+ "connection": {
+ "bootstrap": "localhost:2909",
+ "schema_registry_url": "http://localhost:2808",
+ },
+ },
+ False,
+ ),
+ ],
+)
+@pytest.mark.integration
+@freeze_time(FROZEN_TIME)
+def test_kafka_test_connection(mock_kafka_service, config_dict, is_success):
+ report = test_connection_helpers.run_test_connection(KafkaSource, config_dict)
+ if is_success:
+ test_connection_helpers.assert_basic_connectivity_success(report)
+ test_connection_helpers.assert_capability_report(
+ capability_report=report.capability_report,
+ success_capabilities=[SourceCapability.SCHEMA_METADATA],
+ )
+ else:
+ test_connection_helpers.assert_basic_connectivity_failure(
+ report, "Failed to get metadata"
+ )
+ test_connection_helpers.assert_capability_report(
+ capability_report=report.capability_report,
+ failure_capabilities={
+ SourceCapability.SCHEMA_METADATA: "Failed to establish a new connection"
+ },
)
diff --git a/metadata-ingestion/tests/integration/mysql/test_mysql.py b/metadata-ingestion/tests/integration/mysql/test_mysql.py
index 23fd97ff2671ed..c19198c7d2bbd0 100644
--- a/metadata-ingestion/tests/integration/mysql/test_mysql.py
+++ b/metadata-ingestion/tests/integration/mysql/test_mysql.py
@@ -3,7 +3,8 @@
import pytest
from freezegun import freeze_time
-from tests.test_helpers import mce_helpers
+from datahub.ingestion.source.sql.mysql import MySQLSource
+from tests.test_helpers import mce_helpers, test_connection_helpers
from tests.test_helpers.click_helpers import run_datahub_cmd
from tests.test_helpers.docker_helpers import wait_for_port
@@ -75,3 +76,38 @@ def test_mysql_ingest_no_db(
output_path=tmp_path / "mysql_mces.json",
golden_path=test_resources_dir / golden_file,
)
+
+
+@pytest.mark.parametrize(
+ "config_dict, is_success",
+ [
+ (
+ {
+ "host_port": "localhost:53307",
+ "database": "northwind",
+ "username": "root",
+ "password": "example",
+ },
+ True,
+ ),
+ (
+ {
+ "host_port": "localhost:5330",
+ "database": "wrong_db",
+ "username": "wrong_user",
+ "password": "wrong_pass",
+ },
+ False,
+ ),
+ ],
+)
+@freeze_time(FROZEN_TIME)
+@pytest.mark.integration
+def test_mysql_test_connection(mysql_runner, config_dict, is_success):
+ report = test_connection_helpers.run_test_connection(MySQLSource, config_dict)
+ if is_success:
+ test_connection_helpers.assert_basic_connectivity_success(report)
+ else:
+ test_connection_helpers.assert_basic_connectivity_failure(
+ report, "Connection refused"
+ )
diff --git a/metadata-ingestion/tests/integration/powerbi/test_powerbi.py b/metadata-ingestion/tests/integration/powerbi/test_powerbi.py
index b2cbccf983eb0c..4e8469f919db9c 100644
--- a/metadata-ingestion/tests/integration/powerbi/test_powerbi.py
+++ b/metadata-ingestion/tests/integration/powerbi/test_powerbi.py
@@ -21,7 +21,7 @@
Report,
Workspace,
)
-from tests.test_helpers import mce_helpers
+from tests.test_helpers import mce_helpers, test_connection_helpers
pytestmark = pytest.mark.integration_batch_2
FROZEN_TIME = "2022-02-03 07:00:00"
@@ -681,6 +681,27 @@ def test_powerbi_ingest(
)
+@freeze_time(FROZEN_TIME)
+@mock.patch("msal.ConfidentialClientApplication", side_effect=mock_msal_cca)
+@pytest.mark.integration
+def test_powerbi_test_connection_success(mock_msal):
+ report = test_connection_helpers.run_test_connection(
+ PowerBiDashboardSource, default_source_config()
+ )
+ test_connection_helpers.assert_basic_connectivity_success(report)
+
+
+@freeze_time(FROZEN_TIME)
+@pytest.mark.integration
+def test_powerbi_test_connection_failure():
+ report = test_connection_helpers.run_test_connection(
+ PowerBiDashboardSource, default_source_config()
+ )
+ test_connection_helpers.assert_basic_connectivity_failure(
+ report, "Unable to get authority configuration"
+ )
+
+
@freeze_time(FROZEN_TIME)
@mock.patch("msal.ConfidentialClientApplication", side_effect=mock_msal_cca)
@pytest.mark.integration
diff --git a/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py b/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py
index 0510f4a40f6597..90fa71013338da 100644
--- a/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py
+++ b/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py
@@ -28,7 +28,7 @@
)
from datahub.metadata.schema_classes import MetadataChangeProposalClass, UpstreamClass
from datahub.utilities.sqlglot_lineage import SqlParsingResult
-from tests.test_helpers import mce_helpers
+from tests.test_helpers import mce_helpers, test_connection_helpers
from tests.test_helpers.state_helpers import (
get_current_checkpoint_from_pipeline,
validate_all_providers_have_committed_successfully,
@@ -290,6 +290,25 @@ def test_tableau_ingest(pytestconfig, tmp_path, mock_datahub_graph):
)
+@freeze_time(FROZEN_TIME)
+@pytest.mark.integration
+def test_tableau_test_connection_success():
+ with mock.patch("datahub.ingestion.source.tableau.Server"):
+ report = test_connection_helpers.run_test_connection(
+ TableauSource, config_source_default
+ )
+ test_connection_helpers.assert_basic_connectivity_success(report)
+
+
+@freeze_time(FROZEN_TIME)
+@pytest.mark.integration
+def test_tableau_test_connection_failure():
+ report = test_connection_helpers.run_test_connection(
+ TableauSource, config_source_default
+ )
+ test_connection_helpers.assert_basic_connectivity_failure(report, "Unable to login")
+
+
@freeze_time(FROZEN_TIME)
@pytest.mark.integration
def test_tableau_cll_ingest(pytestconfig, tmp_path, mock_datahub_graph):
diff --git a/metadata-ingestion/tests/test_helpers/test_connection_helpers.py b/metadata-ingestion/tests/test_helpers/test_connection_helpers.py
new file mode 100644
index 00000000000000..45543033ae010c
--- /dev/null
+++ b/metadata-ingestion/tests/test_helpers/test_connection_helpers.py
@@ -0,0 +1,47 @@
+from typing import Dict, List, Optional, Type, Union
+
+from datahub.ingestion.api.source import (
+ CapabilityReport,
+ SourceCapability,
+ TestableSource,
+ TestConnectionReport,
+)
+
+
+def run_test_connection(
+ source_cls: Type[TestableSource], config_dict: Dict
+) -> TestConnectionReport:
+ return source_cls.test_connection(config_dict)
+
+
+def assert_basic_connectivity_success(report: TestConnectionReport) -> None:
+ assert report is not None
+ assert report.basic_connectivity
+ assert report.basic_connectivity.capable
+ assert report.basic_connectivity.failure_reason is None
+
+
+def assert_basic_connectivity_failure(
+ report: TestConnectionReport, expected_reason: str
+) -> None:
+ assert report is not None
+ assert report.basic_connectivity
+ assert not report.basic_connectivity.capable
+ assert report.basic_connectivity.failure_reason
+ assert expected_reason in report.basic_connectivity.failure_reason
+
+
+def assert_capability_report(
+ capability_report: Optional[Dict[Union[SourceCapability, str], CapabilityReport]],
+ success_capabilities: List[SourceCapability] = [],
+ failure_capabilities: Dict[SourceCapability, str] = {},
+) -> None:
+ assert capability_report
+ for capability in success_capabilities:
+ assert capability_report[capability]
+ assert capability_report[capability].failure_reason is None
+ for capability, expected_reason in failure_capabilities.items():
+ assert not capability_report[capability].capable
+ failure_reason = capability_report[capability].failure_reason
+ assert failure_reason
+ assert expected_reason in failure_reason
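
A minimal sketch (hypothetical source class and config fixtures, not taken from this patch) of how a source's test module can reuse these shared helpers instead of repeating the connectivity asserts:

```python
# Minimal sketch, assuming a TestableSource subclass plus pytest fixtures that
# provide a working config and a config pointing at an unreachable host.
from tests.test_helpers import test_connection_helpers


def test_my_source_test_connection(my_source_cls, good_config_dict, bad_config_dict):
    # Happy path: basic connectivity succeeds and no failure reason is recorded.
    report = test_connection_helpers.run_test_connection(my_source_cls, good_config_dict)
    test_connection_helpers.assert_basic_connectivity_success(report)

    # Failure path: the recorded failure reason should mention the refused connection.
    report = test_connection_helpers.run_test_connection(my_source_cls, bad_config_dict)
    test_connection_helpers.assert_basic_connectivity_failure(report, "Connection refused")
```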
diff --git a/metadata-ingestion/tests/unit/test_snowflake_source.py b/metadata-ingestion/tests/unit/test_snowflake_source.py
index 343f4466fd6fdf..536c91ace4f5ed 100644
--- a/metadata-ingestion/tests/unit/test_snowflake_source.py
+++ b/metadata-ingestion/tests/unit/test_snowflake_source.py
@@ -1,3 +1,4 @@
+from typing import Any, Dict
from unittest.mock import MagicMock, patch
import pytest
@@ -24,10 +25,20 @@
SnowflakeObjectAccessEntry,
)
from datahub.ingestion.source.snowflake.snowflake_v2 import SnowflakeV2Source
+from tests.test_helpers import test_connection_helpers
+
+default_oauth_dict: Dict[str, Any] = {
+ "client_id": "client_id",
+ "client_secret": "secret",
+ "use_certificate": False,
+ "provider": "microsoft",
+ "scopes": ["datahub_role"],
+ "authority_url": "https://dev-abc.okta.com/oauth2/def/v1/token",
+}
def test_snowflake_source_throws_error_on_account_id_missing():
- with pytest.raises(ValidationError):
+ with pytest.raises(ValidationError, match="account_id\n field required"):
SnowflakeV2Config.parse_obj(
{
"username": "user",
@@ -37,27 +48,21 @@ def test_snowflake_source_throws_error_on_account_id_missing():
def test_no_client_id_invalid_oauth_config():
- oauth_dict = {
- "provider": "microsoft",
- "scopes": ["https://microsoft.com/f4b353d5-ef8d/.default"],
- "client_secret": "6Hb9apkbc6HD7",
- "authority_url": "https://login.microsoftonline.com/yourorganisation.com",
- }
- with pytest.raises(ValueError):
+ oauth_dict = default_oauth_dict.copy()
+ del oauth_dict["client_id"]
+ with pytest.raises(ValueError, match="client_id\n field required"):
OAuthConfiguration.parse_obj(oauth_dict)
def test_snowflake_throws_error_on_client_secret_missing_if_use_certificate_is_false():
- oauth_dict = {
- "client_id": "882e9831-7ea51cb2b954",
- "provider": "microsoft",
- "scopes": ["https://microsoft.com/f4b353d5-ef8d/.default"],
- "use_certificate": False,
- "authority_url": "https://login.microsoftonline.com/yourorganisation.com",
- }
+ oauth_dict = default_oauth_dict.copy()
+ del oauth_dict["client_secret"]
OAuthConfiguration.parse_obj(oauth_dict)
- with pytest.raises(ValueError):
+ with pytest.raises(
+ ValueError,
+ match="'oauth_config.client_secret' was none but should be set when using use_certificate false for oauth_config",
+ ):
SnowflakeV2Config.parse_obj(
{
"account_id": "test",
@@ -68,16 +73,13 @@ def test_snowflake_throws_error_on_client_secret_missing_if_use_certificate_is_f
def test_snowflake_throws_error_on_encoded_oauth_private_key_missing_if_use_certificate_is_true():
- oauth_dict = {
- "client_id": "882e9831-7ea51cb2b954",
- "provider": "microsoft",
- "scopes": ["https://microsoft.com/f4b353d5-ef8d/.default"],
- "use_certificate": True,
- "authority_url": "https://login.microsoftonline.com/yourorganisation.com",
- "encoded_oauth_public_key": "fkdsfhkshfkjsdfiuwrwfkjhsfskfhksjf==",
- }
+ oauth_dict = default_oauth_dict.copy()
+ oauth_dict["use_certificate"] = True
OAuthConfiguration.parse_obj(oauth_dict)
- with pytest.raises(ValueError):
+ with pytest.raises(
+ ValueError,
+ match="'base64_encoded_oauth_private_key' was none but should be set when using certificate for oauth_config",
+ ):
SnowflakeV2Config.parse_obj(
{
"account_id": "test",
@@ -88,16 +90,13 @@ def test_snowflake_throws_error_on_encoded_oauth_private_key_missing_if_use_cert
def test_snowflake_oauth_okta_does_not_support_certificate():
- oauth_dict = {
- "client_id": "882e9831-7ea51cb2b954",
- "provider": "okta",
- "scopes": ["https://microsoft.com/f4b353d5-ef8d/.default"],
- "use_certificate": True,
- "authority_url": "https://login.microsoftonline.com/yourorganisation.com",
- "encoded_oauth_public_key": "fkdsfhkshfkjsdfiuwrwfkjhsfskfhksjf==",
- }
+ oauth_dict = default_oauth_dict.copy()
+ oauth_dict["use_certificate"] = True
+ oauth_dict["provider"] = "okta"
OAuthConfiguration.parse_obj(oauth_dict)
- with pytest.raises(ValueError):
+ with pytest.raises(
+ ValueError, match="Certificate authentication is not supported for Okta."
+ ):
SnowflakeV2Config.parse_obj(
{
"account_id": "test",
@@ -108,79 +107,52 @@ def test_snowflake_oauth_okta_does_not_support_certificate():
def test_snowflake_oauth_happy_paths():
- okta_dict = {
- "client_id": "client_id",
- "client_secret": "secret",
- "provider": "okta",
- "scopes": ["datahub_role"],
- "authority_url": "https://dev-abc.okta.com/oauth2/def/v1/token",
- }
+ oauth_dict = default_oauth_dict.copy()
+ oauth_dict["provider"] = "okta"
assert SnowflakeV2Config.parse_obj(
{
"account_id": "test",
"authentication_type": "OAUTH_AUTHENTICATOR",
- "oauth_config": okta_dict,
+ "oauth_config": oauth_dict,
}
)
-
- microsoft_dict = {
- "client_id": "client_id",
- "provider": "microsoft",
- "scopes": ["https://microsoft.com/f4b353d5-ef8d/.default"],
- "use_certificate": True,
- "authority_url": "https://login.microsoftonline.com/yourorganisation.com",
- "encoded_oauth_public_key": "publickey",
- "encoded_oauth_private_key": "privatekey",
- }
+ oauth_dict["use_certificate"] = True
+ oauth_dict["provider"] = "microsoft"
+ oauth_dict["encoded_oauth_public_key"] = "publickey"
+ oauth_dict["encoded_oauth_private_key"] = "privatekey"
assert SnowflakeV2Config.parse_obj(
{
"account_id": "test",
"authentication_type": "OAUTH_AUTHENTICATOR",
- "oauth_config": microsoft_dict,
+ "oauth_config": oauth_dict,
}
)
+default_config_dict: Dict[str, Any] = {
+ "username": "user",
+ "password": "password",
+ "account_id": "https://acctname.snowflakecomputing.com",
+ "warehouse": "COMPUTE_WH",
+ "role": "sysadmin",
+}
+
+
def test_account_id_is_added_when_host_port_is_present():
- config = SnowflakeV2Config.parse_obj(
- {
- "username": "user",
- "password": "password",
- "host_port": "acctname",
- "database_pattern": {"allow": {"^demo$"}},
- "warehouse": "COMPUTE_WH",
- "role": "sysadmin",
- }
- )
+ config_dict = default_config_dict.copy()
+ del config_dict["account_id"]
+ config_dict["host_port"] = "acctname"
+ config = SnowflakeV2Config.parse_obj(config_dict)
assert config.account_id == "acctname"
def test_account_id_with_snowflake_host_suffix():
- config = SnowflakeV2Config.parse_obj(
- {
- "username": "user",
- "password": "password",
- "account_id": "https://acctname.snowflakecomputing.com",
- "database_pattern": {"allow": {"^demo$"}},
- "warehouse": "COMPUTE_WH",
- "role": "sysadmin",
- }
- )
+ config = SnowflakeV2Config.parse_obj(default_config_dict)
assert config.account_id == "acctname"
def test_snowflake_uri_default_authentication():
- config = SnowflakeV2Config.parse_obj(
- {
- "username": "user",
- "password": "password",
- "account_id": "acctname",
- "database_pattern": {"allow": {"^demo$"}},
- "warehouse": "COMPUTE_WH",
- "role": "sysadmin",
- }
- )
-
+ config = SnowflakeV2Config.parse_obj(default_config_dict)
assert config.get_sql_alchemy_url() == (
"snowflake://user:password@acctname"
"?application=acryl_datahub"
@@ -191,17 +163,10 @@ def test_snowflake_uri_default_authentication():
def test_snowflake_uri_external_browser_authentication():
- config = SnowflakeV2Config.parse_obj(
- {
- "username": "user",
- "account_id": "acctname",
- "database_pattern": {"allow": {"^demo$"}},
- "warehouse": "COMPUTE_WH",
- "role": "sysadmin",
- "authentication_type": "EXTERNAL_BROWSER_AUTHENTICATOR",
- }
- )
-
+ config_dict = default_config_dict.copy()
+ del config_dict["password"]
+ config_dict["authentication_type"] = "EXTERNAL_BROWSER_AUTHENTICATOR"
+ config = SnowflakeV2Config.parse_obj(config_dict)
assert config.get_sql_alchemy_url() == (
"snowflake://user@acctname"
"?application=acryl_datahub"
@@ -212,18 +177,12 @@ def test_snowflake_uri_external_browser_authentication():
def test_snowflake_uri_key_pair_authentication():
- config = SnowflakeV2Config.parse_obj(
- {
- "username": "user",
- "account_id": "acctname",
- "database_pattern": {"allow": {"^demo$"}},
- "warehouse": "COMPUTE_WH",
- "role": "sysadmin",
- "authentication_type": "KEY_PAIR_AUTHENTICATOR",
- "private_key_path": "/a/random/path",
- "private_key_password": "a_random_password",
- }
- )
+ config_dict = default_config_dict.copy()
+ del config_dict["password"]
+ config_dict["authentication_type"] = "KEY_PAIR_AUTHENTICATOR"
+ config_dict["private_key_path"] = "/a/random/path"
+ config_dict["private_key_password"] = "a_random_password"
+ config = SnowflakeV2Config.parse_obj(config_dict)
assert config.get_sql_alchemy_url() == (
"snowflake://user@acctname"
@@ -235,63 +194,35 @@ def test_snowflake_uri_key_pair_authentication():
def test_options_contain_connect_args():
- config = SnowflakeV2Config.parse_obj(
- {
- "username": "user",
- "password": "password",
- "account_id": "acctname",
- "database_pattern": {"allow": {"^demo$"}},
- "warehouse": "COMPUTE_WH",
- "role": "sysadmin",
- }
- )
+ config = SnowflakeV2Config.parse_obj(default_config_dict)
connect_args = config.get_options().get("connect_args")
assert connect_args is not None
def test_snowflake_config_with_view_lineage_no_table_lineage_throws_error():
- with pytest.raises(ValidationError):
- SnowflakeV2Config.parse_obj(
- {
- "username": "user",
- "password": "password",
- "account_id": "acctname",
- "database_pattern": {"allow": {"^demo$"}},
- "warehouse": "COMPUTE_WH",
- "role": "sysadmin",
- "include_view_lineage": True,
- "include_table_lineage": False,
- }
- )
+ config_dict = default_config_dict.copy()
+ config_dict["include_view_lineage"] = True
+ config_dict["include_table_lineage"] = False
+ with pytest.raises(
+ ValidationError,
+ match="include_table_lineage must be True for include_view_lineage to be set",
+ ):
+ SnowflakeV2Config.parse_obj(config_dict)
def test_snowflake_config_with_column_lineage_no_table_lineage_throws_error():
- with pytest.raises(ValidationError):
- SnowflakeV2Config.parse_obj(
- {
- "username": "user",
- "password": "password",
- "account_id": "acctname",
- "database_pattern": {"allow": {"^demo$"}},
- "warehouse": "COMPUTE_WH",
- "role": "sysadmin",
- "include_column_lineage": True,
- "include_table_lineage": False,
- }
- )
+ config_dict = default_config_dict.copy()
+ config_dict["include_column_lineage"] = True
+ config_dict["include_table_lineage"] = False
+ with pytest.raises(
+ ValidationError,
+ match="include_table_lineage must be True for include_column_lineage to be set",
+ ):
+ SnowflakeV2Config.parse_obj(config_dict)
def test_snowflake_config_with_no_connect_args_returns_base_connect_args():
- config: SnowflakeV2Config = SnowflakeV2Config.parse_obj(
- {
- "username": "user",
- "password": "password",
- "account_id": "acctname",
- "database_pattern": {"allow": {"^demo$"}},
- "warehouse": "COMPUTE_WH",
- "role": "sysadmin",
- }
- )
+ config: SnowflakeV2Config = SnowflakeV2Config.parse_obj(default_config_dict)
assert config.get_options()["connect_args"] is not None
assert config.get_options()["connect_args"] == {
CLIENT_PREFETCH_THREADS: 10,
@@ -300,7 +231,10 @@ def test_snowflake_config_with_no_connect_args_returns_base_connect_args():
def test_private_key_set_but_auth_not_changed():
- with pytest.raises(ValidationError):
+ with pytest.raises(
+ ValidationError,
+ match="Either `private_key` and `private_key_path` is set but `authentication_type` is DEFAULT_AUTHENTICATOR. Should be set to 'KEY_PAIR_AUTHENTICATOR' when using key pair authentication",
+ ):
SnowflakeV2Config.parse_obj(
{
"account_id": "acctname",
@@ -310,19 +244,11 @@ def test_private_key_set_but_auth_not_changed():
def test_snowflake_config_with_connect_args_overrides_base_connect_args():
- config: SnowflakeV2Config = SnowflakeV2Config.parse_obj(
- {
- "username": "user",
- "password": "password",
- "account_id": "acctname",
- "database_pattern": {"allow": {"^demo$"}},
- "warehouse": "COMPUTE_WH",
- "role": "sysadmin",
- "connect_args": {
- CLIENT_PREFETCH_THREADS: 5,
- },
- }
- )
+ config_dict = default_config_dict.copy()
+ config_dict["connect_args"] = {
+ CLIENT_PREFETCH_THREADS: 5,
+ }
+ config: SnowflakeV2Config = SnowflakeV2Config.parse_obj(config_dict)
assert config.get_options()["connect_args"] is not None
assert config.get_options()["connect_args"][CLIENT_PREFETCH_THREADS] == 5
assert config.get_options()["connect_args"][CLIENT_SESSION_KEEP_ALIVE] is True
@@ -331,35 +257,20 @@ def test_snowflake_config_with_connect_args_overrides_base_connect_args():
@patch("snowflake.connector.connect")
def test_test_connection_failure(mock_connect):
mock_connect.side_effect = Exception("Failed to connect to snowflake")
- config = {
- "username": "user",
- "password": "password",
- "account_id": "missing",
- "warehouse": "COMPUTE_WH",
- "role": "sysadmin",
- }
- report = SnowflakeV2Source.test_connection(config)
- assert report is not None
- assert report.basic_connectivity
- assert not report.basic_connectivity.capable
- assert report.basic_connectivity.failure_reason
- assert "Failed to connect to snowflake" in report.basic_connectivity.failure_reason
+ report = test_connection_helpers.run_test_connection(
+ SnowflakeV2Source, default_config_dict
+ )
+ test_connection_helpers.assert_basic_connectivity_failure(
+ report, "Failed to connect to snowflake"
+ )
@patch("snowflake.connector.connect")
def test_test_connection_basic_success(mock_connect):
- config = {
- "username": "user",
- "password": "password",
- "account_id": "missing",
- "warehouse": "COMPUTE_WH",
- "role": "sysadmin",
- }
- report = SnowflakeV2Source.test_connection(config)
- assert report is not None
- assert report.basic_connectivity
- assert report.basic_connectivity.capable
- assert report.basic_connectivity.failure_reason is None
+ report = test_connection_helpers.run_test_connection(
+ SnowflakeV2Source, default_config_dict
+ )
+ test_connection_helpers.assert_basic_connectivity_success(report)
def setup_mock_connect(mock_connect, query_results=None):
@@ -400,31 +311,18 @@ def query_results(query):
return []
raise ValueError(f"Unexpected query: {query}")
- config = {
- "username": "user",
- "password": "password",
- "account_id": "missing",
- "warehouse": "COMPUTE_WH",
- "role": "sysadmin",
- }
setup_mock_connect(mock_connect, query_results)
- report = SnowflakeV2Source.test_connection(config)
- assert report is not None
- assert report.basic_connectivity
- assert report.basic_connectivity.capable
- assert report.basic_connectivity.failure_reason is None
-
- assert report.capability_report
- assert report.capability_report[SourceCapability.CONTAINERS].capable
- assert not report.capability_report[SourceCapability.SCHEMA_METADATA].capable
- failure_reason = report.capability_report[
- SourceCapability.SCHEMA_METADATA
- ].failure_reason
- assert failure_reason
-
- assert (
- "Current role TEST_ROLE does not have permissions to use warehouse"
- in failure_reason
+ report = test_connection_helpers.run_test_connection(
+ SnowflakeV2Source, default_config_dict
+ )
+ test_connection_helpers.assert_basic_connectivity_success(report)
+
+ test_connection_helpers.assert_capability_report(
+ capability_report=report.capability_report,
+ success_capabilities=[SourceCapability.CONTAINERS],
+ failure_capabilities={
+ SourceCapability.SCHEMA_METADATA: "Current role TEST_ROLE does not have permissions to use warehouse"
+ },
)
@@ -445,25 +343,17 @@ def query_results(query):
setup_mock_connect(mock_connect, query_results)
- config = {
- "username": "user",
- "password": "password",
- "account_id": "missing",
- "warehouse": "COMPUTE_WH",
- "role": "sysadmin",
- }
- report = SnowflakeV2Source.test_connection(config)
- assert report is not None
- assert report.basic_connectivity
- assert report.basic_connectivity.capable
- assert report.basic_connectivity.failure_reason is None
- assert report.capability_report
-
- assert report.capability_report[SourceCapability.CONTAINERS].capable
- assert not report.capability_report[SourceCapability.SCHEMA_METADATA].capable
- assert (
- report.capability_report[SourceCapability.SCHEMA_METADATA].failure_reason
- is not None
+ report = test_connection_helpers.run_test_connection(
+ SnowflakeV2Source, default_config_dict
+ )
+ test_connection_helpers.assert_basic_connectivity_success(report)
+
+ test_connection_helpers.assert_capability_report(
+ capability_report=report.capability_report,
+ success_capabilities=[SourceCapability.CONTAINERS],
+ failure_capabilities={
+ SourceCapability.SCHEMA_METADATA: "Either no tables exist or current role does not have permissions to access them"
+ },
)
@@ -488,24 +378,19 @@ def query_results(query):
setup_mock_connect(mock_connect, query_results)
- config = {
- "username": "user",
- "password": "password",
- "account_id": "missing",
- "warehouse": "COMPUTE_WH",
- "role": "sysadmin",
- }
- report = SnowflakeV2Source.test_connection(config)
-
- assert report is not None
- assert report.basic_connectivity
- assert report.basic_connectivity.capable
- assert report.basic_connectivity.failure_reason is None
- assert report.capability_report
-
- assert report.capability_report[SourceCapability.CONTAINERS].capable
- assert report.capability_report[SourceCapability.SCHEMA_METADATA].capable
- assert report.capability_report[SourceCapability.DESCRIPTIONS].capable
+ report = test_connection_helpers.run_test_connection(
+ SnowflakeV2Source, default_config_dict
+ )
+ test_connection_helpers.assert_basic_connectivity_success(report)
+
+ test_connection_helpers.assert_capability_report(
+ capability_report=report.capability_report,
+ success_capabilities=[
+ SourceCapability.CONTAINERS,
+ SourceCapability.SCHEMA_METADATA,
+ SourceCapability.DESCRIPTIONS,
+ ],
+ )
@patch("snowflake.connector.connect")
@@ -538,25 +423,21 @@ def query_results(query):
setup_mock_connect(mock_connect, query_results)
- config = {
- "username": "user",
- "password": "password",
- "account_id": "missing",
- "warehouse": "COMPUTE_WH",
- "role": "sysadmin",
- }
- report = SnowflakeV2Source.test_connection(config)
- assert report is not None
- assert report.basic_connectivity
- assert report.basic_connectivity.capable
- assert report.basic_connectivity.failure_reason is None
- assert report.capability_report
-
- assert report.capability_report[SourceCapability.CONTAINERS].capable
- assert report.capability_report[SourceCapability.SCHEMA_METADATA].capable
- assert report.capability_report[SourceCapability.DATA_PROFILING].capable
- assert report.capability_report[SourceCapability.DESCRIPTIONS].capable
- assert report.capability_report[SourceCapability.LINEAGE_COARSE].capable
+ report = test_connection_helpers.run_test_connection(
+ SnowflakeV2Source, default_config_dict
+ )
+ test_connection_helpers.assert_basic_connectivity_success(report)
+
+ test_connection_helpers.assert_capability_report(
+ capability_report=report.capability_report,
+ success_capabilities=[
+ SourceCapability.CONTAINERS,
+ SourceCapability.SCHEMA_METADATA,
+ SourceCapability.DATA_PROFILING,
+ SourceCapability.DESCRIPTIONS,
+ SourceCapability.LINEAGE_COARSE,
+ ],
+ )
def test_aws_cloud_region_from_snowflake_region_id():
@@ -610,11 +491,10 @@ def test_azure_cloud_region_from_snowflake_region_id():
def test_unknown_cloud_region_from_snowflake_region_id():
- with pytest.raises(Exception) as e:
+ with pytest.raises(Exception, match="Unknown snowflake region"):
SnowflakeV2Source.get_cloud_region_from_snowflake_region_id(
"somecloud_someregion"
)
- assert "Unknown snowflake region" in str(e)
def test_snowflake_object_access_entry_missing_object_id():
diff --git a/metadata-ingestion/tests/unit/test_sql_common.py b/metadata-ingestion/tests/unit/test_sql_common.py
index e23d290b611f4c..a98bf641711220 100644
--- a/metadata-ingestion/tests/unit/test_sql_common.py
+++ b/metadata-ingestion/tests/unit/test_sql_common.py
@@ -1,8 +1,7 @@
from typing import Dict
-from unittest.mock import Mock
+from unittest import mock
import pytest
-from sqlalchemy.engine.reflection import Inspector
from datahub.ingestion.source.sql.sql_common import PipelineContext, SQLAlchemySource
from datahub.ingestion.source.sql.sql_config import SQLCommonConfig
@@ -13,19 +12,24 @@
class _TestSQLAlchemyConfig(SQLCommonConfig):
def get_sql_alchemy_url(self):
- pass
+ return "mysql+pymysql://user:pass@localhost:5330"
class _TestSQLAlchemySource(SQLAlchemySource):
- pass
+ @classmethod
+ def create(cls, config_dict, ctx):
+ config = _TestSQLAlchemyConfig.parse_obj(config_dict)
+ return cls(config, ctx, "TEST")
+
+
+def get_test_sql_alchemy_source():
+ return _TestSQLAlchemySource.create(
+ config_dict={}, ctx=PipelineContext(run_id="test_ctx")
+ )
def test_generate_foreign_key():
- config: SQLCommonConfig = _TestSQLAlchemyConfig()
- ctx: PipelineContext = PipelineContext(run_id="test_ctx")
- platform: str = "TEST"
- inspector: Inspector = Mock()
- source = _TestSQLAlchemySource(config=config, ctx=ctx, platform=platform)
+ source = get_test_sql_alchemy_source()
fk_dict: Dict[str, str] = {
"name": "test_constraint",
"referred_table": "test_table",
@@ -37,7 +41,7 @@ def test_generate_foreign_key():
dataset_urn="test_urn",
schema="test_schema",
fk_dict=fk_dict,
- inspector=inspector,
+ inspector=mock.Mock(),
)
assert fk_dict.get("name") == foreign_key.name
@@ -48,11 +52,7 @@ def test_generate_foreign_key():
def test_use_source_schema_for_foreign_key_if_not_specified():
- config: SQLCommonConfig = _TestSQLAlchemyConfig()
- ctx: PipelineContext = PipelineContext(run_id="test_ctx")
- platform: str = "TEST"
- inspector: Inspector = Mock()
- source = _TestSQLAlchemySource(config=config, ctx=ctx, platform=platform)
+ source = get_test_sql_alchemy_source()
fk_dict: Dict[str, str] = {
"name": "test_constraint",
"referred_table": "test_table",
@@ -63,7 +63,7 @@ def test_use_source_schema_for_foreign_key_if_not_specified():
dataset_urn="test_urn",
schema="test_schema",
fk_dict=fk_dict,
- inspector=inspector,
+ inspector=mock.Mock(),
)
assert fk_dict.get("name") == foreign_key.name
@@ -105,14 +105,32 @@ def test_get_platform_from_sqlalchemy_uri(uri: str, expected_platform: str) -> N
def test_get_db_schema_with_dots_in_view_name():
- config: SQLCommonConfig = _TestSQLAlchemyConfig()
- ctx: PipelineContext = PipelineContext(run_id="test_ctx")
- platform: str = "TEST"
- source = _TestSQLAlchemySource(config=config, ctx=ctx, platform=platform)
-
+ source = get_test_sql_alchemy_source()
database, schema = source.get_db_schema(
dataset_identifier="database.schema.long.view.name1"
)
-
assert database == "database"
assert schema == "schema"
+
+
+def test_test_connection_success():
+ source = get_test_sql_alchemy_source()
+ with mock.patch(
+ "datahub.ingestion.source.sql.sql_common.SQLAlchemySource.get_inspectors",
+ side_effect=lambda: [],
+ ):
+ report = source.test_connection({})
+ assert report is not None
+ assert report.basic_connectivity
+ assert report.basic_connectivity.capable
+ assert report.basic_connectivity.failure_reason is None
+
+
+def test_test_connection_failure():
+ source = get_test_sql_alchemy_source()
+ report = source.test_connection({})
+ assert report is not None
+ assert report.basic_connectivity
+ assert not report.basic_connectivity.capable
+ assert report.basic_connectivity.failure_reason
+ assert "Connection refused" in report.basic_connectivity.failure_reason
From 26114dfeb2d255f1b2a562396908f48c8dd0ad64 Mon Sep 17 00:00:00 2001
From: naoki kuroda <68233204+nnnkkk7@users.noreply.github.com>
Date: Fri, 15 Dec 2023 05:42:45 +0900
Subject: [PATCH 028/540] docs: fix sample command for container logs (#9427)
---
docs/how/extract-container-logs.md | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/docs/how/extract-container-logs.md b/docs/how/extract-container-logs.md
index 9251d0665c02cf..b5fbb4c83cc645 100644
--- a/docs/how/extract-container-logs.md
+++ b/docs/how/extract-container-logs.md
@@ -86,7 +86,7 @@ Depending on your issue, you may be interested to view both debug and normal inf
Since log files are named based on the current date, you'll need to use "ls" to see which files currently exist. To do so, you can use the `kubectl exec` command, using the pod name recorded in step one:
```
-kubectl exec datahub-frontend-1231ead-6767 -n default -- ls -la /tmp/datahub/logs/gms
+kubectl exec datahub-gms-c578b47cd-7676 -n default -- ls -la /tmp/datahub/logs/gms
total 36388
drwxr-xr-x 2 datahub datahub 4096 Jul 29 07:45 .
@@ -131,5 +131,5 @@ Now you should be able to view the logs locally.
There are a few ways to get files out of the pod and into a local file. You can either use `kubectl cp` or simply `cat` and pipe the file of interest. We'll show an example using the latter approach:
```
-kubectl exec datahub-frontend-1231ead-6767 -n default -- cat /tmp/datahub/logs/gms/gms.log > my-local-gms.log
+kubectl exec datahub-gms-c578b47cd-7676 -n default -- cat /tmp/datahub/logs/gms/gms.log > my-local-gms.log
```
\ No newline at end of file
From 4354af20126d1befb2c7391c23310a4eca5bb688 Mon Sep 17 00:00:00 2001
From: Harshal Sheth
Date: Thu, 14 Dec 2023 16:54:40 -0500
Subject: [PATCH 029/540] fix(ingest): bump source configs json schema version
(#9424)
---
docs-website/genJsonSchema/gen_json_schema.py | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/docs-website/genJsonSchema/gen_json_schema.py b/docs-website/genJsonSchema/gen_json_schema.py
index 81c1d5a2c1a30f..4af72487644bd6 100644
--- a/docs-website/genJsonSchema/gen_json_schema.py
+++ b/docs-website/genJsonSchema/gen_json_schema.py
@@ -7,7 +7,7 @@
def get_base() -> Any:
return {
- "$schema": "http://json-schema.org/draft-04/schema#",
+ "$schema": "https://json-schema.org/draft/2020-12/schema",
"id": "https://json.schemastore.org/datahub-ingestion",
"title": "Datahub Ingestion",
"description": "Root schema of Datahub Ingestion",
@@ -116,7 +116,7 @@ def get_base() -> Any:
"bootstrap": {
"type": "string",
"description": "Kafka bootstrap URL.",
- "default": "localhost:9092"
+ "default": "localhost:9092",
},
"producer_config": {
"type": "object",
@@ -125,7 +125,7 @@ def get_base() -> Any:
"schema_registry_url": {
"type": "string",
"description": "URL of schema registry being used.",
- "default": "http://localhost:8081"
+ "default": "http://localhost:8081",
},
"schema_registry_config": {
"type": "object",
From 0ea6145a9d491a1b882ba5a7a4667fb323d31dc4 Mon Sep 17 00:00:00 2001
From: Tamas Nemeth
Date: Fri, 15 Dec 2023 00:12:45 +0100
Subject: [PATCH 030/540] fix(ingest/profiling): Add option to enable external
table profiling (#9463)
---
.../datahub/ingestion/source/ge_profiling_config.py | 5 +++++
.../src/datahub/ingestion/source/redshift/profile.py | 9 +++++++++
.../ingestion/source/snowflake/snowflake_profiler.py | 10 ++++++++++
.../ingestion/source/snowflake/snowflake_schema.py | 3 +++
.../ingestion/source/sql/sql_generic_profiler.py | 3 +++
.../tests/integration/snowflake/common.py | 1 +
6 files changed, 31 insertions(+)
diff --git a/metadata-ingestion/src/datahub/ingestion/source/ge_profiling_config.py b/metadata-ingestion/src/datahub/ingestion/source/ge_profiling_config.py
index 24a3e520d8caff..f340a7b41b7af8 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/ge_profiling_config.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/ge_profiling_config.py
@@ -167,6 +167,11 @@ class GEProfilingConfig(ConfigModel):
"Applicable only if `use_sampling` is set to True.",
)
+ profile_external_tables: bool = Field(
+ default=False,
+ description="Whether to profile external tables. Only Snowflake and Redshift support this.",
+ )
+
@pydantic.root_validator(pre=True)
def deprecate_bigquery_temp_table_schema(cls, values):
# TODO: Update docs to remove mention of this field.
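
For context, a short sketch (not part of the patch) of how the new flag parses; `GEProfilingConfig` is a pydantic config model, so the field behaves like any other profiling option:

```python
# Sketch only (not part of the patch): the flag is a regular pydantic field with
# default=False, so external tables stay skipped unless a recipe opts in.
from datahub.ingestion.source.ge_profiling_config import GEProfilingConfig

profiling = GEProfilingConfig.parse_obj({"profile_external_tables": True})
assert profiling.profile_external_tables is True
```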
diff --git a/metadata-ingestion/src/datahub/ingestion/source/redshift/profile.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/profile.py
index b05850cef6e948..eed82ec4d83e76 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/redshift/profile.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/profile.py
@@ -48,6 +48,15 @@ def get_workunits(
if not self.config.schema_pattern.allowed(schema):
continue
for table in tables[db].get(schema, {}):
+ if (
+ not self.config.profiling.profile_external_tables
+ and table.type == "EXTERNAL_TABLE"
+ ):
+ self.report.profiling_skipped_other[schema] += 1
+ logger.info(
+ f"Skipping profiling of external table {db}.{schema}.{table.name}"
+ )
+ continue
# Emit the profile work unit
profile_request = self.get_profile_request(table, schema, db)
if profile_request is not None:
diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_profiler.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_profiler.py
index 89857c45642678..4bda7da422e9d6 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_profiler.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_profiler.py
@@ -50,6 +50,16 @@ def get_workunits(
profile_requests = []
for schema in database.schemas:
for table in db_tables[schema.name]:
+ if (
+ not self.config.profiling.profile_external_tables
+ and table.type == "EXTERNAL TABLE"
+ ):
+ logger.info(
+ f"Skipping profiling of external table {database.name}.{schema.name}.{table.name}"
+ )
+ self.report.profiling_skipped_other[schema.name] += 1
+ continue
+
profile_request = self.get_profile_request(
table, schema.name, database.name
)
diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema.py
index e5b214ba35e4b6..9526bdec4b05dc 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema.py
@@ -77,6 +77,7 @@ def get_precise_native_type(self):
@dataclass
class SnowflakeTable(BaseTable):
+ type: Optional[str] = None
clustering_key: Optional[str] = None
pk: Optional[SnowflakePK] = None
columns: List[SnowflakeColumn] = field(default_factory=list)
@@ -265,6 +266,7 @@ def get_tables_for_database(
tables[table["TABLE_SCHEMA"]].append(
SnowflakeTable(
name=table["TABLE_NAME"],
+ type=table["TABLE_TYPE"],
created=table["CREATED"],
last_altered=table["LAST_ALTERED"],
size_in_bytes=table["BYTES"],
@@ -288,6 +290,7 @@ def get_tables_for_schema(
tables.append(
SnowflakeTable(
name=table["TABLE_NAME"],
+ type=table["TABLE_TYPE"],
created=table["CREATED"],
last_altered=table["LAST_ALTERED"],
size_in_bytes=table["BYTES"],
diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_generic_profiler.py b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_generic_profiler.py
index a2f91e5fae1a98..30fad9ad584c12 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_generic_profiler.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_generic_profiler.py
@@ -35,6 +35,9 @@ class DetailedProfilerReportMixin:
profiling_skipped_row_limit: TopKDict[str, int] = field(
default_factory=int_top_k_dict
)
+
+ profiling_skipped_other: TopKDict[str, int] = field(default_factory=int_top_k_dict)
+
num_tables_not_eligible_profiling: Dict[str, int] = field(
default_factory=int_top_k_dict
)
diff --git a/metadata-ingestion/tests/integration/snowflake/common.py b/metadata-ingestion/tests/integration/snowflake/common.py
index b21cea5f0988d0..53b87636068bfe 100644
--- a/metadata-ingestion/tests/integration/snowflake/common.py
+++ b/metadata-ingestion/tests/integration/snowflake/common.py
@@ -79,6 +79,7 @@ def default_query_results( # noqa: C901
{
"TABLE_SCHEMA": "TEST_SCHEMA",
"TABLE_NAME": "TABLE_{}".format(tbl_idx),
+ "TABLE_TYPE": "BASE TABLE",
"CREATED": datetime(2021, 6, 8, 0, 0, 0, 0),
"LAST_ALTERED": datetime(2021, 6, 8, 0, 0, 0, 0),
"BYTES": 1024,
From 6a169357283790e158472957f87f8c6cfbe67136 Mon Sep 17 00:00:00 2001
From: RyanHolstien
Date: Fri, 15 Dec 2023 11:23:04 -0600
Subject: [PATCH 031/540] fix(operations): fix get index sizes integer wrap
(#9450)
---
.../ElasticSearchTimeseriesAspectService.java | 8 +-
.../TimeseriesAspectServiceUnitTest.java | 78 +++++++++++++++++++
.../timeseries/TimeseriesIndexSizeResult.pdl | 3 +
...nkedin.operations.operations.snapshot.json | 5 ++
4 files changed, 90 insertions(+), 4 deletions(-)
create mode 100644 metadata-io/src/test/java/com/linkedin/metadata/timeseries/search/TimeseriesAspectServiceUnitTest.java
diff --git a/metadata-io/src/main/java/com/linkedin/metadata/timeseries/elastic/ElasticSearchTimeseriesAspectService.java b/metadata-io/src/main/java/com/linkedin/metadata/timeseries/elastic/ElasticSearchTimeseriesAspectService.java
index eec7680a56ecb0..f9ab86d41335db 100644
--- a/metadata-io/src/main/java/com/linkedin/metadata/timeseries/elastic/ElasticSearchTimeseriesAspectService.java
+++ b/metadata-io/src/main/java/com/linkedin/metadata/timeseries/elastic/ElasticSearchTimeseriesAspectService.java
@@ -206,10 +206,10 @@ public List<TimeseriesIndexSizeResult> getIndexSizes() {
elemResult.setEntityName(indexEntityAndAspect.get().getFirst());
elemResult.setAspectName(indexEntityAndAspect.get().getSecond());
}
- int sizeBytes =
- entry.getValue().get("primaries").get("store").get("size_in_bytes").asInt();
- float sizeMb = (float) sizeBytes / 1000;
- elemResult.setSizeMb(sizeMb);
+ long sizeBytes =
+ entry.getValue().get("primaries").get("store").get("size_in_bytes").asLong();
+ double sizeMb = (double) sizeBytes / 1000000;
+ elemResult.setSizeInMb(sizeMb);
res.add(elemResult);
});
return res;
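
The overflow this fix addresses can be reproduced with a quick sketch (plain Python, independent of the Java code): the byte count used in the new unit test does not fit in a signed 32-bit integer, and dividing by 1,000,000 rather than 1,000 yields megabytes.

```python
# Illustration only (not part of the patch): why reading the size with asInt() was unsafe.
size_in_bytes = 8_078_398_031         # value fed into size_in_bytes by the new unit test
wrapped = size_in_bytes & 0xFFFFFFFF  # keep the low 32 bits, as a 32-bit read would
if wrapped >= 2**31:                  # reinterpret as a signed 32-bit integer
    wrapped -= 2**32
print(wrapped)                        # -511536561: the byte count wraps to a negative int
print(size_in_bytes / 1_000_000)      # 8078.398031 MB, the value asserted in the test
```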
diff --git a/metadata-io/src/test/java/com/linkedin/metadata/timeseries/search/TimeseriesAspectServiceUnitTest.java b/metadata-io/src/test/java/com/linkedin/metadata/timeseries/search/TimeseriesAspectServiceUnitTest.java
new file mode 100644
index 00000000000000..a23267dcf6f55e
--- /dev/null
+++ b/metadata-io/src/test/java/com/linkedin/metadata/timeseries/search/TimeseriesAspectServiceUnitTest.java
@@ -0,0 +1,78 @@
+package com.linkedin.metadata.timeseries.search;
+
+import static org.mockito.Mockito.*;
+
+import com.fasterxml.jackson.databind.node.JsonNodeFactory;
+import com.fasterxml.jackson.databind.node.NumericNode;
+import com.fasterxml.jackson.databind.node.ObjectNode;
+import com.linkedin.metadata.models.registry.EntityRegistry;
+import com.linkedin.metadata.search.elasticsearch.update.ESBulkProcessor;
+import com.linkedin.metadata.timeseries.TimeseriesAspectService;
+import com.linkedin.metadata.timeseries.elastic.ElasticSearchTimeseriesAspectService;
+import com.linkedin.metadata.timeseries.elastic.indexbuilder.TimeseriesAspectIndexBuilders;
+import com.linkedin.metadata.utils.elasticsearch.IndexConvention;
+import com.linkedin.timeseries.TimeseriesIndexSizeResult;
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+import java.util.List;
+import org.apache.commons.io.IOUtils;
+import org.apache.http.HttpEntity;
+import org.opensearch.client.Request;
+import org.opensearch.client.Response;
+import org.opensearch.client.RestClient;
+import org.opensearch.client.RestHighLevelClient;
+import org.testng.Assert;
+import org.testng.annotations.Test;
+
+/**
+ * Unit test that uses mocks rather than a live server, covering functionality that does not
+ * depend on a real server response.
+ */
+public class TimeseriesAspectServiceUnitTest {
+
+ private final RestHighLevelClient _searchClient = mock(RestHighLevelClient.class);
+ private final IndexConvention _indexConvention = mock(IndexConvention.class);
+ private final TimeseriesAspectIndexBuilders _timeseriesAspectIndexBuilders =
+ mock(TimeseriesAspectIndexBuilders.class);
+ private final EntityRegistry _entityRegistry = mock(EntityRegistry.class);
+ private final ESBulkProcessor _bulkProcessor = mock(ESBulkProcessor.class);
+ private final RestClient _restClient = mock(RestClient.class);
+ private final TimeseriesAspectService _timeseriesAspectService =
+ new ElasticSearchTimeseriesAspectService(
+ _searchClient,
+ _indexConvention,
+ _timeseriesAspectIndexBuilders,
+ _entityRegistry,
+ _bulkProcessor,
+ 0);
+
+ private static final String INDEX_PATTERN = "indexPattern";
+
+ @Test
+ public void testGetIndicesIntegerWrap() throws IOException {
+ when(_indexConvention.getAllTimeseriesAspectIndicesPattern()).thenReturn(INDEX_PATTERN);
+ when(_searchClient.getLowLevelClient()).thenReturn(_restClient);
+ ObjectNode jsonNode = JsonNodeFactory.instance.objectNode();
+ ObjectNode indicesNode = JsonNodeFactory.instance.objectNode();
+ ObjectNode indexNode = JsonNodeFactory.instance.objectNode();
+ ObjectNode primariesNode = JsonNodeFactory.instance.objectNode();
+ ObjectNode storeNode = JsonNodeFactory.instance.objectNode();
+ NumericNode bytesNode = JsonNodeFactory.instance.numberNode(8078398031L);
+ storeNode.set("size_in_bytes", bytesNode);
+ primariesNode.set("store", storeNode);
+ indexNode.set("primaries", primariesNode);
+ indicesNode.set("someIndexName", indexNode);
+ jsonNode.set("indices", indicesNode);
+
+ Response response = mock(Response.class);
+ HttpEntity responseEntity = mock(HttpEntity.class);
+ when(response.getEntity()).thenReturn(responseEntity);
+ when(responseEntity.getContent())
+ .thenReturn(IOUtils.toInputStream(jsonNode.toString(), StandardCharsets.UTF_8));
+ when(_restClient.performRequest(any(Request.class))).thenReturn(response);
+
+ List<TimeseriesIndexSizeResult> results = _timeseriesAspectService.getIndexSizes();
+
+ Assert.assertEquals(results.get(0).getSizeInMb(), 8078.398031);
+ }
+}
diff --git a/metadata-models/src/main/pegasus/com/linkedin/timeseries/TimeseriesIndexSizeResult.pdl b/metadata-models/src/main/pegasus/com/linkedin/timeseries/TimeseriesIndexSizeResult.pdl
index b888ef7c0716b1..35297314187bf0 100644
--- a/metadata-models/src/main/pegasus/com/linkedin/timeseries/TimeseriesIndexSizeResult.pdl
+++ b/metadata-models/src/main/pegasus/com/linkedin/timeseries/TimeseriesIndexSizeResult.pdl
@@ -22,5 +22,8 @@ record TimeseriesIndexSizeResult{
/**
* Size
*/
+ @deprecated = "use sizeInMb instead"
sizeMb: float = 0
+
+ sizeInMb: double = 0
}
diff --git a/metadata-service/restli-api/src/main/snapshot/com.linkedin.operations.operations.snapshot.json b/metadata-service/restli-api/src/main/snapshot/com.linkedin.operations.operations.snapshot.json
index 339ce62de62980..eae0eed2dd50ba 100644
--- a/metadata-service/restli-api/src/main/snapshot/com.linkedin.operations.operations.snapshot.json
+++ b/metadata-service/restli-api/src/main/snapshot/com.linkedin.operations.operations.snapshot.json
@@ -3668,6 +3668,11 @@
"name" : "sizeMb",
"type" : "float",
"doc" : "Size",
+ "default" : 0.0,
+ "deprecated" : "use sizeInMb instead"
+ }, {
+ "name" : "sizeInMb",
+ "type" : "double",
"default" : 0.0
} ]
}, {
From 824df5a6a3e9fed2f18f3e454c40b8d822011b5c Mon Sep 17 00:00:00 2001
From: david-leifker <114954101+david-leifker@users.noreply.github.com>
Date: Fri, 15 Dec 2023 13:28:33 -0600
Subject: [PATCH 032/540] feat(build): gradle 8, jdk17, neo4j 5 (#9458)
---
.github/workflows/airflow-plugin.yml | 5 +
.github/workflows/build-and-test.yml | 4 +-
.github/workflows/check-datahub-jars.yml | 4 +-
.github/workflows/docker-unified.yml | 39 +-
.github/workflows/documentation.yml | 4 +-
.github/workflows/metadata-ingestion.yml | 5 +
.github/workflows/metadata-io.yml | 4 +-
.github/workflows/metadata-model.yml | 5 +
.github/workflows/publish-datahub-jars.yml | 4 +-
.github/workflows/spark-smoke-test.yml | 4 +-
build.gradle | 137 +-
buildSrc/build.gradle | 13 +-
.../pegasus/gradle/PegasusPlugin.java | 2444 +++++++++++++++++
.../gradle/tasks/ChangedFileReportTask.java | 124 +
datahub-frontend/build.gradle | 22 +-
datahub-frontend/play.gradle | 19 +-
datahub-graphql-core/build.gradle | 3 +-
datahub-web-react/build.gradle | 10 +-
docker/datahub-frontend/Dockerfile | 7 +-
docker/datahub-frontend/start.sh | 2 +
docker/datahub-gms/Dockerfile | 4 +-
docker/datahub-ingestion/build.gradle | 6 +-
docker/datahub-mae-consumer/Dockerfile | 4 +-
docker/datahub-mce-consumer/Dockerfile | 4 +-
docker/datahub-upgrade/Dockerfile | 4 +-
docker/kafka-setup/Dockerfile | 2 +-
docs-website/build.gradle | 18 +-
docs-website/vercel-setup.sh | 2 +-
docs/developers.md | 10 +-
docs/how/updating-datahub.md | 4 +
docs/troubleshooting/build.md | 4 +-
entity-registry/build.gradle | 7 +-
gradle/wrapper/gradle-wrapper.properties | 2 +-
li-utils/build.gradle | 20 +-
metadata-auth/auth-api/build.gradle | 9 +-
metadata-events/mxe-utils-avro/build.gradle | 5 +-
.../java/datahub-client/build.gradle | 16 +-
.../datahub-protobuf-example/build.gradle | 4 -
.../java/datahub-protobuf/build.gradle | 8 +-
.../java/examples/build.gradle | 16 +-
.../java/spark-lineage/build.gradle | 68 +-
.../java/spark-lineage/scripts/check_jar.sh | 4 +-
.../docker/SparkBase.Dockerfile | 2 +-
.../python_test_run.sh | 13 +-
.../spark-smoke-test/spark-docker.conf | 4 +
.../test-spark-lineage/build.gradle | 11 -
.../datahub/spark/TestCoalesceJobLineage.java | 5 +-
.../datahub/spark/TestSparkJobsLineage.java | 3 +
metadata-io/build.gradle | 5 +-
.../graph/neo4j/Neo4jGraphService.java | 4 +-
metadata-jobs/mae-consumer/build.gradle | 1 +
metadata-jobs/mce-consumer/build.gradle | 3 +-
metadata-jobs/pe-consumer/build.gradle | 3 +-
metadata-models-custom/build.gradle | 2 +-
metadata-models-validator/build.gradle | 4 +-
metadata-models/build.gradle | 20 +-
metadata-service/auth-config/build.gradle | 4 +-
metadata-service/auth-filter/build.gradle | 4 +-
metadata-service/auth-impl/build.gradle | 4 +-
...formInstanceFieldResolverProviderTest.java | 4 +-
.../auth-servlet-impl/build.gradle | 4 +-
metadata-service/factories/build.gradle | 4 +-
.../graphql-servlet-impl/build.gradle | 4 +-
metadata-service/openapi-servlet/build.gradle | 4 +-
metadata-service/plugin/build.gradle | 6 +-
.../src/test/sample-test-plugins/build.gradle | 4 +-
metadata-service/restli-api/build.gradle | 6 +-
metadata-service/restli-client/build.gradle | 6 +-
.../restli-servlet-impl/build.gradle | 6 +-
.../schema-registry-api/build.gradle | 7 +-
.../schema-registry-servlet/build.gradle | 4 +-
metadata-service/services/build.gradle | 6 +-
metadata-service/servlet/build.gradle | 4 +-
metadata-utils/build.gradle | 4 +-
mock-entity-registry/build.gradle | 4 +-
smoke-test/build.gradle | 7 +-
test-models/build.gradle | 16 +-
vercel.json | 2 +-
78 files changed, 3008 insertions(+), 266 deletions(-)
create mode 100644 buildSrc/src/main/java/com/linkedin/pegasus/gradle/PegasusPlugin.java
create mode 100644 buildSrc/src/main/java/com/linkedin/pegasus/gradle/tasks/ChangedFileReportTask.java
diff --git a/.github/workflows/airflow-plugin.yml b/.github/workflows/airflow-plugin.yml
index d0c0f52781b9af..cd1e159b7d53cc 100644
--- a/.github/workflows/airflow-plugin.yml
+++ b/.github/workflows/airflow-plugin.yml
@@ -49,6 +49,11 @@ jobs:
extra_pip_extras: plugin-v2
fail-fast: false
steps:
+ - name: Set up JDK 17
+ uses: actions/setup-java@v3
+ with:
+ distribution: "zulu"
+ java-version: 17
- uses: actions/checkout@v3
- uses: actions/setup-python@v4
with:
diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml
index 10c137a206531a..dab64cf2dca5e6 100644
--- a/.github/workflows/build-and-test.yml
+++ b/.github/workflows/build-and-test.yml
@@ -37,11 +37,11 @@ jobs:
with:
timezoneLinux: ${{ matrix.timezone }}
- uses: hsheth2/sane-checkout-action@v1
- - name: Set up JDK 11
+ - name: Set up JDK 17
uses: actions/setup-java@v3
with:
distribution: "zulu"
- java-version: 11
+ java-version: 17
- uses: actions/setup-python@v4
with:
python-version: "3.10"
diff --git a/.github/workflows/check-datahub-jars.yml b/.github/workflows/check-datahub-jars.yml
index 8e507ea40fd963..46d97ffec88618 100644
--- a/.github/workflows/check-datahub-jars.yml
+++ b/.github/workflows/check-datahub-jars.yml
@@ -28,11 +28,11 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: hsheth2/sane-checkout-action@v1
- - name: Set up JDK 11
+ - name: Set up JDK 17
uses: actions/setup-java@v3
with:
distribution: "zulu"
- java-version: 11
+ java-version: 17
- uses: actions/setup-python@v4
with:
python-version: "3.10"
diff --git a/.github/workflows/docker-unified.yml b/.github/workflows/docker-unified.yml
index fef23f9efa85f1..169a86000adccb 100644
--- a/.github/workflows/docker-unified.yml
+++ b/.github/workflows/docker-unified.yml
@@ -79,6 +79,11 @@ jobs:
runs-on: ubuntu-latest
needs: setup
steps:
+ - name: Set up JDK 17
+ uses: actions/setup-java@v3
+ with:
+ distribution: "zulu"
+ java-version: 17
- name: Check out the repo
uses: hsheth2/sane-checkout-action@v1
- name: Pre-build artifacts for docker image
@@ -135,6 +140,11 @@ jobs:
runs-on: ubuntu-latest
needs: setup
steps:
+ - name: Set up JDK 17
+ uses: actions/setup-java@v3
+ with:
+ distribution: "zulu"
+ java-version: 17
- name: Check out the repo
uses: hsheth2/sane-checkout-action@v1
- name: Pre-build artifacts for docker image
@@ -191,6 +201,11 @@ jobs:
runs-on: ubuntu-latest
needs: setup
steps:
+ - name: Set up JDK 17
+ uses: actions/setup-java@v3
+ with:
+ distribution: "zulu"
+ java-version: 17
- name: Check out the repo
uses: hsheth2/sane-checkout-action@v1
- name: Pre-build artifacts for docker image
@@ -247,6 +262,11 @@ jobs:
runs-on: ubuntu-latest
needs: setup
steps:
+ - name: Set up JDK 17
+ uses: actions/setup-java@v3
+ with:
+ distribution: "zulu"
+ java-version: 17
- name: Check out the repo
uses: hsheth2/sane-checkout-action@v1
- name: Pre-build artifacts for docker image
@@ -303,6 +323,11 @@ jobs:
runs-on: ubuntu-latest
needs: setup
steps:
+ - name: Set up JDK 17
+ uses: actions/setup-java@v3
+ with:
+ distribution: "zulu"
+ java-version: 17
- name: Check out the repo
uses: hsheth2/sane-checkout-action@v1
- name: Pre-build artifacts for docker image
@@ -537,6 +562,11 @@ jobs:
needs_artifact_download: ${{ (steps.filter.outputs.datahub-ingestion-base == 'true' || steps.filter.outputs.datahub-ingestion == 'true') && needs.setup.outputs.publish != 'true' }}
needs: [setup, datahub_ingestion_base_slim_build]
steps:
+ - name: Set up JDK 17
+ uses: actions/setup-java@v3
+ with:
+ distribution: "zulu"
+ java-version: 17
- name: Check out the repo
uses: hsheth2/sane-checkout-action@v1
- uses: dorny/paths-filter@v2
@@ -618,6 +648,11 @@ jobs:
needs_artifact_download: ${{ (steps.filter.outputs.datahub-ingestion-base == 'true' || steps.filter.outputs.datahub-ingestion == 'true') && needs.setup.outputs.publish != 'true' }}
needs: [setup, datahub_ingestion_base_full_build]
steps:
+ - name: Set up JDK 17
+ uses: actions/setup-java@v3
+ with:
+ distribution: "zulu"
+ java-version: 17
- name: Check out the repo
uses: hsheth2/sane-checkout-action@v1
- uses: dorny/paths-filter@v2
@@ -720,11 +755,11 @@ jobs:
run: df -h . && docker images
- name: Check out the repo
uses: actions/checkout@v3
- - name: Set up JDK 11
+ - name: Set up JDK 17
uses: actions/setup-java@v3
with:
distribution: "zulu"
- java-version: 11
+ java-version: 17
- uses: actions/setup-python@v4
with:
python-version: "3.10"
diff --git a/.github/workflows/documentation.yml b/.github/workflows/documentation.yml
index c94282938120e4..29953b8b70d911 100644
--- a/.github/workflows/documentation.yml
+++ b/.github/workflows/documentation.yml
@@ -27,11 +27,11 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- - name: Set up JDK 11
+ - name: Set up JDK 17
uses: actions/setup-java@v3
with:
distribution: "zulu"
- java-version: 11
+ java-version: 17
- uses: actions/setup-python@v4
with:
python-version: "3.10"
diff --git a/.github/workflows/metadata-ingestion.yml b/.github/workflows/metadata-ingestion.yml
index ec6bd4141cc6fc..4e04fef3b3980b 100644
--- a/.github/workflows/metadata-ingestion.yml
+++ b/.github/workflows/metadata-ingestion.yml
@@ -44,6 +44,11 @@ jobs:
- python-version: "3.10"
fail-fast: false
steps:
+ - name: Set up JDK 17
+ uses: actions/setup-java@v3
+ with:
+ distribution: "zulu"
+ java-version: 17
- uses: actions/checkout@v3
- uses: actions/setup-python@v4
with:
diff --git a/.github/workflows/metadata-io.yml b/.github/workflows/metadata-io.yml
index 48f230ce14c8db..2188fcb07c77a6 100644
--- a/.github/workflows/metadata-io.yml
+++ b/.github/workflows/metadata-io.yml
@@ -29,11 +29,11 @@ jobs:
timeout-minutes: 60
steps:
- uses: actions/checkout@v3
- - name: Set up JDK 11
+ - name: Set up JDK 17
uses: actions/setup-java@v3
with:
distribution: "zulu"
- java-version: 11
+ java-version: 17
- uses: actions/setup-python@v4
with:
python-version: "3.10"
diff --git a/.github/workflows/metadata-model.yml b/.github/workflows/metadata-model.yml
index eb098a327e4cb5..d0112f1b14e7af 100644
--- a/.github/workflows/metadata-model.yml
+++ b/.github/workflows/metadata-model.yml
@@ -29,6 +29,11 @@ jobs:
runs-on: ubuntu-latest
needs: setup
steps:
+ - name: Set up JDK 17
+ uses: actions/setup-java@v3
+ with:
+ distribution: "zulu"
+ java-version: 17
- uses: actions/checkout@v3
- uses: actions/setup-python@v4
with:
diff --git a/.github/workflows/publish-datahub-jars.yml b/.github/workflows/publish-datahub-jars.yml
index ec7985ef3b3d03..24d1c5436b3156 100644
--- a/.github/workflows/publish-datahub-jars.yml
+++ b/.github/workflows/publish-datahub-jars.yml
@@ -49,11 +49,11 @@ jobs:
if: ${{ needs.check-secret.outputs.publish-enabled == 'true' }}
steps:
- uses: hsheth2/sane-checkout-action@v1
- - name: Set up JDK 11
+ - name: Set up JDK 17
uses: actions/setup-java@v3
with:
distribution: "zulu"
- java-version: 11
+ java-version: 17
- uses: actions/setup-python@v4
with:
python-version: "3.10"
diff --git a/.github/workflows/spark-smoke-test.yml b/.github/workflows/spark-smoke-test.yml
index 70b66d6452b266..60e183cce5179c 100644
--- a/.github/workflows/spark-smoke-test.yml
+++ b/.github/workflows/spark-smoke-test.yml
@@ -30,11 +30,11 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: hsheth2/sane-checkout-action@v1
- - name: Set up JDK 11
+ - name: Set up JDK 17
uses: actions/setup-java@v3
with:
distribution: "zulu"
- java-version: 11
+ java-version: 17
- uses: actions/setup-python@v4
with:
python-version: "3.10"
diff --git a/build.gradle b/build.gradle
index b16e3ca169c717..a7a85db0398e21 100644
--- a/build.gradle
+++ b/build.gradle
@@ -1,17 +1,20 @@
buildscript {
+ ext.jdkVersion = 17
+ ext.javaClassVersion = 11
+
ext.junitJupiterVersion = '5.6.1'
// Releases: https://github.com/linkedin/rest.li/blob/master/CHANGELOG.md
- ext.pegasusVersion = '29.46.8'
+ ext.pegasusVersion = '29.48.4'
ext.mavenVersion = '3.6.3'
ext.springVersion = '5.3.29'
ext.springBootVersion = '2.7.14'
ext.openTelemetryVersion = '1.18.0'
- ext.neo4jVersion = '4.4.9'
- ext.neo4jTestVersion = '4.4.25'
- ext.neo4jApocVersion = '4.4.0.20:all'
+ ext.neo4jVersion = '5.14.0'
+ ext.neo4jTestVersion = '5.14.0'
+ ext.neo4jApocVersion = '5.14.0'
ext.testContainersVersion = '1.17.4'
ext.elasticsearchVersion = '2.9.0' // ES 7.10, Opensearch 1.x, 2.x
- ext.jacksonVersion = '2.15.2'
+ ext.jacksonVersion = '2.15.3'
ext.jettyVersion = '9.4.46.v20220331'
ext.playVersion = '2.8.18'
ext.log4jVersion = '2.19.0'
@@ -29,19 +32,19 @@ buildscript {
buildscript.repositories.addAll(project.repositories)
dependencies {
classpath 'com.linkedin.pegasus:gradle-plugins:' + pegasusVersion
- classpath 'com.github.node-gradle:gradle-node-plugin:2.2.4'
+ classpath 'com.github.node-gradle:gradle-node-plugin:7.0.1'
classpath 'io.acryl.gradle.plugin:gradle-avro-plugin:0.2.0'
classpath 'org.springframework.boot:spring-boot-gradle-plugin:' + springBootVersion
classpath "io.codearte.gradle.nexus:gradle-nexus-staging-plugin:0.30.0"
classpath "com.palantir.gradle.gitversion:gradle-git-version:3.0.0"
classpath "org.gradle.playframework:gradle-playframework:0.14"
- classpath "gradle.plugin.org.hidetake:gradle-swagger-generator-plugin:2.19.1"
+ classpath "gradle.plugin.org.hidetake:gradle-swagger-generator-plugin:2.19.2"
}
}
plugins {
- id 'com.gorylenko.gradle-git-properties' version '2.4.0-rc2'
- id 'com.github.johnrengelman.shadow' version '6.1.0'
+ id 'com.gorylenko.gradle-git-properties' version '2.4.1'
+ id 'com.github.johnrengelman.shadow' version '8.1.1' apply false
id 'com.palantir.docker' version '0.35.0' apply false
id "com.diffplug.spotless" version "6.23.3"
// https://blog.ltgt.net/javax-jakarta-mess-and-gradle-solution/
@@ -149,19 +152,20 @@ project.ext.externalDependency = [
'log4jApi': "org.apache.logging.log4j:log4j-api:$log4jVersion",
'log4j12Api': "org.slf4j:log4j-over-slf4j:$slf4jVersion",
'log4j2Api': "org.apache.logging.log4j:log4j-to-slf4j:$log4jVersion",
- 'lombok': 'org.projectlombok:lombok:1.18.16',
+ 'lombok': 'org.projectlombok:lombok:1.18.30',
'mariadbConnector': 'org.mariadb.jdbc:mariadb-java-client:2.6.0',
'mavenArtifact': "org.apache.maven:maven-artifact:$mavenVersion",
'mixpanel': 'com.mixpanel:mixpanel-java:1.4.4',
- 'mockito': 'org.mockito:mockito-core:3.0.0',
- 'mockitoInline': 'org.mockito:mockito-inline:3.0.0',
+ 'mockito': 'org.mockito:mockito-core:4.11.0',
+ 'mockitoInline': 'org.mockito:mockito-inline:4.11.0',
'mockServer': 'org.mock-server:mockserver-netty:5.11.2',
'mockServerClient': 'org.mock-server:mockserver-client-java:5.11.2',
'mysqlConnector': 'mysql:mysql-connector-java:8.0.20',
'neo4jHarness': 'org.neo4j.test:neo4j-harness:' + neo4jTestVersion,
'neo4jJavaDriver': 'org.neo4j.driver:neo4j-java-driver:' + neo4jVersion,
'neo4jTestJavaDriver': 'org.neo4j.driver:neo4j-java-driver:' + neo4jTestVersion,
- 'neo4jApoc': 'org.neo4j.procedure:apoc:' + neo4jApocVersion,
+ 'neo4jApocCore': 'org.neo4j.procedure:apoc-core:' + neo4jApocVersion,
+ 'neo4jApocCommon': 'org.neo4j.procedure:apoc-common:' + neo4jApocVersion,
'opentelemetryApi': 'io.opentelemetry:opentelemetry-api:' + openTelemetryVersion,
'opentelemetryAnnotations': 'io.opentelemetry:opentelemetry-extension-annotations:' + openTelemetryVersion,
'opentracingJdbc':'io.opentracing.contrib:opentracing-jdbc:0.2.15',
@@ -190,8 +194,8 @@ project.ext.externalDependency = [
'servletApi': 'javax.servlet:javax.servlet-api:3.1.0',
'shiroCore': 'org.apache.shiro:shiro-core:1.11.0',
'snakeYaml': 'org.yaml:snakeyaml:2.0',
- 'sparkSql' : 'org.apache.spark:spark-sql_2.11:2.4.8',
- 'sparkHive' : 'org.apache.spark:spark-hive_2.11:2.4.8',
+ 'sparkSql' : 'org.apache.spark:spark-sql_2.12:3.0.3',
+ 'sparkHive' : 'org.apache.spark:spark-hive_2.12:3.0.3',
'springBeans': "org.springframework:spring-beans:$springVersion",
'springContext': "org.springframework:spring-context:$springVersion",
'springCore': "org.springframework:spring-core:$springVersion",
@@ -210,7 +214,6 @@ project.ext.externalDependency = [
'springActuator': "org.springframework.boot:spring-boot-starter-actuator:$springBootVersion",
'swaggerAnnotations': 'io.swagger.core.v3:swagger-annotations:2.2.15',
'swaggerCli': 'io.swagger.codegen.v3:swagger-codegen-cli:3.0.46',
- 'testngJava8': 'org.testng:testng:7.5.1',
'testng': 'org.testng:testng:7.8.0',
'testContainers': 'org.testcontainers:testcontainers:' + testContainersVersion,
'testContainersJunit': 'org.testcontainers:junit-jupiter:' + testContainersVersion,
@@ -226,13 +229,69 @@ project.ext.externalDependency = [
'charle': 'com.charleskorn.kaml:kaml:0.53.0',
'common': 'commons-io:commons-io:2.7',
'jline':'jline:jline:1.4.1',
- 'jetbrains':' org.jetbrains.kotlin:kotlin-stdlib:1.6.0'
+ 'jetbrains':' org.jetbrains.kotlin:kotlin-stdlib:1.6.0',
+ 'annotationApi': 'javax.annotation:javax.annotation-api:1.3.2'
]
allprojects {
apply plugin: 'idea'
apply plugin: 'eclipse'
// apply plugin: 'org.gradlex.java-ecosystem-capabilities'
+
+ tasks.withType(Test).configureEach {
+ // https://docs.gradle.org/current/userguide/performance.html
+ maxParallelForks = Runtime.runtime.availableProcessors().intdiv(2) ?: 1
+
+ if (project.configurations.getByName("testImplementation").getDependencies()
+ .any{ it.getName().contains("testng") }) {
+ useTestNG()
+ }
+ }
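+
+ // Illustrative effect of the block above: a subproject declaring
+ //   testImplementation externalDependency.testng
+ // is switched to the TestNG runner, while JUnit-only subprojects keep Gradle's default runner.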
+
+ if (project.plugins.hasPlugin('java')
+ || project.plugins.hasPlugin('java-library')
+ || project.plugins.hasPlugin('application')
+ || project.plugins.hasPlugin('pegasus')) {
+
+ java {
+ toolchain {
+ languageVersion = JavaLanguageVersion.of(jdkVersion)
+ }
+ }
+
+ compileJava {
+ options.release = javaClassVersion
+ }
+ tasks.withType(JavaCompile).configureEach {
+ javaCompiler = javaToolchains.compilerFor {
+ languageVersion = JavaLanguageVersion.of(jdkVersion)
+ }
+ }
+
+ tasks.withType(JavaExec).configureEach {
+ javaLauncher = javaToolchains.launcherFor {
+ languageVersion = JavaLanguageVersion.of(jdkVersion)
+ }
+ }
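+
+ // The toolchain blocks above compile and run with jdkVersion, while options.release keeps the
+ // emitted bytecode compatible with javaClassVersion (both properties are defined earlier in this
+ // build script); illustratively, this allows building on a newer JDK while targeting older class files.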
+
+ // not a duplicate: these settings must be applied both outside and inside afterEvaluate
+ afterEvaluate {
+ compileJava {
+ options.release = javaClassVersion
+ }
+ tasks.withType(JavaCompile).configureEach {
+ javaCompiler = javaToolchains.compilerFor {
+ languageVersion = JavaLanguageVersion.of(jdkVersion)
+ }
+ }
+
+ tasks.withType(JavaExec).configureEach {
+ javaLauncher = javaToolchains.launcherFor {
+ languageVersion = JavaLanguageVersion.of(jdkVersion)
+ }
+ }
+ }
+ }
}
configure(subprojects.findAll {! it.name.startsWith('spark-lineage')}) {
@@ -264,8 +323,9 @@ subprojects {
failOnNoGitDirectory = false
}
- plugins.withType(JavaPlugin) {
+ plugins.withType(JavaPlugin).configureEach {
dependencies {
+ implementation externalDependency.annotationApi
constraints {
implementation("com.google.googlejavaformat:google-java-format:$googleJavaFormatVersion")
implementation('io.netty:netty-all:4.1.100.Final')
@@ -276,18 +336,30 @@ subprojects {
implementation("com.fasterxml.jackson.core:jackson-dataformat-cbor:$jacksonVersion")
}
}
+
spotless {
java {
googleJavaFormat()
target project.fileTree(project.projectDir) {
- include '**/*.java'
- exclude 'build/**/*.java'
- exclude '**/generated/**/*.*'
- exclude '**/mainGeneratedDataTemplate/**/*.*'
- exclude '**/mainGeneratedRest/**/*.*'
+ include 'src/**/*.java'
+ exclude 'src/**/resources/'
+ exclude 'src/**/generated/'
+ exclude 'src/**/mainGeneratedDataTemplate/'
+ exclude 'src/**/mainGeneratedRest/'
+ exclude 'src/renamed/avro/'
+ exclude 'src/test/sample-test-plugins/'
}
}
}
+
+ if (project.plugins.hasPlugin('pegasus')) {
+ dependencies {
+ dataTemplateCompile spec.product.pegasus.data
+ dataTemplateCompile externalDependency.annotationApi // support > jdk8
+ restClientCompile spec.product.pegasus.restliClient
+ }
+ }
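+
+ // The annotationApi entry above provides javax.annotation classes (e.g. javax.annotation.Generated)
+ // that Pegasus-generated code may reference; these classes shipped with the JDK up to Java 8 but
+ // are no longer bundled with newer JDKs, hence the "support > jdk8" note.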
+
afterEvaluate {
def spotlessJavaTask = tasks.findByName('spotlessJava')
def processTask = tasks.findByName('processResources')
@@ -305,28 +377,11 @@ subprojects {
}
}
- tasks.withType(JavaCompile).configureEach {
- javaCompiler = javaToolchains.compilerFor {
- languageVersion = JavaLanguageVersion.of(11)
- }
- }
- tasks.withType(Test).configureEach {
- javaLauncher = javaToolchains.launcherFor {
- languageVersion = JavaLanguageVersion.of(11)
- }
- // https://docs.gradle.org/current/userguide/performance.html
- maxParallelForks = Runtime.runtime.availableProcessors().intdiv(2) ?: 1
-
- if (project.configurations.getByName("testImplementation").getDependencies()
- .any{ it.getName().contains("testng") }) {
- useTestNG()
- }
- }
-
afterEvaluate {
if (project.plugins.hasPlugin('pegasus')) {
dependencies {
dataTemplateCompile spec.product.pegasus.data
+ dataTemplateCompile externalDependency.annotationApi // support > jdk8
restClientCompile spec.product.pegasus.restliClient
}
}
diff --git a/buildSrc/build.gradle b/buildSrc/build.gradle
index 1f9d30d520171b..0c2d91e1f7ac1b 100644
--- a/buildSrc/build.gradle
+++ b/buildSrc/build.gradle
@@ -1,9 +1,11 @@
-apply plugin: 'java'
-
buildscript {
apply from: '../repositories.gradle'
}
+plugins {
+ id 'java'
+}
+
dependencies {
/**
* Forked version of abandoned repository: https://github.com/fge/json-schema-avro
@@ -21,6 +23,9 @@ dependencies {
implementation 'com.fasterxml.jackson.dataformat:jackson-dataformat-yaml:2.13.5'
implementation 'commons-io:commons-io:2.11.0'
- compileOnly 'org.projectlombok:lombok:1.18.14'
- annotationProcessor 'org.projectlombok:lombok:1.18.14'
+ compileOnly 'org.projectlombok:lombok:1.18.30'
+ annotationProcessor 'org.projectlombok:lombok:1.18.30'
+
+ // pegasus gradle-plugins dependency: provides the task classes used by the patched PegasusPlugin bundled in buildSrc
+ implementation 'com.linkedin.pegasus:gradle-plugins:29.48.4'
}
\ No newline at end of file
diff --git a/buildSrc/src/main/java/com/linkedin/pegasus/gradle/PegasusPlugin.java b/buildSrc/src/main/java/com/linkedin/pegasus/gradle/PegasusPlugin.java
new file mode 100644
index 00000000000000..2460abcad6f9e9
--- /dev/null
+++ b/buildSrc/src/main/java/com/linkedin/pegasus/gradle/PegasusPlugin.java
@@ -0,0 +1,2444 @@
+/*
+ * Copyright (c) 2019 LinkedIn Corp.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.linkedin.pegasus.gradle;
+
+import com.linkedin.pegasus.gradle.PegasusOptions.IdlOptions;
+import com.linkedin.pegasus.gradle.internal.CompatibilityLogChecker;
+import com.linkedin.pegasus.gradle.tasks.ChangedFileReportTask;
+import com.linkedin.pegasus.gradle.tasks.CheckIdlTask;
+import com.linkedin.pegasus.gradle.tasks.CheckPegasusSnapshotTask;
+import com.linkedin.pegasus.gradle.tasks.CheckRestModelTask;
+import com.linkedin.pegasus.gradle.tasks.CheckSnapshotTask;
+import com.linkedin.pegasus.gradle.tasks.GenerateAvroSchemaTask;
+import com.linkedin.pegasus.gradle.tasks.GenerateDataTemplateTask;
+import com.linkedin.pegasus.gradle.tasks.GeneratePegasusSnapshotTask;
+import com.linkedin.pegasus.gradle.tasks.GenerateRestClientTask;
+import com.linkedin.pegasus.gradle.tasks.GenerateRestModelTask;
+import com.linkedin.pegasus.gradle.tasks.PublishRestModelTask;
+import com.linkedin.pegasus.gradle.tasks.TranslateSchemasTask;
+import com.linkedin.pegasus.gradle.tasks.ValidateExtensionSchemaTask;
+import com.linkedin.pegasus.gradle.tasks.ValidateSchemaAnnotationTask;
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.lang.reflect.Method;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+import java.util.Properties;
+import java.util.Set;
+import java.util.TreeSet;
+import java.util.function.Function;
+import java.util.regex.Pattern;
+import java.util.stream.Collectors;
+
+import org.gradle.api.Action;
+import org.gradle.api.GradleException;
+import org.gradle.api.Plugin;
+import org.gradle.api.Project;
+import org.gradle.api.Task;
+import org.gradle.api.artifacts.Configuration;
+import org.gradle.api.artifacts.ConfigurationContainer;
+import org.gradle.api.file.FileCollection;
+import org.gradle.api.plugins.JavaBasePlugin;
+import org.gradle.api.plugins.JavaPlugin;
+import org.gradle.api.plugins.JavaPluginConvention;
+import org.gradle.api.plugins.JavaPluginExtension;
+import org.gradle.api.publish.PublishingExtension;
+import org.gradle.api.publish.ivy.IvyPublication;
+import org.gradle.api.publish.ivy.plugins.IvyPublishPlugin;
+import org.gradle.api.tasks.Copy;
+import org.gradle.api.tasks.Delete;
+import org.gradle.api.tasks.SourceSet;
+import org.gradle.api.tasks.SourceSetContainer;
+import org.gradle.api.tasks.Sync;
+import org.gradle.api.tasks.TaskProvider;
+import org.gradle.api.tasks.bundling.Jar;
+import org.gradle.api.tasks.compile.JavaCompile;
+import org.gradle.api.tasks.javadoc.Javadoc;
+import org.gradle.language.base.plugins.LifecycleBasePlugin;
+import org.gradle.language.jvm.tasks.ProcessResources;
+import org.gradle.plugins.ide.eclipse.EclipsePlugin;
+import org.gradle.plugins.ide.eclipse.model.EclipseModel;
+import org.gradle.plugins.ide.idea.IdeaPlugin;
+import org.gradle.plugins.ide.idea.model.IdeaModule;
+import org.gradle.util.GradleVersion;
+
+
+/**
+ * Pegasus code generation plugin.
+ * The supported project layout for this plugin is as follows:
+ *
+ *
+ * --- api/
+ * | --- build.gradle
+ * | --- src/
+ * | --- <sourceSet>/
+ * | | --- idl/
+ * | | | --- <published idl (.restspec.json) files>
+ * | | --- java/
+ * | | | --- <packageName>/
+ * | | | --- <common java files>
+ * | | --- pegasus/
+ * | | --- <packageName>/
+ * | | --- <data schema (.pdsc) files>
+ * | --- <sourceSet>GeneratedDataTemplate/
+ * | | --- java/
+ * | | --- <packageName>/
+ * | | --- <data template source files generated from data schema (.pdsc) files>
+ * | --- <sourceSet>GeneratedAvroSchema/
+ * | | --- avro/
+ * | | --- <packageName>/
+ * | | --- <avsc avro schema files (.avsc) generated from pegasus schema files>
+ * | --- <sourceSet>GeneratedRest/
+ * | --- java/
+ * | --- <packageName>/
+ * | --- <rest client source (.java) files generated from published idl>
+ * --- impl/
+ * | --- build.gradle
+ * | --- src/
+ * | --- <sourceSet>/
+ * | | --- java/
+ * | | --- <packageName>/
+ * | | --- <resource class source (.java) files>
+ * | --- <sourceSet>GeneratedRest/
+ * | --- idl/
+ * | --- <generated idl (.restspec.json) files>
+ * --- <other projects>/
+ *
+ *
+ *
+ * api : contains all the files which are commonly depended on by the server and
+ * client implementation. The common files include the data schema (.pdsc) files,
+ * the idl (.restspec.json) files and potentially Java interface files used by both sides.
+ *
+ *
+ * impl : contains the resource class for server implementation.
+ *
+ *
+ * Performs the following functions:
+ *
+ * Generate data model and data template jars for each source set.
+ *
+ * Overview:
+ *
+ *
+ * In the api project, the plugin generates the data template source (.java) files from the
+ * data schema (.pdsc) files, and furthermore compiles the source files and packages them
+ * to jar files. Details of jar contents will be explained in following paragraphs.
+ * In general, data schema files should exist only in api projects.
+ *
+ *
+ *
+ * Configure the server and client implementation projects to depend on the
+ * api project's dataTemplate configuration to get access to the generated data templates
+ * from within these projects. This allows api classes to be built first so that implementation
+ * projects can consume them. We recommend this structure to avoid circular dependencies
+ * (directly or indirectly) among implementation projects.
+ *
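+ * For example, an implementation project's build.gradle might declare (the project
+ * path is illustrative):
+ *
+ *   dependencies {
+ *     dataModel project(path: ':example-api', configuration: 'dataTemplate')
+ *   }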
+ *
+ * Detail:
+ *
+ *
+ * Generates data template source (.java) files from data schema (.pdsc) files,
+ * compiles the data template source (.java) files into class (.class) files,
+ * creates a data model jar file and a data template jar file.
+ * The data model jar file contains the source data schema (.pdsc) files.
+ * The data template jar file contains both the source data schema (.pdsc) files
+ * and the generated data template class (.class) files.
+ *
+ *
+ *
+ * In the data template generation phase, the plugin creates a new target source set
+ * for the generated files. The new target source set's name is the input source set's name
+ * suffixed with "GeneratedDataTemplate", e.g. "mainGeneratedDataTemplate".
+ * The plugin invokes PegasusDataTemplateGenerator to generate data template source (.java) files
+ * for all data schema (.pdsc) files present in the input source set's pegasus
+ * directory, e.g. "src/main/pegasus". The generated data template source (.java) files
+ * will be in the new target source set's java source directory, e.g.
+ * "src/mainGeneratedDataTemplate/java". In addition to
+ * the data schema (.pdsc) files in the pegasus directory, the dataModel configuration
+ * specifies resolver path for the PegasusDataTemplateGenerator. The resolver path
+ * provides the data schemas and previously generated data template classes that
+ * may be referenced by the input source set's data schemas. In most cases, the dataModel
+ * configuration should contain data template jars.
+ *
+ *
+ *
+ * The next phase is the data template compilation phase, in which the plugin compiles the generated
+ * data template source (.java) files into class files. The dataTemplateCompile configuration
+ * specifies the pegasus jars needed to compile these classes. The compileClasspath of the
+ * target source set is a composite of the dataModel configuration which includes the data template
+ * classes that were previously generated and included in the dependent data template jars,
+ * and the dataTemplateCompile configuration.
+ * This configuration should specify a dependency on the Pegasus data jar.
+ *
+ *
+ *
+ * The following phase is creating the data model jar and the data template jar.
+ * This plugin creates the data model jar that includes the contents of the
+ * input source set's pegasus directory, and sets the jar file's classification to
+ * "data-model". Hence, the resulting jar file's name should end with "-data-model.jar".
+ * It adds the data model jar as an artifact to the dataModel configuration.
+ * This jar file should only contain data schema (.pdsc) files.
+ *
+ *
+ *
+ * This plugin also creates the data template jar that includes the contents of the input
+ * source set's pegasus directory and the java class output directory of the
+ * target source set. It sets the jar file's classification to "data-template".
+ * Hence, the resulting jar file's name should end with "-data-template.jar".
+ * It adds the data template jar file as an artifact to the dataTemplate configuration.
+ * This jar file contains both data schema (.pdsc) files and generated data template
+ * class (.class) files.
+ *
+ *
+ *
+ * This plugin will ensure that data template source files are generated before
+ * compiling the input source set and before the idea and eclipse tasks. It
+ * also adds the generated classes to the compileClasspath of the input source set.
+ *
+ *
+ *
+ * The configurations that apply to generating the data model and data template jars
+ * are as follows:
+ *
+ *
+ * The dataTemplateCompile configuration specifies the classpath for compiling
+ * the generated data template source (.java) files. In most cases,
+ * it should be the Pegasus data jar.
+ * (The default compile configuration is not used for compiling data templates because
+ * it is not desirable to include non data template dependencies in the data template jar.)
+ * The configuration should not directly include data template jars. Data template jars
+ * should be included in the dataModel configuration.
+ *
+ *
+ * The dataModel configuration provides the value of the "generator.resolver.path"
+ * system property that is passed to PegasusDataTemplateGenerator. In most cases,
+ * this configuration should contain only data template jars. The data template jars
+ * contain both data schema (.pdsc) files and generated data template (.class) files.
+ * PegasusDataTemplateGenerator will not generate data template (.java) files for
+ * classes that can be found in the resolver path. This avoids redundant generation
+ * of the same classes, and inclusion of these classes in multiple jars.
+ * The dataModel configuration is also used to publish the data model jar which
+ * contains only data schema (.pdsc) files.
+ *
+ *
+ * The testDataModel configuration is similar to the dataModel configuration
+ * except it is used when generating data templates from test source sets.
+ * It extends from the dataModel configuration. It is also used to publish
+ * the data model jar from test source sets.
+ *
+ *
+ * The dataTemplate configuration is used to publish the data template
+ * jar which contains both data schema (.pdsc) files and the data template class
+ * (.class) files generated from these data schema (.pdsc) files.
+ *
+ *
+ * The testDataTemplate configuration is similar to the dataTemplate configuration
+ * except it is used when publishing the data template jar files generated from
+ * test source sets.
+ *
+ *
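+ * A typical api project build.gradle wires these configurations roughly as follows
+ * (the dependency notation mirrors the one used in this repository's build scripts):
+ *
+ *   apply plugin: 'pegasus'
+ *
+ *   dependencies {
+ *     dataTemplateCompile spec.product.pegasus.data
+ *   }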
+ *
+ *
+ * Performs the following functions:
+ *
+ * Generate avro schema jars for each source set.
+ *
+ * Overview:
+ *
+ *
+ * In the api project, the task 'generateAvroSchema' generates the avro schema (.avsc)
+ * files from pegasus schema (.pdsc) files. In general, data schema files should exist
+ * only in api projects.
+ *
+ *
+ *
+ * Configure the server and client implementation projects to depend on the
+ * api project's avroSchema configuration to get access to the generated avro schemas
+ * from within these projects.
+ *
+ *
+ *
+ * This plugin also creates the avro schema jar that includes the contents of the input
+ * source set's avro directory and the avsc schema files.
+ * The resulting jar file's name should end with "-avro-schema.jar".
+ *
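+ * For example, a downstream project might consume the generated avro schemas with
+ * (the project path is illustrative):
+ *
+ *   dependencies {
+ *     avroSchema project(path: ':example-api', configuration: 'avroSchema')
+ *   }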
+ *
+ * Generate rest model and rest client jars for each source set.
+ *
+ * Overview:
+ *
+ *
+ * In the api project, generates rest client source (.java) files from the idl,
+ * compiles the rest client source (.java) files to rest client class (.class) files
+ * and puts them in jar files. In general, the api project should be the only place that
+ * contains the publishable idl files. If the published idl changes an existing idl
+ * in the api project, the plugin will emit a message indicating this has occurred and
+ * suggest that the entire project be rebuilt if it is desirable for clients of the
+ * idl to pick up the newly published changes.
+ *
+ *
+ *
+ * In the impl project, generates the idl (.restspec.json) files from the input
+ * source set's resource class files, then compares them against the existing idl
+ * files in the api project for compatibility checking. If incompatible changes are
+ * found, the build fails (unless a certain flag is specified; see below). If the
+ * generated idl passes compatibility checks (see compatibility check levels below),
+ * publishes the generated idl (.restspec.json) to the api project.
+ *
+ *
+ * Detail:
+ *
+ * rest client generation phase : in api project
+ *
+ *
+ * In this phase, the rest client source (.java) files are generated from the
+ * api project idl (.restspec.json) files using RestRequestBuilderGenerator.
+ * The generated rest client source files will be in the new target source set's
+ * java source directory, e.g. "src/mainGeneratedRest/java".
+ *
+ *
+ *
+ * RestRequestBuilderGenerator requires access to the data schemas referenced
+ * by the idl. The dataModel configuration specifies the resolver path needed
+ * by RestRequestBuilderGenerator to access the data schemas referenced by
+ * the idl that are not in the source set's pegasus directory.
+ * This plugin automatically includes the data schema (.pdsc) files in the
+ * source set's pegasus directory in the resolver path.
+ * In most cases, the dataModel configuration should contain data template jars.
+ * The data template jars contain both data schema (.pdsc) files and generated
+ * data template class (.class) files. By specifying data template jars instead
+ * of data model jars, redundant generation of data template classes is avoided
+ * as classes that can be found in the resolver path are not generated.
+ *
+ *
+ * rest client compilation phase : in api project
+ *
+ *
+ * In this phase, the plugin compiles the generated rest client source (.java)
+ * files into class files. The restClientCompile configuration specifies the
+ * pegasus jars needed to compile these classes. The compile classpath is a
+ * composite of the dataModel configuration which includes the data template
+ * classes that were previously generated and included in the dependent data template
+ * jars, and the restClientCompile configuration.
+ * This configuration should specify a dependency on the Pegasus restli-client jar.
+ *
+ *
+ *
+ * The following stage is creating the rest model jar and the rest client jar.
+ * This plugin creates the rest model jar that includes the
+ * generated idl (.restspec.json) files, and sets the jar file's classification to
+ * "rest-model". Hence, the resulting jar file's name should end with "-rest-model.jar".
+ * It adds the rest model jar as an artifact to the restModel configuration.
+ * This jar file should only contain idl (.restspec.json) files.
+ *
+ *
+ *
+ * This plugin also creates the rest client jar that includes the generated
+ * idl (.restspec.json) files and the java class output directory of the
+ * target source set. It sets the jar file's classification to "rest-client".
+ * Hence, the resulting jar file's name should end with "-rest-client.jar".
+ * It adds the rest client jar file as an artifact to the restClient configuration.
+ * This jar file contains both idl (.restspec.json) files and generated rest client
+ * class (.class) files.
+ *
+ *
+ * idl generation phase : in server implementation project
+ *
+ *
+ * Before entering this phase, the plugin will ensure that generating idl will
+ * occur after compiling the input source set. It will also ensure that IDEA
+ * and Eclipse tasks run after rest client source (.java) files are generated.
+ *
+ *
+ *
+ * In this phase, the plugin creates a new target source set for the generated files.
+ * The new target source set's name is the input source set's name suffixed with
+ * "GeneratedRest", e.g. "mainGeneratedRest". The plugin invokes
+ * RestLiResourceModelExporter to generate idl (.restspec.json) files for each
+ * IdlItem in the input source set's pegasus IdlOptions. The generated idl files
+ * will be in target source set's idl directory, e.g. "src/mainGeneratedRest/idl".
+ * For example, the following adds an IdlItem to the source set's pegasus IdlOptions.
+ * This line should appear in the impl project's build.gradle. If no IdlItem is added,
+ * this source set will be excluded from generating idl and checking idl compatibility,
+ * even if there are existing idl files.
+ *
+ * pegasus.main.idlOptions.addIdlItem(["com.linkedin.restli.examples.groups.server"])
+ *
+ *
+ *
+ *
+ * After the idl generation phase, each included idl file is checked for compatibility against
+ * those in the api project. In case the current interface breaks compatibility,
+ * by default the build fails and reports all compatibility errors and warnings. Otherwise,
+ * the build tasks in the api project later will package the resource classes into jar files.
+ * Users can change the compatibility requirement between the current and published idl by
+ * setting the "rest.model.compatibility" project property, i.e.
+ * "gradle -Prest.model.compatibility=<compatibility level> ...". The following levels are supported:
+ *
+ * ignore : idl compatibility check will occur but its result will be ignored.
+ * The result will be aggregated and printed at the end of the build.
+ * backwards : build fails if there are backwards incompatible changes in idl.
+ * Build continues if there are only compatible changes.
+ * equivalent (default) : build fails if there are any functional changes (compatible or
+ * incompatible) in the current idl. Only docs and comments are allowed to be different.
+ *
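+ * For example, to allow backwards compatible idl changes for a single invocation:
+ *
+ *   gradle build -Prest.model.compatibility=backwards
+ *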
+ * The plugin needs to know where the api project is. It searches for the api project in the
+ * following steps. If all searches fail, the build fails.
+ *
+ *
+ * Use the specified project from the impl project build.gradle file. The ext.apiProject
+ * property explicitly assigns the api project. E.g.
+ *
+ * ext.apiProject = project(':groups:groups-server-api')
+ *
+ * If multiple such statements exist, the last one will be used. A wrong project path causes a Gradle
+ * evaluation error.
+ *
+ *
+ * If no ext.apiProject property is defined, the plugin will try to guess the
+ * api project name with the following conventions. The search stops at the first successful match.
+ *
+ *
+ * If the impl project name ends with the following suffixes, substitute the suffix with "-api".
+ *
+ * -impl
+ * -service
+ * -server
+ * -server-impl
+ *
+ * This list can be overridden by inserting the following line to the project build.gradle:
+ *
+ * ext.apiProjectSubstitutionSuffixes = ['-new-suffix-1', '-new-suffix-2']
+ *
+ * Alternatively, this setting could be applied globally to all projects by putting it in
+ * the subprojects section of the root build.gradle.
+ *
+ *
+ * Append "-api" to the impl project name.
+ *
+ *
+ *
+ *
+ * The plugin invokes RestLiResourceModelCompatibilityChecker to check compatibility.
+ *
+ *
+ *
+ * The idl files in the api project are not generated by the plugin, but rather
+ * "published" from the impl project. The publishRestModel task is used to copy the
+ * idl files to the api project. This task is invoked automatically if the idls are
+ * verified to be "safe". "Safe" is determined by the "rest.model.compatibility"
+ * property. Because this task is skipped if the idls are functionally equivalent
+ * (not necessarily identical, e.g. differ in doc fields), if the default "equivalent"
+ * compatibility level is used, no file will be copied. If such automatic publishing
+ * is intended to be skipped, set the "rest.model.skipPublish" property to true.
+ * Note that all the properties are per-project and can be overridden in each project's
+ * build.gradle file.
+ *
+ *
+ *
+ * Please always keep in mind that if idl publishing happens, a subsequent whole-project
+ * rebuild is necessary to pick up the changes. Otherwise, the Hudson job will fail and
+ * the source code commit will fail.
+ *
+ *
+ *
+ * The configurations that apply to generating the rest model and rest client jars
+ * are as follows:
+ *
+ *
+ * The restClientCompile configuration specifies the classpath for compiling
+ * the generated rest client source (.java) files. In most cases,
+ * it should be the Pegasus restli-client jar.
+ * (The default compile configuration is not used for compiling rest client because
+ * it is not desirable to include non rest client dependencies, such as
+ * the rest server implementation classes, in the rest client jar.)
+ * The configuration should not directly include data template jars. Data template jars
+ * should be included in the dataModel configuration.
+ *
+ *
+ * The dataModel configuration provides the value of the "generator.resolver.path"
+ * system property that is passed to RestRequestBuilderGenerator.
+ * This configuration should contain only data template jars. The data template jars
+ * contain both data schema (.pdsc) files and generated data template (.class) files.
+ * The RestRequestBuilderGenerator will only generate rest client classes.
+ * The dataModel configuration is also included in the compile classpath for the
+ * generated rest client source files. If the dataModel configuration does not
+ * include the generated data template classes, the Java compiler may not be able to
+ * find the data template classes referenced by the generated rest client.
+ *
+ *
+ * The testDataModel configuration is similar to the dataModel configuration
+ * except it is used when generating rest client source files from
+ * test source sets.
+ *
+ *
+ * The restModel configuration is used to publish the rest model jar
+ * which contains generated idl (.restspec.json) files.
+ *
+ *
+ * The testRestModel configuration is similar to the restModel configuration
+ * except it is used to publish rest model jar files generated from
+ * test source sets.
+ *
+ *
+ * The restClient configuration is used to publish the rest client jar
+ * which contains both generated idl (.restspec.json) files and
+ * the rest client class (.class) files generated from these
+ * idl (.restspec.json) files.
+ *
+ *
+ * The testRestClient configuration is similar to the restClient configuration
+ * except it is used to publish rest client jar files generated from
+ * test source sets.
+ *
+ *
+ *
+ *
+ *
+ * This plugin considers source sets whose names begin with 'test' or 'integTest' to be
+ * test source sets.
+ *
+ */
+public class PegasusPlugin implements Plugin<Project>
+{
+ public static boolean debug = false;
+
+ private static final GradleVersion MIN_REQUIRED_VERSION = GradleVersion.version("1.0"); // Next: 5.2.1
+ private static final GradleVersion MIN_SUGGESTED_VERSION = GradleVersion.version("5.2.1"); // Next: 5.3
+
+ //
+ // Constants for generating sourceSet names and corresponding directory names
+ // for generated code
+ //
+ private static final String DATA_TEMPLATE_GEN_TYPE = "DataTemplate";
+ private static final String REST_GEN_TYPE = "Rest";
+ private static final String AVRO_SCHEMA_GEN_TYPE = "AvroSchema";
+
+ public static final String DATA_TEMPLATE_FILE_SUFFIX = ".pdsc";
+ public static final String PDL_FILE_SUFFIX = ".pdl";
+ // gradle property to opt OUT of schema annotation validation; by default this feature is enabled.
+ private static final String DISABLE_SCHEMA_ANNOTATION_VALIDATION = "schema.annotation.validation.disable";
+ // gradle property to opt in for destroying stale files from the build directory,
+ // by default it is disabled, because it triggers hot-reload (even if it results in a no-op)
+ private static final String DESTROY_STALE_FILES_ENABLE = "enableDestroyStaleFiles";
+ public static final Collection<String> DATA_TEMPLATE_FILE_SUFFIXES = new ArrayList<>();
+
+ public static final String IDL_FILE_SUFFIX = ".restspec.json";
+ public static final String SNAPSHOT_FILE_SUFFIX = ".snapshot.json";
+ public static final String SNAPSHOT_COMPAT_REQUIREMENT = "rest.model.compatibility";
+ public static final String IDL_COMPAT_REQUIREMENT = "rest.idl.compatibility";
+ // Pegasus schema compatibility level configuration, which is used to define the {@link CompatibilityLevel}.
+ public static final String PEGASUS_SCHEMA_SNAPSHOT_REQUIREMENT = "pegasusPlugin.pegasusSchema.compatibility";
+ // Pegasus extension schema compatibility level configuration, which is used to define the {@link CompatibilityLevel}
+ public static final String PEGASUS_EXTENSION_SCHEMA_SNAPSHOT_REQUIREMENT = "pegasusPlugin.extensionSchema.compatibility";
+ // CompatibilityOptions Mode configuration, which is used to define the {@link CompatibilityOptions#Mode} in the compatibility checker.
+ private static final String PEGASUS_COMPATIBILITY_MODE = "pegasusPlugin.pegasusSchemaCompatibilityCheckMode";
+
+ private static final Pattern TEST_DIR_REGEX = Pattern.compile("^(integ)?[Tt]est");
+ private static final String SNAPSHOT_NO_PUBLISH = "rest.model.noPublish";
+ private static final String SNAPSHOT_FORCE_PUBLISH = "rest.model.forcePublish";
+ private static final String PROCESS_EMPTY_IDL_DIR = "rest.idl.processEmptyIdlDir";
+ private static final String IDL_NO_PUBLISH = "rest.idl.noPublish";
+ private static final String IDL_FORCE_PUBLISH = "rest.idl.forcePublish";
+ private static final String SKIP_IDL_CHECK = "rest.idl.skipCheck";
+ // gradle property to skip running GenerateRestModel task.
+ // Note it affects the GenerateRestModel task only, and does not skip tasks that depend on GenerateRestModel.
+ private static final String SKIP_GENERATE_REST_MODEL = "rest.model.skipGenerateRestModel";
+ private static final String SUPPRESS_REST_CLIENT_RESTLI_2 = "rest.client.restli2.suppress";
+ private static final String SUPPRESS_REST_CLIENT_RESTLI_1 = "rest.client.restli1.suppress";
+
+ private static final String GENERATOR_CLASSLOADER_NAME = "pegasusGeneratorClassLoader";
+
+ private static final String CONVERT_TO_PDL_REVERSE = "convertToPdl.reverse";
+ private static final String CONVERT_TO_PDL_KEEP_ORIGINAL = "convertToPdl.keepOriginal";
+ private static final String CONVERT_TO_PDL_SKIP_VERIFICATION = "convertToPdl.skipVerification";
+ private static final String CONVERT_TO_PDL_PRESERVE_SOURCE_CMD = "convertToPdl.preserveSourceCmd";
+
+ // Below variables are used to collect data across all pegasus projects (sub-projects) and then print information
+ // to the user at the end after build is finished.
+ private static StringBuffer _restModelCompatMessage = new StringBuffer();
+ private static final Collection<String> _needCheckinFiles = new ArrayList<>();
+ private static final Collection<String> _needBuildFolders = new ArrayList<>();
+ private static final Collection<String> _possibleMissingFilesInEarlierCommit = new ArrayList<>();
+
+ private static final String RUN_ONCE = "runOnce";
+ private static final Object STATIC_PROJECT_EVALUATED_LOCK = new Object();
+
+ private static final List<String> UNUSED_CONFIGURATIONS = Arrays.asList(
+ "dataTemplateGenerator", "restTools", "avroSchemaGenerator");
+ // Directory in the dataTemplate jar that holds schemas translated from PDL to PDSC.
+ private static final String TRANSLATED_SCHEMAS_DIR = "legacyPegasusSchemas";
+ // Enable the use of argFiles for the tasks that support them
+ private static final String ENABLE_ARG_FILE = "pegasusPlugin.enableArgFile";
+ // Enable the generation of fluent APIs
+ private static final String ENABLE_FLUENT_API = "pegasusPlugin.enableFluentApi";
+
+ // This config impacts GenerateDataTemplateTask and GenerateRestClientTask;
+ // If not set, by default all paths generated in these two tasks will be lower-case.
+ // This default behavior is needed because Linux, macOS, and Windows treat case-sensitive paths differently,
+ // and we want to be consistent, so lower-case is used as the default case for generated paths.
+ private static final String CODE_GEN_PATH_CASE_SENSITIVE = "pegasusPlugin.generateCaseSensitivePath";
+
+ private static final String PEGASUS_PLUGIN_CONFIGURATION = "pegasusPlugin";
+
+ // Enable the use of generic pegasus schema compatibility checker
+ private static final String ENABLE_PEGASUS_SCHEMA_COMPATIBILITY_CHECK = "pegasusPlugin.enablePegasusSchemaCompatibilityCheck";
+
+ private static final String PEGASUS_SCHEMA_SNAPSHOT = "PegasusSchemaSnapshot";
+
+ private static final String PEGASUS_EXTENSION_SCHEMA_SNAPSHOT = "PegasusExtensionSchemaSnapshot";
+
+ private static final String PEGASUS_SCHEMA_SNAPSHOT_DIR = "pegasusSchemaSnapshot";
+
+ private static final String PEGASUS_EXTENSION_SCHEMA_SNAPSHOT_DIR = "pegasusExtensionSchemaSnapshot";
+
+ private static final String PEGASUS_SCHEMA_SNAPSHOT_DIR_OVERRIDE = "overridePegasusSchemaSnapshotDir";
+
+ private static final String PEGASUS_EXTENSION_SCHEMA_SNAPSHOT_DIR_OVERRIDE = "overridePegasusExtensionSchemaSnapshotDir";
+
+ private static final String SRC = "src";
+
+ private static final String SCHEMA_ANNOTATION_HANDLER_CONFIGURATION = "schemaAnnotationHandler";
+
+ private static final String COMPATIBILITY_OPTIONS_MODE_EXTENSION = "EXTENSION";
+
+
+ @SuppressWarnings("unchecked")
+ private Class<? extends Plugin<Project>> _thisPluginType = (Class<? extends Plugin<Project>>)
+ getClass().asSubclass(Plugin.class);
+
+ private Task _generateSourcesJarTask;
+ private Javadoc _generateJavadocTask;
+ private Task _generateJavadocJarTask;
+ private boolean _configureIvyPublications = true;
+
+ public void setPluginType(Class<? extends Plugin<Project>> pluginType)
+ {
+ _thisPluginType = pluginType;
+ }
+
+ public void setSourcesJarTask(Task sourcesJarTask)
+ {
+ _generateSourcesJarTask = sourcesJarTask;
+ }
+
+ public void setJavadocJarTask(Task javadocJarTask)
+ {
+ _generateJavadocJarTask = javadocJarTask;
+ }
+
+ public void setConfigureIvyPublications(boolean configureIvyPublications) {
+ _configureIvyPublications = configureIvyPublications;
+ }
+
+ @Override
+ public void apply(Project project)
+ {
+ checkGradleVersion(project);
+
+ project.getPlugins().apply(JavaPlugin.class);
+
+ // this HashMap will have a PegasusOptions per sourceSet
+ project.getExtensions().getExtraProperties().set("pegasus", new HashMap<>());
+ // this map will extract PegasusOptions.GenerationMode to project property
+ project.getExtensions().getExtraProperties().set("PegasusGenerationMode",
+ Arrays.stream(PegasusOptions.GenerationMode.values())
+ .collect(Collectors.toMap(PegasusOptions.GenerationMode::name, Function.identity())));
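+ // This lets build scripts refer to generation modes by name, e.g. (illustrative usage):
+ //   pegasus.main.generationModes = [PegasusGenerationMode.PEGASUS]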
+
+ synchronized (STATIC_PROJECT_EVALUATED_LOCK)
+ {
+ // Check if this is the first time the block will run. Pegasus plugin can run multiple times in a build if
+ // multiple sub-projects applied the plugin.
+ if (!project.getRootProject().hasProperty(RUN_ONCE)
+ || !Boolean.parseBoolean(String.valueOf(project.getRootProject().property(RUN_ONCE))))
+ {
+ project.getGradle().projectsEvaluated(gradle ->
+ gradle.getRootProject().subprojects(subproject ->
+ UNUSED_CONFIGURATIONS.forEach(configurationName -> {
+ Configuration conf = subproject.getConfigurations().findByName(configurationName);
+ if (conf != null && !conf.getDependencies().isEmpty()) {
+ subproject.getLogger().warn("*** Project {} declares dependency to unused configuration \"{}\". "
+ + "This configuration is deprecated and you can safely remove the dependency. ***",
+ subproject.getPath(), configurationName);
+ }
+ })
+ )
+ );
+
+ // Re-initialize the static variables as they might have stale values from previous run. With Gradle 3.0 and
+ // gradle daemon enabled, the plugin class might not be loaded for every run.
+ DATA_TEMPLATE_FILE_SUFFIXES.clear();
+ DATA_TEMPLATE_FILE_SUFFIXES.add(DATA_TEMPLATE_FILE_SUFFIX);
+ DATA_TEMPLATE_FILE_SUFFIXES.add(PDL_FILE_SUFFIX);
+
+ _restModelCompatMessage = new StringBuffer();
+ _needCheckinFiles.clear();
+ _needBuildFolders.clear();
+ _possibleMissingFilesInEarlierCommit.clear();
+
+ project.getGradle().buildFinished(result ->
+ {
+ StringBuilder endOfBuildMessage = new StringBuilder();
+ if (_restModelCompatMessage.length() > 0)
+ {
+ endOfBuildMessage.append(_restModelCompatMessage);
+ }
+
+ if (!_needCheckinFiles.isEmpty())
+ {
+ endOfBuildMessage.append(createModifiedFilesMessage(_needCheckinFiles, _needBuildFolders));
+ }
+
+ if (!_possibleMissingFilesInEarlierCommit.isEmpty())
+ {
+ endOfBuildMessage.append(createPossibleMissingFilesMessage(_possibleMissingFilesInEarlierCommit));
+ }
+
+ if (endOfBuildMessage.length() > 0)
+ {
+ result.getGradle().getRootProject().getLogger().quiet(endOfBuildMessage.toString());
+ }
+ });
+
+ // Set an extra property on the root project to indicate the initialization is complete for the current build.
+ project.getRootProject().getExtensions().getExtraProperties().set(RUN_ONCE, true);
+ }
+ }
+
+ ConfigurationContainer configurations = project.getConfigurations();
+
+ // configuration for getting the required classes to make pegasus call main methods
+ configurations.maybeCreate(PEGASUS_PLUGIN_CONFIGURATION);
+
+ // configuration for compiling generated data templates
+ Configuration dataTemplateCompile = configurations.maybeCreate("dataTemplateCompile");
+ dataTemplateCompile.setVisible(false);
+
+ // configuration for running rest client generator
+ Configuration restClientCompile = configurations.maybeCreate("restClientCompile");
+ restClientCompile.setVisible(false);
+
+ // configuration for running data template generator
+ // DEPRECATED! This configuration is no longer used. Please stop using it.
+ Configuration dataTemplateGenerator = configurations.maybeCreate("dataTemplateGenerator");
+ dataTemplateGenerator.setVisible(false);
+
+ // configuration for running rest client generator
+ // DEPRECATED! This configuration is no longer used. Please stop using it.
+ Configuration restTools = configurations.maybeCreate("restTools");
+ restTools.setVisible(false);
+
+ // configuration for running Avro schema generator
+ // DEPRECATED! To skip avro schema generation, use PegasusOptions.generationModes
+ Configuration avroSchemaGenerator = configurations.maybeCreate("avroSchemaGenerator");
+ avroSchemaGenerator.setVisible(false);
+
+ // configuration for depending on data schemas and potentially generated data templates
+ // and for publishing jars containing data schemas to the project artifacts for including in the ivy.xml
+ Configuration dataModel = configurations.maybeCreate("dataModel");
+ Configuration testDataModel = configurations.maybeCreate("testDataModel");
+ testDataModel.extendsFrom(dataModel);
+
+ // configuration for depending on data schemas and potentially generated data templates
+ // and for publishing jars containing data schemas to the project artifacts for including in the ivy.xml
+ Configuration avroSchema = configurations.maybeCreate("avroSchema");
+ Configuration testAvroSchema = configurations.maybeCreate("testAvroSchema");
+ testAvroSchema.extendsFrom(avroSchema);
+
+ // configuration for depending on rest idl and potentially generated client builders
+ // and for publishing jars containing rest idl to the project artifacts for including in the ivy.xml
+ Configuration restModel = configurations.maybeCreate("restModel");
+ Configuration testRestModel = configurations.maybeCreate("testRestModel");
+ testRestModel.extendsFrom(restModel);
+
+ // configuration for publishing jars containing data schemas and generated data templates
+ // to the project artifacts for including in the ivy.xml
+ //
+ // published data template jars depend on the configurations used to compile the classes
+ // in the jar; this includes the data models/templates used by the data template generator
+ // and the classes used to compile the generated classes.
+ Configuration dataTemplate = configurations.maybeCreate("dataTemplate");
+ dataTemplate.extendsFrom(dataTemplateCompile, dataModel);
+ Configuration testDataTemplate = configurations.maybeCreate("testDataTemplate");
+ testDataTemplate.extendsFrom(dataTemplate, testDataModel);
+
+ // configuration for processing and validating schema annotation during build time.
+ //
+ // The configuration contains dependencies to schema annotation handlers which would process schema annotations
+ // and validate.
+ Configuration schemaAnnotationHandler = configurations.maybeCreate(SCHEMA_ANNOTATION_HANDLER_CONFIGURATION);
+
+ // configuration for publishing jars containing rest idl and generated client builders
+ // to the project artifacts for including in the ivy.xml
+ //
+ // published client builder jars depend on the configurations used to compile the classes
+ // in the jar; this includes the data models/templates (potentially generated by this
+ // project and) used by the data template generator and the classes used to compile
+ // the generated classes.
+ Configuration restClient = configurations.maybeCreate("restClient");
+ restClient.extendsFrom(restClientCompile, dataTemplate);
+ Configuration testRestClient = configurations.maybeCreate("testRestClient");
+ testRestClient.extendsFrom(restClient, testDataTemplate);
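+
+ // Summary of the wiring above:
+ //   dataTemplate extends dataTemplateCompile and dataModel
+ //   restClient extends restClientCompile and dataTemplate
+ //   each test* configuration extends its non-test counterpart plus the matching test inputs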
+
+ Properties properties = new Properties();
+ InputStream inputStream = getClass().getResourceAsStream("/pegasus-version.properties");
+ if (inputStream != null)
+ {
+ try
+ {
+ properties.load(inputStream);
+ }
+ catch (IOException e)
+ {
+ throw new GradleException("Unable to read pegasus-version.properties file.", e);
+ }
+
+ String version = properties.getProperty("pegasus.version");
+
+ project.getDependencies().add(PEGASUS_PLUGIN_CONFIGURATION, "com.linkedin.pegasus:data:" + version);
+ project.getDependencies().add(PEGASUS_PLUGIN_CONFIGURATION, "com.linkedin.pegasus:data-avro-generator:" + version);
+ project.getDependencies().add(PEGASUS_PLUGIN_CONFIGURATION, "com.linkedin.pegasus:generator:" + version);
+ project.getDependencies().add(PEGASUS_PLUGIN_CONFIGURATION, "com.linkedin.pegasus:restli-tools:" + version);
+ }
+ else
+ {
+ project.getLogger().lifecycle("Unable to add pegasus dependencies to {}. Please be sure that "
+ + "'com.linkedin.pegasus:data', 'com.linkedin.pegasus:data-avro-generator', 'com.linkedin.pegasus:generator', 'com.linkedin.pegasus:restli-tools'"
+ + " are available on the configuration pegasusPlugin",
+ project.getPath());
+ }
+ project.getDependencies().add(PEGASUS_PLUGIN_CONFIGURATION, "org.slf4j:slf4j-simple:1.7.2");
+ project.getDependencies().add(PEGASUS_PLUGIN_CONFIGURATION, project.files(System.getProperty("java.home") + "/../lib/tools.jar"));
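+ // Note: tools.jar only exists on JDK 8 and older (it was removed in JDK 9), so on newer JDKs the
+ // file reference above points at a path that does not exist.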
+
+ // this call has to be here because:
+ // 1) artifacts cannot be published once the project has been evaluated, so we need to first
+ // create the tasks and artifact handler, then progressively append sources
+ // 2) in order to append sources progressively, the source and documentation tasks and artifacts must be
+ // configured/created before configuring and creating the code generation tasks.
+
+ configureGeneratedSourcesAndJavadoc(project);
+
+ ChangedFileReportTask changedFileReportTask = project.getTasks()
+ .create("changedFilesReport", ChangedFileReportTask.class);
+
+ project.getTasks().getByName("check").dependsOn(changedFileReportTask);
+
+ SourceSetContainer sourceSets = project.getConvention()
+ .getPlugin(JavaPluginConvention.class).getSourceSets();
+
+ sourceSets.all(sourceSet ->
+ {
+ if (sourceSet.getName().toLowerCase(Locale.US).contains("generated"))
+ {
+ return;
+ }
+
+ checkAvroSchemaExist(project, sourceSet);
+
+ // the idl Generator input options will be inside the PegasusOptions class. Users of the
+ // plugin can set the inputOptions in their build.gradle
+ @SuppressWarnings("unchecked")
+ Map<String, PegasusOptions> pegasusOptions = (Map<String, PegasusOptions>) project
+ .getExtensions().getExtraProperties().get("pegasus");
+
+ pegasusOptions.put(sourceSet.getName(), new PegasusOptions());
+
+ // rest model generation could fail on incompatibility
+ // if it can fail, fail it early
+ configureRestModelGeneration(project, sourceSet);
+
+ // Do compatibility check for schemas under "pegasus" directory if the configuration property is provided.
+ if (isPropertyTrue(project, ENABLE_PEGASUS_SCHEMA_COMPATIBILITY_CHECK))
+ {
+ configurePegasusSchemaSnapshotGeneration(project, sourceSet, false);
+ }
+
+ configurePegasusSchemaSnapshotGeneration(project, sourceSet, true);
+
+ configureConversionUtilities(project, sourceSet);
+
+ GenerateDataTemplateTask generateDataTemplateTask = configureDataTemplateGeneration(project, sourceSet);
+
+ configureAvroSchemaGeneration(project, sourceSet);
+
+ configureRestClientGeneration(project, sourceSet);
+
+ if (!isPropertyTrue(project, DISABLE_SCHEMA_ANNOTATION_VALIDATION))
+ {
+ configureSchemaAnnotationValidation(project, sourceSet, generateDataTemplateTask);
+ }
+
+ Task cleanGeneratedDirTask = project.task(sourceSet.getTaskName("clean", "GeneratedDir"));
+ cleanGeneratedDirTask.doLast(new CacheableAction<>(task ->
+ {
+ deleteGeneratedDir(project, sourceSet, REST_GEN_TYPE);
+ deleteGeneratedDir(project, sourceSet, AVRO_SCHEMA_GEN_TYPE);
+ deleteGeneratedDir(project, sourceSet, DATA_TEMPLATE_GEN_TYPE);
+ }));
+
+ // make clean depends on deleting the generated directories
+ project.getTasks().getByName("clean").dependsOn(cleanGeneratedDirTask);
+
+ // Set data schema directories as resource roots
+ configureDataSchemaResourcesRoot(project, sourceSet);
+ });
+
+ project.getExtensions().getExtraProperties().set(GENERATOR_CLASSLOADER_NAME, getClass().getClassLoader());
+ }
+
+ protected void configureSchemaAnnotationValidation(Project project,
+ SourceSet sourceSet,
+ GenerateDataTemplateTask generateDataTemplatesTask)
+ {
+ // Tasks execute in the following order:
+ // generateDataTemplatesTask -> validateSchemaAnnotationTask
+
+ // Create ValidateSchemaAnnotation task
+ ValidateSchemaAnnotationTask validateSchemaAnnotationTask = project.getTasks()
+ .create(sourceSet.getTaskName("validate", "schemaAnnotation"), ValidateSchemaAnnotationTask.class, task ->
+ {
+ task.setInputDir(generateDataTemplatesTask.getInputDir());
+ task.setResolverPath(getDataModelConfig(project, sourceSet)); // same resolver path as generateDataTemplatesTask
+ task.setClassPath(project.getConfigurations().getByName(SCHEMA_ANNOTATION_HANDLER_CONFIGURATION)
+ .plus(project.getConfigurations().getByName(PEGASUS_PLUGIN_CONFIGURATION))
+ .plus(project.getConfigurations().getByName(JavaPlugin.RUNTIME_CLASSPATH_CONFIGURATION_NAME)));
+ task.setHandlerJarPath(project.getConfigurations().getByName(SCHEMA_ANNOTATION_HANDLER_CONFIGURATION));
+ if (isPropertyTrue(project, ENABLE_ARG_FILE))
+ {
+ task.setEnableArgFile(true);
+ }
+ }
+ );
+
+ // validateSchemaAnnotationTask depends on generateDataTemplatesTask
+ validateSchemaAnnotationTask.dependsOn(generateDataTemplatesTask);
+
+ // Check depends on validateSchemaAnnotationTask.
+ project.getTasks().getByName("check").dependsOn(validateSchemaAnnotationTask);
+ }
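+
+ // Build scripts supply handlers through the schemaAnnotationHandler configuration, e.g.
+ // (coordinates are hypothetical):
+ //   dependencies { schemaAnnotationHandler 'com.example:my-schema-annotation-handler:1.0' }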
+
+
+
+ @SuppressWarnings("deprecation")
+ protected void configureGeneratedSourcesAndJavadoc(Project project)
+ {
+ _generateJavadocTask = project.getTasks().create("generateJavadoc", Javadoc.class);
+
+ if (_generateSourcesJarTask == null)
+ {
+ //
+ // configuration for publishing jars containing sources for generated classes
+ // to the project artifacts for including in the ivy.xml
+ //
+ ConfigurationContainer configurations = project.getConfigurations();
+ Configuration generatedSources = configurations.maybeCreate("generatedSources");
+ Configuration testGeneratedSources = configurations.maybeCreate("testGeneratedSources");
+ testGeneratedSources.extendsFrom(generatedSources);
+
+ _generateSourcesJarTask = project.getTasks().create("generateSourcesJar", Jar.class, jarTask -> {
+ jarTask.setGroup(JavaBasePlugin.DOCUMENTATION_GROUP);
+ jarTask.setDescription("Generates a jar file containing the sources for the generated Java classes.");
+ // FIXME change to #getArchiveClassifier().set("sources"); breaks backwards-compatibility before 5.1
+ // DataHub Note - applied FIXME
+ jarTask.getArchiveClassifier().set("sources");
+ });
+
+ project.getArtifacts().add("generatedSources", _generateSourcesJarTask);
+ }
+
+ if (_generateJavadocJarTask == null)
+ {
+ //
+ // configuration for publishing jars containing Javadoc for generated classes
+ // to the project artifacts for including in the ivy.xml
+ //
+ ConfigurationContainer configurations = project.getConfigurations();
+ Configuration generatedJavadoc = configurations.maybeCreate("generatedJavadoc");
+ Configuration testGeneratedJavadoc = configurations.maybeCreate("testGeneratedJavadoc");
+ testGeneratedJavadoc.extendsFrom(generatedJavadoc);
+
+ _generateJavadocJarTask = project.getTasks().create("generateJavadocJar", Jar.class, jarTask -> {
+ jarTask.dependsOn(_generateJavadocTask);
+ jarTask.setGroup(JavaBasePlugin.DOCUMENTATION_GROUP);
+ jarTask.setDescription("Generates a jar file containing the Javadoc for the generated Java classes.");
+ // FIXME change to #getArchiveClassifier().set("sources"); breaks backwards-compatibility before 5.1
+ // DataHub Note - applied FIXME
+ jarTask.getArchiveClassifier().set("javadoc");
+ jarTask.from(_generateJavadocTask.getDestinationDir());
+ });
+
+ project.getArtifacts().add("generatedJavadoc", _generateJavadocJarTask);
+ }
+ else
+ {
+ // TODO: Tighten the types so that _generateJavadocJarTask must be of type Jar.
+ ((Jar) _generateJavadocJarTask).from(_generateJavadocTask.getDestinationDir());
+ _generateJavadocJarTask.dependsOn(_generateJavadocTask);
+ }
+ }
+
+ private static void deleteGeneratedDir(Project project, SourceSet sourceSet, String dirType)
+ {
+ String generatedDirPath = getGeneratedDirPath(project, sourceSet, dirType);
+ project.getLogger().info("Delete generated directory {}", generatedDirPath);
+ project.delete(generatedDirPath);
+ }
+
+ private static <E extends Enum<E>> Class<E> getCompatibilityLevelClass(Project project)
+ {
+ ClassLoader generatorClassLoader = (ClassLoader) project.property(GENERATOR_CLASSLOADER_NAME);
+
+ String className = "com.linkedin.restli.tools.idlcheck.CompatibilityLevel";
+ try
+ {
+ @SuppressWarnings("unchecked")
+ Class<E> enumClass = (Class<E>) generatorClassLoader.loadClass(className).asSubclass(Enum.class);
+ return enumClass;
+ }
+ catch (ClassNotFoundException e)
+ {
+ throw new RuntimeException("Could not load class " + className);
+ }
+ }
+
+ private static void addGeneratedDir(Project project, SourceSet sourceSet, Collection<Configuration> configurations)
+ {
+ project.getPlugins().withType(IdeaPlugin.class, ideaPlugin -> {
+ IdeaModule ideaModule = ideaPlugin.getModel().getModule();
+ // stupid if block needed because of stupid assignment required to update source dirs
+ if (isTestSourceSet(sourceSet))
+ {
+ Set<File> sourceDirs = ideaModule.getTestSourceDirs();
+ sourceDirs.addAll(sourceSet.getJava().getSrcDirs());
+ // this is stupid but assignment is required
+ ideaModule.setTestSourceDirs(sourceDirs);
+ if (debug)
+ {
+ System.out.println("Added " + sourceSet.getJava().getSrcDirs() + " to IdeaModule testSourceDirs "
+ + ideaModule.getTestSourceDirs());
+ }
+ }
+ else
+ {
+ Set<File> sourceDirs = ideaModule.getSourceDirs();
+ sourceDirs.addAll(sourceSet.getJava().getSrcDirs());
+ // this is stupid but assignment is required
+ ideaModule.setSourceDirs(sourceDirs);
+ if (debug)
+ {
+ System.out.println("Added " + sourceSet.getJava().getSrcDirs() + " to IdeaModule sourceDirs "
+ + ideaModule.getSourceDirs());
+ }
+ }
+ Collection<Configuration> compilePlus = ideaModule.getScopes().get("COMPILE").get("plus");
+ compilePlus.addAll(configurations);
+ ideaModule.getScopes().get("COMPILE").put("plus", compilePlus);
+ });
+ }
+
+ private static void checkAvroSchemaExist(Project project, SourceSet sourceSet)
+ {
+ String sourceDir = "src" + File.separatorChar + sourceSet.getName();
+ File avroSourceDir = project.file(sourceDir + File.separatorChar + "avro");
+ if (avroSourceDir.exists())
+ {
+ project.getLogger().lifecycle("{}'s {} has non-empty avro directory. pegasus plugin does not process avro directory",
+ project.getName(), sourceDir);
+ }
+ }
+
+ // Compute the name of the source set that will contain a given type of generated code for an input source set.
+ // e.g. genType may be 'DataTemplate' or 'Rest'
+ private static String getGeneratedSourceSetName(SourceSet sourceSet, String genType)
+ {
+ return sourceSet.getName() + "Generated" + genType;
+ }
+
+ // Compute the directory name that will contain a given type of generated code for an input source set.
+ // e.g. genType may be 'DataTemplate' or 'Rest'
+ public static String getGeneratedDirPath(Project project, SourceSet sourceSet, String genType)
+ {
+ String override = getOverridePath(project, sourceSet, "overrideGeneratedDir");
+ String sourceSetName = getGeneratedSourceSetName(sourceSet, genType);
+ String base = override == null ? "src" : override;
+
+ return base + File.separatorChar + sourceSetName;
+ }
+
+ public static String getDataSchemaPath(Project project, SourceSet sourceSet)
+ {
+ String override = getOverridePath(project, sourceSet, "overridePegasusDir");
+ if (override == null)
+ {
+ return "src" + File.separatorChar + sourceSet.getName() + File.separatorChar + "pegasus";
+ }
+ else
+ {
+ return override;
+ }
+ }
+
+ private static String getExtensionSchemaPath(Project project, SourceSet sourceSet)
+ {
+ String override = getOverridePath(project, sourceSet, "overrideExtensionSchemaDir");
+ if(override == null)
+ {
+ return "src" + File.separatorChar + sourceSet.getName() + File.separatorChar + "extensions";
+ }
+ else
+ {
+ return override;
+ }
+ }
+
+ private static String getSnapshotPath(Project project, SourceSet sourceSet)
+ {
+ String override = getOverridePath(project, sourceSet, "overrideSnapshotDir");
+ if (override == null)
+ {
+ return "src" + File.separatorChar + sourceSet.getName() + File.separatorChar + "snapshot";
+ }
+ else
+ {
+ return override;
+ }
+ }
+
+ private static String getIdlPath(Project project, SourceSet sourceSet)
+ {
+ String override = getOverridePath(project, sourceSet, "overrideIdlDir");
+ if (override == null)
+ {
+ return "src" + File.separatorChar + sourceSet.getName() + File.separatorChar + "idl";
+ }
+ else
+ {
+ return override;
+ }
+ }
+
+ private static String getPegasusSchemaSnapshotPath(Project project, SourceSet sourceSet)
+ {
+ String override = getOverridePath(project, sourceSet, PEGASUS_SCHEMA_SNAPSHOT_DIR_OVERRIDE);
+ if (override == null)
+ {
+ return SRC + File.separatorChar + sourceSet.getName() + File.separatorChar + PEGASUS_SCHEMA_SNAPSHOT_DIR;
+ }
+ else
+ {
+ return override;
+ }
+ }
+
+ private static String getPegasusExtensionSchemaSnapshotPath(Project project, SourceSet sourceSet)
+ {
+ String override = getOverridePath(project, sourceSet, PEGASUS_EXTENSION_SCHEMA_SNAPSHOT_DIR_OVERRIDE);
+ if (override == null)
+ {
+ return SRC + File.separatorChar + sourceSet.getName() + File.separatorChar + PEGASUS_EXTENSION_SCHEMA_SNAPSHOT_DIR;
+ }
+ else
+ {
+ return override;
+ }
+ }
+
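+  // Resolves a directory override from project properties: the per-source-set property
+  // "<sourceSet>.<overridePropertyName>" wins (e.g. an illustrative -Ptest.overridePegasusDir=src/test/customPegasus);
+  // for the 'main' source set only, the plain project-wide property (e.g. -PoverridePegasusDir=custom/pegasus)
+  // is used as a fallback. The property values shown here are examples, not defaults.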
+ private static String getOverridePath(Project project, SourceSet sourceSet, String overridePropertyName)
+ {
+ String sourceSetPropertyName = sourceSet.getName() + '.' + overridePropertyName;
+ String override = getNonEmptyProperty(project, sourceSetPropertyName);
+
+ if (override == null && sourceSet.getName().equals("main"))
+ {
+ override = getNonEmptyProperty(project, overridePropertyName);
+ }
+
+ return override;
+ }
+
+ private static boolean isTestSourceSet(SourceSet sourceSet)
+ {
+ return TEST_DIR_REGEX.matcher(sourceSet.getName()).find();
+ }
+
+ private static Configuration getDataModelConfig(Project project, SourceSet sourceSet)
+ {
+ return isTestSourceSet(sourceSet)
+ ? project.getConfigurations().getByName("testDataModel")
+ : project.getConfigurations().getByName("dataModel");
+ }
+
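+  // A task counts as successful when it actually executed, was not skipped for any reason other than
+  // being up to date, and completed without a failure.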
+ private static boolean isTaskSuccessful(Task task)
+ {
+ return task.getState().getExecuted()
+        // A task is not successful if it was skipped for a reason other than being up to date.
+ && !(task.getState().getSkipped() && !task.getState().getUpToDate())
+ && task.getState().getFailure() == null;
+ }
+
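+  // Parses the compatibility report produced by the check tasks and returns true when no rest spec
+  // incompatibilities (and, unless restSpecOnly is set, no model incompatibilities) were recorded.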
+ private static boolean isResultEquivalent(File compatibilityLogFile)
+ {
+ return isResultEquivalent(compatibilityLogFile, false);
+ }
+
+ private static boolean isResultEquivalent(File compatibilityLogFile, boolean restSpecOnly)
+ {
+ CompatibilityLogChecker logChecker = new CompatibilityLogChecker();
+ try
+ {
+ logChecker.write(Files.readAllBytes(compatibilityLogFile.toPath()));
+ }
+ catch (IOException e)
+ {
+ throw new GradleException("Error while processing compatibility report: " + e.getMessage());
+ }
+ return logChecker.getRestSpecCompatibility().isEmpty() &&
+ (restSpecOnly || logChecker.getModelCompatibility().isEmpty());
+ }
+
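+  // Generates snapshot and idl files from the source set's server code, checks them for compatibility
+  // against the api project's checked-in copies, and publishes them back into the api project's
+  // snapshot/idl directories; the generated .restspec.json files are also packaged into the jar.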
+ protected void configureRestModelGeneration(Project project, SourceSet sourceSet)
+ {
+ if (sourceSet.getAllSource().isEmpty())
+ {
+ project.getLogger().info("No source files found for sourceSet {}. Skipping idl generation.", sourceSet.getName());
+ return;
+ }
+
+ // afterEvaluate needed so that api project can be overridden via ext.apiProject
+ project.afterEvaluate(p ->
+ {
+ // find api project here instead of in each project's plugin configuration
+ // this allows api project relation options (ext.api*) to be specified anywhere in the build.gradle file
+ // alternatively, pass closures to task configuration, and evaluate the closures when task is executed
+ Project apiProject = getCheckedApiProject(project);
+
+ // make sure the api project is evaluated. Important for configure-on-demand mode.
+ if (apiProject != null)
+ {
+ project.evaluationDependsOn(apiProject.getPath());
+
+ if (!apiProject.getPlugins().hasPlugin(_thisPluginType))
+ {
+ apiProject = null;
+ }
+ }
+
+ if (apiProject == null)
+ {
+ return;
+ }
+
+ Task untypedJarTask = project.getTasks().findByName(sourceSet.getJarTaskName());
+ if (!(untypedJarTask instanceof Jar))
+ {
+ return;
+ }
+ Jar jarTask = (Jar) untypedJarTask;
+
+ String snapshotCompatPropertyName = findProperty(FileCompatibilityType.SNAPSHOT);
+ if (project.hasProperty(snapshotCompatPropertyName) && "off".equalsIgnoreCase((String) project.property(snapshotCompatPropertyName)))
+ {
+        project.getLogger().lifecycle("Project {} snapshot compatibility level \"OFF\" is deprecated. Defaulting to \"IGNORE\".",
+ project.getPath());
+ }
+
+ // generate the rest model
+ FileCollection restModelCodegenClasspath = project.getConfigurations().getByName(PEGASUS_PLUGIN_CONFIGURATION)
+ .plus(project.getConfigurations().getByName(JavaPlugin.RUNTIME_CLASSPATH_CONFIGURATION_NAME))
+ .plus(sourceSet.getRuntimeClasspath());
+ String destinationDirPrefix = getGeneratedDirPath(project, sourceSet, REST_GEN_TYPE) + File.separatorChar;
+ FileCollection restModelResolverPath = apiProject.files(getDataSchemaPath(project, sourceSet))
+ .plus(getDataModelConfig(apiProject, sourceSet));
+      Set<File> watchedRestModelInputDirs = buildWatchedRestModelInputDirs(project, sourceSet);
+      Set<File> restModelInputDirs = difference(sourceSet.getAllSource().getSrcDirs(),
+          sourceSet.getResources().getSrcDirs());
+
+ Task generateRestModelTask = project.getTasks()
+ .create(sourceSet.getTaskName("generate", "restModel"), GenerateRestModelTask.class, task ->
+ {
+ task.dependsOn(project.getTasks().getByName(sourceSet.getClassesTaskName()));
+ task.setCodegenClasspath(restModelCodegenClasspath);
+ task.setWatchedCodegenClasspath(restModelCodegenClasspath
+ .filter(file -> !"main".equals(file.getName()) && !"classes".equals(file.getName())));
+ task.setInputDirs(restModelInputDirs);
+ task.setWatchedInputDirs(watchedRestModelInputDirs.isEmpty()
+ ? restModelInputDirs : watchedRestModelInputDirs);
+ // we need all the artifacts from runtime for any private implementation classes the server code might need.
+ task.setSnapshotDestinationDir(project.file(destinationDirPrefix + "snapshot"));
+ task.setIdlDestinationDir(project.file(destinationDirPrefix + "idl"));
+
+ @SuppressWarnings("unchecked")
+          Map<String, PegasusOptions> pegasusOptions = (Map<String, PegasusOptions>) project
+            .getExtensions().getExtraProperties().get("pegasus");
+ task.setIdlOptions(pegasusOptions.get(sourceSet.getName()).idlOptions);
+
+ task.setResolverPath(restModelResolverPath);
+ if (isPropertyTrue(project, ENABLE_ARG_FILE))
+ {
+ task.setEnableArgFile(true);
+ }
+
+ task.onlyIf(t -> !isPropertyTrue(project, SKIP_GENERATE_REST_MODEL));
+
+ task.doFirst(new CacheableAction<>(t -> deleteGeneratedDir(project, sourceSet, REST_GEN_TYPE)));
+ });
+
+ File apiSnapshotDir = apiProject.file(getSnapshotPath(apiProject, sourceSet));
+ File apiIdlDir = apiProject.file(getIdlPath(apiProject, sourceSet));
+ apiSnapshotDir.mkdirs();
+
+ if (!isPropertyTrue(project, SKIP_IDL_CHECK))
+ {
+ apiIdlDir.mkdirs();
+ }
+
+ CheckRestModelTask checkRestModelTask = project.getTasks()
+ .create(sourceSet.getTaskName("check", "RestModel"), CheckRestModelTask.class, task ->
+ {
+ task.dependsOn(generateRestModelTask);
+ task.setCurrentSnapshotFiles(SharedFileUtils.getSnapshotFiles(project, destinationDirPrefix));
+ task.setPreviousSnapshotDirectory(apiSnapshotDir);
+ task.setCurrentIdlFiles(SharedFileUtils.getIdlFiles(project, destinationDirPrefix));
+ task.setPreviousIdlDirectory(apiIdlDir);
+ task.setCodegenClasspath(project.getConfigurations().getByName(PEGASUS_PLUGIN_CONFIGURATION));
+ task.setModelCompatLevel(PropertyUtil.findCompatLevel(project, FileCompatibilityType.SNAPSHOT));
+ task.onlyIf(t -> !isPropertyTrue(project, SKIP_IDL_CHECK));
+
+ task.doLast(new CacheableAction<>(t ->
+ {
+ if (!task.isEquivalent())
+ {
+ _restModelCompatMessage.append(task.getWholeMessage());
+ }
+ }));
+ });
+
+ CheckSnapshotTask checkSnapshotTask = project.getTasks()
+ .create(sourceSet.getTaskName("check", "Snapshot"), CheckSnapshotTask.class, task -> {
+ task.dependsOn(generateRestModelTask);
+ task.setCurrentSnapshotFiles(SharedFileUtils.getSnapshotFiles(project, destinationDirPrefix));
+ task.setPreviousSnapshotDirectory(apiSnapshotDir);
+ task.setCodegenClasspath(project.getConfigurations().getByName(PEGASUS_PLUGIN_CONFIGURATION));
+ task.setSnapshotCompatLevel(PropertyUtil.findCompatLevel(project, FileCompatibilityType.SNAPSHOT));
+
+ task.onlyIf(t -> isPropertyTrue(project, SKIP_IDL_CHECK));
+ });
+
+ CheckIdlTask checkIdlTask = project.getTasks()
+ .create(sourceSet.getTaskName("check", "Idl"), CheckIdlTask.class, task ->
+ {
+ task.dependsOn(generateRestModelTask);
+ task.setCurrentIdlFiles(SharedFileUtils.getIdlFiles(project, destinationDirPrefix));
+ task.setPreviousIdlDirectory(apiIdlDir);
+ task.setResolverPath(restModelResolverPath);
+ task.setCodegenClasspath(project.getConfigurations().getByName(PEGASUS_PLUGIN_CONFIGURATION));
+ task.setIdlCompatLevel(PropertyUtil.findCompatLevel(project, FileCompatibilityType.IDL));
+ if (isPropertyTrue(project, ENABLE_ARG_FILE))
+ {
+ task.setEnableArgFile(true);
+ }
+
+
+ task.onlyIf(t -> !isPropertyTrue(project, SKIP_IDL_CHECK)
+ && !"OFF".equals(PropertyUtil.findCompatLevel(project, FileCompatibilityType.IDL)));
+ });
+
+ // rest model publishing involves cross-project reference
+ // configure after all projects have been evaluated
+ // the file copy can be turned off by "rest.model.noPublish" flag
+ Task publishRestliSnapshotTask = project.getTasks()
+ .create(sourceSet.getTaskName("publish", "RestliSnapshot"), PublishRestModelTask.class, task ->
+ {
+ task.dependsOn(checkRestModelTask, checkSnapshotTask, checkIdlTask);
+ task.from(SharedFileUtils.getSnapshotFiles(project, destinationDirPrefix));
+ task.into(apiSnapshotDir);
+ task.setSuffix(SNAPSHOT_FILE_SUFFIX);
+
+ task.onlyIf(t ->
+ isPropertyTrue(project, SNAPSHOT_FORCE_PUBLISH) ||
+ (
+ !isPropertyTrue(project, SNAPSHOT_NO_PUBLISH) &&
+ (
+ (
+ isPropertyTrue(project, SKIP_IDL_CHECK) &&
+ isTaskSuccessful(checkSnapshotTask) &&
+ checkSnapshotTask.getSummaryTarget().exists() &&
+ !isResultEquivalent(checkSnapshotTask.getSummaryTarget())
+ ) ||
+ (
+ !isPropertyTrue(project, SKIP_IDL_CHECK) &&
+ isTaskSuccessful(checkRestModelTask) &&
+ checkRestModelTask.getSummaryTarget().exists() &&
+ !isResultEquivalent(checkRestModelTask.getSummaryTarget())
+ )
+ ))
+ );
+ });
+
+ Task publishRestliIdlTask = project.getTasks()
+ .create(sourceSet.getTaskName("publish", "RestliIdl"), PublishRestModelTask.class, task -> {
+ task.dependsOn(checkRestModelTask, checkIdlTask, checkSnapshotTask);
+ task.from(SharedFileUtils.getIdlFiles(project, destinationDirPrefix));
+ task.into(apiIdlDir);
+ task.setSuffix(IDL_FILE_SUFFIX);
+
+ task.onlyIf(t ->
+ isPropertyTrue(project, IDL_FORCE_PUBLISH) ||
+ (
+ !isPropertyTrue(project, IDL_NO_PUBLISH) &&
+ (
+ (
+ isPropertyTrue(project, SKIP_IDL_CHECK) &&
+ isTaskSuccessful(checkSnapshotTask) &&
+ checkSnapshotTask.getSummaryTarget().exists() &&
+ !isResultEquivalent(checkSnapshotTask.getSummaryTarget(), true)
+ ) ||
+ (
+ !isPropertyTrue(project, SKIP_IDL_CHECK) &&
+ (
+ (isTaskSuccessful(checkRestModelTask) &&
+ checkRestModelTask.getSummaryTarget().exists() &&
+ !isResultEquivalent(checkRestModelTask.getSummaryTarget(), true)) ||
+ (isTaskSuccessful(checkIdlTask) &&
+ checkIdlTask.getSummaryTarget().exists() &&
+ !isResultEquivalent(checkIdlTask.getSummaryTarget()))
+ )
+ )
+ ))
+ );
+ });
+
+ project.getLogger().info("API project selected for {} is {}",
+ publishRestliIdlTask.getPath(), apiProject.getPath());
+
+ jarTask.from(SharedFileUtils.getIdlFiles(project, destinationDirPrefix));
+ // add generated .restspec.json files as resources to the jar
+ jarTask.dependsOn(publishRestliSnapshotTask, publishRestliIdlTask);
+
+ ChangedFileReportTask changedFileReportTask = (ChangedFileReportTask) project.getTasks()
+ .getByName("changedFilesReport");
+
+      // Use the files from the api project's idl and snapshot directories for the changed-file report,
+      // since the user only needs to be notified when checked-in source files are modified.
+ changedFileReportTask.setIdlFiles(SharedFileUtils.getSuffixedFiles(project, apiIdlDir, IDL_FILE_SUFFIX));
+ changedFileReportTask.setSnapshotFiles(SharedFileUtils.getSuffixedFiles(project, apiSnapshotDir,
+ SNAPSHOT_FILE_SUFFIX));
+ changedFileReportTask.mustRunAfter(publishRestliSnapshotTask, publishRestliIdlTask);
+ changedFileReportTask.doLast(new CacheableAction<>(t ->
+ {
+ if (!changedFileReportTask.getNeedCheckinFiles().isEmpty())
+ {
+ project.getLogger().info("Adding modified files to need checkin list...");
+ _needCheckinFiles.addAll(changedFileReportTask.getNeedCheckinFiles());
+ _needBuildFolders.add(getCheckedApiProject(project).getPath());
+ }
+ }));
+ });
+ }
+
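+  // Generates a pegasus (or extension) schema snapshot under the build directory, checks it against the
+  // checked-in snapshot directory, and hooks the publish task into the 'assemble' lifecycle task.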
+ protected void configurePegasusSchemaSnapshotGeneration(Project project, SourceSet sourceSet, boolean isExtensionSchema)
+ {
+    File schemaDir = isExtensionSchema ? project.file(getExtensionSchemaPath(project, sourceSet))
+ : project.file(getDataSchemaPath(project, sourceSet));
+
+ if ((isExtensionSchema && SharedFileUtils.getSuffixedFiles(project, schemaDir, PDL_FILE_SUFFIX).isEmpty()) ||
+ (!isExtensionSchema && SharedFileUtils.getSuffixedFiles(project, schemaDir, DATA_TEMPLATE_FILE_SUFFIXES).isEmpty()))
+ {
+ return;
+ }
+
+ Path publishablePegasusSchemaSnapshotDir = project.getBuildDir().toPath().resolve(sourceSet.getName() +
+ (isExtensionSchema ? PEGASUS_EXTENSION_SCHEMA_SNAPSHOT: PEGASUS_SCHEMA_SNAPSHOT));
+
+ Task generatePegasusSchemaSnapshot = generatePegasusSchemaSnapshot(project, sourceSet,
+ isExtensionSchema ? PEGASUS_EXTENSION_SCHEMA_SNAPSHOT: PEGASUS_SCHEMA_SNAPSHOT, schemaDir,
+ publishablePegasusSchemaSnapshotDir.toFile(), isExtensionSchema);
+
+ File pegasusSchemaSnapshotDir = project.file(isExtensionSchema ? getPegasusExtensionSchemaSnapshotPath(project, sourceSet)
+ : getPegasusSchemaSnapshotPath(project, sourceSet));
+ pegasusSchemaSnapshotDir.mkdirs();
+
+ Task checkSchemaSnapshot = project.getTasks().create(sourceSet.getTaskName("check",
+ isExtensionSchema ? PEGASUS_EXTENSION_SCHEMA_SNAPSHOT: PEGASUS_SCHEMA_SNAPSHOT),
+ CheckPegasusSnapshotTask.class, task ->
+ {
+ task.dependsOn(generatePegasusSchemaSnapshot);
+ task.setCurrentSnapshotDirectory(publishablePegasusSchemaSnapshotDir.toFile());
+ task.setPreviousSnapshotDirectory(pegasusSchemaSnapshotDir);
+ task.setCodegenClasspath(project.getConfigurations().getByName(PEGASUS_PLUGIN_CONFIGURATION)
+ .plus(project.getConfigurations().getByName(SCHEMA_ANNOTATION_HANDLER_CONFIGURATION))
+ .plus(project.getConfigurations().getByName(JavaPlugin.RUNTIME_CLASSPATH_CONFIGURATION_NAME)));
+ task.setCompatibilityLevel(isExtensionSchema ?
+ PropertyUtil.findCompatLevel(project, FileCompatibilityType.PEGASUS_EXTENSION_SCHEMA_SNAPSHOT)
+ :PropertyUtil.findCompatLevel(project, FileCompatibilityType.PEGASUS_SCHEMA_SNAPSHOT));
+ task.setCompatibilityMode(isExtensionSchema ? COMPATIBILITY_OPTIONS_MODE_EXTENSION :
+ PropertyUtil.findCompatMode(project, PEGASUS_COMPATIBILITY_MODE));
+ task.setExtensionSchema(isExtensionSchema);
+          task.setHandlerJarPath(project.getConfigurations().getByName(SCHEMA_ANNOTATION_HANDLER_CONFIGURATION));
+
+ task.onlyIf(t ->
+ {
+ String pegasusSnapshotCompatPropertyName = isExtensionSchema ?
+ findProperty(FileCompatibilityType.PEGASUS_EXTENSION_SCHEMA_SNAPSHOT)
+ : findProperty(FileCompatibilityType.PEGASUS_SCHEMA_SNAPSHOT);
+ return !project.hasProperty(pegasusSnapshotCompatPropertyName) ||
+ !"off".equalsIgnoreCase((String) project.property(pegasusSnapshotCompatPropertyName));
+ });
+ });
+
+ Task publishPegasusSchemaSnapshot = publishPegasusSchemaSnapshot(project, sourceSet,
+ isExtensionSchema ? PEGASUS_EXTENSION_SCHEMA_SNAPSHOT: PEGASUS_SCHEMA_SNAPSHOT, checkSchemaSnapshot,
+ publishablePegasusSchemaSnapshotDir.toFile(), pegasusSchemaSnapshotDir);
+
+ project.getTasks().getByName(LifecycleBasePlugin.ASSEMBLE_TASK_NAME).dependsOn(publishPegasusSchemaSnapshot);
+ }
+
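+  // Generates avro schema files from the source set's pegasus data schemas before compileJava runs and
+  // packages them into an avro schema jar exposed through the 'avroSchema' / 'testAvroSchema' artifacts.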
+ @SuppressWarnings("deprecation")
+ protected void configureAvroSchemaGeneration(Project project, SourceSet sourceSet)
+ {
+ File dataSchemaDir = project.file(getDataSchemaPath(project, sourceSet));
+ File avroDir = project.file(getGeneratedDirPath(project, sourceSet, AVRO_SCHEMA_GEN_TYPE)
+ + File.separatorChar + "avro");
+
+ // generate avro schema files from data schema
+ Task generateAvroSchemaTask = project.getTasks()
+ .create(sourceSet.getTaskName("generate", "avroSchema"), GenerateAvroSchemaTask.class, task -> {
+ task.setInputDir(dataSchemaDir);
+ task.setDestinationDir(avroDir);
+ task.setResolverPath(getDataModelConfig(project, sourceSet));
+ task.setCodegenClasspath(project.getConfigurations().getByName(PEGASUS_PLUGIN_CONFIGURATION));
+ if (isPropertyTrue(project, ENABLE_ARG_FILE))
+ {
+ task.setEnableArgFile(true);
+ }
+
+ task.onlyIf(t ->
+ {
+ if (task.getInputDir().exists())
+ {
+ @SuppressWarnings("unchecked")
+            Map<String, PegasusOptions> pegasusOptions = (Map<String, PegasusOptions>) project
+                .getExtensions().getExtraProperties().get("pegasus");
+
+ if (pegasusOptions.get(sourceSet.getName()).hasGenerationMode(PegasusOptions.GenerationMode.AVRO))
+ {
+ return true;
+ }
+ }
+
+ return !project.getConfigurations().getByName("avroSchemaGenerator").isEmpty();
+ });
+
+ task.doFirst(new CacheableAction<>(t -> deleteGeneratedDir(project, sourceSet, AVRO_SCHEMA_GEN_TYPE)));
+ });
+
+ project.getTasks().getByName(sourceSet.getCompileJavaTaskName()).dependsOn(generateAvroSchemaTask);
+
+ // create avro schema jar file
+
+ Task avroSchemaJarTask = project.getTasks().create(sourceSet.getName() + "AvroSchemaJar", Jar.class, task ->
+ {
+ // add path prefix to each file in the data schema directory
+ task.from(avroDir, copySpec ->
+ copySpec.eachFile(fileCopyDetails ->
+ fileCopyDetails.setPath("avro" + File.separatorChar + fileCopyDetails.getPath())));
+
+ // FIXME change to #getArchiveAppendix().set(...); breaks backwards-compatibility before 5.1
+ // DataHub Note - applied FIXME
+ task.getArchiveAppendix().set(getAppendix(sourceSet, "avro-schema"));
+ task.setDescription("Generate an avro schema jar");
+ });
+
+ if (!isTestSourceSet(sourceSet))
+ {
+ project.getArtifacts().add("avroSchema", avroSchemaJarTask);
+ }
+ else
+ {
+ project.getArtifacts().add("testAvroSchema", avroSchemaJarTask);
+ }
+ }
+
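+  // Registers helper tasks for migrating schemas between PDSC and PDL and for reformatting PDL in place.
+  // For the 'main' source set these are expected to surface as 'convertToPdl' and 'reformatPdl' (names
+  // follow Gradle's task-name derivation); the conversion direction can be flipped with
+  // -PconvertToPdl.reverse=true.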
+ protected void configureConversionUtilities(Project project, SourceSet sourceSet)
+ {
+ File dataSchemaDir = project.file(getDataSchemaPath(project, sourceSet));
+ boolean reverse = isPropertyTrue(project, CONVERT_TO_PDL_REVERSE);
+ boolean keepOriginal = isPropertyTrue(project, CONVERT_TO_PDL_KEEP_ORIGINAL);
+ boolean skipVerification = isPropertyTrue(project, CONVERT_TO_PDL_SKIP_VERIFICATION);
+ String preserveSourceCmd = getNonEmptyProperty(project, CONVERT_TO_PDL_PRESERVE_SOURCE_CMD);
+
+ // Utility task for migrating between PDSC and PDL.
+ project.getTasks().create(sourceSet.getTaskName("convert", "ToPdl"), TranslateSchemasTask.class, task ->
+ {
+ task.setInputDir(dataSchemaDir);
+ task.setDestinationDir(dataSchemaDir);
+ task.setResolverPath(getDataModelConfig(project, sourceSet));
+ task.setCodegenClasspath(project.getConfigurations().getByName(PEGASUS_PLUGIN_CONFIGURATION));
+ task.setPreserveSourceCmd(preserveSourceCmd);
+ if (reverse)
+ {
+ task.setSourceFormat(SchemaFileType.PDL);
+ task.setDestinationFormat(SchemaFileType.PDSC);
+ }
+ else
+ {
+ task.setSourceFormat(SchemaFileType.PDSC);
+ task.setDestinationFormat(SchemaFileType.PDL);
+ }
+ task.setKeepOriginal(keepOriginal);
+ task.setSkipVerification(skipVerification);
+ if (isPropertyTrue(project, ENABLE_ARG_FILE))
+ {
+ task.setEnableArgFile(true);
+ }
+
+ task.onlyIf(t -> task.getInputDir().exists());
+ task.doLast(new CacheableAction<>(t ->
+ {
+ project.getLogger().lifecycle("Pegasus schema conversion complete.");
+ project.getLogger().lifecycle("All pegasus schema files in " + dataSchemaDir + " have been converted");
+ project.getLogger().lifecycle("You can use '-PconvertToPdl.reverse=true|false' to change the direction of conversion.");
+ }));
+ });
+
+ // Helper task for reformatting existing PDL schemas by generating them again.
+ project.getTasks().create(sourceSet.getTaskName("reformat", "Pdl"), TranslateSchemasTask.class, task ->
+ {
+ task.setInputDir(dataSchemaDir);
+ task.setDestinationDir(dataSchemaDir);
+ task.setResolverPath(getDataModelConfig(project, sourceSet));
+ task.setCodegenClasspath(project.getConfigurations().getByName(PEGASUS_PLUGIN_CONFIGURATION));
+ task.setSourceFormat(SchemaFileType.PDL);
+ task.setDestinationFormat(SchemaFileType.PDL);
+ task.setKeepOriginal(true);
+ task.setSkipVerification(true);
+ if (isPropertyTrue(project, ENABLE_ARG_FILE))
+ {
+ task.setEnableArgFile(true);
+ }
+
+ task.onlyIf(t -> task.getInputDir().exists());
+ task.doLast(new CacheableAction<>(t -> project.getLogger().lifecycle("PDL reformat complete.")));
+ });
+ }
+
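+  // Generates Java data template sources from the pegasus schemas into a dedicated
+  // <sourceSet>GeneratedDataTemplate source set, packages the compiled templates and the schemas into a
+  // data template jar, and wires that jar into the input source set's compile configuration.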
+ @SuppressWarnings("deprecation")
+ protected GenerateDataTemplateTask configureDataTemplateGeneration(Project project, SourceSet sourceSet)
+ {
+ File dataSchemaDir = project.file(getDataSchemaPath(project, sourceSet));
+ File generatedDataTemplateDir = project.file(getGeneratedDirPath(project, sourceSet, DATA_TEMPLATE_GEN_TYPE)
+ + File.separatorChar + "java");
+ File publishableSchemasBuildDir = project.file(project.getBuildDir().getAbsolutePath()
+ + File.separatorChar + sourceSet.getName() + "Schemas");
+ File publishableLegacySchemasBuildDir = project.file(project.getBuildDir().getAbsolutePath()
+ + File.separatorChar + sourceSet.getName() + "LegacySchemas");
+ File publishableExtensionSchemasBuildDir = project.file(project.getBuildDir().getAbsolutePath()
+ + File.separatorChar + sourceSet.getName() + "ExtensionSchemas");
+
+ // generate data template source files from data schema
+ GenerateDataTemplateTask generateDataTemplatesTask = project.getTasks()
+ .create(sourceSet.getTaskName("generate", "dataTemplate"), GenerateDataTemplateTask.class, task ->
+ {
+ task.setInputDir(dataSchemaDir);
+ task.setDestinationDir(generatedDataTemplateDir);
+ task.setResolverPath(getDataModelConfig(project, sourceSet));
+ task.setCodegenClasspath(project.getConfigurations().getByName(PEGASUS_PLUGIN_CONFIGURATION));
+ if (isPropertyTrue(project, ENABLE_ARG_FILE))
+ {
+ task.setEnableArgFile(true);
+ }
+ if (isPropertyTrue(project, CODE_GEN_PATH_CASE_SENSITIVE))
+ {
+ task.setGenerateLowercasePath(false);
+ }
+
+ task.onlyIf(t ->
+ {
+ if (task.getInputDir().exists())
+ {
+ @SuppressWarnings("unchecked")
+            Map<String, PegasusOptions> pegasusOptions = (Map<String, PegasusOptions>) project
+              .getExtensions().getExtraProperties().get("pegasus");
+
+ return pegasusOptions.get(sourceSet.getName()).hasGenerationMode(PegasusOptions.GenerationMode.PEGASUS);
+ }
+
+ return false;
+ });
+
+ task.doFirst(new CacheableAction<>(t -> deleteGeneratedDir(project, sourceSet, DATA_TEMPLATE_GEN_TYPE)));
+ });
+
+ // TODO: Tighten the types so that _generateSourcesJarTask must be of type Jar.
+ ((Jar) _generateSourcesJarTask).from(generateDataTemplatesTask.getDestinationDir());
+ _generateSourcesJarTask.dependsOn(generateDataTemplatesTask);
+
+ _generateJavadocTask.source(generateDataTemplatesTask.getDestinationDir());
+ _generateJavadocTask.setClasspath(_generateJavadocTask.getClasspath()
+ .plus(project.getConfigurations().getByName("dataTemplateCompile"))
+ .plus(generateDataTemplatesTask.getResolverPath()));
+ _generateJavadocTask.dependsOn(generateDataTemplatesTask);
+
+ // Add extra dependencies for data model compilation
+ project.getDependencies().add("dataTemplateCompile", "com.google.code.findbugs:jsr305:3.0.2");
+
+ // create new source set for generated java source and class files
+ String targetSourceSetName = getGeneratedSourceSetName(sourceSet, DATA_TEMPLATE_GEN_TYPE);
+
+ SourceSetContainer sourceSets = project.getConvention()
+ .getPlugin(JavaPluginConvention.class).getSourceSets();
+
+ SourceSet targetSourceSet = sourceSets.create(targetSourceSetName, ss ->
+ {
+ ss.java(sourceDirectorySet -> sourceDirectorySet.srcDir(generatedDataTemplateDir));
+ ss.setCompileClasspath(getDataModelConfig(project, sourceSet)
+ .plus(project.getConfigurations().getByName("dataTemplateCompile")));
+ });
+
+ // idea plugin needs to know about new generated java source directory and its dependencies
+ addGeneratedDir(project, targetSourceSet, Arrays.asList(
+ getDataModelConfig(project, sourceSet),
+ project.getConfigurations().getByName("dataTemplateCompile")));
+
+ // Set source compatibility to 1.8 as the data-templates now generate code with Java 8 features.
+ JavaCompile compileTask = project.getTasks()
+ .withType(JavaCompile.class).getByName(targetSourceSet.getCompileJavaTaskName());
+ compileTask.doFirst(new CacheableAction<>(task -> {
+ ((JavaCompile) task).setSourceCompatibility("1.8");
+ ((JavaCompile) task).setTargetCompatibility("1.8");
+ }));
+ // make sure that java source files have been generated before compiling them
+ compileTask.dependsOn(generateDataTemplatesTask);
+
+ // Dummy task to maintain backward compatibility
+ // TODO: Delete this task once use cases have had time to reference the new task
+ Task destroyStaleFiles = project.getTasks().create(sourceSet.getName() + "DestroyStaleFiles", Delete.class);
+ destroyStaleFiles.onlyIf(task -> {
+ project.getLogger().lifecycle("{} task is a NO-OP task.", task.getPath());
+ return false;
+ });
+
+ // Dummy task to maintain backward compatibility, as this task was replaced by CopySchemas
+ // TODO: Delete this task once use cases have had time to reference the new task
+ Task copyPdscSchemasTask = project.getTasks().create(sourceSet.getName() + "CopyPdscSchemas", Copy.class);
+ copyPdscSchemasTask.dependsOn(destroyStaleFiles);
+ copyPdscSchemasTask.onlyIf(task -> {
+ project.getLogger().lifecycle("{} task is a NO-OP task.", task.getPath());
+ return false;
+ });
+
+ // Prepare schema files for publication by syncing schema folders.
+ Task prepareSchemasForPublishTask = project.getTasks()
+ .create(sourceSet.getName() + "CopySchemas", Sync.class, task ->
+ {
+ task.from(dataSchemaDir, syncSpec -> DATA_TEMPLATE_FILE_SUFFIXES.forEach(suffix -> syncSpec.include("**/*" + suffix)));
+ task.into(publishableSchemasBuildDir);
+ });
+ prepareSchemasForPublishTask.dependsOn(copyPdscSchemasTask);
+
+ Collection dataTemplateJarDepends = new ArrayList<>();
+ dataTemplateJarDepends.add(compileTask);
+ dataTemplateJarDepends.add(prepareSchemasForPublishTask);
+
+ // Convert all PDL files back to PDSC for publication
+ // TODO: Remove this conversion permanently once translated PDSCs are no longer needed.
+ Task prepareLegacySchemasForPublishTask = project.getTasks()
+ .create(sourceSet.getName() + "TranslateSchemas", TranslateSchemasTask.class, task ->
+ {
+ task.setInputDir(dataSchemaDir);
+ task.setDestinationDir(publishableLegacySchemasBuildDir);
+ task.setResolverPath(getDataModelConfig(project, sourceSet));
+ task.setCodegenClasspath(project.getConfigurations().getByName(PEGASUS_PLUGIN_CONFIGURATION));
+ task.setSourceFormat(SchemaFileType.PDL);
+ task.setDestinationFormat(SchemaFileType.PDSC);
+ task.setKeepOriginal(true);
+ task.setSkipVerification(true);
+ if (isPropertyTrue(project, ENABLE_ARG_FILE))
+ {
+ task.setEnableArgFile(true);
+ }
+ });
+
+ prepareLegacySchemasForPublishTask.dependsOn(destroyStaleFiles);
+ dataTemplateJarDepends.add(prepareLegacySchemasForPublishTask);
+
+ // extension schema directory
+ File extensionSchemaDir = project.file(getExtensionSchemaPath(project, sourceSet));
+
+ if (!SharedFileUtils.getSuffixedFiles(project, extensionSchemaDir, PDL_FILE_SUFFIX).isEmpty())
+ {
+ // Validate extension schemas if extension schemas are provided.
+ ValidateExtensionSchemaTask validateExtensionSchemaTask = project.getTasks()
+ .create(sourceSet.getTaskName("validate", "ExtensionSchemas"), ValidateExtensionSchemaTask.class, task ->
+ {
+ task.setInputDir(extensionSchemaDir);
+ task.setResolverPath(
+ getDataModelConfig(project, sourceSet).plus(project.files(getDataSchemaPath(project, sourceSet))));
+ task.setClassPath(project.getConfigurations().getByName(PEGASUS_PLUGIN_CONFIGURATION));
+ if (isPropertyTrue(project, ENABLE_ARG_FILE))
+ {
+ task.setEnableArgFile(true);
+ }
+ });
+
+ Task prepareExtensionSchemasForPublishTask = project.getTasks()
+ .create(sourceSet.getName() + "CopyExtensionSchemas", Sync.class, task ->
+ {
+ task.from(extensionSchemaDir, syncSpec -> syncSpec.include("**/*" + PDL_FILE_SUFFIX));
+ task.into(publishableExtensionSchemasBuildDir);
+ });
+
+ prepareExtensionSchemasForPublishTask.dependsOn(validateExtensionSchemaTask);
+ prepareExtensionSchemasForPublishTask.dependsOn(copyPdscSchemasTask);
+ dataTemplateJarDepends.add(prepareExtensionSchemasForPublishTask);
+ }
+
+ // include pegasus files in the output of this SourceSet
+ project.getTasks().withType(ProcessResources.class).getByName(targetSourceSet.getProcessResourcesTaskName(), it ->
+ {
+ it.from(prepareSchemasForPublishTask, copy -> copy.into("pegasus"));
+ // TODO: Remove this permanently once translated PDSCs are no longer needed.
+ it.from(prepareLegacySchemasForPublishTask, copy -> copy.into(TRANSLATED_SCHEMAS_DIR));
+ Sync copyExtensionSchemasTask = project.getTasks().withType(Sync.class).findByName(sourceSet.getName() + "CopyExtensionSchemas");
+ if (copyExtensionSchemasTask != null)
+ {
+ it.from(copyExtensionSchemasTask, copy -> copy.into("extensions"));
+ }
+ });
+
+ // create data template jar file
+ Jar dataTemplateJarTask = project.getTasks()
+ .create(sourceSet.getName() + "DataTemplateJar", Jar.class, task ->
+ {
+ task.dependsOn(dataTemplateJarDepends);
+ task.from(targetSourceSet.getOutput());
+
+ // FIXME change to #getArchiveAppendix().set(...); breaks backwards-compatibility before 5.1
+ // DataHub Note - applied FIXME
+ task.getArchiveAppendix().set(getAppendix(sourceSet, "data-template"));
+ task.setDescription("Generate a data template jar");
+ });
+
+    // add the data model and data template jars to the list of project artifacts.
+ if (!isTestSourceSet(sourceSet))
+ {
+ project.getArtifacts().add("dataTemplate", dataTemplateJarTask);
+ }
+ else
+ {
+ project.getArtifacts().add("testDataTemplate", dataTemplateJarTask);
+ }
+
+    // include additional dependencies in the appropriate configuration used to compile the input source set;
+    // the generated data template classes and their dependencies must be added to that configuration.
+    // The "compile" and "testCompile" configurations were removed in Gradle 7, so to keep maximum backward
+    // compatibility we handle Gradle 7+ and earlier versions differently.
+    // Once MIN_REQUIRED_VERSION reaches 7.0, the isAtLeastGradle7() check can be removed.
+ String compileConfigName;
+ if (isAtLeastGradle7()) {
+ compileConfigName = isTestSourceSet(sourceSet) ? "testImplementation" : project.getConfigurations().findByName("api") != null ? "api" : "implementation";
+ }
+ else
+ {
+ compileConfigName = isTestSourceSet(sourceSet) ? "testCompile" : "compile";
+ }
+
+ Configuration compileConfig = project.getConfigurations().maybeCreate(compileConfigName);
+ compileConfig.extendsFrom(
+ getDataModelConfig(project, sourceSet),
+ project.getConfigurations().getByName("dataTemplateCompile"));
+
+    // The getArchivePath() API does not carry any task dependency and has been deprecated.
+    // Use getArchiveFile() on Gradle 7+, but keep getArchivePath() for backwards compatibility
+    // with Gradle versions older than 5.1.
+    // DataHub Note - applied FIXME
+ project.getDependencies().add(compileConfigName, project.files(
+ isAtLeastGradle7() ? dataTemplateJarTask.getArchiveFile() : dataTemplateJarTask.getArchivePath()));
+
+ if (_configureIvyPublications) {
+      // The Action below is only applied when the 'ivy-publish' plugin is applied by the consumer.
+      // If the consumer does not use ivy-publish, this is a no-op.
+      // The Action prepares the project applying the pegasus plugin to publish artifacts using these steps:
+ // 1. Registers "feature variants" for pegasus-specific artifacts;
+ // see https://docs.gradle.org/6.1/userguide/feature_variants.html
+ // 2. Wires legacy configurations like `dataTemplateCompile` to auto-generated feature variant *Api and
+ // *Implementation configurations for backwards compatibility.
+ // 3. Configures the Ivy Publication to include auto-generated feature variant *Api and *Implementation
+ // configurations and their dependencies.
+ project.getPlugins().withType(IvyPublishPlugin.class, ivyPublish -> {
+ if (!isAtLeastGradle61())
+ {
+ throw new GradleException("Using the ivy-publish plugin with the pegasus plugin requires Gradle 6.1 or higher " +
+ "at build time. Please upgrade.");
+ }
+
+ JavaPluginExtension java = project.getExtensions().getByType(JavaPluginExtension.class);
+ // create new capabilities per source set; automatically creates api and implementation configurations
+ String featureName = mapSourceSetToFeatureName(targetSourceSet);
+ try
+ {
+ /*
+ reflection is required to preserve compatibility with Gradle 5.2.1 and below
+ TODO once Gradle 5.3+ is required, remove reflection and replace with:
+ java.registerFeature(featureName, featureSpec -> {
+ featureSpec.usingSourceSet(targetSourceSet);
+ });
+ */
+ Method registerFeature = JavaPluginExtension.class.getDeclaredMethod("registerFeature", String.class, Action.class);
+          Action<?> /*<org.gradle.api.plugins.FeatureSpec>*/ featureSpecAction = createFeatureVariantFromSourceSet(targetSourceSet);
+ registerFeature.invoke(java, featureName, featureSpecAction);
+ }
+ catch (ReflectiveOperationException e)
+ {
+ throw new GradleException("Unable to register new feature variant", e);
+ }
+
+ // expose transitive dependencies to consumers via variant configurations
+ Configuration featureConfiguration = project.getConfigurations().getByName(featureName);
+ Configuration mainGeneratedDataTemplateApi = project.getConfigurations().getByName(targetSourceSet.getApiConfigurationName());
+ featureConfiguration.extendsFrom(mainGeneratedDataTemplateApi);
+ mainGeneratedDataTemplateApi.extendsFrom(
+ getDataModelConfig(project, targetSourceSet),
+ project.getConfigurations().getByName("dataTemplateCompile"));
+
+ // Configure the existing IvyPublication
+ // For backwards-compatibility, make the legacy dataTemplate/testDataTemplate configurations extend
+ // their replacements, auto-created when we registered the new feature variant
+ project.afterEvaluate(p -> {
+ PublishingExtension publishing = p.getExtensions().getByType(PublishingExtension.class);
+ // When configuring a Gradle Publication, use this value to find the name of the publication to configure. Defaults to "ivy".
+ String publicationName = p.getExtensions().getExtraProperties().getProperties().getOrDefault("PegasusPublicationName", "ivy").toString();
+ IvyPublication ivyPublication = publishing.getPublications().withType(IvyPublication.class).getByName(publicationName);
+ ivyPublication.configurations(configurations -> configurations.create(featureName, legacyConfiguration -> {
+ legacyConfiguration.extend(p.getConfigurations().getByName(targetSourceSet.getApiElementsConfigurationName()).getName());
+ legacyConfiguration.extend(p.getConfigurations().getByName(targetSourceSet.getRuntimeElementsConfigurationName()).getName());
+ }));
+ });
+ });
+ }
+
+ if (debug)
+ {
+ System.out.println("configureDataTemplateGeneration sourceSet " + sourceSet.getName());
+ System.out.println(compileConfigName + ".allDependencies : "
+ + project.getConfigurations().getByName(compileConfigName).getAllDependencies());
+ System.out.println(compileConfigName + ".extendsFrom: "
+ + project.getConfigurations().getByName(compileConfigName).getExtendsFrom());
+ System.out.println(compileConfigName + ".transitive: "
+ + project.getConfigurations().getByName(compileConfigName).isTransitive());
+ }
+
+ project.getTasks().getByName(sourceSet.getCompileJavaTaskName()).dependsOn(dataTemplateJarTask);
+ return generateDataTemplatesTask;
+ }
+
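+  // Maps a generated source set to the feature-variant name used when registering Gradle feature
+  // variants for ivy publication.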
+ private String mapSourceSetToFeatureName(SourceSet sourceSet) {
+ String featureName = "";
+ switch (sourceSet.getName()) {
+ case "mainGeneratedDataTemplate":
+ featureName = "dataTemplate";
+ break;
+ case "testGeneratedDataTemplate":
+ featureName = "testDataTemplate";
+ break;
+ case "mainGeneratedRest":
+ featureName = "restClient";
+ break;
+ case "testGeneratedRest":
+ featureName = "testRestClient";
+ break;
+ case "mainGeneratedAvroSchema":
+ featureName = "avroSchema";
+ break;
+ case "testGeneratedAvroSchema":
+ featureName = "testAvroSchema";
+ break;
+ default:
+ String msg = String.format("Unable to map %s to an appropriate feature name", sourceSet);
+ throw new GradleException(msg);
+ }
+ return featureName;
+ }
+
+ // Generate rest client from idl files generated from java source files in the specified source set.
+ //
+  // This generates rest client source files from the idl files generated from the java source files
+  // in the source set. The generated rest client source files will be in a new source set.
+ // It also compiles the rest client source files into classes, and creates both the
+ // rest model and rest client jar files.
+ //
+ @SuppressWarnings("deprecation")
+ protected void configureRestClientGeneration(Project project, SourceSet sourceSet)
+ {
+ // idl directory for api project
+ File idlDir = project.file(getIdlPath(project, sourceSet));
+ if (SharedFileUtils.getSuffixedFiles(project, idlDir, IDL_FILE_SUFFIX).isEmpty() && !isPropertyTrue(project,
+ PROCESS_EMPTY_IDL_DIR))
+ {
+ return;
+ }
+ File generatedRestClientDir = project.file(getGeneratedDirPath(project, sourceSet, REST_GEN_TYPE)
+ + File.separatorChar + "java");
+
+ // always include imported data template jars in compileClasspath of rest client
+ FileCollection dataModelConfig = getDataModelConfig(project, sourceSet);
+
+ // if data templates generated from this source set, add the generated data template jar to compileClasspath
+ // of rest client.
+ String dataTemplateSourceSetName = getGeneratedSourceSetName(sourceSet, DATA_TEMPLATE_GEN_TYPE);
+
+ Jar dataTemplateJarTask = null;
+
+ SourceSetContainer sourceSets = project.getConvention()
+ .getPlugin(JavaPluginConvention.class).getSourceSets();
+
+ FileCollection dataModels;
+ if (sourceSets.findByName(dataTemplateSourceSetName) != null)
+ {
+ if (debug)
+ {
+ System.out.println("sourceSet " + sourceSet.getName() + " has generated sourceSet " + dataTemplateSourceSetName);
+ }
+ dataTemplateJarTask = (Jar) project.getTasks().getByName(sourceSet.getName() + "DataTemplateJar");
+      // The getArchivePath() API does not carry any task dependency and has been deprecated.
+      // Use getArchiveFile() on Gradle 7+, but keep getArchivePath() for backwards compatibility
+      // with Gradle versions older than 5.1.
+      // DataHub Note - applied FIXME
+ dataModels = dataModelConfig.plus(project.files(
+ isAtLeastGradle7() ? dataTemplateJarTask.getArchiveFile() : dataTemplateJarTask.getArchivePath()));
+ }
+ else
+ {
+ dataModels = dataModelConfig;
+ }
+
+ // create source set for generated rest model, rest client source and class files.
+ String targetSourceSetName = getGeneratedSourceSetName(sourceSet, REST_GEN_TYPE);
+ SourceSet targetSourceSet = sourceSets.create(targetSourceSetName, ss ->
+ {
+ ss.java(sourceDirectorySet -> sourceDirectorySet.srcDir(generatedRestClientDir));
+ ss.setCompileClasspath(dataModels.plus(project.getConfigurations().getByName("restClientCompile")));
+ });
+
+ project.getPlugins().withType(EclipsePlugin.class, eclipsePlugin -> {
+ EclipseModel eclipseModel = (EclipseModel) project.getExtensions().findByName("eclipse");
+ eclipseModel.getClasspath().getPlusConfigurations()
+ .add(project.getConfigurations().getByName("restClientCompile"));
+ });
+
+ // idea plugin needs to know about new rest client source directory and its dependencies
+ addGeneratedDir(project, targetSourceSet, Arrays.asList(
+ getDataModelConfig(project, sourceSet),
+ project.getConfigurations().getByName("restClientCompile")));
+
+ // generate the rest client source files
+ GenerateRestClientTask generateRestClientTask = project.getTasks()
+ .create(targetSourceSet.getTaskName("generate", "restClient"), GenerateRestClientTask.class, task ->
+ {
+ task.dependsOn(project.getConfigurations().getByName("dataTemplate"));
+ task.setInputDir(idlDir);
+ task.setResolverPath(dataModels.plus(project.getConfigurations().getByName("restClientCompile")));
+ task.setRuntimeClasspath(project.getConfigurations().getByName("dataModel")
+ .plus(project.getConfigurations().getByName("dataTemplate").getArtifacts().getFiles()));
+ task.setCodegenClasspath(project.getConfigurations().getByName(PEGASUS_PLUGIN_CONFIGURATION));
+ task.setDestinationDir(generatedRestClientDir);
+ task.setRestli2FormatSuppressed(project.hasProperty(SUPPRESS_REST_CLIENT_RESTLI_2));
+ task.setRestli1FormatSuppressed(project.hasProperty(SUPPRESS_REST_CLIENT_RESTLI_1));
+ if (isPropertyTrue(project, ENABLE_ARG_FILE))
+ {
+ task.setEnableArgFile(true);
+ }
+ if (isPropertyTrue(project, CODE_GEN_PATH_CASE_SENSITIVE))
+ {
+ task.setGenerateLowercasePath(false);
+ }
+ if (isPropertyTrue(project, ENABLE_FLUENT_API))
+ {
+ task.setGenerateFluentApi(true);
+ }
+ task.doFirst(new CacheableAction<>(t -> project.delete(generatedRestClientDir)));
+ });
+
+ if (dataTemplateJarTask != null)
+ {
+ generateRestClientTask.dependsOn(dataTemplateJarTask);
+ }
+
+ // TODO: Tighten the types so that _generateSourcesJarTask must be of type Jar.
+ ((Jar) _generateSourcesJarTask).from(generateRestClientTask.getDestinationDir());
+ _generateSourcesJarTask.dependsOn(generateRestClientTask);
+
+ _generateJavadocTask.source(generateRestClientTask.getDestinationDir());
+ _generateJavadocTask.setClasspath(_generateJavadocTask.getClasspath()
+ .plus(project.getConfigurations().getByName("restClientCompile"))
+ .plus(generateRestClientTask.getResolverPath()));
+ _generateJavadocTask.dependsOn(generateRestClientTask);
+
+ // make sure rest client source files have been generated before compiling them
+ JavaCompile compileGeneratedRestClientTask = (JavaCompile) project.getTasks()
+ .getByName(targetSourceSet.getCompileJavaTaskName());
+ compileGeneratedRestClientTask.dependsOn(generateRestClientTask);
+ compileGeneratedRestClientTask.getOptions().getCompilerArgs().add("-Xlint:-deprecation");
+
+ // create the rest model jar file
+ Task restModelJarTask = project.getTasks().create(sourceSet.getName() + "RestModelJar", Jar.class, task ->
+ {
+ task.from(idlDir, copySpec ->
+ {
+ copySpec.eachFile(fileCopyDetails -> project.getLogger()
+ .info("Add idl file: {}", fileCopyDetails));
+ copySpec.setIncludes(Collections.singletonList('*' + IDL_FILE_SUFFIX));
+ });
+ // FIXME change to #getArchiveAppendix().set(...); breaks backwards-compatibility before 5.1
+ // DataHub Note - applied FIXME
+ task.getArchiveAppendix().set(getAppendix(sourceSet, "rest-model"));
+ task.setDescription("Generate rest model jar");
+ });
+
+ // create the rest client jar file
+ Task restClientJarTask = project.getTasks()
+ .create(sourceSet.getName() + "RestClientJar", Jar.class, task ->
+ {
+ task.dependsOn(compileGeneratedRestClientTask);
+ task.from(idlDir, copySpec -> {
+ copySpec.eachFile(fileCopyDetails -> {
+ project.getLogger().info("Add interface file: {}", fileCopyDetails);
+ fileCopyDetails.setPath("idl" + File.separatorChar + fileCopyDetails.getPath());
+ });
+ copySpec.setIncludes(Collections.singletonList('*' + IDL_FILE_SUFFIX));
+ });
+ task.from(targetSourceSet.getOutput());
+ // FIXME change to #getArchiveAppendix().set(...); breaks backwards-compatibility before 5.1
+ // DataHub Note - applied FIXME
+ task.getArchiveAppendix().set(getAppendix(sourceSet, "rest-client"));
+ task.setDescription("Generate rest client jar");
+ });
+
+ // add the rest model jar and the rest client jar to the list of project artifacts.
+ if (!isTestSourceSet(sourceSet))
+ {
+ project.getArtifacts().add("restModel", restModelJarTask);
+ project.getArtifacts().add("restClient", restClientJarTask);
+ }
+ else
+ {
+ project.getArtifacts().add("testRestModel", restModelJarTask);
+ project.getArtifacts().add("testRestClient", restClientJarTask);
+ }
+ }
+
+ // Return the appendix for generated jar files.
+ // The source set name is not included for the main source set.
+ private static String getAppendix(SourceSet sourceSet, String suffix)
+ {
+ return sourceSet.getName().equals("main") ? suffix : sourceSet.getName() + '-' + suffix;
+ }
+
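+  // Locates the api project: an explicit ext.apiProject wins; otherwise known suffixes (or
+  // ext.apiProjectSubstitutionSuffixes) are replaced with "-api". Illustrative consumer configuration
+  // (project path and suffix are hypothetical):
+  //   ext.apiProject = project(':example-api')
+  //   ext.apiProjectSubstitutionSuffixes = ['-server']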
+ private static Project getApiProject(Project project)
+ {
+ if (project.getExtensions().getExtraProperties().has("apiProject"))
+ {
+ return (Project) project.getExtensions().getExtraProperties().get("apiProject");
+ }
+
+    List<String> subsSuffixes;
+ if (project.getExtensions().getExtraProperties().has("apiProjectSubstitutionSuffixes"))
+ {
+ @SuppressWarnings("unchecked")
+      List<String> suffixValue = (List<String>) project.getExtensions()
+          .getExtraProperties().get("apiProjectSubstitutionSuffixes");
+
+ subsSuffixes = suffixValue;
+ }
+ else
+ {
+ subsSuffixes = Arrays.asList("-impl", "-service", "-server", "-server-impl");
+ }
+
+ for (String suffix : subsSuffixes)
+ {
+ if (project.getPath().endsWith(suffix))
+ {
+ String searchPath = project.getPath().substring(0, project.getPath().length() - suffix.length()) + "-api";
+ Project apiProject = project.findProject(searchPath);
+ if (apiProject != null)
+ {
+ return apiProject;
+ }
+ }
+ }
+
+ return project.findProject(project.getPath() + "-api");
+ }
+
+ private static Project getCheckedApiProject(Project project)
+ {
+ Project apiProject = getApiProject(project);
+
+ if (apiProject == project)
+ {
+      throw new GradleException("The API project of " + project.getPath() + " must not be itself.");
+ }
+
+ return apiProject;
+ }
+
+ /**
+   * Return the property value if the property exists and is not empty (-Pname=value).
+   * Return null if the property does not exist or its value is empty (-Pname).
+ *
+ * @param project the project where to look for the property
+ * @param propertyName the name of the property
+ */
+ public static String getNonEmptyProperty(Project project, String propertyName)
+ {
+ if (!project.hasProperty(propertyName))
+ {
+ return null;
+ }
+
+ String propertyValue = project.property(propertyName).toString();
+ if (propertyValue.isEmpty())
+ {
+ return null;
+ }
+
+ return propertyValue;
+ }
+
+ /**
+ * Return true if the given property exists and its value is true
+ *
+ * @param project the project where to look for the property
+ * @param propertyName the name of the property
+ */
+ public static boolean isPropertyTrue(Project project, String propertyName)
+ {
+ return project.hasProperty(propertyName) && Boolean.valueOf(project.property(propertyName).toString());
+ }
+
+  private static String createModifiedFilesMessage(Collection<String> nonEquivExpectedFiles,
+      Collection<String> foldersToBeBuilt)
+ {
+ StringBuilder builder = new StringBuilder();
+ builder.append("\nRemember to checkin the changes to the following new or modified files:\n");
+ for (String file : nonEquivExpectedFiles)
+ {
+ builder.append(" ");
+ builder.append(file);
+ builder.append("\n");
+ }
+
+ if (!foldersToBeBuilt.isEmpty())
+ {
+      builder.append("\nThe file modifications include service interface changes. You can build the following projects "
+          + "to re-generate the client APIs accordingly:\n");
+ for (String folder : foldersToBeBuilt)
+ {
+ builder.append(" ");
+ builder.append(folder);
+ builder.append("\n");
+ }
+ }
+
+ return builder.toString();
+ }
+
+  private static String createPossibleMissingFilesMessage(Collection<String> missingFiles)
+ {
+ StringBuilder builder = new StringBuilder();
+ builder.append("If this is the result of an automated build, then you may have forgotten to check in some snapshot or idl files:\n");
+ for (String file : missingFiles)
+ {
+ builder.append(" ");
+ builder.append(file);
+ builder.append("\n");
+ }
+
+ return builder.toString();
+ }
+
+ private static String findProperty(FileCompatibilityType type)
+ {
+ String property;
+ switch (type)
+ {
+ case SNAPSHOT:
+ property = SNAPSHOT_COMPAT_REQUIREMENT;
+ break;
+ case IDL:
+ property = IDL_COMPAT_REQUIREMENT;
+ break;
+ case PEGASUS_SCHEMA_SNAPSHOT:
+ property = PEGASUS_SCHEMA_SNAPSHOT_REQUIREMENT;
+ break;
+ case PEGASUS_EXTENSION_SCHEMA_SNAPSHOT:
+ property = PEGASUS_EXTENSION_SCHEMA_SNAPSHOT_REQUIREMENT;
+ break;
+ default:
+ throw new GradleException("No property defined for compatibility type " + type);
+ }
+ return property;
+ }
+
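+  // Derives the directories to watch for rest model changes from the pegasus idlOptions idlItems
+  // (package names), resolved against the configured rest resources root path.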
+  private static Set<File> buildWatchedRestModelInputDirs(Project project, SourceSet sourceSet) {
+    @SuppressWarnings("unchecked")
+    Map<String, PegasusOptions> pegasusOptions = (Map<String, PegasusOptions>) project
+        .getExtensions().getExtraProperties().get("pegasus");
+
+ File rootPath = new File(project.getProjectDir(),
+ pegasusOptions.get(sourceSet.getName()).restModelOptions.getRestResourcesRootPath());
+
+ IdlOptions idlOptions = pegasusOptions.get(sourceSet.getName()).idlOptions;
+
+ // if idlItems exist, only watch the smaller subset
+ return idlOptions.getIdlItems().stream()
+ .flatMap(idlItem -> Arrays.stream(idlItem.packageNames))
+ .map(packageName -> new File(rootPath, packageName.replace('.', '/')))
+ .collect(Collectors.toCollection(TreeSet::new));
+ }
+
+  private static <T> Set<T> difference(Set<T> left, Set<T> right)
+  {
+    Set<T> result = new HashSet<>(left);
+ result.removeAll(right);
+ return result;
+ }
+
+ /**
+ * Configures the given source set so that its data schema directory (usually 'pegasus') is marked as a resource root.
+ * The purpose of this is to improve the IDE experience. Makes sure to exclude this directory from being packaged in
+ * with the default Jar task.
+ */
+ private static void configureDataSchemaResourcesRoot(Project project, SourceSet sourceSet)
+ {
+ sourceSet.resources(sourceDirectorySet -> {
+ final String dataSchemaPath = getDataSchemaPath(project, sourceSet);
+ final File dataSchemaRoot = project.file(dataSchemaPath);
+ sourceDirectorySet.srcDir(dataSchemaPath);
+ project.getLogger().info("Adding resource root '{}'", dataSchemaPath);
+
+ final String extensionsSchemaPath = getExtensionSchemaPath(project, sourceSet);
+ final File extensionsSchemaRoot = project.file(extensionsSchemaPath);
+ sourceDirectorySet.srcDir(extensionsSchemaPath);
+ project.getLogger().info("Adding resource root '{}'", extensionsSchemaPath);
+
+ // Exclude the data schema and extensions schema directory from being copied into the default Jar task
+ sourceDirectorySet.getFilter().exclude(fileTreeElement -> {
+ final File file = fileTreeElement.getFile();
+ // Traversal starts with the children of a resource root, so checking the direct parent is sufficient
+ final boolean underDataSchemaRoot = dataSchemaRoot.equals(file.getParentFile());
+ final boolean underExtensionsSchemaRoot = extensionsSchemaRoot.equals(file.getParentFile());
+ final boolean exclude = (underDataSchemaRoot || underExtensionsSchemaRoot);
+ if (exclude)
+ {
+ project.getLogger().info("Excluding resource directory '{}'", file);
+ }
+ return exclude;
+ });
+ });
+ }
+
+ private Task generatePegasusSchemaSnapshot(Project project, SourceSet sourceSet, String taskName, File inputDir, File outputDir,
+ boolean isExtensionSchema)
+ {
+ return project.getTasks().create(sourceSet.getTaskName("generate", taskName),
+ GeneratePegasusSnapshotTask.class, task ->
+ {
+ task.setInputDir(inputDir);
+ task.setResolverPath(getDataModelConfig(project, sourceSet).plus(project.files(getDataSchemaPath(project, sourceSet))));
+ task.setClassPath(project.getConfigurations().getByName(PEGASUS_PLUGIN_CONFIGURATION));
+ task.setPegasusSchemaSnapshotDestinationDir(outputDir);
+ task.setExtensionSchema(isExtensionSchema);
+ if (isPropertyTrue(project, ENABLE_ARG_FILE))
+ {
+ task.setEnableArgFile(true);
+ }
+ });
+ }
+
+ private Task publishPegasusSchemaSnapshot(Project project, SourceSet sourceSet, String taskName, Task checkPegasusSnapshotTask,
+ File inputDir, File outputDir)
+ {
+ return project.getTasks().create(sourceSet.getTaskName("publish", taskName),
+ Sync.class, task ->
+ {
+ task.dependsOn(checkPegasusSnapshotTask);
+ task.from(inputDir);
+ task.into(outputDir);
+ task.onlyIf(t -> !SharedFileUtils.getSuffixedFiles(project, inputDir, PDL_FILE_SUFFIX).isEmpty());
+ });
+ }
+
+ private void checkGradleVersion(Project project)
+ {
+ if (MIN_REQUIRED_VERSION.compareTo(GradleVersion.current()) > 0)
+ {
+ throw new GradleException(String.format("This plugin does not support %s. Please use %s or later.",
+ GradleVersion.current(),
+ MIN_REQUIRED_VERSION));
+ }
+ if (MIN_SUGGESTED_VERSION.compareTo(GradleVersion.current()) > 0)
+ {
+ project.getLogger().warn(String.format("Pegasus supports %s, but it may not be supported in the next major release. Please use %s or later.",
+ GradleVersion.current(),
+ MIN_SUGGESTED_VERSION));
+ }
+ }
+
+ /**
+   * Reflection is necessary to avoid compile-time references to types introduced in Gradle 5.3
+ *
+ * @param sourceSet the target sourceset upon which to create a new feature variant
+ * @return an Action which modifies a org.gradle.api.plugins.FeatureSpec instance
+ */
+  private Action<?> /*<org.gradle.api.plugins.FeatureSpec>*/ createFeatureVariantFromSourceSet(SourceSet sourceSet)
+ {
+ return featureSpec -> {
+ try
+ {
+        Class<?> clazz = Class.forName("org.gradle.api.plugins.FeatureSpec");
+ Method usingSourceSet = clazz.getDeclaredMethod("usingSourceSet", SourceSet.class);
+ usingSourceSet.invoke(featureSpec, sourceSet);
+ }
+ catch (ReflectiveOperationException e)
+ {
+ throw new GradleException("Unable to invoke FeatureSpec#usingSourceSet(SourceSet)", e);
+ }
+ };
+ }
+
+ protected static boolean isAtLeastGradle61()
+ {
+ return GradleVersion.current().getBaseVersion().compareTo(GradleVersion.version("6.1")) >= 0;
+ }
+
+ public static boolean isAtLeastGradle7() {
+ return GradleVersion.current().getBaseVersion().compareTo(GradleVersion.version("7.0")) >= 0;
+ }
+}
\ No newline at end of file
diff --git a/buildSrc/src/main/java/com/linkedin/pegasus/gradle/tasks/ChangedFileReportTask.java b/buildSrc/src/main/java/com/linkedin/pegasus/gradle/tasks/ChangedFileReportTask.java
new file mode 100644
index 00000000000000..a2aafaf1be0172
--- /dev/null
+++ b/buildSrc/src/main/java/com/linkedin/pegasus/gradle/tasks/ChangedFileReportTask.java
@@ -0,0 +1,124 @@
+package com.linkedin.pegasus.gradle.tasks;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashSet;
+import java.util.Set;
+import java.util.function.Consumer;
+import java.util.stream.Collectors;
+import org.gradle.api.DefaultTask;
+import org.gradle.api.file.FileCollection;
+import org.gradle.api.specs.Specs;
+import org.gradle.api.tasks.InputFiles;
+import org.gradle.api.tasks.Internal;
+import org.gradle.api.tasks.SkipWhenEmpty;
+import org.gradle.api.tasks.TaskAction;
+import org.gradle.work.FileChange;
+import org.gradle.work.InputChanges;
+
+
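+/**
+ * Incremental task that inspects the configured idl and snapshot file collections and reports files
+ * that were added, removed, or modified and therefore still need to be checked in to source control.
+ */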
+public class ChangedFileReportTask extends DefaultTask
+{
+  private final Collection<String> _needCheckinFiles = new ArrayList<>();
+
+ private FileCollection _idlFiles = getProject().files();
+ private FileCollection _snapshotFiles = getProject().files();
+
+ public ChangedFileReportTask()
+ {
+    // With Gradle 6.0+, declaring an incremental task without outputs is not allowed.
+ getOutputs().upToDateWhen(Specs.satisfyNone());
+ }
+
+ // DataHub Note - updated for InputChanges
+ @TaskAction
+ public void checkFilesForChanges(InputChanges inputChanges)
+ {
+ getLogger().lifecycle("Checking idl and snapshot files for changes...");
+ getLogger().info("idlFiles: " + _idlFiles.getAsPath());
+ getLogger().info("snapshotFiles: " + _snapshotFiles.getAsPath());
+
+    Set<String> filesRemoved = new HashSet<>();
+    Set<String> filesAdded = new HashSet<>();
+    Set<String> filesChanged = new HashSet<>();
+
+ if (inputChanges.isIncremental())
+ {
+      Consumer<FileChange> handleChange = change ->
+ {
+ switch (change.getChangeType()) {
+ case ADDED:
+ filesAdded.add(change.getFile().getAbsolutePath());
+ break;
+ case REMOVED:
+ filesRemoved.add(change.getFile().getAbsolutePath());
+ break;
+ case MODIFIED:
+ filesChanged.add(change.getFile().getAbsolutePath());
+ break;
+ }
+ };
+
+ inputChanges.getFileChanges(_idlFiles).forEach(handleChange);
+ inputChanges.getFileChanges(_snapshotFiles).forEach(handleChange);
+
+ if (!filesRemoved.isEmpty())
+ {
+ String files = joinByComma(filesRemoved);
+ _needCheckinFiles.add(files);
+ getLogger().lifecycle(
+ "The following files have been removed, be sure to remove them from source control: {}", files);
+ }
+
+ if (!filesAdded.isEmpty())
+ {
+ String files = joinByComma(filesAdded);
+ _needCheckinFiles.add(files);
+ getLogger().lifecycle("The following files have been added, be sure to add them to source control: {}", files);
+ }
+
+ if (!filesChanged.isEmpty())
+ {
+ String files = joinByComma(filesChanged);
+ _needCheckinFiles.add(files);
+ getLogger().lifecycle(
+ "The following files have been changed, be sure to commit the changes to source control: {}", files);
+ }
+ }
+ }
+
+  private String joinByComma(Set<String> files)
+ {
+ return files.stream().collect(Collectors.joining(", "));
+ }
+
+ @InputFiles
+ @SkipWhenEmpty
+ public FileCollection getSnapshotFiles()
+ {
+ return _snapshotFiles;
+ }
+
+ public void setSnapshotFiles(FileCollection snapshotFiles)
+ {
+ _snapshotFiles = snapshotFiles;
+ }
+
+ @InputFiles
+ @SkipWhenEmpty
+ public FileCollection getIdlFiles()
+ {
+ return _idlFiles;
+ }
+
+ public void setIdlFiles(FileCollection idlFiles)
+ {
+ _idlFiles = idlFiles;
+ }
+
+ @Internal
+ public Collection<String> getNeedCheckinFiles()
+ {
+ return _needCheckinFiles;
+ }
+}
\ No newline at end of file
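
The task added above is DataHub's vendored copy of the pegasus plugin's changed-file report, ported to Gradle's `InputChanges` incremental API (the "DataHub Note" comment marks the port; the older `IncrementalTaskInputs` API is gone in Gradle 8). The following is a minimal, hypothetical sketch of that API pattern — the class and property names are illustrative and not part of this patch:

```java
import org.gradle.api.DefaultTask;
import org.gradle.api.file.ConfigurableFileCollection;
import org.gradle.api.specs.Specs;
import org.gradle.api.tasks.InputFiles;
import org.gradle.api.tasks.TaskAction;
import org.gradle.work.FileChange;
import org.gradle.work.Incremental;
import org.gradle.work.InputChanges;

public abstract class PrintChangedFilesTask extends DefaultTask {

  public PrintChangedFilesTask() {
    // As in the task above: an incremental task must declare outputs, so mark
    // the task as never up to date instead.
    getOutputs().upToDateWhen(Specs.satisfyNone());
  }

  // @Incremental lets Gradle hand this task per-file changes for this input.
  @Incremental
  @InputFiles
  public abstract ConfigurableFileCollection getTrackedFiles();

  @TaskAction
  public void run(InputChanges changes) {
    if (!changes.isIncremental()) {
      getLogger().lifecycle("Non-incremental run: treating every input as changed");
    }
    for (FileChange change : changes.getFileChanges(getTrackedFiles())) {
      // ChangeType is ADDED, MODIFIED, or REMOVED
      getLogger().lifecycle("{}: {}", change.getChangeType(), change.getFile());
    }
  }
}
```

Marking the file collection `@Incremental` (or `@SkipWhenEmpty`, as the task above does) is what allows `getFileChanges` to report per-file changes instead of forcing a full re-scan.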
diff --git a/datahub-frontend/build.gradle b/datahub-frontend/build.gradle
index a1b97701dbf882..437c72e6394ea6 100644
--- a/datahub-frontend/build.gradle
+++ b/datahub-frontend/build.gradle
@@ -2,6 +2,7 @@ plugins {
id "io.github.kobylynskyi.graphql.codegen" version "4.1.1"
id 'scala'
id 'com.palantir.docker'
+ id 'org.gradle.playframework'
}
apply from: "../gradle/versioning/versioning.gradle"
@@ -20,7 +21,6 @@ model {
}
task myTar(type: Tar) {
- extension = "tgz"
compression = Compression.GZIP
from("${buildDir}/stage")
@@ -119,3 +119,23 @@ task cleanLocalDockerImages {
}
}
dockerClean.finalizedBy(cleanLocalDockerImages)
+
+// gradle 8 fixes
+tasks.getByName('createDatahub-frontendTarDist').dependsOn 'stageMainDist'
+tasks.getByName('createDatahub-frontendZipDist').dependsOn 'stageMainDist'
+stagePlayBinaryDist.dependsOn tasks.getByName('createDatahub-frontendStartScripts')
+playBinaryDistTar.dependsOn tasks.getByName('createDatahub-frontendStartScripts')
+playBinaryDistZip.dependsOn tasks.getByName('createDatahub-frontendStartScripts')
+tasks.getByName('stageDatahub-frontendDist').dependsOn stagePlayBinaryDist
+tasks.getByName('stageDatahub-frontendDist').dependsOn createPlayBinaryStartScripts
+tasks.getByName('datahub-frontendDistTar').dependsOn createPlayBinaryStartScripts
+tasks.getByName('datahub-frontendDistTar').dependsOn createMainStartScripts
+tasks.getByName('datahub-frontendDistZip').dependsOn createPlayBinaryStartScripts
+tasks.getByName('datahub-frontendDistZip').dependsOn createMainStartScripts
+playBinaryDistTar.dependsOn createMainStartScripts
+playBinaryDistZip.dependsOn createMainStartScripts
+createMainStartScripts.dependsOn 'stageDatahub-frontendDist'
+createPlayBinaryTarDist.dependsOn 'stageDatahub-frontendDist'
+createPlayBinaryZipDist.dependsOn 'stageDatahub-frontendDist'
+createPlayBinaryTarDist.dependsOn 'stageMainDist'
+createPlayBinaryZipDist.dependsOn 'stageMainDist'
diff --git a/datahub-frontend/play.gradle b/datahub-frontend/play.gradle
index dd1ceee411f746..84fb4c02620b8e 100644
--- a/datahub-frontend/play.gradle
+++ b/datahub-frontend/play.gradle
@@ -1,4 +1,3 @@
-apply plugin: "org.gradle.playframework"
// Change this to listen on a different port
project.ext.httpPort = 9001
@@ -101,4 +100,22 @@ play {
test {
useJUnitPlatform()
+
+ def playJava17CompatibleJvmArgs = [
+ "--add-opens=java.base/java.lang=ALL-UNNAMED",
+ //"--add-opens=java.base/java.lang.invoke=ALL-UNNAMED",
+ //"--add-opens=java.base/java.lang.reflect=ALL-UNNAMED",
+ //"--add-opens=java.base/java.io=ALL-UNNAMED",
+ //"--add-opens=java.base/java.net=ALL-UNNAMED",
+ //"--add-opens=java.base/java.nio=ALL-UNNAMED",
+ "--add-opens=java.base/java.util=ALL-UNNAMED",
+ //"--add-opens=java.base/java.util.concurrent=ALL-UNNAMED",
+ //"--add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED",
+ //"--add-opens=java.base/sun.nio.ch=ALL-UNNAMED",
+ //"--add-opens=java.base/sun.nio.cs=ALL-UNNAMED",
+ //"--add-opens=java.base/sun.security.action=ALL-UNNAMED",
+ //"--add-opens=java.base/sun.util.calendar=ALL-UNNAMED",
+ //"--add-opens=java.security.jgss/sun.security.krb5=ALL-UNNAMED",
+ ]
+ jvmArgs = playJava17CompatibleJvmArgs
}
diff --git a/datahub-graphql-core/build.gradle b/datahub-graphql-core/build.gradle
index fba0031351b588..6e8cb939669226 100644
--- a/datahub-graphql-core/build.gradle
+++ b/datahub-graphql-core/build.gradle
@@ -1,7 +1,8 @@
plugins {
+ id 'java'
id "io.github.kobylynskyi.graphql.codegen" version "4.1.1"
}
-apply plugin: 'java'
+
dependencies {
implementation project(':metadata-service:restli-client')
diff --git a/datahub-web-react/build.gradle b/datahub-web-react/build.gradle
index fd36e5ac4bc2c3..72821d8b97dc0b 100644
--- a/datahub-web-react/build.gradle
+++ b/datahub-web-react/build.gradle
@@ -1,8 +1,8 @@
plugins {
id 'java'
+ id 'distribution'
+ id 'com.github.node-gradle.node'
}
-apply plugin: 'distribution'
-apply plugin: 'com.github.node-gradle.node'
node {
@@ -35,7 +35,7 @@ node {
yarnWorkDir = file("${project.projectDir}/.gradle/yarn")
// Set the work directory where node_modules should be located
- nodeModulesDir = file("${project.projectDir}")
+ nodeProjectDir = file("${project.projectDir}")
}
@@ -94,7 +94,7 @@ configurations {
distZip {
dependsOn yarnQuickBuild
- baseName 'datahub-web-react'
+ archiveFileName = "datahub-web-react-${archiveVersion}.${archiveExtension}"
from 'dist'
}
@@ -112,5 +112,5 @@ jar {
into('public') {
from zipTree(distZip.outputs.files.first())
}
- classifier = 'assets'
+ archiveClassifier = 'assets'
}
diff --git a/docker/datahub-frontend/Dockerfile b/docker/datahub-frontend/Dockerfile
index 0c4c229af34f0c..17d691177aa345 100644
--- a/docker/datahub-frontend/Dockerfile
+++ b/docker/datahub-frontend/Dockerfile
@@ -17,7 +17,7 @@ RUN if [ "${ALPINE_REPO_URL}" != "http://dl-cdn.alpinelinux.org/alpine" ] ; then
# PFP-260: Upgrade Sqlite to >=3.28.0-r0 to fix https://security.snyk.io/vuln/SNYK-ALPINE39-SQLITE-449762
RUN apk --no-cache --update-cache --available upgrade \
&& apk --no-cache add curl sqlite libc6-compat java-snappy \
- && apk --no-cache add openjdk11-jre-headless --repository=${ALPINE_REPO_URL}/edge/community \
+ && apk --no-cache add openjdk17-jre-headless --repository=${ALPINE_REPO_URL}/edge/community \
&& apk --no-cache add jattach --repository ${ALPINE_REPO_URL}/edge/community/
ENV LD_LIBRARY_PATH="/lib:/lib64"
@@ -25,7 +25,10 @@ ENV LD_LIBRARY_PATH="/lib:/lib64"
FROM base as prod-install
COPY ./datahub-frontend.zip /
-RUN unzip datahub-frontend.zip && rm datahub-frontend.zip
+RUN unzip datahub-frontend.zip -d /datahub-frontend \
+ && mv /datahub-frontend/main/* /datahub-frontend \
+ && rmdir /datahub-frontend/main \
+ && rm datahub-frontend.zip
COPY ./docker/monitoring/client-prometheus-config.yaml /datahub-frontend/
RUN chown -R datahub:datahub /datahub-frontend && chmod 755 /datahub-frontend
diff --git a/docker/datahub-frontend/start.sh b/docker/datahub-frontend/start.sh
index 12e6b8915096d6..f5de9c87968b0d 100755
--- a/docker/datahub-frontend/start.sh
+++ b/docker/datahub-frontend/start.sh
@@ -49,6 +49,8 @@ export JAVA_OPTS="${JAVA_MEMORY_OPTS:-"-Xms512m -Xmx1024m"} \
-Djava.security.auth.login.config=datahub-frontend/conf/jaas.conf \
-Dlogback.configurationFile=datahub-frontend/conf/logback.xml \
-Dlogback.debug=false \
+ --add-opens java.base/java.lang=ALL-UNNAMED \
+ --add-opens=java.base/java.util=ALL-UNNAMED \
${PROMETHEUS_AGENT:-} ${OTEL_AGENT:-} \
${TRUSTSTORE_FILE:-} ${TRUSTSTORE_TYPE:-} ${TRUSTSTORE_PASSWORD:-} \
${HTTP_PROXY:-} ${HTTPS_PROXY:-} ${NO_PROXY:-} \
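
The two `--add-opens` flags added to the frontend start script (and the longer candidate list in `datahub-frontend/play.gradle` above) follow from the JDK 11 to 17 move: since JDK 16, deep reflection into `java.base` packages is denied by default, and Play/Akka and some serializers still rely on it. A standalone, hypothetical demo of the failure mode (not DataHub code):

```java
import java.lang.reflect.Field;

public class AddOpensDemo {
  public static void main(String[] args) throws Exception {
    Field elementData = java.util.ArrayList.class.getDeclaredField("elementData");
    // On JDK 17 this throws java.lang.reflect.InaccessibleObjectException unless
    // the JVM is started with --add-opens=java.base/java.util=ALL-UNNAMED
    elementData.setAccessible(true);
    System.out.println("Deep reflective access succeeded: " + elementData);
  }
}
```

Running `java AddOpensDemo` fails on JDK 17, while `java --add-opens=java.base/java.util=ALL-UNNAMED AddOpensDemo` succeeds, which is what the extra `JAVA_OPTS` entries arrange for the packaged frontend.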
diff --git a/docker/datahub-gms/Dockerfile b/docker/datahub-gms/Dockerfile
index 9c79e1da542f0b..b26a02c1d3b15a 100644
--- a/docker/datahub-gms/Dockerfile
+++ b/docker/datahub-gms/Dockerfile
@@ -40,14 +40,14 @@ RUN if [ "${ALPINE_REPO_URL}" != "http://dl-cdn.alpinelinux.org/alpine" ] ; then
# PFP-260: Upgrade Sqlite to >=3.28.0-r0 to fix https://security.snyk.io/vuln/SNYK-ALPINE39-SQLITE-449762
RUN apk --no-cache --update-cache --available upgrade \
&& apk --no-cache add curl bash coreutils gcompat sqlite libc6-compat java-snappy \
- && apk --no-cache add openjdk11-jre-headless --repository=${ALPINE_REPO_URL}/edge/community \
+ && apk --no-cache add openjdk17-jre-headless --repository=${ALPINE_REPO_URL}/edge/community \
&& apk --no-cache add jattach --repository ${ALPINE_REPO_URL}/edge/community/ \
&& curl -sS ${MAVEN_CENTRAL_REPO_URL}/org/eclipse/jetty/jetty-runner/9.4.46.v20220331/jetty-runner-9.4.46.v20220331.jar --output jetty-runner.jar \
&& curl -sS ${MAVEN_CENTRAL_REPO_URL}/org/eclipse/jetty/jetty-jmx/9.4.46.v20220331/jetty-jmx-9.4.46.v20220331.jar --output jetty-jmx.jar \
&& curl -sS ${MAVEN_CENTRAL_REPO_URL}/org/eclipse/jetty/jetty-util/9.4.46.v20220331/jetty-util-9.4.46.v20220331.jar --output jetty-util.jar \
&& wget --no-verbose ${GITHUB_REPO_URL}/open-telemetry/opentelemetry-java-instrumentation/releases/download/v1.24.0/opentelemetry-javaagent.jar \
&& wget --no-verbose ${MAVEN_CENTRAL_REPO_URL}/io/prometheus/jmx/jmx_prometheus_javaagent/${JMX_VERSION}/jmx_prometheus_javaagent-${JMX_VERSION}.jar -O jmx_prometheus_javaagent.jar \
- && cp /usr/lib/jvm/java-11-openjdk/jre/lib/security/cacerts /tmp/kafka.client.truststore.jks
+ && cp /usr/lib/jvm/java-17-openjdk/jre/lib/security/cacerts /tmp/kafka.client.truststore.jks
COPY --from=binary /go/bin/dockerize /usr/local/bin
ENV LD_LIBRARY_PATH="/lib:/lib64"
diff --git a/docker/datahub-ingestion/build.gradle b/docker/datahub-ingestion/build.gradle
index 52db594e2ef852..36444210f1938b 100644
--- a/docker/datahub-ingestion/build.gradle
+++ b/docker/datahub-ingestion/build.gradle
@@ -45,9 +45,9 @@ docker {
buildArgs(dockerBuildArgs)
}
-tasks.getByName('docker').dependsOn(['build',
- ':docker:datahub-ingestion-base:docker',
- ':metadata-ingestion:codegen'])
+tasks.getByName('dockerPrepare').dependsOn(['build',
+ ':docker:datahub-ingestion-base:docker',
+ ':metadata-ingestion:codegen'])
task mkdirBuildDocker {
doFirst {
diff --git a/docker/datahub-mae-consumer/Dockerfile b/docker/datahub-mae-consumer/Dockerfile
index 5bfa5f35ace179..9b7c6e762462e3 100644
--- a/docker/datahub-mae-consumer/Dockerfile
+++ b/docker/datahub-mae-consumer/Dockerfile
@@ -38,11 +38,11 @@ ENV JMX_VERSION=0.18.0
# PFP-260: Upgrade Sqlite to >=3.28.0-r0 to fix https://security.snyk.io/vuln/SNYK-ALPINE39-SQLITE-449762
RUN apk --no-cache --update-cache --available upgrade \
&& apk --no-cache add curl bash coreutils sqlite libc6-compat java-snappy \
- && apk --no-cache add openjdk11-jre-headless --repository=${ALPINE_REPO_URL}/edge/community \
+ && apk --no-cache add openjdk17-jre-headless --repository=${ALPINE_REPO_URL}/edge/community \
&& apk --no-cache add jattach --repository ${ALPINE_REPO_URL}/edge/community/ \
&& wget --no-verbose ${GITHUB_REPO_URL}/open-telemetry/opentelemetry-java-instrumentation/releases/download/v1.24.0/opentelemetry-javaagent.jar \
&& wget --no-verbose ${MAVEN_CENTRAL_REPO_URL}/io/prometheus/jmx/jmx_prometheus_javaagent/${JMX_VERSION}/jmx_prometheus_javaagent-${JMX_VERSION}.jar -O jmx_prometheus_javaagent.jar \
- && cp /usr/lib/jvm/java-11-openjdk/jre/lib/security/cacerts /tmp/kafka.client.truststore.jks
+ && cp /usr/lib/jvm/java-17-openjdk/jre/lib/security/cacerts /tmp/kafka.client.truststore.jks
COPY --from=binary /go/bin/dockerize /usr/local/bin
ENV LD_LIBRARY_PATH="/lib:/lib64"
diff --git a/docker/datahub-mce-consumer/Dockerfile b/docker/datahub-mce-consumer/Dockerfile
index cc79a3072c1930..4da94794e0ead3 100644
--- a/docker/datahub-mce-consumer/Dockerfile
+++ b/docker/datahub-mce-consumer/Dockerfile
@@ -38,11 +38,11 @@ ENV JMX_VERSION=0.18.0
# PFP-260: Upgrade Sqlite to >=3.28.0-r0 to fix https://security.snyk.io/vuln/SNYK-ALPINE39-SQLITE-449762
RUN apk --no-cache --update-cache --available upgrade \
&& apk --no-cache add curl bash sqlite libc6-compat java-snappy \
- && apk --no-cache add openjdk11-jre-headless --repository=${ALPINE_REPO_URL}/edge/community \
+ && apk --no-cache add openjdk17-jre-headless --repository=${ALPINE_REPO_URL}/edge/community \
&& apk --no-cache add jattach --repository ${ALPINE_REPO_URL}/edge/community/ \
&& wget --no-verbose ${GITHUB_REPO_URL}/open-telemetry/opentelemetry-java-instrumentation/releases/download/v1.24.0/opentelemetry-javaagent.jar \
&& wget --no-verbose ${MAVEN_CENTRAL_REPO_URL}/io/prometheus/jmx/jmx_prometheus_javaagent/${JMX_VERSION}/jmx_prometheus_javaagent-${JMX_VERSION}.jar -O jmx_prometheus_javaagent.jar \
- && cp /usr/lib/jvm/java-11-openjdk/jre/lib/security/cacerts /tmp/kafka.client.truststore.jks
+ && cp /usr/lib/jvm/java-17-openjdk/jre/lib/security/cacerts /tmp/kafka.client.truststore.jks
COPY --from=binary /go/bin/dockerize /usr/local/bin
FROM base as prod-install
diff --git a/docker/datahub-upgrade/Dockerfile b/docker/datahub-upgrade/Dockerfile
index 2beb5b54dac383..00dae87dfc3ded 100644
--- a/docker/datahub-upgrade/Dockerfile
+++ b/docker/datahub-upgrade/Dockerfile
@@ -38,13 +38,13 @@ ENV JMX_VERSION=0.18.0
# PFP-260: Upgrade Sqlite to >=3.28.0-r0 to fix https://security.snyk.io/vuln/SNYK-ALPINE39-SQLITE-449762
RUN apk --no-cache --update-cache --available upgrade \
&& apk --no-cache add curl bash coreutils gcompat sqlite libc6-compat java-snappy \
- && apk --no-cache add openjdk11-jre-headless --repository=${ALPINE_REPO_URL}/edge/community \
+ && apk --no-cache add openjdk17-jre-headless --repository=${ALPINE_REPO_URL}/edge/community \
&& curl -sS ${MAVEN_CENTRAL_REPO_URL}/org/eclipse/jetty/jetty-runner/9.4.46.v20220331/jetty-runner-9.4.46.v20220331.jar --output jetty-runner.jar \
&& curl -sS ${MAVEN_CENTRAL_REPO_URL}/org/eclipse/jetty/jetty-jmx/9.4.46.v20220331/jetty-jmx-9.4.46.v20220331.jar --output jetty-jmx.jar \
&& curl -sS ${MAVEN_CENTRAL_REPO_URL}/org/eclipse/jetty/jetty-util/9.4.46.v20220331/jetty-util-9.4.46.v20220331.jar --output jetty-util.jar \
&& wget --no-verbose ${GITHUB_REPO_URL}/open-telemetry/opentelemetry-java-instrumentation/releases/download/v1.24.0/opentelemetry-javaagent.jar \
&& wget --no-verbose ${MAVEN_CENTRAL_REPO_URL}/io/prometheus/jmx/jmx_prometheus_javaagent/${JMX_VERSION}/jmx_prometheus_javaagent-${JMX_VERSION}.jar -O jmx_prometheus_javaagent.jar \
- && cp /usr/lib/jvm/java-11-openjdk/jre/lib/security/cacerts /tmp/kafka.client.truststore.jks
+ && cp /usr/lib/jvm/java-17-openjdk/jre/lib/security/cacerts /tmp/kafka.client.truststore.jks
COPY --from=binary /go/bin/dockerize /usr/local/bin
ENV LD_LIBRARY_PATH="/lib:/lib64"
diff --git a/docker/kafka-setup/Dockerfile b/docker/kafka-setup/Dockerfile
index f6a4b62a793562..53353863b6e5f6 100644
--- a/docker/kafka-setup/Dockerfile
+++ b/docker/kafka-setup/Dockerfile
@@ -31,7 +31,7 @@ LABEL name="kafka" version=${KAFKA_VERSION}
RUN if [ "${ALPINE_REPO_URL}" != "http://dl-cdn.alpinelinux.org/alpine" ] ; then sed -i "s#http.*://dl-cdn.alpinelinux.org/alpine#${ALPINE_REPO_URL}#g" /etc/apk/repositories ; fi
RUN apk add --no-cache bash coreutils
-RUN apk --no-cache add openjdk11-jre-headless --repository=${ALPINE_REPO_URL}/edge/community
+RUN apk --no-cache add openjdk17-jre-headless --repository=${ALPINE_REPO_URL}/edge/community
RUN apk add --no-cache -t .build-deps git curl ca-certificates jq gcc musl-dev libffi-dev zip
RUN mkdir -p /opt \
diff --git a/docs-website/build.gradle b/docs-website/build.gradle
index a213ec1ae8194d..2644491a2a5f80 100644
--- a/docs-website/build.gradle
+++ b/docs-website/build.gradle
@@ -1,5 +1,7 @@
-apply plugin: 'distribution'
-apply plugin: 'com.github.node-gradle.node'
+plugins {
+ id 'distribution'
+ id 'com.github.node-gradle.node'
+}
node {
@@ -12,10 +14,10 @@ node {
}
// Version of node to use.
- version = '16.16.0'
+ version = '21.2.0'
// Version of Yarn to use.
- yarnVersion = '1.22.0'
+ yarnVersion = '1.22.1'
// Base URL for fetching node distributions (set nodeDistBaseUrl if you have a mirror).
if (project.hasProperty('nodeDistBaseUrl')) {
@@ -31,7 +33,7 @@ node {
yarnWorkDir = file("${project.projectDir}/.gradle/yarn")
// Set the work directory where node_modules should be located
- nodeModulesDir = file("${project.projectDir}")
+ nodeProjectDir = file("${project.projectDir}")
}
/*
@@ -122,7 +124,11 @@ task yarnBuild(type: YarnTask, dependsOn: [yarnLint, yarnGenerate, downloadHisto
// See https://stackoverflow.com/questions/53230823/fatal-error-ineffective-mark-compacts-near-heap-limit-allocation-failed-java
// and https://github.com/facebook/docusaurus/issues/8329.
// TODO: As suggested in https://github.com/facebook/docusaurus/issues/4765, try switching to swc-loader.
- environment = ['NODE_OPTIONS': '--max-old-space-size=10248']
+ if (project.hasProperty('useSystemNode') && project.getProperty('useSystemNode').toBoolean()) {
+ environment = ['NODE_OPTIONS': '--max-old-space-size=10248']
+ } else {
+ environment = ['NODE_OPTIONS': '--max-old-space-size=10248 --openssl-legacy-provider']
+ }
args = ['run', 'build']
}
diff --git a/docs-website/vercel-setup.sh b/docs-website/vercel-setup.sh
index db532e167b59f1..915635b24ee884 100755
--- a/docs-website/vercel-setup.sh
+++ b/docs-website/vercel-setup.sh
@@ -12,7 +12,7 @@ set -euxo pipefail
yum groupinstall "Development Tools" -y
yum erase openssl-devel -y
-yum install openssl11 openssl11-devel libffi-devel bzip2-devel wget -y
+yum install openssl11 openssl11-devel libffi-devel bzip2-devel wget nodejs -y
wget https://www.python.org/ftp/python/3.10.11/Python-3.10.11.tgz
tar -xf Python-3.10.11.tgz
diff --git a/docs/developers.md b/docs/developers.md
index c3c3a59283e662..60d31f5e4523f7 100644
--- a/docs/developers.md
+++ b/docs/developers.md
@@ -6,16 +6,12 @@ title: "Local Development"
## Requirements
-- Both [Java 11 JDK](https://openjdk.org/projects/jdk/11/) and [Java 8 JDK](https://openjdk.java.net/projects/jdk8/)
+- [Java 17 JDK](https://openjdk.org/projects/jdk/17/)
- [Python 3.10](https://www.python.org/downloads/release/python-3100/)
- [Docker](https://www.docker.com/)
- [Docker Compose](https://docs.docker.com/compose/)
- Docker engine with at least 8GB of memory to run tests.
-:::caution
-
-Do not try to use a JDK newer than JDK 11. The build process does not currently work with newer JDKs versions.
-
:::
On macOS, these can be installed using [Homebrew](https://brew.sh/).
@@ -147,11 +143,11 @@ You're probably using a Java version that's too new for gradle. Run the followin
java --version
```
-While it may be possible to build and run DataHub using newer versions of Java, we currently only support [Java 11](https://openjdk.org/projects/jdk/11/) (aka Java 11).
+While it may be possible to build and run DataHub using newer versions of Java, we currently only support [Java 17](https://openjdk.org/projects/jdk/17/).
#### Getting `cannot find symbol` error for `javax.annotation.Generated`
-Similar to the previous issue, please use Java 1.8 to build the project.
+Similar to the previous issue, please use Java 17 to build the project.
You can install multiple versions of Java on a single machine and switch between them using the `JAVA_HOME` environment variable. See [this document](https://docs.oracle.com/cd/E21454_01/html/821-2531/inst_jdk_javahome_t.html) for more details.
#### `:metadata-models:generateDataTemplate` task fails with `java.nio.file.InvalidPathException: Illegal char <:> at index XX` or `Caused by: java.lang.IllegalArgumentException: 'other' has different root` error
diff --git a/docs/how/updating-datahub.md b/docs/how/updating-datahub.md
index 36be572f2886e5..61ad2d623d72a4 100644
--- a/docs/how/updating-datahub.md
+++ b/docs/how/updating-datahub.md
@@ -7,11 +7,15 @@ This file documents any backwards-incompatible changes in DataHub and assists pe
### Breaking Changes
- Updating MySQL version for quickstarts to 8.2, may cause quickstart issues for existing instances.
+- Neo4j 5.x, may require migration from 4.x
+- Build now requires JDK17 (Runtime Java 11)
### Potential Downtime
### Deprecations
+- Spark 2.x (including previous JDK8 build requirements)
+
### Other Notable Changes
## 0.12.1
diff --git a/docs/troubleshooting/build.md b/docs/troubleshooting/build.md
index 112bcdc47e9567..7b4ae98cdb03bd 100644
--- a/docs/troubleshooting/build.md
+++ b/docs/troubleshooting/build.md
@@ -10,11 +10,11 @@ You're probably using a Java version that's too new for gradle. Run the followin
java --version
```
-While it may be possible to build and run DataHub using newer versions of Java, we currently only support [Java 11](https://openjdk.org/projects/jdk/11/) (aka Java 11).
+While it may be possible to build and run DataHub using newer versions of Java, we currently only support [Java 17](https://openjdk.org/projects/jdk/17/).
## Getting `cannot find symbol` error for `javax.annotation.Generated`
-Similar to the previous issue, please use Java 1.8 to build the project.
+Similar to the previous issue, please use Java 17 to build the project.
You can install multiple versions of Java on a single machine and switch between them using the `JAVA_HOME` environment variable. See [this document](https://docs.oracle.com/cd/E21454_01/html/821-2531/inst_jdk_javahome_t.html) for more details.
## `:metadata-models:generateDataTemplate` task fails with `java.nio.file.InvalidPathException: Illegal char <:> at index XX` or `Caused by: java.lang.IllegalArgumentException: 'other' has different root` error
diff --git a/entity-registry/build.gradle b/entity-registry/build.gradle
index 3da0bf5bb4fb81..77cca24c0e7234 100644
--- a/entity-registry/build.gradle
+++ b/entity-registry/build.gradle
@@ -1,10 +1,13 @@
-apply plugin: 'pegasus'
-apply plugin: 'java-library'
+plugins {
+ id 'pegasus'
+ id 'java-library'
+}
dependencies {
implementation spec.product.pegasus.data
implementation spec.product.pegasus.generator
api project(path: ':metadata-models')
+ api project(path: ':metadata-models', configuration: "dataTemplate")
implementation externalDependency.slf4jApi
compileOnly externalDependency.lombok
implementation externalDependency.guava
diff --git a/gradle/wrapper/gradle-wrapper.properties b/gradle/wrapper/gradle-wrapper.properties
index 4e86b9270786fb..bdc9a83b1e6524 100644
--- a/gradle/wrapper/gradle-wrapper.properties
+++ b/gradle/wrapper/gradle-wrapper.properties
@@ -1,6 +1,6 @@
distributionBase=GRADLE_USER_HOME
distributionPath=wrapper/dists
-distributionUrl=https\://services.gradle.org/distributions/gradle-7.6.2-bin.zip
+distributionUrl=https\://services.gradle.org/distributions/gradle-8.0.2-bin.zip
networkTimeout=10000
zipStoreBase=GRADLE_USER_HOME
zipStorePath=wrapper/dists
diff --git a/li-utils/build.gradle b/li-utils/build.gradle
index 1d5222e39185af..975cd2bccccf31 100644
--- a/li-utils/build.gradle
+++ b/li-utils/build.gradle
@@ -1,17 +1,9 @@
-apply plugin: 'java-library'
-apply plugin: 'pegasus'
-
-tasks.withType(JavaCompile).configureEach {
- javaCompiler = javaToolchains.compilerFor {
- languageVersion = JavaLanguageVersion.of(8)
- }
-}
-tasks.withType(Test).configureEach {
- javaLauncher = javaToolchains.launcherFor {
- languageVersion = JavaLanguageVersion.of(8)
- }
+plugins {
+ id 'java-library'
+ id 'pegasus'
}
+
dependencies {
api spec.product.pegasus.data
implementation externalDependency.commonsLang
@@ -28,7 +20,7 @@ dependencies {
testImplementation externalDependency.commonsIo
testImplementation project(':test-models')
testImplementation project(path: ':test-models', configuration: 'testDataTemplate')
- testImplementation externalDependency.testngJava8
+ testImplementation externalDependency.testng
}
idea {
@@ -38,4 +30,4 @@ idea {
}
// Need to compile backing java parameterDefinitions with the data template.
-sourceSets.mainGeneratedDataTemplate.java.srcDirs('src/main/javaPegasus/')
\ No newline at end of file
+sourceSets.mainGeneratedDataTemplate.java.srcDirs('src/main/javaPegasus/')
diff --git a/metadata-auth/auth-api/build.gradle b/metadata-auth/auth-api/build.gradle
index 7159aa5f15e61e..c68c3019bd2b45 100644
--- a/metadata-auth/auth-api/build.gradle
+++ b/metadata-auth/auth-api/build.gradle
@@ -15,13 +15,12 @@ test {
}
jar {
- archiveName = "$project.name-lib.jar"
+ archiveClassifier = "lib"
}
shadowJar {
zip64 true
- classifier = null
- archiveName = "$project.name-${version}.jar"
+ archiveClassifier = ""
exclude "META-INF/*.RSA", "META-INF/*.SF","META-INF/*.DSA"
}
@@ -39,12 +38,12 @@ dependencies() {
}
task sourcesJar(type: Jar) {
- classifier 'sources'
+ archiveClassifier = 'sources'
from sourceSets.main.allJava
}
task javadocJar(type: Jar, dependsOn: javadoc) {
- classifier 'javadoc'
+ archiveClassifier = 'javadoc'
from javadoc.destinationDir
}
diff --git a/metadata-events/mxe-utils-avro/build.gradle b/metadata-events/mxe-utils-avro/build.gradle
index 3493797ab4f972..98bfb9127b2094 100644
--- a/metadata-events/mxe-utils-avro/build.gradle
+++ b/metadata-events/mxe-utils-avro/build.gradle
@@ -1,8 +1,11 @@
-apply plugin: 'java-library'
+plugins {
+ id 'java-library'
+}
dependencies {
api project(':metadata-events:mxe-avro')
api project(':metadata-models')
+ api project(path: ':metadata-models', configuration: "dataTemplate")
api spec.product.pegasus.dataAvro
testImplementation externalDependency.testng
diff --git a/metadata-integration/java/datahub-client/build.gradle b/metadata-integration/java/datahub-client/build.gradle
index 7ae01faaaabddb..b14953d7ce0218 100644
--- a/metadata-integration/java/datahub-client/build.gradle
+++ b/metadata-integration/java/datahub-client/build.gradle
@@ -14,19 +14,9 @@ import org.apache.tools.ant.filters.ReplaceTokens
jar.enabled = false // Since we only want to build shadow jars, disabling the regular jar creation
-tasks.withType(JavaCompile).configureEach {
- javaCompiler = javaToolchains.compilerFor {
- languageVersion = JavaLanguageVersion.of(8)
- }
-}
-tasks.withType(Test).configureEach {
- javaLauncher = javaToolchains.launcherFor {
- languageVersion = JavaLanguageVersion.of(8)
- }
-}
-
dependencies {
implementation project(':metadata-models')
+ implementation project(path: ':metadata-models', configuration: "dataTemplate")
implementation(externalDependency.kafkaAvroSerializer) {
exclude group: "org.apache.avro"
}
@@ -49,7 +39,7 @@ dependencies {
annotationProcessor externalDependency.lombok
// VisibleForTesting
compileOnly externalDependency.guava
- testImplementation externalDependency.testngJava8
+ testImplementation externalDependency.testng
testImplementation externalDependency.mockito
testImplementation externalDependency.mockServer
testImplementation externalDependency.mockServerClient
@@ -241,4 +231,4 @@ sourceSets.main.resources.srcDir "${generateOpenApiPojos.outputDir}/src/main/res
clean {
project.delete("$projectDir/generated")
-}
+}
\ No newline at end of file
diff --git a/metadata-integration/java/datahub-protobuf-example/build.gradle b/metadata-integration/java/datahub-protobuf-example/build.gradle
index 4e53d8ed763baa..1efb43360457a0 100644
--- a/metadata-integration/java/datahub-protobuf-example/build.gradle
+++ b/metadata-integration/java/datahub-protobuf-example/build.gradle
@@ -64,10 +64,6 @@ protobuf {
task publishSchema(dependsOn: build) {
description "Publishes protobuf schema in the `main` sourceSet to DataHub"
- def javaLauncher = javaToolchains.launcherFor {
- languageVersion = JavaLanguageVersion.of(11)
- }
-
fileTree("schema").matching {
exclude "protobuf/meta/**"
}.each {f ->
diff --git a/metadata-integration/java/datahub-protobuf/build.gradle b/metadata-integration/java/datahub-protobuf/build.gradle
index bc919119f8fac7..2cb36a14cb9c7d 100644
--- a/metadata-integration/java/datahub-protobuf/build.gradle
+++ b/metadata-integration/java/datahub-protobuf/build.gradle
@@ -12,12 +12,6 @@ apply from: '../versioning.gradle'
jar.enabled = false // Since we only want to build shadow jars, disabling the regular jar creation
-afterEvaluate {
- if (project.plugins.hasPlugin('java')) {
- sourceCompatibility = 11
- targetCompatibility = 11
- }
-}
ext {
javaMainClass = "datahub.protobuf.Proto2DataHub"
}
@@ -211,4 +205,4 @@ nexusStaging {
password = System.getenv("NEXUS_PASSWORD")
}
-
+startScripts.dependsOn shadowJar
\ No newline at end of file
diff --git a/metadata-integration/java/examples/build.gradle b/metadata-integration/java/examples/build.gradle
index 581e9f82da0dc8..ddf574e8c8905d 100644
--- a/metadata-integration/java/examples/build.gradle
+++ b/metadata-integration/java/examples/build.gradle
@@ -1,16 +1,6 @@
-apply plugin: 'java'
-apply plugin: 'jacoco'
-
-
-tasks.withType(JavaCompile).configureEach {
- javaCompiler = javaToolchains.compilerFor {
- languageVersion = JavaLanguageVersion.of(8)
- }
-}
-tasks.withType(Test).configureEach {
- javaLauncher = javaToolchains.launcherFor {
- languageVersion = JavaLanguageVersion.of(8)
- }
+plugins {
+ id 'java'
+ id 'jacoco'
}
dependencies {
diff --git a/metadata-integration/java/spark-lineage/build.gradle b/metadata-integration/java/spark-lineage/build.gradle
index 7143ac48331439..c5dd9b5012c290 100644
--- a/metadata-integration/java/spark-lineage/build.gradle
+++ b/metadata-integration/java/spark-lineage/build.gradle
@@ -11,17 +11,6 @@ apply from: '../versioning.gradle'
jar.enabled = false // Since we only want to build shadow jars, disabling the regular jar creation
-tasks.withType(JavaCompile).configureEach {
- javaCompiler = javaToolchains.compilerFor {
- languageVersion = JavaLanguageVersion.of(8)
- }
-}
-tasks.withType(Test).configureEach {
- javaLauncher = javaToolchains.launcherFor {
- languageVersion = JavaLanguageVersion.of(8)
- }
-}
-
//to rename artifacts for publish
project.archivesBaseName = 'datahub-'+project.name
@@ -34,18 +23,19 @@ configurations {
dependencies {
- //Needed for tie breaking of guava version need for spark and wiremock
- provided(externalDependency.hadoopMapreduceClient) {
- force = true
+ constraints {
+ provided(externalDependency.hadoopMapreduceClient) {
+ because 'Needed for tie breaking of guava version need for spark and wiremock'
+ }
+ provided(externalDependency.hadoopCommon) {
+ because 'required for org.apache.hadoop.util.StopWatch'
+ }
+ provided(externalDependency.commonsIo) {
+ because 'required for org.apache.commons.io.Charsets that is used internally'
+ }
}
- provided(externalDependency.hadoopCommon) {
- force = true
- } // required for org.apache.hadoop.util.StopWatch
-
- provided(externalDependency.commonsIo) {
- force = true
- } // required for org.apache.commons.io.Charsets that is used internally
+ provided 'org.scala-lang:scala-library:2.12.18'
implementation externalDependency.slf4jApi
compileOnly externalDependency.lombok
@@ -86,7 +76,7 @@ task checkShadowJar(type: Exec) {
shadowJar {
zip64=true
- classifier=''
+ archiveClassifier = ''
mergeServiceFiles()
def exclude_modules = project
@@ -107,7 +97,7 @@ shadowJar {
// preventing java multi-release JAR leakage
// https://github.com/johnrengelman/shadow/issues/729
- exclude('module-info.class', 'META-INF/versions/**')
+ exclude('module-info.class', 'META-INF/versions/**', 'LICENSE', 'NOTICE')
// prevent jni conflict with spark
exclude '**/libzstd-jni.*'
@@ -138,6 +128,25 @@ jacocoTestReport {
test {
forkEvery = 1
useJUnit()
+
+ def sparkJava17CompatibleJvmArgs = [
+ "--add-opens=java.base/java.lang=ALL-UNNAMED",
+ //"--add-opens=java.base/java.lang.invoke=ALL-UNNAMED",
+ //"--add-opens=java.base/java.lang.reflect=ALL-UNNAMED",
+ //"--add-opens=java.base/java.io=ALL-UNNAMED",
+ "--add-opens=java.base/java.net=ALL-UNNAMED",
+ "--add-opens=java.base/java.nio=ALL-UNNAMED",
+ //"--add-opens=java.base/java.util=ALL-UNNAMED",
+ //"--add-opens=java.base/java.util.concurrent=ALL-UNNAMED",
+ //"--add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED",
+ "--add-opens=java.base/sun.nio.ch=ALL-UNNAMED",
+ //"--add-opens=java.base/sun.nio.cs=ALL-UNNAMED",
+ //"--add-opens=java.base/sun.security.action=ALL-UNNAMED",
+ //"--add-opens=java.base/sun.util.calendar=ALL-UNNAMED",
+ //"--add-opens=java.security.jgss/sun.security.krb5=ALL-UNNAMED",
+ ]
+ jvmArgs = sparkJava17CompatibleJvmArgs
+
finalizedBy jacocoTestReport
}
@@ -151,12 +160,12 @@ task integrationTest(type: Exec, dependsOn: [shadowJar, ':docker:quickstartSlim'
}
task sourcesJar(type: Jar) {
- classifier 'sources'
+ archiveClassifier = 'sources'
from sourceSets.main.allJava
}
task javadocJar(type: Jar, dependsOn: javadoc) {
- classifier 'javadoc'
+ archiveClassifier = 'javadoc'
from javadoc.destinationDir
}
@@ -224,3 +233,12 @@ nexusStaging {
username = System.getenv("NEXUS_USERNAME")
password = System.getenv("NEXUS_PASSWORD")
}
+
+task cleanExtraDirs {
+ delete "$projectDir/derby.log"
+ delete "$projectDir/src/test/resources/data/hive"
+ delete "$projectDir/src/test/resources/data/out.csv"
+ delete "$projectDir/src/test/resources/data/out_persist.csv"
+ delete "$projectDir/spark-smoke-test/venv"
+}
+clean.finalizedBy(cleanExtraDirs)
diff --git a/metadata-integration/java/spark-lineage/scripts/check_jar.sh b/metadata-integration/java/spark-lineage/scripts/check_jar.sh
index dd9cae68f31cb0..275b91304e7ee3 100755
--- a/metadata-integration/java/spark-lineage/scripts/check_jar.sh
+++ b/metadata-integration/java/spark-lineage/scripts/check_jar.sh
@@ -34,7 +34,9 @@ jar -tvf $jarFile |\
grep -v "linux/" |\
grep -v "darwin" |\
grep -v "MetadataChangeProposal.avsc" |\
- grep -v "aix"
+ grep -v "aix" |\
+ grep -v "library.properties" |\
+ grep -v "rootdoc.txt"
if [ $? -ne 0 ]; then
echo "✅ No unexpected class paths found in ${jarFile}"
diff --git a/metadata-integration/java/spark-lineage/spark-smoke-test/docker/SparkBase.Dockerfile b/metadata-integration/java/spark-lineage/spark-smoke-test/docker/SparkBase.Dockerfile
index 119338be6c2a9b..21d0701fcfcd68 100644
--- a/metadata-integration/java/spark-lineage/spark-smoke-test/docker/SparkBase.Dockerfile
+++ b/metadata-integration/java/spark-lineage/spark-smoke-test/docker/SparkBase.Dockerfile
@@ -17,7 +17,7 @@ RUN apt-get update -y && \
apt-get install /tmp/zulu-repo_1.0.0-3_all.deb && \
apt-get update && \
# apt-cache search zulu && \
- apt-get install -y --no-install-recommends zulu11-jre && \
+ apt-get install -y --no-install-recommends zulu17-jre && \
apt-get clean && \
curl -sS https://archive.apache.org/dist/spark/spark-${spark_version}/spark-${spark_version}-bin-hadoop${hadoop_version}.tgz -o spark.tgz && \
tar -xf spark.tgz && \
diff --git a/metadata-integration/java/spark-lineage/spark-smoke-test/python-spark-lineage-test/python_test_run.sh b/metadata-integration/java/spark-lineage/spark-smoke-test/python-spark-lineage-test/python_test_run.sh
index 429f692500c802..c06e2faec0bcba 100755
--- a/metadata-integration/java/spark-lineage/spark-smoke-test/python-spark-lineage-test/python_test_run.sh
+++ b/metadata-integration/java/spark-lineage/spark-smoke-test/python-spark-lineage-test/python_test_run.sh
@@ -7,25 +7,24 @@
saluation () {
echo "--------------------------------------------------------"
- echo "Starting execution $1"
+ echo "Starting execution $1 (properties: $2)"
echo "--------------------------------------------------------"
}
-saluation "HdfsIn2HdfsOut1.py"
-
+saluation "HdfsIn2HdfsOut1.py" $2
spark-submit --properties-file $2 HdfsIn2HdfsOut1.py
-saluation "HdfsIn2HdfsOut2.py"
+saluation "HdfsIn2HdfsOut2.py" $2
spark-submit --properties-file $2 HdfsIn2HdfsOut2.py
-saluation "HdfsIn2HiveCreateTable.py"
+saluation "HdfsIn2HiveCreateTable.py" $2
spark-submit --properties-file $2 HdfsIn2HiveCreateTable.py
-saluation "HdfsIn2HiveCreateInsertTable.py"
+saluation "HdfsIn2HiveCreateInsertTable.py" $2
spark-submit --properties-file $2 HdfsIn2HiveCreateInsertTable.py
-saluation "HiveInHiveOut.py"
+saluation "HiveInHiveOut.py" $2
spark-submit --properties-file $2 HiveInHiveOut.py
diff --git a/metadata-integration/java/spark-lineage/spark-smoke-test/spark-docker.conf b/metadata-integration/java/spark-lineage/spark-smoke-test/spark-docker.conf
index 43103c3db65ad1..a511d9f114f2b2 100644
--- a/metadata-integration/java/spark-lineage/spark-smoke-test/spark-docker.conf
+++ b/metadata-integration/java/spark-lineage/spark-smoke-test/spark-docker.conf
@@ -4,3 +4,7 @@ spark.jars file:///opt/workspace/datahub-spark-lineage*.jar
spark.extraListeners datahub.spark.DatahubSparkListener
spark.datahub.rest.server http://datahub-gms:8080
+
+spark.driver.extraJavaOptions --add-opens java.base/java.lang=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED
+spark.executor.extraJavaOptions --add-opens java.base/java.lang=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED
+
diff --git a/metadata-integration/java/spark-lineage/spark-smoke-test/test-spark-lineage/build.gradle b/metadata-integration/java/spark-lineage/spark-smoke-test/test-spark-lineage/build.gradle
index 12aa1775d6104e..6337f8c9beec63 100644
--- a/metadata-integration/java/spark-lineage/spark-smoke-test/test-spark-lineage/build.gradle
+++ b/metadata-integration/java/spark-lineage/spark-smoke-test/test-spark-lineage/build.gradle
@@ -17,17 +17,6 @@ repositories {
jcenter()
}
-tasks.withType(JavaCompile).configureEach {
- javaCompiler = javaToolchains.compilerFor {
- languageVersion = JavaLanguageVersion.of(8)
- }
-}
-tasks.withType(Test).configureEach {
- javaLauncher = javaToolchains.launcherFor {
- languageVersion = JavaLanguageVersion.of(8)
- }
-}
-
dependencies {
implementation 'org.apache.spark:spark-sql_2.11:2.4.8'
}
diff --git a/metadata-integration/java/spark-lineage/src/test/java/datahub/spark/TestCoalesceJobLineage.java b/metadata-integration/java/spark-lineage/src/test/java/datahub/spark/TestCoalesceJobLineage.java
index 2df468fc03e743..053055716eaa07 100644
--- a/metadata-integration/java/spark-lineage/src/test/java/datahub/spark/TestCoalesceJobLineage.java
+++ b/metadata-integration/java/spark-lineage/src/test/java/datahub/spark/TestCoalesceJobLineage.java
@@ -37,7 +37,7 @@ public class TestCoalesceJobLineage {
private static final String APP_NAME = "sparkCoalesceTestApp";
- private static final String TEST_RELATIVE_PATH = "../";
+ private static final String TEST_RELATIVE_PATH = "";
private static final String RESOURCE_DIR = "src/test/resources";
private static final String DATA_DIR = TEST_RELATIVE_PATH + RESOURCE_DIR + "/data";
private static final String WAREHOUSE_LOC = DATA_DIR + "/hive/warehouse/coalesce";
@@ -142,6 +142,9 @@ public void setup() {
"spark.datahub.parent.datajob_urn",
"urn:li:dataJob:(urn:li:dataFlow:(airflow,datahub_analytics_refresh,prod),load_dashboard_info_to_snowflake)")
.config("spark.sql.warehouse.dir", new File(WAREHOUSE_LOC).getAbsolutePath())
+ .config(
+ "javax.jdo.option.ConnectionURL",
+ "jdbc:derby:;databaseName=build/tmp/metastore_db_coalesce;create=true")
.enableHiveSupport()
.getOrCreate();
diff --git a/metadata-integration/java/spark-lineage/src/test/java/datahub/spark/TestSparkJobsLineage.java b/metadata-integration/java/spark-lineage/src/test/java/datahub/spark/TestSparkJobsLineage.java
index 3a70c10e0c1f9b..fa896814d16f62 100644
--- a/metadata-integration/java/spark-lineage/src/test/java/datahub/spark/TestSparkJobsLineage.java
+++ b/metadata-integration/java/spark-lineage/src/test/java/datahub/spark/TestSparkJobsLineage.java
@@ -191,6 +191,9 @@ public static void setup() {
.config("spark.datahub.metadata.dataset.platformInstance", DATASET_PLATFORM_INSTANCE)
.config("spark.datahub.metadata.dataset.env", DATASET_ENV.name())
.config("spark.sql.warehouse.dir", new File(WAREHOUSE_LOC).getAbsolutePath())
+ .config(
+ "javax.jdo.option.ConnectionURL",
+ "jdbc:derby:;databaseName=build/tmp/metastore_db_spark;create=true")
.enableHiveSupport()
.getOrCreate();
diff --git a/metadata-io/build.gradle b/metadata-io/build.gradle
index 48f80f06d07c2c..568b99acdf8943 100644
--- a/metadata-io/build.gradle
+++ b/metadata-io/build.gradle
@@ -62,7 +62,10 @@ dependencies {
testImplementation externalDependency.h2
testImplementation externalDependency.mysqlConnector
testImplementation externalDependency.neo4jHarness
- testImplementation (externalDependency.neo4jApoc) {
+ testImplementation (externalDependency.neo4jApocCore) {
+ exclude group: 'org.yaml', module: 'snakeyaml'
+ }
+ testImplementation (externalDependency.neo4jApocCommon) {
exclude group: 'org.yaml', module: 'snakeyaml'
}
testImplementation externalDependency.mockito
diff --git a/metadata-io/src/main/java/com/linkedin/metadata/graph/neo4j/Neo4jGraphService.java b/metadata-io/src/main/java/com/linkedin/metadata/graph/neo4j/Neo4jGraphService.java
index 217d54c5c0b0ff..c8d3147711eba5 100644
--- a/metadata-io/src/main/java/com/linkedin/metadata/graph/neo4j/Neo4jGraphService.java
+++ b/metadata-io/src/main/java/com/linkedin/metadata/graph/neo4j/Neo4jGraphService.java
@@ -432,8 +432,8 @@ private Pair> generateLineageStatementAndParameters(
+ "(b)) "
+ "WHERE a <> b "
+ " AND ALL(rt IN relationships(path) WHERE "
- + " (EXISTS(rt.source) AND rt.source = 'UI') OR "
- + " (NOT EXISTS(rt.createdOn) AND NOT EXISTS(rt.updatedOn)) OR "
+ + " (rt.source IS NOT NULL AND rt.source = 'UI') OR "
+ + " (rt.createdOn IS NULL AND rt.updatedOn IS NULL) OR "
+ " ($startTimeMillis <= rt.createdOn <= $endTimeMillis OR "
+ " $startTimeMillis <= rt.updatedOn <= $endTimeMillis) "
+ " ) "
diff --git a/metadata-jobs/mae-consumer/build.gradle b/metadata-jobs/mae-consumer/build.gradle
index fcb8b62e4ac9d5..2e068d5a3501e2 100644
--- a/metadata-jobs/mae-consumer/build.gradle
+++ b/metadata-jobs/mae-consumer/build.gradle
@@ -60,6 +60,7 @@ task avroSchemaSources(type: Copy) {
}
compileJava.dependsOn avroSchemaSources
+processResources.dependsOn avroSchemaSources
clean {
project.delete("src/main/resources/avro")
diff --git a/metadata-jobs/mce-consumer/build.gradle b/metadata-jobs/mce-consumer/build.gradle
index 97eec9fcff051c..5fa65c06de7149 100644
--- a/metadata-jobs/mce-consumer/build.gradle
+++ b/metadata-jobs/mce-consumer/build.gradle
@@ -1,8 +1,8 @@
plugins {
id 'java'
+ id 'pegasus'
}
-apply plugin: 'pegasus'
configurations {
avro
@@ -49,6 +49,7 @@ task avroSchemaSources(type: Copy) {
}
compileJava.dependsOn avroSchemaSources
+processResources.dependsOn avroSchemaSources
clean {
project.delete("src/main/resources/avro")
diff --git a/metadata-jobs/pe-consumer/build.gradle b/metadata-jobs/pe-consumer/build.gradle
index 81e8b8c9971f00..2fd19af92971e2 100644
--- a/metadata-jobs/pe-consumer/build.gradle
+++ b/metadata-jobs/pe-consumer/build.gradle
@@ -1,7 +1,7 @@
plugins {
id 'java'
+ id 'pegasus'
}
-apply plugin: 'pegasus'
configurations {
avro
@@ -37,6 +37,7 @@ task avroSchemaSources(type: Copy) {
}
compileJava.dependsOn avroSchemaSources
+processResources.dependsOn avroSchemaSources
clean {
project.delete("src/main/resources/avro")
diff --git a/metadata-models-custom/build.gradle b/metadata-models-custom/build.gradle
index 71d3b0fd1f736c..3ac08dca7c0dbe 100644
--- a/metadata-models-custom/build.gradle
+++ b/metadata-models-custom/build.gradle
@@ -16,8 +16,8 @@ buildscript {
plugins {
id 'base'
id 'maven-publish'
+ id 'pegasus'
}
-apply plugin: 'pegasus'
if (project.hasProperty('projVersion')) {
project.version = project.projVersion
diff --git a/metadata-models-validator/build.gradle b/metadata-models-validator/build.gradle
index c8d1d2e6651d6c..1dae53e817ae14 100644
--- a/metadata-models-validator/build.gradle
+++ b/metadata-models-validator/build.gradle
@@ -1,4 +1,6 @@
-apply plugin: 'java'
+plugins {
+ id 'java'
+}
dependencies {
implementation project(":entity-registry")
diff --git a/metadata-models/build.gradle b/metadata-models/build.gradle
index e90a4042c1921d..04c90fa444f0ca 100644
--- a/metadata-models/build.gradle
+++ b/metadata-models/build.gradle
@@ -1,20 +1,12 @@
import io.datahubproject.GenerateJsonSchemaTask
-apply plugin: 'java-library'
-apply plugin: 'pegasus'
-apply plugin: 'org.hidetake.swagger.generator'
-
-tasks.withType(JavaCompile).configureEach {
- javaCompiler = javaToolchains.compilerFor {
- languageVersion = JavaLanguageVersion.of(8)
- }
-}
-tasks.withType(Test).configureEach {
- javaLauncher = javaToolchains.launcherFor {
- languageVersion = JavaLanguageVersion.of(8)
- }
+plugins {
+ id 'pegasus'
+ id 'java-library'
+ id 'org.hidetake.swagger.generator'
}
+
dependencies {
api spec.product.pegasus.data
constraints {
@@ -35,7 +27,7 @@ dependencies {
swaggerCodegen externalDependency.swaggerCli
testImplementation externalDependency.guava
- testImplementation externalDependency.testngJava8
+ testImplementation externalDependency.testng
}
sourceSets {
diff --git a/metadata-service/auth-config/build.gradle b/metadata-service/auth-config/build.gradle
index c7a1128897dd5c..8302e3b0c2fe67 100644
--- a/metadata-service/auth-config/build.gradle
+++ b/metadata-service/auth-config/build.gradle
@@ -1,4 +1,6 @@
-apply plugin: 'java'
+plugins {
+ id 'java'
+}
dependencies {
implementation project(path: ':metadata-models')
diff --git a/metadata-service/auth-filter/build.gradle b/metadata-service/auth-filter/build.gradle
index 61e9015adc9423..9d763ca11421b5 100644
--- a/metadata-service/auth-filter/build.gradle
+++ b/metadata-service/auth-filter/build.gradle
@@ -1,4 +1,6 @@
-apply plugin: 'java'
+plugins {
+ id 'java'
+}
dependencies {
implementation project(':metadata-auth:auth-api')
diff --git a/metadata-service/auth-impl/build.gradle b/metadata-service/auth-impl/build.gradle
index 60d622dea54475..4f4b0658caf249 100644
--- a/metadata-service/auth-impl/build.gradle
+++ b/metadata-service/auth-impl/build.gradle
@@ -1,4 +1,6 @@
-apply plugin: 'java'
+plugins {
+ id 'java'
+}
compileJava {
diff --git a/metadata-service/auth-impl/src/test/java/com/datahub/authorization/fieldresolverprovider/DataPlatformInstanceFieldResolverProviderTest.java b/metadata-service/auth-impl/src/test/java/com/datahub/authorization/fieldresolverprovider/DataPlatformInstanceFieldResolverProviderTest.java
index d5d5b0c4e6c71d..f03113f3eb9bdb 100644
--- a/metadata-service/auth-impl/src/test/java/com/datahub/authorization/fieldresolverprovider/DataPlatformInstanceFieldResolverProviderTest.java
+++ b/metadata-service/auth-impl/src/test/java/com/datahub/authorization/fieldresolverprovider/DataPlatformInstanceFieldResolverProviderTest.java
@@ -8,7 +8,7 @@
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.times;
import static org.mockito.Mockito.verify;
-import static org.mockito.Mockito.verifyZeroInteractions;
+import static org.mockito.Mockito.verifyNoMoreInteractions;
import static org.mockito.Mockito.when;
import static org.testng.Assert.assertEquals;
import static org.testng.Assert.assertTrue;
@@ -68,7 +68,7 @@ public void shouldReturnFieldValueWithResourceSpecIfTypeIsDataPlatformInstance()
assertEquals(
Set.of(DATA_PLATFORM_INSTANCE_URN), result.getFieldValuesFuture().join().getValues());
- verifyZeroInteractions(entityClientMock);
+ verifyNoMoreInteractions(entityClientMock);
}
@Test
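
`verifyZeroInteractions` was deprecated in Mockito 3.x and removed in 4.x, hence the switch above. A tiny, hypothetical sketch of the two modern replacements (the mocked type here is arbitrary, not the entity client from the test):

```java
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.verify;
import static org.mockito.Mockito.verifyNoInteractions;
import static org.mockito.Mockito.verifyNoMoreInteractions;

import java.util.List;

public class MockitoMigrationSketch {
  @SuppressWarnings("unchecked")
  public static void main(String[] args) {
    List<String> client = mock(List.class);

    // Strict direct replacement: the mock was never touched at all.
    verifyNoInteractions(client);

    // Looser variant used in the test above: every interaction that happened
    // has already been verified, and nothing else occurred.
    client.add("x");
    verify(client).add("x");
    verifyNoMoreInteractions(client);
  }
}
```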
diff --git a/metadata-service/auth-servlet-impl/build.gradle b/metadata-service/auth-servlet-impl/build.gradle
index 7945b3b4e9a06c..b8310bbd4ebc01 100644
--- a/metadata-service/auth-servlet-impl/build.gradle
+++ b/metadata-service/auth-servlet-impl/build.gradle
@@ -1,4 +1,6 @@
-apply plugin: 'java'
+plugins {
+ id 'java'
+}
dependencies {
implementation project(':metadata-auth:auth-api')
diff --git a/metadata-service/factories/build.gradle b/metadata-service/factories/build.gradle
index 86644e3b034da4..145ec7e65188c5 100644
--- a/metadata-service/factories/build.gradle
+++ b/metadata-service/factories/build.gradle
@@ -1,4 +1,6 @@
-apply plugin: 'java-library'
+plugins {
+ id 'java-library'
+}
dependencies {
api project(':metadata-io')
diff --git a/metadata-service/graphql-servlet-impl/build.gradle b/metadata-service/graphql-servlet-impl/build.gradle
index 51f67631159d34..57676982421186 100644
--- a/metadata-service/graphql-servlet-impl/build.gradle
+++ b/metadata-service/graphql-servlet-impl/build.gradle
@@ -1,4 +1,6 @@
-apply plugin: 'java'
+plugins {
+ id 'java'
+}
dependencies {
implementation project(':datahub-graphql-core')
diff --git a/metadata-service/openapi-servlet/build.gradle b/metadata-service/openapi-servlet/build.gradle
index 1909b4862d294a..0430d4427528dd 100644
--- a/metadata-service/openapi-servlet/build.gradle
+++ b/metadata-service/openapi-servlet/build.gradle
@@ -1,4 +1,6 @@
-apply plugin: 'java'
+plugins {
+ id 'java'
+}
dependencies {
diff --git a/metadata-service/plugin/build.gradle b/metadata-service/plugin/build.gradle
index 00a6384b923a02..3f91b8f6ae6ba5 100644
--- a/metadata-service/plugin/build.gradle
+++ b/metadata-service/plugin/build.gradle
@@ -1,4 +1,6 @@
-apply plugin: 'java'
+plugins {
+ id 'java'
+}
dependencies {
@@ -30,4 +32,4 @@ test {
clean {
dependsOn ':metadata-service:plugin:src:test:sample-test-plugins:clean'
-}
+}
\ No newline at end of file
diff --git a/metadata-service/plugin/src/test/sample-test-plugins/build.gradle b/metadata-service/plugin/src/test/sample-test-plugins/build.gradle
index f299a35db0f64d..d4b2b4c92ad631 100644
--- a/metadata-service/plugin/src/test/sample-test-plugins/build.gradle
+++ b/metadata-service/plugin/src/test/sample-test-plugins/build.gradle
@@ -1,4 +1,6 @@
-apply plugin: 'java'
+plugins {
+ id 'java'
+}
jar {
archiveFileName = "sample-plugins.jar"
diff --git a/metadata-service/restli-api/build.gradle b/metadata-service/restli-api/build.gradle
index 352738d01f8da7..505320e8267eed 100644
--- a/metadata-service/restli-api/build.gradle
+++ b/metadata-service/restli-api/build.gradle
@@ -1,4 +1,6 @@
-apply plugin: 'pegasus'
+plugins {
+ id 'pegasus'
+}
dependencies {
dataModel project(':metadata-models')
@@ -17,4 +19,4 @@ dependencies {
because("CVE-2023-1428, CVE-2023-32731")
}
}
-}
\ No newline at end of file
+}
diff --git a/metadata-service/restli-client/build.gradle b/metadata-service/restli-client/build.gradle
index 7cad1981ad9112..86336755dc0954 100644
--- a/metadata-service/restli-client/build.gradle
+++ b/metadata-service/restli-client/build.gradle
@@ -1,5 +1,7 @@
-apply plugin: 'pegasus'
-apply plugin: 'java-library'
+plugins {
+ id 'pegasus'
+ id 'java-library'
+}
dependencies {
api project(':metadata-service:restli-api')
diff --git a/metadata-service/restli-servlet-impl/build.gradle b/metadata-service/restli-servlet-impl/build.gradle
index de6fb6690e693b..ec5b645ee233c7 100644
--- a/metadata-service/restli-servlet-impl/build.gradle
+++ b/metadata-service/restli-servlet-impl/build.gradle
@@ -1,5 +1,7 @@
-apply plugin: 'java'
-apply plugin: 'pegasus'
+plugins {
+ id 'java'
+ id 'pegasus'
+}
sourceSets {
integTest {
diff --git a/metadata-service/schema-registry-api/build.gradle b/metadata-service/schema-registry-api/build.gradle
index 077d7d4f2d6a44..c146d5202fef9a 100644
--- a/metadata-service/schema-registry-api/build.gradle
+++ b/metadata-service/schema-registry-api/build.gradle
@@ -1,5 +1,8 @@
-apply plugin: 'java'
-apply plugin: 'org.hidetake.swagger.generator'
+plugins {
+ id 'org.hidetake.swagger.generator'
+ id 'java'
+}
+
dependencies {
// Dependencies for open api
diff --git a/metadata-service/schema-registry-servlet/build.gradle b/metadata-service/schema-registry-servlet/build.gradle
index 554ac696c94fdb..7bab51d51a86c0 100644
--- a/metadata-service/schema-registry-servlet/build.gradle
+++ b/metadata-service/schema-registry-servlet/build.gradle
@@ -1,4 +1,6 @@
-apply plugin: 'java'
+plugins {
+ id 'java'
+}
dependencies {
implementation project(':metadata-service:factories')
diff --git a/metadata-service/services/build.gradle b/metadata-service/services/build.gradle
index b6af3d330d185b..c683b0c75f40a8 100644
--- a/metadata-service/services/build.gradle
+++ b/metadata-service/services/build.gradle
@@ -1,5 +1,7 @@
-apply plugin: 'java'
-apply plugin: 'org.hidetake.swagger.generator'
+plugins {
+ id 'org.hidetake.swagger.generator'
+ id 'java'
+}
configurations {
enhance
diff --git a/metadata-service/servlet/build.gradle b/metadata-service/servlet/build.gradle
index eb2cd9c2d3de7c..f961bf6a9de7eb 100644
--- a/metadata-service/servlet/build.gradle
+++ b/metadata-service/servlet/build.gradle
@@ -1,4 +1,6 @@
-apply plugin: 'java'
+plugins {
+ id 'java'
+}
dependencies {
implementation project(':metadata-io')
diff --git a/metadata-utils/build.gradle b/metadata-utils/build.gradle
index 7bc6aa2d434424..3d65675219624d 100644
--- a/metadata-utils/build.gradle
+++ b/metadata-utils/build.gradle
@@ -1,4 +1,6 @@
-apply plugin: 'java-library'
+plugins {
+ id 'java-library'
+}
dependencies {
api externalDependency.avro
diff --git a/mock-entity-registry/build.gradle b/mock-entity-registry/build.gradle
index 12d7e58eee0a1a..8242d6451dd606 100644
--- a/mock-entity-registry/build.gradle
+++ b/mock-entity-registry/build.gradle
@@ -1,4 +1,6 @@
-apply plugin: 'java'
+plugins {
+ id 'java'
+}
dependencies {
implementation project(':entity-registry')
diff --git a/smoke-test/build.gradle b/smoke-test/build.gradle
index ee0ea3c7be384f..1614a4b8527dc9 100644
--- a/smoke-test/build.gradle
+++ b/smoke-test/build.gradle
@@ -11,10 +11,10 @@ node {
}
// Version of node to use.
- version = '16.8.0'
+ version = '21.2.0'
// Version of Yarn to use.
- yarnVersion = '1.22.0'
+ yarnVersion = '1.22.1'
// Base URL for fetching node distributions (set nodeDistBaseUrl if you have a mirror).
if (project.hasProperty('nodeDistBaseUrl')) {
@@ -30,11 +30,12 @@ node {
yarnWorkDir = file("${project.projectDir}/.gradle/yarn")
// Set the work directory where node_modules should be located
- nodeModulesDir = file("${project.projectDir}")
+ nodeProjectDir = file("${project.projectDir}")
}
task yarnInstall(type: YarnTask) {
println "Root directory: ${project.rootDir}";
+ environment = ['NODE_OPTIONS': '--openssl-legacy-provider']
args = ['install', '--cwd', "${project.rootDir}/smoke-test/tests/cypress"]
}
\ No newline at end of file
diff --git a/test-models/build.gradle b/test-models/build.gradle
index c74f7249fa1d9e..e8733f0525870b 100644
--- a/test-models/build.gradle
+++ b/test-models/build.gradle
@@ -1,17 +1,9 @@
-apply plugin: 'pegasus'
-apply plugin: 'java-library'
-
-tasks.withType(JavaCompile).configureEach {
- javaCompiler = javaToolchains.compilerFor {
- languageVersion = JavaLanguageVersion.of(8)
- }
-}
-tasks.withType(Test).configureEach {
- javaLauncher = javaToolchains.launcherFor {
- languageVersion = JavaLanguageVersion.of(8)
- }
+plugins {
+ id 'pegasus'
+ id 'java-library'
}
+
dependencies {
implementation spec.product.pegasus.data
implementation externalDependency.commonsIo
diff --git a/vercel.json b/vercel.json
index d5515e68b05bdb..a1815cab8ae88c 100644
--- a/vercel.json
+++ b/vercel.json
@@ -1,5 +1,5 @@
{
- "buildCommand": "./gradlew :docs-website:build",
+ "buildCommand": "./gradlew -PuseSystemNode=true :docs-website:build",
"github": {
"silent": true,
"autoJobCancelation": true
From caef6771b828d8ee94f76801a9121f4e1a2e7561 Mon Sep 17 00:00:00 2001
From: Harshal Sheth
Date: Fri, 15 Dec 2023 15:07:56 -0500
Subject: [PATCH 033/540] feat(ingest/redshift): drop repeated operations
(#9440)
---
metadata-ingestion/setup.py | 6 +-
.../ingestion/source/redshift/report.py | 3 +-
.../ingestion/source/redshift/usage.py | 68 +++++++++++++++++--
.../redshift-usage/test_redshift_usage.py | 54 ++++++++++++++-
4 files changed, 121 insertions(+), 10 deletions(-)
diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py
index 5d15d7167b63e8..1bc1bc5100b08d 100644
--- a/metadata-ingestion/setup.py
+++ b/metadata-ingestion/setup.py
@@ -368,7 +368,11 @@
| {"psycopg2-binary", "pymysql>=1.0.2"},
"pulsar": {"requests"},
"redash": {"redash-toolbelt", "sql-metadata"} | sqllineage_lib,
- "redshift": sql_common | redshift_common | usage_common | sqlglot_lib,
+ "redshift": sql_common
+ | redshift_common
+ | usage_common
+ | sqlglot_lib
+ | {"cachetools"},
"s3": {*s3_base, *data_lake_profiling},
"gcs": {*s3_base, *data_lake_profiling},
"sagemaker": aws_common,
diff --git a/metadata-ingestion/src/datahub/ingestion/source/redshift/report.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/report.py
index b845580f359394..333c851650fb3a 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/redshift/report.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/report.py
@@ -29,7 +29,8 @@ class RedshiftReport(ProfilingSqlReport, IngestionStageReport, BaseTimeWindowRep
lineage_mem_size: Dict[str, str] = field(default_factory=TopKDict)
tables_in_mem_size: Dict[str, str] = field(default_factory=TopKDict)
views_in_mem_size: Dict[str, str] = field(default_factory=TopKDict)
- num_operational_stats_skipped: int = 0
+ num_operational_stats_filtered: int = 0
+ num_repeated_operations_dropped: int = 0
num_usage_stat_skipped: int = 0
num_lineage_tables_dropped: int = 0
num_lineage_dropped_query_parser: int = 0
diff --git a/metadata-ingestion/src/datahub/ingestion/source/redshift/usage.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/usage.py
index c789e605b9c29f..409027a8805a0d 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/redshift/usage.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/usage.py
@@ -4,6 +4,7 @@
from datetime import datetime
from typing import Callable, Dict, Iterable, List, Optional, Tuple, Union
+import cachetools
import pydantic.error_wrappers
import redshift_connector
from pydantic.fields import Field
@@ -251,7 +252,7 @@ def _get_workunits_internal(
) -> Iterable[MetadataWorkUnit]:
self.report.num_usage_workunits_emitted = 0
self.report.num_usage_stat_skipped = 0
- self.report.num_operational_stats_skipped = 0
+ self.report.num_operational_stats_filtered = 0
if self.config.include_operational_stats:
self.report.report_ingestion_stage_start(USAGE_EXTRACTION_OPERATIONAL_STATS)
@@ -304,8 +305,13 @@ def _gen_operation_aspect_workunits(
)
# Generate operation aspect work units from the access events
- yield from self._gen_operation_aspect_workunits_from_access_events(
- access_events_iterable, all_tables=all_tables
+ yield from (
+ mcpw.as_workunit()
+ for mcpw in self._drop_repeated_operations(
+ self._gen_operation_aspect_workunits_from_access_events(
+ access_events_iterable, all_tables=all_tables
+ )
+ )
)
def _should_process_event(
@@ -366,11 +372,61 @@ def _gen_access_events_from_history_query(
yield access_event
results = cursor.fetchmany()
+ def _drop_repeated_operations(
+ self, events: Iterable[MetadataChangeProposalWrapper]
+ ) -> Iterable[MetadataChangeProposalWrapper]:
+ """Drop repeated operations on the same entity.
+
+ ASSUMPTION: Events are ordered by lastUpdatedTimestamp, descending.
+
+ Operations are only dropped if they occur within DROP_WINDOW_SEC seconds of
+ each other and have the same operation type, user, and entity.
+
+ This is particularly useful when we see a string of insert operations
+ that are all really part of the same overall operation.
+ """
+
+ OPERATION_CACHE_MAXSIZE = 1000
+ DROP_WINDOW_SEC = 10
+
+ # All timestamps are in milliseconds.
+ timestamp_low_watermark = 0
+
+ def timer():
+ return -timestamp_low_watermark
+
+ # dict of entity urn -> (last event's actor, operation type)
+ # TODO: Remove the type ignore and use TTLCache[key_type, value_type] directly once that's supported in Python 3.9.
+ last_events: Dict[str, Tuple[Optional[str], str]] = cachetools.TTLCache( # type: ignore[assignment]
+ maxsize=OPERATION_CACHE_MAXSIZE, ttl=DROP_WINDOW_SEC * 1000, timer=timer
+ )
+
+ for event in events:
+ assert isinstance(event.aspect, OperationClass)
+
+ timestamp_low_watermark = min(
+ timestamp_low_watermark, event.aspect.lastUpdatedTimestamp
+ )
+
+ urn = event.entityUrn
+ assert urn
+ assert isinstance(event.aspect.operationType, str)
+ value: Tuple[Optional[str], str] = (
+ event.aspect.actor,
+ event.aspect.operationType,
+ )
+ if urn in last_events and last_events[urn] == value:
+ self.report.num_repeated_operations_dropped += 1
+ continue
+
+ last_events[urn] = value
+ yield event
+
def _gen_operation_aspect_workunits_from_access_events(
self,
events_iterable: Iterable[RedshiftAccessEvent],
all_tables: Dict[str, Dict[str, List[Union[RedshiftView, RedshiftTable]]]],
- ) -> Iterable[MetadataWorkUnit]:
+ ) -> Iterable[MetadataChangeProposalWrapper]:
self.report.num_operational_stats_workunits_emitted = 0
for event in events_iterable:
if not (
@@ -384,7 +440,7 @@ def _gen_operation_aspect_workunits_from_access_events(
continue
if not self._should_process_event(event, all_tables=all_tables):
- self.report.num_operational_stats_skipped += 1
+ self.report.num_operational_stats_filtered += 1
continue
assert event.operation_type in ["insert", "delete"]
@@ -406,7 +462,7 @@ def _gen_operation_aspect_workunits_from_access_events(
resource: str = f"{event.database}.{event.schema_}.{event.table}".lower()
yield MetadataChangeProposalWrapper(
entityUrn=self.dataset_urn_builder(resource), aspect=operation_aspect
- ).as_workunit()
+ )
self.report.num_operational_stats_workunits_emitted += 1
def _aggregate_access_events(
diff --git a/metadata-ingestion/tests/integration/redshift-usage/test_redshift_usage.py b/metadata-ingestion/tests/integration/redshift-usage/test_redshift_usage.py
index 74eec82b39ba3d..a9eebb8d54154e 100644
--- a/metadata-ingestion/tests/integration/redshift-usage/test_redshift_usage.py
+++ b/metadata-ingestion/tests/integration/redshift-usage/test_redshift_usage.py
@@ -2,11 +2,11 @@
import pathlib
from pathlib import Path
from typing import Dict, List, Union
-from unittest.mock import Mock, patch
+from unittest.mock import MagicMock, Mock, patch
from freezegun import freeze_time
-from datahub.emitter.mce_builder import make_dataset_urn
+from datahub.emitter.mce_builder import make_dataset_urn, make_user_urn
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.ingestion.sink.file import write_metadata_file
from datahub.ingestion.source.redshift.config import RedshiftConfig
@@ -20,6 +20,7 @@
MetadataChangeEvent,
MetadataChangeProposal,
)
+from datahub.metadata.schema_classes import OperationClass, OperationTypeClass
from tests.test_helpers import mce_helpers
FROZEN_TIME = "2021-09-15 09:00:00"
@@ -243,3 +244,52 @@ def load_access_events(test_resources_dir: pathlib.Path) -> List[Dict]:
with access_events_history_file.open() as access_events_json:
access_events = json.loads(access_events_json.read())
return access_events
+
+
+def test_duplicate_operations_dropped():
+ report = RedshiftReport()
+ usage_extractor = RedshiftUsageExtractor(
+ config=MagicMock(),
+ connection=MagicMock(),
+ report=report,
+ dataset_urn_builder=MagicMock(),
+ redundant_run_skip_handler=None,
+ )
+
+ user = make_user_urn("jdoe")
+ urnA = "urn:li:dataset:(urn:li:dataPlatform:redshift,db.schema.tableA,PROD)"
+ urnB = "urn:li:dataset:(urn:li:dataPlatform:redshift,db.schema.tableB,PROD)"
+
+ opA1 = MetadataChangeProposalWrapper(
+ entityUrn=urnA,
+ aspect=OperationClass(
+ timestampMillis=100 * 1000,
+ lastUpdatedTimestamp=95 * 1000,
+ actor=user,
+ operationType=OperationTypeClass.INSERT,
+ ),
+ )
+ opB1 = MetadataChangeProposalWrapper(
+ entityUrn=urnB,
+ aspect=OperationClass(
+ timestampMillis=101 * 1000,
+ lastUpdatedTimestamp=94 * 1000,
+ actor=user,
+ operationType=OperationTypeClass.INSERT,
+ ),
+ )
+ opA2 = MetadataChangeProposalWrapper(
+ entityUrn=urnA,
+ aspect=OperationClass(
+ timestampMillis=102 * 1000,
+ lastUpdatedTimestamp=90 * 1000,
+ actor=user,
+ operationType=OperationTypeClass.INSERT,
+ ),
+ )
+
+ dedups = list(usage_extractor._drop_repeated_operations([opA1, opB1, opA2]))
+ assert dedups == [
+ opA1,
+ opB1,
+ ]
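
The deduplication above leans on a small trick: `cachetools.TTLCache` normally expires entries against a wall clock, but here the `timer` is driven by a low watermark over the event timestamps, so cached entries age out as older events stream in (events arrive newest-first). The sketch below is illustrative only, not the DataHub implementation; the `Event` tuple shape and function names are invented for the example, and it assumes `cachetools` is installed.

```python
# Illustrative sketch of the dedup approach above (not the DataHub code).
# Events arrive ordered by timestamp *descending*; a TTLCache with a custom
# timer drops repeats of (actor, operation type) per entity within a window.
from typing import Dict, Iterable, Iterator, Optional, Tuple

import cachetools

Event = Tuple[str, Optional[str], str, int]  # (entity_urn, actor, op_type, ts_ms)


def drop_repeated_operations(
    events: Iterable[Event],
    window_ms: int = 10_000,
    maxsize: int = 1000,
) -> Iterator[Event]:
    low_watermark: Optional[int] = None  # lowest timestamp seen so far, in ms

    def timer() -> int:
        # TTLCache needs a non-decreasing clock. Because events are sorted
        # newest-first, the negated low watermark only ever increases.
        return -low_watermark if low_watermark is not None else 0

    # entity urn -> (actor, op_type) of the most recently emitted event
    last_seen: Dict[str, Tuple[Optional[str], str]] = cachetools.TTLCache(  # type: ignore[assignment]
        maxsize=maxsize, ttl=window_ms, timer=timer
    )

    for entity, actor, op_type, ts_ms in events:
        low_watermark = ts_ms if low_watermark is None else min(low_watermark, ts_ms)
        value = (actor, op_type)
        if entity in last_seen and last_seen[entity] == value:
            continue  # same actor + op on the same entity within the window: drop
        last_seen[entity] = value
        yield entity, actor, op_type, ts_ms


if __name__ == "__main__":
    events = [
        ("urn:tableA", "urn:li:corpuser:jdoe", "INSERT", 95_000),
        ("urn:tableB", "urn:li:corpuser:jdoe", "INSERT", 94_000),
        ("urn:tableA", "urn:li:corpuser:jdoe", "INSERT", 90_000),  # dropped as a repeat
    ]
    assert [e[0] for e in drop_repeated_operations(events)] == ["urn:tableA", "urn:tableB"]
```
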
From e58e2bf3be6cf43923ff400667406ee6dc95cd3a Mon Sep 17 00:00:00 2001
From: kushagra-apptware <81357546+kushagra-apptware@users.noreply.github.com>
Date: Mon, 18 Dec 2023 11:02:33 +0530
Subject: [PATCH 034/540] feat: Deprecation 'Note' changed to Markdown
Renderable (#9396)
Setting auto-merge after test cases pass.
---
.../EntityDropdown/UpdateDeprecationModal.tsx | 14 +++-
.../components/styled/DeprecationPill.tsx | 82 +++++++++++++++++--
.../tests/cypress/cypress/support/commands.js | 2 +-
3 files changed, 86 insertions(+), 12 deletions(-)
diff --git a/datahub-web-react/src/app/entity/shared/EntityDropdown/UpdateDeprecationModal.tsx b/datahub-web-react/src/app/entity/shared/EntityDropdown/UpdateDeprecationModal.tsx
index 6ae893e12575fd..25527497b33a81 100644
--- a/datahub-web-react/src/app/entity/shared/EntityDropdown/UpdateDeprecationModal.tsx
+++ b/datahub-web-react/src/app/entity/shared/EntityDropdown/UpdateDeprecationModal.tsx
@@ -1,7 +1,10 @@
import React from 'react';
-import { Button, DatePicker, Form, Input, message, Modal } from 'antd';
+import { Button, DatePicker, Form, message, Modal } from 'antd';
+import styled from 'styled-components';
import { useBatchUpdateDeprecationMutation } from '../../../../graphql/mutations.generated';
import { handleBatchError } from '../utils';
+import { Editor } from '../tabs/Documentation/components/editor/Editor';
+import { ANTD_GRAY } from '../constants';
type Props = {
urns: string[];
@@ -9,6 +12,10 @@ type Props = {
refetch?: () => void;
};
+const StyledEditor = styled(Editor)`
+ border: 1px solid ${ANTD_GRAY[4.5]};
+`;
+
export const UpdateDeprecationModal = ({ urns, onClose, refetch }: Props) => {
const [batchUpdateDeprecation] = useBatchUpdateDeprecationMutation();
const [form] = Form.useForm();
@@ -64,10 +71,11 @@ export const UpdateDeprecationModal = ({ urns, onClose, refetch }: Props) => {
>
}
+ width='40%'
>
-
+
+
diff --git a/datahub-web-react/src/app/entity/shared/components/styled/DeprecationPill.tsx b/datahub-web-react/src/app/entity/shared/components/styled/DeprecationPill.tsx
index f60a74247ebcc2..9ec2aab193aa0b 100644
--- a/datahub-web-react/src/app/entity/shared/components/styled/DeprecationPill.tsx
+++ b/datahub-web-react/src/app/entity/shared/components/styled/DeprecationPill.tsx
@@ -1,4 +1,4 @@
-import React from 'react';
+import React, { useState } from 'react';
import { InfoCircleOutlined } from '@ant-design/icons';
import { Divider, message, Modal, Popover, Tooltip, Typography } from 'antd';
import { blue } from '@ant-design/colors';
@@ -8,6 +8,8 @@ import { Deprecation } from '../../../../../types.generated';
import { getLocaleTimezone } from '../../../../shared/time/timeUtils';
import { ANTD_GRAY } from '../../constants';
import { useBatchUpdateDeprecationMutation } from '../../../../../graphql/mutations.generated';
+import { Editor } from '../../tabs/Documentation/components/editor/Editor';
+import StripMarkdownText, { removeMarkdown } from './StripMarkdownText';
const DeprecatedContainer = styled.div`
height: 18px;
@@ -38,11 +40,6 @@ const DeprecatedTitle = styled(Typography.Text)`
font-weight: bold;
`;
-const DeprecatedSubTitle = styled(Typography.Text)`
- display: block;
- margin-bottom: 5px;
-`;
-
const LastEvaluatedAtLabel = styled.div`
padding: 0;
margin: 0;
@@ -70,15 +67,42 @@ const IconGroup = styled.div`
}
`;
+const DescriptionContainer = styled.div`
+ position: relative;
+ display: flex;
+ flex-direction: column;
+ width: 100%;
+ height: 100%;
+ min-height: 22px;
+ margin-bottom: 14px;
+`;
+const StyledViewer = styled(Editor)`
+ padding-right: 8px;
+ display: block;
+
+ .remirror-editor.ProseMirror {
+ padding: 0;
+ }
+`;
+
+const ExpandedActions = styled.div`
+ height: 10px;
+`;
+const ReadLessText = styled(Typography.Link)`
+ margin-right: 4px;
+`;
type Props = {
urn: string;
deprecation: Deprecation;
refetch?: () => void;
showUndeprecate: boolean | null;
};
+const ABBREVIATED_LIMIT = 80;
export const DeprecationPill = ({ deprecation, urn, refetch, showUndeprecate }: Props) => {
const [batchUpdateDeprecationMutation] = useBatchUpdateDeprecationMutation();
+ const [expanded, setExpanded] = useState(false);
+ const overLimit = deprecation?.note && removeMarkdown(deprecation?.note).length > 80;
/**
* Deprecation Decommission Timestamp
*/
@@ -131,14 +155,56 @@ export const DeprecationPill = ({ deprecation, urn, refetch, showUndeprecate }:
return (
{deprecation?.note !== '' && Deprecation note }
{isDividerNeeded && }
- {deprecation?.note !== '' && {deprecation.note} }
+
+ {expanded || !overLimit ? (
+ <>
+ {
+ deprecation?.note && deprecation?.note !== '' &&
+ <>
+
+
+ {overLimit && (
+ {
+ setExpanded(false);
+ }}
+ >
+ Read Less
+
+ )}
+
+ >
+ }
+ >
+ ) : (
+ <>
+
+ {
+ setExpanded(true);
+ }}
+ >
+ Read More
+
+ >
+ }
+ shouldWrap
+ >
+ {deprecation.note}
+
+ >
+ )}
+
{deprecation?.decommissionTime !== null && (
diff --git a/smoke-test/tests/cypress/cypress/support/commands.js b/smoke-test/tests/cypress/cypress/support/commands.js
index 5e3664f944edf1..ffbd050488181b 100644
--- a/smoke-test/tests/cypress/cypress/support/commands.js
+++ b/smoke-test/tests/cypress/cypress/support/commands.js
@@ -171,7 +171,7 @@ Cypress.Commands.add("deleteFromDropdown", () => {
Cypress.Commands.add("addViaFormModal", (text, modelHeader) => {
cy.waitTextVisible(modelHeader);
- cy.get(".ant-form-item-control-input-content > input[type='text']").first().type(text);
+ cy.get('.ProseMirror-focused').type(text);
cy.get(".ant-modal-footer > button:nth-child(2)").click();
});
From b4fe451d932315546ebd98623f1572a66c41ad43 Mon Sep 17 00:00:00 2001
From: gaurav2733 <77378510+gaurav2733@users.noreply.github.com>
Date: Mon, 18 Dec 2023 12:38:30 +0530
Subject: [PATCH 035/540] feat: markdown support for group description (#9455)
---
.../group/EditGroupDescriptionModal.tsx | 64 ++++++++
.../src/app/entity/group/GroupInfoSideBar.tsx | 145 ++++++++++++++++--
.../app/identity/group/CreateGroupModal.tsx | 106 +++++++------
.../cypress/e2e/settings/managing_groups.js | 6 +-
4 files changed, 261 insertions(+), 60 deletions(-)
create mode 100644 datahub-web-react/src/app/entity/group/EditGroupDescriptionModal.tsx
diff --git a/datahub-web-react/src/app/entity/group/EditGroupDescriptionModal.tsx b/datahub-web-react/src/app/entity/group/EditGroupDescriptionModal.tsx
new file mode 100644
index 00000000000000..a898a73c254efe
--- /dev/null
+++ b/datahub-web-react/src/app/entity/group/EditGroupDescriptionModal.tsx
@@ -0,0 +1,64 @@
+import React, { useState } from 'react';
+import { Button, Modal, Form } from 'antd';
+import styled from 'styled-components';
+
+import { Editor } from '../shared/tabs/Documentation/components/editor/Editor';
+import { ANTD_GRAY } from '../shared/constants';
+
+type Props = {
+ onClose: () => void;
+ onSaveAboutMe: () => void;
+ setStagedDescription: (des: string) => void;
+ stagedDescription: string | undefined;
+};
+const StyledEditor = styled(Editor)`
+ border: 1px solid ${ANTD_GRAY[4]};
+`;
+
+export default function EditGroupDescriptionModal({
+ onClose,
+ onSaveAboutMe,
+ setStagedDescription,
+ stagedDescription,
+}: Props) {
+ const [form] = Form.useForm();
+ const [aboutText, setAboutText] = useState(stagedDescription);
+
+ function updateDescription(description: string) {
+ setAboutText(description);
+ setStagedDescription(description);
+
+ }
+
+ const saveDescription = () => {
+ onSaveAboutMe();
+ onClose();
+ };
+
+ return (
+
+
+ Cancel
+
+
+ Update
+
+ >
+ }
+ >
+
+
+
+
+
+
+
+ );
+}
diff --git a/datahub-web-react/src/app/entity/group/GroupInfoSideBar.tsx b/datahub-web-react/src/app/entity/group/GroupInfoSideBar.tsx
index d9eaed2682ea19..07885a4d0f6304 100644
--- a/datahub-web-react/src/app/entity/group/GroupInfoSideBar.tsx
+++ b/datahub-web-react/src/app/entity/group/GroupInfoSideBar.tsx
@@ -16,14 +16,15 @@ import {
EmptyValue,
SocialDetails,
EditButton,
- AboutSection,
- AboutSectionText,
GroupsSection,
+ AboutSection,
} from '../shared/SidebarStyledComponents';
import GroupMembersSideBarSection from './GroupMembersSideBarSection';
import { useUserContext } from '../../context/useUserContext';
-
-const { Paragraph } = Typography;
+import StripMarkdownText, { removeMarkdown } from '../shared/components/styled/StripMarkdownText';
+import { Editor } from '../shared/tabs/Documentation/components/editor/Editor';
+import EditGroupDescriptionModal from './EditGroupDescriptionModal';
+import { REDESIGN_COLORS } from '../shared/constants';
type SideBarData = {
photoUrl: string | undefined;
@@ -80,6 +81,61 @@ const GroupTitle = styled(Typography.Title)`
}
`;
+const EditIcon = styled(EditOutlined)`
+ cursor: pointer;
+ color: ${REDESIGN_COLORS.BLUE};
+`;
+const AddNewDescription = styled(Button)`
+ display: none;
+ margin: -4px;
+ width: 140px;
+`;
+
+const StyledViewer = styled(Editor)`
+ padding-right: 8px;
+ display: block;
+
+ .remirror-editor.ProseMirror {
+ padding: 0;
+ }
+`;
+
+const DescriptionContainer = styled.div`
+ position: relative;
+ display: flex;
+ flex-direction: column;
+ width: 100%;
+ text-align:left;
+ font-weight: normal;
+ font
+ min-height: 22px;
+
+ &:hover ${AddNewDescription} {
+ display: block;
+ }
+ & ins.diff {
+ background-color: #b7eb8f99;
+ text-decoration: none;
+ &:hover {
+ background-color: #b7eb8faa;
+ }
+ }
+ & del.diff {
+ background-color: #ffa39e99;
+ text-decoration: line-through;
+ &: hover {
+ background-color: #ffa39eaa;
+ }
+ }
+`;
+
+const ExpandedActions = styled.div`
+ height: 10px;
+`;
+const ReadLessText = styled(Typography.Link)`
+ margin-right: 4px;
+`;
+
/**
* Responsible for reading & writing users.
*/
@@ -106,7 +162,17 @@ export default function GroupInfoSidebar({ sideBarData, refetch }: Props) {
const me = useUserContext();
const canEditGroup = me?.platformPrivileges?.manageIdentities;
const [groupTitle, setGroupTitle] = useState(name);
+ const [expanded, setExpanded] = useState(false);
+ const [isUpdatingDescription, SetIsUpdatingDescription] = useState(false);
+ const [stagedDescription, setStagedDescription] = useState(aboutText);
+
const [updateName] = useUpdateNameMutation();
+ const overLimit = removeMarkdown(aboutText || '').length > 80;
+ const ABBREVIATED_LIMIT = 80;
+
+ useEffect(() => {
+ setStagedDescription(aboutText);
+ }, [aboutText]);
useEffect(() => {
setGroupTitle(groupTitle);
@@ -136,12 +202,12 @@ export default function GroupInfoSidebar({ sideBarData, refetch }: Props) {
};
// About Text save
- const onSaveAboutMe = (inputString) => {
+ const onSaveAboutMe = () => {
updateCorpGroupPropertiesMutation({
variables: {
urn: urn || '',
input: {
- description: inputString,
+ description: stagedDescription,
},
},
})
@@ -201,16 +267,65 @@ export default function GroupInfoSidebar({ sideBarData, refetch }: Props) {
- {TITLES.about}
-
-
- {aboutText || }
-
-
+
+ {TITLES.about}
+
+ SetIsUpdatingDescription(true)} data-testid="edit-icon" />
+
+
+
+ {(aboutText && expanded) || !overLimit ? (
+ <>
+ {/* Read only viewer for displaying group description */}
+
+
+ {overLimit && (
+ {
+ setExpanded(false);
+ }}
+ >
+ Read Less
+
+ )}
+
+ >
+ ) : (
+ <>
+ {/* Display abbreviated description with option to read more */}
+
+ {
+ setExpanded(true);
+ }}
+ >
+ Read More
+
+ >
+ }
+ shouldWrap
+ >
+ {aboutText}
+
+ >
+ )}
+
+ {/* Modal for updating group description */}
+ {isUpdatingDescription && (
+ {
+ SetIsUpdatingDescription(false);
+ setStagedDescription(aboutText);
+ }}
+ onSaveAboutMe={onSaveAboutMe}
+ setStagedDescription={setStagedDescription}
+ stagedDescription={stagedDescription}
+ />
+ )}
diff --git a/datahub-web-react/src/app/identity/group/CreateGroupModal.tsx b/datahub-web-react/src/app/identity/group/CreateGroupModal.tsx
index 214cb251767c9c..4ba714ca23ae06 100644
--- a/datahub-web-react/src/app/identity/group/CreateGroupModal.tsx
+++ b/datahub-web-react/src/app/identity/group/CreateGroupModal.tsx
@@ -1,16 +1,23 @@
-import React, { useState } from 'react';
+import React, { useRef, useState } from 'react';
import { message, Button, Input, Modal, Typography, Form, Collapse } from 'antd';
+import styled from 'styled-components';
import { useCreateGroupMutation } from '../../../graphql/group.generated';
import { useEnterKeyListener } from '../../shared/useEnterKeyListener';
import { validateCustomUrnId } from '../../shared/textUtil';
import analytics, { EventType } from '../../analytics';
import { CorpGroup, EntityType } from '../../../types.generated';
+import { Editor as MarkdownEditor } from '../../entity/shared/tabs/Documentation/components/editor/Editor';
+import { ANTD_GRAY } from '../../entity/shared/constants';
type Props = {
onClose: () => void;
onCreate: (group: CorpGroup) => void;
};
+const StyledEditor = styled(MarkdownEditor)`
+ border: 1px solid ${ANTD_GRAY[4]};
+`;
+
export default function CreateGroupModal({ onClose, onCreate }: Props) {
const [stagedName, setStagedName] = useState('');
const [stagedDescription, setStagedDescription] = useState('');
@@ -19,45 +26,54 @@ export default function CreateGroupModal({ onClose, onCreate }: Props) {
const [createButtonEnabled, setCreateButtonEnabled] = useState(true);
const [form] = Form.useForm();
+ // Reference to the styled editor for handling focus
+ const styledEditorRef = useRef(null);
+
const onCreateGroup = () => {
- createGroupMutation({
- variables: {
- input: {
- id: stagedId,
- name: stagedName,
- description: stagedDescription,
- },
- },
- })
- .then(({ data, errors }) => {
- if (!errors) {
- analytics.event({
- type: EventType.CreateGroupEvent,
- });
- message.success({
- content: `Created group!`,
- duration: 3,
- });
- // TODO: Get a full corp group back from create endpoint.
- onCreate({
- urn: data?.createGroup || '',
- type: EntityType.CorpGroup,
+ // Only submit when focus is outside the styled editor, so pressing Enter while typing the description does not create the group unintentionally
+ const isEditorNewlineKeypress =
+ document.activeElement !== styledEditorRef.current &&
+ !styledEditorRef.current?.contains(document.activeElement);
+ if (isEditorNewlineKeypress) {
+ createGroupMutation({
+ variables: {
+ input: {
+ id: stagedId,
name: stagedName,
- info: {
- description: stagedDescription,
- },
- });
- }
- })
- .catch((e) => {
- message.destroy();
- message.error({ content: `Failed to create group!: \n ${e.message || ''}`, duration: 3 });
+ description: stagedDescription,
+ },
+ },
})
- .finally(() => {
- setStagedName('');
- setStagedDescription('');
- });
- onClose();
+ .then(({ data, errors }) => {
+ if (!errors) {
+ analytics.event({
+ type: EventType.CreateGroupEvent,
+ });
+ message.success({
+ content: `Created group!`,
+ duration: 3,
+ });
+ // TODO: Get a full corp group back from create endpoint.
+ onCreate({
+ urn: data?.createGroup || '',
+ type: EntityType.CorpGroup,
+ name: stagedName,
+ info: {
+ description: stagedDescription,
+ },
+ });
+ }
+ })
+ .catch((e) => {
+ message.destroy();
+ message.error({ content: `Failed to create group!: \n ${e.message || ''}`, duration: 3 });
+ })
+ .finally(() => {
+ setStagedName('');
+ setStagedDescription('');
+ });
+ onClose();
+ }
};
// Handle the Enter press
@@ -65,8 +81,13 @@ export default function CreateGroupModal({ onClose, onCreate }: Props) {
querySelectorToExecuteClick: '#createGroupButton',
});
+ function updateDescription(description: string) {
+ setStagedDescription(description);
+ }
+
return (
Description }>
An optional description for your new group.
-
- setStagedDescription(event.target.value)}
- />
+
+ {/* Styled editor for the group description */}
+
+
+
diff --git a/smoke-test/tests/cypress/cypress/e2e/settings/managing_groups.js b/smoke-test/tests/cypress/cypress/e2e/settings/managing_groups.js
index 70219a550cd8bb..978a245c3d9e33 100644
--- a/smoke-test/tests/cypress/cypress/e2e/settings/managing_groups.js
+++ b/smoke-test/tests/cypress/cypress/e2e/settings/managing_groups.js
@@ -72,8 +72,10 @@ describe("create and manage group", () => {
cy.focused().clear().type(`Test group EDITED ${test_id}{enter}`);
cy.waitTextVisible("Name Updated");
cy.contains(`Test group EDITED ${test_id}`).should("be.visible");
- cy.contains("Test group description").find('[aria-label="edit"]').click();
- cy.focused().type(" EDITED{enter}");
+ cy.get('[data-testid="edit-icon"]').click();
+ cy.waitTextVisible("Edit Description");
+ cy.get("#description").should("be.visible").type(" EDITED");
+ cy.get("#updateGroupButton").click();
cy.waitTextVisible("Changes saved.");
cy.contains("Test group description EDITED").should("be.visible");
cy.clickOptionWithText("Add Owners");
From 9d386fbd6f9a0436b25daa2b4603d1fa0b8f44ee Mon Sep 17 00:00:00 2001
From: Harshal Sheth
Date: Mon, 18 Dec 2023 05:38:16 -0500
Subject: [PATCH 036/540] feat(ingest): enable CLL for dbt by default (#9466)
---
.../ingestion/source/dbt/dbt_common.py | 7 +-
.../ingestion/source/looker/looker_common.py | 2 +-
.../source/looker/looker_lib_wrapper.py | 2 +-
.../dbt_enabled_with_schemas_mces_golden.json | 248 ++++++++++++
.../dbt_test_column_meta_mapping_golden.json | 383 ++++++++++++++++++
...th_complex_owner_patterns_mces_golden.json | 248 ++++++++++++
...th_data_platform_instance_mces_golden.json | 248 ++++++++++++
...h_non_incremental_lineage_mces_golden.json | 248 ++++++++++++
..._target_platform_instance_mces_golden.json | 248 ++++++++++++
9 files changed, 1630 insertions(+), 4 deletions(-)
diff --git a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py
index af28be310587a8..7bec07b40c4bdf 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py
@@ -300,7 +300,7 @@ class DBTCommonConfig(
description="When enabled, schemas will be inferred from the dbt node definition.",
)
include_column_lineage: bool = Field(
- default=False,
+ default=True,
description="When enabled, column-level lineage will be extracted from the dbt node definition. Requires `infer_dbt_schemas` to be enabled. "
"If you run into issues where the column name casing does not match up with properly, providing a datahub_api or using the rest sink will improve accuracy.",
)
@@ -696,7 +696,10 @@ def get_column_type(
@support_status(SupportStatus.CERTIFIED)
@capability(SourceCapability.DELETION_DETECTION, "Enabled via stateful ingestion")
@capability(SourceCapability.LINEAGE_COARSE, "Enabled by default")
-@capability(SourceCapability.LINEAGE_FINE, "Enabled using `include_column_lineage`")
+@capability(
+ SourceCapability.LINEAGE_FINE,
+ "Enabled by default, configure using `include_column_lineage`",
+)
class DBTSourceBase(StatefulIngestionSourceBase):
def __init__(self, config: DBTCommonConfig, ctx: PipelineContext, platform: str):
super().__init__(config, ctx)
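
Since this change flips `include_column_lineage` to `True` by default, existing dbt recipes pick up column-level lineage without any configuration change; set it to `False` to restore the previous behavior. Below is a hedged sketch of toggling the option when running a pipeline programmatically; the file paths, target platform, and server URL are placeholders, and the recipe shape should be checked against the dbt source docs.

```python
# Hedged sketch: running a dbt ingestion with the new column-lineage default.
# Paths, platform, and server URL below are placeholders, not real values.
from datahub.ingestion.run.pipeline import Pipeline

pipeline = Pipeline.create(
    {
        "source": {
            "type": "dbt",
            "config": {
                "manifest_path": "./target/manifest.json",
                "catalog_path": "./target/catalog.json",
                "target_platform": "postgres",
                # As of this change, column-level lineage is on by default;
                # set this to False to restore the previous behavior.
                "include_column_lineage": True,
            },
        },
        "sink": {
            "type": "datahub-rest",
            "config": {"server": "http://localhost:8080"},
        },
    }
)
pipeline.run()
pipeline.raise_from_status()
```
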
diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py
index e440750cba0d08..53533a8d27c9b5 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py
@@ -1015,7 +1015,7 @@ def __init__(
self.report = report
self.source_config = source_config
- @lru_cache()
+ @lru_cache(maxsize=200)
def get_explore(self, model: str, explore: str) -> Optional[LookerExplore]:
looker_explore = LookerExplore.from_api(
model,
diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_lib_wrapper.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_lib_wrapper.py
index 988caba1c0d748..8959868c27114c 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_lib_wrapper.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_lib_wrapper.py
@@ -114,7 +114,7 @@ def get_available_permissions(self) -> Set[str]:
return permissions
- @lru_cache(maxsize=2000)
+ @lru_cache(maxsize=1000)
def get_user(self, id_: str, user_fields: str) -> Optional[User]:
self.client_stats.user_calls += 1
try:
diff --git a/metadata-ingestion/tests/integration/dbt/dbt_enabled_with_schemas_mces_golden.json b/metadata-ingestion/tests/integration/dbt/dbt_enabled_with_schemas_mces_golden.json
index e4f01ef7a6c537..4deb725ed2b444 100644
--- a/metadata-ingestion/tests/integration/dbt/dbt_enabled_with_schemas_mces_golden.json
+++ b/metadata-ingestion/tests/integration/dbt/dbt_enabled_with_schemas_mces_golden.json
@@ -247,6 +247,86 @@
"dataset": "urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.customer,PROD)",
"type": "TRANSFORMED"
}
+ ],
+ "fineGrainedLineages": [
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.customer,PROD),customer_id)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.dbt_postgres.customer_details,PROD),customer_id)"
+ ],
+ "confidenceScore": 0.9
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.customer,PROD),first_name)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.customer,PROD),last_name)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.dbt_postgres.customer_details,PROD),full_name)"
+ ],
+ "confidenceScore": 0.9
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.customer,PROD),email)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.dbt_postgres.customer_details,PROD),email)"
+ ],
+ "confidenceScore": 0.9
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.address,PROD),address)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.dbt_postgres.customer_details,PROD),address)"
+ ],
+ "confidenceScore": 0.9
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.city,PROD),city)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.dbt_postgres.customer_details,PROD),city)"
+ ],
+ "confidenceScore": 0.9
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.address,PROD),postal_code)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.dbt_postgres.customer_details,PROD),postal_code)"
+ ],
+ "confidenceScore": 0.9
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.address,PROD),phone)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.dbt_postgres.customer_details,PROD),phone)"
+ ],
+ "confidenceScore": 0.9
+ }
]
}
},
@@ -428,6 +508,41 @@
"dataset": "urn:li:dataset:(urn:li:dataPlatform:postgres,pagila.dbt_postgres.payments_by_customer_by_month,PROD)",
"type": "TRANSFORMED"
}
+ ],
+ "fineGrainedLineages": [
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,pagila.dbt_postgres.payments_by_customer_by_month,PROD),billing_month)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.dbt_postgres.an-aliased-view-for-monthly-billing,PROD),billing_month)"
+ ],
+ "confidenceScore": 0.9
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,pagila.dbt_postgres.payments_by_customer_by_month,PROD),customer_id)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.dbt_postgres.an-aliased-view-for-monthly-billing,PROD),customer_id)"
+ ],
+ "confidenceScore": 0.9
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,pagila.dbt_postgres.payments_by_customer_by_month,PROD),amount)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.dbt_postgres.an-aliased-view-for-monthly-billing,PROD),amount)"
+ ],
+ "confidenceScore": 0.9
+ }
]
}
},
@@ -650,6 +765,104 @@
"dataset": "urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_06,PROD)",
"type": "TRANSFORMED"
}
+ ],
+ "fineGrainedLineages": [
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_01,PROD),amount)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_02,PROD),amount)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_03,PROD),amount)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_04,PROD),amount)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_05,PROD),amount)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_06,PROD),amount)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.dbt_postgres.an-aliased-view-for-payments,PROD),amount)"
+ ],
+ "confidenceScore": 0.9
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_01,PROD),customer_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_02,PROD),customer_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_03,PROD),customer_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_04,PROD),customer_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_05,PROD),customer_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_06,PROD),customer_id)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.dbt_postgres.an-aliased-view-for-payments,PROD),customer_id)"
+ ],
+ "confidenceScore": 0.9
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_01,PROD),payment_date)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_02,PROD),payment_date)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_03,PROD),payment_date)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_04,PROD),payment_date)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_05,PROD),payment_date)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_06,PROD),payment_date)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.dbt_postgres.an-aliased-view-for-payments,PROD),payment_date)"
+ ],
+ "confidenceScore": 0.9
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_01,PROD),payment_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_02,PROD),payment_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_03,PROD),payment_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_04,PROD),payment_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_05,PROD),payment_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_06,PROD),payment_id)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.dbt_postgres.an-aliased-view-for-payments,PROD),payment_id)"
+ ],
+ "confidenceScore": 0.9
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_01,PROD),rental_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_02,PROD),rental_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_03,PROD),rental_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_04,PROD),rental_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_05,PROD),rental_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_06,PROD),rental_id)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.dbt_postgres.an-aliased-view-for-payments,PROD),rental_id)"
+ ],
+ "confidenceScore": 0.9
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_01,PROD),staff_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_02,PROD),staff_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_03,PROD),staff_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_04,PROD),staff_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_05,PROD),staff_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_06,PROD),staff_id)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.dbt_postgres.an-aliased-view-for-payments,PROD),staff_id)"
+ ],
+ "confidenceScore": 0.9
+ }
]
}
},
@@ -789,6 +1002,41 @@
"dataset": "urn:li:dataset:(urn:li:dataPlatform:postgres,pagila.dbt_postgres.an-aliased-view-for-payments,PROD)",
"type": "TRANSFORMED"
}
+ ],
+ "fineGrainedLineages": [
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,pagila.dbt_postgres.an-aliased-view-for-payments,PROD),payment_date)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.dbt_postgres.payments_by_customer_by_month,PROD),billing_month)"
+ ],
+ "confidenceScore": 0.9
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,pagila.dbt_postgres.an-aliased-view-for-payments,PROD),customer_id)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.dbt_postgres.payments_by_customer_by_month,PROD),customer_id)"
+ ],
+ "confidenceScore": 0.9
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,pagila.dbt_postgres.an-aliased-view-for-payments,PROD),amount)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.dbt_postgres.payments_by_customer_by_month,PROD),amount)"
+ ],
+ "confidenceScore": 0.9
+ }
]
}
},
diff --git a/metadata-ingestion/tests/integration/dbt/dbt_test_column_meta_mapping_golden.json b/metadata-ingestion/tests/integration/dbt/dbt_test_column_meta_mapping_golden.json
index 4d5b008b695f97..588470ef416314 100644
--- a/metadata-ingestion/tests/integration/dbt/dbt_test_column_meta_mapping_golden.json
+++ b/metadata-ingestion/tests/integration/dbt/dbt_test_column_meta_mapping_golden.json
@@ -201,6 +201,98 @@
"dataset": "urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.customer,PROD)",
"type": "TRANSFORMED"
}
+ ],
+ "fineGrainedLineages": [
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.customer,PROD),customer_id)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.customer_details,PROD),customer_id)"
+ ],
+ "confidenceScore": 0.9
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.customer,PROD),first_name)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.customer,PROD),last_name)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.customer_details,PROD),full_name)"
+ ],
+ "confidenceScore": 0.9
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,pagila.public.customer_snapshot,PROD),first_name)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,pagila.public.customer_snapshot,PROD),last_name)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.customer_details,PROD),initial_full_name)"
+ ],
+ "confidenceScore": 0.9
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.customer,PROD),email)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.customer_details,PROD),email)"
+ ],
+ "confidenceScore": 0.9
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.address,PROD),address)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.customer_details,PROD),address)"
+ ],
+ "confidenceScore": 0.9
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.city,PROD),city)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.customer_details,PROD),city)"
+ ],
+ "confidenceScore": 0.9
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.address,PROD),postal_code)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.customer_details,PROD),postal_code)"
+ ],
+ "confidenceScore": 0.9
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.address,PROD),phone)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.customer_details,PROD),phone)"
+ ],
+ "confidenceScore": 0.9
+ }
]
}
},
@@ -360,6 +452,52 @@
"dataset": "urn:li:dataset:(urn:li:dataPlatform:postgres,pagila.public.payments_by_customer_by_month,PROD)",
"type": "TRANSFORMED"
}
+ ],
+ "fineGrainedLineages": [
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,pagila.public.payments_by_customer_by_month,PROD),billing_month)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.an-aliased-view-for-monthly-billing,PROD),billing_month)"
+ ],
+ "confidenceScore": 0.9
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,pagila.public.payments_by_customer_by_month,PROD),customer_id)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.an-aliased-view-for-monthly-billing,PROD),customer_id)"
+ ],
+ "confidenceScore": 0.9
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,pagila.public.payments_by_customer_by_month,PROD),amount)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.an-aliased-view-for-monthly-billing,PROD),amount)"
+ ],
+ "confidenceScore": 0.9
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.customer_details,PROD),email)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.an-aliased-view-for-monthly-billing,PROD),email)"
+ ],
+ "confidenceScore": 0.9
+ }
]
}
},
@@ -574,6 +712,104 @@
"dataset": "urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_06,PROD)",
"type": "TRANSFORMED"
}
+ ],
+ "fineGrainedLineages": [
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_01,PROD),amount)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_02,PROD),amount)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_03,PROD),amount)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_04,PROD),amount)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_05,PROD),amount)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_06,PROD),amount)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.an_aliased_view_for_payments,PROD),amount)"
+ ],
+ "confidenceScore": 0.9
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_01,PROD),customer_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_02,PROD),customer_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_03,PROD),customer_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_04,PROD),customer_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_05,PROD),customer_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_06,PROD),customer_id)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.an_aliased_view_for_payments,PROD),customer_id)"
+ ],
+ "confidenceScore": 0.9
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_01,PROD),payment_date)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_02,PROD),payment_date)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_03,PROD),payment_date)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_04,PROD),payment_date)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_05,PROD),payment_date)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_06,PROD),payment_date)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.an_aliased_view_for_payments,PROD),payment_date)"
+ ],
+ "confidenceScore": 0.9
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_01,PROD),payment_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_02,PROD),payment_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_03,PROD),payment_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_04,PROD),payment_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_05,PROD),payment_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_06,PROD),payment_id)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.an_aliased_view_for_payments,PROD),payment_id)"
+ ],
+ "confidenceScore": 0.9
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_01,PROD),rental_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_02,PROD),rental_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_03,PROD),rental_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_04,PROD),rental_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_05,PROD),rental_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_06,PROD),rental_id)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.an_aliased_view_for_payments,PROD),rental_id)"
+ ],
+ "confidenceScore": 0.9
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_01,PROD),staff_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_02,PROD),staff_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_03,PROD),staff_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_04,PROD),staff_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_05,PROD),staff_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_06,PROD),staff_id)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.an_aliased_view_for_payments,PROD),staff_id)"
+ ],
+ "confidenceScore": 0.9
+ }
]
}
},
@@ -741,6 +977,41 @@
"dataset": "urn:li:dataset:(urn:li:dataPlatform:postgres,pagila.public.an_aliased_view_for_payments,PROD)",
"type": "TRANSFORMED"
}
+ ],
+ "fineGrainedLineages": [
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,pagila.public.an_aliased_view_for_payments,PROD),payment_date)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payments_by_customer_by_month,PROD),billing_month)"
+ ],
+ "confidenceScore": 0.9
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,pagila.public.an_aliased_view_for_payments,PROD),customer_id)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payments_by_customer_by_month,PROD),customer_id)"
+ ],
+ "confidenceScore": 0.9
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,pagila.public.an_aliased_view_for_payments,PROD),amount)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payments_by_customer_by_month,PROD),amount)"
+ ],
+ "confidenceScore": 0.9
+ }
]
}
},
@@ -1011,6 +1282,118 @@
"dataset": "urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.customer,PROD)",
"type": "TRANSFORMED"
}
+ ],
+ "fineGrainedLineages": [
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.customer,PROD),active)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.customer_snapshot,PROD),active)"
+ ],
+ "confidenceScore": 0.9
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.customer,PROD),activebool)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.customer_snapshot,PROD),activebool)"
+ ],
+ "confidenceScore": 0.9
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.customer,PROD),address_id)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.customer_snapshot,PROD),address_id)"
+ ],
+ "confidenceScore": 0.9
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.customer,PROD),create_date)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.customer_snapshot,PROD),create_date)"
+ ],
+ "confidenceScore": 0.9
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.customer,PROD),customer_id)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.customer_snapshot,PROD),customer_id)"
+ ],
+ "confidenceScore": 0.9
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.customer,PROD),email)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.customer_snapshot,PROD),email)"
+ ],
+ "confidenceScore": 0.9
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.customer,PROD),first_name)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.customer_snapshot,PROD),first_name)"
+ ],
+ "confidenceScore": 0.9
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.customer,PROD),last_name)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.customer_snapshot,PROD),last_name)"
+ ],
+ "confidenceScore": 0.9
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.customer,PROD),last_update)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.customer_snapshot,PROD),last_update)"
+ ],
+ "confidenceScore": 0.9
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.customer,PROD),store_id)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.customer_snapshot,PROD),store_id)"
+ ],
+ "confidenceScore": 0.9
+ }
]
}
},
diff --git a/metadata-ingestion/tests/integration/dbt/dbt_test_with_complex_owner_patterns_mces_golden.json b/metadata-ingestion/tests/integration/dbt/dbt_test_with_complex_owner_patterns_mces_golden.json
index 0bdd5e3c895c27..926e8b8c8ed84b 100644
--- a/metadata-ingestion/tests/integration/dbt/dbt_test_with_complex_owner_patterns_mces_golden.json
+++ b/metadata-ingestion/tests/integration/dbt/dbt_test_with_complex_owner_patterns_mces_golden.json
@@ -211,6 +211,86 @@
"dataset": "urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.customer,PROD)",
"type": "TRANSFORMED"
}
+ ],
+ "fineGrainedLineages": [
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.customer,PROD),customer_id)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.dbt_postgres.customer_details,PROD),customer_id)"
+ ],
+ "confidenceScore": 0.9
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.customer,PROD),first_name)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.customer,PROD),last_name)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.dbt_postgres.customer_details,PROD),full_name)"
+ ],
+ "confidenceScore": 0.9
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.customer,PROD),email)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.dbt_postgres.customer_details,PROD),email)"
+ ],
+ "confidenceScore": 0.9
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.address,PROD),address)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.dbt_postgres.customer_details,PROD),address)"
+ ],
+ "confidenceScore": 0.9
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.city,PROD),city)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.dbt_postgres.customer_details,PROD),city)"
+ ],
+ "confidenceScore": 0.9
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.address,PROD),postal_code)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.dbt_postgres.customer_details,PROD),postal_code)"
+ ],
+ "confidenceScore": 0.9
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.address,PROD),phone)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.dbt_postgres.customer_details,PROD),phone)"
+ ],
+ "confidenceScore": 0.9
+ }
]
}
},
@@ -375,6 +455,41 @@
"dataset": "urn:li:dataset:(urn:li:dataPlatform:postgres,pagila.dbt_postgres.payments_by_customer_by_month,PROD)",
"type": "TRANSFORMED"
}
+ ],
+ "fineGrainedLineages": [
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,pagila.dbt_postgres.payments_by_customer_by_month,PROD),billing_month)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.dbt_postgres.an-aliased-view-for-monthly-billing,PROD),billing_month)"
+ ],
+ "confidenceScore": 0.9
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,pagila.dbt_postgres.payments_by_customer_by_month,PROD),customer_id)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.dbt_postgres.an-aliased-view-for-monthly-billing,PROD),customer_id)"
+ ],
+ "confidenceScore": 0.9
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,pagila.dbt_postgres.payments_by_customer_by_month,PROD),amount)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.dbt_postgres.an-aliased-view-for-monthly-billing,PROD),amount)"
+ ],
+ "confidenceScore": 0.9
+ }
]
}
},
@@ -597,6 +712,104 @@
"dataset": "urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_06,PROD)",
"type": "TRANSFORMED"
}
+ ],
+ "fineGrainedLineages": [
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_01,PROD),amount)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_02,PROD),amount)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_03,PROD),amount)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_04,PROD),amount)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_05,PROD),amount)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_06,PROD),amount)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.dbt_postgres.an-aliased-view-for-payments,PROD),amount)"
+ ],
+ "confidenceScore": 0.9
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_01,PROD),customer_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_02,PROD),customer_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_03,PROD),customer_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_04,PROD),customer_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_05,PROD),customer_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_06,PROD),customer_id)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.dbt_postgres.an-aliased-view-for-payments,PROD),customer_id)"
+ ],
+ "confidenceScore": 0.9
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_01,PROD),payment_date)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_02,PROD),payment_date)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_03,PROD),payment_date)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_04,PROD),payment_date)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_05,PROD),payment_date)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_06,PROD),payment_date)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.dbt_postgres.an-aliased-view-for-payments,PROD),payment_date)"
+ ],
+ "confidenceScore": 0.9
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_01,PROD),payment_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_02,PROD),payment_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_03,PROD),payment_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_04,PROD),payment_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_05,PROD),payment_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_06,PROD),payment_id)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.dbt_postgres.an-aliased-view-for-payments,PROD),payment_id)"
+ ],
+ "confidenceScore": 0.9
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_01,PROD),rental_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_02,PROD),rental_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_03,PROD),rental_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_04,PROD),rental_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_05,PROD),rental_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_06,PROD),rental_id)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.dbt_postgres.an-aliased-view-for-payments,PROD),rental_id)"
+ ],
+ "confidenceScore": 0.9
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_01,PROD),staff_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_02,PROD),staff_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_03,PROD),staff_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_04,PROD),staff_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_05,PROD),staff_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_06,PROD),staff_id)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.dbt_postgres.an-aliased-view-for-payments,PROD),staff_id)"
+ ],
+ "confidenceScore": 0.9
+ }
]
}
},
@@ -736,6 +949,41 @@
"dataset": "urn:li:dataset:(urn:li:dataPlatform:postgres,pagila.dbt_postgres.an-aliased-view-for-payments,PROD)",
"type": "TRANSFORMED"
}
+ ],
+ "fineGrainedLineages": [
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,pagila.dbt_postgres.an-aliased-view-for-payments,PROD),payment_date)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.dbt_postgres.payments_by_customer_by_month,PROD),billing_month)"
+ ],
+ "confidenceScore": 0.9
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,pagila.dbt_postgres.an-aliased-view-for-payments,PROD),customer_id)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.dbt_postgres.payments_by_customer_by_month,PROD),customer_id)"
+ ],
+ "confidenceScore": 0.9
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,pagila.dbt_postgres.an-aliased-view-for-payments,PROD),amount)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.dbt_postgres.payments_by_customer_by_month,PROD),amount)"
+ ],
+ "confidenceScore": 0.9
+ }
]
}
},
diff --git a/metadata-ingestion/tests/integration/dbt/dbt_test_with_data_platform_instance_mces_golden.json b/metadata-ingestion/tests/integration/dbt/dbt_test_with_data_platform_instance_mces_golden.json
index 5ab0b11e377716..3727603266f252 100644
--- a/metadata-ingestion/tests/integration/dbt/dbt_test_with_data_platform_instance_mces_golden.json
+++ b/metadata-ingestion/tests/integration/dbt/dbt_test_with_data_platform_instance_mces_golden.json
@@ -212,6 +212,86 @@
"dataset": "urn:li:dataset:(urn:li:dataPlatform:dbt,dbt-instance-1.pagila.public.customer,PROD)",
"type": "TRANSFORMED"
}
+ ],
+ "fineGrainedLineages": [
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,dbt-instance-1.pagila.public.customer,PROD),customer_id)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,dbt-instance-1.pagila.dbt_postgres.customer_details,PROD),customer_id)"
+ ],
+ "confidenceScore": 0.9
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,dbt-instance-1.pagila.public.customer,PROD),first_name)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,dbt-instance-1.pagila.public.customer,PROD),last_name)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,dbt-instance-1.pagila.dbt_postgres.customer_details,PROD),full_name)"
+ ],
+ "confidenceScore": 0.9
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,dbt-instance-1.pagila.public.customer,PROD),email)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,dbt-instance-1.pagila.dbt_postgres.customer_details,PROD),email)"
+ ],
+ "confidenceScore": 0.9
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,dbt-instance-1.pagila.public.address,PROD),address)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,dbt-instance-1.pagila.dbt_postgres.customer_details,PROD),address)"
+ ],
+ "confidenceScore": 0.9
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,dbt-instance-1.pagila.public.city,PROD),city)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,dbt-instance-1.pagila.dbt_postgres.customer_details,PROD),city)"
+ ],
+ "confidenceScore": 0.9
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,dbt-instance-1.pagila.public.address,PROD),postal_code)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,dbt-instance-1.pagila.dbt_postgres.customer_details,PROD),postal_code)"
+ ],
+ "confidenceScore": 0.9
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,dbt-instance-1.pagila.public.address,PROD),phone)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,dbt-instance-1.pagila.dbt_postgres.customer_details,PROD),phone)"
+ ],
+ "confidenceScore": 0.9
+ }
]
}
},
@@ -376,6 +456,41 @@
"dataset": "urn:li:dataset:(urn:li:dataPlatform:postgres,pagila.dbt_postgres.payments_by_customer_by_month,PROD)",
"type": "TRANSFORMED"
}
+ ],
+ "fineGrainedLineages": [
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,pagila.dbt_postgres.payments_by_customer_by_month,PROD),billing_month)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,dbt-instance-1.pagila.dbt_postgres.an-aliased-view-for-monthly-billing,PROD),billing_month)"
+ ],
+ "confidenceScore": 0.9
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,pagila.dbt_postgres.payments_by_customer_by_month,PROD),customer_id)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,dbt-instance-1.pagila.dbt_postgres.an-aliased-view-for-monthly-billing,PROD),customer_id)"
+ ],
+ "confidenceScore": 0.9
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,pagila.dbt_postgres.payments_by_customer_by_month,PROD),amount)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,dbt-instance-1.pagila.dbt_postgres.an-aliased-view-for-monthly-billing,PROD),amount)"
+ ],
+ "confidenceScore": 0.9
+ }
]
}
},
@@ -598,6 +713,104 @@
"dataset": "urn:li:dataset:(urn:li:dataPlatform:dbt,dbt-instance-1.pagila.public.payment_p2020_06,PROD)",
"type": "TRANSFORMED"
}
+ ],
+ "fineGrainedLineages": [
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,dbt-instance-1.pagila.public.payment_p2020_01,PROD),amount)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,dbt-instance-1.pagila.public.payment_p2020_02,PROD),amount)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,dbt-instance-1.pagila.public.payment_p2020_03,PROD),amount)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,dbt-instance-1.pagila.public.payment_p2020_04,PROD),amount)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,dbt-instance-1.pagila.public.payment_p2020_05,PROD),amount)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,dbt-instance-1.pagila.public.payment_p2020_06,PROD),amount)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,dbt-instance-1.pagila.dbt_postgres.an-aliased-view-for-payments,PROD),amount)"
+ ],
+ "confidenceScore": 0.9
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,dbt-instance-1.pagila.public.payment_p2020_01,PROD),customer_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,dbt-instance-1.pagila.public.payment_p2020_02,PROD),customer_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,dbt-instance-1.pagila.public.payment_p2020_03,PROD),customer_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,dbt-instance-1.pagila.public.payment_p2020_04,PROD),customer_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,dbt-instance-1.pagila.public.payment_p2020_05,PROD),customer_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,dbt-instance-1.pagila.public.payment_p2020_06,PROD),customer_id)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,dbt-instance-1.pagila.dbt_postgres.an-aliased-view-for-payments,PROD),customer_id)"
+ ],
+ "confidenceScore": 0.9
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,dbt-instance-1.pagila.public.payment_p2020_01,PROD),payment_date)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,dbt-instance-1.pagila.public.payment_p2020_02,PROD),payment_date)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,dbt-instance-1.pagila.public.payment_p2020_03,PROD),payment_date)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,dbt-instance-1.pagila.public.payment_p2020_04,PROD),payment_date)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,dbt-instance-1.pagila.public.payment_p2020_05,PROD),payment_date)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,dbt-instance-1.pagila.public.payment_p2020_06,PROD),payment_date)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,dbt-instance-1.pagila.dbt_postgres.an-aliased-view-for-payments,PROD),payment_date)"
+ ],
+ "confidenceScore": 0.9
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,dbt-instance-1.pagila.public.payment_p2020_01,PROD),payment_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,dbt-instance-1.pagila.public.payment_p2020_02,PROD),payment_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,dbt-instance-1.pagila.public.payment_p2020_03,PROD),payment_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,dbt-instance-1.pagila.public.payment_p2020_04,PROD),payment_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,dbt-instance-1.pagila.public.payment_p2020_05,PROD),payment_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,dbt-instance-1.pagila.public.payment_p2020_06,PROD),payment_id)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,dbt-instance-1.pagila.dbt_postgres.an-aliased-view-for-payments,PROD),payment_id)"
+ ],
+ "confidenceScore": 0.9
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,dbt-instance-1.pagila.public.payment_p2020_01,PROD),rental_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,dbt-instance-1.pagila.public.payment_p2020_02,PROD),rental_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,dbt-instance-1.pagila.public.payment_p2020_03,PROD),rental_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,dbt-instance-1.pagila.public.payment_p2020_04,PROD),rental_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,dbt-instance-1.pagila.public.payment_p2020_05,PROD),rental_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,dbt-instance-1.pagila.public.payment_p2020_06,PROD),rental_id)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,dbt-instance-1.pagila.dbt_postgres.an-aliased-view-for-payments,PROD),rental_id)"
+ ],
+ "confidenceScore": 0.9
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,dbt-instance-1.pagila.public.payment_p2020_01,PROD),staff_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,dbt-instance-1.pagila.public.payment_p2020_02,PROD),staff_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,dbt-instance-1.pagila.public.payment_p2020_03,PROD),staff_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,dbt-instance-1.pagila.public.payment_p2020_04,PROD),staff_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,dbt-instance-1.pagila.public.payment_p2020_05,PROD),staff_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,dbt-instance-1.pagila.public.payment_p2020_06,PROD),staff_id)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,dbt-instance-1.pagila.dbt_postgres.an-aliased-view-for-payments,PROD),staff_id)"
+ ],
+ "confidenceScore": 0.9
+ }
]
}
},
@@ -737,6 +950,41 @@
"dataset": "urn:li:dataset:(urn:li:dataPlatform:postgres,pagila.dbt_postgres.an-aliased-view-for-payments,PROD)",
"type": "TRANSFORMED"
}
+ ],
+ "fineGrainedLineages": [
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,pagila.dbt_postgres.an-aliased-view-for-payments,PROD),payment_date)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,dbt-instance-1.pagila.dbt_postgres.payments_by_customer_by_month,PROD),billing_month)"
+ ],
+ "confidenceScore": 0.9
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,pagila.dbt_postgres.an-aliased-view-for-payments,PROD),customer_id)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,dbt-instance-1.pagila.dbt_postgres.payments_by_customer_by_month,PROD),customer_id)"
+ ],
+ "confidenceScore": 0.9
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,pagila.dbt_postgres.an-aliased-view-for-payments,PROD),amount)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,dbt-instance-1.pagila.dbt_postgres.payments_by_customer_by_month,PROD),amount)"
+ ],
+ "confidenceScore": 0.9
+ }
]
}
},
diff --git a/metadata-ingestion/tests/integration/dbt/dbt_test_with_non_incremental_lineage_mces_golden.json b/metadata-ingestion/tests/integration/dbt/dbt_test_with_non_incremental_lineage_mces_golden.json
index 3725e590fee9e4..ec879e6af766ac 100644
--- a/metadata-ingestion/tests/integration/dbt/dbt_test_with_non_incremental_lineage_mces_golden.json
+++ b/metadata-ingestion/tests/integration/dbt/dbt_test_with_non_incremental_lineage_mces_golden.json
@@ -212,6 +212,86 @@
"dataset": "urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.customer,PROD)",
"type": "TRANSFORMED"
}
+ ],
+ "fineGrainedLineages": [
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.customer,PROD),customer_id)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.dbt_postgres.customer_details,PROD),customer_id)"
+ ],
+ "confidenceScore": 0.9
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.customer,PROD),first_name)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.customer,PROD),last_name)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.dbt_postgres.customer_details,PROD),full_name)"
+ ],
+ "confidenceScore": 0.9
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.customer,PROD),email)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.dbt_postgres.customer_details,PROD),email)"
+ ],
+ "confidenceScore": 0.9
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.address,PROD),address)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.dbt_postgres.customer_details,PROD),address)"
+ ],
+ "confidenceScore": 0.9
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.city,PROD),city)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.dbt_postgres.customer_details,PROD),city)"
+ ],
+ "confidenceScore": 0.9
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.address,PROD),postal_code)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.dbt_postgres.customer_details,PROD),postal_code)"
+ ],
+ "confidenceScore": 0.9
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.address,PROD),phone)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.dbt_postgres.customer_details,PROD),phone)"
+ ],
+ "confidenceScore": 0.9
+ }
]
}
},
@@ -376,6 +456,41 @@
"dataset": "urn:li:dataset:(urn:li:dataPlatform:postgres,pagila.dbt_postgres.payments_by_customer_by_month,PROD)",
"type": "TRANSFORMED"
}
+ ],
+ "fineGrainedLineages": [
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,pagila.dbt_postgres.payments_by_customer_by_month,PROD),billing_month)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.dbt_postgres.an-aliased-view-for-monthly-billing,PROD),billing_month)"
+ ],
+ "confidenceScore": 0.9
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,pagila.dbt_postgres.payments_by_customer_by_month,PROD),customer_id)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.dbt_postgres.an-aliased-view-for-monthly-billing,PROD),customer_id)"
+ ],
+ "confidenceScore": 0.9
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,pagila.dbt_postgres.payments_by_customer_by_month,PROD),amount)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.dbt_postgres.an-aliased-view-for-monthly-billing,PROD),amount)"
+ ],
+ "confidenceScore": 0.9
+ }
]
}
},
@@ -598,6 +713,104 @@
"dataset": "urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_06,PROD)",
"type": "TRANSFORMED"
}
+ ],
+ "fineGrainedLineages": [
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_01,PROD),amount)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_02,PROD),amount)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_03,PROD),amount)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_04,PROD),amount)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_05,PROD),amount)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_06,PROD),amount)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.dbt_postgres.an-aliased-view-for-payments,PROD),amount)"
+ ],
+ "confidenceScore": 0.9
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_01,PROD),customer_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_02,PROD),customer_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_03,PROD),customer_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_04,PROD),customer_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_05,PROD),customer_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_06,PROD),customer_id)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.dbt_postgres.an-aliased-view-for-payments,PROD),customer_id)"
+ ],
+ "confidenceScore": 0.9
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_01,PROD),payment_date)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_02,PROD),payment_date)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_03,PROD),payment_date)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_04,PROD),payment_date)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_05,PROD),payment_date)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_06,PROD),payment_date)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.dbt_postgres.an-aliased-view-for-payments,PROD),payment_date)"
+ ],
+ "confidenceScore": 0.9
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_01,PROD),payment_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_02,PROD),payment_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_03,PROD),payment_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_04,PROD),payment_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_05,PROD),payment_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_06,PROD),payment_id)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.dbt_postgres.an-aliased-view-for-payments,PROD),payment_id)"
+ ],
+ "confidenceScore": 0.9
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_01,PROD),rental_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_02,PROD),rental_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_03,PROD),rental_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_04,PROD),rental_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_05,PROD),rental_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_06,PROD),rental_id)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.dbt_postgres.an-aliased-view-for-payments,PROD),rental_id)"
+ ],
+ "confidenceScore": 0.9
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_01,PROD),staff_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_02,PROD),staff_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_03,PROD),staff_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_04,PROD),staff_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_05,PROD),staff_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_06,PROD),staff_id)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.dbt_postgres.an-aliased-view-for-payments,PROD),staff_id)"
+ ],
+ "confidenceScore": 0.9
+ }
]
}
},
@@ -737,6 +950,41 @@
"dataset": "urn:li:dataset:(urn:li:dataPlatform:postgres,pagila.dbt_postgres.an-aliased-view-for-payments,PROD)",
"type": "TRANSFORMED"
}
+ ],
+ "fineGrainedLineages": [
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,pagila.dbt_postgres.an-aliased-view-for-payments,PROD),payment_date)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.dbt_postgres.payments_by_customer_by_month,PROD),billing_month)"
+ ],
+ "confidenceScore": 0.9
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,pagila.dbt_postgres.an-aliased-view-for-payments,PROD),customer_id)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.dbt_postgres.payments_by_customer_by_month,PROD),customer_id)"
+ ],
+ "confidenceScore": 0.9
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,pagila.dbt_postgres.an-aliased-view-for-payments,PROD),amount)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.dbt_postgres.payments_by_customer_by_month,PROD),amount)"
+ ],
+ "confidenceScore": 0.9
+ }
]
}
},
diff --git a/metadata-ingestion/tests/integration/dbt/dbt_test_with_target_platform_instance_mces_golden.json b/metadata-ingestion/tests/integration/dbt/dbt_test_with_target_platform_instance_mces_golden.json
index a47abab6b40f7a..e25c5e4faf6afd 100644
--- a/metadata-ingestion/tests/integration/dbt/dbt_test_with_target_platform_instance_mces_golden.json
+++ b/metadata-ingestion/tests/integration/dbt/dbt_test_with_target_platform_instance_mces_golden.json
@@ -212,6 +212,86 @@
"dataset": "urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.customer,PROD)",
"type": "TRANSFORMED"
}
+ ],
+ "fineGrainedLineages": [
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.customer,PROD),customer_id)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.dbt_postgres.customer_details,PROD),customer_id)"
+ ],
+ "confidenceScore": 0.9
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.customer,PROD),first_name)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.customer,PROD),last_name)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.dbt_postgres.customer_details,PROD),full_name)"
+ ],
+ "confidenceScore": 0.9
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.customer,PROD),email)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.dbt_postgres.customer_details,PROD),email)"
+ ],
+ "confidenceScore": 0.9
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.address,PROD),address)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.dbt_postgres.customer_details,PROD),address)"
+ ],
+ "confidenceScore": 0.9
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.city,PROD),city)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.dbt_postgres.customer_details,PROD),city)"
+ ],
+ "confidenceScore": 0.9
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.address,PROD),postal_code)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.dbt_postgres.customer_details,PROD),postal_code)"
+ ],
+ "confidenceScore": 0.9
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.address,PROD),phone)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.dbt_postgres.customer_details,PROD),phone)"
+ ],
+ "confidenceScore": 0.9
+ }
]
}
},
@@ -376,6 +456,41 @@
"dataset": "urn:li:dataset:(urn:li:dataPlatform:postgres,ps-instance-1.pagila.dbt_postgres.payments_by_customer_by_month,PROD)",
"type": "TRANSFORMED"
}
+ ],
+ "fineGrainedLineages": [
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,ps-instance-1.pagila.dbt_postgres.payments_by_customer_by_month,PROD),billing_month)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.dbt_postgres.an-aliased-view-for-monthly-billing,PROD),billing_month)"
+ ],
+ "confidenceScore": 0.9
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,ps-instance-1.pagila.dbt_postgres.payments_by_customer_by_month,PROD),customer_id)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.dbt_postgres.an-aliased-view-for-monthly-billing,PROD),customer_id)"
+ ],
+ "confidenceScore": 0.9
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,ps-instance-1.pagila.dbt_postgres.payments_by_customer_by_month,PROD),amount)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.dbt_postgres.an-aliased-view-for-monthly-billing,PROD),amount)"
+ ],
+ "confidenceScore": 0.9
+ }
]
}
},
@@ -598,6 +713,104 @@
"dataset": "urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_06,PROD)",
"type": "TRANSFORMED"
}
+ ],
+ "fineGrainedLineages": [
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_01,PROD),amount)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_02,PROD),amount)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_03,PROD),amount)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_04,PROD),amount)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_05,PROD),amount)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_06,PROD),amount)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.dbt_postgres.an-aliased-view-for-payments,PROD),amount)"
+ ],
+ "confidenceScore": 0.9
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_01,PROD),customer_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_02,PROD),customer_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_03,PROD),customer_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_04,PROD),customer_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_05,PROD),customer_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_06,PROD),customer_id)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.dbt_postgres.an-aliased-view-for-payments,PROD),customer_id)"
+ ],
+ "confidenceScore": 0.9
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_01,PROD),payment_date)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_02,PROD),payment_date)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_03,PROD),payment_date)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_04,PROD),payment_date)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_05,PROD),payment_date)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_06,PROD),payment_date)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.dbt_postgres.an-aliased-view-for-payments,PROD),payment_date)"
+ ],
+ "confidenceScore": 0.9
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_01,PROD),payment_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_02,PROD),payment_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_03,PROD),payment_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_04,PROD),payment_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_05,PROD),payment_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_06,PROD),payment_id)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.dbt_postgres.an-aliased-view-for-payments,PROD),payment_id)"
+ ],
+ "confidenceScore": 0.9
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_01,PROD),rental_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_02,PROD),rental_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_03,PROD),rental_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_04,PROD),rental_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_05,PROD),rental_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_06,PROD),rental_id)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.dbt_postgres.an-aliased-view-for-payments,PROD),rental_id)"
+ ],
+ "confidenceScore": 0.9
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_01,PROD),staff_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_02,PROD),staff_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_03,PROD),staff_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_04,PROD),staff_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_05,PROD),staff_id)",
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_06,PROD),staff_id)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.dbt_postgres.an-aliased-view-for-payments,PROD),staff_id)"
+ ],
+ "confidenceScore": 0.9
+ }
]
}
},
@@ -737,6 +950,41 @@
"dataset": "urn:li:dataset:(urn:li:dataPlatform:postgres,ps-instance-1.pagila.dbt_postgres.an-aliased-view-for-payments,PROD)",
"type": "TRANSFORMED"
}
+ ],
+ "fineGrainedLineages": [
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,ps-instance-1.pagila.dbt_postgres.an-aliased-view-for-payments,PROD),payment_date)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.dbt_postgres.payments_by_customer_by_month,PROD),billing_month)"
+ ],
+ "confidenceScore": 0.9
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,ps-instance-1.pagila.dbt_postgres.an-aliased-view-for-payments,PROD),customer_id)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.dbt_postgres.payments_by_customer_by_month,PROD),customer_id)"
+ ],
+ "confidenceScore": 0.9
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,ps-instance-1.pagila.dbt_postgres.an-aliased-view-for-payments,PROD),amount)"
+ ],
+ "downstreamType": "FIELD_SET",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.dbt_postgres.payments_by_customer_by_month,PROD),amount)"
+ ],
+ "confidenceScore": 0.9
+ }
]
}
},
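For reviewers unfamiliar with the `fineGrainedLineages` entries added to the golden files above: each entry maps a set of upstream schema-field URNs to the downstream fields they feed, tagged with a confidence score. Below is a small, illustrative sketch of how one such entry can be built with the DataHub Python emitter classes; the URNs are copied from the fixtures, and the snippet itself is not part of any patch in this series.

```python
# Illustrative only: constructing one of the fineGrainedLineages entries from
# the golden files above using the DataHub Python SDK helper classes.
import datahub.emitter.mce_builder as builder
from datahub.metadata.schema_classes import (
    FineGrainedLineageClass,
    FineGrainedLineageDownstreamTypeClass,
    FineGrainedLineageUpstreamTypeClass,
)

# Dataset URNs taken from the dbt test fixtures.
upstream = builder.make_dataset_urn("dbt", "pagila.public.customer", "PROD")
downstream = builder.make_dataset_urn("dbt", "pagila.dbt_postgres.customer_details", "PROD")

# One field-level edge: customer.customer_id -> customer_details.customer_id.
lineage = FineGrainedLineageClass(
    upstreamType=FineGrainedLineageUpstreamTypeClass.FIELD_SET,
    upstreams=[builder.make_schema_field_urn(upstream, "customer_id")],
    downstreamType=FineGrainedLineageDownstreamTypeClass.FIELD_SET,
    downstreams=[builder.make_schema_field_urn(downstream, "customer_id")],
    confidenceScore=0.9,
)
```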
From 03590a194885b2fbbb5249aef909d761c3ffc12c Mon Sep 17 00:00:00 2001
From: Tamas Nemeth
Date: Mon, 18 Dec 2023 19:54:31 +0100
Subject: [PATCH 037/540] fix(ingest/snowflake): fix Snowflake URL with
 default region (#9443)
---
metadata-ingestion/setup.py | 8 +-
.../source/snowflake/snowflake_utils.py | 28 ++++-
.../snowflake/snowflake_golden.json | 116 +++++++++---------
.../integration/sql_server/test_sql_server.py | 5 +
.../tests/unit/test_snowflake_source.py | 27 ++++
5 files changed, 120 insertions(+), 64 deletions(-)
diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py
index 1bc1bc5100b08d..cb13a40125c0da 100644
--- a/metadata-ingestion/setup.py
+++ b/metadata-ingestion/setup.py
@@ -354,7 +354,11 @@
"mlflow": {"mlflow-skinny>=2.3.0"},
"mode": {"requests", "tenacity>=8.0.1"} | sqllineage_lib,
"mongodb": {"pymongo[srv]>=3.11", "packaging"},
- "mssql": sql_common | {"sqlalchemy-pytds>=0.3", "pyOpenSSL"},
+ "mssql": sql_common
+ | {
+ "sqlalchemy-pytds>=0.3",
+ "pyOpenSSL",
+ },
"mssql-odbc": sql_common | {"pyodbc"},
"mysql": mysql,
# mariadb should have same dependency as mysql
@@ -559,7 +563,7 @@
"kafka-connect",
"ldap",
"mongodb",
- "mssql",
+ "mssql" if sys.version_info >= (3, 8) else None,
"mysql",
"mariadb",
"redash",
diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_utils.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_utils.py
index 5a451bf197d347..af8d8824a4b172 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_utils.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_utils.py
@@ -9,8 +9,8 @@
from datahub.configuration.pattern_utils import is_schema_allowed
from datahub.ingestion.source.snowflake.constants import (
GENERIC_PERMISSION_ERROR_KEY,
- SNOWFLAKE_DEFAULT_CLOUD,
SNOWFLAKE_REGION_CLOUD_REGION_MAPPING,
+ SnowflakeCloudProvider,
SnowflakeObjectDomain,
)
from datahub.ingestion.source.snowflake.snowflake_config import SnowflakeV2Config
@@ -72,6 +72,15 @@ def report_error(self, key: str, reason: str) -> None:
class SnowflakeCommonMixin:
platform = "snowflake"
+ CLOUD_REGION_IDS_WITHOUT_CLOUD_SUFFIX = [
+ "us-west-2",
+ "us-east-1",
+ "eu-west-1",
+ "eu-central-1",
+ "ap-southeast-1",
+ "ap-southeast-2",
+ ]
+
@staticmethod
def create_snowsight_base_url(
account_locator: str,
@@ -79,12 +88,23 @@ def create_snowsight_base_url(
cloud: str,
privatelink: bool = False,
) -> Optional[str]:
+ if cloud:
+ url_cloud_provider_suffix = f".{cloud}"
+
+ if cloud == SnowflakeCloudProvider.AWS:
+            # Some AWS regions do not have a cloud suffix; see the list at:
+ # https://docs.snowflake.com/en/user-guide/admin-account-identifier#non-vps-account-locator-formats-by-cloud-platform-and-region
+ if (
+ cloud_region_id
+ in SnowflakeCommonMixin.CLOUD_REGION_IDS_WITHOUT_CLOUD_SUFFIX
+ ):
+ url_cloud_provider_suffix = ""
+ else:
+ url_cloud_provider_suffix = f".{cloud}"
if privatelink:
url = f"https://app.{account_locator}.{cloud_region_id}.privatelink.snowflakecomputing.com/"
- elif cloud == SNOWFLAKE_DEFAULT_CLOUD:
- url = f"https://app.snowflake.com/{cloud_region_id}/{account_locator}/"
else:
- url = f"https://app.snowflake.com/{cloud_region_id}.{cloud}/{account_locator}/"
+ url = f"https://app.snowflake.com/{cloud_region_id}{url_cloud_provider_suffix}/{account_locator}/"
return url
@staticmethod
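The hunk above changes how the Snowsight base URL is assembled: the cloud-provider suffix (e.g. `.aws`) is now appended to the region only when the region actually requires it, instead of special-casing the default cloud. A minimal standalone sketch of that behaviour follows; it uses plain strings in place of the `SnowflakeCloudProvider` enum, and the function name is invented here for illustration, so it is not part of the change itself.

```python
# Standalone sketch of the URL logic introduced above, for review purposes only.
# The region list mirrors CLOUD_REGION_IDS_WITHOUT_CLOUD_SUFFIX from the patch.
from typing import Optional

_REGIONS_WITHOUT_CLOUD_SUFFIX = [
    "us-west-2", "us-east-1", "eu-west-1",
    "eu-central-1", "ap-southeast-1", "ap-southeast-2",
]

def snowsight_base_url(
    account_locator: str,
    cloud_region_id: str,
    cloud: str,
    privatelink: bool = False,
) -> Optional[str]:
    # Default: append the cloud provider (e.g. ".aws") after the region id.
    suffix = f".{cloud}" if cloud else ""
    # Certain AWS regions are addressed without the ".aws" suffix.
    if cloud == "aws" and cloud_region_id in _REGIONS_WITHOUT_CLOUD_SUFFIX:
        suffix = ""
    if privatelink:
        return f"https://app.{account_locator}.{cloud_region_id}.privatelink.snowflakecomputing.com/"
    return f"https://app.snowflake.com/{cloud_region_id}{suffix}/{account_locator}/"

# snowsight_base_url("abc12345", "ap-south-1", "aws")
#   -> "https://app.snowflake.com/ap-south-1.aws/abc12345/"
#      (the base of the updated externalUrl values in the golden file below)
# snowsight_base_url("abc12345", "us-east-1", "aws")
#   -> "https://app.snowflake.com/us-east-1/abc12345/"  (no ".aws" suffix)
```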
diff --git a/metadata-ingestion/tests/integration/snowflake/snowflake_golden.json b/metadata-ingestion/tests/integration/snowflake/snowflake_golden.json
index c7273fee5a2e58..ece54f00eeaa04 100644
--- a/metadata-ingestion/tests/integration/snowflake/snowflake_golden.json
+++ b/metadata-ingestion/tests/integration/snowflake/snowflake_golden.json
@@ -11,20 +11,20 @@
"env": "PROD",
"database": "test_db"
},
- "externalUrl": "https://app.snowflake.com/ap-south-1/abc12345/#/data/databases/TEST_DB/",
+ "externalUrl": "https://app.snowflake.com/ap-south-1.aws/abc12345/#/data/databases/TEST_DB/",
"name": "TEST_DB",
"description": "Comment for TEST_DB",
"created": {
- "time": 1623110400000
+ "time": 1623103200000
},
"lastModified": {
- "time": 1623110400000
+ "time": 1623103200000
}
}
},
"systemMetadata": {
- "lastObserved": 1654621200000,
- "runId": "snowflake-2022_06_07-17_00_00",
+ "lastObserved": 1615443388097,
+ "runId": "snowflake-2023_12_18-10_16_09",
"lastRunId": "no-run-id-provided"
}
},
@@ -144,20 +144,20 @@
"database": "test_db",
"schema": "test_schema"
},
- "externalUrl": "https://app.snowflake.com/ap-south-1/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/",
+ "externalUrl": "https://app.snowflake.com/ap-south-1.aws/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/",
"name": "TEST_SCHEMA",
"description": "comment for TEST_DB.TEST_SCHEMA",
"created": {
- "time": 1623110400000
+ "time": 1623103200000
},
"lastModified": {
- "time": 1623110400000
+ "time": 1623103200000
}
}
},
"systemMetadata": {
- "lastObserved": 1654621200000,
- "runId": "snowflake-2022_06_07-17_00_00",
+ "lastObserved": 1615443388097,
+ "runId": "snowflake-2023_12_18-10_16_09",
"lastRunId": "no-run-id-provided"
}
},
@@ -489,22 +489,22 @@
"aspect": {
"json": {
"customProperties": {},
- "externalUrl": "https://app.snowflake.com/ap-south-1/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/table/TABLE_1/",
+ "externalUrl": "https://app.snowflake.com/ap-south-1.aws/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/table/TABLE_1/",
"name": "TABLE_1",
"qualifiedName": "TEST_DB.TEST_SCHEMA.TABLE_1",
"description": "Comment for Table",
"created": {
- "time": 1623090600000
+ "time": 1623103200000
},
"lastModified": {
- "time": 1623090600000
+ "time": 1623103200000
},
"tags": []
}
},
"systemMetadata": {
"lastObserved": 1615443388097,
- "runId": "snowflake-2023_08_04-09_52_28",
+ "runId": "snowflake-2023_12_18-10_16_09",
"lastRunId": "no-run-id-provided"
}
},
@@ -788,22 +788,22 @@
"aspect": {
"json": {
"customProperties": {},
- "externalUrl": "https://app.snowflake.com/ap-south-1/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/table/TABLE_2/",
+ "externalUrl": "https://app.snowflake.com/ap-south-1.aws/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/table/TABLE_2/",
"name": "TABLE_2",
"qualifiedName": "TEST_DB.TEST_SCHEMA.TABLE_2",
"description": "Comment for Table",
"created": {
- "time": 1623090600000
+ "time": 1623103200000
},
"lastModified": {
- "time": 1623090600000
+ "time": 1623103200000
},
"tags": []
}
},
"systemMetadata": {
"lastObserved": 1615443388097,
- "runId": "snowflake-2023_08_04-09_52_28",
+ "runId": "snowflake-2023_12_18-10_16_09",
"lastRunId": "no-run-id-provided"
}
},
@@ -1087,22 +1087,22 @@
"aspect": {
"json": {
"customProperties": {},
- "externalUrl": "https://app.snowflake.com/ap-south-1/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/table/TABLE_3/",
+ "externalUrl": "https://app.snowflake.com/ap-south-1.aws/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/table/TABLE_3/",
"name": "TABLE_3",
"qualifiedName": "TEST_DB.TEST_SCHEMA.TABLE_3",
"description": "Comment for Table",
"created": {
- "time": 1623090600000
+ "time": 1623103200000
},
"lastModified": {
- "time": 1623090600000
+ "time": 1623103200000
},
"tags": []
}
},
"systemMetadata": {
"lastObserved": 1615443388097,
- "runId": "snowflake-2023_08_04-09_52_28",
+ "runId": "snowflake-2023_12_18-10_16_09",
"lastRunId": "no-run-id-provided"
}
},
@@ -1386,22 +1386,22 @@
"aspect": {
"json": {
"customProperties": {},
- "externalUrl": "https://app.snowflake.com/ap-south-1/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/table/TABLE_4/",
+ "externalUrl": "https://app.snowflake.com/ap-south-1.aws/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/table/TABLE_4/",
"name": "TABLE_4",
"qualifiedName": "TEST_DB.TEST_SCHEMA.TABLE_4",
"description": "Comment for Table",
"created": {
- "time": 1623090600000
+ "time": 1623103200000
},
"lastModified": {
- "time": 1623090600000
+ "time": 1623103200000
},
"tags": []
}
},
"systemMetadata": {
"lastObserved": 1615443388097,
- "runId": "snowflake-2023_08_04-09_52_28",
+ "runId": "snowflake-2023_12_18-10_16_09",
"lastRunId": "no-run-id-provided"
}
},
@@ -1685,22 +1685,22 @@
"aspect": {
"json": {
"customProperties": {},
- "externalUrl": "https://app.snowflake.com/ap-south-1/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/table/TABLE_5/",
+ "externalUrl": "https://app.snowflake.com/ap-south-1.aws/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/table/TABLE_5/",
"name": "TABLE_5",
"qualifiedName": "TEST_DB.TEST_SCHEMA.TABLE_5",
"description": "Comment for Table",
"created": {
- "time": 1623090600000
+ "time": 1623103200000
},
"lastModified": {
- "time": 1623090600000
+ "time": 1623103200000
},
"tags": []
}
},
"systemMetadata": {
"lastObserved": 1615443388097,
- "runId": "snowflake-2023_08_04-09_52_28",
+ "runId": "snowflake-2023_12_18-10_16_09",
"lastRunId": "no-run-id-provided"
}
},
@@ -1984,22 +1984,22 @@
"aspect": {
"json": {
"customProperties": {},
- "externalUrl": "https://app.snowflake.com/ap-south-1/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/table/TABLE_6/",
+ "externalUrl": "https://app.snowflake.com/ap-south-1.aws/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/table/TABLE_6/",
"name": "TABLE_6",
"qualifiedName": "TEST_DB.TEST_SCHEMA.TABLE_6",
"description": "Comment for Table",
"created": {
- "time": 1623090600000
+ "time": 1623103200000
},
"lastModified": {
- "time": 1623090600000
+ "time": 1623103200000
},
"tags": []
}
},
"systemMetadata": {
"lastObserved": 1615443388097,
- "runId": "snowflake-2023_08_04-09_52_28",
+ "runId": "snowflake-2023_12_18-10_16_09",
"lastRunId": "no-run-id-provided"
}
},
@@ -2283,22 +2283,22 @@
"aspect": {
"json": {
"customProperties": {},
- "externalUrl": "https://app.snowflake.com/ap-south-1/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/table/TABLE_7/",
+ "externalUrl": "https://app.snowflake.com/ap-south-1.aws/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/table/TABLE_7/",
"name": "TABLE_7",
"qualifiedName": "TEST_DB.TEST_SCHEMA.TABLE_7",
"description": "Comment for Table",
"created": {
- "time": 1623090600000
+ "time": 1623103200000
},
"lastModified": {
- "time": 1623090600000
+ "time": 1623103200000
},
"tags": []
}
},
"systemMetadata": {
"lastObserved": 1615443388097,
- "runId": "snowflake-2023_08_04-09_52_28",
+ "runId": "snowflake-2023_12_18-10_16_09",
"lastRunId": "no-run-id-provided"
}
},
@@ -2582,22 +2582,22 @@
"aspect": {
"json": {
"customProperties": {},
- "externalUrl": "https://app.snowflake.com/ap-south-1/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/table/TABLE_8/",
+ "externalUrl": "https://app.snowflake.com/ap-south-1.aws/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/table/TABLE_8/",
"name": "TABLE_8",
"qualifiedName": "TEST_DB.TEST_SCHEMA.TABLE_8",
"description": "Comment for Table",
"created": {
- "time": 1623090600000
+ "time": 1623103200000
},
"lastModified": {
- "time": 1623090600000
+ "time": 1623103200000
},
"tags": []
}
},
"systemMetadata": {
"lastObserved": 1615443388097,
- "runId": "snowflake-2023_08_04-09_52_28",
+ "runId": "snowflake-2023_12_18-10_16_09",
"lastRunId": "no-run-id-provided"
}
},
@@ -2881,22 +2881,22 @@
"aspect": {
"json": {
"customProperties": {},
- "externalUrl": "https://app.snowflake.com/ap-south-1/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/table/TABLE_9/",
+ "externalUrl": "https://app.snowflake.com/ap-south-1.aws/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/table/TABLE_9/",
"name": "TABLE_9",
"qualifiedName": "TEST_DB.TEST_SCHEMA.TABLE_9",
"description": "Comment for Table",
"created": {
- "time": 1623090600000
+ "time": 1623103200000
},
"lastModified": {
- "time": 1623090600000
+ "time": 1623103200000
},
"tags": []
}
},
"systemMetadata": {
"lastObserved": 1615443388097,
- "runId": "snowflake-2023_08_04-09_52_28",
+ "runId": "snowflake-2023_12_18-10_16_09",
"lastRunId": "no-run-id-provided"
}
},
@@ -3180,22 +3180,22 @@
"aspect": {
"json": {
"customProperties": {},
- "externalUrl": "https://app.snowflake.com/ap-south-1/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/table/TABLE_10/",
+ "externalUrl": "https://app.snowflake.com/ap-south-1.aws/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/table/TABLE_10/",
"name": "TABLE_10",
"qualifiedName": "TEST_DB.TEST_SCHEMA.TABLE_10",
"description": "Comment for Table",
"created": {
- "time": 1623090600000
+ "time": 1623103200000
},
"lastModified": {
- "time": 1623090600000
+ "time": 1623103200000
},
"tags": []
}
},
"systemMetadata": {
"lastObserved": 1615443388097,
- "runId": "snowflake-2023_08_04-09_52_28",
+ "runId": "snowflake-2023_12_18-10_16_09",
"lastRunId": "no-run-id-provided"
}
},
@@ -3470,22 +3470,22 @@
"aspect": {
"json": {
"customProperties": {},
- "externalUrl": "https://app.snowflake.com/ap-south-1/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/view/VIEW_1/",
+ "externalUrl": "https://app.snowflake.com/ap-south-1.aws/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/view/VIEW_1/",
"name": "VIEW_1",
"qualifiedName": "TEST_DB.TEST_SCHEMA.VIEW_1",
"description": "Comment for View",
"created": {
- "time": 1623090600000
+ "time": 1623103200000
},
"lastModified": {
- "time": 1623090600000
+ "time": 1623103200000
},
"tags": []
}
},
"systemMetadata": {
"lastObserved": 1615443388097,
- "runId": "snowflake-2023_08_04-09_52_28",
+ "runId": "snowflake-2023_12_18-10_16_09",
"lastRunId": "no-run-id-provided"
}
},
@@ -3805,22 +3805,22 @@
"aspect": {
"json": {
"customProperties": {},
- "externalUrl": "https://app.snowflake.com/ap-south-1/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/view/VIEW_2/",
+ "externalUrl": "https://app.snowflake.com/ap-south-1.aws/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/view/VIEW_2/",
"name": "VIEW_2",
"qualifiedName": "TEST_DB.TEST_SCHEMA.VIEW_2",
"description": "Comment for View",
"created": {
- "time": 1623090600000
+ "time": 1623103200000
},
"lastModified": {
- "time": 1623090600000
+ "time": 1623103200000
},
"tags": []
}
},
"systemMetadata": {
"lastObserved": 1615443388097,
- "runId": "snowflake-2023_08_04-09_52_28",
+ "runId": "snowflake-2023_12_18-10_16_09",
"lastRunId": "no-run-id-provided"
}
},
diff --git a/metadata-ingestion/tests/integration/sql_server/test_sql_server.py b/metadata-ingestion/tests/integration/sql_server/test_sql_server.py
index f439a322c26771..5ed672d527264a 100644
--- a/metadata-ingestion/tests/integration/sql_server/test_sql_server.py
+++ b/metadata-ingestion/tests/integration/sql_server/test_sql_server.py
@@ -1,5 +1,6 @@
import os
import subprocess
+import sys
import time
import pytest
@@ -8,6 +9,10 @@
from tests.test_helpers.click_helpers import run_datahub_cmd
from tests.test_helpers.docker_helpers import cleanup_image, wait_for_port
+pytestmark = pytest.mark.skipif(
+ sys.version_info < (3, 8), reason="requires python 3.8 or higher"
+)
+
@pytest.fixture(scope="module")
def mssql_runner(docker_compose_runner, pytestconfig):
diff --git a/metadata-ingestion/tests/unit/test_snowflake_source.py b/metadata-ingestion/tests/unit/test_snowflake_source.py
index 536c91ace4f5ed..69a7510692df1d 100644
--- a/metadata-ingestion/tests/unit/test_snowflake_source.py
+++ b/metadata-ingestion/tests/unit/test_snowflake_source.py
@@ -24,6 +24,7 @@
from datahub.ingestion.source.snowflake.snowflake_usage_v2 import (
SnowflakeObjectAccessEntry,
)
+from datahub.ingestion.source.snowflake.snowflake_utils import SnowflakeCommonMixin
from datahub.ingestion.source.snowflake.snowflake_v2 import SnowflakeV2Source
from tests.test_helpers import test_connection_helpers
@@ -584,3 +585,29 @@ def test_email_filter_query_generation_with_case_insensitive_filter():
filter_query
== "AND (rlike(user_name, '.*@example.com','c')) AND NOT (rlike(user_name, '.*@example2.com','c'))"
)
+
+
+def test_create_snowsight_base_url_us_west():
+ (
+ cloud,
+ cloud_region_id,
+ ) = SnowflakeCommonMixin.get_cloud_region_from_snowflake_region_id("aws_us_west_2")
+
+ result = SnowflakeCommonMixin.create_snowsight_base_url(
+ "account_locator", cloud_region_id, cloud, False
+ )
+ assert result == "https://app.snowflake.com/us-west-2/account_locator/"
+
+
+def test_create_snowsight_base_url_ap_northeast_1():
+ (
+ cloud,
+ cloud_region_id,
+ ) = SnowflakeCommonMixin.get_cloud_region_from_snowflake_region_id(
+ "aws_ap_northeast_1"
+ )
+
+ result = SnowflakeCommonMixin.create_snowsight_base_url(
+ "account_locator", cloud_region_id, cloud, False
+ )
+ assert result == "https://app.snowflake.com/ap-northeast-1.aws/account_locator/"
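The two tests above pin down the Snowsight URL scheme that also explains the golden-file churn earlier in this patch (externalUrl moving from `ap-south-1` to `ap-south-1.aws`). A minimal Python sketch of that scheme, inferred only from what the tests assert; the helper name and region table below are illustrative stand-ins, not the actual SnowflakeCommonMixin code:

```python
# Illustrative sketch only; the function name and region table are assumptions.
SNOWFLAKE_REGION_TO_CLOUD_REGION = {
    "aws_us_west_2": ("aws", "us-west-2"),
    "aws_ap_northeast_1": ("aws", "ap-northeast-1"),
    "aws_ap_south_1": ("aws", "ap-south-1"),
}

def snowsight_base_url_sketch(account_locator: str, cloud_region_id: str, cloud: str) -> str:
    # Per the tests, us-west-2 appears to be treated as the default AWS region and
    # keeps a bare region id, while other AWS regions carry a ".aws" suffix --
    # hence "ap-south-1.aws" in the updated golden files. Privatelink is omitted.
    region = cloud_region_id
    if cloud == "aws" and cloud_region_id != "us-west-2":
        region = f"{cloud_region_id}.{cloud}"
    return f"https://app.snowflake.com/{region}/{account_locator}/"

cloud, region_id = SNOWFLAKE_REGION_TO_CLOUD_REGION["aws_ap_northeast_1"]
assert (
    snowsight_base_url_sketch("account_locator", region_id, cloud)
    == "https://app.snowflake.com/ap-northeast-1.aws/account_locator/"
)
```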
From 193d1464a628fc800e926f04fcd4bd1d6774d858 Mon Sep 17 00:00:00 2001
From: noggi
Date: Mon, 18 Dec 2023 14:06:17 -0800
Subject: [PATCH 038/540] Fix downstream CI issue (#9479)
---
docker/datahub-ingestion-base/Dockerfile | 2 +-
docker/datahub-ingestion/Dockerfile | 2 +-
docker/datahub-ingestion/build.gradle | 2 +-
3 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/docker/datahub-ingestion-base/Dockerfile b/docker/datahub-ingestion-base/Dockerfile
index e0f9fdc997071c..81fec61ea50733 100644
--- a/docker/datahub-ingestion-base/Dockerfile
+++ b/docker/datahub-ingestion-base/Dockerfile
@@ -4,7 +4,7 @@ ARG BASE_IMAGE=base
# Defining custom repo urls for use in enterprise environments. Re-used between stages below.
ARG ALPINE_REPO_URL=http://dl-cdn.alpinelinux.org/alpine
ARG GITHUB_REPO_URL=https://github.com
-ARG DEBIAN_REPO_URL=http://deb.debian.org/debian
+ARG DEBIAN_REPO_URL=https://deb.debian.org/debian
ARG PIP_MIRROR_URL=null
FROM golang:1-alpine3.18 AS dockerize-binary
diff --git a/docker/datahub-ingestion/Dockerfile b/docker/datahub-ingestion/Dockerfile
index 9516c31a19e21b..2898a363a0a185 100644
--- a/docker/datahub-ingestion/Dockerfile
+++ b/docker/datahub-ingestion/Dockerfile
@@ -3,7 +3,7 @@ ARG APP_ENV=full
ARG BASE_IMAGE=acryldata/datahub-ingestion-base
ARG DOCKER_VERSION=head
ARG PIP_MIRROR_URL=null
-ARG DEBIAN_REPO_URL=http://deb.debian.org/debian
+ARG DEBIAN_REPO_URL=https://deb.debian.org/debian
FROM $BASE_IMAGE:$DOCKER_VERSION as base
USER 0
diff --git a/docker/datahub-ingestion/build.gradle b/docker/datahub-ingestion/build.gradle
index 36444210f1938b..0b08f189e6b45a 100644
--- a/docker/datahub-ingestion/build.gradle
+++ b/docker/datahub-ingestion/build.gradle
@@ -33,7 +33,7 @@ docker {
i -> (!i.file.name.endsWith(".dockerignore") && i.file.isHidden())
}
- def dockerBuildArgs = [DOCKER_VERSION: version, RELEASE_VERSION: version.replace('-SNAPSHOT', '').replace('v', '').replace("-slim", '')]
+ def dockerBuildArgs = [DOCKER_VERSION: version, RELEASE_VERSION: version.replace('-SNAPSHOT', '').replace('v', '').replace("-slim", ''), BASE_IMAGE: "${docker_registry}/datahub-ingestion-base"]
// Add build args if they are defined (needed for some CI or enterprise environments)
if (project.hasProperty('pipMirrorUrl')) {
From ecda3e618704c5eb335ad1a21c30f0c935581f64 Mon Sep 17 00:00:00 2001
From: Harshal Sheth
Date: Mon, 18 Dec 2023 18:26:33 -0500
Subject: [PATCH 039/540] feat(ingest): pydantic v2 compatibility (#9434)
---
.github/workflows/airflow-plugin.yml | 7 ++--
.../airflow-plugin/tox.ini | 9 +++++
metadata-ingestion/setup.py | 39 ++++++++++++++++---
.../api/entities/datacontract/assertion.py | 4 +-
.../datacontract/assertion_operator.py | 16 ++++----
.../datacontract/data_quality_assertion.py | 11 +++---
.../api/entities/datacontract/datacontract.py | 23 +++++------
.../datacontract/freshness_assertion.py | 15 ++++---
.../entities/datacontract/schema_assertion.py | 14 ++++---
.../src/datahub/cli/check_cli.py | 13 ++++++-
.../src/datahub/configuration/common.py | 16 +++++++-
.../src/datahub/configuration/datetimes.py | 4 +-
.../pydantic_migration_helpers.py | 29 ++++++++++++++
.../configuration/time_window_config.py | 16 ++++++--
.../configuration/validate_field_rename.py | 4 +-
.../ingestion/glossary/datahub_classifier.py | 11 +++++-
.../source/bigquery_v2/bigquery_config.py | 2 +-
.../ingestion/source/delta_lake/config.py | 4 +-
.../source/snowflake/snowflake_config.py | 2 +-
.../ingestion/source_config/sql/snowflake.py | 2 +-
.../src/datahub/utilities/urns/urn_iter.py | 2 +-
.../integration/snowflake/test_snowflake.py | 16 ++++----
.../unit/{ => config}/test_allow_deny.py | 0
.../unit/{ => config}/test_config_clean.py | 0
.../tests/unit/config/test_config_model.py | 18 +++++++--
.../{ => config}/test_pydantic_validators.py | 13 +++++--
.../{ => config}/test_time_window_config.py | 0
27 files changed, 209 insertions(+), 81 deletions(-)
rename metadata-ingestion/tests/unit/{ => config}/test_allow_deny.py (100%)
rename metadata-ingestion/tests/unit/{ => config}/test_config_clean.py (100%)
rename metadata-ingestion/tests/unit/{ => config}/test_pydantic_validators.py (92%)
rename metadata-ingestion/tests/unit/{ => config}/test_time_window_config.py (100%)
diff --git a/.github/workflows/airflow-plugin.yml b/.github/workflows/airflow-plugin.yml
index cd1e159b7d53cc..70816e5f093d13 100644
--- a/.github/workflows/airflow-plugin.yml
+++ b/.github/workflows/airflow-plugin.yml
@@ -32,6 +32,7 @@ jobs:
strategy:
matrix:
include:
+ # Note: this should be kept in sync with tox.ini.
- python-version: "3.8"
extra_pip_requirements: "apache-airflow~=2.1.4"
extra_pip_extras: plugin-v1
@@ -39,13 +40,13 @@ jobs:
extra_pip_requirements: "apache-airflow~=2.2.4"
extra_pip_extras: plugin-v1
- python-version: "3.10"
- extra_pip_requirements: "apache-airflow~=2.4.0"
+ extra_pip_requirements: 'apache-airflow~=2.4.0 pluggy==1.0.0 "pendulum<3.0"'
extra_pip_extras: plugin-v2
- python-version: "3.10"
- extra_pip_requirements: "apache-airflow~=2.6.0"
+ extra_pip_requirements: 'apache-airflow~=2.6.0 "pendulum<3.0"'
extra_pip_extras: plugin-v2
- python-version: "3.10"
- extra_pip_requirements: "apache-airflow>=2.7.0"
+ extra_pip_requirements: "apache-airflow>=2.7.0 pydantic==2.4.2"
extra_pip_extras: plugin-v2
fail-fast: false
steps:
diff --git a/metadata-ingestion-modules/airflow-plugin/tox.ini b/metadata-ingestion-modules/airflow-plugin/tox.ini
index 1010bd2933e452..27ae2ce65ba658 100644
--- a/metadata-ingestion-modules/airflow-plugin/tox.ini
+++ b/metadata-ingestion-modules/airflow-plugin/tox.ini
@@ -10,6 +10,7 @@ envlist = py38-airflow21, py38-airflow22, py310-airflow24, py310-airflow26, py31
use_develop = true
extras = dev,integration-tests,plugin-v1
deps =
+ # This should be kept in sync with the Github Actions matrix.
-e ../../metadata-ingestion/
# Airflow version
airflow21: apache-airflow~=2.1.0
@@ -20,7 +21,15 @@ deps =
# See https://github.com/datahub-project/datahub/pull/9365
airflow24: apache-airflow~=2.4.0,pluggy==1.0.0
airflow26: apache-airflow~=2.6.0
+ # Respect the constraints file on pendulum.
+ # See https://github.com/apache/airflow/issues/36274
+ airflow24,airflow26: pendulum>=2.0,<3.0
+ # The Airflow 2.7 constraints file points at pydantic v2, so we match that here.
+ # https://raw.githubusercontent.com/apache/airflow/constraints-2.7.3/constraints-3.10.txt
+ # Note that Airflow is actually compatible with both pydantic v1 and v2, and the
+ # constraints file is overly restrictive.
airflow27: apache-airflow~=2.7.0
+ airflow27: pydantic==2.4.2
commands =
pytest --cov-append {posargs}
diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py
index cb13a40125c0da..13c9d3c99aaca1 100644
--- a/metadata-ingestion/setup.py
+++ b/metadata-ingestion/setup.py
@@ -14,9 +14,10 @@
"mypy_extensions>=0.4.3",
# Actual dependencies.
"typing-inspect",
+ # pydantic 1.8.2 is incompatible with mypy 0.910.
+ # See https://github.com/samuelcolvin/pydantic/pull/3175#issuecomment-995382910.
# pydantic 1.10.3 is incompatible with typing-extensions 4.1.1 - https://github.com/pydantic/pydantic/issues/4885
- # pydantic 2 makes major, backwards-incompatible changes - https://github.com/pydantic/pydantic/issues/4887
- "pydantic>=1.5.1,!=1.10.3,<2",
+ "pydantic>=1.10.0,!=1.10.3",
"mixpanel>=4.9.0",
"sentry-sdk",
}
@@ -53,6 +54,18 @@
"ruamel.yaml",
}
+pydantic_no_v2 = {
+ # pydantic 2 makes major, backwards-incompatible changes - https://github.com/pydantic/pydantic/issues/4887
+    # Used for sources whose dependencies are not yet compatible with the pydantic v2 API.
+ "pydantic<2",
+}
+
+plugin_common = {
+ # While pydantic v2 support is experimental, require that all plugins
+ # continue to use v1. This will ensure that no ingestion recipes break.
+ *pydantic_no_v2,
+}
+
rest_common = {"requests", "requests_file"}
kafka_common = {
@@ -118,6 +131,7 @@
"sqlalchemy>=1.4.39, <2",
# Required for SQL profiling.
"great-expectations>=0.15.12, <=0.15.50",
+ *pydantic_no_v2, # because of great-expectations
# scipy version restricted to reduce backtracking, used by great-expectations,
"scipy>=1.7.2",
# GE added handling for higher version of jinja2
@@ -229,6 +243,7 @@
iceberg_common = {
# Iceberg Python SDK
"pyiceberg",
+ *pydantic_no_v2, # because of pyiceberg
"pyarrow>=9.0.0, <13.0.0",
}
@@ -477,9 +492,6 @@
"flake8-bugbear==23.3.12",
"isort>=5.7.0",
"mypy==1.0.0",
- # pydantic 1.8.2 is incompatible with mypy 0.910.
- # See https://github.com/samuelcolvin/pydantic/pull/3175#issuecomment-995382910.
- "pydantic>=1.10.0",
*test_api_requirements,
pytest_dep,
"pytest-asyncio>=0.16.0",
@@ -740,7 +752,22 @@
extras_require={
"base": list(framework_common),
**{
- plugin: list(framework_common | dependencies)
+ plugin: list(
+ framework_common
+ | (
+ plugin_common
+ if plugin
+ not in {
+ "airflow",
+ "datahub-rest",
+ "datahub-kafka",
+ "sync-file-emitter",
+ "sql-parser",
+ }
+ else set()
+ )
+ | dependencies
+ )
for (plugin, dependencies) in plugins.items()
},
"all": list(
diff --git a/metadata-ingestion/src/datahub/api/entities/datacontract/assertion.py b/metadata-ingestion/src/datahub/api/entities/datacontract/assertion.py
index c45d4ddc924580..89ac528efe81a1 100644
--- a/metadata-ingestion/src/datahub/api/entities/datacontract/assertion.py
+++ b/metadata-ingestion/src/datahub/api/entities/datacontract/assertion.py
@@ -1,7 +1,7 @@
from typing import Optional
-from datahub.configuration import ConfigModel
+from datahub.configuration.pydantic_migration_helpers import v1_ConfigModel
-class BaseAssertion(ConfigModel):
+class BaseAssertion(v1_ConfigModel):
description: Optional[str] = None
diff --git a/metadata-ingestion/src/datahub/api/entities/datacontract/assertion_operator.py b/metadata-ingestion/src/datahub/api/entities/datacontract/assertion_operator.py
index a41b0f7aafd9f2..dc0c97d1c74e56 100644
--- a/metadata-ingestion/src/datahub/api/entities/datacontract/assertion_operator.py
+++ b/metadata-ingestion/src/datahub/api/entities/datacontract/assertion_operator.py
@@ -2,7 +2,7 @@
from typing_extensions import Literal, Protocol
-from datahub.configuration import ConfigModel
+from datahub.configuration.pydantic_migration_helpers import v1_ConfigModel
from datahub.metadata.schema_classes import (
AssertionStdOperatorClass,
AssertionStdParameterClass,
@@ -58,7 +58,7 @@ def _generate_assertion_std_parameters(
)
-class EqualToOperator(ConfigModel):
+class EqualToOperator(v1_ConfigModel):
type: Literal["equal_to"]
value: Union[str, int, float]
@@ -71,7 +71,7 @@ def generate_parameters(self) -> AssertionStdParametersClass:
return _generate_assertion_std_parameters(value=self.value)
-class BetweenOperator(ConfigModel):
+class BetweenOperator(v1_ConfigModel):
type: Literal["between"]
min: Union[int, float]
max: Union[int, float]
@@ -87,7 +87,7 @@ def generate_parameters(self) -> AssertionStdParametersClass:
)
-class LessThanOperator(ConfigModel):
+class LessThanOperator(v1_ConfigModel):
type: Literal["less_than"]
value: Union[int, float]
@@ -100,7 +100,7 @@ def generate_parameters(self) -> AssertionStdParametersClass:
return _generate_assertion_std_parameters(value=self.value)
-class GreaterThanOperator(ConfigModel):
+class GreaterThanOperator(v1_ConfigModel):
type: Literal["greater_than"]
value: Union[int, float]
@@ -113,7 +113,7 @@ def generate_parameters(self) -> AssertionStdParametersClass:
return _generate_assertion_std_parameters(value=self.value)
-class LessThanOrEqualToOperator(ConfigModel):
+class LessThanOrEqualToOperator(v1_ConfigModel):
type: Literal["less_than_or_equal_to"]
value: Union[int, float]
@@ -126,7 +126,7 @@ def generate_parameters(self) -> AssertionStdParametersClass:
return _generate_assertion_std_parameters(value=self.value)
-class GreaterThanOrEqualToOperator(ConfigModel):
+class GreaterThanOrEqualToOperator(v1_ConfigModel):
type: Literal["greater_than_or_equal_to"]
value: Union[int, float]
@@ -139,7 +139,7 @@ def generate_parameters(self) -> AssertionStdParametersClass:
return _generate_assertion_std_parameters(value=self.value)
-class NotNullOperator(ConfigModel):
+class NotNullOperator(v1_ConfigModel):
type: Literal["not_null"]
operator: str = AssertionStdOperatorClass.NOT_NULL
diff --git a/metadata-ingestion/src/datahub/api/entities/datacontract/data_quality_assertion.py b/metadata-ingestion/src/datahub/api/entities/datacontract/data_quality_assertion.py
index 6a3944ba36baf0..975aa359bd2031 100644
--- a/metadata-ingestion/src/datahub/api/entities/datacontract/data_quality_assertion.py
+++ b/metadata-ingestion/src/datahub/api/entities/datacontract/data_quality_assertion.py
@@ -1,12 +1,11 @@
from typing import List, Optional, Union
-import pydantic
from typing_extensions import Literal
import datahub.emitter.mce_builder as builder
from datahub.api.entities.datacontract.assertion import BaseAssertion
from datahub.api.entities.datacontract.assertion_operator import Operators
-from datahub.configuration.common import ConfigModel
+from datahub.configuration.pydantic_migration_helpers import v1_ConfigModel, v1_Field
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.metadata.schema_classes import (
AssertionInfoClass,
@@ -25,7 +24,7 @@
class IdConfigMixin(BaseAssertion):
- id_raw: Optional[str] = pydantic.Field(
+ id_raw: Optional[str] = v1_Field(
default=None,
alias="id",
description="The id of the assertion. If not provided, one will be generated using the type.",
@@ -38,7 +37,7 @@ def generate_default_id(self) -> str:
class CustomSQLAssertion(IdConfigMixin, BaseAssertion):
type: Literal["custom_sql"]
sql: str
- operator: Operators = pydantic.Field(discriminator="type")
+ operator: Operators = v1_Field(discriminator="type")
def generate_default_id(self) -> str:
return f"{self.type}-{self.sql}-{self.operator.id()}"
@@ -89,11 +88,11 @@ def generate_assertion_info(self, entity_urn: str) -> AssertionInfoClass:
)
-class DataQualityAssertion(ConfigModel):
+class DataQualityAssertion(v1_ConfigModel):
__root__: Union[
CustomSQLAssertion,
ColumnUniqueAssertion,
- ] = pydantic.Field(discriminator="type")
+ ] = v1_Field(discriminator="type")
@property
def id(self) -> str:
diff --git a/metadata-ingestion/src/datahub/api/entities/datacontract/datacontract.py b/metadata-ingestion/src/datahub/api/entities/datacontract/datacontract.py
index f3c6be55e5fea9..e0ef85d5fd66c0 100644
--- a/metadata-ingestion/src/datahub/api/entities/datacontract/datacontract.py
+++ b/metadata-ingestion/src/datahub/api/entities/datacontract/datacontract.py
@@ -1,7 +1,6 @@
import collections
from typing import Iterable, List, Optional, Tuple
-import pydantic
from ruamel.yaml import YAML
from typing_extensions import Literal
@@ -11,7 +10,11 @@
)
from datahub.api.entities.datacontract.freshness_assertion import FreshnessAssertion
from datahub.api.entities.datacontract.schema_assertion import SchemaAssertion
-from datahub.configuration.common import ConfigModel
+from datahub.configuration.pydantic_migration_helpers import (
+ v1_ConfigModel,
+ v1_Field,
+ v1_validator,
+)
from datahub.emitter.mce_builder import datahub_guid, make_assertion_urn
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.metadata.schema_classes import (
@@ -26,7 +29,7 @@
from datahub.utilities.urns.urn import guess_entity_type
-class DataContract(ConfigModel):
+class DataContract(v1_ConfigModel):
"""A yml representation of a Data Contract.
This model is used as a simpler, Python-native representation of a DataHub data contract.
@@ -36,29 +39,27 @@ class DataContract(ConfigModel):
version: Literal[1]
- id: Optional[str] = pydantic.Field(
+ id: Optional[str] = v1_Field(
default=None,
alias="urn",
description="The data contract urn. If not provided, one will be generated.",
)
- entity: str = pydantic.Field(
+ entity: str = v1_Field(
description="The entity urn that the Data Contract is associated with"
)
# TODO: add support for properties
# properties: Optional[Dict[str, str]] = None
- schema_field: Optional[SchemaAssertion] = pydantic.Field(
- default=None, alias="schema"
- )
+ schema_field: Optional[SchemaAssertion] = v1_Field(default=None, alias="schema")
- freshness: Optional[FreshnessAssertion] = pydantic.Field(default=None)
+ freshness: Optional[FreshnessAssertion] = v1_Field(default=None)
# TODO: Add a validator to ensure that ids are unique
- data_quality: Optional[List[DataQualityAssertion]] = pydantic.Field(default=None)
+ data_quality: Optional[List[DataQualityAssertion]] = v1_Field(default=None)
_original_yaml_dict: Optional[dict] = None
- @pydantic.validator("data_quality")
+ @v1_validator("data_quality") # type: ignore
def validate_data_quality(
cls, data_quality: Optional[List[DataQualityAssertion]]
) -> Optional[List[DataQualityAssertion]]:
diff --git a/metadata-ingestion/src/datahub/api/entities/datacontract/freshness_assertion.py b/metadata-ingestion/src/datahub/api/entities/datacontract/freshness_assertion.py
index 71741d76b22fc4..86942766889676 100644
--- a/metadata-ingestion/src/datahub/api/entities/datacontract/freshness_assertion.py
+++ b/metadata-ingestion/src/datahub/api/entities/datacontract/freshness_assertion.py
@@ -3,11 +3,10 @@
from datetime import timedelta
from typing import List, Union
-import pydantic
from typing_extensions import Literal
from datahub.api.entities.datacontract.assertion import BaseAssertion
-from datahub.configuration.common import ConfigModel
+from datahub.configuration.pydantic_migration_helpers import v1_ConfigModel, v1_Field
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.metadata.schema_classes import (
AssertionInfoClass,
@@ -25,10 +24,10 @@
class CronFreshnessAssertion(BaseAssertion):
type: Literal["cron"]
- cron: str = pydantic.Field(
+ cron: str = v1_Field(
description="The cron expression to use. See https://crontab.guru/ for help."
)
- timezone: str = pydantic.Field(
+ timezone: str = v1_Field(
"UTC",
description="The timezone to use for the cron schedule. Defaults to UTC.",
)
@@ -58,10 +57,10 @@ def generate_freshness_assertion_schedule(self) -> FreshnessAssertionScheduleCla
)
-class FreshnessAssertion(ConfigModel):
- __root__: Union[
- CronFreshnessAssertion, FixedIntervalFreshnessAssertion
- ] = pydantic.Field(discriminator="type")
+class FreshnessAssertion(v1_ConfigModel):
+ __root__: Union[CronFreshnessAssertion, FixedIntervalFreshnessAssertion] = v1_Field(
+ discriminator="type"
+ )
@property
def id(self):
diff --git a/metadata-ingestion/src/datahub/api/entities/datacontract/schema_assertion.py b/metadata-ingestion/src/datahub/api/entities/datacontract/schema_assertion.py
index b62f94e0592fce..39297d1a98d026 100644
--- a/metadata-ingestion/src/datahub/api/entities/datacontract/schema_assertion.py
+++ b/metadata-ingestion/src/datahub/api/entities/datacontract/schema_assertion.py
@@ -3,11 +3,10 @@
import json
from typing import List, Union
-import pydantic
from typing_extensions import Literal
from datahub.api.entities.datacontract.assertion import BaseAssertion
-from datahub.configuration.common import ConfigModel
+from datahub.configuration.pydantic_migration_helpers import v1_ConfigModel, v1_Field
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.ingestion.extractor.json_schema_util import get_schema_metadata
from datahub.metadata.schema_classes import (
@@ -23,7 +22,7 @@
class JsonSchemaContract(BaseAssertion):
type: Literal["json-schema"]
- json_schema: dict = pydantic.Field(alias="json-schema")
+ json_schema: dict = v1_Field(alias="json-schema")
_schema_metadata: SchemaMetadataClass
@@ -37,7 +36,10 @@ def _init_private_attributes(self) -> None:
)
-class FieldListSchemaContract(BaseAssertion, arbitrary_types_allowed=True):
+class FieldListSchemaContract(BaseAssertion):
+ class Config:
+ arbitrary_types_allowed = True
+
type: Literal["field-list"]
fields: List[SchemaFieldClass]
@@ -56,8 +58,8 @@ def _init_private_attributes(self) -> None:
)
-class SchemaAssertion(ConfigModel):
- __root__: Union[JsonSchemaContract, FieldListSchemaContract] = pydantic.Field(
+class SchemaAssertion(v1_ConfigModel):
+ __root__: Union[JsonSchemaContract, FieldListSchemaContract] = v1_Field(
discriminator="type"
)
diff --git a/metadata-ingestion/src/datahub/cli/check_cli.py b/metadata-ingestion/src/datahub/cli/check_cli.py
index f7996900f7a7ad..2732a72aea5399 100644
--- a/metadata-ingestion/src/datahub/cli/check_cli.py
+++ b/metadata-ingestion/src/datahub/cli/check_cli.py
@@ -126,10 +126,21 @@ def metadata_diff(
default=False,
help="Include extra information for each plugin.",
)
+@click.option(
+ "--source",
+ type=str,
+ default=None,
+)
@telemetry.with_telemetry()
-def plugins(verbose: bool) -> None:
+def plugins(source: Optional[str], verbose: bool) -> None:
"""List the enabled ingestion plugins."""
+ if source:
+ # Quick helper for one-off checks with full stack traces.
+ source_registry.get(source)
+ click.echo(f"Source {source} is enabled.")
+ return
+
click.secho("Sources:", bold=True)
click.echo(source_registry.summary(verbose=verbose, col_width=25))
click.echo()
diff --git a/metadata-ingestion/src/datahub/configuration/common.py b/metadata-ingestion/src/datahub/configuration/common.py
index f225856ca43ce4..0030332bcfd541 100644
--- a/metadata-ingestion/src/datahub/configuration/common.py
+++ b/metadata-ingestion/src/datahub/configuration/common.py
@@ -99,8 +99,20 @@ def _schema_extra(schema: Dict[str, Any], model: Type["ConfigModel"]) -> None:
@classmethod
def parse_obj_allow_extras(cls: Type[_ConfigSelf], obj: Any) -> _ConfigSelf:
- with unittest.mock.patch.object(cls.Config, "extra", pydantic.Extra.allow):
- return cls.parse_obj(obj)
+ if PYDANTIC_VERSION_2:
+ try:
+ with unittest.mock.patch.dict(
+ cls.model_config, # type: ignore
+ {"extra": "allow"},
+ clear=False,
+ ):
+ cls.model_rebuild(force=True) # type: ignore
+ return cls.parse_obj(obj)
+ finally:
+ cls.model_rebuild(force=True) # type: ignore
+ else:
+ with unittest.mock.patch.object(cls.Config, "extra", pydantic.Extra.allow):
+ return cls.parse_obj(obj)
class PermissiveConfigModel(ConfigModel):
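With the change above, `parse_obj_allow_extras` patches `model_config` (pydantic v2) or `Config.extra` (pydantic v1) so that unknown keys are tolerated for a single parse. A usage sketch, assuming a DataHub dev environment with the `datahub` package installed; the `MyConfig` model is hypothetical:

```python
from datahub.configuration.common import ConfigModel

class MyConfig(ConfigModel):
    name: str

# ConfigModel forbids unknown keys by default, so plain parse_obj rejects this...
try:
    MyConfig.parse_obj({"name": "x", "unexpected": 1})
except Exception as e:
    print("rejected:", type(e).__name__)

# ...while parse_obj_allow_extras temporarily flips `extra` to allow and succeeds.
cfg = MyConfig.parse_obj_allow_extras({"name": "x", "unexpected": 1})
print(cfg.name)
```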
diff --git a/metadata-ingestion/src/datahub/configuration/datetimes.py b/metadata-ingestion/src/datahub/configuration/datetimes.py
index 41af7565593d9b..1520462fa9bf8c 100644
--- a/metadata-ingestion/src/datahub/configuration/datetimes.py
+++ b/metadata-ingestion/src/datahub/configuration/datetimes.py
@@ -65,6 +65,8 @@ def parse_absolute_time(input: str) -> datetime:
def parse_relative_timespan(input: str) -> timedelta:
+ raw_input = input
+
neg = False
input = input.strip()
@@ -79,7 +81,7 @@ def parse_relative_timespan(input: str) -> timedelta:
if neg:
delta = -delta
- logger.debug(f'Parsed "{input}" as {delta}.')
+ logger.debug(f'Parsed "{raw_input}" as {delta}.')
return delta
diff --git a/metadata-ingestion/src/datahub/configuration/pydantic_migration_helpers.py b/metadata-ingestion/src/datahub/configuration/pydantic_migration_helpers.py
index f1876b500598ba..bd931abe2e84d1 100644
--- a/metadata-ingestion/src/datahub/configuration/pydantic_migration_helpers.py
+++ b/metadata-ingestion/src/datahub/configuration/pydantic_migration_helpers.py
@@ -19,12 +19,41 @@ class PydanticDeprecatedSince20(Warning): # type: ignore
if PYDANTIC_VERSION_2:
from pydantic import BaseModel as GenericModel
+ from pydantic.v1 import ( # type: ignore
+ BaseModel as v1_BaseModel,
+ Extra as v1_Extra,
+ Field as v1_Field,
+ root_validator as v1_root_validator,
+ validator as v1_validator,
+ )
else:
+ from pydantic import ( # type: ignore
+ BaseModel as v1_BaseModel,
+ Extra as v1_Extra,
+ Field as v1_Field,
+ root_validator as v1_root_validator,
+ validator as v1_validator,
+ )
from pydantic.generics import GenericModel # type: ignore
+class v1_ConfigModel(v1_BaseModel):
+ """A simplified variant of our main ConfigModel class.
+
+ This one only uses pydantic v1 features.
+ """
+
+ class Config:
+ extra = v1_Extra.forbid
+ underscore_attrs_are_private = True
+
+
__all__ = [
"PYDANTIC_VERSION_2",
"PydanticDeprecatedSince20",
"GenericModel",
+ "v1_ConfigModel",
+ "v1_Field",
+ "v1_root_validator",
+ "v1_validator",
]
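Downstream models (the data-contract classes elsewhere in this patch) import these `v1_*` aliases so they keep running on the `pydantic.v1` compatibility layer when pydantic 2 is installed. A minimal sketch of declaring a model against the shim; the freshness model below is hypothetical, but the imports mirror the data-contract changes in this patch:

```python
from datahub.configuration.pydantic_migration_helpers import (
    v1_ConfigModel,
    v1_Field,
    v1_validator,
)

class FreshnessSketch(v1_ConfigModel):
    cron: str = v1_Field(description="Cron expression for the freshness check.")
    timezone: str = v1_Field("UTC", description="Timezone for the schedule.")

    @v1_validator("cron")
    def cron_must_have_five_fields(cls, v: str) -> str:
        # Toy check; real cron validation is more involved.
        if len(v.split()) != 5:
            raise ValueError("expected a 5-field cron expression")
        return v

print(FreshnessSketch(cron="0 * * * *"))
```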
diff --git a/metadata-ingestion/src/datahub/configuration/time_window_config.py b/metadata-ingestion/src/datahub/configuration/time_window_config.py
index 15de7470e4d823..f20ab85be05855 100644
--- a/metadata-ingestion/src/datahub/configuration/time_window_config.py
+++ b/metadata-ingestion/src/datahub/configuration/time_window_config.py
@@ -68,6 +68,12 @@ def default_start_time(
assert abs(delta) >= get_bucket_duration_delta(
values["bucket_duration"]
), "Relative start time should be in terms of configured bucket duration. e.g '-2 days' or '-2 hours'."
+
+ # The end_time's default value is not yet populated, in which case
+ # we can just manually generate it here.
+ if "end_time" not in values:
+ values["end_time"] = datetime.now(tz=timezone.utc)
+
return get_time_bucket(
values["end_time"] + delta, values["bucket_duration"]
)
@@ -80,9 +86,13 @@ def default_start_time(
@pydantic.validator("start_time", "end_time")
def ensure_timestamps_in_utc(cls, v: datetime) -> datetime:
- assert (
- v.tzinfo == timezone.utc
- ), 'timezone is not UTC; try adding a "Z" to the value e.g. "2021-07-20T00:00:00Z"'
+ if v.tzinfo is None:
+ raise ValueError(
+ "Timestamps must be in UTC. Try adding a 'Z' to the value e.g. '2021-07-20T00:00:00Z'"
+ )
+
+ # If the timestamp is timezone-aware but not in UTC, convert it to UTC.
+ v = v.astimezone(timezone.utc)
return v
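The validator above now accepts any timezone-aware timestamp and normalizes it to UTC, instead of asserting that the timezone is already UTC; naive timestamps raise a `ValueError` rather than tripping an assertion. A standalone sketch of the same normalization:

```python
# Standalone sketch of the normalization the validator above now performs.
from datetime import datetime, timezone, timedelta

def ensure_utc(v: datetime) -> datetime:
    if v.tzinfo is None:
        raise ValueError(
            "Timestamps must be timezone-aware, e.g. '2021-07-20T00:00:00Z'"
        )
    # Timezone-aware but non-UTC values are converted rather than rejected.
    return v.astimezone(timezone.utc)

ist = timezone(timedelta(hours=5, minutes=30))
print(ensure_utc(datetime(2021, 7, 20, 5, 30, tzinfo=ist)))  # 2021-07-20 00:00:00+00:00
```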
diff --git a/metadata-ingestion/src/datahub/configuration/validate_field_rename.py b/metadata-ingestion/src/datahub/configuration/validate_field_rename.py
index bb01f2b787123a..de2a16e9bf247d 100644
--- a/metadata-ingestion/src/datahub/configuration/validate_field_rename.py
+++ b/metadata-ingestion/src/datahub/configuration/validate_field_rename.py
@@ -49,4 +49,6 @@ def _validate_field_rename(cls: Type, values: dict) -> dict:
# validator with pre=True gets all the values that were passed in.
# Given that a renamed field doesn't show up in the fields list, we can't use
# the field-level validator, even with a different field name.
- return pydantic.root_validator(pre=True, allow_reuse=True)(_validate_field_rename)
+ return pydantic.root_validator(pre=True, skip_on_failure=True, allow_reuse=True)(
+ _validate_field_rename
+ )
diff --git a/metadata-ingestion/src/datahub/ingestion/glossary/datahub_classifier.py b/metadata-ingestion/src/datahub/ingestion/glossary/datahub_classifier.py
index 1f2b7f5689ea3c..42eb930c80f9d4 100644
--- a/metadata-ingestion/src/datahub/ingestion/glossary/datahub_classifier.py
+++ b/metadata-ingestion/src/datahub/ingestion/glossary/datahub_classifier.py
@@ -8,6 +8,7 @@
from pydantic.fields import Field
from datahub.configuration.common import ConfigModel
+from datahub.configuration.pydantic_migration_helpers import PYDANTIC_VERSION_2
from datahub.ingestion.glossary.classifier import Classifier
@@ -50,7 +51,10 @@ class ValuesFactorConfig(ConfigModel):
class PredictionFactorsAndWeights(ConfigModel):
class Config:
- allow_population_by_field_name = True
+ if PYDANTIC_VERSION_2:
+ populate_by_name = True
+ else:
+ allow_population_by_field_name = True
Name: float = Field(alias="name")
Description: float = Field(alias="description")
@@ -60,7 +64,10 @@ class Config:
class InfoTypeConfig(ConfigModel):
class Config:
- allow_population_by_field_name = True
+ if PYDANTIC_VERSION_2:
+ populate_by_name = True
+ else:
+ allow_population_by_field_name = True
Prediction_Factors_and_Weights: PredictionFactorsAndWeights = Field(
description="Factors and their weights to consider when predicting info types",
diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py
index cbe68a454ea436..c13b08a6d9656b 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py
@@ -284,7 +284,7 @@ def validate_bigquery_audit_metadata_datasets(
return v
- @root_validator(pre=False)
+ @root_validator(pre=False, skip_on_failure=True)
def backward_compatibility_configs_set(cls, values: Dict) -> Dict:
project_id = values.get("project_id")
project_id_pattern = values.get("project_id_pattern")
diff --git a/metadata-ingestion/src/datahub/ingestion/source/delta_lake/config.py b/metadata-ingestion/src/datahub/ingestion/source/delta_lake/config.py
index f3616ca648a3e6..81a54d1327d05a 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/delta_lake/config.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/delta_lake/config.py
@@ -4,6 +4,7 @@
import pydantic
from cached_property import cached_property
from pydantic import Field
+from typing_extensions import Literal
from datahub.configuration.common import AllowDenyPattern
from datahub.configuration.source_common import (
@@ -46,10 +47,9 @@ class DeltaLakeSourceConfig(PlatformInstanceConfigMixin, EnvConfigMixin):
"'/' and URNs will be created using "
"relative_path only.",
)
- platform: str = Field(
+ platform: Literal["delta-lake"] = Field(
default="delta-lake",
description="The platform that this source connects to",
- const=True,
)
platform_instance: Optional[str] = Field(
default=None,
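`Field(const=True)` does not exist in pydantic v2, so the change above types the field as a single-value `Literal`, which behaves as a constant-with-a-default under both major versions. A tiny sketch with a hypothetical model:

```python
import pydantic
from typing_extensions import Literal

class Cfg(pydantic.BaseModel):
    platform: Literal["delta-lake"] = pydantic.Field(
        default="delta-lake",
        description="The platform that this source connects to",
    )

print(Cfg())  # platform='delta-lake'
try:
    Cfg(platform="something-else")  # rejected under both pydantic v1 and v2
except pydantic.ValidationError as e:
    print("rejected:", e.errors()[0]["loc"])
```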
diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py
index 032bdef178fdf6..b896df1fa340e3 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py
@@ -176,7 +176,7 @@ def validate_include_column_lineage(cls, v, values):
)
return v
- @root_validator(pre=False)
+ @root_validator(pre=False, skip_on_failure=True)
def validate_unsupported_configs(cls, values: Dict) -> Dict:
value = values.get("include_read_operational_stats")
if value is not None and value:
diff --git a/metadata-ingestion/src/datahub/ingestion/source_config/sql/snowflake.py b/metadata-ingestion/src/datahub/ingestion/source_config/sql/snowflake.py
index 46bd24c7e1f4c3..e9db82ce75cd99 100644
--- a/metadata-ingestion/src/datahub/ingestion/source_config/sql/snowflake.py
+++ b/metadata-ingestion/src/datahub/ingestion/source_config/sql/snowflake.py
@@ -107,7 +107,7 @@ def validate_account_id(cls, account_id: str) -> str:
return account_id
@pydantic.validator("authentication_type", always=True)
- def authenticator_type_is_valid(cls, v, values, field):
+ def authenticator_type_is_valid(cls, v, values):
if v not in VALID_AUTH_TYPES.keys():
raise ValueError(
f"unsupported authenticator type '{v}' was provided,"
diff --git a/metadata-ingestion/src/datahub/utilities/urns/urn_iter.py b/metadata-ingestion/src/datahub/utilities/urns/urn_iter.py
index 4f228494f416b8..3389a6fb05ee89 100644
--- a/metadata-ingestion/src/datahub/utilities/urns/urn_iter.py
+++ b/metadata-ingestion/src/datahub/utilities/urns/urn_iter.py
@@ -150,7 +150,7 @@ def modify_urn(urn: str) -> str:
if guess_entity_type(urn) == "dataset":
return _lowercase_dataset_urn(urn)
elif guess_entity_type(urn) == "schemaField":
- cur_urn = Urn.create_from_string(urn)
+ cur_urn = Urn.from_string(urn)
cur_urn._entity_ids[0] = _lowercase_dataset_urn(cur_urn._entity_ids[0])
return str(cur_urn)
return urn
diff --git a/metadata-ingestion/tests/integration/snowflake/test_snowflake.py b/metadata-ingestion/tests/integration/snowflake/test_snowflake.py
index 1b58696e4014c9..39a62056a7e4ad 100644
--- a/metadata-ingestion/tests/integration/snowflake/test_snowflake.py
+++ b/metadata-ingestion/tests/integration/snowflake/test_snowflake.py
@@ -87,18 +87,18 @@ def test_snowflake_basic(pytestconfig, tmp_path, mock_time, mock_datahub_graph):
confidence_level_threshold=0.58,
info_types_config={
"Age": InfoTypeConfig(
- Prediction_Factors_and_Weights=PredictionFactorsAndWeights(
- Name=0, Values=1, Description=0, Datatype=0
+ prediction_factors_and_weights=PredictionFactorsAndWeights(
+ name=0, values=1, description=0, datatype=0
)
),
"CloudRegion": InfoTypeConfig(
- Prediction_Factors_and_Weights=PredictionFactorsAndWeights(
- Name=0,
- Description=0,
- Datatype=0,
- Values=1,
+ prediction_factors_and_weights=PredictionFactorsAndWeights(
+ name=0,
+ description=0,
+ datatype=0,
+ values=1,
),
- Values=ValuesFactorConfig(
+ values=ValuesFactorConfig(
prediction_type="regex",
regex=[
r"(af|ap|ca|eu|me|sa|us)-(central|north|(north(?:east|west))|south|south(?:east|west)|east|west)-\d+"
diff --git a/metadata-ingestion/tests/unit/test_allow_deny.py b/metadata-ingestion/tests/unit/config/test_allow_deny.py
similarity index 100%
rename from metadata-ingestion/tests/unit/test_allow_deny.py
rename to metadata-ingestion/tests/unit/config/test_allow_deny.py
diff --git a/metadata-ingestion/tests/unit/test_config_clean.py b/metadata-ingestion/tests/unit/config/test_config_clean.py
similarity index 100%
rename from metadata-ingestion/tests/unit/test_config_clean.py
rename to metadata-ingestion/tests/unit/config/test_config_clean.py
diff --git a/metadata-ingestion/tests/unit/config/test_config_model.py b/metadata-ingestion/tests/unit/config/test_config_model.py
index ffac5c465f5541..f53390a3deb18c 100644
--- a/metadata-ingestion/tests/unit/config/test_config_model.py
+++ b/metadata-ingestion/tests/unit/config/test_config_model.py
@@ -3,8 +3,11 @@
import pydantic
import pytest
-from datahub.configuration.common import ConfigModel, redact_raw_config
-from datahub.ingestion.source.unity.config import UnityCatalogSourceConfig
+from datahub.configuration.common import (
+ AllowDenyPattern,
+ ConfigModel,
+ redact_raw_config,
+)
def test_extras_not_allowed():
@@ -76,8 +79,15 @@ def test_config_redaction():
def test_shared_defaults():
- c1 = UnityCatalogSourceConfig(token="s", workspace_url="https://workspace_url")
- c2 = UnityCatalogSourceConfig(token="s", workspace_url="https://workspace_url")
+ class SourceConfig(ConfigModel):
+ token: str
+ workspace_url: str
+ catalog_pattern: AllowDenyPattern = pydantic.Field(
+ default=AllowDenyPattern.allow_all(),
+ )
+
+ c1 = SourceConfig(token="s", workspace_url="https://workspace_url")
+ c2 = SourceConfig(token="s", workspace_url="https://workspace_url")
assert c2.catalog_pattern.allow == [".*"]
c1.catalog_pattern.allow += ["foo"]
diff --git a/metadata-ingestion/tests/unit/test_pydantic_validators.py b/metadata-ingestion/tests/unit/config/test_pydantic_validators.py
similarity index 92%
rename from metadata-ingestion/tests/unit/test_pydantic_validators.py
rename to metadata-ingestion/tests/unit/config/test_pydantic_validators.py
index 3e9ec6cbaf3579..399245736805cc 100644
--- a/metadata-ingestion/tests/unit/test_pydantic_validators.py
+++ b/metadata-ingestion/tests/unit/config/test_pydantic_validators.py
@@ -7,7 +7,10 @@
from datahub.configuration.validate_field_deprecation import pydantic_field_deprecated
from datahub.configuration.validate_field_removal import pydantic_removed_field
from datahub.configuration.validate_field_rename import pydantic_renamed_field
-from datahub.utilities.global_warning_util import get_global_warnings
+from datahub.utilities.global_warning_util import (
+ clear_global_warnings,
+ get_global_warnings,
+)
def test_field_rename():
@@ -76,9 +79,11 @@ class TestModel(ConfigModel):
def test_field_deprecated():
+ clear_global_warnings()
+
class TestModel(ConfigModel):
- d1: Optional[str]
- d2: Optional[str]
+ d1: Optional[str] = None
+ d2: Optional[str] = None
b: str
_validate_deprecated_d1 = pydantic_field_deprecated("d1")
@@ -93,3 +98,5 @@ class TestModel(ConfigModel):
assert v.d2 == "deprecated"
assert any(["d1 is deprecated" in warning for warning in get_global_warnings()])
assert any(["d2 is deprecated" in warning for warning in get_global_warnings()])
+
+ clear_global_warnings()
diff --git a/metadata-ingestion/tests/unit/test_time_window_config.py b/metadata-ingestion/tests/unit/config/test_time_window_config.py
similarity index 100%
rename from metadata-ingestion/tests/unit/test_time_window_config.py
rename to metadata-ingestion/tests/unit/config/test_time_window_config.py
From 7b067822bd8602c00fe5a0efdd15a6bb7a33bad6 Mon Sep 17 00:00:00 2001
From: John Joyce
Date: Mon, 18 Dec 2023 18:35:02 -0800
Subject: [PATCH 040/540] feat(gms): Add support for platform-based browse
(#9376)
Co-authored-by: John Joyce
---
.../graphql/featureflags/FeatureFlags.java | 1 +
.../resolvers/chart/BrowseV2Resolver.java | 20 +++-
.../resolvers/config/AppConfigResolver.java | 1 +
.../graphql/resolvers/search/SearchUtils.java | 14 +++
.../src/main/resources/app.graphql | 5 +
.../src/main/resources/search.graphql | 9 +-
.../browse/BrowseV2ResolverTest.java | 2 +-
datahub-web-react/src/appConfigContext.tsx | 1 +
datahub-web-react/src/graphql/app.graphql | 1 +
.../metadata/client/JavaEntityClient.java | 24 +++++
.../elasticsearch/ElasticSearchService.java | 12 +++
.../elasticsearch/query/ESBrowseDAO.java | 91 +++++++++++++++++++
.../src/main/resources/application.yml | 1 +
.../linkedin/entity/client/EntityClient.java | 22 +++++
.../entity/client/RestliEntityClient.java | 14 +++
.../metadata/search/EntitySearchService.java | 19 ++++
16 files changed, 231 insertions(+), 6 deletions(-)
diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/featureflags/FeatureFlags.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/featureflags/FeatureFlags.java
index 07bd1fba5d8a86..e74ed09849763c 100644
--- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/featureflags/FeatureFlags.java
+++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/featureflags/FeatureFlags.java
@@ -12,6 +12,7 @@ public class FeatureFlags {
private boolean readOnlyModeEnabled = false;
private boolean showSearchFiltersV2 = false;
private boolean showBrowseV2 = false;
+ private boolean platformBrowseV2 = false;
private PreProcessHooks preProcessHooks;
private boolean showAcrylInfo = false;
private boolean showAccessManagement = false;
diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/chart/BrowseV2Resolver.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/chart/BrowseV2Resolver.java
index 292d6108b7a044..da4a3a76dd7e0e 100644
--- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/chart/BrowseV2Resolver.java
+++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/chart/BrowseV2Resolver.java
@@ -2,14 +2,16 @@
import static com.linkedin.datahub.graphql.Constants.BROWSE_PATH_V2_DELIMITER;
import static com.linkedin.datahub.graphql.resolvers.ResolverUtils.bindArgument;
-import static com.linkedin.datahub.graphql.resolvers.search.SearchUtils.resolveView;
+import static com.linkedin.datahub.graphql.resolvers.search.SearchUtils.*;
+import com.google.common.collect.ImmutableList;
import com.linkedin.common.urn.UrnUtils;
import com.linkedin.datahub.graphql.QueryContext;
import com.linkedin.datahub.graphql.generated.BrowseResultGroupV2;
import com.linkedin.datahub.graphql.generated.BrowseResultMetadata;
import com.linkedin.datahub.graphql.generated.BrowseResultsV2;
import com.linkedin.datahub.graphql.generated.BrowseV2Input;
+import com.linkedin.datahub.graphql.generated.EntityType;
import com.linkedin.datahub.graphql.resolvers.EntityTypeMapper;
import com.linkedin.datahub.graphql.resolvers.ResolverUtils;
import com.linkedin.datahub.graphql.resolvers.search.SearchUtils;
@@ -43,8 +45,8 @@ public class BrowseV2Resolver implements DataFetcher get(DataFetchingEnvironment environment) {
final QueryContext context = environment.getContext();
final BrowseV2Input input = bindArgument(environment.getArgument("input"), BrowseV2Input.class);
- final String entityName = EntityTypeMapper.getName(input.getType());
+    final List<String> entityNames = getEntityNames(input);
final int start = input.getStart() != null ? input.getStart() : DEFAULT_START;
final int count = input.getCount() != null ? input.getCount() : DEFAULT_COUNT;
final String query = input.getQuery() != null ? input.getQuery() : "*";
@@ -70,7 +72,7 @@ public CompletableFuture get(DataFetchingEnvironment environmen
BrowseResultV2 browseResults =
_entityClient.browseV2(
- entityName,
+ entityNames,
pathStr,
maybeResolvedView != null
? SearchUtils.combineFilters(
@@ -87,6 +89,18 @@ public CompletableFuture get(DataFetchingEnvironment environmen
});
}
+  public static List<String> getEntityNames(BrowseV2Input input) {
+    List<EntityType> entityTypes;
+ if (input.getTypes() != null && input.getTypes().size() > 0) {
+ entityTypes = input.getTypes();
+ } else if (input.getType() != null) {
+ entityTypes = ImmutableList.of(input.getType());
+ } else {
+ entityTypes = BROWSE_ENTITY_TYPES;
+ }
+ return entityTypes.stream().map(EntityTypeMapper::getName).collect(Collectors.toList());
+ }
+
private BrowseResultsV2 mapBrowseResults(BrowseResultV2 browseResults) {
BrowseResultsV2 results = new BrowseResultsV2();
results.setTotal(browseResults.getNumGroups());
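`getEntityNames` above resolves which entity types a browse request covers: an explicit `types` list wins, then the legacy single `type`, and otherwise the new `BROWSE_ENTITY_TYPES` default. The same precedence, sketched in Python for brevity with stand-in type names:

```python
# Illustration of the precedence implemented by getEntityNames above.
# BROWSE_ENTITY_TYPES here is a stand-in for the Java constant of the same name.
from typing import List, Optional

BROWSE_ENTITY_TYPES = ["DATASET", "DASHBOARD", "CHART", "CONTAINER"]

def get_entity_names(types: Optional[List[str]], type_: Optional[str]) -> List[str]:
    if types:
        return types
    if type_ is not None:
        return [type_]
    return BROWSE_ENTITY_TYPES

assert get_entity_names(["DATASET"], "CHART") == ["DATASET"]
assert get_entity_names(None, "CHART") == ["CHART"]
assert get_entity_names(None, None) == BROWSE_ENTITY_TYPES
```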
diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/config/AppConfigResolver.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/config/AppConfigResolver.java
index 34f7f133f6fb94..81b52991cde90c 100644
--- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/config/AppConfigResolver.java
+++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/config/AppConfigResolver.java
@@ -175,6 +175,7 @@ public CompletableFuture get(final DataFetchingEnvironment environmen
.setShowAcrylInfo(_featureFlags.isShowAcrylInfo())
.setShowAccessManagement(_featureFlags.isShowAccessManagement())
.setNestedDomainsEnabled(_featureFlags.isNestedDomainsEnabled())
+ .setPlatformBrowseV2(_featureFlags.isPlatformBrowseV2())
.build();
appConfig.setFeatureFlags(featureFlagsConfig);
diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/search/SearchUtils.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/search/SearchUtils.java
index d04cb57e1a860e..444ab4bcc3c3c9 100644
--- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/search/SearchUtils.java
+++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/search/SearchUtils.java
@@ -92,6 +92,20 @@ private SearchUtils() {}
EntityType.NOTEBOOK,
EntityType.DATA_PRODUCT);
+ /** Entities that are part of browse by default */
+  public static final List<EntityType> BROWSE_ENTITY_TYPES =
+ ImmutableList.of(
+ EntityType.DATASET,
+ EntityType.DASHBOARD,
+ EntityType.CHART,
+ EntityType.CONTAINER,
+ EntityType.MLMODEL,
+ EntityType.MLMODEL_GROUP,
+ EntityType.MLFEATURE_TABLE,
+ EntityType.DATA_FLOW,
+ EntityType.DATA_JOB,
+ EntityType.NOTEBOOK);
+
/** A prioritized list of source filter types used to generate quick filters */
public static final List PRIORITIZED_SOURCE_ENTITY_TYPES =
Stream.of(
diff --git a/datahub-graphql-core/src/main/resources/app.graphql b/datahub-graphql-core/src/main/resources/app.graphql
index 075a3b0fac43bc..52451e195ee841 100644
--- a/datahub-graphql-core/src/main/resources/app.graphql
+++ b/datahub-graphql-core/src/main/resources/app.graphql
@@ -437,6 +437,11 @@ type FeatureFlagsConfig {
"""
showBrowseV2: Boolean!
+ """
+ Whether browse v2 is platform mode, which means that platforms are displayed instead of entity types at the root.
+ """
+ platformBrowseV2: Boolean!
+
"""
Whether we should show CTAs in the UI related to moving to Managed DataHub by Acryl.
"""
diff --git a/datahub-graphql-core/src/main/resources/search.graphql b/datahub-graphql-core/src/main/resources/search.graphql
index e0cde5a2db9f99..8f2377edb546e0 100644
--- a/datahub-graphql-core/src/main/resources/search.graphql
+++ b/datahub-graphql-core/src/main/resources/search.graphql
@@ -1176,9 +1176,14 @@ Input required for browse queries
"""
input BrowseV2Input {
"""
- The browse entity type
+  The browse entity type - deprecated, use types instead
"""
- type: EntityType!
+ type: EntityType
+
+ """
+  The browse entity types to filter on. If not provided, all browsable entity types will be used.
+ """
+ types: [EntityType!]
"""
The browse path V2 - a list with each entry being part of the browse path V2
diff --git a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/browse/BrowseV2ResolverTest.java b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/browse/BrowseV2ResolverTest.java
index bffc2b31af2b9a..433772d7e2cfe1 100644
--- a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/browse/BrowseV2ResolverTest.java
+++ b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/browse/BrowseV2ResolverTest.java
@@ -249,7 +249,7 @@ private static EntityClient initMockEntityClient(
EntityClient client = Mockito.mock(EntityClient.class);
Mockito.when(
client.browseV2(
- Mockito.eq(entityName),
+ Mockito.eq(ImmutableList.of(entityName)),
Mockito.eq(path),
Mockito.eq(filter),
Mockito.eq(query),
diff --git a/datahub-web-react/src/appConfigContext.tsx b/datahub-web-react/src/appConfigContext.tsx
index 4087ad453687c8..8c1089b868e5ab 100644
--- a/datahub-web-react/src/appConfigContext.tsx
+++ b/datahub-web-react/src/appConfigContext.tsx
@@ -50,6 +50,7 @@ export const DEFAULT_APP_CONFIG = {
showAcrylInfo: false,
showAccessManagement: false,
nestedDomainsEnabled: true,
+ platformBrowseV2: false,
},
};
diff --git a/datahub-web-react/src/graphql/app.graphql b/datahub-web-react/src/graphql/app.graphql
index 4e9bbb11d8c5aa..fe283403491479 100644
--- a/datahub-web-react/src/graphql/app.graphql
+++ b/datahub-web-react/src/graphql/app.graphql
@@ -65,6 +65,7 @@ query appConfig {
showAcrylInfo
showAccessManagement
nestedDomainsEnabled
+ platformBrowseV2
}
}
}
diff --git a/metadata-io/src/main/java/com/linkedin/metadata/client/JavaEntityClient.java b/metadata-io/src/main/java/com/linkedin/metadata/client/JavaEntityClient.java
index 53b974b560e2a6..e7ec4d313b5f58 100644
--- a/metadata-io/src/main/java/com/linkedin/metadata/client/JavaEntityClient.java
+++ b/metadata-io/src/main/java/com/linkedin/metadata/client/JavaEntityClient.java
@@ -235,6 +235,30 @@ public BrowseResultV2 browseV2(
return _entitySearchService.browseV2(entityName, path, filter, input, start, count);
}
+ /**
+ * Gets browse V2 snapshot of a given path
+ *
+ * @param entityNames entities being browsed
+ * @param path path being browsed
+ * @param filter browse filter
+ * @param input search query
+ * @param start start offset of first group
+ * @param count max number of results requested
+ * @throws RemoteInvocationException
+ */
+ @Nonnull
+ public BrowseResultV2 browseV2(
+      @Nonnull List<String> entityNames,
+ @Nonnull String path,
+ @Nullable Filter filter,
+ @Nonnull String input,
+ int start,
+ int count,
+ @Nonnull Authentication authentication) {
+ // TODO: cache browseV2 results
+ return _entitySearchService.browseV2(entityNames, path, filter, input, start, count);
+ }
+
@SneakyThrows
@Deprecated
public void update(@Nonnull final Entity entity, @Nonnull final Authentication authentication)
diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/ElasticSearchService.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/ElasticSearchService.java
index f40da59a149faa..fd7491fe32ea34 100644
--- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/ElasticSearchService.java
+++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/ElasticSearchService.java
@@ -210,6 +210,18 @@ public BrowseResultV2 browseV2(
return esBrowseDAO.browseV2(entityName, path, filter, input, start, count);
}
+ @Nonnull
+ @Override
+ public BrowseResultV2 browseV2(
+      @Nonnull List<String> entityNames,
+ @Nonnull String path,
+ @Nullable Filter filter,
+ @Nonnull String input,
+ int start,
+ int count) {
+ return esBrowseDAO.browseV2(entityNames, path, filter, input, start, count);
+ }
+
@Nonnull
@Override
public List<String> getBrowsePaths(@Nonnull String entityName, @Nonnull Urn urn) {
diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/ESBrowseDAO.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/ESBrowseDAO.java
index 5ea60b24a577a0..3c71a2dfd91809 100644
--- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/ESBrowseDAO.java
+++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/ESBrowseDAO.java
@@ -427,6 +427,44 @@ public BrowseResultV2 browseV2(
}
}
+ public BrowseResultV2 browseV2(
+      @Nonnull List<String> entities,
+ @Nonnull String path,
+ @Nullable Filter filter,
+ @Nonnull String input,
+ int start,
+ int count) {
+ try {
+ final SearchResponse groupsResponse;
+
+ try (Timer.Context ignored = MetricUtils.timer(this.getClass(), "esGroupSearch").time()) {
+ final String finalInput = input.isEmpty() ? "*" : input;
+ groupsResponse =
+ client.search(
+ constructGroupsSearchRequestBrowseAcrossEntities(
+ entities, path, filter, finalInput),
+ RequestOptions.DEFAULT);
+ }
+
+ final BrowseGroupsResultV2 browseGroupsResult =
+ extractGroupsResponseV2(groupsResponse, path, start, count);
+ final int numGroups = browseGroupsResult.getTotalGroups();
+
+ return new BrowseResultV2()
+ .setMetadata(
+ new BrowseResultMetadata()
+ .setTotalNumEntities(browseGroupsResult.getTotalNumEntities())
+ .setPath(path))
+ .setGroups(new BrowseResultGroupV2Array(browseGroupsResult.getGroups()))
+ .setNumGroups(numGroups)
+ .setFrom(start)
+ .setPageSize(count);
+ } catch (Exception e) {
+ log.error("Browse Across Entities query failed: " + e.getMessage());
+ throw new ESQueryException("Browse Across Entities query failed: ", e);
+ }
+ }
+
@Nonnull
private SearchRequest constructGroupsSearchRequestV2(
@Nonnull String entityName,
@@ -448,6 +486,33 @@ private SearchRequest constructGroupsSearchRequestV2(
return searchRequest;
}
+ @Nonnull
+ private SearchRequest constructGroupsSearchRequestBrowseAcrossEntities(
+      @Nonnull List<String> entities,
+ @Nonnull String path,
+ @Nullable Filter filter,
+ @Nonnull String input) {
+
+    List<EntitySpec> entitySpecs =
+ entities.stream().map(entityRegistry::getEntitySpec).collect(Collectors.toList());
+
+ String[] indexArray =
+ entities.stream().map(indexConvention::getEntityIndexName).toArray(String[]::new);
+
+ final SearchRequest searchRequest = new SearchRequest(indexArray);
+ final SearchSourceBuilder searchSourceBuilder = new SearchSourceBuilder();
+ searchSourceBuilder.size(0);
+ searchSourceBuilder.query(
+ buildQueryStringBrowseAcrossEntities(
+ entitySpecs,
+ path,
+ SearchUtil.transformFilterForEntities(filter, indexConvention),
+ input));
+ searchSourceBuilder.aggregation(buildAggregationsV2(path));
+ searchRequest.source(searchSourceBuilder);
+ return searchRequest;
+ }
+
/**
* Extracts the name of group from path.
*
@@ -494,6 +559,32 @@ private QueryBuilder buildQueryStringV2(
return queryBuilder;
}
+ @Nonnull
+ private QueryBuilder buildQueryStringBrowseAcrossEntities(
+ @Nonnull List<EntitySpec> entitySpecs,
+ @Nonnull String path,
+ @Nullable Filter filter,
+ @Nonnull String input) {
+ final int browseDepthVal = getPathDepthV2(path);
+
+ final BoolQueryBuilder queryBuilder = QueryBuilders.boolQuery();
+
+ QueryBuilder query =
+ SearchRequestHandler.getBuilder(entitySpecs, searchConfiguration, customSearchConfiguration)
+ .getQuery(input, false);
+ queryBuilder.must(query);
+
+ if (!path.isEmpty()) {
+ queryBuilder.filter(QueryBuilders.matchQuery(BROWSE_PATH_V2, path));
+ }
+
+ queryBuilder.filter(QueryBuilders.rangeQuery(BROWSE_PATH_V2_DEPTH).gt(browseDepthVal));
+
+ queryBuilder.filter(SearchRequestHandler.getFilterQuery(filter));
+
+ return queryBuilder;
+ }
+
@Nonnull
private AggregationBuilder buildAggregationsV2(@Nonnull String path) {
final String currentLevel = ESUtils.escapeReservedCharacters(path) + "␟.*";
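Editor's note: the cross-entity browse added above follows the same shape as the single-entity V2 browse, i.e. one search request fanned out over every requested entity index, with the groups derived from aggregations rather than hits. The following is a minimal sketch of that request shape using the OpenSearch high-level client, not the DataHub implementation itself; the field names (`browsePathV2`, `browsePathV2Depth`), the example index names, and the aggregation size are illustrative assumptions.

```java
import org.opensearch.action.search.SearchRequest;
import org.opensearch.index.query.BoolQueryBuilder;
import org.opensearch.index.query.QueryBuilders;
import org.opensearch.search.aggregations.AggregationBuilders;
import org.opensearch.search.builder.SearchSourceBuilder;

public class BrowseAcrossEntitiesSketch {
  public static SearchRequest build(String[] entityIndices, String path, String input, int pathDepth) {
    BoolQueryBuilder query = QueryBuilders.boolQuery()
        // full-text portion of the request ("*" when the input is empty)
        .must(QueryBuilders.queryStringQuery(input.isEmpty() ? "*" : input));
    if (!path.isEmpty()) {
      // stay under the browse path currently being expanded
      query.filter(QueryBuilders.matchQuery("browsePathV2", path));
    }
    // only return groups deeper than the current browse level
    query.filter(QueryBuilders.rangeQuery("browsePathV2Depth").gt(pathDepth));

    SearchSourceBuilder source = new SearchSourceBuilder()
        .size(0) // groups come from aggregations, not hits
        .query(query)
        .aggregation(AggregationBuilders.terms("groups").field("browsePathV2").size(100));

    // a single request across all entity indices, e.g. {"datasetindex_v2", "dashboardindex_v2"}
    return new SearchRequest(entityIndices).source(source);
  }
}
```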
diff --git a/metadata-service/configuration/src/main/resources/application.yml b/metadata-service/configuration/src/main/resources/application.yml
index a52b705cb8da63..0ea6b8712953e4 100644
--- a/metadata-service/configuration/src/main/resources/application.yml
+++ b/metadata-service/configuration/src/main/resources/application.yml
@@ -317,6 +317,7 @@ featureFlags:
showAccessManagement: ${SHOW_ACCESS_MANAGEMENT:false} #Whether we should show AccessManagement tab in the datahub UI.
showSearchFiltersV2: ${SHOW_SEARCH_FILTERS_V2:true} # Enables showing the search filters V2 experience.
showBrowseV2: ${SHOW_BROWSE_V2:true} # Enables showing the browse v2 sidebar experience.
+ platformBrowseV2: ${PLATFORM_BROWSE_V2:false} # Enables the platform browse experience, instead of the entity-oriented browse default.
preProcessHooks:
uiEnabled: ${PRE_PROCESS_HOOKS_UI_ENABLED:true} # Circumvents Kafka for processing index updates for UI changes sourced from GraphQL to avoid processing delays
showAcrylInfo: ${SHOW_ACRYL_INFO:false} # Show different CTAs within DataHub around moving to Managed DataHub. Set to true for the demo site.
diff --git a/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/EntityClient.java b/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/EntityClient.java
index 7bc50a8f3dc7e6..598c252b4f7664 100644
--- a/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/EntityClient.java
+++ b/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/EntityClient.java
@@ -153,6 +153,28 @@ public BrowseResultV2 browseV2(
@Nonnull Authentication authentication)
throws RemoteInvocationException;
+ /**
+ * Gets a browse snapshot of a given path across the requested entities
+ *
+ * @param entityNames entities being browsed
+ * @param path path being browsed
+ * @param filter browse filter
+ * @param input search query
+ * @param start start offset of first group
+ * @param count max number of results requested
+ * @throws RemoteInvocationException
+ */
+ @Nonnull
+ public BrowseResultV2 browseV2(
+ @Nonnull List<String> entityNames,
+ @Nonnull String path,
+ @Nullable Filter filter,
+ @Nonnull String input,
+ int start,
+ int count,
+ @Nonnull Authentication authentication)
+ throws RemoteInvocationException;
+
@Deprecated
public void update(@Nonnull final Entity entity, @Nonnull final Authentication authentication)
throws RemoteInvocationException;
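Editor's note: for illustration only (not part of this patch), a caller of the new multi-entity overload might look like the sketch below. The entity names, path, and paging values are arbitrary, and the import paths are assumed from the file locations in this patch.

```java
import com.datahub.authentication.Authentication;
import com.linkedin.entity.client.EntityClient;
import com.linkedin.metadata.browse.BrowseResultV2;
import com.linkedin.r2.RemoteInvocationException;
import java.util.List;

public class PlatformBrowseExample {
  /** Browse the first page of groups under /prod across a few entity types. */
  public static BrowseResultV2 browseProdFolder(EntityClient entityClient, Authentication auth)
      throws RemoteInvocationException {
    return entityClient.browseV2(
        List.of("dataset", "dashboard", "chart"), // entityNames
        "/prod",                                  // path
        null,                                     // filter (nullable)
        "*",                                      // input
        0,                                        // start
        10,                                       // count
        auth);
  }
}
```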
diff --git a/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/RestliEntityClient.java b/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/RestliEntityClient.java
index c854cb9dd279ec..d68c472ea91709 100644
--- a/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/RestliEntityClient.java
+++ b/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/RestliEntityClient.java
@@ -381,6 +381,20 @@ public BrowseResultV2 browseV2(
throw new NotImplementedException("BrowseV2 is not implemented in Restli yet");
}
+ @Nonnull
+ @Override
+ public BrowseResultV2 browseV2(
+ @Nonnull List<String> entityNames,
+ @Nonnull String path,
+ @Nullable Filter filter,
+ @Nonnull String input,
+ int start,
+ int count,
+ @Nonnull Authentication authentication)
+ throws RemoteInvocationException {
+ throw new NotImplementedException("BrowseV2 is not implemented in Restli yet");
+ }
+
public void update(@Nonnull final Entity entity, @Nonnull final Authentication authentication)
throws RemoteInvocationException {
EntitiesDoIngestRequestBuilder requestBuilder =
diff --git a/metadata-service/services/src/main/java/com/linkedin/metadata/search/EntitySearchService.java b/metadata-service/services/src/main/java/com/linkedin/metadata/search/EntitySearchService.java
index 09a63e769f0253..189ae09e1b9382 100644
--- a/metadata-service/services/src/main/java/com/linkedin/metadata/search/EntitySearchService.java
+++ b/metadata-service/services/src/main/java/com/linkedin/metadata/search/EntitySearchService.java
@@ -207,6 +207,25 @@ public BrowseResultV2 browseV2(
int start,
int count);
+ /**
+ * Gets a browse snapshot of a given path across the requested entities
+ *
+ * @param entityNames set of entities being browsed
+ * @param path path being browsed
+ * @param filter browse filter
+ * @param input search query
+ * @param start start offset of first group
+ * @param count max number of results requested
+ */
+ @Nonnull
+ public BrowseResultV2 browseV2(
+ @Nonnull List<String> entityNames,
+ @Nonnull String path,
+ @Nullable Filter filter,
+ @Nonnull String input,
+ int start,
+ int count);
+
/**
* Gets a list of paths for a given urn.
*
From 1124ccc4ee02e60980af19d525d5203dd6719a1d Mon Sep 17 00:00:00 2001
From: kushagra-apptware <81357546+kushagra-apptware@users.noreply.github.com>
Date: Tue, 19 Dec 2023 17:29:37 +0530
Subject: [PATCH 041/540] fix(ui/users): searching for users on Users page
shows incorrect roles (#9474)
---
datahub-web-react/src/app/identity/user/UserList.tsx | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/datahub-web-react/src/app/identity/user/UserList.tsx b/datahub-web-react/src/app/identity/user/UserList.tsx
index dce3aa2c68a8dc..8e2bc21f0693f7 100644
--- a/datahub-web-react/src/app/identity/user/UserList.tsx
+++ b/datahub-web-react/src/app/identity/user/UserList.tsx
@@ -77,7 +77,7 @@ export const UserList = () => {
query: (query?.length && query) || undefined,
},
},
- fetchPolicy: (query?.length || 0) > 0 ? 'no-cache' : 'cache-first',
+ fetchPolicy: 'no-cache',
});
const totalUsers = usersData?.listUsers?.total || 0;
From 94a1603676b6a0fb9e2129b416caf39b100f6d0f Mon Sep 17 00:00:00 2001
From: Tamas Nemeth
Date: Tue, 19 Dec 2023 16:30:21 +0100
Subject: [PATCH 042/540] fix(ingest/redshift): Fixing operation query to not
return duplicate operations (#9481)
---
.../ingestion/source/redshift/usage.py | 26 ++++++++++++-------
1 file changed, 16 insertions(+), 10 deletions(-)
diff --git a/metadata-ingestion/src/datahub/ingestion/source/redshift/usage.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/usage.py
index 409027a8805a0d..e40406b994c9b2 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/redshift/usage.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/usage.py
@@ -85,15 +85,18 @@
sq.endtime AS endtime,
'insert' AS operation_type
FROM
- stl_insert si
+ (select userid, query, sum(rows) as rows, tbl
+ from stl_insert si
+ where si.rows > 0
+ AND si.starttime >= '{start_time}'
+ AND si.starttime < '{end_time}'
+ group by userid, query, tbl
+ ) as si
JOIN svv_table_info sti ON si.tbl = sti.table_id
JOIN stl_query sq ON si.query = sq.query
JOIN svl_user_info sui ON sq.userid = sui.usesysid
WHERE
- si.starttime >= '{start_time}'
- AND si.starttime < '{end_time}'
- AND si.rows > 0
- AND sq.aborted = 0)
+ sq.aborted = 0)
UNION
(SELECT
DISTINCT sd.userid AS userid,
@@ -109,15 +112,18 @@
sq.endtime AS endtime,
'delete' AS operation_type
FROM
- stl_delete sd
+ (select userid, query, sum(rows) as rows, tbl
+ from stl_delete sd
+ where sd.rows > 0
+ AND sd.starttime >= '{start_time}'
+ AND sd.starttime < '{end_time}'
+ group by userid, query, tbl
+ ) as sd
JOIN svv_table_info sti ON sd.tbl = sti.table_id
JOIN stl_query sq ON sd.query = sq.query
JOIN svl_user_info sui ON sq.userid = sui.usesysid
WHERE
- sd.starttime >= '{start_time}'
- AND sd.starttime < '{end_time}'
- AND sd.rows > 0
- AND sq.aborted = 0)
+ sq.aborted = 0)
ORDER BY
endtime DESC
""".strip()
From 265d6bdb534c17b1b370033b81a5c20c434b49d0 Mon Sep 17 00:00:00 2001
From: purnimagarg1 <139125209+purnimagarg1@users.noreply.github.com>
Date: Tue, 19 Dec 2023 22:41:18 +0530
Subject: [PATCH 043/540] Fade recipe section to transparent on Ingestion Run
Details (#9404)
---
.../ExecutionRequestDetailsModal.tsx | 35 +++++++++++--------
1 file changed, 20 insertions(+), 15 deletions(-)
diff --git a/datahub-web-react/src/app/ingest/source/executions/ExecutionRequestDetailsModal.tsx b/datahub-web-react/src/app/ingest/source/executions/ExecutionRequestDetailsModal.tsx
index 96dfc05e391532..0799f8af1173dc 100644
--- a/datahub-web-react/src/app/ingest/source/executions/ExecutionRequestDetailsModal.tsx
+++ b/datahub-web-react/src/app/ingest/source/executions/ExecutionRequestDetailsModal.tsx
@@ -83,11 +83,11 @@ const ShowMoreButton = styled(Button)`
padding: 0px;
`;
-const LogsContainer = styled.div`
+const DetailsContainer = styled.div`
margin-bottom: -25px;
${(props) =>
- props.areLogsExpandable &&
- !props.showExpandedLogs &&
+ props.areDetailsExpandable &&
+ !props.showExpandedDetails &&
`
-webkit-mask-image: linear-gradient(to bottom, rgba(0,0,0,1) 50%, rgba(255,0,0,0.5) 60%, rgba(255,0,0,0) 90% );
mask-image: linear-gradient(to bottom, rgba(0,0,0,1) 50%, rgba(255,0,0,0.5) 60%, rgba(255,0,0,0) 90%);
@@ -102,9 +102,9 @@ const modalBodyStyle = {
padding: 0,
};
-type LogsContainerProps = {
- showExpandedLogs: boolean;
- areLogsExpandable: boolean;
+type DetailsContainerProps = {
+ showExpandedDetails: boolean;
+ areDetailsExpandable: boolean;
};
type Props = {
@@ -124,7 +124,7 @@ export const ExecutionDetailsModal = ({ urn, visible, onClose }: Props) => {
downloadFile(output, `exec-${urn}.log`);
};
- const logs = (showExpandedLogs && output) || output.slice(0, 250);
+ const logs = (showExpandedLogs && output) || output?.split('\n').slice(0, 5).join('\n');
const result = data?.executionRequest?.result?.status;
useEffect(() => {
@@ -154,10 +154,10 @@ export const ExecutionDetailsModal = ({ urn, visible, onClose }: Props) => {
} catch (e) {
recipeYaml = '';
}
- const recipe = showExpandedRecipe ? recipeYaml : recipeYaml?.split('\n').slice(0, 1).join('\n');
+ const recipe = showExpandedRecipe ? recipeYaml : recipeYaml?.split('\n').slice(0, 5).join('\n');
- const areLogsExpandable = output.length > 250;
- const isRecipeExpandable = recipeYaml?.includes('\n');
+ const areLogsExpandable = output?.split(/\r\n|\r|\n/)?.length > 5;
+ const isRecipeExpandable = recipeYaml?.split(/\r\n|\r|\n/)?.length > 5;
return (
{
Download
-
+
{`${logs}${!showExpandedLogs && areLogsExpandable ? '...' : ''}`}
-
+
{areLogsExpandable && (
setShowExpandedLogs(!showExpandedLogs)}>
{showExpandedLogs ? 'Hide' : 'Show More'}
@@ -216,9 +216,14 @@ export const ExecutionDetailsModal = ({ urn, visible, onClose }: Props) => {
The recipe used for this ingestion run.
-
- {`${recipe}${!showExpandedRecipe && isRecipeExpandable ? '\n...' : ''}`}
-
+
+
+ {`${recipe}${!showExpandedRecipe && isRecipeExpandable ? '...' : ''}`}
+
+
{isRecipeExpandable && (
setShowExpandedRecipe((v) => !v)}>
{showExpandedRecipe ? 'Hide' : 'Show More'}
From 92c9940bbd5fd2109f62b7145cfaf981d40704c3 Mon Sep 17 00:00:00 2001
From: Ellie O'Neil <110510035+eboneil@users.noreply.github.com>
Date: Tue, 19 Dec 2023 09:24:03 -0800
Subject: [PATCH 044/540] Allow message_name field for protobuf ingestion
(#9480)
---
.../java/datahub-protobuf/build.gradle | 9 +++------
.../src/main/java/datahub/protobuf/Proto2DataHub.java | 11 +++++++++++
.../java/datahub/protobuf/ProtobufDatasetTest.java | 6 +++---
.../test/java/datahub/protobuf/ProtobufUtilsTest.java | 4 ++--
.../java/datahub/protobuf/model/ProtobufEnumTest.java | 4 ++--
.../datahub/protobuf/model/ProtobufFieldTest.java | 4 ++--
.../datahub/protobuf/model/ProtobufGraphTest.java | 4 ++--
.../datahub/protobuf/model/ProtobufMessageTest.java | 4 ++--
.../protobuf/model/ProtobufOneOfFieldTest.java | 4 ++--
.../datahub/protobuf/visitors/VisitContextTest.java | 4 ++--
.../protobuf/visitors/dataset/DatasetVisitorTest.java | 4 ++--
.../visitors/dataset/DescriptionVisitorTest.java | 4 ++--
.../protobuf/visitors/dataset/DomainVisitorTest.java | 4 ++--
.../dataset/InstitutionalMemoryVisitorTest.java | 4 ++--
.../dataset/KafkaTopicPropertyVisitorTest.java | 4 ++--
.../visitors/dataset/OwnershipVisitorTest.java | 4 ++--
.../visitors/dataset/PropertyVisitorTest.java | 4 ++--
.../visitors/dataset/TermAssociationVisitorTest.java | 4 ++--
.../field/ProtobufExtensionFieldVisitorTest.java | 4 ++--
.../visitors/field/SchemaFieldVisitorTest.java | 4 ++--
.../datahub/protobuf/visitors/tag/TagVisitorTest.java | 4 ++--
21 files changed, 53 insertions(+), 45 deletions(-)
diff --git a/metadata-integration/java/datahub-protobuf/build.gradle b/metadata-integration/java/datahub-protobuf/build.gradle
index 2cb36a14cb9c7d..c8082b875d3212 100644
--- a/metadata-integration/java/datahub-protobuf/build.gradle
+++ b/metadata-integration/java/datahub-protobuf/build.gradle
@@ -31,10 +31,10 @@ dependencies {
implementation externalDependency.commonsCli
implementation externalDependency.httpAsyncClient
implementation externalDependency.slf4jApi
+ implementation externalDependency.jacksonCore
compileOnly externalDependency.lombok
annotationProcessor externalDependency.lombok
- testImplementation externalDependency.junitJupiterApi
- testRuntimeOnly externalDependency.junitJupiterEngine
+ testImplementation externalDependency.testng
}
import java.nio.file.Paths
@@ -61,10 +61,7 @@ jacocoTestReport {
dependsOn test // tests are required to run before generating the report
}
-test {
- useJUnit()
- finalizedBy jacocoTestReport
-}
+test.finalizedBy jacocoTestReport
task checkShadowJar(type: Exec) {
diff --git a/metadata-integration/java/datahub-protobuf/src/main/java/datahub/protobuf/Proto2DataHub.java b/metadata-integration/java/datahub-protobuf/src/main/java/datahub/protobuf/Proto2DataHub.java
index dcc95222fabf23..429c6d6bfeba48 100644
--- a/metadata-integration/java/datahub-protobuf/src/main/java/datahub/protobuf/Proto2DataHub.java
+++ b/metadata-integration/java/datahub-protobuf/src/main/java/datahub/protobuf/Proto2DataHub.java
@@ -67,6 +67,13 @@ public class Proto2DataHub {
"[Optional if using --directory] The protobuf source file. Typically a .proto file.")
.build();
+ private static final Option OPTION_MESSAGE_NAME =
+ Option.builder()
+ .longOpt("message_name")
+ .hasArg()
+ .desc("[Optional] The protobuf message name to read from.")
+ .build();
+
private static final Option OPTION_DIR =
Option.builder()
.longOpt("directory")
@@ -166,6 +173,7 @@ static class AppConfig {
private final String dataPlatform;
private final String protoc;
private final String inputFile;
+ private final String messageName;
private final String inputDir;
private final TransportOptions transport;
private final String filename;
@@ -191,6 +199,7 @@ static class AppConfig {
dataPlatform = cli.getOptionValue(OPTION_DATAHUB_PLATFORM, "kafka").toLowerCase(Locale.ROOT);
protoc = cli.getOptionValue(OPTION_DESCRIPTOR);
inputFile = cli.getOptionValue(OPTION_FILE, null);
+ messageName = cli.getOptionValue(OPTION_MESSAGE_NAME, null);
transport =
TransportOptions.valueOf(
cli.getOptionValue(OPTION_TRANSPORT, "rest").toUpperCase(Locale.ROOT));
@@ -250,6 +259,7 @@ public static void main(String[] args) throws Exception {
.addOption(OPTION_DATAHUB_TOKEN)
.addOption(OPTION_DESCRIPTOR)
.addOption(OPTION_FILE)
+ .addOption(OPTION_MESSAGE_NAME)
.addOption(OPTION_DIR)
.addOption(OPTION_EXCLUDE_PATTERN)
.addOption(OPTION_DATAHUB_USER)
@@ -354,6 +364,7 @@ public static void main(String[] args) throws Exception {
.setGithubOrganization(config.githubOrg)
.setSlackTeamId(config.slackId)
.setSubType(config.subType)
+ .setMessageName(config.messageName)
.build();
dataset
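Editor's note: the `--message_name` flag above is wired up with Apache Commons CLI. As a standalone illustration (not part of this patch), the same option can be defined and read as in the sketch below; the class name and printed output are placeholders.

```java
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.DefaultParser;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;

public class MessageNameOptionSketch {
  public static void main(String[] args) throws ParseException {
    // same shape as OPTION_MESSAGE_NAME in Proto2DataHub
    Option messageName = Option.builder()
        .longOpt("message_name")
        .hasArg()
        .desc("[Optional] The protobuf message name to read from.")
        .build();

    CommandLine cli = new DefaultParser().parse(new Options().addOption(messageName), args);
    // null when the flag is omitted, mirroring cli.getOptionValue(OPTION_MESSAGE_NAME, null)
    System.out.println("message_name = " + cli.getOptionValue("message_name", null));
  }
}
```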
diff --git a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/ProtobufDatasetTest.java b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/ProtobufDatasetTest.java
index e96bb63220b04e..62f3b0453be097 100644
--- a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/ProtobufDatasetTest.java
+++ b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/ProtobufDatasetTest.java
@@ -1,8 +1,8 @@
package datahub.protobuf;
import static datahub.protobuf.TestFixtures.*;
-import static org.junit.jupiter.api.Assertions.assertEquals;
-import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.testng.Assert.assertEquals;
+import static org.testng.Assert.assertNotNull;
import com.linkedin.common.FabricType;
import com.linkedin.common.GlobalTags;
@@ -34,7 +34,7 @@
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;
-import org.junit.jupiter.api.Test;
+import org.testng.annotations.Test;
public class ProtobufDatasetTest {
diff --git a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/ProtobufUtilsTest.java b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/ProtobufUtilsTest.java
index e2599cb4c3f685..9bf649041e035f 100644
--- a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/ProtobufUtilsTest.java
+++ b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/ProtobufUtilsTest.java
@@ -2,13 +2,13 @@
import static datahub.protobuf.TestFixtures.getTestProtobufFileSet;
import static datahub.protobuf.TestFixtures.getTestProtoc;
-import static org.junit.jupiter.api.Assertions.*;
+import static org.testng.Assert.*;
import com.google.protobuf.DescriptorProtos;
import com.google.protobuf.ExtensionRegistry;
import datahub.protobuf.model.ProtobufGraph;
import java.io.IOException;
-import org.junit.jupiter.api.Test;
+import org.testng.annotations.Test;
public class ProtobufUtilsTest {
diff --git a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/model/ProtobufEnumTest.java b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/model/ProtobufEnumTest.java
index fed9f250b359fe..ae539a8e8fa4a1 100644
--- a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/model/ProtobufEnumTest.java
+++ b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/model/ProtobufEnumTest.java
@@ -1,6 +1,6 @@
package datahub.protobuf.model;
-import static org.junit.jupiter.api.Assertions.*;
+import static org.testng.Assert.*;
import com.google.protobuf.DescriptorProtos.DescriptorProto;
import com.google.protobuf.DescriptorProtos.EnumDescriptorProto;
@@ -11,7 +11,7 @@
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;
-import org.junit.jupiter.api.Test;
+import org.testng.annotations.Test;
public class ProtobufEnumTest {
diff --git a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/model/ProtobufFieldTest.java b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/model/ProtobufFieldTest.java
index 6d4dc8bc4d5850..9508f4778e5c88 100644
--- a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/model/ProtobufFieldTest.java
+++ b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/model/ProtobufFieldTest.java
@@ -1,7 +1,7 @@
package datahub.protobuf.model;
import static datahub.protobuf.TestFixtures.*;
-import static org.junit.jupiter.api.Assertions.*;
+import static org.testng.Assert.*;
import com.google.protobuf.DescriptorProtos.DescriptorProto;
import com.google.protobuf.DescriptorProtos.FieldDescriptorProto;
@@ -22,7 +22,7 @@
import java.io.IOException;
import java.util.Arrays;
import java.util.Set;
-import org.junit.jupiter.api.Test;
+import org.testng.annotations.Test;
public class ProtobufFieldTest {
private static final DescriptorProto EXPECTED_MESSAGE_PROTO =
diff --git a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/model/ProtobufGraphTest.java b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/model/ProtobufGraphTest.java
index 488222b87766d8..6ca0c5b45cb5ec 100644
--- a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/model/ProtobufGraphTest.java
+++ b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/model/ProtobufGraphTest.java
@@ -2,14 +2,14 @@
import static datahub.protobuf.TestFixtures.getTestProtobufFileSet;
import static datahub.protobuf.TestFixtures.getTestProtobufGraph;
-import static org.junit.jupiter.api.Assertions.*;
+import static org.testng.Assert.*;
import com.google.protobuf.DescriptorProtos.FileDescriptorSet;
import java.io.IOException;
import java.util.HashSet;
import java.util.List;
import java.util.stream.Collectors;
-import org.junit.jupiter.api.Test;
+import org.testng.annotations.Test;
public class ProtobufGraphTest {
diff --git a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/model/ProtobufMessageTest.java b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/model/ProtobufMessageTest.java
index 1d6b3907d76d9d..1126895aec57a6 100644
--- a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/model/ProtobufMessageTest.java
+++ b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/model/ProtobufMessageTest.java
@@ -1,6 +1,6 @@
package datahub.protobuf.model;
-import static org.junit.jupiter.api.Assertions.*;
+import static org.testng.Assert.*;
import com.google.protobuf.DescriptorProtos.DescriptorProto;
import com.google.protobuf.DescriptorProtos.FileDescriptorProto;
@@ -11,7 +11,7 @@
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;
-import org.junit.jupiter.api.Test;
+import org.testng.annotations.Test;
public class ProtobufMessageTest {
diff --git a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/model/ProtobufOneOfFieldTest.java b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/model/ProtobufOneOfFieldTest.java
index c8bd8a322aad56..9db06f23a2bdf3 100644
--- a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/model/ProtobufOneOfFieldTest.java
+++ b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/model/ProtobufOneOfFieldTest.java
@@ -1,6 +1,6 @@
package datahub.protobuf.model;
-import static org.junit.jupiter.api.Assertions.*;
+import static org.testng.Assert.*;
import com.google.protobuf.DescriptorProtos.DescriptorProto;
import com.google.protobuf.DescriptorProtos.FieldDescriptorProto;
@@ -12,7 +12,7 @@
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;
-import org.junit.jupiter.api.Test;
+import org.testng.annotations.Test;
public class ProtobufOneOfFieldTest {
diff --git a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/VisitContextTest.java b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/VisitContextTest.java
index 2fc5f3834a749f..fe27af7461860b 100644
--- a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/VisitContextTest.java
+++ b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/VisitContextTest.java
@@ -2,7 +2,7 @@
import static datahub.protobuf.TestFixtures.getTestProtobufFileSet;
import static datahub.protobuf.TestFixtures.getTestProtobufGraph;
-import static org.junit.jupiter.api.Assertions.assertNotEquals;
+import static org.testng.Assert.assertNotEquals;
import com.google.protobuf.DescriptorProtos.FileDescriptorSet;
import datahub.protobuf.model.FieldTypeEdge;
@@ -13,7 +13,7 @@
import java.util.Set;
import java.util.stream.Collectors;
import org.jgrapht.GraphPath;
-import org.junit.jupiter.api.Test;
+import org.testng.annotations.Test;
public class VisitContextTest {
diff --git a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/dataset/DatasetVisitorTest.java b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/dataset/DatasetVisitorTest.java
index de9a0f5ec4abee..6e99599c852b43 100644
--- a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/dataset/DatasetVisitorTest.java
+++ b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/dataset/DatasetVisitorTest.java
@@ -1,7 +1,7 @@
package datahub.protobuf.visitors.dataset;
import static datahub.protobuf.TestFixtures.*;
-import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.testng.Assert.assertEquals;
import com.linkedin.common.urn.DatasetUrn;
import com.linkedin.data.template.RecordTemplate;
@@ -14,7 +14,7 @@
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.Stream;
-import org.junit.jupiter.api.Test;
+import org.testng.annotations.Test;
public class DatasetVisitorTest {
diff --git a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/dataset/DescriptionVisitorTest.java b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/dataset/DescriptionVisitorTest.java
index 679048fb48a53d..42d8f1ad4c83c6 100644
--- a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/dataset/DescriptionVisitorTest.java
+++ b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/dataset/DescriptionVisitorTest.java
@@ -1,14 +1,14 @@
package datahub.protobuf.visitors.dataset;
import static datahub.protobuf.TestFixtures.*;
-import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.testng.Assert.assertEquals;
import datahub.protobuf.model.ProtobufGraph;
import java.io.IOException;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
-import org.junit.jupiter.api.Test;
+import org.testng.annotations.Test;
public class DescriptionVisitorTest {
diff --git a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/dataset/DomainVisitorTest.java b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/dataset/DomainVisitorTest.java
index c24fc30766f0ed..3330c09c494364 100644
--- a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/dataset/DomainVisitorTest.java
+++ b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/dataset/DomainVisitorTest.java
@@ -2,7 +2,7 @@
import static datahub.protobuf.TestFixtures.getTestProtobufGraph;
import static datahub.protobuf.TestFixtures.getVisitContextBuilder;
-import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.testng.Assert.assertEquals;
import com.linkedin.common.urn.Urn;
import datahub.protobuf.model.ProtobufGraph;
@@ -10,7 +10,7 @@
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
-import org.junit.jupiter.api.Test;
+import org.testng.annotations.Test;
public class DomainVisitorTest {
diff --git a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/dataset/InstitutionalMemoryVisitorTest.java b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/dataset/InstitutionalMemoryVisitorTest.java
index a57916441bfcb3..45be30fe96210b 100644
--- a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/dataset/InstitutionalMemoryVisitorTest.java
+++ b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/dataset/InstitutionalMemoryVisitorTest.java
@@ -1,7 +1,7 @@
package datahub.protobuf.visitors.dataset;
import static datahub.protobuf.TestFixtures.*;
-import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.testng.Assert.assertEquals;
import com.linkedin.common.InstitutionalMemoryMetadata;
import com.linkedin.common.url.Url;
@@ -9,7 +9,7 @@
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
-import org.junit.jupiter.api.Test;
+import org.testng.annotations.Test;
public class InstitutionalMemoryVisitorTest {
diff --git a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/dataset/KafkaTopicPropertyVisitorTest.java b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/dataset/KafkaTopicPropertyVisitorTest.java
index 5f8572cf6ddd83..2da53dad2c0be1 100644
--- a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/dataset/KafkaTopicPropertyVisitorTest.java
+++ b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/dataset/KafkaTopicPropertyVisitorTest.java
@@ -2,7 +2,7 @@
import static datahub.protobuf.TestFixtures.getTestProtobufGraph;
import static datahub.protobuf.TestFixtures.getVisitContextBuilder;
-import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.testng.Assert.assertEquals;
import com.linkedin.data.template.StringMap;
import com.linkedin.dataset.DatasetProperties;
@@ -11,7 +11,7 @@
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
-import org.junit.jupiter.api.Test;
+import org.testng.annotations.Test;
public class KafkaTopicPropertyVisitorTest {
diff --git a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/dataset/OwnershipVisitorTest.java b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/dataset/OwnershipVisitorTest.java
index 1b0aff28eb5176..adc94487dab3cc 100644
--- a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/dataset/OwnershipVisitorTest.java
+++ b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/dataset/OwnershipVisitorTest.java
@@ -2,7 +2,7 @@
import static datahub.protobuf.TestFixtures.getTestProtobufGraph;
import static datahub.protobuf.TestFixtures.getVisitContextBuilder;
-import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.testng.Assert.assertEquals;
import com.linkedin.common.Owner;
import com.linkedin.common.OwnershipSource;
@@ -14,7 +14,7 @@
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
-import org.junit.jupiter.api.Test;
+import org.testng.annotations.Test;
public class OwnershipVisitorTest {
diff --git a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/dataset/PropertyVisitorTest.java b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/dataset/PropertyVisitorTest.java
index 13912100f28a5d..be653309540519 100644
--- a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/dataset/PropertyVisitorTest.java
+++ b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/dataset/PropertyVisitorTest.java
@@ -3,7 +3,7 @@
import static datahub.protobuf.TestFixtures.getTestProtobufGraph;
import static datahub.protobuf.TestFixtures.getVisitContextBuilder;
import static java.util.Map.entry;
-import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.testng.Assert.assertEquals;
import com.linkedin.data.template.StringMap;
import com.linkedin.dataset.DatasetProperties;
@@ -11,7 +11,7 @@
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
-import org.junit.jupiter.api.Test;
+import org.testng.annotations.Test;
public class PropertyVisitorTest {
diff --git a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/dataset/TermAssociationVisitorTest.java b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/dataset/TermAssociationVisitorTest.java
index f734c00bb76e08..79e7075c652094 100644
--- a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/dataset/TermAssociationVisitorTest.java
+++ b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/dataset/TermAssociationVisitorTest.java
@@ -2,7 +2,7 @@
import static datahub.protobuf.TestFixtures.getTestProtobufGraph;
import static datahub.protobuf.TestFixtures.getVisitContextBuilder;
-import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.testng.Assert.assertEquals;
import com.linkedin.common.GlossaryTermAssociation;
import com.linkedin.common.urn.GlossaryTermUrn;
@@ -10,7 +10,7 @@
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
-import org.junit.jupiter.api.Test;
+import org.testng.annotations.Test;
public class TermAssociationVisitorTest {
diff --git a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/field/ProtobufExtensionFieldVisitorTest.java b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/field/ProtobufExtensionFieldVisitorTest.java
index eec397011a4ce4..ff1aa643ac8df6 100644
--- a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/field/ProtobufExtensionFieldVisitorTest.java
+++ b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/field/ProtobufExtensionFieldVisitorTest.java
@@ -1,7 +1,7 @@
package datahub.protobuf.visitors.field;
import static datahub.protobuf.TestFixtures.*;
-import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.testng.Assert.assertEquals;
import com.linkedin.common.GlobalTags;
import com.linkedin.common.GlossaryTermAssociation;
@@ -23,7 +23,7 @@
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.Stream;
-import org.junit.jupiter.api.Test;
+import org.testng.annotations.Test;
public class ProtobufExtensionFieldVisitorTest {
diff --git a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/field/SchemaFieldVisitorTest.java b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/field/SchemaFieldVisitorTest.java
index af31a80d3b53ad..59d9e0ca6e5183 100644
--- a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/field/SchemaFieldVisitorTest.java
+++ b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/field/SchemaFieldVisitorTest.java
@@ -2,7 +2,7 @@
import static datahub.protobuf.TestFixtures.getTestProtobufGraph;
import static datahub.protobuf.TestFixtures.getVisitContextBuilder;
-import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.testng.Assert.assertEquals;
import com.linkedin.schema.NumberType;
import com.linkedin.schema.SchemaField;
@@ -15,7 +15,7 @@
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.Stream;
-import org.junit.jupiter.api.Test;
+import org.testng.annotations.Test;
public class SchemaFieldVisitorTest {
diff --git a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/tag/TagVisitorTest.java b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/tag/TagVisitorTest.java
index 258d816d9d1da3..ab477e19aabe4d 100644
--- a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/tag/TagVisitorTest.java
+++ b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/tag/TagVisitorTest.java
@@ -2,7 +2,7 @@
import static datahub.protobuf.TestFixtures.getTestProtobufGraph;
import static datahub.protobuf.TestFixtures.getVisitContextBuilder;
-import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.testng.Assert.assertEquals;
import com.linkedin.tag.TagProperties;
import datahub.event.MetadataChangeProposalWrapper;
@@ -11,7 +11,7 @@
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
-import org.junit.jupiter.api.Test;
+import org.testng.annotations.Test;
public class TagVisitorTest {
From 8f19138f68ce6376588f4e09617be7e3c325a70f Mon Sep 17 00:00:00 2001
From: david-leifker <114954101+david-leifker@users.noreply.github.com>
Date: Tue, 19 Dec 2023 12:00:54 -0600
Subject: [PATCH 045/540] feat(docker-compose): consolidate docker-compose
profiles (#9478)
---
build.gradle | 1 +
.../upgrade/config/NoCodeCleanupConfig.java | 12 +
.../upgrade/config/NoCodeUpgradeConfig.java | 12 +
.../upgrade/config/RestoreBackupConfig.java | 12 +
.../upgrade/config/RestoreIndicesConfig.java | 12 +
.../datahub/upgrade/nocode/NoCodeUpgrade.java | 12 +-
.../nocodecleanup/NoCodeCleanupUpgrade.java | 12 +-
.../upgrade/restorebackup/RestoreBackup.java | 12 +-
.../restoreindices/RestoreIndices.java | 9 +-
docker/build.gradle | 216 ++++-----
docker/profiles/README.md | 104 +++++
docker/profiles/cassandra | 1 +
docker/profiles/datahub-actions | 1 +
docker/profiles/datahub-frontend | 1 +
docker/profiles/datahub-gms | 1 +
docker/profiles/datahub-mae-consumer | 1 +
docker/profiles/datahub-mce-consumer | 1 +
docker/profiles/datahub-upgrade | 1 +
docker/profiles/docker-compose.actions.yml | 45 ++
docker/profiles/docker-compose.frontend.yml | 119 +++++
docker/profiles/docker-compose.gms.yml | 429 ++++++++++++++++++
.../profiles/docker-compose.prerequisites.yml | 387 ++++++++++++++++
docker/profiles/docker-compose.yml | 13 +
docker/profiles/elasticsearch | 1 +
docker/profiles/elasticsearch-setup | 1 +
docker/profiles/kafka-broker | 1 +
docker/profiles/kafka-setup | 1 +
docker/profiles/monitoring | 1 +
docker/profiles/mysql | 1 +
docker/profiles/mysql-setup | 1 +
docker/profiles/neo4j | 1 +
docker/profiles/postgres | 1 +
docker/profiles/postgres-setup | 1 +
33 files changed, 1288 insertions(+), 136 deletions(-)
create mode 100644 docker/profiles/README.md
create mode 120000 docker/profiles/cassandra
create mode 120000 docker/profiles/datahub-actions
create mode 120000 docker/profiles/datahub-frontend
create mode 120000 docker/profiles/datahub-gms
create mode 120000 docker/profiles/datahub-mae-consumer
create mode 120000 docker/profiles/datahub-mce-consumer
create mode 120000 docker/profiles/datahub-upgrade
create mode 100644 docker/profiles/docker-compose.actions.yml
create mode 100644 docker/profiles/docker-compose.frontend.yml
create mode 100644 docker/profiles/docker-compose.gms.yml
create mode 100644 docker/profiles/docker-compose.prerequisites.yml
create mode 100644 docker/profiles/docker-compose.yml
create mode 120000 docker/profiles/elasticsearch
create mode 120000 docker/profiles/elasticsearch-setup
create mode 120000 docker/profiles/kafka-broker
create mode 120000 docker/profiles/kafka-setup
create mode 120000 docker/profiles/monitoring
create mode 120000 docker/profiles/mysql
create mode 120000 docker/profiles/mysql-setup
create mode 120000 docker/profiles/neo4j
create mode 120000 docker/profiles/postgres
create mode 120000 docker/profiles/postgres-setup
diff --git a/build.gradle b/build.gradle
index a7a85db0398e21..bb01a15a7db8d6 100644
--- a/build.gradle
+++ b/build.gradle
@@ -46,6 +46,7 @@ plugins {
id 'com.gorylenko.gradle-git-properties' version '2.4.1'
id 'com.github.johnrengelman.shadow' version '8.1.1' apply false
id 'com.palantir.docker' version '0.35.0' apply false
+ id 'com.avast.gradle.docker-compose' version '0.17.5'
id "com.diffplug.spotless" version "6.23.3"
// https://blog.ltgt.net/javax-jakarta-mess-and-gradle-solution/
// TODO id "org.gradlex.java-ecosystem-capabilities" version "1.0"
diff --git a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/config/NoCodeCleanupConfig.java b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/config/NoCodeCleanupConfig.java
index 24bcec5852b4fc..5ba5c8a90fd4ac 100644
--- a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/config/NoCodeCleanupConfig.java
+++ b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/config/NoCodeCleanupConfig.java
@@ -7,13 +7,16 @@
import com.linkedin.metadata.utils.elasticsearch.IndexConvention;
import io.ebean.Database;
import javax.annotation.Nonnull;
+import lombok.extern.slf4j.Slf4j;
import org.opensearch.client.RestHighLevelClient;
import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty;
import org.springframework.context.ApplicationContext;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import org.springframework.context.annotation.DependsOn;
+@Slf4j
@Configuration
public class NoCodeCleanupConfig {
@@ -26,6 +29,7 @@ public class NoCodeCleanupConfig {
"elasticSearchRestHighLevelClient",
INDEX_CONVENTION_BEAN
})
+ @ConditionalOnProperty(name = "entityService.impl", havingValue = "ebean", matchIfMissing = true)
@Nonnull
public NoCodeCleanupUpgrade createInstance() {
final Database ebeanServer = applicationContext.getBean(Database.class);
@@ -34,4 +38,12 @@ public NoCodeCleanupUpgrade createInstance() {
final IndexConvention indexConvention = applicationContext.getBean(IndexConvention.class);
return new NoCodeCleanupUpgrade(ebeanServer, graphClient, searchClient, indexConvention);
}
+
+ @Bean(name = "noCodeCleanup")
+ @ConditionalOnProperty(name = "entityService.impl", havingValue = "cassandra")
+ @Nonnull
+ public NoCodeCleanupUpgrade createNotImplInstance() {
+ log.warn("NoCode is not supported for cassandra!");
+ return new NoCodeCleanupUpgrade(null, null, null, null);
+ }
}
diff --git a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/config/NoCodeUpgradeConfig.java b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/config/NoCodeUpgradeConfig.java
index 68009d7ed1718a..d968e8521867e8 100644
--- a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/config/NoCodeUpgradeConfig.java
+++ b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/config/NoCodeUpgradeConfig.java
@@ -6,12 +6,15 @@
import com.linkedin.metadata.models.registry.EntityRegistry;
import io.ebean.Database;
import javax.annotation.Nonnull;
+import lombok.extern.slf4j.Slf4j;
import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty;
import org.springframework.context.ApplicationContext;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import org.springframework.context.annotation.DependsOn;
+@Slf4j
@Configuration
public class NoCodeUpgradeConfig {
@@ -19,6 +22,7 @@ public class NoCodeUpgradeConfig {
@Bean(name = "noCodeUpgrade")
@DependsOn({"ebeanServer", "entityService", "systemRestliEntityClient", "entityRegistry"})
+ @ConditionalOnProperty(name = "entityService.impl", havingValue = "ebean", matchIfMissing = true)
@Nonnull
public NoCodeUpgrade createInstance() {
final Database ebeanServer = applicationContext.getBean(Database.class);
@@ -29,4 +33,12 @@ public NoCodeUpgrade createInstance() {
return new NoCodeUpgrade(ebeanServer, entityService, entityRegistry, entityClient);
}
+
+ @Bean(name = "noCodeUpgrade")
+ @ConditionalOnProperty(name = "entityService.impl", havingValue = "cassandra")
+ @Nonnull
+ public NoCodeUpgrade createNotImplInstance() {
+ log.warn("NoCode is not supported for cassandra!");
+ return new NoCodeUpgrade(null, null, null, null);
+ }
}
diff --git a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/config/RestoreBackupConfig.java b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/config/RestoreBackupConfig.java
index 743e4ffe84b0e4..116d62878f5c6e 100644
--- a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/config/RestoreBackupConfig.java
+++ b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/config/RestoreBackupConfig.java
@@ -8,12 +8,15 @@
import com.linkedin.metadata.search.EntitySearchService;
import io.ebean.Database;
import javax.annotation.Nonnull;
+import lombok.extern.slf4j.Slf4j;
import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty;
import org.springframework.context.ApplicationContext;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import org.springframework.context.annotation.DependsOn;
+@Slf4j
@Configuration
public class RestoreBackupConfig {
@Autowired ApplicationContext applicationContext;
@@ -27,6 +30,7 @@ public class RestoreBackupConfig {
"searchService",
"entityRegistry"
})
+ @ConditionalOnProperty(name = "entityService.impl", havingValue = "ebean", matchIfMissing = true)
@Nonnull
public RestoreBackup createInstance() {
final Database ebeanServer = applicationContext.getBean(Database.class);
@@ -40,4 +44,12 @@ public RestoreBackup createInstance() {
return new RestoreBackup(
ebeanServer, entityService, entityRegistry, entityClient, graphClient, searchClient);
}
+
+ @Bean(name = "restoreBackup")
+ @ConditionalOnProperty(name = "entityService.impl", havingValue = "cassandra")
+ @Nonnull
+ public RestoreBackup createNotImplInstance() {
+ log.warn("restoreIndices is not supported for cassandra!");
+ return new RestoreBackup(null, null, null, null, null, null);
+ }
}
diff --git a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/config/RestoreIndicesConfig.java b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/config/RestoreIndicesConfig.java
index d258c4a4d1a529..9d229f315d709d 100644
--- a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/config/RestoreIndicesConfig.java
+++ b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/config/RestoreIndicesConfig.java
@@ -7,18 +7,22 @@
import com.linkedin.metadata.search.EntitySearchService;
import io.ebean.Database;
import javax.annotation.Nonnull;
+import lombok.extern.slf4j.Slf4j;
import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty;
import org.springframework.context.ApplicationContext;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import org.springframework.context.annotation.DependsOn;
+@Slf4j
@Configuration
public class RestoreIndicesConfig {
@Autowired ApplicationContext applicationContext;
@Bean(name = "restoreIndices")
@DependsOn({"ebeanServer", "entityService", "searchService", "graphService", "entityRegistry"})
+ @ConditionalOnProperty(name = "entityService.impl", havingValue = "ebean", matchIfMissing = true)
@Nonnull
public RestoreIndices createInstance() {
final Database ebeanServer = applicationContext.getBean(Database.class);
@@ -31,4 +35,12 @@ public RestoreIndices createInstance() {
return new RestoreIndices(
ebeanServer, entityService, entityRegistry, entitySearchService, graphService);
}
+
+ @Bean(name = "restoreIndices")
+ @ConditionalOnProperty(name = "entityService.impl", havingValue = "cassandra")
+ @Nonnull
+ public RestoreIndices createNotImplInstance() {
+ log.warn("restoreIndices is not supported for cassandra!");
+ return new RestoreIndices(null, null, null, null, null);
+ }
}
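Editor's note: the upgrade configs above all use the same Spring Boot pattern, i.e. registering the same bean name twice and letting `@ConditionalOnProperty` pick the variant based on `entityService.impl` (ebean by default, cassandra otherwise). A minimal sketch of that pattern follows; `MyUpgrade` is a placeholder type, not a DataHub class.

```java
import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;

@Configuration
public class UpgradeSelectionSketch {

  /** Placeholder for an upgrade job; the flag just marks which variant was built. */
  static class MyUpgrade {
    final boolean supported;
    MyUpgrade(boolean supported) { this.supported = supported; }
  }

  @Bean(name = "myUpgrade")
  @ConditionalOnProperty(name = "entityService.impl", havingValue = "ebean", matchIfMissing = true)
  public MyUpgrade ebeanUpgrade() {
    return new MyUpgrade(true); // full upgrade steps when backed by ebean
  }

  @Bean(name = "myUpgrade")
  @ConditionalOnProperty(name = "entityService.impl", havingValue = "cassandra")
  public MyUpgrade cassandraUpgrade() {
    return new MyUpgrade(false); // no-op variant, mirroring the "not supported" warning above
  }
}
```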
diff --git a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/nocode/NoCodeUpgrade.java b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/nocode/NoCodeUpgrade.java
index 6753d309b9f501..674efb2b8ba78c 100644
--- a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/nocode/NoCodeUpgrade.java
+++ b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/nocode/NoCodeUpgrade.java
@@ -13,6 +13,7 @@
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
+import javax.annotation.Nullable;
public class NoCodeUpgrade implements Upgrade {
@@ -26,12 +27,17 @@ public class NoCodeUpgrade implements Upgrade {
// Upgrade requires the Database.
public NoCodeUpgrade(
- final Database server,
+ @Nullable final Database server,
final EntityService entityService,
final EntityRegistry entityRegistry,
final SystemRestliEntityClient entityClient) {
- _steps = buildUpgradeSteps(server, entityService, entityRegistry, entityClient);
- _cleanupSteps = buildCleanupSteps();
+ if (server != null) {
+ _steps = buildUpgradeSteps(server, entityService, entityRegistry, entityClient);
+ _cleanupSteps = buildCleanupSteps();
+ } else {
+ _steps = List.of();
+ _cleanupSteps = List.of();
+ }
}
@Override
diff --git a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/nocodecleanup/NoCodeCleanupUpgrade.java b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/nocodecleanup/NoCodeCleanupUpgrade.java
index 8a267be6ad8086..6d3125423b4433 100644
--- a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/nocodecleanup/NoCodeCleanupUpgrade.java
+++ b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/nocodecleanup/NoCodeCleanupUpgrade.java
@@ -9,6 +9,7 @@
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
+import javax.annotation.Nullable;
import org.opensearch.client.RestHighLevelClient;
public class NoCodeCleanupUpgrade implements Upgrade {
@@ -18,12 +19,17 @@ public class NoCodeCleanupUpgrade implements Upgrade {
// Upgrade requires the Database.
public NoCodeCleanupUpgrade(
- final Database server,
+ @Nullable final Database server,
final GraphService graphClient,
final RestHighLevelClient searchClient,
final IndexConvention indexConvention) {
- _steps = buildUpgradeSteps(server, graphClient, searchClient, indexConvention);
- _cleanupSteps = buildCleanupSteps();
+ if (server != null) {
+ _steps = buildUpgradeSteps(server, graphClient, searchClient, indexConvention);
+ _cleanupSteps = buildCleanupSteps();
+ } else {
+ _steps = List.of();
+ _cleanupSteps = List.of();
+ }
}
@Override
diff --git a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/restorebackup/RestoreBackup.java b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/restorebackup/RestoreBackup.java
index b11abb2d6bc23a..4ac295b4fdfb75 100644
--- a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/restorebackup/RestoreBackup.java
+++ b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/restorebackup/RestoreBackup.java
@@ -16,20 +16,26 @@
import io.ebean.Database;
import java.util.ArrayList;
import java.util.List;
+import javax.annotation.Nullable;
public class RestoreBackup implements Upgrade {
private final List<UpgradeStep> _steps;
public RestoreBackup(
- final Database server,
+ @Nullable final Database server,
final EntityService entityService,
final EntityRegistry entityRegistry,
final SystemRestliEntityClient entityClient,
final GraphService graphClient,
final EntitySearchService searchClient) {
- _steps =
- buildSteps(server, entityService, entityRegistry, entityClient, graphClient, searchClient);
+ if (server != null) {
+ _steps =
+ buildSteps(
+ server, entityService, entityRegistry, entityClient, graphClient, searchClient);
+ } else {
+ _steps = List.of();
+ }
}
@Override
diff --git a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/restoreindices/RestoreIndices.java b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/restoreindices/RestoreIndices.java
index 8bb3b0073710a3..d38685553dff2f 100644
--- a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/restoreindices/RestoreIndices.java
+++ b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/restoreindices/RestoreIndices.java
@@ -13,6 +13,7 @@
import io.ebean.Database;
import java.util.ArrayList;
import java.util.List;
+import javax.annotation.Nullable;
public class RestoreIndices implements Upgrade {
public static final String BATCH_SIZE_ARG_NAME = "batchSize";
@@ -29,12 +30,16 @@ public class RestoreIndices implements Upgrade {
private final List<UpgradeStep> _steps;
public RestoreIndices(
- final Database server,
+ @Nullable final Database server,
final EntityService entityService,
final EntityRegistry entityRegistry,
final EntitySearchService entitySearchService,
final GraphService graphService) {
- _steps = buildSteps(server, entityService, entityRegistry, entitySearchService, graphService);
+ if (server != null) {
+ _steps = buildSteps(server, entityService, entityRegistry, entitySearchService, graphService);
+ } else {
+ _steps = List.of();
+ }
}
@Override
diff --git a/docker/build.gradle b/docker/build.gradle
index bc79be501b3952..190202620c382c 100644
--- a/docker/build.gradle
+++ b/docker/build.gradle
@@ -1,6 +1,9 @@
plugins {
id 'java' // required by versioning
+ id 'docker-compose'
}
+import com.avast.gradle.dockercompose.tasks.ComposeUp
+import com.avast.gradle.dockercompose.tasks.ComposeDownForced
apply from: "../gradle/versioning/versioning.gradle"
@@ -18,144 +21,107 @@ ext {
debug_modules = quickstart_modules - [':metadata-jobs:mce-consumer-job',
':metadata-jobs:mae-consumer-job']
- debug_compose_args = [
- '-f', 'docker-compose-without-neo4j.yml',
- '-f', 'docker-compose-without-neo4j.override.yml',
- '-f', 'docker-compose-without-neo4j.m1.yml', // updates to mariadb
- '-f', 'docker-compose.dev.yml'
- ]
+ compose_args = ['-f', 'profiles/docker-compose.yml']
debug_reloadable = [
- 'datahub-gms',
- 'datahub-frontend-react'
+ 'datahub-gms-debug',
+ 'system-update-debug',
+ 'frontend-debug'
]
-
// Postgres
pg_quickstart_modules = quickstart_modules - [':docker:mysql-setup'] + [':docker:postgres-setup']
- pg_compose_args = [
- '-f', 'docker-compose-without-neo4j.yml',
- '-f', 'docker-compose-without-neo4j.postgres.override.yml'
- ]
}
-task quickstart(type: Exec, dependsOn: ':metadata-ingestion:install') {
- dependsOn(quickstart_modules.collect { it + ':dockerTag' })
- shouldRunAfter ':metadata-ingestion:clean', 'quickstartNuke'
-
- environment "DATAHUB_TELEMETRY_ENABLED", "false"
- environment "DOCKER_COMPOSE_BASE", "file://${rootProject.projectDir}"
- // environment "ACTIONS_VERSION", 'alpine3.18-slim'
- // environment "DATAHUB_ACTIONS_IMAGE", 'nginx'
-
- // Elastic
- // environment "DATAHUB_SEARCH_IMAGE", 'elasticsearch'
- // environment "DATAHUB_SEARCH_TAG", '7.10.1'
-
- // OpenSearch
- environment "DATAHUB_SEARCH_IMAGE", 'opensearchproject/opensearch'
- environment "DATAHUB_SEARCH_TAG", '2.9.0'
- environment "XPACK_SECURITY_ENABLED", 'plugins.security.disabled=true'
- environment "USE_AWS_ELASTICSEARCH", 'true'
-
- def cmd = [
- 'source ../metadata-ingestion/venv/bin/activate && ',
- 'datahub docker quickstart',
- '--no-pull-images',
- '--standalone_consumers',
- '--version', "v${version}",
- '--dump-logs-on-failure'
- ]
+tasks.register('quickstart') {}
+tasks.register('quickstartSlim') {}
+tasks.register('quickstartDebug') {}
+tasks.register('quickstartPg') {}
- commandLine 'bash', '-c', cmd.join(" ")
+tasks.withType(ComposeDownForced) {
+ removeVolumes = true
}
-
-task quickstartSlim(type: Exec, dependsOn: ':metadata-ingestion:install') {
- dependsOn(([':docker:datahub-ingestion'] + quickstart_modules).collect { it + ':dockerTag' })
- shouldRunAfter ':metadata-ingestion:clean', 'quickstartNuke'
-
- environment "DATAHUB_TELEMETRY_ENABLED", "false"
- environment "DOCKER_COMPOSE_BASE", "file://${rootProject.projectDir}"
- environment "DATAHUB_ACTIONS_IMAGE", "acryldata/datahub-ingestion"
- environment "ACTIONS_VERSION", "v${version}-slim"
- environment "ACTIONS_EXTRA_PACKAGES", 'acryl-datahub-actions[executor] acryl-datahub-actions'
- environment "ACTIONS_CONFIG", 'https://raw.githubusercontent.com/acryldata/datahub-actions/main/docker/config/executor.yaml'
-
- def cmd = [
- 'source ../metadata-ingestion/venv/bin/activate && ',
- 'datahub docker quickstart',
- '--no-pull-images',
- '--standalone_consumers',
- '--version', "v${version}",
- '--dump-logs-on-failure'
- ]
-
- commandLine 'bash', '-c', cmd.join(" ")
+task quickstartNuke {
+ finalizedBy(tasks.withType(ComposeDownForced))
}
-task quickstartNuke(type: Exec, dependsOn: ":metadata-ingestion:install") {
- shouldRunAfter(':metadata-ingestion:clean')
-
- def cmd = [
- 'source ../metadata-ingestion/venv/bin/activate && ',
- 'datahub docker nuke'
- ]
- commandLine 'bash', '-c', cmd.join(" ")
+dockerCompose {
+ quickstart {
+ isRequiredBy(tasks.named('quickstart'))
+ composeAdditionalArgs = ['--profile', 'quickstart-consumers']
+
+ environment.put 'DATAHUB_VERSION', "v${version}"
+
+ useComposeFiles = ['profiles/docker-compose.yml']
+ projectName = 'datahub'
+ projectNamePrefix = ''
+ buildBeforeUp = false
+ buildBeforePull = false
+ stopContainers = false
+ removeVolumes = false
+ }
+
+ quickstartPg {
+ isRequiredBy(tasks.named('quickstartPg'))
+ composeAdditionalArgs = ['--profile', 'quickstart-postgres']
+
+ environment.put 'DATAHUB_VERSION', "v${version}"
+
+ useComposeFiles = ['profiles/docker-compose.yml']
+ projectName = 'datahub'
+ projectNamePrefix = ''
+ buildBeforeUp = false
+ buildBeforePull = false
+ stopContainers = false
+ removeVolumes = false
+ }
+
+ quickstartSlim {
+ isRequiredBy(tasks.named('quickstartSlim'))
+ composeAdditionalArgs = ['--profile', 'quickstart-consumers']
+
+ environment.put 'DATAHUB_VERSION', "v${version}"
+ environment.put "DATAHUB_ACTIONS_IMAGE", "acryldata/datahub-ingestion"
+ environment.put "ACTIONS_VERSION", "v${version}-slim"
+ environment.put "ACTIONS_EXTRA_PACKAGES", 'acryl-datahub-actions[executor] acryl-datahub-actions'
+ environment.put "ACTIONS_CONFIG", 'https://raw.githubusercontent.com/acryldata/datahub-actions/main/docker/config/executor.yaml'
+
+ useComposeFiles = ['profiles/docker-compose.yml']
+ projectName = 'datahub'
+ projectNamePrefix = ''
+ buildBeforeUp = false
+ buildBeforePull = false
+ stopContainers = false
+ removeVolumes = false
+ }
+
+ quickstartDebug {
+ isRequiredBy(tasks.named('quickstartDebug'))
+ composeAdditionalArgs = ['--profile', 'debug']
+
+ useComposeFiles = ['profiles/docker-compose.yml']
+ projectName = 'datahub'
+ projectNamePrefix = ''
+ buildBeforeUp = false
+ buildBeforePull = false
+ stopContainers = false
+ removeVolumes = false
+ }
}
-
-task quickstartDebug(type: Exec, dependsOn: ':metadata-ingestion:install') {
- dependsOn(debug_modules.collect { it + ':dockerTagDebug' })
- shouldRunAfter ':metadata-ingestion:clean', 'quickstartNuke'
-
- environment "DATAHUB_TELEMETRY_ENABLED", "false"
- environment "DOCKER_COMPOSE_BASE", "file://${rootProject.projectDir}"
-
- // Elastic
- // environment "DATAHUB_SEARCH_IMAGE", 'elasticsearch'
- // environment "DATAHUB_SEARCH_TAG", '7.10.1'
-
- // OpenSearch
- environment "DATAHUB_SEARCH_IMAGE", 'opensearchproject/opensearch'
- environment "DATAHUB_SEARCH_TAG", '2.9.0'
- environment "XPACK_SECURITY_ENABLED", 'plugins.security.disabled=true'
- environment "USE_AWS_ELASTICSEARCH", 'true'
-
-
- def cmd = [
- 'source ../metadata-ingestion/venv/bin/activate && ',
- 'datahub docker quickstart',
- '--no-pull-images',
- '--version', "debug",
- '--dump-logs-on-failure'
- ] + debug_compose_args
- commandLine 'bash', '-c', cmd.join(" ")
+tasks.getByName('quickstartComposeUp').dependsOn(
+ quickstart_modules.collect { it + ':dockerTag' })
+tasks.getByName('quickstartPgComposeUp').dependsOn(
+ pg_quickstart_modules.collect { it + ':dockerTag' })
+tasks.getByName('quickstartSlimComposeUp').dependsOn(
+ ([':docker:datahub-ingestion'] + quickstart_modules)
+ .collect { it + ':dockerTag' })
+tasks.getByName('quickstartDebugComposeUp').dependsOn(
+ debug_modules.collect { it + ':dockerTagDebug' }
+)
+tasks.withType(ComposeUp).configureEach {
+ shouldRunAfter('quickstartNuke')
}
+
task debugReload(type: Exec) {
- def cmd = ['docker compose -p datahub'] + debug_compose_args + ['restart'] + debug_reloadable
+ def cmd = ['docker compose -p datahub --profile debug'] + compose_args + ['restart'] + debug_reloadable
commandLine 'bash', '-c', cmd.join(" ")
}
-
-task quickstartPg(type: Exec, dependsOn: ':metadata-ingestion:install') {
- dependsOn(pg_quickstart_modules.collect { it + ':dockerTag' })
- shouldRunAfter ':metadata-ingestion:clean', 'quickstartNuke'
-
- environment "DATAHUB_TELEMETRY_ENABLED", "false"
- environment "DOCKER_COMPOSE_BASE", "file://${rootProject.projectDir}"
- environment "DATAHUB_POSTGRES_VERSION", "15.5"
-
- // OpenSearch
- environment "DATAHUB_SEARCH_IMAGE", 'opensearchproject/opensearch'
- environment "DATAHUB_SEARCH_TAG", '2.9.0'
- environment "XPACK_SECURITY_ENABLED", 'plugins.security.disabled=true'
- environment "USE_AWS_ELASTICSEARCH", 'true'
-
- def cmd = [
- 'source ../metadata-ingestion/venv/bin/activate && ',
- 'datahub docker quickstart',
- '--no-pull-images',
- '--standalone_consumers',
- '--version', "v${version}",
- '--dump-logs-on-failure'
- ] + pg_compose_args
-
- commandLine 'bash', '-c', cmd.join(" ")
-}
\ No newline at end of file
diff --git a/docker/profiles/README.md b/docker/profiles/README.md
new file mode 100644
index 00000000000000..df09f15cd85cee
--- /dev/null
+++ b/docker/profiles/README.md
@@ -0,0 +1,104 @@
+# Docker Compose Profiles
+
+This directory contains a set of docker compose definitions designed to run several configurations for both
+quickstart and development use-cases. These configurations cover a few of the wide variety of infrastructure
+configurations that DataHub can run on.
+
+Requirements:
+* Using the profiles requires a modern version of docker.
+* If using the debug/development profiles, you will need to have built the `debug` docker images locally. See the Development Profiles section for more details.
+
+```bash
+$ cd docker/profiles
+$ docker compose --profile <profile-name> up
+```
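+
+For example, to bring up the default `quickstart` profile (substitute any profile name from the tables below):
+
+```bash
+docker compose --profile quickstart up
+```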
+
+Use Control-c (`^c`) to terminate the running system. This will automatically stop all running containers.
+
+To remove the containers use the following:
+
+```bash
+docker compose --profile <profile-name> rm
+```
+
+Please refer to docker's documentation for more details.
+
+The following sections detail a few of the profiles and their intended use-cases. For a complete list of profiles
+and their configuration, please see the table at the end of each section.
+
+## Quickstart Profiles
+
+Quickstart profiles are primarily a way to test drive DataHub features before committing to a production-ready deployment.
+A couple of these profiles are also used in our continuous integration (CI) tests.
+
+Note: Quickstart profiles use docker images with the `head` tag. These images are updated when changes are committed
+to the DataHub GitHub repository. This can be overridden to use a stable release tag by prefixing the commands with,
+for example, `DATAHUB_VERSION=v0.12.1`.
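+
+A minimal example, pinning the default quickstart profile to a released version (the tag shown is illustrative):
+
+```bash
+DATAHUB_VERSION=v0.12.1 docker compose --profile quickstart up
+```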
+
+### `quickstart`
+
+This is the default configuration: MySQL and OpenSearch for storage, with GMS running integrated consumers.
+
+### `quickstart-consumers`
+
+This configuration is identical to `quickstart` except that it runs standalone consumers instead of consumers integrated with the GMS container.
+
+### `quickstart-postgres`
+
+Identical to `quickstart` with Postgres instead of MySQL.
+
+### `quickstart-cassandra`
+
+Uses Cassandra as the primary data store along with Neo4j as the graph database.
+
+### `quickstart-storage`
+
+Runs just the `quickstart` data stores without the DataHub components. This mode is useful for debugging when running the frontend and GMS components outside
+of docker.
+
+### Quickstart Profiles Table
+| Profile Name | MySQL | Postgres | Cassandra | Neo4j | Frontend | GMS | Actions | SystemUpdate | MAE | MCE | Kafka | OpenSearch |
+|----------------------|-------|----------|-----------|-------|----------|-----|---------|--------------|-----|-----|-------|------------|
+| quickstart | X | | | | X | X | X | X | | | X | X |
+| quickstart-frontend | X | | | | X | | | X | | | X | X |
+| quickstart-backend | X | | | | | X | X | X | | | X | X |
+| quickstart-postgres | | X | | | X | X | X | X | | | X | X |
+| quickstart-cassandra | | | X | X | X | X | X | X | | | X | X |
+| quickstart-consumers | X | | | | X | X | X | X | X | X | X | X |
+| quickstart-storage | X | | | | | | | | | | X | X |
+
+## Development Profiles
+
+* Runs `debug` tagged images
+* JVM Debug Mode Enabled
+* Exposes local jars and scripts to the containers
+* Can run non-default one-off configurations (neo4j, cassandra, elasticsearch)
+
+The docker images used are the `debug` images, which are built locally by running the following gradle command:
+
+```bash
+./gradlew dockerTagDebug
+```
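+
+Once the `debug` images are built, a development profile can be started the same way as the quickstart profiles,
+for example:
+
+```bash
+docker compose --profile debug up
+```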
+
+For a complete list of profiles see the table at the end of this section.
+
+### `quickstart-backend`
+
+Runs everything except for the `frontend` component. Useful for running just a local (non-docker) frontend.
+
+### `quickstart-frontend`
+
+Runs everything except for the GMS. Useful for running just a local (non-docker) GMS instance.
+
+### Development Profiles Table
+| Profile Name | MySQL | Postgres | Cassandra | Neo4j | Frontend | GMS | Actions | SystemUpdate | MAE | MCE | Kafka | OpenSearch | Elasticsearch |
+|---------------------|-------|----------|-----------|-------|----------|-----|---------|--------------|-----|-----|-------|------------|---------------|
+| debug | X | | | | X | X | X | X | | | X | X | |
+| debug-frontend | X | | | | X | | | X | | | X | X | |
+| debug-backend | X | | | | | X | X | X | | | X | X | |
+| debug-postgres | | X | | | X | X | X | X | | | X | X | |
+| debug-cassandra | | | X | | X | X | X | X | | | X | X | |
+| debug-consumers | X | | | | X | X | X | X | X | X | X | X | |
+| debug-neo4j | X | | | X | X | X | X | X | | | X | X | |
+| debug-elasticsearch | X | | | | X | X | X | X | | | X | | X |
\ No newline at end of file
diff --git a/docker/profiles/cassandra b/docker/profiles/cassandra
new file mode 120000
index 00000000000000..d9af9adbce5cad
--- /dev/null
+++ b/docker/profiles/cassandra
@@ -0,0 +1 @@
+../cassandra
\ No newline at end of file
diff --git a/docker/profiles/datahub-actions b/docker/profiles/datahub-actions
new file mode 120000
index 00000000000000..fea4275be45ffc
--- /dev/null
+++ b/docker/profiles/datahub-actions
@@ -0,0 +1 @@
+../datahub-actions/
\ No newline at end of file
diff --git a/docker/profiles/datahub-frontend b/docker/profiles/datahub-frontend
new file mode 120000
index 00000000000000..74a18b81b7e3b8
--- /dev/null
+++ b/docker/profiles/datahub-frontend
@@ -0,0 +1 @@
+../datahub-frontend
\ No newline at end of file
diff --git a/docker/profiles/datahub-gms b/docker/profiles/datahub-gms
new file mode 120000
index 00000000000000..de2f067e4c0e0d
--- /dev/null
+++ b/docker/profiles/datahub-gms
@@ -0,0 +1 @@
+../datahub-gms
\ No newline at end of file
diff --git a/docker/profiles/datahub-mae-consumer b/docker/profiles/datahub-mae-consumer
new file mode 120000
index 00000000000000..90974047792c50
--- /dev/null
+++ b/docker/profiles/datahub-mae-consumer
@@ -0,0 +1 @@
+../datahub-mae-consumer
\ No newline at end of file
diff --git a/docker/profiles/datahub-mce-consumer b/docker/profiles/datahub-mce-consumer
new file mode 120000
index 00000000000000..288c9d91c28b3e
--- /dev/null
+++ b/docker/profiles/datahub-mce-consumer
@@ -0,0 +1 @@
+../datahub-mce-consumer
\ No newline at end of file
diff --git a/docker/profiles/datahub-upgrade b/docker/profiles/datahub-upgrade
new file mode 120000
index 00000000000000..8ff77fd5562e7f
--- /dev/null
+++ b/docker/profiles/datahub-upgrade
@@ -0,0 +1 @@
+../datahub-upgrade
\ No newline at end of file
diff --git a/docker/profiles/docker-compose.actions.yml b/docker/profiles/docker-compose.actions.yml
new file mode 100644
index 00000000000000..a509a6a67d2705
--- /dev/null
+++ b/docker/profiles/docker-compose.actions.yml
@@ -0,0 +1,45 @@
+
+x-datahub-actions-service: &datahub-actions-service
+ hostname: actions
+ image: ${DATAHUB_ACTIONS_IMAGE:-acryldata/datahub-actions}:${ACTIONS_VERSION:-head}
+ env_file: datahub-actions/env/docker.env
+ environment:
+ ACTIONS_EXTRA_PACKAGES: ${ACTIONS_EXTRA_PACKAGES:-}
+ ACTIONS_CONFIG: ${ACTIONS_CONFIG:-}
+ KAFKA_BOOTSTRAP_SERVER: kafka-broker:29092
+ SCHEMA_REGISTRY_URL: http://datahub-gms:8080/schema-registry/api/
+
+services:
+ datahub-actions-quickstart:
+ <<: *datahub-actions-service
+ container_name: actions
+ profiles:
+ - quickstart
+ - quickstart-backend
+ depends_on:
+ datahub-gms-quickstart:
+ condition: service_healthy
+ datahub-actions-quickstart-cassandra:
+ <<: *datahub-actions-service
+ container_name: actions
+ profiles:
+ - quickstart-cassandra
+ depends_on:
+ datahub-gms-quickstart-cassandra:
+ condition: service_healthy
+ datahub-actions-quickstart-postgres:
+ <<: *datahub-actions-service
+ container_name: actions
+ profiles:
+ - quickstart-postgres
+ depends_on:
+ datahub-gms-quickstart-postgres:
+ condition: service_healthy
+ datahub-actions-quickstart-consumers:
+ <<: *datahub-actions-service
+ container_name: actions
+ profiles:
+ - quickstart-consumers
+ depends_on:
+ datahub-gms-quickstart-consumers:
+ condition: service_healthy
diff --git a/docker/profiles/docker-compose.frontend.yml b/docker/profiles/docker-compose.frontend.yml
new file mode 100644
index 00000000000000..2b82829648dacb
--- /dev/null
+++ b/docker/profiles/docker-compose.frontend.yml
@@ -0,0 +1,119 @@
+
+x-datahub-frontend-service: &datahub-frontend-service
+ hostname: datahub-frontend-react
+ image: ${DATAHUB_FRONTEND_IMAGE:-linkedin/datahub-frontend-react}:${DATAHUB_VERSION:-head}
+ ports:
+ - ${DATAHUB_MAPPED_FRONTEND_PORT:-9002}:9002
+ env_file: datahub-frontend/env/docker.env
+ environment: &datahub-frontend-service-env
+ KAFKA_BOOTSTRAP_SERVER: kafka-broker:29092
+ volumes:
+ - ${HOME}/.datahub/plugins:/etc/datahub/plugins
+
+x-datahub-frontend-service-dev: &datahub-frontend-service-dev
+ <<: *datahub-frontend-service
+ image: linkedin/datahub-frontend-react:debug
+ ports:
+ - ${DATAHUB_MAPPED_FRONTEND_DEBUG_PORT:-5002}:5002
+ - ${DATAHUB_MAPPED_FRONTEND_PORT:-9002}:9002
+ environment:
+ <<: *datahub-frontend-service-env
+ JAVA_TOOL_OPTIONS: -agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=*:5002
+ DATAHUB_ANALYTICS_ENABLED: ${DATAHUB_ANALYTICS_ENABLED:-true}
+ volumes:
+ - ../../datahub-frontend/build/stage/playBinary:/datahub-frontend
+
+services:
+ frontend-quickstart:
+ <<: *datahub-frontend-service
+ container_name: frontend
+ profiles:
+ - quickstart
+ - quickstart-frontend
+ depends_on:
+ system-update-quickstart:
+ condition: service_completed_successfully
+ frontend-quickstart-cassandra:
+ <<: *datahub-frontend-service
+ container_name: frontend
+ profiles:
+ - quickstart-cassandra
+ depends_on:
+ system-update-quickstart-cassandra:
+ condition: service_completed_successfully
+ frontend-quickstart-postgres:
+ <<: *datahub-frontend-service
+ container_name: frontend
+ profiles:
+ - quickstart-postgres
+ depends_on:
+ system-update-quickstart-postgres:
+ condition: service_completed_successfully
+ frontend-quickstart-consumers:
+ <<: *datahub-frontend-service
+ container_name: frontend
+ profiles:
+ - quickstart-consumers
+ depends_on:
+ system-update-quickstart:
+ condition: service_completed_successfully
+ frontend-debug:
+ <<: *datahub-frontend-service-dev
+ container_name: datahub-frontend-dev
+ profiles:
+ - debug
+ depends_on:
+ system-update-debug:
+ condition: service_completed_successfully
+ frontend-debug-frontend:
+ <<: *datahub-frontend-service-dev
+ container_name: datahub-frontend-dev
+ profiles:
+ - debug-frontend
+ depends_on:
+ mysql-setup-dev:
+ condition: service_completed_successfully
+ opensearch-setup-dev:
+ condition: service_completed_successfully
+ kafka-setup-dev:
+ condition: service_completed_successfully
+ frontend-debug-postgres:
+ <<: *datahub-frontend-service-dev
+ container_name: datahub-frontend-dev
+ profiles:
+ - debug-postgres
+ depends_on:
+ system-update-debug-postgres:
+ condition: service_completed_successfully
+ frontend-debug-cassandra:
+ <<: *datahub-frontend-service-dev
+ container_name: datahub-frontend-dev
+ profiles:
+ - debug-cassandra
+ depends_on:
+ system-update-debug-cassandra:
+ condition: service_completed_successfully
+ frontend-debug-consumers:
+ <<: *datahub-frontend-service-dev
+ container_name: datahub-frontend-dev
+ profiles:
+ - debug-consumers
+ depends_on:
+ system-update-debug:
+ condition: service_completed_successfully
+ frontend-debug-neo4j:
+ <<: *datahub-frontend-service-dev
+ container_name: datahub-frontend-dev
+ profiles:
+ - debug-neo4j
+ depends_on:
+ system-update-debug-neo4j:
+ condition: service_completed_successfully
+ frontend-debug-elasticsearch:
+ <<: *datahub-frontend-service-dev
+ container_name: datahub-frontend-dev
+ profiles:
+ - debug-elasticsearch
+ depends_on:
+ system-update-debug-elasticsearch:
+ condition: service_completed_successfully
\ No newline at end of file
diff --git a/docker/profiles/docker-compose.gms.yml b/docker/profiles/docker-compose.gms.yml
new file mode 100644
index 00000000000000..01602c8b906b91
--- /dev/null
+++ b/docker/profiles/docker-compose.gms.yml
@@ -0,0 +1,429 @@
+#################################
+# Common Environment Variables
+#################################
+x-primary-datastore-mysql-env: &primary-datastore-mysql-env
+ EBEAN_DATASOURCE_HOST: mysql:3306
+ EBEAN_DATASOURCE_URL: 'jdbc:mysql://mysql:3306/datahub?verifyServerCertificate=false&useSSL=true&useUnicode=yes&characterEncoding=UTF-8&enabledTLSProtocols=TLSv1.2'
+ EBEAN_DATASOURCE_DRIVER: com.mysql.jdbc.Driver
+
+x-primary-datastore-postgres-env: &primary-datastore-postgres-env
+ EBEAN_DATASOURCE_HOST: postgres:5432
+ EBEAN_DATASOURCE_URL: 'jdbc:postgresql://postgres:5432/datahub'
+ EBEAN_DATASOURCE_DRIVER: org.postgresql.Driver
+ EBEAN_POSTGRES_USE_AWS_IAM_AUTH: ${EBEAN_POSTGRES_USE_AWS_IAM_AUTH:-false}
+
+x-primary-datastore-cassandra-env: &primary-datastore-cassandra-env
+ CASSANDRA_DATASOURCE_USERNAME: cassandra
+ CASSANDRA_DATASOURCE_PASSWORD: cassandra
+ CASSANDRA_HOSTS: cassandra
+ CASSANDRA_PORT: 9042
+ CASSANDRA_DATASOURCE_HOST: 'cassandra:9042'
+ ENTITY_SERVICE_IMPL: cassandra
+
+x-graph-datastore-neo4j-env: &graph-datastore-neo4j-env
+ GRAPH_SERVICE_IMPL: neo4j
+ NEO4J_HOST: 'http://neo4j:7474'
+ NEO4J_URI: 'bolt://neo4j'
+ NEO4J_USERNAME: neo4j
+ NEO4J_PASSWORD: datahub
+x-graph-datastore-search-env: &graph-datastore-search-env
+ GRAPH_SERVICE_IMPL: elasticsearch
+
+x-search-datastore-elasticsearch-env: &search-datastore-env
+ ELASTICSEARCH_HOST: search
+ ELASTICSEARCH_PORT: 9200
+ ELASTICSEARCH_PROTOCOL: http
+ ELASTICSEARCH_USE_SSL: ${ELASTICSEARCH_USE_SSL:-false}
+
+x-kafka-env: &kafka-env
+ KAFKA_BOOTSTRAP_SERVER: kafka-broker:29092
+ # KAFKA_SCHEMAREGISTRY_URL=http://schema-registry:8081
+ SCHEMA_REGISTRY_TYPE: INTERNAL
+ KAFKA_SCHEMAREGISTRY_URL: http://datahub-gms:8080/schema-registry/api/
+
+x-datahub-quickstart-telemetry-env: &datahub-quickstart-telemetry-env
+ DATAHUB_SERVER_TYPE: ${DATAHUB_SERVER_TYPE:-quickstart}
+ DATAHUB_TELEMETRY_ENABLED: ${DATAHUB_TELEMETRY_ENABLED:-true}
+
+x-datahub-dev-telemetry-env: &datahub-dev-telemetry-env
+ DATAHUB_SERVER_TYPE: ${DATAHUB_SERVER_TYPE:-dev}
+ DATAHUB_TELEMETRY_ENABLED: ${DATAHUB_TELEMETRY_ENABLED:-true}
+
+#################################
+# System Update
+#################################
+x-datahub-system-update-service: &datahub-system-update-service
+ hostname: datahub-system-update
+ image: ${DATAHUB_UPGRADE_IMAGE:-acryldata/datahub-upgrade}:${DATAHUB_VERSION:-head}
+ command:
+ - -u
+ - SystemUpdate
+ env_file: datahub-upgrade/env/docker.env
+ environment: &datahub-system-update-env
+ <<: [*primary-datastore-mysql-env, *graph-datastore-search-env, *search-datastore-env, *kafka-env]
+ SCHEMA_REGISTRY_SYSTEM_UPDATE: ${SCHEMA_REGISTRY_SYSTEM_UPDATE:-true}
+ SPRING_KAFKA_PROPERTIES_AUTO_REGISTER_SCHEMAS: ${SPRING_KAFKA_PROPERTIES_AUTO_REGISTER_SCHEMAS:-true}
+ SPRING_KAFKA_PROPERTIES_USE_LATEST_VERSION: ${SPRING_KAFKA_PROPERTIES_USE_LATEST_VERSION:-true}
+
+x-datahub-system-update-service-dev: &datahub-system-update-service-dev
+ <<: *datahub-system-update-service
+ image: ${DATAHUB_UPGRADE_IMAGE:-acryldata/datahub-upgrade}:debug
+ ports:
+ - ${DATAHUB_MAPPED_UPGRADE_DEBUG_PORT:-5003}:5003
+ environment: &datahub-system-update-dev-env
+ <<: [*datahub-dev-telemetry-env, *datahub-system-update-env]
+ SKIP_ELASTICSEARCH_CHECK: false
+ REPROCESS_DEFAULT_BROWSE_PATHS_V2: ${REPROCESS_DEFAULT_BROWSE_PATHS_V2:-false}
+ JAVA_TOOL_OPTIONS: '-agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=*:5003'
+ volumes:
+ - ../../datahub-upgrade/build/libs/:/datahub/datahub-upgrade/bin/
+ - ../../metadata-models/src/main/resources/:/datahub/datahub-gms/resources
+ - ${HOME}/.datahub/plugins:/etc/datahub/plugins
+
+#################################
+# GMS
+#################################
+x-datahub-gms-service: &datahub-gms-service
+ hostname: datahub-gms
+ image: ${DATAHUB_GMS_IMAGE:-linkedin/datahub-gms}:${DATAHUB_VERSION:-head}
+ ports:
+ - ${DATAHUB_MAPPED_GMS_PORT:-8080}:8080
+ env_file: datahub-gms/env/docker.env
+ environment: &datahub-gms-env
+ <<: [*primary-datastore-mysql-env, *graph-datastore-search-env, *search-datastore-env, *datahub-quickstart-telemetry-env, *kafka-env]
+ healthcheck:
+ test: curl -sS --fail http://datahub-gms:${DATAHUB_GMS_PORT:-8080}/health
+ start_period: 90s
+ interval: 1s
+ retries: 3
+ timeout: 5s
+ volumes:
+ - ${HOME}/.datahub/plugins:/etc/datahub/plugins
+
+x-datahub-gms-service-dev: &datahub-gms-service-dev
+ <<: *datahub-gms-service
+ image: ${DATAHUB_GMS_IMAGE:-linkedin/datahub-gms}:debug
+ ports:
+ - ${DATAHUB_MAPPED_GMS_DEBUG_PORT:-5001}:5001
+ - ${DATAHUB_MAPPED_GMS_PORT:-8080}:8080
+ environment: &datahub-gms-dev-env
+ <<: [*datahub-dev-telemetry-env, *datahub-gms-env]
+ SKIP_ELASTICSEARCH_CHECK: false
+ METADATA_SERVICE_AUTH_ENABLED: false
+ JAVA_TOOL_OPTIONS: '-agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=*:5001'
+ BOOTSTRAP_SYSTEM_UPDATE_WAIT_FOR_SYSTEM_UPDATE: false
+ SEARCH_SERVICE_ENABLE_CACHE: false
+ LINEAGE_SEARCH_CACHE_ENABLED: false
+ SHOW_BROWSE_V2: true
+ volumes:
+ - ./datahub-gms/start.sh:/datahub/datahub-gms/scripts/start.sh
+ - ./datahub-gms/jetty.xml:/datahub/datahub-gms/scripts/jetty.xml
+ - ./monitoring/client-prometheus-config.yaml:/datahub/datahub-gms/scripts/prometheus-config.yaml
+ - ../../metadata-models/src/main/resources/:/datahub/datahub-gms/resources
+ - ../../metadata-service/war/build/libs/:/datahub/datahub-gms/bin
+ - ${HOME}/.datahub/plugins:/etc/datahub/plugins
+
+#################################
+# MAE Consumer
+#################################
+x-datahub-mae-consumer-service: &datahub-mae-consumer-service
+ hostname: datahub-mae-consumer
+ image: ${DATAHUB_MAE_CONSUMER_IMAGE:-linkedin/datahub-mae-consumer}:${DATAHUB_VERSION:-head}
+ ports:
+ - 9091:9091
+ env_file: datahub-mae-consumer/env/docker.env
+ environment: &datahub-mae-consumer-env
+ <<: [*primary-datastore-mysql-env, *graph-datastore-search-env, *search-datastore-env, *kafka-env]
+
+x-datahub-mae-consumer-service-dev: &datahub-mae-consumer-service-dev
+ <<: *datahub-mae-consumer-service
+ image: ${DATAHUB_MAE_CONSUMER_IMAGE:-linkedin/datahub-mae-consumer}:debug
+ environment:
+ <<: [*datahub-dev-telemetry-env, *datahub-mae-consumer-env]
+ volumes:
+ - ./datahub-mae-consumer/start.sh:/datahub/datahub-mae-consumer/scripts/start.sh
+ - ../../metadata-models/src/main/resources/:/datahub/datahub-mae-consumer/resources
+ - ../../metadata-jobs/mae-consumer-job/build/libs/:/datahub/datahub-mae-consumer/bin/
+ - ./monitoring/client-prometheus-config.yaml:/datahub/datahub-mae-consumer/scripts/prometheus-config.yaml
+
+#################################
+# MCE Consumer
+#################################
+x-datahub-mce-consumer-service: &datahub-mce-consumer-service
+ hostname: datahub-mce-consumer
+ image: ${DATAHUB_MCE_CONSUMER_IMAGE:-linkedin/datahub-mce-consumer}:${DATAHUB_VERSION:-head}
+ ports:
+ - 9090:9090
+ env_file: datahub-mce-consumer/env/docker.env
+ environment: &datahub-mce-consumer-env
+ <<: [*primary-datastore-mysql-env, *graph-datastore-search-env, *search-datastore-env, *datahub-quickstart-telemetry-env, *kafka-env]
+
+x-datahub-mce-consumer-service-dev: &datahub-mce-consumer-service-dev
+ <<: *datahub-mce-consumer-service
+ image: ${DATAHUB_MCE_CONSUMER_IMAGE:-linkedin/datahub-mce-consumer}:debug
+ environment:
+ <<: [*datahub-dev-telemetry-env, *datahub-mce-consumer-env]
+ volumes:
+ - ./datahub-mce-consumer/start.sh:/datahub/datahub-mce-consumer/scripts/start.sh
+ - ../../metadata-jobs/mce-consumer-job/build/libs/:/datahub/datahub-mce-consumer/bin
+ - ./monitoring/client-prometheus-config.yaml:/datahub/datahub-mce-consumer/scripts/prometheus-config.yaml
+
+services:
+ #################################
+ # System Update
+ #################################
+ system-update-quickstart:
+ <<: *datahub-system-update-service
+ container_name: system-update
+ profiles:
+ - quickstart
+ - quickstart-storage
+ - quickstart-consumers
+ - quickstart-frontend
+ - quickstart-backend
+ depends_on:
+ mysql-setup:
+ condition: service_completed_successfully
+ opensearch-setup:
+ condition: service_completed_successfully
+ kafka-setup:
+ condition: service_completed_successfully
+ system-update-quickstart-cassandra:
+ <<: *datahub-system-update-service
+ container_name: system-update
+ profiles:
+ - quickstart-cassandra
+ environment:
+ <<: [*primary-datastore-cassandra-env, *graph-datastore-neo4j-env, *datahub-system-update-env]
+ depends_on:
+ neo4j:
+ condition: service_healthy
+ cassandra-setup:
+ condition: service_completed_successfully
+ opensearch-setup:
+ condition: service_completed_successfully
+ kafka-setup:
+ condition: service_completed_successfully
+ system-update-quickstart-postgres:
+ <<: *datahub-system-update-service
+ container_name: system-update
+ profiles:
+ - quickstart-postgres
+ environment:
+ <<: [*primary-datastore-postgres-env, *datahub-system-update-env]
+ depends_on:
+ postgres-setup:
+ condition: service_completed_successfully
+ opensearch-setup:
+ condition: service_completed_successfully
+ kafka-setup:
+ condition: service_completed_successfully
+ system-update-debug:
+ <<: *datahub-system-update-service-dev
+ container_name: system-update-dev
+ profiles:
+ - debug
+ - debug-backend
+ - debug-consumers
+ depends_on:
+ mysql-setup-dev:
+ condition: service_completed_successfully
+ opensearch-setup-dev:
+ condition: service_completed_successfully
+ kafka-setup-dev:
+ condition: service_completed_successfully
+ system-update-debug-elasticsearch:
+ <<: *datahub-system-update-service-dev
+ container_name: system-update-dev
+ profiles:
+ - debug-elasticsearch
+ depends_on:
+ mysql-setup-dev:
+ condition: service_completed_successfully
+ elasticsearch-setup-dev:
+ condition: service_completed_successfully
+ kafka-setup-dev:
+ condition: service_completed_successfully
+ system-update-debug-postgres:
+ <<: *datahub-system-update-service-dev
+ container_name: system-update-dev
+ profiles:
+ - debug-postgres
+ environment:
+ <<: [*primary-datastore-postgres-env, *datahub-system-update-dev-env]
+ depends_on:
+ postgres-setup-dev:
+ condition: service_completed_successfully
+ opensearch-setup-dev:
+ condition: service_completed_successfully
+ kafka-setup-dev:
+ condition: service_completed_successfully
+ system-update-debug-cassandra:
+ <<: *datahub-system-update-service-dev
+ container_name: system-update-dev
+ profiles:
+ - debug-cassandra
+ environment:
+ <<: [*primary-datastore-cassandra-env, *datahub-system-update-dev-env]
+ depends_on:
+ cassandra-setup:
+ condition: service_completed_successfully
+ opensearch-setup-dev:
+ condition: service_completed_successfully
+ kafka-setup-dev:
+ condition: service_completed_successfully
+ system-update-debug-neo4j:
+ <<: *datahub-system-update-service-dev
+ container_name: system-update-dev
+ profiles:
+ - debug-neo4j
+ environment:
+ <<: [*graph-datastore-neo4j-env, *datahub-system-update-dev-env]
+ depends_on:
+ neo4j:
+ condition: service_healthy
+ opensearch-setup-dev:
+ condition: service_completed_successfully
+ kafka-setup-dev:
+ condition: service_completed_successfully
+ #################################
+ # GMS
+ #################################
+ datahub-gms-quickstart:
+ <<: *datahub-gms-service
+ profiles:
+ - quickstart
+ - quickstart-backend
+ container_name: datahub-gms
+ depends_on:
+ system-update-quickstart:
+ condition: service_completed_successfully
+ datahub-gms-quickstart-cassandra:
+ <<: *datahub-gms-service
+ profiles:
+ - quickstart-cassandra
+ container_name: datahub-gms
+ environment:
+ <<: [*primary-datastore-cassandra-env, *graph-datastore-neo4j-env, *datahub-gms-env]
+ depends_on:
+ system-update-quickstart-cassandra:
+ condition: service_completed_successfully
+ datahub-gms-quickstart-postgres:
+ <<: *datahub-gms-service
+ profiles:
+ - quickstart-postgres
+ container_name: datahub-gms
+ environment:
+ <<: [*primary-datastore-postgres-env, *datahub-gms-env]
+ depends_on:
+ system-update-quickstart-postgres:
+ condition: service_completed_successfully
+ datahub-gms-quickstart-consumers:
+ <<: *datahub-gms-service
+ profiles:
+ - quickstart-consumers
+ container_name: datahub-gms
+ environment:
+ <<: *datahub-gms-env
+ MAE_CONSUMER_ENABLED: false
+ MCE_CONSUMER_ENABLED: false
+ depends_on:
+ system-update-quickstart:
+ condition: service_completed_successfully
+ datahub-gms-debug:
+ <<: *datahub-gms-service-dev
+ profiles:
+ - debug
+ - debug-backend
+ container_name: datahub-gms-dev
+ depends_on:
+ system-update-debug:
+ condition: service_completed_successfully
+ datahub-gms-debug-postgres:
+ <<: *datahub-gms-service-dev
+ profiles:
+ - debug-postgres
+ environment:
+ <<: [*primary-datastore-postgres-env, *datahub-gms-dev-env]
+ container_name: datahub-gms-dev
+ depends_on:
+ system-update-debug-postgres:
+ condition: service_completed_successfully
+ datahub-gms-debug-cassandra:
+ <<: *datahub-gms-service-dev
+ profiles:
+ - debug-cassandra
+ environment:
+ <<: [*primary-datastore-cassandra-env, *datahub-gms-dev-env]
+ container_name: datahub-gms-dev
+ depends_on:
+ system-update-debug-cassandra:
+ condition: service_completed_successfully
+ datahub-gms-debug-consumers:
+ <<: *datahub-gms-service-dev
+ profiles:
+ - debug-consumers
+ environment:
+ <<: *datahub-gms-dev-env
+ MAE_CONSUMER_ENABLED: false
+ MCE_CONSUMER_ENABLED: false
+ container_name: datahub-gms-dev
+ depends_on:
+ system-update-debug:
+ condition: service_completed_successfully
+ datahub-gms-debug-neo4j:
+ <<: *datahub-gms-service-dev
+ profiles:
+ - debug-neo4j
+ environment:
+ <<: [*graph-datastore-neo4j-env, *datahub-gms-dev-env]
+ container_name: datahub-gms-dev
+ depends_on:
+ system-update-debug-neo4j:
+ condition: service_completed_successfully
+ datahub-gms-debug-elasticsearch:
+ <<: *datahub-gms-service-dev
+ profiles:
+ - debug-elasticsearch
+ container_name: datahub-gms-dev
+ depends_on:
+ system-update-debug-elasticsearch:
+ condition: service_completed_successfully
+ #################################
+ # MAE Consumer
+ #################################
+ datahub-mae-consumer-quickstart-consumers:
+ <<: *datahub-mae-consumer-service
+ profiles:
+ - quickstart-consumers
+ container_name: datahub-mae-consumer
+ depends_on:
+ datahub-gms-quickstart-consumers:
+ condition: service_healthy
+ datahub-mae-consumer-quickstart-consumers-dev:
+ <<: *datahub-mae-consumer-service-dev
+ profiles:
+ - debug-consumers
+ container_name: datahub-mae-consumer-dev
+ depends_on:
+ datahub-gms-debug-consumers:
+ condition: service_healthy
+ #################################
+ # MCE Consumer
+ #################################
+ datahub-mce-consumer-quickstart-consumers:
+ <<: *datahub-mce-consumer-service
+ profiles:
+ - quickstart-consumers
+ container_name: datahub-mce-consumer
+ depends_on:
+ datahub-gms-quickstart-consumers:
+ condition: service_healthy
+ datahub-mce-consumer-quickstart-consumers-dev:
+ <<: *datahub-mce-consumer-service-dev
+ profiles:
+ - debug-consumers
+ container_name: datahub-mce-consumer-dev
+ depends_on:
+ datahub-gms-debug-consumers:
+ condition: service_healthy
\ No newline at end of file
diff --git a/docker/profiles/docker-compose.prerequisites.yml b/docker/profiles/docker-compose.prerequisites.yml
new file mode 100644
index 00000000000000..d90d4a252f9935
--- /dev/null
+++ b/docker/profiles/docker-compose.prerequisites.yml
@@ -0,0 +1,387 @@
+# Common environment
+x-search-datastore-search: &search-datastore-environment
+ ELASTICSEARCH_HOST: search
+ ELASTICSEARCH_PORT: 9200
+ ELASTICSEARCH_PROTOCOL: http
+ ELASTICSEARCH_USE_SSL: ${ELASTICSEARCH_USE_SSL:-false}
+
+# Primary Storage Profiles
+x-mysql-profiles-quickstart: &mysql-profiles-quickstart
+ - quickstart
+ - quickstart-backend
+ - quickstart-frontend
+ - quickstart-storage
+ - quickstart-consumers
+x-mysql-profiles-dev: &mysql-profiles-dev
+ - debug
+ - debug-frontend
+ - debug-backend
+ - debug-consumers
+ - debug-neo4j
+ - debug-elasticsearch
+x-mysql-profiles: &mysql-profiles
+ - quickstart
+ - quickstart-backend
+ - quickstart-frontend
+ - quickstart-storage
+ - quickstart-consumers
+ - debug
+ - debug-frontend
+ - debug-backend
+ - debug-consumers
+ - debug-neo4j
+ - debug-elasticsearch
+
+x-postgres-profiles-quickstart: &postgres-profiles-quickstart
+ - quickstart-postgres
+x-postgres-profiles-dev: &postgres-profiles-dev
+ - debug-postgres
+x-postgres-profiles: &postgres-profiles
+ - quickstart-postgres
+ - debug-postgres
+
+x-cassandra-profiles: &cassandra-profiles
+ - quickstart-cassandra
+ - debug-cassandra
+
+# Graph Storage Profiles
+x-neo4j-profiles: &neo4j-profiles
+ - quickstart-cassandra
+ - debug-neo4j
+
+# Search Storage Profiles
+x-elasticsearch-profiles: &elasticsearch-profiles
+ - debug-elasticsearch
+
+x-opensearch-profiles-quickstart: &opensearch-profiles-quickstart
+ - quickstart
+ - quickstart-backend
+ - quickstart-frontend
+ - quickstart-storage
+ - quickstart-cassandra
+ - quickstart-postgres
+ - quickstart-consumers
+x-opensearch-profiles-dev: &opensearch-profiles-dev
+ - debug
+ - debug-frontend
+ - debug-backend
+ - debug-postgres
+ - debug-cassandra
+ - debug-consumers
+ - debug-neo4j
+x-opensearch-profiles: &opensearch-profiles
+ - quickstart
+ - quickstart-backend
+ - quickstart-frontend
+ - quickstart-storage
+ - quickstart-cassandra
+ - quickstart-postgres
+ - quickstart-consumers
+ - debug
+ - debug-frontend
+ - debug-backend
+ - debug-postgres
+ - debug-cassandra
+ - debug-consumers
+ - debug-neo4j
+
+# Debug vs Quickstart Profiles
+x-profiles-quickstart: &profiles-quickstart
+ - quickstart
+ - quickstart-backend
+ - quickstart-frontend
+ - quickstart-storage
+ - quickstart-cassandra
+ - quickstart-postgres
+ - quickstart-consumers
+x-profiles-dev: &profiles-dev
+ - debug
+ - debug-frontend
+ - debug-backend
+ - debug-postgres
+ - debug-cassandra
+ - debug-consumers
+ - debug-neo4j
+ - debug-elasticsearch
+
+services:
+ mysql:
+ container_name: mysql
+ profiles: *mysql-profiles
+ hostname: mysql
+ image: mysql:${DATAHUB_MYSQL_VERSION:-8.2}
+ command: --character-set-server=utf8mb4 --collation-server=utf8mb4_bin --default-authentication-plugin=caching_sha2_password
+ ports:
+ - ${DATAHUB_MAPPED_MYSQL_PORT:-3306}:3306
+ env_file: mysql/env/docker.env
+ restart: on-failure
+ healthcheck:
+ test: mysqladmin ping -h mysql -u $$MYSQL_USER --password=$$MYSQL_PASSWORD
+ start_period: 10s
+ interval: 1s
+ retries: 3
+ timeout: 5s
+ volumes:
+ - ./mysql/init.sql:/docker-entrypoint-initdb.d/init.sql
+ - mysqldata:/var/lib/mysql
+ mysql-setup: &mysql-setup
+ container_name: mysql-setup
+ profiles: *mysql-profiles-quickstart
+ hostname: mysql-setup
+ image: ${DATAHUB_MYSQL_SETUP_IMAGE:-acryldata/datahub-mysql-setup}:${DATAHUB_VERSION:-head}
+ env_file: mysql-setup/env/docker.env
+ depends_on:
+ mysql:
+ condition: service_healthy
+ labels:
+ datahub_setup_job: true
+ mysql-setup-dev:
+ <<: *mysql-setup
+ container_name: mysql-setup-dev
+ profiles: *mysql-profiles-dev
+ image: ${DATAHUB_MYSQL_SETUP_IMAGE:-acryldata/datahub-mysql-setup}:debug
+ postgres:
+ container_name: postgres
+ profiles: *postgres-profiles
+ hostname: postgres
+ image: postgres:${DATAHUB_POSTGRES_VERSION:-15.5}
+ env_file: postgres/env/docker.env
+ ports:
+ - '5432:5432'
+ restart: on-failure
+ healthcheck:
+ test: [ "CMD-SHELL", "pg_isready" ]
+ start_period: 20s
+ interval: 2s
+ timeout: 10s
+ retries: 5
+ volumes:
+ - ./postgres/init.sql:/docker-entrypoint-initdb.d/init.sql
+ - postgresdata:/var/lib/postgresql/data
+ postgres-setup: &postgres-setup
+ container_name: postgres-setup
+ profiles: *postgres-profiles-quickstart
+ hostname: postgres-setup
+ image: ${DATAHUB_POSTGRES_SETUP_IMAGE:-acryldata/datahub-postgres-setup}:${DATAHUB_VERSION:-head}
+ env_file: postgres-setup/env/docker.env
+ depends_on:
+ postgres:
+ condition: service_healthy
+ labels:
+ datahub_setup_job: true
+ postgres-setup-dev:
+ <<: *postgres-setup
+ container_name: postgres-setup-dev
+ profiles: *postgres-profiles-dev
+ image: ${DATAHUB_POSTGRES_SETUP_IMAGE:-acryldata/datahub-postgres-setup}:debug
+ cassandra:
+ container_name: cassandra
+ profiles: *cassandra-profiles
+ hostname: cassandra
+ image: cassandra:4.1
+ ports:
+ - 9042:9042
+ healthcheck:
+ test: cqlsh -u cassandra -p cassandra -e 'describe keyspaces'
+ interval: 15s
+ timeout: 10s
+ retries: 10
+ volumes:
+ - cassandradata:/var/lib/cassandra
+ cassandra-setup:
+ container_name: cassandra-setup
+ profiles: *cassandra-profiles
+ hostname: cassandra-setup
+ image: cassandra:4.1
+ command: /bin/bash -c "cqlsh cassandra -f /init.cql"
+ depends_on:
+ cassandra:
+ condition: service_healthy
+ volumes:
+ - ./cassandra/init.cql:/init.cql
+ labels:
+ datahub_setup_job: true
+ neo4j:
+ container_name: neo4j
+ profiles: *neo4j-profiles
+ hostname: neo4j
+ image: neo4j:4.4.28-community
+ ports:
+ - ${DATAHUB_MAPPED_NEO4J_HTTP_PORT:-7474}:7474
+ - ${DATAHUB_MAPPED_NEO4J_BOLT_PORT:-7687}:7687
+ env_file: neo4j/env/docker.env
+ healthcheck:
+ test: wget http://neo4j:$${DATAHUB_NEO4J_HTTP_PORT:-7474}
+ start_period: 5s
+ interval: 1s
+ retries: 5
+ timeout: 5s
+ volumes:
+ - neo4jdata:/data
+ kafka-broker:
+ container_name: kafka-broker
+ hostname: kafka-broker
+ image: confluentinc/cp-kafka:7.4.0
+ command:
+ - /bin/bash
+ - -c
+ - |
+ # Generate KRaft clusterID
+ file_path="/var/lib/kafka/data/clusterID"
+
+ if [ ! -f "$$file_path" ]; then
+ /bin/kafka-storage random-uuid > $$file_path
+ echo "Cluster id has been created..."
+ # KRaft required step: Format the storage directory with a new cluster ID
+ kafka-storage format --ignore-formatted -t $$(cat "$$file_path") -c /etc/kafka/kafka.properties
+ fi
+
+ export CLUSTER_ID=$$(cat "$$file_path")
+ echo "CLUSTER_ID=$$CLUSTER_ID"
+
+ /etc/confluent/docker/run
+ ports:
+ - ${DATAHUB_MAPPED_KAFKA_BROKER_PORT:-9092}:9092
+ env_file: kafka-broker/env/docker.env
+ environment:
+ KAFKA_NODE_ID: 1
+ KAFKA_ADVERTISED_LISTENERS: BROKER://kafka-broker:29092,EXTERNAL://kafka-broker:9092
+ KAFKA_LISTENERS: BROKER://kafka-broker:29092,EXTERNAL://kafka-broker:9092,CONTROLLER://kafka-broker:39092
+ KAFKA_INTER_BROKER_LISTENER_NAME: BROKER
+ KAFKA_CONTROLLER_LISTENER_NAMES: CONTROLLER
+ KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: CONTROLLER:PLAINTEXT,BROKER:PLAINTEXT,EXTERNAL:PLAINTEXT
+ KAFKA_PROCESS_ROLES: controller, broker
+ KAFKA_CONTROLLER_QUORUM_VOTERS: 1@kafka-broker:39092
+ # https://github.com/confluentinc/cp-all-in-one/issues/120
+ KAFKA_LOG4J_LOGGERS: 'org.apache.kafka.image.loader.MetadataLoader=WARN'
+ KAFKA_ZOOKEEPER_CONNECT: null
+ healthcheck:
+ test: nc -z kafka-broker $${DATAHUB_KAFKA_BROKER_PORT:-9092}
+ start_period: 60s
+ interval: 1s
+ retries: 5
+ timeout: 5s
+ volumes:
+ - broker:/var/lib/kafka/data/
+ kafka-setup: &kafka-setup
+ container_name: kafka-setup
+ profiles: *profiles-quickstart
+ hostname: kafka-setup
+ image: ${DATAHUB_KAFKA_SETUP_IMAGE:-linkedin/datahub-kafka-setup}:${DATAHUB_VERSION:-head}
+ env_file: kafka-setup/env/docker.env
+ environment: &kafka-setup-env
+ DATAHUB_PRECREATE_TOPICS: ${DATAHUB_PRECREATE_TOPICS:-false}
+ KAFKA_BOOTSTRAP_SERVER: kafka-broker:29092
+ USE_CONFLUENT_SCHEMA_REGISTRY: false
+ depends_on:
+ kafka-broker:
+ condition: service_healthy
+ labels:
+ datahub_setup_job: true
+ kafka-setup-dev:
+ <<: *kafka-setup
+ container_name: kafka-setup-dev
+ profiles: *profiles-dev
+ environment:
+ <<: *kafka-setup-env
+ DATAHUB_PRECREATE_TOPICS: ${DATAHUB_PRECREATE_TOPICS:-true}
+ image: ${DATAHUB_KAFKA_SETUP_IMAGE:-linkedin/datahub-kafka-setup}:debug
+ elasticsearch:
+ container_name: elasticsearch
+ profiles: *elasticsearch-profiles
+ hostname: search
+ image: ${DATAHUB_SEARCH_IMAGE:-elasticsearch}:${DATAHUB_SEARCH_TAG:-7.10.1}
+ ports:
+ - ${DATAHUB_MAPPED_ELASTIC_PORT:-9200}:9200
+ env_file: elasticsearch/env/docker.env
+ environment:
+ - discovery.type=single-node
+ - ${XPACK_SECURITY_ENABLED:-xpack.security.enabled=false}
+ deploy:
+ resources:
+ limits:
+ memory: 1G
+ healthcheck:
+ test: curl -sS --fail http://search:$${DATAHUB_ELASTIC_PORT:-9200}/_cluster/health?wait_for_status=yellow&timeout=0s
+ start_period: 20s
+ interval: 1s
+ retries: 3
+ timeout: 5s
+ volumes:
+ - esdata:/usr/share/elasticsearch/data
+ elasticsearch-setup-dev: &elasticsearch-setup-dev
+ container_name: elasticsearch-setup-dev
+ image: ${DATAHUB_ELASTIC_SETUP_IMAGE:-linkedin/datahub-elasticsearch-setup}:debug
+ profiles: *elasticsearch-profiles
+ hostname: elasticsearch-setup
+ env_file: elasticsearch-setup/env/docker.env
+ environment:
+ <<: *search-datastore-environment
+ USE_AWS_ELASTICSEARCH: ${USE_AWS_ELASTICSEARCH:-false}
+ depends_on:
+ elasticsearch:
+ condition: service_healthy
+ labels:
+ datahub_setup_job: true
+ opensearch:
+ container_name: opensearch
+ profiles: *opensearch-profiles
+ hostname: search
+ image: ${DATAHUB_SEARCH_IMAGE:-opensearchproject/opensearch}:${DATAHUB_SEARCH_TAG:-2.9.0}
+ ports:
+ - ${DATAHUB_MAPPED_ELASTIC_PORT:-9200}:9200
+ env_file: elasticsearch/env/docker.env
+ environment:
+ - discovery.type=single-node
+ - ${XPACK_SECURITY_ENABLED:-plugins.security.disabled=true}
+ deploy:
+ resources:
+ limits:
+ memory: 1G
+ healthcheck:
+ test: curl -sS --fail http://search:$${DATAHUB_ELASTIC_PORT:-9200}/_cluster/health?wait_for_status=yellow&timeout=0s
+ start_period: 20s
+ interval: 1s
+ retries: 3
+ timeout: 5s
+ volumes:
+ - osdata:/usr/share/elasticsearch/data
+ opensearch-setup: &opensearch-setup
+ <<: *elasticsearch-setup-dev
+ container_name: opensearch-setup
+ profiles: *opensearch-profiles-quickstart
+ hostname: opensearch-setup
+ image: ${DATAHUB_ELASTIC_SETUP_IMAGE:-linkedin/datahub-elasticsearch-setup}:${DATAHUB_VERSION:-head}
+ environment:
+ <<: *search-datastore-environment
+ USE_AWS_ELASTICSEARCH: ${USE_AWS_ELASTICSEARCH:-true}
+ depends_on:
+ opensearch:
+ condition: service_healthy
+ labels:
+ datahub_setup_job: true
+ opensearch-setup-dev:
+ <<: *opensearch-setup
+ container_name: opensearch-setup-dev
+ profiles: *opensearch-profiles-dev
+ hostname: opensearch-setup-dev
+ image: ${DATAHUB_ELASTIC_SETUP_IMAGE:-linkedin/datahub-elasticsearch-setup}:debug
+ environment:
+ <<: *search-datastore-environment
+ USE_AWS_ELASTICSEARCH: ${USE_AWS_ELASTICSEARCH:-true}
+ depends_on:
+ opensearch:
+ condition: service_healthy
+
+networks:
+ default:
+ name: datahub_network
+
+volumes:
+ neo4jdata:
+ esdata:
+ osdata:
+ broker:
+ mysqldata:
+ cassandradata:
+ postgresdata:
diff --git a/docker/profiles/docker-compose.yml b/docker/profiles/docker-compose.yml
new file mode 100644
index 00000000000000..534ca9702e2d79
--- /dev/null
+++ b/docker/profiles/docker-compose.yml
@@ -0,0 +1,13 @@
+---
+version: '3.9'
+name: datahub
+
+include:
+ # Contains storage layers: i.e. mysql, kafka, elasticsearch
+ - docker-compose.prerequisites.yml
+ # Actions pod
+ - docker-compose.actions.yml
+ # Frontend
+ - docker-compose.frontend.yml
+ # Remaining components: i.e. gms, system-update, consumers
+ - docker-compose.gms.yml
diff --git a/docker/profiles/elasticsearch b/docker/profiles/elasticsearch
new file mode 120000
index 00000000000000..7712783b3e8d64
--- /dev/null
+++ b/docker/profiles/elasticsearch
@@ -0,0 +1 @@
+../elasticsearch
\ No newline at end of file
diff --git a/docker/profiles/elasticsearch-setup b/docker/profiles/elasticsearch-setup
new file mode 120000
index 00000000000000..670a10e8c37865
--- /dev/null
+++ b/docker/profiles/elasticsearch-setup
@@ -0,0 +1 @@
+../elasticsearch-setup
\ No newline at end of file
diff --git a/docker/profiles/kafka-broker b/docker/profiles/kafka-broker
new file mode 120000
index 00000000000000..23b248a4e0bbd4
--- /dev/null
+++ b/docker/profiles/kafka-broker
@@ -0,0 +1 @@
+../broker
\ No newline at end of file
diff --git a/docker/profiles/kafka-setup b/docker/profiles/kafka-setup
new file mode 120000
index 00000000000000..35b9c167ac26e9
--- /dev/null
+++ b/docker/profiles/kafka-setup
@@ -0,0 +1 @@
+../kafka-setup
\ No newline at end of file
diff --git a/docker/profiles/monitoring b/docker/profiles/monitoring
new file mode 120000
index 00000000000000..1371b42ae4593c
--- /dev/null
+++ b/docker/profiles/monitoring
@@ -0,0 +1 @@
+../monitoring
\ No newline at end of file
diff --git a/docker/profiles/mysql b/docker/profiles/mysql
new file mode 120000
index 00000000000000..057b59f7601652
--- /dev/null
+++ b/docker/profiles/mysql
@@ -0,0 +1 @@
+../mysql
\ No newline at end of file
diff --git a/docker/profiles/mysql-setup b/docker/profiles/mysql-setup
new file mode 120000
index 00000000000000..f9199ec3fc58f0
--- /dev/null
+++ b/docker/profiles/mysql-setup
@@ -0,0 +1 @@
+../mysql-setup
\ No newline at end of file
diff --git a/docker/profiles/neo4j b/docker/profiles/neo4j
new file mode 120000
index 00000000000000..0d4849d989d436
--- /dev/null
+++ b/docker/profiles/neo4j
@@ -0,0 +1 @@
+../neo4j
\ No newline at end of file
diff --git a/docker/profiles/postgres b/docker/profiles/postgres
new file mode 120000
index 00000000000000..be56a57bd0ab8f
--- /dev/null
+++ b/docker/profiles/postgres
@@ -0,0 +1 @@
+../postgres
\ No newline at end of file
diff --git a/docker/profiles/postgres-setup b/docker/profiles/postgres-setup
new file mode 120000
index 00000000000000..38f51721feacb9
--- /dev/null
+++ b/docker/profiles/postgres-setup
@@ -0,0 +1 @@
+../postgres-setup/
\ No newline at end of file
From a29fce9d823dee31480e2efee1dc1bf16fd4c739 Mon Sep 17 00:00:00 2001
From: Nate Bryant
Date: Tue, 19 Dec 2023 15:08:55 -0500
Subject: [PATCH 046/540] Adds urnBasedPagination option to datahub-upgrade
RestoreIndices (#9232)
Co-authored-by: RyanHolstien
---
.../restoreindices/RestoreIndices.java | 1 +
.../upgrade/restoreindices/SendMAEStep.java | 62 ++++++++++++++++---
docker/datahub-upgrade/README.md | 12 +++-
.../metadata/entity/EntityServiceImpl.java | 2 +
.../metadata/entity/ebean/EbeanAspectDao.java | 22 ++++++-
.../restoreindices/RestoreIndicesArgs.java | 8 +++
.../restoreindices/RestoreIndicesResult.java | 2 +
7 files changed, 96 insertions(+), 13 deletions(-)
diff --git a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/restoreindices/RestoreIndices.java b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/restoreindices/RestoreIndices.java
index d38685553dff2f..f46bb9b05624db 100644
--- a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/restoreindices/RestoreIndices.java
+++ b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/restoreindices/RestoreIndices.java
@@ -24,6 +24,7 @@ public class RestoreIndices implements Upgrade {
public static final String WRITER_POOL_SIZE = "WRITER_POOL_SIZE";
public static final String URN_ARG_NAME = "urn";
public static final String URN_LIKE_ARG_NAME = "urnLike";
+ public static final String URN_BASED_PAGINATION_ARG_NAME = "urnBasedPagination";
public static final String STARTING_OFFSET_ARG_NAME = "startingOffset";
diff --git a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/restoreindices/SendMAEStep.java b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/restoreindices/SendMAEStep.java
index ce59cf2edb84e9..574b1f08b5f543 100644
--- a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/restoreindices/SendMAEStep.java
+++ b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/restoreindices/SendMAEStep.java
@@ -31,6 +31,7 @@ public class SendMAEStep implements UpgradeStep {
private static final int DEFAULT_STARTING_OFFSET = 0;
private static final int DEFAULT_THREADS = 1;
+ private static final boolean DEFAULT_URN_BASED_PAGINATION = false;
private final Database _server;
private final EntityService _entityService;
@@ -89,6 +90,7 @@ private RestoreIndicesArgs getArgs(UpgradeContext context) {
result.numThreads = getThreadCount(context.parsedArgs());
result.batchDelayMs = getBatchDelayMs(context.parsedArgs());
result.start = getStartingOffset(context.parsedArgs());
+ result.urnBasedPagination = getUrnBasedPagination(context.parsedArgs());
if (containsKey(context.parsedArgs(), RestoreIndices.ASPECT_NAME_ARG_NAME)) {
result.aspectName = context.parsedArgs().get(RestoreIndices.ASPECT_NAME_ARG_NAME).get();
}
@@ -140,18 +142,49 @@ public Function<UpgradeContext, UpgradeStepResult> executable() {
     List<Future<RestoreIndicesResult>> futures = new ArrayList<>();
startTime = System.currentTimeMillis();
- while (start < rowCount) {
- args = args.clone();
- args.start = start;
- futures.add(executor.submit(new KafkaJob(context, args)));
- start = start + args.batchSize;
- }
- while (futures.size() > 0) {
-      List<RestoreIndicesResult> tmpResults = iterateFutures(futures);
- for (RestoreIndicesResult tmpResult : tmpResults) {
- reportStats(context, finalJobResult, tmpResult, rowCount, startTime);
+ if (args.urnBasedPagination) {
+ RestoreIndicesResult previousResult = null;
+ int rowsProcessed = 1;
+ while (rowsProcessed > 0) {
+ args = args.clone();
+ if (previousResult != null) {
+ args.lastUrn = previousResult.lastUrn;
+ args.lastAspect = previousResult.lastAspect;
+ }
+ args.start = start;
+ context
+ .report()
+ .addLine(
+ String.format(
+ "Getting next batch of urns + aspects, starting with %s - %s",
+ args.lastUrn, args.lastAspect));
+        Future<RestoreIndicesResult> future = executor.submit(new KafkaJob(context, args));
+ try {
+ RestoreIndicesResult result = future.get();
+ reportStats(context, finalJobResult, result, rowCount, startTime);
+ previousResult = result;
+ rowsProcessed = result.rowsMigrated + result.ignored;
+ context.report().addLine(String.format("Rows processed this loop %d", rowsProcessed));
+ start += args.batchSize;
+ } catch (InterruptedException | ExecutionException e) {
+ return new DefaultUpgradeStepResult(id(), UpgradeStepResult.Result.FAILED);
+ }
+ }
+ } else {
+ while (start < rowCount) {
+ args = args.clone();
+ args.start = start;
+ futures.add(executor.submit(new KafkaJob(context, args)));
+ start = start + args.batchSize;
+ }
+ while (futures.size() > 0) {
+        List<RestoreIndicesResult> tmpResults = iterateFutures(futures);
+ for (RestoreIndicesResult tmpResult : tmpResults) {
+ reportStats(context, finalJobResult, tmpResult, rowCount, startTime);
+ }
}
}
+
executor.shutdown();
if (finalJobResult.rowsMigrated != rowCount) {
float percentFailed = 0.0f;
@@ -233,6 +266,15 @@ private int getThreadCount(final Map<String, Optional<String>> parsedArgs) {
return getInt(parsedArgs, DEFAULT_THREADS, RestoreIndices.NUM_THREADS_ARG_NAME);
}
+  private boolean getUrnBasedPagination(final Map<String, Optional<String>> parsedArgs) {
+ boolean urnBasedPagination = DEFAULT_URN_BASED_PAGINATION;
+ if (containsKey(parsedArgs, RestoreIndices.URN_BASED_PAGINATION_ARG_NAME)) {
+ urnBasedPagination =
+ Boolean.parseBoolean(parsedArgs.get(RestoreIndices.URN_BASED_PAGINATION_ARG_NAME).get());
+ }
+ return urnBasedPagination;
+ }
+
private int getInt(
       final Map<String, Optional<String>> parsedArgs, int defaultVal, String argKey) {
int result = defaultVal;
diff --git a/docker/datahub-upgrade/README.md b/docker/datahub-upgrade/README.md
index 0d019971604d6b..9c96114cdb2dd9 100644
--- a/docker/datahub-upgrade/README.md
+++ b/docker/datahub-upgrade/README.md
@@ -15,8 +15,16 @@ to metadata_aspect_v2 table. Arguments:
2. **NoCodeDataMigrationCleanup**: Cleanses graph index, search index, and key-value store of legacy DataHub data (metadata_aspect table) once
the No Code Data Migration has completed successfully. No arguments.
-3. **RestoreIndices**: Restores indices by fetching the latest version of each aspect and producing MAE
-
+3. **RestoreIndices**: Restores indices by fetching the latest version of each aspect and producing MAE. Arguments:
+ - *batchSize* (Optional): The number of rows to migrate at a time. Defaults to 1000.
+ - *batchDelayMs* (Optional): The number of milliseconds of delay between migrated batches. Used for rate limiting. Defaults to 250.
+   - *numThreads* (Optional): The number of threads to use. Defaults to 1. Note that this is not used if `urnBasedPagination` is true.
+ - *aspectName* (Optional): The aspect name for producing events.
+ - *urn* (Optional): The urn for producing events.
+   - *urnLike* (Optional): The urn pattern for producing events, using `%` as a wildcard.
+   - *urnBasedPagination* (Optional): Paginate the SQL results using the urn + aspect string instead of `OFFSET`. Defaults to false,
+    but enabling it should improve performance for large amounts of data; see the example below.
+
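+   A hedged example invocation with urn-based pagination enabled (this assumes the upgrade container's `-u`/`-a`
+   argument convention; the env file path and batch size shown are illustrative):
+
+   ```bash
+   docker run --env-file docker/datahub-upgrade/env/docker.env acryldata/datahub-upgrade:head \
+     -u RestoreIndices -a urnBasedPagination=true -a batchSize=500
+   ```
+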
4. **RestoreBackup**: Restores the storage stack from a backup of the local database
## Environment Variables
diff --git a/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityServiceImpl.java b/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityServiceImpl.java
index a3338394165567..7bd8e763cdc27a 100644
--- a/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityServiceImpl.java
+++ b/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityServiceImpl.java
@@ -1161,6 +1161,7 @@ public RestoreIndicesResult restoreIndices(
Urn urn;
try {
urn = Urn.createFromString(aspect.getKey().getUrn());
+ result.lastUrn = urn.toString();
} catch (Exception e) {
logger.accept(
String.format(
@@ -1188,6 +1189,7 @@ public RestoreIndicesResult restoreIndices(
result.timeEntityRegistryCheckMs += System.currentTimeMillis() - startTime;
startTime = System.currentTimeMillis();
final String aspectName = aspect.getKey().getAspect();
+ result.lastAspect = aspectName;
// 3. Verify that the aspect is a valid aspect associated with the entity
AspectSpec aspectSpec = entitySpec.getAspectSpec(aspectName);
diff --git a/metadata-io/src/main/java/com/linkedin/metadata/entity/ebean/EbeanAspectDao.java b/metadata-io/src/main/java/com/linkedin/metadata/entity/ebean/EbeanAspectDao.java
index b2b47c1d5ba32f..26946890daa3b7 100644
--- a/metadata-io/src/main/java/com/linkedin/metadata/entity/ebean/EbeanAspectDao.java
+++ b/metadata-io/src/main/java/com/linkedin/metadata/entity/ebean/EbeanAspectDao.java
@@ -477,11 +477,31 @@ public PagedList<EbeanAspectV2> getPagedAspects(final RestoreIndicesArgs args) {
if (args.urnLike != null) {
exp = exp.like(EbeanAspectV2.URN_COLUMN, args.urnLike);
}
+
+ int start = args.start;
+ if (args.urnBasedPagination) {
+ start = 0;
+ if (args.lastUrn != null && !args.lastUrn.isEmpty()) {
+ exp = exp.where().ge(EbeanAspectV2.URN_COLUMN, args.lastUrn);
+
+ // To prevent processing the same aspect multiple times in a restore, it compares against
+ // the last aspect if the urn matches the last urn
+ if (args.lastAspect != null && !args.lastAspect.isEmpty()) {
+ exp =
+ exp.where()
+ .and()
+ .or()
+ .ne(EbeanAspectV2.URN_COLUMN, args.lastUrn)
+ .gt(EbeanAspectV2.ASPECT_COLUMN, args.lastAspect);
+ }
+ }
+ }
+
return exp.orderBy()
.asc(EbeanAspectV2.URN_COLUMN)
.orderBy()
.asc(EbeanAspectV2.ASPECT_COLUMN)
- .setFirstRow(args.start)
+ .setFirstRow(start)
.setMaxRows(args.batchSize)
.findPagedList();
}
diff --git a/metadata-service/services/src/main/java/com/linkedin/metadata/entity/restoreindices/RestoreIndicesArgs.java b/metadata-service/services/src/main/java/com/linkedin/metadata/entity/restoreindices/RestoreIndicesArgs.java
index d8fcbe0b7d44d3..e50b44b7f0eca3 100644
--- a/metadata-service/services/src/main/java/com/linkedin/metadata/entity/restoreindices/RestoreIndicesArgs.java
+++ b/metadata-service/services/src/main/java/com/linkedin/metadata/entity/restoreindices/RestoreIndicesArgs.java
@@ -11,6 +11,9 @@ public class RestoreIndicesArgs implements Cloneable {
public String aspectName;
public String urn;
public String urnLike;
+ public Boolean urnBasedPagination = false;
+ public String lastUrn = "";
+ public String lastAspect = "";
@Override
public RestoreIndicesArgs clone() {
@@ -51,4 +54,9 @@ public RestoreIndicesArgs setBatchSize(Integer batchSize) {
}
return this;
}
+
+ public RestoreIndicesArgs setUrnBasedPagination(Boolean urnBasedPagination) {
+ this.urnBasedPagination = urnBasedPagination;
+ return this;
+ }
}
diff --git a/metadata-service/services/src/main/java/com/linkedin/metadata/entity/restoreindices/RestoreIndicesResult.java b/metadata-service/services/src/main/java/com/linkedin/metadata/entity/restoreindices/RestoreIndicesResult.java
index 8479338660db0b..a270cf4548bed5 100644
--- a/metadata-service/services/src/main/java/com/linkedin/metadata/entity/restoreindices/RestoreIndicesResult.java
+++ b/metadata-service/services/src/main/java/com/linkedin/metadata/entity/restoreindices/RestoreIndicesResult.java
@@ -13,4 +13,6 @@ public class RestoreIndicesResult {
public long aspectCheckMs = 0;
public long createRecordMs = 0;
public long sendMessageMs = 0;
+ public String lastUrn = "";
+ public String lastAspect = "";
}
From 3777730d782bc1069f7752f74a199aa6447be0d0 Mon Sep 17 00:00:00 2001
From: david-leifker <114954101+david-leifker@users.noreply.github.com>
Date: Tue, 19 Dec 2023 15:30:47 -0600
Subject: [PATCH 047/540] fix(quickstart): force strings for mysql version
(#9485)
---
docker/quickstart/quickstart_version_mapping.yaml | 8 ++++----
.../src/datahub/cli/quickstart_versioning.py | 4 ++--
2 files changed, 6 insertions(+), 6 deletions(-)
diff --git a/docker/quickstart/quickstart_version_mapping.yaml b/docker/quickstart/quickstart_version_mapping.yaml
index 9948bd55fdc0b6..b08cfda175aa9f 100644
--- a/docker/quickstart/quickstart_version_mapping.yaml
+++ b/docker/quickstart/quickstart_version_mapping.yaml
@@ -23,7 +23,7 @@ quickstart_version_map:
default:
composefile_git_ref: master
docker_tag: head
- mysql_tag: 5.7
+ mysql_tag: "5.7"
# default: # Use this to pin default to a specific version.
# composefile_git_ref: fd1bd51541a132017a648f4a2f037eec8f70ba26 # v0.10.0 + quickstart compose file fixes
# docker_tag: v0.10.0
@@ -31,19 +31,19 @@ quickstart_version_map:
head:
composefile_git_ref: master
docker_tag: head
- mysql_tag: 5.7
+ mysql_tag: "5.7"
# v0.13.0 we upgraded MySQL image for EOL
v0.13.0:
composefile_git_ref: master
docker_tag: head
- mysql_tag: 8.2
+ mysql_tag: "8.2"
# v0.9.6 images contain security vulnerabilities
v0.9.6:
composefile_git_ref: v0.9.6.1
docker_tag: v0.9.6.1
- mysql_tag: 5.7
+ mysql_tag: "5.7"
# If stable is not defined the latest released version will be used.
# stable:
diff --git a/metadata-ingestion/src/datahub/cli/quickstart_versioning.py b/metadata-ingestion/src/datahub/cli/quickstart_versioning.py
index be7439f330dfb6..1c3ce93c1f7887 100644
--- a/metadata-ingestion/src/datahub/cli/quickstart_versioning.py
+++ b/metadata-ingestion/src/datahub/cli/quickstart_versioning.py
@@ -94,7 +94,7 @@ def fetch_quickstart_config(cls) -> "QuickstartVersionMappingConfig":
try:
release = cls._fetch_latest_version()
config.quickstart_version_map["stable"] = QuickstartExecutionPlan(
- composefile_git_ref=release, docker_tag=release, mysql_tag=release
+ composefile_git_ref=release, docker_tag=release, mysql_tag="5.7"
)
except Exception:
click.echo(
@@ -123,7 +123,7 @@ def get_quickstart_execution_plan(
QuickstartExecutionPlan(
composefile_git_ref=composefile_git_ref,
docker_tag=docker_tag,
- mysql_tag=mysql_tag,
+ mysql_tag=str(mysql_tag),
),
)
# new CLI version is downloading the composefile corresponding to the requested version
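
For context on why both the YAML quoting and the `str(...)` coercion matter: an unquoted scalar such as `5.7` parses as a YAML float, and downstream code that expects a string image tag then misbehaves. A small sketch, assuming PyYAML, not part of the patch:

```python
import yaml  # PyYAML, assumed available

# Unquoted version numbers become floats when the mapping file is loaded.
assert isinstance(yaml.safe_load("mysql_tag: 5.7")["mysql_tag"], float)

# Quoting keeps them as strings, which is what an image tag needs.
assert yaml.safe_load('mysql_tag: "5.7"')["mysql_tag"] == "5.7"

# The str(mysql_tag) coercion above is the defensive equivalent in code.
mysql_tag = str(yaml.safe_load("mysql_tag: 8.2")["mysql_tag"])
assert mysql_tag == "8.2"
```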
From 76be5173b292b936216aad1409090b70615a78f8 Mon Sep 17 00:00:00 2001
From: david-leifker <114954101+david-leifker@users.noreply.github.com>
Date: Tue, 19 Dec 2023 15:52:59 -0600
Subject: [PATCH 048/540] fix(docker): fix frontend dev docker path (#9488)
---
docker/docker-compose.dev.yml | 2 +-
docker/profiles/docker-compose.frontend.yml | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/docker/docker-compose.dev.yml b/docker/docker-compose.dev.yml
index 774c4e17bee21f..a69fb977a3417e 100644
--- a/docker/docker-compose.dev.yml
+++ b/docker/docker-compose.dev.yml
@@ -24,7 +24,7 @@ services:
- JAVA_TOOL_OPTIONS=-agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=*:5002
- DATAHUB_ANALYTICS_ENABLED=${DATAHUB_ANALYTICS_ENABLED:-true}
volumes:
- - ../datahub-frontend/build/stage/playBinary:/datahub-frontend
+ - ../datahub-frontend/build/stage/main:/datahub-frontend
datahub-gms:
image: linkedin/datahub-gms:debug
ports:
diff --git a/docker/profiles/docker-compose.frontend.yml b/docker/profiles/docker-compose.frontend.yml
index 2b82829648dacb..80cb4e7b4b596d 100644
--- a/docker/profiles/docker-compose.frontend.yml
+++ b/docker/profiles/docker-compose.frontend.yml
@@ -21,7 +21,7 @@ x-datahub-frontend-service-dev: &datahub-frontend-service-dev
JAVA_TOOL_OPTIONS: -agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=*:5002
DATAHUB_ANALYTICS_ENABLED: ${DATAHUB_ANALYTICS_ENABLED:-true}
volumes:
- - ../../datahub-frontend/build/stage/playBinary:/datahub-frontend
+ - ../../datahub-frontend/build/stage/main:/datahub-frontend
services:
frontend-quickstart:
From 16d3df620f07c4d41118be9c8f38dc0cf46df76f Mon Sep 17 00:00:00 2001
From: Salman-Apptware <101426513+Salman-Apptware@users.noreply.github.com>
Date: Wed, 20 Dec 2023 16:32:52 +0530
Subject: [PATCH 049/540] fix(ui): Tab doesn't represent the page you are on
for non-data asset pages (#9468)
---
datahub-web-react/src/app/AppProviders.tsx | 13 ++++----
.../src/app/entity/group/GroupInfoSideBar.tsx | 17 +++++++++++
.../src/app/entity/user/UserInfoSideBar.tsx | 19 +++++++++++-
.../src/app/search/SearchablePage.tsx | 27 +++++++++++++++++
.../src/app/shared/BrowserTabTitleContext.tsx | 30 +++++++++++++++++++
5 files changed, 100 insertions(+), 6 deletions(-)
create mode 100644 datahub-web-react/src/app/shared/BrowserTabTitleContext.tsx
diff --git a/datahub-web-react/src/app/AppProviders.tsx b/datahub-web-react/src/app/AppProviders.tsx
index 81a8ddbfc9bace..00597e1cf76406 100644
--- a/datahub-web-react/src/app/AppProviders.tsx
+++ b/datahub-web-react/src/app/AppProviders.tsx
@@ -5,6 +5,7 @@ import UserContextProvider from './context/UserContextProvider';
import QuickFiltersProvider from '../providers/QuickFiltersProvider';
import SearchContextProvider from './search/context/SearchContextProvider';
import EntityRegistryProvider from './EntityRegistryProvider';
+import { BrowserTitleProvider } from './shared/BrowserTabTitleContext';
interface Props {
children: React.ReactNode;
@@ -15,11 +16,13 @@ export default function AppProviders({ children }: Props) {
-
-
- {children}
-
-
+
+
+
+ {children}
+
+
+
diff --git a/datahub-web-react/src/app/entity/group/GroupInfoSideBar.tsx b/datahub-web-react/src/app/entity/group/GroupInfoSideBar.tsx
index 07885a4d0f6304..044b09dc185e53 100644
--- a/datahub-web-react/src/app/entity/group/GroupInfoSideBar.tsx
+++ b/datahub-web-react/src/app/entity/group/GroupInfoSideBar.tsx
@@ -21,6 +21,7 @@ import {
} from '../shared/SidebarStyledComponents';
import GroupMembersSideBarSection from './GroupMembersSideBarSection';
import { useUserContext } from '../../context/useUserContext';
+import { useBrowserTitle } from '../../shared/BrowserTabTitleContext';
import StripMarkdownText, { removeMarkdown } from '../shared/components/styled/StripMarkdownText';
import { Editor } from '../shared/tabs/Documentation/components/editor/Editor';
import EditGroupDescriptionModal from './EditGroupDescriptionModal';
@@ -157,6 +158,22 @@ export default function GroupInfoSidebar({ sideBarData, refetch }: Props) {
const { url } = useRouteMatch();
const history = useHistory();
+ const { updateTitle } = useBrowserTitle();
+
+ useEffect(()=>{
+    // Set the browser tab title once the group name is available
+ if(name){
+ updateTitle(`Group | ${name}`);
+ }
+    // Clean up the title when the component unmounts
+ return () => {
+      if(name){ // guard avoids clearing the title on unrelated re-renders
+ updateTitle('');
+ }
+ };
+ }, [name, updateTitle]);
+
/* eslint-disable @typescript-eslint/no-unused-vars */
const [editGroupModal, showEditGroupModal] = useState(false);
const me = useUserContext();
diff --git a/datahub-web-react/src/app/entity/user/UserInfoSideBar.tsx b/datahub-web-react/src/app/entity/user/UserInfoSideBar.tsx
index c01dd3a6359245..71bfbfcd49a16e 100644
--- a/datahub-web-react/src/app/entity/user/UserInfoSideBar.tsx
+++ b/datahub-web-react/src/app/entity/user/UserInfoSideBar.tsx
@@ -1,5 +1,5 @@
import { Divider, message, Space, Button, Typography, Tag } from 'antd';
-import React, { useState } from 'react';
+import React, { useEffect, useState } from 'react';
import { EditOutlined, MailOutlined, PhoneOutlined, SlackOutlined } from '@ant-design/icons';
import { useUpdateCorpUserPropertiesMutation } from '../../../graphql/user.generated';
import { EntityRelationship, DataHubRole } from '../../../types.generated';
@@ -21,6 +21,7 @@ import {
import EntityGroups from '../shared/EntityGroups';
import { mapRoleIcon } from '../../identity/user/UserUtils';
import { useUserContext } from '../../context/useUserContext';
+import { useBrowserTitle } from '../../shared/BrowserTabTitleContext';
const { Paragraph } = Typography;
@@ -61,6 +62,22 @@ export default function UserInfoSideBar({ sideBarData, refetch }: Props) {
const me = useUserContext();
const isProfileOwner = me?.user?.urn === urn;
+ const { updateTitle } = useBrowserTitle();
+
+ useEffect(()=>{
+    // Set the browser tab title once the user name is available
+ if(name){
+ updateTitle(`User | ${name}`);
+ }
+    // Clean up the title when the component unmounts
+ return () => {
+      if(name){ // guard avoids clearing the title on unrelated re-renders
+ updateTitle('');
+ }
+ };
+ }, [name, updateTitle]);
+
const getEditModalData = {
urn,
name,
diff --git a/datahub-web-react/src/app/search/SearchablePage.tsx b/datahub-web-react/src/app/search/SearchablePage.tsx
index 9d02d85d3634c0..53dfc866b9b64b 100644
--- a/datahub-web-react/src/app/search/SearchablePage.tsx
+++ b/datahub-web-react/src/app/search/SearchablePage.tsx
@@ -3,6 +3,7 @@ import { useHistory, useLocation } from 'react-router';
import { debounce } from 'lodash';
import * as QueryString from 'query-string';
import { useTheme } from 'styled-components';
+import { Helmet } from 'react-helmet-async';
import { SearchHeader } from './SearchHeader';
import { useEntityRegistry } from '../useEntityRegistry';
import { EntityType, FacetFilterInput } from '../../types.generated';
@@ -19,6 +20,7 @@ import { useQuickFiltersContext } from '../../providers/QuickFiltersContext';
import { useUserContext } from '../context/useUserContext';
import { useSelectedSortOption } from './context/SearchContext';
import { HALF_SECOND_IN_MS } from '../entity/shared/tabs/Dataset/Queries/utils/constants';
+import { useBrowserTitle } from '../shared/BrowserTabTitleContext';
const styles = {
children: {
@@ -68,6 +70,28 @@ export const SearchablePage = ({ onSearch, onAutoComplete, children }: Props) =>
const { user } = userContext;
const viewUrn = userContext.localState?.selectedViewUrn;
+ const { title, updateTitle } = useBrowserTitle();
+
+ useEffect(() => {
+ // Update the title only if it's not already set and there is a valid pathname
+ if (!title && location.pathname) {
+ const formattedPath = location.pathname
+ .split('/')
+ .filter(word => word !== '')
+ .map(word => word.charAt(0).toUpperCase() + word.slice(1))
+ .join(' | ');
+
+ if (formattedPath) {
+ return updateTitle(formattedPath);
+ }
+ }
+
+ // Clean up the title when the component unmounts
+ return () => {
+ updateTitle('');
+ };
+ }, [location.pathname, title, updateTitle]);
+
useEffect(() => {
if (suggestionsData !== undefined) {
setNewSuggestionData(suggestionsData);
@@ -140,6 +164,9 @@ export const SearchablePage = ({ onSearch, onAutoComplete, children }: Props) =>
authenticatedUserPictureLink={user?.editableProperties?.pictureLink}
entityRegistry={entityRegistry}
/>
+                <Helmet>
+                    <title>{title}</title>
+                </Helmet>
{children}
>
);
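
The pathname-to-title transform in `SearchablePage` above is easiest to see with a worked example: the pathname is split on `/`, each non-empty segment is capitalized, and the segments are joined with ` | ` to form the tab title. The sketch below mirrors that logic in Python purely for illustration; the example path is hypothetical.

```python
# Illustrative only: Python mirror of the TypeScript pathname-to-title transform above.
def format_path(pathname: str) -> str:
    words = [w for w in pathname.split("/") if w]            # drop empty segments
    return " | ".join(w[0].upper() + w[1:] for w in words)   # capitalize each segment

assert format_path("/glossaryTerm/some-term/Documentation") == "GlossaryTerm | Some-term | Documentation"
assert format_path("/") == ""   # no segments, so the title falls through to the default
```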
diff --git a/datahub-web-react/src/app/shared/BrowserTabTitleContext.tsx b/datahub-web-react/src/app/shared/BrowserTabTitleContext.tsx
new file mode 100644
index 00000000000000..284e2771124c83
--- /dev/null
+++ b/datahub-web-react/src/app/shared/BrowserTabTitleContext.tsx
@@ -0,0 +1,30 @@
+import React, { createContext, ReactNode, useContext } from 'react';
+
+interface BrowserTitleContextProps {
+ title: string;
+ updateTitle: (newTitle: string) => void;
+}
+
+const BrowserTitleContext = createContext<BrowserTitleContextProps | undefined>(undefined);
+
+export const BrowserTitleProvider: React.FC<{ children: ReactNode }> = ({ children }) => {
+ const [title, setTitle] = React.useState('');
+
+ const updateTitle = (newTitle: string) => {
+ setTitle(newTitle);
+ };
+
+    return (
+        <BrowserTitleContext.Provider value={{ title, updateTitle }}>
+            {children}
+        </BrowserTitleContext.Provider>
+    );
+};
+
+export const useBrowserTitle = () => {
+ const context = useContext(BrowserTitleContext);
+ if (!context) {
+ throw new Error('useBrowserTitle must be used within a BrowserTitleProvider');
+ }
+ return context;
+};
From c8e59aabedb9a6f43f4bcfbf20bdffad6abc85d5 Mon Sep 17 00:00:00 2001
From: noggi
Date: Wed, 20 Dec 2023 12:33:23 -0800
Subject: [PATCH 050/540] Do not sync demo in downstream repos (#9493)
---
.github/workflows/docker-unified.yml | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/.github/workflows/docker-unified.yml b/.github/workflows/docker-unified.yml
index 169a86000adccb..7cef38b1cd47ce 100644
--- a/.github/workflows/docker-unified.yml
+++ b/.github/workflows/docker-unified.yml
@@ -911,13 +911,13 @@ jobs:
]
steps:
- uses: aws-actions/configure-aws-credentials@v1
- if: ${{ needs.setup.outputs.publish != 'false' }}
+ if: ${{ needs.setup.outputs.publish != 'false' && github.repository_owner == 'datahub-project' && needs.setup.outputs.repository_name == 'datahub' }}
with:
aws-access-key-id: ${{ secrets.AWS_SQS_ACCESS_KEY_ID }}
aws-secret-access-key: ${{ secrets.AWS_SQS_ACCESS_KEY }}
aws-region: us-west-2
- uses: isbang/sqs-action@v0.2.0
- if: ${{ needs.setup.outputs.publish != 'false' }}
+ if: ${{ needs.setup.outputs.publish != 'false' && github.repository_owner == 'datahub-project' && needs.setup.outputs.repository_name == 'datahub' }}
with:
sqs-url: ${{ secrets.DATAHUB_HEAD_SYNC_QUEUE }}
message: '{ "command": "git-sync", "args" : {"repoName": "${{ needs.setup.outputs.repository_name }}", "repoOrg": "${{ github.repository_owner }}", "repoBranch": "${{ needs.setup.outputs.branch_name }}", "repoShaShort": "${{ needs.setup.outputs.short_sha }}" }}'
From bf813d1d24107d858260dc2852489e034eb4cf8c Mon Sep 17 00:00:00 2001
From: Harshal Sheth
Date: Wed, 20 Dec 2023 15:49:03 -0500
Subject: [PATCH 051/540] fix(ingest): update ingest_stats event with
transformer types (#9487)
---
metadata-ingestion/src/datahub/ingestion/run/pipeline.py | 3 +++
1 file changed, 3 insertions(+)
diff --git a/metadata-ingestion/src/datahub/ingestion/run/pipeline.py b/metadata-ingestion/src/datahub/ingestion/run/pipeline.py
index 25e17d692109a5..d7c70dbea0b141 100644
--- a/metadata-ingestion/src/datahub/ingestion/run/pipeline.py
+++ b/metadata-ingestion/src/datahub/ingestion/run/pipeline.py
@@ -528,6 +528,9 @@ def log_ingestion_stats(self) -> None:
{
"source_type": self.config.source.type,
"sink_type": self.config.sink.type,
+ "transformer_types": [
+ transformer.type for transformer in self.config.transformers or []
+ ],
"records_written": stats.discretize(
self.sink.get_report().total_records_written
),
From 50be329492048534cb83c6f81bad87c5c49ee05c Mon Sep 17 00:00:00 2001
From: Sumit Patil <91715217+sumitappt@users.noreply.github.com>
Date: Thu, 21 Dec 2023 13:24:33 +0530
Subject: [PATCH 052/540] feat(ui/glossary): Keep the same tab selected when
browsing Glossary (#9469)
---
.../shared/EntityDropdown/EntityDropdown.tsx | 1 +
.../containers/profile/header/EntityTabs.tsx | 1 +
.../entity/shared/containers/profile/utils.ts | 16 ++++++++
.../app/glossary/GlossaryBrowser/NodeItem.tsx | 2 +-
.../app/glossary/GlossaryBrowser/TermItem.tsx | 9 ++++-
.../e2e/glossary/glossary_navigation.js | 38 +++++++++++++++++++
6 files changed, 64 insertions(+), 3 deletions(-)
diff --git a/datahub-web-react/src/app/entity/shared/EntityDropdown/EntityDropdown.tsx b/datahub-web-react/src/app/entity/shared/EntityDropdown/EntityDropdown.tsx
index 8d7f1cca9c1cbd..664a77a731d348 100644
--- a/datahub-web-react/src/app/entity/shared/EntityDropdown/EntityDropdown.tsx
+++ b/datahub-web-react/src/app/entity/shared/EntityDropdown/EntityDropdown.tsx
@@ -180,6 +180,7 @@ function EntityDropdown(props: Props) {
)}
{menuItems.has(EntityMenuItems.ADD_TERM) && (
setIsCreateTermModalVisible(true)}
diff --git a/datahub-web-react/src/app/entity/shared/containers/profile/header/EntityTabs.tsx b/datahub-web-react/src/app/entity/shared/containers/profile/header/EntityTabs.tsx
index 58693eca8af0e8..25e044259f240e 100644
--- a/datahub-web-react/src/app/entity/shared/containers/profile/header/EntityTabs.tsx
+++ b/datahub-web-react/src/app/entity/shared/containers/profile/header/EntityTabs.tsx
@@ -39,6 +39,7 @@ export const EntityTabs = ({ tabs, selectedTab }: Props) => {
return (
(
-
+
))}
)}
diff --git a/datahub-web-react/src/app/glossary/GlossaryBrowser/TermItem.tsx b/datahub-web-react/src/app/glossary/GlossaryBrowser/TermItem.tsx
index 6980c15a1c256a..56495b53eded35 100644
--- a/datahub-web-react/src/app/glossary/GlossaryBrowser/TermItem.tsx
+++ b/datahub-web-react/src/app/glossary/GlossaryBrowser/TermItem.tsx
@@ -5,6 +5,7 @@ import { useEntityRegistry } from '../../useEntityRegistry';
import { ANTD_GRAY } from '../../entity/shared/constants';
import { ChildGlossaryTermFragment } from '../../../graphql/glossaryNode.generated';
import { useGlossaryEntityData } from '../../entity/shared/GlossaryEntityContext';
+import { useGlossaryActiveTabPath } from '../../entity/shared/containers/profile/utils';
const TermWrapper = styled.div`
font-weight: normal;
@@ -47,13 +48,15 @@ interface Props {
term: ChildGlossaryTermFragment;
isSelecting?: boolean;
selectTerm?: (urn: string, displayName: string) => void;
+ includeActiveTabPath?: boolean;
}
function TermItem(props: Props) {
- const { term, isSelecting, selectTerm } = props;
+ const { term, isSelecting, selectTerm, includeActiveTabPath } = props;
const { entityData } = useGlossaryEntityData();
const entityRegistry = useEntityRegistry();
+ const activeTabPath = useGlossaryActiveTabPath();
function handleSelectTerm() {
if (selectTerm) {
@@ -68,7 +71,9 @@ function TermItem(props: Props) {
{!isSelecting && (
{entityRegistry.getDisplayName(term.type, isOnEntityPage ? entityData : term)}
diff --git a/smoke-test/tests/cypress/cypress/e2e/glossary/glossary_navigation.js b/smoke-test/tests/cypress/cypress/e2e/glossary/glossary_navigation.js
index 7ddf36aa87c2d1..dd3b0a567c75f8 100644
--- a/smoke-test/tests/cypress/cypress/e2e/glossary/glossary_navigation.js
+++ b/smoke-test/tests/cypress/cypress/e2e/glossary/glossary_navigation.js
@@ -1,4 +1,5 @@
const glossaryTerm = "CypressGlosssaryNavigationTerm";
+const glossarySecondTerm = "CypressGlossarySecondTerm";
const glossaryTermGroup = "CypressGlosssaryNavigationGroup";
const glossaryParentGroup = "CypressNode";
@@ -30,6 +31,39 @@ describe("glossary sidebar navigation test", () => {
cy.get('[data-testid="glossary-browser-sidebar"]').contains(glossaryTermGroup).click().wait(3000);
cy.get('*[class^="GlossaryEntitiesList"]').contains(glossaryTerm).should("be.visible");
+ // Create another term and move it to the same term group
+ cy.clickOptionWithText(glossaryTermGroup);
+ cy.openThreeDotDropdown();
+ cy.clickOptionWithTestId("entity-menu-add-term-button");
+
+ // Wait for the create term modal to be visible
+ cy.waitTextVisible("Create Glossary Term");
+ cy.enterTextInTestId("create-glossary-entity-modal-name", glossarySecondTerm);
+ cy.clickOptionWithTestId("glossary-entity-modal-create-button");
+
+ // Wait for the new term to be visible in the sidebar
+ cy.clickOptionWithText(glossarySecondTerm).wait(3000);
+
+ // Move the term to the created term group
+ cy.openThreeDotDropdown();
+ cy.clickOptionWithTestId("entity-menu-move-button");
+ cy.get('[data-testid="move-glossary-entity-modal"]').contains(glossaryTermGroup).click({ force: true });
+ cy.get('[data-testid="move-glossary-entity-modal"]').contains(glossaryTermGroup).should("be.visible");
+ cy.clickOptionWithTestId("glossary-entity-modal-move-button");
+ cy.waitTextVisible("Moved Glossary Term!");
+
+ // Ensure the new term is under the parent term group in the navigation sidebar
+ cy.get('[data-testid="glossary-browser-sidebar"]').contains(glossaryTermGroup).click();
+ cy.get('*[class^="GlossaryEntitiesList"]').contains(glossarySecondTerm).should("be.visible");
+
+
+ // Switch between terms and ensure the "Properties" tab is active
+ cy.clickOptionWithText(glossaryTerm);
+ cy.get('[data-testid="entity-tab-headers-test-id"]').contains("Properties").click({ force: true });
+ cy.get('[data-node-key="Properties"]').contains("Properties").should("have.attr", "aria-selected", "true");
+ cy.clickOptionWithText(glossarySecondTerm);
+ cy.get('[data-node-key="Properties"]').contains("Properties").should("have.attr", "aria-selected", "true");
+
// Move a term group from the root level to be under a parent term group
cy.goToGlossaryList();
cy.clickOptionWithText(glossaryTermGroup);
@@ -52,6 +86,10 @@ describe("glossary sidebar navigation test", () => {
cy.clickOptionWithText(glossaryTerm).wait(3000);
cy.deleteFromDropdown();
cy.waitTextVisible("Deleted Glossary Term!");
+ cy.clickOptionWithText(glossaryTermGroup);
+ cy.clickOptionWithText(glossarySecondTerm).wait(3000);
+ cy.deleteFromDropdown();
+ cy.waitTextVisible("Deleted Glossary Term!");
cy.clickOptionWithText(glossaryParentGroup);
cy.clickOptionWithText(glossaryTermGroup).wait(3000);
cy.deleteFromDropdown();
From 80fb145a7b85b323f339d7901658dd9fde5bd4db Mon Sep 17 00:00:00 2001
From: Sumit Patil <91715217+sumitappt@users.noreply.github.com>
Date: Thu, 21 Dec 2023 17:57:41 +0530
Subject: [PATCH 053/540] style(search): Tag overflow add padding (#9497)
---
datahub-web-react/src/app/preview/DefaultPreviewCard.tsx | 1 +
1 file changed, 1 insertion(+)
diff --git a/datahub-web-react/src/app/preview/DefaultPreviewCard.tsx b/datahub-web-react/src/app/preview/DefaultPreviewCard.tsx
index 36c4c020e71317..a6d8422f827d58 100644
--- a/datahub-web-react/src/app/preview/DefaultPreviewCard.tsx
+++ b/datahub-web-react/src/app/preview/DefaultPreviewCard.tsx
@@ -114,6 +114,7 @@ const TagContainer = styled.div`
margin-left: 0px;
margin-top: 3px;
flex-wrap: wrap;
+ margin-right: 8px;
`;
const TagSeparator = styled.div`
From a49a435eef92b20cdc9878c8189b8ca0288e8b7f Mon Sep 17 00:00:00 2001
From: Aseem Bansal
Date: Thu, 21 Dec 2023 19:38:46 +0530
Subject: [PATCH 054/540] feat(analytics): change MAU chart to be until last
month (#9499)
---
.../datahub/graphql/analytics/resolver/GetChartsResolver.java | 3 ++-
.../main/java/com/linkedin/datahub/graphql/util/DateUtil.java | 4 ++++
2 files changed, 6 insertions(+), 1 deletion(-)
diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/analytics/resolver/GetChartsResolver.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/analytics/resolver/GetChartsResolver.java
index 3f635872747a57..6ba3c5090f1c40 100644
--- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/analytics/resolver/GetChartsResolver.java
+++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/analytics/resolver/GetChartsResolver.java
@@ -91,6 +91,7 @@ private List getProductAnalyticsCharts(Authentication authentica
    final List<AnalyticsChart> charts = new ArrayList<>();
DateUtil dateUtil = new DateUtil();
final DateTime startOfNextWeek = dateUtil.getStartOfNextWeek();
+ final DateTime startOfThisMonth = dateUtil.getStartOfThisMonth();
final DateTime startOfNextMonth = dateUtil.getStartOfNextMonth();
final DateRange trailingWeekDateRange = dateUtil.getTrailingWeekDateRange();
@@ -103,7 +104,7 @@ private List getProductAnalyticsCharts(Authentication authentica
charts.add(
getActiveUsersTimeSeriesChart(
startOfNextMonth.minusMonths(12),
- startOfNextMonth.minusMillis(1),
+ startOfThisMonth.minusMillis(1),
"Monthly Active Users",
DateInterval.MONTH));
diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/util/DateUtil.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/util/DateUtil.java
index 4b837605d4e318..677ad8afbaca31 100644
--- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/util/DateUtil.java
+++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/util/DateUtil.java
@@ -13,6 +13,10 @@ public DateTime getStartOfNextWeek() {
return setTimeToZero(getNow().withDayOfWeek(DateTimeConstants.SUNDAY).plusDays(1));
}
+ public DateTime getStartOfThisMonth() {
+ return setTimeToZero(getNow().withDayOfMonth(1));
+ }
+
public DateTime getStartOfNextMonth() {
return setTimeToZero(getNow().withDayOfMonth(1).plusMonths(1));
}
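
A worked example of the boundary change, taking the commit date (2023-12-21) as "now": the Monthly Active Users chart previously ended at the last millisecond before the start of next month (the end of the current, partial month) and now ends at the last millisecond before the start of this month (the end of the last complete month). A sketch, not part of the patch:

```python
from datetime import datetime, timedelta

now = datetime(2023, 12, 21)  # hypothetical "now"
start_of_this_month = now.replace(day=1, hour=0, minute=0, second=0, microsecond=0)
start_of_next_month = (start_of_this_month + timedelta(days=32)).replace(day=1)

old_end = start_of_next_month - timedelta(milliseconds=1)  # end of the current, partial month
new_end = start_of_this_month - timedelta(milliseconds=1)  # end of the last complete month

assert old_end == datetime(2023, 12, 31, 23, 59, 59, 999000)
assert new_end == datetime(2023, 11, 30, 23, 59, 59, 999000)
```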
From 55cb56821c00ec993ee5a4c560d7b49d8d71258b Mon Sep 17 00:00:00 2001
From: RyanHolstien
Date: Thu, 21 Dec 2023 10:33:25 -0600
Subject: [PATCH 055/540] fix(kafka): fix infinite deserialization logging
(#9494)
---
docker/docker-compose-without-neo4j.yml | 2 ++
...docker-compose.consumers-without-neo4j.yml | 3 ++
docker/docker-compose.consumers.yml | 3 ++
docker/docker-compose.dev.yml | 1 +
docker/docker-compose.yml | 2 ++
.../docker-compose-m1.quickstart.yml | 1 +
...er-compose-without-neo4j-m1.quickstart.yml | 1 +
...ocker-compose-without-neo4j.quickstart.yml | 1 +
...ose.consumers-without-neo4j.quickstart.yml | 2 ++
.../docker-compose.consumers.quickstart.yml | 2 ++
.../quickstart/docker-compose.quickstart.yml | 1 +
.../config/kafka/ConsumerConfiguration.java | 1 +
.../src/main/resources/application.yml | 1 +
.../kafka/KafkaEventConsumerFactory.java | 30 ++++++++++++++++---
14 files changed, 47 insertions(+), 4 deletions(-)
diff --git a/docker/docker-compose-without-neo4j.yml b/docker/docker-compose-without-neo4j.yml
index 6191994eaa1ea5..0d58a1d91b70b1 100644
--- a/docker/docker-compose-without-neo4j.yml
+++ b/docker/docker-compose-without-neo4j.yml
@@ -43,6 +43,8 @@ services:
context: ../
dockerfile: docker/datahub-gms/Dockerfile
env_file: datahub-gms/env/docker-without-neo4j.env
+ environment:
+ - KAFKA_CONSUMER_STOP_ON_DESERIALIZATION_ERROR=${KAFKA_CONSUMER_STOP_ON_DESERIALIZATION_ERROR:-true}
healthcheck:
test: curl -sS --fail http://datahub-gms:${DATAHUB_GMS_PORT:-8080}/health
start_period: 90s
diff --git a/docker/docker-compose.consumers-without-neo4j.yml b/docker/docker-compose.consumers-without-neo4j.yml
index 8228951d9385f8..f1be585232a1a8 100644
--- a/docker/docker-compose.consumers-without-neo4j.yml
+++ b/docker/docker-compose.consumers-without-neo4j.yml
@@ -15,6 +15,8 @@ services:
context: ../
dockerfile: docker/datahub-mae-consumer/Dockerfile
env_file: datahub-mae-consumer/env/docker-without-neo4j.env
+ environment:
+ - KAFKA_CONSUMER_STOP_ON_DESERIALIZATION_ERROR=${KAFKA_CONSUMER_STOP_ON_DESERIALIZATION_ERROR:-true}
datahub-mce-consumer:
container_name: datahub-mce-consumer
hostname: datahub-mce-consumer
@@ -28,3 +30,4 @@ services:
environment:
- DATAHUB_SERVER_TYPE=${DATAHUB_SERVER_TYPE:-quickstart}
- DATAHUB_TELEMETRY_ENABLED=${DATAHUB_TELEMETRY_ENABLED:-true}
+ - KAFKA_CONSUMER_STOP_ON_DESERIALIZATION_ERROR=${KAFKA_CONSUMER_STOP_ON_DESERIALIZATION_ERROR:-true}
diff --git a/docker/docker-compose.consumers.yml b/docker/docker-compose.consumers.yml
index 2d37094035859b..8d331cea2f0b95 100644
--- a/docker/docker-compose.consumers.yml
+++ b/docker/docker-compose.consumers.yml
@@ -15,6 +15,8 @@ services:
context: ../
dockerfile: docker/datahub-mae-consumer/Dockerfile
env_file: datahub-mae-consumer/env/docker.env
+ environment:
+ - KAFKA_CONSUMER_STOP_ON_DESERIALIZATION_ERROR=${KAFKA_CONSUMER_STOP_ON_DESERIALIZATION_ERROR:-true}
depends_on:
neo4j:
condition: service_healthy
@@ -36,6 +38,7 @@ services:
- NEO4J_USERNAME=neo4j
- NEO4J_PASSWORD=datahub
- GRAPH_SERVICE_IMPL=neo4j
+ - KAFKA_CONSUMER_STOP_ON_DESERIALIZATION_ERROR=${KAFKA_CONSUMER_STOP_ON_DESERIALIZATION_ERROR:-true}
depends_on:
neo4j:
condition: service_healthy
diff --git a/docker/docker-compose.dev.yml b/docker/docker-compose.dev.yml
index a69fb977a3417e..7067b68fba3f9c 100644
--- a/docker/docker-compose.dev.yml
+++ b/docker/docker-compose.dev.yml
@@ -45,6 +45,7 @@ services:
- SEARCH_SERVICE_ENABLE_CACHE=false
- LINEAGE_SEARCH_CACHE_ENABLED=false
- SHOW_BROWSE_V2=true
+ - KAFKA_CONSUMER_STOP_ON_DESERIALIZATION_ERROR=${KAFKA_CONSUMER_STOP_ON_DESERIALIZATION_ERROR:-true}
volumes:
- ./datahub-gms/start.sh:/datahub/datahub-gms/scripts/start.sh
- ./datahub-gms/jetty.xml:/datahub/datahub-gms/scripts/jetty.xml
diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml
index 95f56fe47e3cca..146055830d04e5 100644
--- a/docker/docker-compose.yml
+++ b/docker/docker-compose.yml
@@ -36,6 +36,8 @@ services:
container_name: datahub-gms
hostname: datahub-gms
image: ${DATAHUB_GMS_IMAGE:-linkedin/datahub-gms}:${DATAHUB_VERSION:-head}
+ environment:
+ - KAFKA_CONSUMER_STOP_ON_DESERIALIZATION_ERROR=${KAFKA_CONSUMER_STOP_ON_DESERIALIZATION_ERROR:-true}
ports:
- ${DATAHUB_MAPPED_GMS_PORT:-8080}:8080
build:
diff --git a/docker/quickstart/docker-compose-m1.quickstart.yml b/docker/quickstart/docker-compose-m1.quickstart.yml
index 7b7ca4052f3245..8b870019152834 100644
--- a/docker/quickstart/docker-compose-m1.quickstart.yml
+++ b/docker/quickstart/docker-compose-m1.quickstart.yml
@@ -97,6 +97,7 @@ services:
- GRAPH_SERVICE_IMPL=${GRAPH_SERVICE_IMPL:-elasticsearch}
- JAVA_OPTS=-Xms1g -Xmx1g
- KAFKA_BOOTSTRAP_SERVER=broker:29092
+ - KAFKA_CONSUMER_STOP_ON_DESERIALIZATION_ERROR=${KAFKA_CONSUMER_STOP_ON_DESERIALIZATION_ERROR:-true}
- KAFKA_SCHEMAREGISTRY_URL=http://schema-registry:8081
- MAE_CONSUMER_ENABLED=true
- MCE_CONSUMER_ENABLED=true
diff --git a/docker/quickstart/docker-compose-without-neo4j-m1.quickstart.yml b/docker/quickstart/docker-compose-without-neo4j-m1.quickstart.yml
index 53dacaf6ef63b0..5373e93da6bcb7 100644
--- a/docker/quickstart/docker-compose-without-neo4j-m1.quickstart.yml
+++ b/docker/quickstart/docker-compose-without-neo4j-m1.quickstart.yml
@@ -97,6 +97,7 @@ services:
- GRAPH_SERVICE_IMPL=elasticsearch
- JAVA_OPTS=-Xms1g -Xmx1g
- KAFKA_BOOTSTRAP_SERVER=broker:29092
+ - KAFKA_CONSUMER_STOP_ON_DESERIALIZATION_ERROR=${KAFKA_CONSUMER_STOP_ON_DESERIALIZATION_ERROR:-true}
- KAFKA_SCHEMAREGISTRY_URL=http://schema-registry:8081
- MAE_CONSUMER_ENABLED=true
- MCE_CONSUMER_ENABLED=true
diff --git a/docker/quickstart/docker-compose-without-neo4j.quickstart.yml b/docker/quickstart/docker-compose-without-neo4j.quickstart.yml
index 1ca91aa19206da..51a40395e3459f 100644
--- a/docker/quickstart/docker-compose-without-neo4j.quickstart.yml
+++ b/docker/quickstart/docker-compose-without-neo4j.quickstart.yml
@@ -97,6 +97,7 @@ services:
- GRAPH_SERVICE_IMPL=elasticsearch
- JAVA_OPTS=-Xms1g -Xmx1g
- KAFKA_BOOTSTRAP_SERVER=broker:29092
+ - KAFKA_CONSUMER_STOP_ON_DESERIALIZATION_ERROR=${KAFKA_CONSUMER_STOP_ON_DESERIALIZATION_ERROR:-true}
- KAFKA_SCHEMAREGISTRY_URL=http://schema-registry:8081
- MAE_CONSUMER_ENABLED=true
- MCE_CONSUMER_ENABLED=true
diff --git a/docker/quickstart/docker-compose.consumers-without-neo4j.quickstart.yml b/docker/quickstart/docker-compose.consumers-without-neo4j.quickstart.yml
index d05933df96a433..4ed57dca1f080a 100644
--- a/docker/quickstart/docker-compose.consumers-without-neo4j.quickstart.yml
+++ b/docker/quickstart/docker-compose.consumers-without-neo4j.quickstart.yml
@@ -6,6 +6,7 @@ services:
datahub-mae-consumer:
container_name: datahub-mae-consumer
environment:
+ - KAFKA_CONSUMER_STOP_ON_DESERIALIZATION_ERROR=${KAFKA_CONSUMER_STOP_ON_DESERIALIZATION_ERROR:-true}
- DATAHUB_UPGRADE_HISTORY_KAFKA_CONSUMER_GROUP_ID=generic-duhe-consumer-job-client-mcl
- DATAHUB_GMS_HOST=datahub-gms
- DATAHUB_GMS_PORT=8080
@@ -44,6 +45,7 @@ services:
- GRAPH_SERVICE_IMPL=elasticsearch
- JAVA_OPTS=-Xms1g -Xmx1g
- KAFKA_BOOTSTRAP_SERVER=broker:29092
+ - KAFKA_CONSUMER_STOP_ON_DESERIALIZATION_ERROR=${KAFKA_CONSUMER_STOP_ON_DESERIALIZATION_ERROR:-true}
- KAFKA_SCHEMAREGISTRY_URL=http://schema-registry:8081
- MAE_CONSUMER_ENABLED=false
- MCE_CONSUMER_ENABLED=true
diff --git a/docker/quickstart/docker-compose.consumers.quickstart.yml b/docker/quickstart/docker-compose.consumers.quickstart.yml
index f0bd3a0f927c80..ba8432d8a89afe 100644
--- a/docker/quickstart/docker-compose.consumers.quickstart.yml
+++ b/docker/quickstart/docker-compose.consumers.quickstart.yml
@@ -9,6 +9,7 @@ services:
neo4j:
condition: service_healthy
environment:
+ - KAFKA_CONSUMER_STOP_ON_DESERIALIZATION_ERROR=${KAFKA_CONSUMER_STOP_ON_DESERIALIZATION_ERROR:-true}
- DATAHUB_UPGRADE_HISTORY_KAFKA_CONSUMER_GROUP_ID=generic-duhe-consumer-job-client-mcl
- DATAHUB_GMS_HOST=datahub-gms
- DATAHUB_GMS_PORT=8080
@@ -54,6 +55,7 @@ services:
- GRAPH_SERVICE_IMPL=neo4j
- JAVA_OPTS=-Xms1g -Xmx1g
- KAFKA_BOOTSTRAP_SERVER=broker:29092
+ - KAFKA_CONSUMER_STOP_ON_DESERIALIZATION_ERROR=${KAFKA_CONSUMER_STOP_ON_DESERIALIZATION_ERROR:-true}
- KAFKA_SCHEMAREGISTRY_URL=http://schema-registry:8081
- MAE_CONSUMER_ENABLED=false
- MCE_CONSUMER_ENABLED=true
diff --git a/docker/quickstart/docker-compose.quickstart.yml b/docker/quickstart/docker-compose.quickstart.yml
index c77b4418b6f366..56071cfe1e9e60 100644
--- a/docker/quickstart/docker-compose.quickstart.yml
+++ b/docker/quickstart/docker-compose.quickstart.yml
@@ -97,6 +97,7 @@ services:
- GRAPH_SERVICE_IMPL=${GRAPH_SERVICE_IMPL:-elasticsearch}
- JAVA_OPTS=-Xms1g -Xmx1g
- KAFKA_BOOTSTRAP_SERVER=broker:29092
+ - KAFKA_CONSUMER_STOP_ON_DESERIALIZATION_ERROR=${KAFKA_CONSUMER_STOP_ON_DESERIALIZATION_ERROR:-true}
- KAFKA_SCHEMAREGISTRY_URL=http://schema-registry:8081
- MAE_CONSUMER_ENABLED=true
- MCE_CONSUMER_ENABLED=true
diff --git a/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/kafka/ConsumerConfiguration.java b/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/kafka/ConsumerConfiguration.java
index b505674f2ed9c2..61b9d5c8167900 100644
--- a/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/kafka/ConsumerConfiguration.java
+++ b/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/kafka/ConsumerConfiguration.java
@@ -6,4 +6,5 @@
public class ConsumerConfiguration {
private int maxPartitionFetchBytes;
+ private boolean stopOnDeserializationError;
}
diff --git a/metadata-service/configuration/src/main/resources/application.yml b/metadata-service/configuration/src/main/resources/application.yml
index 0ea6b8712953e4..36498f7c45fea1 100644
--- a/metadata-service/configuration/src/main/resources/application.yml
+++ b/metadata-service/configuration/src/main/resources/application.yml
@@ -236,6 +236,7 @@ kafka:
maxRequestSize: ${KAFKA_PRODUCER_MAX_REQUEST_SIZE:5242880} # the max bytes sent by the producer, also see kafka-setup MAX_MESSAGE_BYTES for matching value
consumer:
maxPartitionFetchBytes: ${KAFKA_CONSUMER_MAX_PARTITION_FETCH_BYTES:5242880} # the max bytes consumed per partition
+    stopOnDeserializationError: ${KAFKA_CONSUMER_STOP_ON_DESERIALIZATION_ERROR:true} # Stops the kafka listener container on deserialization errors so the problem can be fixed before moving past the offset. If false, errors are logged and the consumer moves past the offset.
schemaRegistry:
type: ${SCHEMA_REGISTRY_TYPE:KAFKA} # INTERNAL or KAFKA or AWS_GLUE
url: ${KAFKA_SCHEMAREGISTRY_URL:http://localhost:8081}
diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/kafka/KafkaEventConsumerFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/kafka/KafkaEventConsumerFactory.java
index 2a6338ac15e93d..4c0308546d857f 100644
--- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/kafka/KafkaEventConsumerFactory.java
+++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/kafka/KafkaEventConsumerFactory.java
@@ -21,6 +21,11 @@
import org.springframework.kafka.config.ConcurrentKafkaListenerContainerFactory;
import org.springframework.kafka.config.KafkaListenerContainerFactory;
import org.springframework.kafka.core.DefaultKafkaConsumerFactory;
+import org.springframework.kafka.listener.CommonContainerStoppingErrorHandler;
+import org.springframework.kafka.listener.CommonDelegatingErrorHandler;
+import org.springframework.kafka.listener.DefaultErrorHandler;
+import org.springframework.kafka.support.serializer.DeserializationException;
+import org.springframework.kafka.support.serializer.ErrorHandlingDeserializer;
@Slf4j
@Configuration
@@ -66,8 +71,6 @@ private static Map buildCustomizedProperties(
SchemaRegistryConfig schemaRegistryConfig) {
KafkaProperties.Consumer consumerProps = baseKafkaProperties.getConsumer();
- // Specify (de)serializers for record keys and for record values.
- consumerProps.setKeyDeserializer(StringDeserializer.class);
// Records will be flushed every 10 seconds.
consumerProps.setEnableAutoCommit(true);
consumerProps.setAutoCommitInterval(Duration.ofSeconds(10));
@@ -81,7 +84,13 @@ private static Map buildCustomizedProperties(
    Map<String, Object> customizedProperties = baseKafkaProperties.buildConsumerProperties();
customizedProperties.put(
- ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, schemaRegistryConfig.getDeserializer());
+ ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, ErrorHandlingDeserializer.class);
+ customizedProperties.put(
+ ErrorHandlingDeserializer.KEY_DESERIALIZER_CLASS, StringDeserializer.class);
+ customizedProperties.put(
+ ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, ErrorHandlingDeserializer.class);
+ customizedProperties.put(
+ ErrorHandlingDeserializer.VALUE_DESERIALIZER_CLASS, schemaRegistryConfig.getDeserializer());
// Override KafkaProperties with SchemaRegistryConfig only for non-empty values
schemaRegistryConfig.getProperties().entrySet().stream()
@@ -98,7 +107,8 @@ private static Map buildCustomizedProperties(
@Bean(name = "kafkaEventConsumer")
  protected KafkaListenerContainerFactory<?> createInstance(
@Qualifier("kafkaConsumerFactory")
- DefaultKafkaConsumerFactory kafkaConsumerFactory) {
+ DefaultKafkaConsumerFactory kafkaConsumerFactory,
+ @Qualifier("configurationProvider") ConfigurationProvider configurationProvider) {
ConcurrentKafkaListenerContainerFactory factory =
new ConcurrentKafkaListenerContainerFactory<>();
@@ -106,6 +116,18 @@ protected KafkaListenerContainerFactory> createInstance(
factory.setContainerCustomizer(new ThreadPoolContainerCustomizer());
factory.setConcurrency(kafkaEventConsumerConcurrency);
+    /* Sets up a delegating error handler for deserialization errors. If disabled, the
+       DefaultErrorHandler is used (back-off retry, then log) rather than stopping the container.
+       Stopping the container prevents lost messages until the error can be examined; disabling
+       it allows progress past the bad offset, but may lose data. */
+ if (configurationProvider.getKafka().getConsumer().isStopOnDeserializationError()) {
+ CommonDelegatingErrorHandler delegatingErrorHandler =
+ new CommonDelegatingErrorHandler(new DefaultErrorHandler());
+ delegatingErrorHandler.addDelegate(
+ DeserializationException.class, new CommonContainerStoppingErrorHandler());
+ factory.setCommonErrorHandler(delegatingErrorHandler);
+ }
+
log.info(
String.format(
"Event-based KafkaListenerContainerFactory built successfully. Consumer concurrency = %s",
From b80d2f471c559cd31cedb47a79cf07e779b065b9 Mon Sep 17 00:00:00 2001
From: Harshal Sheth
Date: Thu, 21 Dec 2023 13:35:34 -0500
Subject: [PATCH 056/540] fix(ingest/fivetran): only materialize upstream
lineage (#9490)
---
.../ingestion/source/fivetran/fivetran.py | 19 +++++++----
.../integration/fivetran/fivetran_golden.json | 32 -------------------
2 files changed, 12 insertions(+), 39 deletions(-)
diff --git a/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran.py b/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran.py
index c0395b4e4e7963..12e362fa8a3e3f 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran.py
@@ -7,6 +7,7 @@
DataProcessInstance,
InstanceRunResult,
)
+from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.ingestion.api.common import PipelineContext
from datahub.ingestion.api.decorators import (
SourceCapability,
@@ -248,13 +249,17 @@ def _get_connector_workunits(
# Map Fivetran's connector entity with Datahub's datajob entity
datajob = self._generate_datajob_from_connector(connector)
- for mcp in datajob.generate_mcp(materialize_iolets=True):
- if mcp.entityType == "dataset" and isinstance(mcp.aspect, StatusClass):
- # While we "materialize" the referenced datasets, we don't want them
- # to be tracked by stateful ingestion.
- yield mcp.as_workunit(is_primary_source=False)
- else:
- yield mcp.as_workunit()
+ for mcp in datajob.generate_mcp(materialize_iolets=False):
+ yield mcp.as_workunit()
+
+ # Materialize the upstream referenced datasets.
+ # We assume that the downstreams are materialized by other ingestion sources.
+ for iolet in datajob.inlets:
+ # We don't want these to be tracked by stateful ingestion.
+ yield MetadataChangeProposalWrapper(
+ entityUrn=str(iolet),
+ aspect=StatusClass(removed=False),
+ ).as_workunit(is_primary_source=False)
# Map Fivetran's job/sync history entity with Datahub's data process entity
for job in connector.jobs:
diff --git a/metadata-ingestion/tests/integration/fivetran/fivetran_golden.json b/metadata-ingestion/tests/integration/fivetran/fivetran_golden.json
index a72c960a722969..b8f05fa6e93aad 100644
--- a/metadata-ingestion/tests/integration/fivetran/fivetran_golden.json
+++ b/metadata-ingestion/tests/integration/fivetran/fivetran_golden.json
@@ -178,38 +178,6 @@
"lastRunId": "no-run-id-provided"
}
},
-{
- "entityType": "dataset",
- "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_database.postgres_public.employee,PROD)",
- "changeType": "UPSERT",
- "aspectName": "status",
- "aspect": {
- "json": {
- "removed": false
- }
- },
- "systemMetadata": {
- "lastObserved": 1654621200000,
- "runId": "powerbi-test",
- "lastRunId": "no-run-id-provided"
- }
-},
-{
- "entityType": "dataset",
- "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_database.postgres_public.company,PROD)",
- "changeType": "UPSERT",
- "aspectName": "status",
- "aspect": {
- "json": {
- "removed": false
- }
- },
- "systemMetadata": {
- "lastObserved": 1654621200000,
- "runId": "powerbi-test",
- "lastRunId": "no-run-id-provided"
- }
-},
{
"entityType": "dataJob",
"entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(fivetran,calendar_elected,PROD),calendar_elected)",
From a18c72083d763b08282b67146881d4f918b257de Mon Sep 17 00:00:00 2001
From: Harshal Sheth
Date: Thu, 21 Dec 2023 13:50:39 -0500
Subject: [PATCH 057/540] feat(ingest): handle multiline string coercion
(#9484)
---
docs-website/download_historical_versions.py | 4 +-
docs/developers.md | 6 +--
.../src/datahub/configuration/git.py | 12 +----
.../validate_multiline_string.py | 31 ++++++++++++
.../ingestion/source/bigquery_v2/lineage.py | 2 +-
.../ingestion/source/looker/lookml_source.py | 7 ++-
.../source_config/usage/bigquery_usage.py | 3 ++
.../src/datahub/utilities/logging_manager.py | 1 +
.../unit/config/test_pydantic_validators.py | 50 +++++++++++++++----
9 files changed, 86 insertions(+), 30 deletions(-)
create mode 100644 metadata-ingestion/src/datahub/configuration/validate_multiline_string.py
diff --git a/docs-website/download_historical_versions.py b/docs-website/download_historical_versions.py
index 53ee9cf1e63ef5..7493210ffa2a5f 100644
--- a/docs-website/download_historical_versions.py
+++ b/docs-website/download_historical_versions.py
@@ -37,9 +37,9 @@ def fetch_urls(
except Exception as e:
if attempt < max_retries:
print(f"Attempt {attempt + 1}/{max_retries}: {e}")
- time.sleep(retry_delay)
+ time.sleep(retry_delay * 2**attempt)
else:
- print(f"Max retries reached. Unable to fetch data.")
+ print("Max retries reached. Unable to fetch data.")
raise
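
This also switches the retry loop from a fixed delay to exponential backoff: with a hypothetical `retry_delay` of 1 second, successive attempts wait 1s, 2s, 4s, and so on. A one-line sketch:

```python
retry_delay = 1  # hypothetical value for illustration
assert [retry_delay * 2**attempt for attempt in range(4)] == [1, 2, 4, 8]
```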
diff --git a/docs/developers.md b/docs/developers.md
index 60d31f5e4523f7..fe007a56ddc68f 100644
--- a/docs/developers.md
+++ b/docs/developers.md
@@ -17,10 +17,8 @@ title: "Local Development"
On macOS, these can be installed using [Homebrew](https://brew.sh/).
```shell
-# Install Java 8 and 11
-brew tap homebrew/cask-versions
-brew install java11
-brew install --cask zulu8
+# Install Java
+brew install openjdk@17
# Install Python
brew install python@3.10 # you may need to add this to your PATH
diff --git a/metadata-ingestion/src/datahub/configuration/git.py b/metadata-ingestion/src/datahub/configuration/git.py
index a5f88744661a4a..3c76c8da0d5717 100644
--- a/metadata-ingestion/src/datahub/configuration/git.py
+++ b/metadata-ingestion/src/datahub/configuration/git.py
@@ -1,4 +1,3 @@
-import os
import pathlib
from typing import Any, Dict, Optional, Union
@@ -6,6 +5,7 @@
from datahub.configuration.common import ConfigModel
from datahub.configuration.validate_field_rename import pydantic_renamed_field
+from datahub.configuration.validate_multiline_string import pydantic_multiline_string
_GITHUB_PREFIX = "https://github.com/"
_GITLAB_PREFIX = "https://gitlab.com/"
@@ -92,15 +92,7 @@ class GitInfo(GitReference):
description="The url to call `git clone` on. We infer this for github and gitlab repos, but it is required for other hosts.",
)
- @validator("deploy_key_file")
- def deploy_key_file_should_be_readable(
- cls, v: Optional[FilePath]
- ) -> Optional[FilePath]:
- if v is not None:
- # pydantic does existence checks, we just need to check if we can read it
- if not os.access(v, os.R_OK):
- raise ValueError(f"Unable to read deploy key file {v}")
- return v
+ _fix_deploy_key_newlines = pydantic_multiline_string("deploy_key")
@validator("deploy_key", pre=True, always=True)
def deploy_key_filled_from_deploy_key_file(
diff --git a/metadata-ingestion/src/datahub/configuration/validate_multiline_string.py b/metadata-ingestion/src/datahub/configuration/validate_multiline_string.py
new file mode 100644
index 00000000000000..0baaf4f0264b99
--- /dev/null
+++ b/metadata-ingestion/src/datahub/configuration/validate_multiline_string.py
@@ -0,0 +1,31 @@
+from typing import Optional, Type, Union
+
+import pydantic
+
+
+def pydantic_multiline_string(field: str) -> classmethod:
+ """If the field is present and contains an escaped newline, replace it with a real newline.
+
+ This makes the assumption that the field value is never supposed to have a
+ r"\n" in it, and instead should only have newline characters. This is generally
+ a safe assumption for SSH keys and similar.
+
+ The purpose of this helper is to make us more forgiving of small formatting issues
+ in recipes, without sacrificing correctness across the board.
+ """
+
+ def _validate_field(
+ cls: Type, v: Union[None, str, pydantic.SecretStr]
+ ) -> Optional[str]:
+ if v is not None:
+ if isinstance(v, pydantic.SecretStr):
+ v = v.get_secret_value()
+ v = v.replace(r"\n", "\n")
+
+ return v
+
+    # Hack: Pydantic maintains a unique list of validators by referring to their __name__.
+    # https://github.com/pydantic/pydantic/blob/v1.10.9/pydantic/main.py#L264
+    # This ensures that validators generated for multiple fields do not overwrite each other.
+ _validate_field.__name__ = f"{_validate_field.__name__}_{field}"
+ return pydantic.validator(field, pre=True, allow_reuse=True)(_validate_field)
diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py
index eddd08c92b808d..b44b06feb95af2 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py
@@ -175,7 +175,7 @@ def make_lineage_edges_from_parsing_result(
table_name = str(
BigQueryTableRef.from_bigquery_table(
BigqueryTableIdentifier.from_string_name(
- DatasetUrn.create_from_string(table_urn).get_dataset_name()
+ DatasetUrn.from_string(table_urn).name
)
)
)
diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py b/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py
index b76bef49a7e6f0..33079f3fd9ac17 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py
@@ -2060,10 +2060,9 @@ def get_internal_workunits(self) -> Iterable[MetadataWorkUnit]: # noqa: C901
)
logger.debug("Failed to process explore", exc_info=e)
- processed_view_files = processed_view_map.get(model.connection)
- if processed_view_files is None:
- processed_view_map[model.connection] = set()
- processed_view_files = processed_view_map[model.connection]
+ processed_view_files = processed_view_map.setdefault(
+ model.connection, set()
+ )
project_name = self.get_project_name(model_name)
logger.debug(f"Model: {model_name}; Includes: {model.resolved_includes}")
diff --git a/metadata-ingestion/src/datahub/ingestion/source_config/usage/bigquery_usage.py b/metadata-ingestion/src/datahub/ingestion/source_config/usage/bigquery_usage.py
index 5eb9c83236e4f9..13abe73cc4e098 100644
--- a/metadata-ingestion/src/datahub/ingestion/source_config/usage/bigquery_usage.py
+++ b/metadata-ingestion/src/datahub/ingestion/source_config/usage/bigquery_usage.py
@@ -11,6 +11,7 @@
from datahub.configuration.common import AllowDenyPattern, ConfigurationError
from datahub.configuration.source_common import EnvConfigMixin
from datahub.configuration.validate_field_removal import pydantic_removed_field
+from datahub.configuration.validate_multiline_string import pydantic_multiline_string
from datahub.ingestion.source.usage.usage_common import BaseUsageConfig
from datahub.ingestion.source_config.bigquery import BigQueryBaseConfig
@@ -44,6 +45,8 @@ class BigQueryCredential(ConfigModel):
description="If not set it will be default to https://www.googleapis.com/robot/v1/metadata/x509/client_email",
)
+ _fix_private_key_newlines = pydantic_multiline_string("private_key")
+
@pydantic.root_validator(skip_on_failure=True)
def validate_config(cls, values: Dict[str, Any]) -> Dict[str, Any]:
if values.get("client_x509_cert_url") is None:
diff --git a/metadata-ingestion/src/datahub/utilities/logging_manager.py b/metadata-ingestion/src/datahub/utilities/logging_manager.py
index a8eacb0a9938df..62aa1ca7ab7918 100644
--- a/metadata-ingestion/src/datahub/utilities/logging_manager.py
+++ b/metadata-ingestion/src/datahub/utilities/logging_manager.py
@@ -199,6 +199,7 @@ def configure_logging(debug: bool, log_file: Optional[str] = None) -> Iterator[N
for handler in handlers:
root_logger.removeHandler(handler)
for lib in DATAHUB_PACKAGES:
+ lib_logger = logging.getLogger(lib)
lib_logger.removeHandler(handler)
lib_logger.propagate = True
diff --git a/metadata-ingestion/tests/unit/config/test_pydantic_validators.py b/metadata-ingestion/tests/unit/config/test_pydantic_validators.py
index 399245736805cc..f687a2776f6e2d 100644
--- a/metadata-ingestion/tests/unit/config/test_pydantic_validators.py
+++ b/metadata-ingestion/tests/unit/config/test_pydantic_validators.py
@@ -1,12 +1,14 @@
from typing import Optional
+import pydantic
import pytest
from pydantic import ValidationError
-from datahub.configuration.common import ConfigModel
+from datahub.configuration.common import ConfigModel, ConfigurationWarning
from datahub.configuration.validate_field_deprecation import pydantic_field_deprecated
from datahub.configuration.validate_field_removal import pydantic_removed_field
from datahub.configuration.validate_field_rename import pydantic_renamed_field
+from datahub.configuration.validate_multiline_string import pydantic_multiline_string
from datahub.utilities.global_warning_util import (
clear_global_warnings,
get_global_warnings,
@@ -22,8 +24,9 @@ class TestModel(ConfigModel):
v = TestModel.parse_obj({"b": "original"})
assert v.b == "original"
- v = TestModel.parse_obj({"a": "renamed"})
- assert v.b == "renamed"
+ with pytest.warns(ConfigurationWarning, match="a is deprecated"):
+ v = TestModel.parse_obj({"a": "renamed"})
+ assert v.b == "renamed"
with pytest.raises(ValidationError):
TestModel.parse_obj({"a": "foo", "b": "bar"})
@@ -44,9 +47,10 @@ class TestModel(ConfigModel):
assert v.b == "original"
assert v.b1 == "original"
- v = TestModel.parse_obj({"a": "renamed", "a1": "renamed"})
- assert v.b == "renamed"
- assert v.b1 == "renamed"
+ with pytest.warns(ConfigurationWarning, match=r"a.* is deprecated"):
+ v = TestModel.parse_obj({"a": "renamed", "a1": "renamed"})
+ assert v.b == "renamed"
+ assert v.b1 == "renamed"
with pytest.raises(ValidationError):
TestModel.parse_obj({"a": "foo", "b": "bar", "b1": "ok"})
@@ -74,8 +78,9 @@ class TestModel(ConfigModel):
v = TestModel.parse_obj({"b": "original"})
assert v.b == "original"
- v = TestModel.parse_obj({"b": "original", "r1": "removed", "r2": "removed"})
- assert v.b == "original"
+ with pytest.warns(ConfigurationWarning, match=r"r\d was removed"):
+ v = TestModel.parse_obj({"b": "original", "r1": "removed", "r2": "removed"})
+ assert v.b == "original"
def test_field_deprecated():
@@ -92,7 +97,10 @@ class TestModel(ConfigModel):
v = TestModel.parse_obj({"b": "original"})
assert v.b == "original"
- v = TestModel.parse_obj({"b": "original", "d1": "deprecated", "d2": "deprecated"})
+ with pytest.warns(ConfigurationWarning, match=r"d\d.+ deprecated"):
+ v = TestModel.parse_obj(
+ {"b": "original", "d1": "deprecated", "d2": "deprecated"}
+ )
assert v.b == "original"
assert v.d1 == "deprecated"
assert v.d2 == "deprecated"
@@ -100,3 +108,27 @@ class TestModel(ConfigModel):
assert any(["d2 is deprecated" in warning for warning in get_global_warnings()])
clear_global_warnings()
+
+
+def test_multiline_string_fixer():
+ class TestModel(ConfigModel):
+ s: str
+ m: Optional[pydantic.SecretStr] = None
+
+ _validate_s = pydantic_multiline_string("s")
+ _validate_m = pydantic_multiline_string("m")
+
+ v = TestModel.parse_obj({"s": "foo\nbar"})
+ assert v.s == "foo\nbar"
+
+ v = TestModel.parse_obj({"s": "foo\\nbar"})
+ assert v.s == "foo\nbar"
+
+ v = TestModel.parse_obj({"s": "normal", "m": "foo\\nbar"})
+ assert v.s == "normal"
+ assert v.m
+ assert v.m.get_secret_value() == "foo\nbar"
+
+ v = TestModel.parse_obj({"s": "normal", "m": pydantic.SecretStr("foo\\nbar")})
+ assert v.m
+ assert v.m.get_secret_value() == "foo\nbar"
From cfc641f0d03408b85ae75c2e4830c5f307ce6a68 Mon Sep 17 00:00:00 2001
From: Tamas Nemeth
Date: Thu, 21 Dec 2023 20:32:51 +0100
Subject: [PATCH 058/540] fix(ingest/databricks): Pinning databricks sdk to not
fail on mypy issues (#9500)
---
metadata-ingestion/setup.py | 4 +++-
.../src/datahub/ingestion/source/aws/aws_common.py | 2 +-
2 files changed, 4 insertions(+), 2 deletions(-)
diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py
index 13c9d3c99aaca1..0dcac7a7fc1b41 100644
--- a/metadata-ingestion/setup.py
+++ b/metadata-ingestion/setup.py
@@ -274,7 +274,9 @@
databricks = {
# 0.1.11 appears to have authentication issues with azure databricks
- "databricks-sdk>=0.9.0",
+ # 0.16.0 added py.typed support which caused mypy to fail. The databricks sdk is pinned until we resolve mypy issues.
+ # https://github.com/databricks/databricks-sdk-py/pull/483
+ "databricks-sdk>=0.9.0,<0.16.0",
"pyspark~=3.3.0",
"requests",
# Version 2.4.0 includes sqlalchemy dialect, 2.8.0 includes some bug fixes
diff --git a/metadata-ingestion/src/datahub/ingestion/source/aws/aws_common.py b/metadata-ingestion/src/datahub/ingestion/source/aws/aws_common.py
index 0fb211a5d7b162..421991a0966c3a 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/aws/aws_common.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/aws/aws_common.py
@@ -167,7 +167,7 @@ def get_session(self) -> Session:
return session
- def get_credentials(self) -> Dict[str, str]:
+ def get_credentials(self) -> Dict[str, Optional[str]]:
credentials = self.get_session().get_credentials()
if credentials is not None:
return {
From ca518d6c78d994d59879b29f5afa8ffd1cff56df Mon Sep 17 00:00:00 2001
From: Harshal Sheth
Date: Thu, 21 Dec 2023 20:28:45 -0500
Subject: [PATCH 059/540] feat(ingest): remove librdkafka hacks (#9507)
---
docker/datahub-ingestion-base/Dockerfile | 9 -----
.../base-requirements.txt | 2 +-
metadata-ingestion/developing.md | 3 +-
.../scripts/datahub_preflight.sh | 19 +++-------
metadata-ingestion/scripts/install_deps.sh | 5 ++-
metadata-ingestion/setup.py | 35 ++++---------------
6 files changed, 14 insertions(+), 59 deletions(-)
diff --git a/docker/datahub-ingestion-base/Dockerfile b/docker/datahub-ingestion-base/Dockerfile
index 81fec61ea50733..558a5afe2c2cf9 100644
--- a/docker/datahub-ingestion-base/Dockerfile
+++ b/docker/datahub-ingestion-base/Dockerfile
@@ -30,9 +30,6 @@ ARG DEBIAN_REPO_URL
ARG PIP_MIRROR_URL
ARG GITHUB_REPO_URL
-ENV LIBRDKAFKA_VERSION=1.6.2
-ENV CONFLUENT_KAFKA_VERSION=1.6.1
-
ENV DEBIAN_FRONTEND noninteractive
# Optionally set corporate mirror for apk and pip
@@ -40,7 +37,6 @@ RUN if [ "${DEBIAN_REPO_URL}" != "http://deb.debian.org/debian" ] ; then sed -i
RUN if [ "${PIP_MIRROR_URL}" != "null" ] ; then pip config set global.index-url ${PIP_MIRROR_URL} ; fi
RUN apt-get update && apt-get install -y -qq \
- make \
python3-ldap \
libldap2-dev \
libsasl2-dev \
@@ -53,11 +49,6 @@ RUN apt-get update && apt-get install -y -qq \
unzip \
ldap-utils \
&& python -m pip install --no-cache --upgrade pip wheel setuptools \
- && wget -q ${GITHUB_REPO_URL}/edenhill/librdkafka/archive/v${LIBRDKAFKA_VERSION}.tar.gz -O - | \
- tar -xz -C /root \
- && cd /root/librdkafka-${LIBRDKAFKA_VERSION} \
- && ./configure --prefix /usr && make && make install && cd .. && rm -rf /root/librdkafka-${LIBRDKAFKA_VERSION} \
- && apt-get remove -y make \
&& rm -rf /var/lib/apt/lists/* /var/cache/apk/*
# compiled against newer golang for security fixes
diff --git a/docker/datahub-ingestion-base/base-requirements.txt b/docker/datahub-ingestion-base/base-requirements.txt
index eb082d50b3020f..141382466ab9f6 100644
--- a/docker/datahub-ingestion-base/base-requirements.txt
+++ b/docker/datahub-ingestion-base/base-requirements.txt
@@ -65,7 +65,7 @@ colorlog==4.8.0
comm==0.1.4
confection==0.1.3
ConfigUpdater==3.1.1
-confluent-kafka==1.8.2
+confluent-kafka==2.3.0
connexion==2.14.2
cron-descriptor==1.4.0
croniter==2.0.1
diff --git a/metadata-ingestion/developing.md b/metadata-ingestion/developing.md
index d5f834936cdcfc..d1eef21974f1df 100644
--- a/metadata-ingestion/developing.md
+++ b/metadata-ingestion/developing.md
@@ -11,8 +11,7 @@ Also take a look at the guide to [adding a source](./adding-source.md).
1. Python 3.7+ must be installed in your host environment.
2. Java8 (gradle won't work with newer versions)
-3. On MacOS: `brew install librdkafka`
-4. On Debian/Ubuntu: `sudo apt install librdkafka-dev python3-dev python3-venv`
+4. On Debian/Ubuntu: `sudo apt install python3-dev python3-venv`
5. On Fedora (if using LDAP source integration): `sudo yum install openldap-devel`
### Set up your Python environment
diff --git a/metadata-ingestion/scripts/datahub_preflight.sh b/metadata-ingestion/scripts/datahub_preflight.sh
index e82be9d7b27b71..9676964f4d49d1 100755
--- a/metadata-ingestion/scripts/datahub_preflight.sh
+++ b/metadata-ingestion/scripts/datahub_preflight.sh
@@ -45,8 +45,6 @@ arm64_darwin_preflight() {
pip3 install --no-use-pep517 scipy
fi
- printf "✨ Setting up librdkafka prerequisities\n"
- brew_install "librdkafka" "1.9.1"
brew_install "openssl@1.1"
brew install "postgresql@14"
@@ -69,25 +67,16 @@ arm64_darwin_preflight() {
export GRPC_PYTHON_BUILD_SYSTEM_OPENSSL
GRPC_PYTHON_BUILD_SYSTEM_ZLIB=1
export GRPC_PYTHON_BUILD_SYSTEM_ZLIB
- CPPFLAGS="-I$(brew --prefix openssl@1.1)/include -I$(brew --prefix librdkafka)/include"
+ CPPFLAGS="-I$(brew --prefix openssl@1.1)/include"
export CPPFLAGS
- LDFLAGS="-L$(brew --prefix openssl@1.1)/lib -L$(brew --prefix librdkafka)/lib"
+ LDFLAGS="-L$(brew --prefix openssl@1.1)/lib"
export LDFLAGS
- CPATH="$(brew --prefix librdkafka)/include"
- export CPATH
- C_INCLUDE_PATH="$(brew --prefix librdkafka)/include"
- export C_INCLUDE_PATH
- LIBRARY_PATH="$(brew --prefix librdkafka)/lib"
- export LIBRARY_PATH
cat << EOF
export GRPC_PYTHON_BUILD_SYSTEM_OPENSSL=1
export GRPC_PYTHON_BUILD_SYSTEM_ZLIB=1
- export CPPFLAGS="-I$(brew --prefix openssl@1.1)/include -I$(brew --prefix librdkafka)/include"
- export LDFLAGS="-L$(brew --prefix openssl@1.1)/lib -L$(brew --prefix librdkafka)/lib -L$(brew --prefix postgresql@14)/lib/postgresql@14"
- export CPATH="$(brew --prefix librdkafka)/include"
- export C_INCLUDE_PATH="$(brew --prefix librdkafka)/include"
- export LIBRARY_PATH="$(brew --prefix librdkafka)/lib"
+ export CPPFLAGS="-I$(brew --prefix openssl@1.1)/include"
+ export LDFLAGS="-L$(brew --prefix openssl@1.1)/lib -L$(brew --prefix postgresql@14)/lib/postgresql@14"
EOF
diff --git a/metadata-ingestion/scripts/install_deps.sh b/metadata-ingestion/scripts/install_deps.sh
index 7e6b6956d8bb84..bae0278056ebbd 100755
--- a/metadata-ingestion/scripts/install_deps.sh
+++ b/metadata-ingestion/scripts/install_deps.sh
@@ -2,7 +2,8 @@
set -euxo pipefail
if [ "$(uname)" == "Darwin" ]; then
- brew install librdkafka
+ # None
+ true
else
sudo_cmd=""
if command -v sudo; then
@@ -11,7 +12,6 @@ else
if command -v yum; then
$sudo_cmd yum install -y \
- librdkafka-devel \
openldap-devel \
cyrus-sasl-devel \
openldap-clients \
@@ -21,7 +21,6 @@ else
libxslt-devel
else
$sudo_cmd apt-get update && $sudo_cmd apt-get install -y \
- librdkafka-dev \
python3-ldap \
libldap2-dev \
libsasl2-dev \
diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py
index 0dcac7a7fc1b41..c834700388d627 100644
--- a/metadata-ingestion/setup.py
+++ b/metadata-ingestion/setup.py
@@ -69,35 +69,12 @@
rest_common = {"requests", "requests_file"}
kafka_common = {
- # The confluent_kafka package provides a number of pre-built wheels for
- # various platforms and architectures. However, it does not provide wheels
- # for arm64 (including M1 Macs) or aarch64 (Docker's linux/arm64). This has
- # remained an open issue on the confluent_kafka project for a year:
- # - https://github.com/confluentinc/confluent-kafka-python/issues/1182
- # - https://github.com/confluentinc/confluent-kafka-python/pull/1161
- #
- # When a wheel is not available, we must build from source instead.
- # Building from source requires librdkafka to be installed.
- # Most platforms have an easy way to install librdkafka:
- # - MacOS: `brew install librdkafka` gives latest, which is 1.9.x or newer.
- # - Debian: `apt install librdkafka` gives 1.6.0 (https://packages.debian.org/bullseye/librdkafka-dev).
- # - Ubuntu: `apt install librdkafka` gives 1.8.0 (https://launchpad.net/ubuntu/+source/librdkafka).
- #
- # Moreover, confluent_kafka 1.9.0 introduced a hard compatibility break, and
- # requires librdkafka >=1.9.0. As such, installing confluent_kafka 1.9.x on
- # most arm64 Linux machines will fail, since it will build from source but then
- # fail because librdkafka is too old. Hence, we have added an extra requirement
- # that requires confluent_kafka<1.9.0 on non-MacOS arm64/aarch64 machines, which
- # should ideally allow the builds to succeed in default conditions. We still
- # want to allow confluent_kafka >= 1.9.0 for M1 Macs, which is why we can't
- # broadly restrict confluent_kafka to <1.9.0.
- #
- # Note that this is somewhat of a hack, since we don't actually require the
- # older version of confluent_kafka on those machines. Additionally, we will
- # need monitor the Debian/Ubuntu PPAs and modify this rule if they start to
- # support librdkafka >= 1.9.0.
- "confluent_kafka>=1.5.0",
- 'confluent_kafka<1.9.0; platform_system != "Darwin" and (platform_machine == "aarch64" or platform_machine == "arm64")',
+ # Note that confluent_kafka 1.9.0 introduced a hard compatibility break, and
+ # requires librdkafka >=1.9.0. This is generally not an issue, since they
+ # now provide prebuilt wheels for most platforms, including M1 Macs and
+ # Linux aarch64 (e.g. Docker's linux/arm64). Installing confluent_kafka
+ # from source remains a pain.
+ "confluent_kafka>=1.9.0",
# We currently require both Avro libraries. The codegen uses avro-python3 (above)
# schema parsers at runtime for generating and reading JSON into Python objects.
# At the same time, we use Kafka's AvroSerializer, which internally relies on
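The new comment above relies on confluent-kafka shipping prebuilt wheels that bundle librdkafka. As a quick sanity check, a hedged sketch (assuming confluent-kafka is installed in the active environment) can report both the client version and the bundled librdkafka version:

```python
# Minimal sketch: confirm which confluent-kafka client is installed and which
# librdkafka it bundles. version() and libversion() are public confluent_kafka helpers.
import confluent_kafka

client_version, _ = confluent_kafka.version()          # e.g. "2.3.0"
librdkafka_version, _ = confluent_kafka.libversion()   # bundled librdkafka, e.g. "2.3.0"
print(f"confluent-kafka {client_version} / librdkafka {librdkafka_version}")
```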
From be329986ab4b177899d16990fec31597ae765c58 Mon Sep 17 00:00:00 2001
From: Harshal Sheth
Date: Thu, 21 Dec 2023 20:30:36 -0500
Subject: [PATCH 060/540] feat(ingest): rename custom package path from models
to metadata (#9502)
---
docs/modeling/extending-the-metadata-model.md | 8 ++++++++
metadata-ingestion/scripts/custom_package_codegen.py | 10 +++++-----
metadata-ingestion/src/datahub/telemetry/telemetry.py | 5 +++++
3 files changed, 18 insertions(+), 5 deletions(-)
diff --git a/docs/modeling/extending-the-metadata-model.md b/docs/modeling/extending-the-metadata-model.md
index dc4edd3306f95c..8b308fb65d243c 100644
--- a/docs/modeling/extending-the-metadata-model.md
+++ b/docs/modeling/extending-the-metadata-model.md
@@ -289,6 +289,14 @@ Alternatively, publish it to PyPI with `twine upload custom-package/my-company-d
This will generate some Python build artifacts, which you can distribute within your team or publish to PyPI.
The command output contains additional details and exact CLI commands you can use.
+Once this package is installed, you can use the DataHub CLI as normal, and it will use your custom models.
+You'll also be able to import those models, with IDE support, by changing your imports.
+
+```diff
+- from datahub.metadata.schema_classes import DatasetPropertiesClass
++ from my_company_datahub_models.metadata.schema_classes import DatasetPropertiesClass
+```
+
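To make the renamed import path concrete, here is a hedged sketch of using a generated custom package; the package name my_company_datahub_models mirrors the example in the docs above, and DatasetPropertiesClass is one of the standard generated classes:

```python
# Hedged sketch: after installing the generated package (e.g. the
# my-company-datahub-models example above), the generated classes live under
# <package>.metadata instead of <package>.models.
from my_company_datahub_models.metadata.schema_classes import DatasetPropertiesClass

props = DatasetPropertiesClass(description="Table maintained by the data platform team")
print(props.description)
```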
diff --git a/metadata-ingestion/scripts/custom_package_codegen.py b/metadata-ingestion/scripts/custom_package_codegen.py
index 3f59fdf2cc548d..714728087d4b6d 100644
--- a/metadata-ingestion/scripts/custom_package_codegen.py
+++ b/metadata-ingestion/scripts/custom_package_codegen.py
@@ -62,7 +62,7 @@ def generate(
entity_registry=entity_registry,
pdl_path=pdl_path,
schemas_path=schemas_path,
- outdir=str(src_path / "models"),
+ outdir=str(src_path / "metadata"),
enable_custom_loader=False,
)
@@ -91,13 +91,13 @@ def generate(
],
package_data={{
"{python_package_name}": ["py.typed"],
- "{python_package_name}.models": ["schema.avsc"],
- "{python_package_name}.models.schemas": ["*.avsc"],
+ "{python_package_name}.metadata": ["schema.avsc"],
+ "{python_package_name}.metadata.schemas": ["*.avsc"],
}},
entry_points={{
"datahub.custom_packages": [
- "models={python_package_name}.models.schema_classes",
- "urns={python_package_name}.models._urns.urn_defs",
+ "models={python_package_name}.metadata.schema_classes",
+ "urns={python_package_name}.metadata._urns.urn_defs",
],
}},
)
diff --git a/metadata-ingestion/src/datahub/telemetry/telemetry.py b/metadata-ingestion/src/datahub/telemetry/telemetry.py
index 615be00d5455f5..c399f2e1a27e55 100644
--- a/metadata-ingestion/src/datahub/telemetry/telemetry.py
+++ b/metadata-ingestion/src/datahub/telemetry/telemetry.py
@@ -16,6 +16,7 @@
from datahub.cli.cli_utils import DATAHUB_ROOT_FOLDER, get_boolean_env_variable
from datahub.configuration.common import ExceptionWithProps
from datahub.ingestion.graph.client import DataHubGraph
+from datahub.metadata.schema_classes import _custom_package_path
from datahub.utilities.perf_timer import PerfTimer
logger = logging.getLogger(__name__)
@@ -89,6 +90,10 @@
if any(var in os.environ for var in CI_ENV_VARS):
ENV_ENABLED = False
+# Also disable if a custom metadata model package is in use.
+if _custom_package_path:
+ ENV_ENABLED = False
+
TIMEOUT = int(os.environ.get("DATAHUB_TELEMETRY_TIMEOUT", "10"))
MIXPANEL_ENDPOINT = "track.datahubproject.io/mp"
MIXPANEL_TOKEN = "5ee83d940754d63cacbf7d34daa6f44a"
From 4fe1df6892a7e45fe59a26990b441a67dd4faf93 Mon Sep 17 00:00:00 2001
From: kushagra-apptware <81357546+kushagra-apptware@users.noreply.github.com>
Date: Fri, 22 Dec 2023 11:57:24 +0530
Subject: [PATCH 061/540] feat(ui): edit link option (#9498)
---
.../Documentation/components/LinkList.tsx | 119 ++++++++++++++++--
1 file changed, 110 insertions(+), 9 deletions(-)
diff --git a/datahub-web-react/src/app/entity/shared/tabs/Documentation/components/LinkList.tsx b/datahub-web-react/src/app/entity/shared/tabs/Documentation/components/LinkList.tsx
index bcce994c3f0f80..1b5c3d54009da8 100644
--- a/datahub-web-react/src/app/entity/shared/tabs/Documentation/components/LinkList.tsx
+++ b/datahub-web-react/src/app/entity/shared/tabs/Documentation/components/LinkList.tsx
@@ -1,14 +1,15 @@
-import React from 'react';
+import React, { useState } from 'react';
import { Link } from 'react-router-dom';
import styled from 'styled-components/macro';
-import { message, Button, List, Typography } from 'antd';
-import { LinkOutlined, DeleteOutlined } from '@ant-design/icons';
+import { message, Button, List, Typography, Modal, Form, Input } from 'antd';
+import { LinkOutlined, DeleteOutlined, EditOutlined } from '@ant-design/icons';
import { EntityType, InstitutionalMemoryMetadata } from '../../../../../../types.generated';
-import { useEntityData } from '../../../EntityContext';
+import { useEntityData, useMutationUrn } from '../../../EntityContext';
import { useEntityRegistry } from '../../../../../useEntityRegistry';
import { ANTD_GRAY } from '../../../constants';
import { formatDateString } from '../../../containers/profile/utils';
-import { useRemoveLinkMutation } from '../../../../../../graphql/mutations.generated';
+import { useAddLinkMutation, useRemoveLinkMutation } from '../../../../../../graphql/mutations.generated';
+import analytics, { EntityActionType, EventType } from '../../../../../analytics';
const LinkListItem = styled(List.Item)`
border-radius: 5px;
@@ -33,10 +34,15 @@ type LinkListProps = {
};
export const LinkList = ({ refetch }: LinkListProps) => {
- const { urn: entityUrn, entityData } = useEntityData();
+ const [editModalVisble, setEditModalVisible] = useState(false);
+ const [linkDetails, setLinkDetails] = useState(undefined);
+ const { urn: entityUrn, entityData, entityType } = useEntityData();
const entityRegistry = useEntityRegistry();
const [removeLinkMutation] = useRemoveLinkMutation();
const links = entityData?.institutionalMemory?.elements || [];
+ const [form] = Form.useForm();
+ const [addLinkMutation] = useAddLinkMutation();
+ const mutationUrn = useMutationUrn();
const handleDeleteLink = async (metadata: InstitutionalMemoryMetadata) => {
try {
@@ -53,8 +59,98 @@ export const LinkList = ({ refetch }: LinkListProps) => {
refetch?.();
};
+ const handleEditLink = (metadata: InstitutionalMemoryMetadata) => {
+ form.setFieldsValue({
+ url: metadata.url,
+ label: metadata.description,
+ });
+ setLinkDetails(metadata);
+ setEditModalVisible(true);
+ };
+
+ const handleClose = () => {
+ form.resetFields();
+ setEditModalVisible(false);
+ };
+
+ const handleEdit = async (formData: any) => {
+ if (!linkDetails) return;
+ try {
+ await removeLinkMutation({
+ variables: { input: { linkUrl: linkDetails.url, resourceUrn: linkDetails.associatedUrn || entityUrn } },
+ });
+ await addLinkMutation({
+ variables: { input: { linkUrl: formData.url, label: formData.label, resourceUrn: mutationUrn } },
+ });
+
+ message.success({ content: 'Link Updated', duration: 2 });
+
+ analytics.event({
+ type: EventType.EntityActionEvent,
+ entityType,
+ entityUrn: mutationUrn,
+ actionType: EntityActionType.UpdateLinks,
+ });
+
+ refetch?.();
+ handleClose();
+ } catch (e: unknown) {
+ message.destroy();
+
+ if (e instanceof Error) {
+ message.error({ content: `Error updating link: \n ${e.message || ''}`, duration: 2 });
+ }
+ }
+ };
+
return entityData ? (
<>
+
+ Cancel
+ ,
+
+ Edit
+ ,
+ ]}
+ >
+
+
+
+
+
+
+
+
{links.length > 0 && (
{
renderItem={(link) => (
- <Button onClick={() => handleDeleteLink(link)} type="text" shape="circle" danger>
- <DeleteOutlined />
- </Button>
+ <>
+ <Button onClick={() => handleEditLink(link)} type="text" shape="circle" danger>
+ <EditOutlined />
+ </Button>
+ <Button onClick={() => handleDeleteLink(link)} type="text" shape="circle" danger>
+ <DeleteOutlined />
+ </Button>
+ </>
}
>
Date: Fri, 22 Dec 2023 02:18:22 -0500
Subject: [PATCH 062/540] feat(ingest): support CLL for redshift materialized
views with auto refresh (#9508)
---
metadata-ingestion/setup.py | 2 +-
.../src/datahub/utilities/sqlglot_lineage.py | 122 ++++++++++++------
...dshift_materialized_view_auto_refresh.json | 54 ++++++++
.../tests/unit/sql_parsing/test_sql_detach.py | 46 +++++++
.../unit/sql_parsing/test_sqlglot_lineage.py | 72 ++++-------
5 files changed, 207 insertions(+), 89 deletions(-)
create mode 100644 metadata-ingestion/tests/unit/sql_parsing/goldens/test_redshift_materialized_view_auto_refresh.json
create mode 100644 metadata-ingestion/tests/unit/sql_parsing/test_sql_detach.py
diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py
index c834700388d627..4632c20cd3b969 100644
--- a/metadata-ingestion/setup.py
+++ b/metadata-ingestion/setup.py
@@ -98,7 +98,7 @@
sqlglot_lib = {
# Using an Acryl fork of sqlglot.
# https://github.com/tobymao/sqlglot/compare/main...hsheth2:sqlglot:hsheth?expand=1
- "acryl-sqlglot==19.0.2.dev10",
+ "acryl-sqlglot==20.4.1.dev14",
}
sql_common = (
diff --git a/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py b/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py
index fc3efef2ba5322..f84b3f8b94a2e0 100644
--- a/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py
+++ b/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py
@@ -5,7 +5,7 @@
import logging
import pathlib
from collections import defaultdict
-from typing import Any, Dict, List, Optional, Set, Tuple, Union
+from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Union
import pydantic.dataclasses
import sqlglot
@@ -60,6 +60,8 @@
),
)
)
+# Quick check that the rules were loaded correctly.
+assert 0 < len(RULES_BEFORE_TYPE_ANNOTATION) < len(sqlglot.optimizer.optimizer.RULES)
class GraphQLSchemaField(TypedDict):
@@ -150,12 +152,16 @@ class _TableName(_FrozenModel):
def as_sqlglot_table(self) -> sqlglot.exp.Table:
return sqlglot.exp.Table(
- catalog=self.database, db=self.db_schema, this=self.table
+ catalog=sqlglot.exp.Identifier(this=self.database)
+ if self.database
+ else None,
+ db=sqlglot.exp.Identifier(this=self.db_schema) if self.db_schema else None,
+ this=sqlglot.exp.Identifier(this=self.table),
)
def qualified(
self,
- dialect: str,
+ dialect: sqlglot.Dialect,
default_db: Optional[str] = None,
default_schema: Optional[str] = None,
) -> "_TableName":
@@ -271,7 +277,9 @@ def make_from_error(cls, error: Exception) -> "SqlParsingResult":
)
-def _parse_statement(sql: sqlglot.exp.ExpOrStr, dialect: str) -> sqlglot.Expression:
+def _parse_statement(
+ sql: sqlglot.exp.ExpOrStr, dialect: sqlglot.Dialect
+) -> sqlglot.Expression:
statement: sqlglot.Expression = sqlglot.maybe_parse(
sql, dialect=dialect, error_level=sqlglot.ErrorLevel.RAISE
)
@@ -279,8 +287,7 @@ def _parse_statement(sql: sqlglot.exp.ExpOrStr, dialect: str) -> sqlglot.Express
def _table_level_lineage(
- statement: sqlglot.Expression,
- dialect: str,
+ statement: sqlglot.Expression, dialect: sqlglot.Dialect
) -> Tuple[Set[_TableName], Set[_TableName]]:
# Generate table-level lineage.
modified = {
@@ -482,6 +489,26 @@ def close(self) -> None:
]
_SupportedColumnLineageTypesTuple = (sqlglot.exp.Subqueryable, sqlglot.exp.DerivedTable)
+DIALECTS_WITH_CASE_INSENSITIVE_COLS = {
+ # Column identifiers are case-insensitive in BigQuery, so we need to
+ # do a normalization step beforehand to make sure it's resolved correctly.
+ "bigquery",
+ # Our snowflake source lowercases column identifiers, so we are forced
+ # to do fuzzy (case-insensitive) resolution instead of exact resolution.
+ "snowflake",
+ # Teradata column names are case-insensitive.
+ # A name, even when enclosed in double quotation marks, is not case sensitive. For example, CUSTOMER and Customer are the same.
+ # See more below:
+ # https://documentation.sas.com/doc/en/pgmsascdc/9.4_3.5/acreldb/n0ejgx4895bofnn14rlguktfx5r3.htm
+ "teradata",
+}
+DIALECTS_WITH_DEFAULT_UPPERCASE_COLS = {
+ # In some dialects, column identifiers are effectively case insensitive
+ # because they are automatically converted to uppercase. Most other systems
+ # automatically lowercase unquoted identifiers.
+ "snowflake",
+}
+
class UnsupportedStatementTypeError(TypeError):
pass
@@ -495,8 +522,8 @@ class SqlUnderstandingError(Exception):
# TODO: Break this up into smaller functions.
def _column_level_lineage( # noqa: C901
statement: sqlglot.exp.Expression,
- dialect: str,
- input_tables: Dict[_TableName, SchemaInfo],
+ dialect: sqlglot.Dialect,
+ table_schemas: Dict[_TableName, SchemaInfo],
output_table: Optional[_TableName],
default_db: Optional[str],
default_schema: Optional[str],
@@ -515,19 +542,9 @@ def _column_level_lineage( # noqa: C901
column_lineage: List[_ColumnLineageInfo] = []
- use_case_insensitive_cols = dialect in {
- # Column identifiers are case-insensitive in BigQuery, so we need to
- # do a normalization step beforehand to make sure it's resolved correctly.
- "bigquery",
- # Our snowflake source lowercases column identifiers, so we are forced
- # to do fuzzy (case-insensitive) resolution instead of exact resolution.
- "snowflake",
- # Teradata column names are case-insensitive.
- # A name, even when enclosed in double quotation marks, is not case sensitive. For example, CUSTOMER and Customer are the same.
- # See more below:
- # https://documentation.sas.com/doc/en/pgmsascdc/9.4_3.5/acreldb/n0ejgx4895bofnn14rlguktfx5r3.htm
- "teradata",
- }
+ use_case_insensitive_cols = _is_dialect_instance(
+ dialect, DIALECTS_WITH_CASE_INSENSITIVE_COLS
+ )
sqlglot_db_schema = sqlglot.MappingSchema(
dialect=dialect,
@@ -537,14 +554,16 @@ def _column_level_lineage( # noqa: C901
table_schema_normalized_mapping: Dict[_TableName, Dict[str, str]] = defaultdict(
dict
)
- for table, table_schema in input_tables.items():
+ for table, table_schema in table_schemas.items():
normalized_table_schema: SchemaInfo = {}
for col, col_type in table_schema.items():
if use_case_insensitive_cols:
col_normalized = (
# This is required to match Sqlglot's behavior.
col.upper()
- if dialect in {"snowflake"}
+ if _is_dialect_instance(
+ dialect, DIALECTS_WITH_DEFAULT_UPPERCASE_COLS
+ )
else col.lower()
)
else:
@@ -561,7 +580,7 @@ def _column_level_lineage( # noqa: C901
if use_case_insensitive_cols:
def _sqlglot_force_column_normalizer(
- node: sqlglot.exp.Expression, dialect: "sqlglot.DialectType" = None
+ node: sqlglot.exp.Expression,
) -> sqlglot.exp.Expression:
if isinstance(node, sqlglot.exp.Column):
node.this.set("quoted", False)
@@ -572,9 +591,7 @@ def _sqlglot_force_column_normalizer(
# "Prior to case normalization sql %s",
# statement.sql(pretty=True, dialect=dialect),
# )
- statement = statement.transform(
- _sqlglot_force_column_normalizer, dialect, copy=False
- )
+ statement = statement.transform(_sqlglot_force_column_normalizer, copy=False)
# logger.debug(
# "Sql after casing normalization %s",
# statement.sql(pretty=True, dialect=dialect),
@@ -595,7 +612,8 @@ def _schema_aware_fuzzy_column_resolve(
# Optimize the statement + qualify column references.
logger.debug(
- "Prior to qualification sql %s", statement.sql(pretty=True, dialect=dialect)
+ "Prior to column qualification sql %s",
+ statement.sql(pretty=True, dialect=dialect),
)
try:
# Second time running qualify, this time with:
@@ -678,7 +696,7 @@ def _schema_aware_fuzzy_column_resolve(
# Otherwise, we can't process it.
continue
- if dialect == "bigquery" and output_col.lower() in {
+ if _is_dialect_instance(dialect, "bigquery") and output_col.lower() in {
"_partitiontime",
"_partitiondate",
}:
@@ -923,7 +941,7 @@ def _translate_sqlglot_type(
def _translate_internal_column_lineage(
table_name_urn_mapping: Dict[_TableName, str],
raw_column_lineage: _ColumnLineageInfo,
- dialect: str,
+ dialect: sqlglot.Dialect,
) -> ColumnLineageInfo:
downstream_urn = None
if raw_column_lineage.downstream.table:
@@ -956,18 +974,44 @@ def _translate_internal_column_lineage(
)
-def _get_dialect(platform: str) -> str:
+def _get_dialect_str(platform: str) -> str:
# TODO: convert datahub platform names to sqlglot dialect
if platform == "presto-on-hive":
return "hive"
- if platform == "mssql":
+ elif platform == "mssql":
return "tsql"
- if platform == "athena":
+ elif platform == "athena":
return "trino"
+ elif platform == "mysql":
+ # In sqlglot v20+, MySQL is now case-sensitive by default, which is the
+ # default behavior on Linux. However, MySQL's default case sensitivity
+ # actually depends on the underlying OS.
+ # For us, it's simpler to just assume that it's case-insensitive, and
+ # let the fuzzy resolution logic handle it.
+ return "mysql, normalization_strategy = lowercase"
else:
return platform
+def _get_dialect(platform: str) -> sqlglot.Dialect:
+ return sqlglot.Dialect.get_or_raise(_get_dialect_str(platform))
+
+
+def _is_dialect_instance(
+ dialect: sqlglot.Dialect, platforms: Union[str, Iterable[str]]
+) -> bool:
+ if isinstance(platforms, str):
+ platforms = [platforms]
+ else:
+ platforms = list(platforms)
+
+ dialects = [sqlglot.Dialect.get_or_raise(platform) for platform in platforms]
+
+ if any(isinstance(dialect, dialect_class.__class__) for dialect_class in dialects):
+ return True
+ return False
+
+
def _sqlglot_lineage_inner(
sql: sqlglot.exp.ExpOrStr,
schema_resolver: SchemaResolver,
@@ -975,7 +1019,7 @@ def _sqlglot_lineage_inner(
default_schema: Optional[str] = None,
) -> SqlParsingResult:
dialect = _get_dialect(schema_resolver.platform)
- if dialect == "snowflake":
+ if _is_dialect_instance(dialect, "snowflake"):
# in snowflake, table identifiers must be uppercased to match sqlglot's behavior.
if default_db:
default_db = default_db.upper()
@@ -1064,7 +1108,7 @@ def _sqlglot_lineage_inner(
column_lineage = _column_level_lineage(
select_statement,
dialect=dialect,
- input_tables=table_name_schema_mapping,
+ table_schemas=table_name_schema_mapping,
output_table=downstream_table,
default_db=default_db,
default_schema=default_schema,
@@ -1204,13 +1248,13 @@ def replace_cte_refs(node: sqlglot.exp.Expression) -> sqlglot.exp.Expression:
full_new_name, dialect=dialect, into=sqlglot.exp.Table
)
- # We expect node.parent to be a Table or Column.
- # Either way, it should support catalog/db/name.
parent = node.parent
- if "catalog" in parent.arg_types:
+ # We expect node.parent to be a Table or Column, both of which support catalog/db/name.
+ # However, we check the parent's arg_types to be safe.
+ if "catalog" in parent.arg_types and table_expr.catalog:
parent.set("catalog", table_expr.catalog)
- if "db" in parent.arg_types:
+ if "db" in parent.arg_types and table_expr.db:
parent.set("db", table_expr.db)
new_node = sqlglot.exp.Identifier(this=table_expr.name)
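Because this patch switches dialect handling from raw strings to sqlglot.Dialect instances, a short hedged usage sketch of the new helpers may help; it only exercises the functions defined in the diff above and assumes the pinned acryl-sqlglot is installed:

```python
# Hedged usage sketch for the helpers above (assumes acryl-sqlglot >= 20, where
# Dialect.get_or_raise returns Dialect instances and accepts strings such as
# "mysql, normalization_strategy = lowercase").
from datahub.utilities.sqlglot_lineage import _get_dialect, _is_dialect_instance

dialect = _get_dialect("snowflake")

# Membership checks now compare Dialect instances instead of raw strings.
assert _is_dialect_instance(dialect, "snowflake")
assert _is_dialect_instance(dialect, ["bigquery", "snowflake"])
assert not _is_dialect_instance(dialect, "bigquery")
```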
diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_redshift_materialized_view_auto_refresh.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_redshift_materialized_view_auto_refresh.json
new file mode 100644
index 00000000000000..fce65056a32f7b
--- /dev/null
+++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_redshift_materialized_view_auto_refresh.json
@@ -0,0 +1,54 @@
+{
+ "query_type": "CREATE",
+ "in_tables": [
+ "urn:li:dataset:(urn:li:dataPlatform:redshift,customer,PROD)",
+ "urn:li:dataset:(urn:li:dataPlatform:redshift,orders,PROD)"
+ ],
+ "out_tables": [
+ "urn:li:dataset:(urn:li:dataPlatform:redshift,mv_total_orders,PROD)"
+ ],
+ "column_lineage": [
+ {
+ "downstream": {
+ "table": "urn:li:dataset:(urn:li:dataPlatform:redshift,mv_total_orders,PROD)",
+ "column": "cust_id",
+ "column_type": null,
+ "native_column_type": null
+ },
+ "upstreams": [
+ {
+ "table": "urn:li:dataset:(urn:li:dataPlatform:redshift,customer,PROD)",
+ "column": "cust_id"
+ }
+ ]
+ },
+ {
+ "downstream": {
+ "table": "urn:li:dataset:(urn:li:dataPlatform:redshift,mv_total_orders,PROD)",
+ "column": "first_name",
+ "column_type": null,
+ "native_column_type": null
+ },
+ "upstreams": [
+ {
+ "table": "urn:li:dataset:(urn:li:dataPlatform:redshift,customer,PROD)",
+ "column": "first_name"
+ }
+ ]
+ },
+ {
+ "downstream": {
+ "table": "urn:li:dataset:(urn:li:dataPlatform:redshift,mv_total_orders,PROD)",
+ "column": "total_amount",
+ "column_type": null,
+ "native_column_type": null
+ },
+ "upstreams": [
+ {
+ "table": "urn:li:dataset:(urn:li:dataPlatform:redshift,orders,PROD)",
+ "column": "amount"
+ }
+ ]
+ }
+ ]
+}
\ No newline at end of file
diff --git a/metadata-ingestion/tests/unit/sql_parsing/test_sql_detach.py b/metadata-ingestion/tests/unit/sql_parsing/test_sql_detach.py
new file mode 100644
index 00000000000000..c99b05c35e0f57
--- /dev/null
+++ b/metadata-ingestion/tests/unit/sql_parsing/test_sql_detach.py
@@ -0,0 +1,46 @@
+from datahub.utilities.sqlglot_lineage import detach_ctes
+
+
+def test_detach_ctes_simple():
+ original = "WITH __cte_0 AS (SELECT * FROM table1) SELECT * FROM table2 JOIN __cte_0 ON table2.id = __cte_0.id"
+ detached_expr = detach_ctes(
+ original,
+ platform="snowflake",
+ cte_mapping={"__cte_0": "_my_cte_table"},
+ )
+ detached = detached_expr.sql(dialect="snowflake")
+
+ assert (
+ detached
+ == "WITH __cte_0 AS (SELECT * FROM table1) SELECT * FROM table2 JOIN _my_cte_table ON table2.id = _my_cte_table.id"
+ )
+
+
+def test_detach_ctes_with_alias():
+ original = "WITH __cte_0 AS (SELECT * FROM table1) SELECT * FROM table2 JOIN __cte_0 AS tablealias ON table2.id = tablealias.id"
+ detached_expr = detach_ctes(
+ original,
+ platform="snowflake",
+ cte_mapping={"__cte_0": "_my_cte_table"},
+ )
+ detached = detached_expr.sql(dialect="snowflake")
+
+ assert (
+ detached
+ == "WITH __cte_0 AS (SELECT * FROM table1) SELECT * FROM table2 JOIN _my_cte_table AS tablealias ON table2.id = tablealias.id"
+ )
+
+
+def test_detach_ctes_with_multipart_replacement():
+ original = "WITH __cte_0 AS (SELECT * FROM table1) SELECT * FROM table2 JOIN __cte_0 ON table2.id = __cte_0.id"
+ detached_expr = detach_ctes(
+ original,
+ platform="snowflake",
+ cte_mapping={"__cte_0": "my_db.my_schema.my_table"},
+ )
+ detached = detached_expr.sql(dialect="snowflake")
+
+ assert (
+ detached
+ == "WITH __cte_0 AS (SELECT * FROM table1) SELECT * FROM table2 JOIN my_db.my_schema.my_table ON table2.id = my_db.my_schema.my_table.id"
+ )
diff --git a/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_lineage.py b/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_lineage.py
index 7f69e358f8f119..eb1ba06669112f 100644
--- a/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_lineage.py
+++ b/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_lineage.py
@@ -3,59 +3,11 @@
import pytest
from datahub.testing.check_sql_parser_result import assert_sql_result
-from datahub.utilities.sqlglot_lineage import (
- _UPDATE_ARGS_NOT_SUPPORTED_BY_SELECT,
- detach_ctes,
-)
+from datahub.utilities.sqlglot_lineage import _UPDATE_ARGS_NOT_SUPPORTED_BY_SELECT
RESOURCE_DIR = pathlib.Path(__file__).parent / "goldens"
-def test_detach_ctes_simple():
- original = "WITH __cte_0 AS (SELECT * FROM table1) SELECT * FROM table2 JOIN __cte_0 ON table2.id = __cte_0.id"
- detached_expr = detach_ctes(
- original,
- platform="snowflake",
- cte_mapping={"__cte_0": "_my_cte_table"},
- )
- detached = detached_expr.sql(dialect="snowflake")
-
- assert (
- detached
- == "WITH __cte_0 AS (SELECT * FROM table1) SELECT * FROM table2 JOIN _my_cte_table ON table2.id = _my_cte_table.id"
- )
-
-
-def test_detach_ctes_with_alias():
- original = "WITH __cte_0 AS (SELECT * FROM table1) SELECT * FROM table2 JOIN __cte_0 AS tablealias ON table2.id = tablealias.id"
- detached_expr = detach_ctes(
- original,
- platform="snowflake",
- cte_mapping={"__cte_0": "_my_cte_table"},
- )
- detached = detached_expr.sql(dialect="snowflake")
-
- assert (
- detached
- == "WITH __cte_0 AS (SELECT * FROM table1) SELECT * FROM table2 JOIN _my_cte_table AS tablealias ON table2.id = tablealias.id"
- )
-
-
-def test_detach_ctes_with_multipart_replacement():
- original = "WITH __cte_0 AS (SELECT * FROM table1) SELECT * FROM table2 JOIN __cte_0 ON table2.id = __cte_0.id"
- detached_expr = detach_ctes(
- original,
- platform="snowflake",
- cte_mapping={"__cte_0": "my_db.my_schema.my_table"},
- )
- detached = detached_expr.sql(dialect="snowflake")
-
- assert (
- detached
- == "WITH __cte_0 AS (SELECT * FROM table1) SELECT * FROM table2 JOIN my_db.my_schema.my_table ON table2.id = my_db.my_schema.my_table.id"
- )
-
-
def test_select_max():
# The COL2 should get normalized to col2.
assert_sql_result(
@@ -1023,3 +975,25 @@ def test_postgres_complex_update():
},
expected_file=RESOURCE_DIR / "test_postgres_complex_update.json",
)
+
+
+def test_redshift_materialized_view_auto_refresh():
+ # Example query from the redshift docs: https://docs.aws.amazon.com/prescriptive-guidance/latest/materialized-views-redshift/refreshing-materialized-views.html
+ assert_sql_result(
+ """
+CREATE MATERIALIZED VIEW mv_total_orders
+AUTO REFRESH YES -- Add this clause to auto refresh the MV
+AS
+ SELECT c.cust_id,
+ c.first_name,
+ sum(o.amount) as total_amount
+ FROM orders o
+ JOIN customer c
+ ON c.cust_id = o.customer_id
+ GROUP BY c.cust_id,
+ c.first_name;
+""",
+ dialect="redshift",
+ expected_file=RESOURCE_DIR
+ / "test_redshift_materialized_view_auto_refresh.json",
+ )
From db55fadb734546b796352aeb38ec2719ce770cf9 Mon Sep 17 00:00:00 2001
From: kushagra-apptware <81357546+kushagra-apptware@users.noreply.github.com>
Date: Fri, 22 Dec 2023 19:48:30 +0530
Subject: [PATCH 063/540] feat(ui): add custom cron option for UI based
ingestion (#9510)
---
.../source/builder/CreateScheduleStep.tsx | 38 ++++++++++++++-----
.../source/builder/SelectTemplateStep.tsx | 4 +-
2 files changed, 32 insertions(+), 10 deletions(-)
diff --git a/datahub-web-react/src/app/ingest/source/builder/CreateScheduleStep.tsx b/datahub-web-react/src/app/ingest/source/builder/CreateScheduleStep.tsx
index 7a14b6a7941896..3745ee0f44dc01 100644
--- a/datahub-web-react/src/app/ingest/source/builder/CreateScheduleStep.tsx
+++ b/datahub-web-react/src/app/ingest/source/builder/CreateScheduleStep.tsx
@@ -1,4 +1,4 @@
-import { Button, Form, Switch, Typography } from 'antd';
+import { Button, Checkbox, Form, Input, Switch, Typography } from 'antd';
import React, { useMemo, useState } from 'react';
import { Cron } from 'react-js-cron';
import 'react-js-cron/dist/styles.css';
@@ -31,6 +31,10 @@ const CronText = styled(Typography.Paragraph)`
color: ${ANTD_GRAY[7]};
`;
+const AdvancedCheckBox = styled(Typography.Text)`
+ margin-right: 10px;
+ margin-bottom: 8px;
+`;
const CronSuccessCheck = styled(CheckCircleOutlined)`
color: ${REDESIGN_COLORS.BLUE};
margin-right: 4px;
@@ -68,8 +72,8 @@ export const CreateScheduleStep = ({ state, updateState, goTo, prev }: StepProps
const { schedule } = state;
const interval = schedule?.interval?.replaceAll(', ', ' ') || DAILY_MIDNIGHT_CRON_INTERVAL;
const timezone = schedule?.timezone || Intl.DateTimeFormat().resolvedOptions().timeZone;
-
const [scheduleEnabled, setScheduleEnabled] = useState(!!schedule);
+ const [advancedCronCheck, setAdvancedCronCheck] = useState(false);
const [scheduleCronInterval, setScheduleCronInterval] = useState(interval);
const [scheduleTimezone, setScheduleTimezone] = useState(timezone);
@@ -137,13 +141,29 @@ export const CreateScheduleStep = ({ state, updateState, goTo, prev }: StepProps
)}
Schedule}>
-
+
+
Advanced
+
setAdvancedCronCheck(event.target.checked)}
+ />
+
+ {advancedCronCheck ? (
+ setScheduleCronInterval(e.target.value)}
+ />
+ ) : (
+
+ )}
{cronAsText.error && <>Invalid cron schedule. Cron must be of UNIX form:>}
{!cronAsText.text && (
diff --git a/datahub-web-react/src/app/ingest/source/builder/SelectTemplateStep.tsx b/datahub-web-react/src/app/ingest/source/builder/SelectTemplateStep.tsx
index 8aaa4f3448686f..6b771d459c4ef9 100644
--- a/datahub-web-react/src/app/ingest/source/builder/SelectTemplateStep.tsx
+++ b/datahub-web-react/src/app/ingest/source/builder/SelectTemplateStep.tsx
@@ -70,7 +70,9 @@ export const SelectTemplateStep = ({ state, updateState, goTo, cancel, ingestion
};
const filteredSources = ingestionSources.filter(
- (source) => source.displayName.includes(searchFilter) || source.name.includes(searchFilter),
+ (source) =>
+ source.displayName.toLocaleLowerCase().includes(searchFilter.toLocaleLowerCase()) ||
+ source.name.toLocaleLowerCase().includes(searchFilter.toLocaleLowerCase()),
);
return (
From 0d8568e087b5489b49161423ed299dec84e32f1e Mon Sep 17 00:00:00 2001
From: Harshal Sheth
Date: Fri, 22 Dec 2023 14:59:14 -0500
Subject: [PATCH 064/540] fix(ingest): update dbt type inference (#9512)
---
.../integration/dbt/dbt_enabled_with_schemas_mces_golden.json | 2 +-
.../integration/dbt/dbt_test_column_meta_mapping_golden.json | 2 +-
.../dbt/dbt_test_with_complex_owner_patterns_mces_golden.json | 2 +-
.../dbt/dbt_test_with_data_platform_instance_mces_golden.json | 2 +-
.../dbt/dbt_test_with_non_incremental_lineage_mces_golden.json | 2 +-
.../dbt/dbt_test_with_target_platform_instance_mces_golden.json | 2 +-
6 files changed, 6 insertions(+), 6 deletions(-)
diff --git a/metadata-ingestion/tests/integration/dbt/dbt_enabled_with_schemas_mces_golden.json b/metadata-ingestion/tests/integration/dbt/dbt_enabled_with_schemas_mces_golden.json
index 4deb725ed2b444..fa26a93479a4f8 100644
--- a/metadata-ingestion/tests/integration/dbt/dbt_enabled_with_schemas_mces_golden.json
+++ b/metadata-ingestion/tests/integration/dbt/dbt_enabled_with_schemas_mces_golden.json
@@ -153,7 +153,7 @@
"com.linkedin.pegasus2avro.schema.StringType": {}
}
},
- "nativeDataType": "VARCHAR",
+ "nativeDataType": "TEXT",
"recursive": false,
"isPartOfKey": false
},
diff --git a/metadata-ingestion/tests/integration/dbt/dbt_test_column_meta_mapping_golden.json b/metadata-ingestion/tests/integration/dbt/dbt_test_column_meta_mapping_golden.json
index 588470ef416314..f2208fd98c2030 100644
--- a/metadata-ingestion/tests/integration/dbt/dbt_test_column_meta_mapping_golden.json
+++ b/metadata-ingestion/tests/integration/dbt/dbt_test_column_meta_mapping_golden.json
@@ -87,7 +87,7 @@
"com.linkedin.pegasus2avro.schema.StringType": {}
}
},
- "nativeDataType": "VARCHAR",
+ "nativeDataType": "TEXT",
"recursive": false,
"isPartOfKey": false
},
diff --git a/metadata-ingestion/tests/integration/dbt/dbt_test_with_complex_owner_patterns_mces_golden.json b/metadata-ingestion/tests/integration/dbt/dbt_test_with_complex_owner_patterns_mces_golden.json
index 926e8b8c8ed84b..a27eeb37759608 100644
--- a/metadata-ingestion/tests/integration/dbt/dbt_test_with_complex_owner_patterns_mces_golden.json
+++ b/metadata-ingestion/tests/integration/dbt/dbt_test_with_complex_owner_patterns_mces_golden.json
@@ -117,7 +117,7 @@
"com.linkedin.pegasus2avro.schema.StringType": {}
}
},
- "nativeDataType": "VARCHAR",
+ "nativeDataType": "TEXT",
"recursive": false,
"isPartOfKey": false
},
diff --git a/metadata-ingestion/tests/integration/dbt/dbt_test_with_data_platform_instance_mces_golden.json b/metadata-ingestion/tests/integration/dbt/dbt_test_with_data_platform_instance_mces_golden.json
index 3727603266f252..43336ca585bcc3 100644
--- a/metadata-ingestion/tests/integration/dbt/dbt_test_with_data_platform_instance_mces_golden.json
+++ b/metadata-ingestion/tests/integration/dbt/dbt_test_with_data_platform_instance_mces_golden.json
@@ -118,7 +118,7 @@
"com.linkedin.pegasus2avro.schema.StringType": {}
}
},
- "nativeDataType": "VARCHAR",
+ "nativeDataType": "TEXT",
"recursive": false,
"isPartOfKey": false
},
diff --git a/metadata-ingestion/tests/integration/dbt/dbt_test_with_non_incremental_lineage_mces_golden.json b/metadata-ingestion/tests/integration/dbt/dbt_test_with_non_incremental_lineage_mces_golden.json
index ec879e6af766ac..27ea568d010fa1 100644
--- a/metadata-ingestion/tests/integration/dbt/dbt_test_with_non_incremental_lineage_mces_golden.json
+++ b/metadata-ingestion/tests/integration/dbt/dbt_test_with_non_incremental_lineage_mces_golden.json
@@ -118,7 +118,7 @@
"com.linkedin.pegasus2avro.schema.StringType": {}
}
},
- "nativeDataType": "VARCHAR",
+ "nativeDataType": "TEXT",
"recursive": false,
"isPartOfKey": false
},
diff --git a/metadata-ingestion/tests/integration/dbt/dbt_test_with_target_platform_instance_mces_golden.json b/metadata-ingestion/tests/integration/dbt/dbt_test_with_target_platform_instance_mces_golden.json
index e25c5e4faf6afd..07296e175d9ec6 100644
--- a/metadata-ingestion/tests/integration/dbt/dbt_test_with_target_platform_instance_mces_golden.json
+++ b/metadata-ingestion/tests/integration/dbt/dbt_test_with_target_platform_instance_mces_golden.json
@@ -118,7 +118,7 @@
"com.linkedin.pegasus2avro.schema.StringType": {}
}
},
- "nativeDataType": "VARCHAR",
+ "nativeDataType": "TEXT",
"recursive": false,
"isPartOfKey": false
},
From ed5bdfc5aec65978145a72d2701941ed21b35554 Mon Sep 17 00:00:00 2001
From: Harshal Sheth
Date: Fri, 22 Dec 2023 17:12:31 -0500
Subject: [PATCH 065/540] feat(ingest/redshift): merge CLL instead of
overwriting (#9513)
---
.../ingestion/source/redshift/lineage.py | 74 ++++++++++++-------
.../src/datahub/utilities/sqlglot_lineage.py | 5 +-
2 files changed, 49 insertions(+), 30 deletions(-)
diff --git a/metadata-ingestion/src/datahub/ingestion/source/redshift/lineage.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/lineage.py
index abed8505f168bf..8135e1d44c1021 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/redshift/lineage.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/lineage.py
@@ -41,6 +41,7 @@
UpstreamLineageClass,
)
from datahub.utilities import memory_footprint
+from datahub.utilities.dedup_list import deduplicate_list
from datahub.utilities.urns import dataset_urn
logger: logging.Logger = logging.getLogger(__name__)
@@ -85,6 +86,30 @@ def __post_init__(self):
else:
self.dataset_lineage_type = DatasetLineageTypeClass.TRANSFORMED
+ def merge_lineage(
+ self,
+ upstreams: Set[LineageDataset],
+ cll: Optional[List[sqlglot_l.ColumnLineageInfo]],
+ ) -> None:
+ self.upstreams = self.upstreams.union(upstreams)
+
+ # Merge CLL using the output column name as the merge key.
+ self.cll = self.cll or []
+ existing_cll: Dict[str, sqlglot_l.ColumnLineageInfo] = {
+ c.downstream.column: c for c in self.cll
+ }
+ for c in cll or []:
+ if c.downstream.column in existing_cll:
+ # Merge using upstream + column name as the merge key.
+ existing_cll[c.downstream.column].upstreams = deduplicate_list(
+ [*existing_cll[c.downstream.column].upstreams, *c.upstreams]
+ )
+ else:
+ # New output column, just add it as is.
+ self.cll.append(c)
+
+ self.cll = self.cll or None
+
class RedshiftLineageExtractor:
def __init__(
@@ -161,7 +186,12 @@ def _get_sources_from_query(
)
sources.append(source)
- return sources, parsed_result.column_lineage
+ return (
+ sources,
+ parsed_result.column_lineage
+ if self.config.include_view_column_lineage
+ else None,
+ )
def _build_s3_path_from_row(self, filename: str) -> str:
path = filename.strip()
@@ -208,7 +238,7 @@ def _get_sources(
"Only s3 source supported with copy. The source was: {path}."
)
self.report.num_lineage_dropped_not_support_copy_path += 1
- return sources, cll
+ return [], None
path = strip_s3_prefix(self._get_s3_path(path))
urn = make_dataset_urn_with_platform_instance(
platform=platform.value,
@@ -284,7 +314,6 @@ def _populate_lineage_map(
ddl=lineage_row.ddl,
filename=lineage_row.filename,
)
- target.cll = cll
target.upstreams.update(
self._get_upstream_lineages(
@@ -294,13 +323,13 @@ def _populate_lineage_map(
raw_db_name=raw_db_name,
)
)
+ target.cll = cll
- # Merging downstreams if dataset already exists and has downstreams
+ # Merging upstreams if dataset already exists and has upstreams
if target.dataset.urn in self._lineage_map:
- self._lineage_map[target.dataset.urn].upstreams = self._lineage_map[
- target.dataset.urn
- ].upstreams.union(target.upstreams)
-
+ self._lineage_map[target.dataset.urn].merge_lineage(
+ upstreams=target.upstreams, cll=target.cll
+ )
else:
self._lineage_map[target.dataset.urn] = target
@@ -420,7 +449,10 @@ def populate_lineage(
) -> None:
populate_calls: List[Tuple[str, LineageCollectorType]] = []
- if self.config.table_lineage_mode == LineageMode.STL_SCAN_BASED:
+ if self.config.table_lineage_mode in {
+ LineageMode.STL_SCAN_BASED,
+ LineageMode.MIXED,
+ }:
# Populate table level lineage by getting upstream tables from stl_scan redshift table
query = RedshiftQuery.stl_scan_based_lineage_query(
self.config.database,
@@ -428,15 +460,10 @@ def populate_lineage(
self.end_time,
)
populate_calls.append((query, LineageCollectorType.QUERY_SCAN))
- elif self.config.table_lineage_mode == LineageMode.SQL_BASED:
- # Populate table level lineage by parsing table creating sqls
- query = RedshiftQuery.list_insert_create_queries_sql(
- db_name=database,
- start_time=self.start_time,
- end_time=self.end_time,
- )
- populate_calls.append((query, LineageCollectorType.QUERY_SQL_PARSER))
- elif self.config.table_lineage_mode == LineageMode.MIXED:
+ if self.config.table_lineage_mode in {
+ LineageMode.SQL_BASED,
+ LineageMode.MIXED,
+ }:
# Populate table level lineage by parsing table creating sqls
query = RedshiftQuery.list_insert_create_queries_sql(
db_name=database,
@@ -445,15 +472,7 @@ def populate_lineage(
)
populate_calls.append((query, LineageCollectorType.QUERY_SQL_PARSER))
- # Populate table level lineage by getting upstream tables from stl_scan redshift table
- query = RedshiftQuery.stl_scan_based_lineage_query(
- db_name=database,
- start_time=self.start_time,
- end_time=self.end_time,
- )
- populate_calls.append((query, LineageCollectorType.QUERY_SCAN))
-
- if self.config.include_views:
+ if self.config.include_views and self.config.include_view_lineage:
# Populate table level lineage for views
query = RedshiftQuery.view_lineage_query()
populate_calls.append((query, LineageCollectorType.VIEW))
@@ -540,7 +559,6 @@ def get_lineage(
dataset_urn: str,
schema: RedshiftSchema,
) -> Optional[Tuple[UpstreamLineageClass, Dict[str, str]]]:
-
upstream_lineage: List[UpstreamClass] = []
cll_lineage: List[FineGrainedLineage] = []
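The merge_lineage method above merges column-level lineage by downstream column and deduplicates upstream references instead of overwriting them. The following is a simplified, hypothetical illustration of that merge rule using plain dicts rather than the real ColumnLineageInfo objects:

```python
# Hedged illustration of the merge rule in merge_lineage above: entries are keyed by
# downstream column, and upstreams for an existing column are deduplicated, not replaced.
def merge_cll(existing, incoming):
    by_column = {entry["downstream"]: entry for entry in existing}
    for entry in incoming:
        if entry["downstream"] in by_column:
            merged = by_column[entry["downstream"]]
            for upstream in entry["upstreams"]:
                if upstream not in merged["upstreams"]:
                    merged["upstreams"].append(upstream)
        else:
            existing.append(entry)
    return existing

cll = [{"downstream": "total_amount", "upstreams": ["orders.amount"]}]
update = [{"downstream": "total_amount", "upstreams": ["orders.amount", "orders.tax"]}]
print(merge_cll(cll, update))
# [{'downstream': 'total_amount', 'upstreams': ['orders.amount', 'orders.tax']}]
```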
diff --git a/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py b/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py
index f84b3f8b94a2e0..b43c8de4c8f3d8 100644
--- a/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py
+++ b/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py
@@ -193,7 +193,7 @@ class _ColumnRef(_FrozenModel):
column: str
-class ColumnRef(_ParserBaseModel):
+class ColumnRef(_FrozenModel):
table: Urn
column: str
@@ -929,6 +929,7 @@ def _translate_sqlglot_type(
TypeClass = ArrayTypeClass
elif sqlglot_type in {
sqlglot.exp.DataType.Type.UNKNOWN,
+ sqlglot.exp.DataType.Type.NULL,
}:
return None
else:
@@ -1090,7 +1091,7 @@ def _sqlglot_lineage_inner(
table_schemas_resolved=total_schemas_resolved,
)
logger.debug(
- f"Resolved {len(table_name_schema_mapping)} of {len(tables)} table schemas"
+ f"Resolved {total_schemas_resolved} of {total_tables_discovered} table schemas"
)
# Simplify the input statement for column-level lineage generation.
From 4448cf1f2d777c82d913e5ee0aeabd0e2785fad3 Mon Sep 17 00:00:00 2001
From: gaurav2733 <77378510+gaurav2733@users.noreply.github.com>
Date: Tue, 26 Dec 2023 16:30:24 +0530
Subject: [PATCH 066/540] fix(ui/ingestion): add debounce on search on
ingestion listing page (#9516)
---
.../entity/shared/tabs/Dataset/Queries/utils/constants.ts | 1 +
datahub-web-react/src/app/ingest/secret/SecretsList.tsx | 8 +++++++-
.../src/app/ingest/source/IngestionSourceList.tsx | 8 +++++++-
3 files changed, 15 insertions(+), 2 deletions(-)
diff --git a/datahub-web-react/src/app/entity/shared/tabs/Dataset/Queries/utils/constants.ts b/datahub-web-react/src/app/entity/shared/tabs/Dataset/Queries/utils/constants.ts
index 5176c1207874c9..025705abc580ea 100644
--- a/datahub-web-react/src/app/entity/shared/tabs/Dataset/Queries/utils/constants.ts
+++ b/datahub-web-react/src/app/entity/shared/tabs/Dataset/Queries/utils/constants.ts
@@ -16,5 +16,6 @@ export const DEFAULT_MAX_RECENT_QUERIES = 9;
*/
export const MAX_ROWS_BEFORE_DEBOUNCE = 50;
export const HALF_SECOND_IN_MS = 500;
+export const ONE_SECOND_IN_MS = 1000;
export const ADD_UNAUTHORIZED_MESSAGE = 'You are not authorized to add Queries to this entity.';
diff --git a/datahub-web-react/src/app/ingest/secret/SecretsList.tsx b/datahub-web-react/src/app/ingest/secret/SecretsList.tsx
index 2728fff0ccba34..1a960997e6beeb 100644
--- a/datahub-web-react/src/app/ingest/secret/SecretsList.tsx
+++ b/datahub-web-react/src/app/ingest/secret/SecretsList.tsx
@@ -1,5 +1,6 @@
import React, { useEffect, useState } from 'react';
import { Button, Empty, message, Modal, Pagination, Typography } from 'antd';
+import { debounce } from 'lodash';
import { DeleteOutlined, PlusOutlined } from '@ant-design/icons';
import * as QueryString from 'query-string';
import { useLocation } from 'react-router';
@@ -18,6 +19,7 @@ import { SearchBar } from '../../search/SearchBar';
import { useEntityRegistry } from '../../useEntityRegistry';
import { scrollToTop } from '../../shared/searchUtils';
import { addSecretToListSecretsCache, removeSecretFromListSecretsCache } from './cacheUtils';
+import { ONE_SECOND_IN_MS } from '../../entity/shared/tabs/Dataset/Queries/utils/constants';
const DeleteButtonContainer = styled.div`
display: flex;
@@ -84,6 +86,10 @@ export const SecretsList = () => {
setPage(newPage);
};
+ const debouncedSetQuery = debounce((newQuery: string | undefined) => {
+ setQuery(newQuery);
+ }, ONE_SECOND_IN_MS);
+
const onSubmit = (state: SecretBuilderState, resetBuilderState: () => void) => {
createSecretMutation({
variables: {
@@ -199,7 +205,7 @@ export const SecretsList = () => {
onSearch={() => null}
onQueryChange={(q) => {
setPage(1);
- setQuery(q);
+ debouncedSetQuery(q);
}}
entityRegistry={entityRegistry}
hideRecommendations
diff --git a/datahub-web-react/src/app/ingest/source/IngestionSourceList.tsx b/datahub-web-react/src/app/ingest/source/IngestionSourceList.tsx
index 6188845694f9eb..e6db6bfcc9a61b 100644
--- a/datahub-web-react/src/app/ingest/source/IngestionSourceList.tsx
+++ b/datahub-web-react/src/app/ingest/source/IngestionSourceList.tsx
@@ -1,5 +1,6 @@
import { PlusOutlined, RedoOutlined } from '@ant-design/icons';
import React, { useCallback, useEffect, useState } from 'react';
+import { debounce } from 'lodash';
import * as QueryString from 'query-string';
import { useLocation } from 'react-router';
import { Button, message, Modal, Pagination, Select } from 'antd';
@@ -30,6 +31,7 @@ import {
INGESTION_CREATE_SOURCE_ID,
INGESTION_REFRESH_SOURCES_ID,
} from '../../onboarding/config/IngestionOnboardingConfig';
+import { ONE_SECOND_IN_MS } from '../../entity/shared/tabs/Dataset/Queries/utils/constants';
const PLACEHOLDER_URN = 'placeholder-urn';
@@ -133,6 +135,10 @@ export const IngestionSourceList = () => {
setLastRefresh(new Date().getTime());
}, [refetch]);
+ const debouncedSetQuery = debounce((newQuery: string | undefined) => {
+ setQuery(newQuery);
+ }, ONE_SECOND_IN_MS);
+
function hasActiveExecution() {
return !!filteredSources.find((source) =>
source.executions?.executionRequests.find((request) => isExecutionRequestActive(request)),
@@ -401,7 +407,7 @@ export const IngestionSourceList = () => {
onSearch={() => null}
onQueryChange={(q) => {
setPage(1);
- setQuery(q);
+ debouncedSetQuery(q);
}}
entityRegistry={entityRegistry}
hideRecommendations
From d399a530576974da9beb1af24d7ea5f98922b6d3 Mon Sep 17 00:00:00 2001
From: kushagra-apptware <81357546+kushagra-apptware@users.noreply.github.com>
Date: Tue, 26 Dec 2023 18:26:40 +0530
Subject: [PATCH 067/540] fix(ui): correct the color of edit links (#9517)
---
.../entity/shared/tabs/Documentation/components/LinkList.tsx | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/datahub-web-react/src/app/entity/shared/tabs/Documentation/components/LinkList.tsx b/datahub-web-react/src/app/entity/shared/tabs/Documentation/components/LinkList.tsx
index 1b5c3d54009da8..9f94a830ac1cfa 100644
--- a/datahub-web-react/src/app/entity/shared/tabs/Documentation/components/LinkList.tsx
+++ b/datahub-web-react/src/app/entity/shared/tabs/Documentation/components/LinkList.tsx
@@ -159,7 +159,7 @@ export const LinkList = ({ refetch }: LinkListProps) => {
- <Button onClick={() => handleEditLink(link)} type="text" shape="circle" danger>
+ <Button onClick={() => handleEditLink(link)} type="text" shape="circle">
<Button onClick={() => handleDeleteLink(link)} type="text" shape="circle" danger>
From 1e64a75339cd1e4e99ef0ab4b926057a2cceb511 Mon Sep 17 00:00:00 2001
From: ethan-cartwright
Date: Tue, 26 Dec 2023 09:04:05 -0500
Subject: [PATCH 068/540] fix(frontend): Add JSON list oidc group extraction
logic (#9495)
Co-authored-by: Ethan Cartwright
---
.../app/auth/sso/oidc/OidcCallbackLogic.java | 43 ++++++++-----
.../test/oidc/OidcCallbackLogicTest.java | 64 +++++++++++++++++++
2 files changed, 90 insertions(+), 17 deletions(-)
create mode 100644 datahub-frontend/test/oidc/OidcCallbackLogicTest.java
diff --git a/datahub-frontend/app/auth/sso/oidc/OidcCallbackLogic.java b/datahub-frontend/app/auth/sso/oidc/OidcCallbackLogic.java
index fa562f54312eca..c72c3537084834 100644
--- a/datahub-frontend/app/auth/sso/oidc/OidcCallbackLogic.java
+++ b/datahub-frontend/app/auth/sso/oidc/OidcCallbackLogic.java
@@ -10,6 +10,8 @@
import auth.sso.SsoManager;
import client.AuthServiceClient;
import com.datahub.authentication.Authentication;
+import com.fasterxml.jackson.core.type.TypeReference;
+import com.fasterxml.jackson.databind.ObjectMapper;
import com.linkedin.common.AuditStamp;
import com.linkedin.common.CorpGroupUrnArray;
import com.linkedin.common.CorpuserUrnArray;
@@ -300,6 +302,29 @@ private CorpUserSnapshot extractUser(CorpuserUrn urn, CommonProfile profile) {
return corpUserSnapshot;
}
+ public static Collection<String> getGroupNames(CommonProfile profile, Object groupAttribute, String groupsClaimName) {
+ Collection<String> groupNames = Collections.emptyList();
+ try {
+ if (groupAttribute instanceof Collection) {
+ // List of group names
+ groupNames = (Collection<String>) profile.getAttribute(groupsClaimName, Collection.class);
+ } else if (groupAttribute instanceof String) {
+ String groupString = (String) groupAttribute;
+ ObjectMapper objectMapper = new ObjectMapper();
+ try {
+ // Json list of group names
+ groupNames = objectMapper.readValue(groupString, new TypeReference<Collection<String>>(){});
+ } catch (Exception e) {
+ groupNames = Arrays.asList(groupString.split(","));
+ }
+ }
+ } catch (Exception e) {
+ log.error(String.format(
+ "Failed to parse group names: Expected to find a list of strings for attribute with name %s, found %s",
+ groupsClaimName, profile.getAttribute(groupsClaimName).getClass()));
+ }
+ return groupNames;
+ }
private List<CorpGroupSnapshot> extractGroups(CommonProfile profile) {
log.debug(
@@ -320,23 +345,7 @@ private List extractGroups(CommonProfile profile) {
if (profile.containsAttribute(groupsClaimName)) {
try {
final List<CorpGroupSnapshot> groupSnapshots = new ArrayList<>();
- final Collection<String> groupNames;
- final Object groupAttribute = profile.getAttribute(groupsClaimName);
- if (groupAttribute instanceof Collection) {
- // List of group names
- groupNames =
- (Collection<String>) profile.getAttribute(groupsClaimName, Collection.class);
- } else if (groupAttribute instanceof String) {
- // Single group name
- groupNames = Collections.singleton(profile.getAttribute(groupsClaimName, String.class));
- } else {
- log.error(
- String.format(
- "Fail to parse OIDC group claim with name %s. Unknown type %s provided.",
- groupsClaimName, groupAttribute.getClass()));
- // Skip over group attribute. Do not throw.
- groupNames = Collections.emptyList();
- }
+ Collection<String> groupNames = getGroupNames(profile, profile.getAttribute(groupsClaimName), groupsClaimName);
for (String groupName : groupNames) {
// Create a basic CorpGroupSnapshot from the information.
diff --git a/datahub-frontend/test/oidc/OidcCallbackLogicTest.java b/datahub-frontend/test/oidc/OidcCallbackLogicTest.java
new file mode 100644
index 00000000000000..f4784c29e91f2e
--- /dev/null
+++ b/datahub-frontend/test/oidc/OidcCallbackLogicTest.java
@@ -0,0 +1,64 @@
+package oidc;
+
+import auth.sso.oidc.OidcConfigs;
+
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.List;
+
+import static auth.sso.oidc.OidcCallbackLogic.getGroupNames;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.when;
+
+import org.junit.jupiter.api.Test;
+import org.mockito.Mockito;
+import org.pac4j.core.profile.CommonProfile;
+
+public class OidcCallbackLogicTest {
+
+ @Test
+ public void testGetGroupsClaimNamesJsonArray() {
+ CommonProfile profile = createMockProfileWithAttribute("[\"group1\", \"group2\"]", "groupsClaimName");
+ Collection result = getGroupNames(profile, "[\"group1\", \"group2\"]", "groupsClaimName");
+ assertEquals(Arrays.asList("group1", "group2"), result);
+ }
+ @Test
+ public void testGetGroupNamesWithSingleGroup() {
+ CommonProfile profile = createMockProfileWithAttribute("group1", "groupsClaimName");
+ Collection result = getGroupNames(profile, "group1", "groupsClaimName");
+ assertEquals(Arrays.asList("group1"), result);
+ }
+
+ @Test
+ public void testGetGroupNamesWithCommaSeparated() {
+ CommonProfile profile = createMockProfileWithAttribute("group1,group2", "groupsClaimName");
+ Collection result = getGroupNames(profile, "group1,group2", "groupsClaimName");
+ assertEquals(Arrays.asList("group1", "group2"), result);
+ }
+
+ @Test
+ public void testGetGroupNamesWithCollection() {
+ CommonProfile profile = createMockProfileWithAttribute(Arrays.asList("group1", "group2"), "groupsClaimName");
+ Collection<String> result = getGroupNames(profile, Arrays.asList("group1", "group2"), "groupsClaimName");
+ assertEquals(Arrays.asList("group1", "group2"), result);
+ }
+ // Helper method to create a mock CommonProfile with given attribute
+ private CommonProfile createMockProfileWithAttribute(Object attribute, String attributeName) {
+ CommonProfile profile = mock(CommonProfile.class);
+
+ // Mock for getAttribute(String)
+ when(profile.getAttribute(attributeName)).thenReturn(attribute);
+
+ // Mock for getAttribute(String, Class)
+ if (attribute instanceof Collection) {
+ when(profile.getAttribute(attributeName, Collection.class)).thenReturn((Collection) attribute);
+ } else if (attribute instanceof String) {
+ when(profile.getAttribute(attributeName, String.class)).thenReturn((String) attribute);
+ }
+ // Add more conditions here if needed for other types
+
+ return profile;
+ }
+}
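For readers following the Python-heavy parts of this series, here is a hedged Python restatement of the parsing order that getGroupNames implements above (a real collection first, then a JSON-encoded list, then a comma-separated string); it is illustrative only and not part of the frontend code:

```python
# Hedged restatement of the group-claim parsing fallback chain shown above.
import json

def parse_group_claim(value):
    # Already a collection of group names.
    if isinstance(value, (list, tuple, set)):
        return [str(group) for group in value]
    if isinstance(value, str):
        # Prefer a JSON-encoded list; fall back to splitting on commas.
        try:
            parsed = json.loads(value)
            if isinstance(parsed, list):
                return [str(group) for group in parsed]
        except ValueError:
            pass
        return value.split(",")
    return []

assert parse_group_claim(["group1", "group2"]) == ["group1", "group2"]
assert parse_group_claim('["group1", "group2"]') == ["group1", "group2"]
assert parse_group_claim("group1,group2") == ["group1", "group2"]
```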
From 651998de44afcac6efce44aadac6b04cb5e0ff30 Mon Sep 17 00:00:00 2001
From: RyanHolstien
Date: Tue, 26 Dec 2023 14:34:10 -0600
Subject: [PATCH 069/540] feat(frontend): align frontend sso code with
refactors (#9506)
---
datahub-frontend/app/auth/AuthModule.java | 110 ++++----
datahub-frontend/app/auth/AuthUtils.java | 20 ++
datahub-frontend/app/auth/sso/SsoConfigs.java | 75 ++++-
datahub-frontend/app/auth/sso/SsoManager.java | 164 ++++++++++-
.../app/auth/sso/oidc/OidcConfigs.java | 257 +++++++++++++-----
.../app/client/AuthServiceClient.java | 2 +
.../controllers/SsoCallbackController.java | 22 +-
.../test/app/ApplicationTest.java | 3 +
.../test/security/OidcConfigurationTest.java | 4 +-
.../settings/global/GlobalSettingsInfo.pdl | 5 +
.../linkedin/settings/global/OidcSettings.pdl | 96 +++++++
.../linkedin/settings/global/SsoSettings.pdl | 16 ++
.../com/linkedin/settings/global/SsoType.pdl | 11 +
.../authentication/AuthServiceController.java | 127 +++++++++
14 files changed, 768 insertions(+), 144 deletions(-)
create mode 100644 metadata-models/src/main/pegasus/com/linkedin/settings/global/OidcSettings.pdl
create mode 100644 metadata-models/src/main/pegasus/com/linkedin/settings/global/SsoSettings.pdl
create mode 100644 metadata-models/src/main/pegasus/com/linkedin/settings/global/SsoType.pdl
diff --git a/datahub-frontend/app/auth/AuthModule.java b/datahub-frontend/app/auth/AuthModule.java
index ef33bde8f61d39..699dd360fa5232 100644
--- a/datahub-frontend/app/auth/AuthModule.java
+++ b/datahub-frontend/app/auth/AuthModule.java
@@ -1,13 +1,9 @@
package auth;
import static auth.AuthUtils.*;
-import static auth.sso.oidc.OidcConfigs.*;
import static utils.ConfigUtil.*;
-import auth.sso.SsoConfigs;
import auth.sso.SsoManager;
-import auth.sso.oidc.OidcConfigs;
-import auth.sso.oidc.OidcProvider;
import client.AuthServiceClient;
import com.datahub.authentication.Actor;
import com.datahub.authentication.ActorType;
@@ -23,14 +19,11 @@
import config.ConfigurationProvider;
import controllers.SsoCallbackController;
import java.nio.charset.StandardCharsets;
-import java.util.ArrayList;
import java.util.Collections;
-import java.util.List;
+import lombok.extern.slf4j.Slf4j;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
-import org.pac4j.core.client.Client;
-import org.pac4j.core.client.Clients;
import org.pac4j.core.config.Config;
import org.pac4j.core.context.session.SessionStore;
import org.pac4j.play.LogoutController;
@@ -45,6 +38,7 @@
import utils.ConfigUtil;
/** Responsible for configuring, validating, and providing authentication related components. */
+@Slf4j
public class AuthModule extends AbstractModule {
/**
@@ -58,6 +52,7 @@ public class AuthModule extends AbstractModule {
private static final String PAC4J_SESSIONSTORE_PROVIDER_CONF = "pac4j.sessionStore.provider";
private static final String ENTITY_CLIENT_RETRY_INTERVAL = "entityClient.retryInterval";
private static final String ENTITY_CLIENT_NUM_RETRIES = "entityClient.numRetries";
+ private static final String GET_SSO_SETTINGS_ENDPOINT = "auth/getSsoSettings";
private final com.typesafe.config.Config _configs;
@@ -111,6 +106,7 @@ protected void configure() {
Authentication.class,
SystemEntityClient.class,
AuthServiceClient.class,
+ org.pac4j.core.config.Config.class,
com.typesafe.config.Config.class));
} catch (NoSuchMethodException | SecurityException e) {
throw new RuntimeException(
@@ -124,34 +120,20 @@ protected void configure() {
@Provides
@Singleton
- protected Config provideConfig(SsoManager ssoManager) {
- if (ssoManager.isSsoEnabled()) {
- final Clients clients = new Clients();
- final List<Client> clientList = new ArrayList<>();
- clientList.add(ssoManager.getSsoProvider().client());
- clients.setClients(clientList);
- final Config config = new Config(clients);
- config.setHttpActionAdapter(new PlayHttpActionAdapter());
- return config;
- }
- return new Config();
+ protected Config provideConfig() {
+ Config config = new Config();
+ config.setHttpActionAdapter(new PlayHttpActionAdapter());
+ return config;
}
@Provides
@Singleton
- protected SsoManager provideSsoManager() {
- SsoManager manager = new SsoManager();
- // Seed the SSO manager with a default SSO provider.
- if (isSsoEnabled(_configs)) {
- SsoConfigs ssoConfigs = new SsoConfigs(_configs);
- if (ssoConfigs.isOidcEnabled()) {
- // Register OIDC Provider, add to list of managers.
- OidcConfigs oidcConfigs = new OidcConfigs(_configs);
- OidcProvider oidcProvider = new OidcProvider(oidcConfigs);
- // Set the default SSO provider to this OIDC client.
- manager.setSsoProvider(oidcProvider);
- }
- }
+ protected SsoManager provideSsoManager(
+ Authentication systemAuthentication, CloseableHttpClient httpClient) {
+ SsoManager manager =
+ new SsoManager(
+ _configs, systemAuthentication, getSsoSettingsRequestUrl(_configs), httpClient);
+ manager.initializeSsoProvider();
return manager;
}
@@ -191,33 +173,16 @@ protected SystemEntityClient provideEntityClient(
configurationProvider.getCache().getClient().getEntityClient());
}
- @Provides
- @Singleton
- protected CloseableHttpClient provideHttpClient() {
- return HttpClients.createDefault();
- }
-
@Provides
@Singleton
protected AuthServiceClient provideAuthClient(
Authentication systemAuthentication, CloseableHttpClient httpClient) {
// Init a GMS auth client
- final String metadataServiceHost =
- _configs.hasPath(METADATA_SERVICE_HOST_CONFIG_PATH)
- ? _configs.getString(METADATA_SERVICE_HOST_CONFIG_PATH)
- : Configuration.getEnvironmentVariable(GMS_HOST_ENV_VAR, DEFAULT_GMS_HOST);
+ final String metadataServiceHost = getMetadataServiceHost(_configs);
- final int metadataServicePort =
- _configs.hasPath(METADATA_SERVICE_PORT_CONFIG_PATH)
- ? _configs.getInt(METADATA_SERVICE_PORT_CONFIG_PATH)
- : Integer.parseInt(
- Configuration.getEnvironmentVariable(GMS_PORT_ENV_VAR, DEFAULT_GMS_PORT));
+ final int metadataServicePort = getMetadataServicePort(_configs);
- final Boolean metadataServiceUseSsl =
- _configs.hasPath(METADATA_SERVICE_USE_SSL_CONFIG_PATH)
- ? _configs.getBoolean(METADATA_SERVICE_USE_SSL_CONFIG_PATH)
- : Boolean.parseBoolean(
- Configuration.getEnvironmentVariable(GMS_USE_SSL_ENV_VAR, DEFAULT_GMS_USE_SSL));
+ final boolean metadataServiceUseSsl = doesMetadataServiceUseSsl(_configs);
return new AuthServiceClient(
metadataServiceHost,
@@ -227,6 +192,12 @@ protected AuthServiceClient provideAuthClient(
httpClient);
}
+ @Provides
+ @Singleton
+ protected CloseableHttpClient provideHttpClient() {
+ return HttpClients.createDefault();
+ }
+
private com.linkedin.restli.client.Client buildRestliClient() {
final String metadataServiceHost =
utils.ConfigUtil.getString(
@@ -255,16 +226,33 @@ private com.linkedin.restli.client.Client buildRestliClient() {
metadataServiceSslProtocol);
}
- protected boolean isSsoEnabled(com.typesafe.config.Config configs) {
- // If OIDC is enabled, we infer SSO to be enabled.
- return configs.hasPath(OIDC_ENABLED_CONFIG_PATH)
- && Boolean.TRUE.equals(Boolean.parseBoolean(configs.getString(OIDC_ENABLED_CONFIG_PATH)));
+ protected boolean doesMetadataServiceUseSsl(com.typesafe.config.Config configs) {
+ return configs.hasPath(METADATA_SERVICE_USE_SSL_CONFIG_PATH)
+ ? configs.getBoolean(METADATA_SERVICE_USE_SSL_CONFIG_PATH)
+ : Boolean.parseBoolean(
+ Configuration.getEnvironmentVariable(GMS_USE_SSL_ENV_VAR, DEFAULT_GMS_USE_SSL));
}
- protected boolean isMetadataServiceAuthEnabled(com.typesafe.config.Config configs) {
- // If OIDC is enabled, we infer SSO to be enabled.
- return configs.hasPath(METADATA_SERVICE_AUTH_ENABLED_CONFIG_PATH)
- && Boolean.TRUE.equals(
- Boolean.parseBoolean(configs.getString(METADATA_SERVICE_AUTH_ENABLED_CONFIG_PATH)));
+ protected String getMetadataServiceHost(com.typesafe.config.Config configs) {
+ return configs.hasPath(METADATA_SERVICE_HOST_CONFIG_PATH)
+ ? configs.getString(METADATA_SERVICE_HOST_CONFIG_PATH)
+ : Configuration.getEnvironmentVariable(GMS_HOST_ENV_VAR, DEFAULT_GMS_HOST);
+ }
+
+ protected Integer getMetadataServicePort(com.typesafe.config.Config configs) {
+ return configs.hasPath(METADATA_SERVICE_PORT_CONFIG_PATH)
+ ? configs.getInt(METADATA_SERVICE_PORT_CONFIG_PATH)
+ : Integer.parseInt(
+ Configuration.getEnvironmentVariable(GMS_PORT_ENV_VAR, DEFAULT_GMS_PORT));
+ }
+
+ protected String getSsoSettingsRequestUrl(com.typesafe.config.Config configs) {
+ final String protocol = doesMetadataServiceUseSsl(configs) ? "https" : "http";
+ final String metadataServiceHost = getMetadataServiceHost(configs);
+ final Integer metadataServicePort = getMetadataServicePort(configs);
+
+ return String.format(
+ "%s://%s:%s/%s",
+ protocol, metadataServiceHost, metadataServicePort, GET_SSO_SETTINGS_ENDPOINT);
}
}
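For reference, the new `getSsoSettingsRequestUrl` helper simply composes the GMS endpoint that the SSO manager polls for settings. A minimal standalone sketch of that composition follows; the host, port, and SSL flag are illustrative placeholders, not values taken from this patch (the real method resolves them from `application.conf` or the GMS host/port environment variables):

```java
public class SsoSettingsUrlSketch {
  private static final String GET_SSO_SETTINGS_ENDPOINT = "auth/getSsoSettings";

  // Mirrors AuthModule.getSsoSettingsRequestUrl: protocol + host + port + endpoint.
  static String ssoSettingsRequestUrl(boolean useSsl, String host, int port) {
    final String protocol = useSsl ? "https" : "http";
    return String.format("%s://%s:%s/%s", protocol, host, port, GET_SSO_SETTINGS_ENDPOINT);
  }

  public static void main(String[] args) {
    // Assuming a local GMS on port 8080 (placeholder values):
    System.out.println(ssoSettingsRequestUrl(false, "localhost", 8080));
    // -> http://localhost:8080/auth/getSsoSettings
  }
}
```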
diff --git a/datahub-frontend/app/auth/AuthUtils.java b/datahub-frontend/app/auth/AuthUtils.java
index 283a2164584b95..84488a43f253e1 100644
--- a/datahub-frontend/app/auth/AuthUtils.java
+++ b/datahub-frontend/app/auth/AuthUtils.java
@@ -56,6 +56,26 @@ public class AuthUtils {
public static final String TITLE = "title";
public static final String INVITE_TOKEN = "inviteToken";
public static final String RESET_TOKEN = "resetToken";
+ public static final String BASE_URL = "baseUrl";
+ public static final String OIDC_ENABLED = "oidcEnabled";
+ public static final String CLIENT_ID = "clientId";
+ public static final String CLIENT_SECRET = "clientSecret";
+ public static final String DISCOVERY_URI = "discoveryUri";
+
+ public static final String USER_NAME_CLAIM = "userNameClaim";
+ public static final String USER_NAME_CLAIM_REGEX = "userNameClaimRegex";
+ public static final String SCOPE = "scope";
+ public static final String CLIENT_NAME = "clientName";
+ public static final String CLIENT_AUTHENTICATION_METHOD = "clientAuthenticationMethod";
+ public static final String JIT_PROVISIONING_ENABLED = "jitProvisioningEnabled";
+ public static final String PRE_PROVISIONING_REQUIRED = "preProvisioningRequired";
+ public static final String EXTRACT_GROUPS_ENABLED = "extractGroupsEnabled";
+ public static final String GROUPS_CLAIM = "groupsClaim";
+ public static final String RESPONSE_TYPE = "responseType";
+ public static final String RESPONSE_MODE = "responseMode";
+ public static final String USE_NONCE = "useNonce";
+ public static final String READ_TIMEOUT = "readTimeout";
+ public static final String EXTRACT_JWT_ACCESS_TOKEN_CLAIMS = "extractJwtAccessTokenClaims";
/**
* Determines whether the inbound request should be forward to downstream Metadata Service. Today,
diff --git a/datahub-frontend/app/auth/sso/SsoConfigs.java b/datahub-frontend/app/auth/sso/SsoConfigs.java
index 1f8455e773ffb1..976d0826f22770 100644
--- a/datahub-frontend/app/auth/sso/SsoConfigs.java
+++ b/datahub-frontend/app/auth/sso/SsoConfigs.java
@@ -1,8 +1,16 @@
package auth.sso;
-import static auth.ConfigUtil.*;
+import com.fasterxml.jackson.databind.JsonNode;
+import com.fasterxml.jackson.databind.ObjectMapper;
-/** Class responsible for extracting and validating top-level SSO related configurations. */
+import static auth.AuthUtils.*;
+
+
+/**
+ * Class responsible for extracting and validating top-level SSO related configurations. TODO:
+ * Refactor SsoConfigs to have OidcConfigs and other identity provider specific configs as instance
+ * variables. SsoManager should ideally not know about identity provider specific configs.
+ */
public class SsoConfigs {
/** Required configs */
@@ -22,16 +30,11 @@ public class SsoConfigs {
private final String _authSuccessRedirectPath;
private final Boolean _oidcEnabled;
- public SsoConfigs(final com.typesafe.config.Config configs) {
- _authBaseUrl = getRequired(configs, AUTH_BASE_URL_CONFIG_PATH);
- _authBaseCallbackPath =
- getOptional(configs, AUTH_BASE_CALLBACK_PATH_CONFIG_PATH, DEFAULT_BASE_CALLBACK_PATH);
- _authSuccessRedirectPath =
- getOptional(configs, AUTH_SUCCESS_REDIRECT_PATH_CONFIG_PATH, DEFAULT_SUCCESS_REDIRECT_PATH);
- _oidcEnabled =
- configs.hasPath(OIDC_ENABLED_CONFIG_PATH)
- && Boolean.TRUE.equals(
- Boolean.parseBoolean(configs.getString(OIDC_ENABLED_CONFIG_PATH)));
+ public SsoConfigs(Builder<?> builder) {
+ _authBaseUrl = builder._authBaseUrl;
+ _authBaseCallbackPath = builder._authBaseCallbackPath;
+ _authSuccessRedirectPath = builder._authSuccessRedirectPath;
+ _oidcEnabled = builder._oidcEnabled;
}
public String getAuthBaseUrl() {
@@ -49,4 +52,52 @@ public String getAuthSuccessRedirectPath() {
public Boolean isOidcEnabled() {
return _oidcEnabled;
}
+
+ public static class Builder<T extends Builder<T>> {
+ protected String _authBaseUrl = null;
+ private String _authBaseCallbackPath = DEFAULT_BASE_CALLBACK_PATH;
+ private String _authSuccessRedirectPath = DEFAULT_SUCCESS_REDIRECT_PATH;
+ protected Boolean _oidcEnabled = false;
+ private final ObjectMapper _objectMapper = new ObjectMapper();
+ protected JsonNode jsonNode = null;
+
+ // No need to check if changes are made since this method is only called at start-up.
+ public Builder from(final com.typesafe.config.Config configs) {
+ if (configs.hasPath(AUTH_BASE_URL_CONFIG_PATH)) {
+ _authBaseUrl = configs.getString(AUTH_BASE_URL_CONFIG_PATH);
+ }
+ if (configs.hasPath(AUTH_BASE_CALLBACK_PATH_CONFIG_PATH)) {
+ _authBaseCallbackPath = configs.getString(AUTH_BASE_CALLBACK_PATH_CONFIG_PATH);
+ }
+ if (configs.hasPath(OIDC_ENABLED_CONFIG_PATH)) {
+ _oidcEnabled =
+ Boolean.TRUE.equals(Boolean.parseBoolean(configs.getString(OIDC_ENABLED_CONFIG_PATH)));
+ }
+ if (configs.hasPath(AUTH_SUCCESS_REDIRECT_PATH_CONFIG_PATH)) {
+ _authSuccessRedirectPath = configs.getString(AUTH_SUCCESS_REDIRECT_PATH_CONFIG_PATH);
+ }
+ return this;
+ }
+
+ public Builder from(String ssoSettingsJsonStr) {
+ try {
+ jsonNode = _objectMapper.readTree(ssoSettingsJsonStr);
+ } catch (Exception e) {
+ throw new RuntimeException(
+ String.format("Failed to parse ssoSettingsJsonStr %s into JSON", ssoSettingsJsonStr));
+ }
+ if (jsonNode.has(BASE_URL)) {
+ _authBaseUrl = jsonNode.get(BASE_URL).asText();
+ }
+ if (jsonNode.has(OIDC_ENABLED)) {
+ _oidcEnabled = jsonNode.get(OIDC_ENABLED).asBoolean();
+ }
+
+ return this;
+ }
+
+ public SsoConfigs build() {
+ return new SsoConfigs(this);
+ }
+ }
}
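The new builder gives SsoConfigs two construction paths: from the static Play configs at startup, and from the JSON document returned by GMS at runtime. A minimal sketch of the JSON path, using a hypothetical payload that carries only the two keys `Builder.from(String)` reads (`baseUrl` and `oidcEnabled`):

```java
import auth.sso.SsoConfigs;

public class SsoConfigsFromJsonSketch {
  public static void main(String[] args) {
    // Hypothetical SSO settings payload; real payloads come from /auth/getSsoSettings.
    final String ssoSettingsJson =
        "{\"baseUrl\": \"https://datahub.example.com\", \"oidcEnabled\": true}";

    SsoConfigs ssoConfigs = new SsoConfigs.Builder().from(ssoSettingsJson).build();

    System.out.println(ssoConfigs.getAuthBaseUrl()); // https://datahub.example.com
    System.out.println(ssoConfigs.isOidcEnabled()); // true
  }
}
```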
diff --git a/datahub-frontend/app/auth/sso/SsoManager.java b/datahub-frontend/app/auth/sso/SsoManager.java
index bf33f4148a5531..8377eb40e237f7 100644
--- a/datahub-frontend/app/auth/sso/SsoManager.java
+++ b/datahub-frontend/app/auth/sso/SsoManager.java
@@ -1,13 +1,49 @@
package auth.sso;
+import auth.sso.oidc.OidcConfigs;
+import auth.sso.oidc.OidcProvider;
+import com.datahub.authentication.Authentication;
+import java.util.Objects;
+import java.util.Optional;
import javax.annotation.Nonnull;
+import javax.annotation.Nullable;
+import lombok.extern.slf4j.Slf4j;
+import org.apache.http.HttpEntity;
+import org.apache.http.HttpStatus;
+import org.apache.http.client.methods.CloseableHttpResponse;
+import org.apache.http.client.methods.HttpPost;
+import org.apache.http.entity.StringEntity;
+import org.apache.http.impl.client.CloseableHttpClient;
+import org.apache.http.util.EntityUtils;
+import play.mvc.Http;
-/** Singleton class that stores & serves reference to a single {@link SsoProvider} if one exists. */
+/**
+ * Singleton class that stores & serves reference to a single {@link SsoProvider} if one exists.
+ * TODO: Refactor SsoManager to only accept SsoConfigs when initialized. See SsoConfigs TODO as
+ * well.
+ */
+@Slf4j
public class SsoManager {
private SsoProvider<?> _provider; // Only one active provider at a time.
+ private final Authentication
+ _authentication; // Authentication used to fetch SSO settings from GMS
+ private final String _ssoSettingsRequestUrl; // SSO settings request URL.
+ private final CloseableHttpClient _httpClient; // HTTP client for making requests to GMS.
+ private com.typesafe.config.Config _configs;
- public SsoManager() {}
+ public SsoManager(
+ com.typesafe.config.Config configs,
+ Authentication authentication,
+ String ssoSettingsRequestUrl,
+ CloseableHttpClient httpClient) {
+ _configs = configs;
+ _authentication = Objects.requireNonNull(authentication, "authentication cannot be null");
+ _ssoSettingsRequestUrl =
+ Objects.requireNonNull(ssoSettingsRequestUrl, "ssoSettingsRequestUrl cannot be null");
+ _httpClient = Objects.requireNonNull(httpClient, "httpClient cannot be null");
+ _provider = null;
+ }
/**
* Returns true if SSO is enabled, meaning a non-null {@link SsoProvider} has been provided to the
@@ -16,6 +52,7 @@ public SsoManager() {}
* @return true if SSO logic is enabled, false otherwise.
*/
public boolean isSsoEnabled() {
+ refreshSsoProvider();
return _provider != null;
}
@@ -24,17 +61,138 @@ public boolean isSsoEnabled() {
*
* @param provider the new {@link SsoProvider} to be used during authentication.
*/
- public void setSsoProvider(@Nonnull final SsoProvider<?> provider) {
+ public void setSsoProvider(final SsoProvider<?> provider) {
_provider = provider;
}
+ public void setConfigs(final com.typesafe.config.Config configs) {
+ _configs = configs;
+ }
+
+ public void clearSsoProvider() {
+ _provider = null;
+ }
+
/**
* Gets the active {@link SsoProvider} instance.
*
* @return the {@SsoProvider} that should be used during authentication and on IdP callback, or
* null if SSO is not enabled.
*/
+ @Nullable
public SsoProvider<?> getSsoProvider() {
return _provider;
}
+
+ public void initializeSsoProvider() {
+ SsoConfigs ssoConfigs = null;
+ try {
+ ssoConfigs = new SsoConfigs.Builder().from(_configs).build();
+ } catch (Exception e) {
+ // Debug-level logging since this is expected to fail if SSO has not been configured.
+ log.debug(String.format("Missing SSO settings in static configs %s", _configs), e);
+ }
+
+ if (ssoConfigs != null && ssoConfigs.isOidcEnabled()) {
+ try {
+ OidcConfigs oidcConfigs = new OidcConfigs.Builder().from(_configs).build();
+ maybeUpdateOidcProvider(oidcConfigs);
+ } catch (Exception e) {
+ // Error-level logging since this is unexpected to fail if SSO has been configured.
+ log.error(String.format("Error building OidcConfigs from static configs %s", _configs), e);
+ }
+ } else {
+ // Clear the SSO Provider since no SSO is enabled.
+ clearSsoProvider();
+ }
+
+ refreshSsoProvider();
+ }
+
+ private void refreshSsoProvider() {
+ final Optional<String> maybeSsoSettingsJsonStr = getDynamicSsoSettings();
+ if (maybeSsoSettingsJsonStr.isEmpty()) {
+ return;
+ }
+
+ // If we receive a non-empty response, try to update the SSO provider.
+ final String ssoSettingsJsonStr = maybeSsoSettingsJsonStr.get();
+ SsoConfigs ssoConfigs;
+ try {
+ ssoConfigs = new SsoConfigs.Builder().from(ssoSettingsJsonStr).build();
+ } catch (Exception e) {
+ log.error(
+ String.format(
+ "Error building SsoConfigs from invalid json %s, reusing previous settings",
+ ssoSettingsJsonStr),
+ e);
+ return;
+ }
+
+ if (ssoConfigs != null && ssoConfigs.isOidcEnabled()) {
+ try {
+ OidcConfigs oidcConfigs =
+ new OidcConfigs.Builder().from(_configs, ssoSettingsJsonStr).build();
+ maybeUpdateOidcProvider(oidcConfigs);
+ } catch (Exception e) {
+ log.error(
+ String.format(
+ "Error building OidcConfigs from invalid json %s, reusing previous settings",
+ ssoSettingsJsonStr),
+ e);
+ }
+ } else {
+ // Clear the SSO Provider since no SSO is enabled.
+ clearSsoProvider();
+ }
+ }
+
+ private void maybeUpdateOidcProvider(OidcConfigs oidcConfigs) {
+ SsoProvider existingSsoProvider = getSsoProvider();
+ if (existingSsoProvider instanceof OidcProvider) {
+ OidcProvider existingOidcProvider = (OidcProvider) existingSsoProvider;
+ // If the existing provider is an OIDC provider and the configs are the same, do nothing.
+ if (existingOidcProvider.configs().equals(oidcConfigs)) {
+ return;
+ }
+ }
+
+ OidcProvider oidcProvider = new OidcProvider(oidcConfigs);
+ setSsoProvider(oidcProvider);
+ }
+
+ /** Call the Auth Service to get SSO settings */
+ @Nonnull
+ private Optional<String> getDynamicSsoSettings() {
+ CloseableHttpResponse response = null;
+ try {
+ final HttpPost request = new HttpPost(_ssoSettingsRequestUrl);
+
+ // Empty request body; this endpoint only requires the system authorization header.
+ request.setEntity(new StringEntity(""));
+
+ // Add authorization header with DataHub frontend system id and secret.
+ request.addHeader(Http.HeaderNames.AUTHORIZATION, _authentication.getCredentials());
+
+ response = _httpClient.execute(request);
+ final HttpEntity entity = response.getEntity();
+ if (response.getStatusLine().getStatusCode() == HttpStatus.SC_OK && entity != null) {
+ // Successfully received the SSO settings
+ return Optional.of(EntityUtils.toString(entity));
+ } else {
+ log.debug("No SSO settings received from Auth Service, reusing previous settings");
+ }
+ } catch (Exception e) {
+ log.warn("Failed to get SSO settings due to exception, reusing previous settings", e);
+ } finally {
+ try {
+ if (response != null) {
+ response.close();
+ }
+ } catch (Exception e) {
+ log.warn("Failed to close http response", e);
+ }
+ }
+ return Optional.empty();
+ }
}
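Taken together with the AuthModule change above, the manager is now constructed with the GMS settings URL and refreshed lazily: `initializeSsoProvider()` seeds the provider from static configs, and each `isSsoEnabled()` call re-checks the dynamic settings. A rough wiring sketch, under assumed placeholder credentials and an assumed local GMS endpoint (in the real app all of these are provided by Guice in AuthModule):

```java
import auth.sso.SsoManager;
import com.datahub.authentication.Actor;
import com.datahub.authentication.ActorType;
import com.datahub.authentication.Authentication;
import com.typesafe.config.Config;
import com.typesafe.config.ConfigFactory;
import org.apache.http.impl.client.HttpClients;

public class SsoManagerWiringSketch {
  public static void main(String[] args) {
    Config configs = ConfigFactory.load(); // static Play/HOCON configs

    // Placeholder system identity; the real credentials come from the frontend system id/secret.
    Authentication systemAuthentication =
        new Authentication(
            new Actor(ActorType.USER, "__datahub_system"), "Basic __datahub_system:placeholder");

    SsoManager ssoManager =
        new SsoManager(
            configs,
            systemAuthentication,
            "http://localhost:8080/auth/getSsoSettings", // assumed local GMS endpoint
            HttpClients.createDefault());

    ssoManager.initializeSsoProvider();
    System.out.println("SSO enabled: " + ssoManager.isSsoEnabled());
  }
}
```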
diff --git a/datahub-frontend/app/auth/sso/oidc/OidcConfigs.java b/datahub-frontend/app/auth/sso/oidc/OidcConfigs.java
index 6877ca187da973..bf3384527af11f 100644
--- a/datahub-frontend/app/auth/sso/oidc/OidcConfigs.java
+++ b/datahub-frontend/app/auth/sso/oidc/OidcConfigs.java
@@ -1,8 +1,10 @@
package auth.sso.oidc;
+import static auth.AuthUtils.*;
import static auth.ConfigUtil.*;
import auth.sso.SsoConfigs;
+import java.util.Objects;
import java.util.Optional;
import lombok.Getter;
@@ -44,79 +46,204 @@ public class OidcConfigs extends SsoConfigs {
private static final String DEFAULT_OIDC_USERNAME_CLAIM = "email";
private static final String DEFAULT_OIDC_USERNAME_CLAIM_REGEX = "(.*)";
- private static final String DEFAULT_OIDC_SCOPE =
- "openid profile email"; // Often "group" must be included for groups.
+ private static final String DEFAULT_OIDC_SCOPE = "openid profile email";
+ // Often "group" must be included for groups.
private static final String DEFAULT_OIDC_CLIENT_NAME = "oidc";
private static final String DEFAULT_OIDC_CLIENT_AUTHENTICATION_METHOD = "client_secret_basic";
private static final String DEFAULT_OIDC_JIT_PROVISIONING_ENABLED = "true";
private static final String DEFAULT_OIDC_PRE_PROVISIONING_REQUIRED = "false";
- private static final String DEFAULT_OIDC_EXTRACT_GROUPS_ENABLED =
- "false"; // False since extraction of groups can overwrite existing group membership.
+ private static final String DEFAULT_OIDC_EXTRACT_GROUPS_ENABLED = "false";
+ // False since extraction of groups can overwrite existing group membership.
private static final String DEFAULT_OIDC_GROUPS_CLAIM = "groups";
private static final String DEFAULT_OIDC_READ_TIMEOUT = "5000";
- private String clientId;
- private String clientSecret;
- private String discoveryUri;
- private String userNameClaim;
- private String userNameClaimRegex;
- private String scope;
- private String clientName;
- private String clientAuthenticationMethod;
- private boolean jitProvisioningEnabled;
- private boolean preProvisioningRequired;
- private boolean extractGroupsEnabled;
- private String groupsClaimName;
- private Optional<String> responseType;
- private Optional<String> responseMode;
- private Optional<Boolean> useNonce;
- private Optional<String> customParamResource;
- private String readTimeout;
- private Optional<Boolean> extractJwtAccessTokenClaims;
+ private final String clientId;
+ private final String clientSecret;
+ private final String discoveryUri;
+ private final String userNameClaim;
+ private final String userNameClaimRegex;
+ private final String scope;
+ private final String clientName;
+ private final String clientAuthenticationMethod;
+ private final boolean jitProvisioningEnabled;
+ private final boolean preProvisioningRequired;
+ private final boolean extractGroupsEnabled;
+ private final String groupsClaimName;
+ private final Optional<String> responseType;
+ private final Optional<String> responseMode;
+ private final Optional<Boolean> useNonce;
+ private final Optional<String> customParamResource;
+ private final String readTimeout;
+ private final Optional<Boolean> extractJwtAccessTokenClaims;
private Optional<String> preferredJwsAlgorithm;
- public OidcConfigs(final com.typesafe.config.Config configs) {
- super(configs);
- clientId = getRequired(configs, OIDC_CLIENT_ID_CONFIG_PATH);
- clientSecret = getRequired(configs, OIDC_CLIENT_SECRET_CONFIG_PATH);
- discoveryUri = getRequired(configs, OIDC_DISCOVERY_URI_CONFIG_PATH);
- userNameClaim =
- getOptional(configs, OIDC_USERNAME_CLAIM_CONFIG_PATH, DEFAULT_OIDC_USERNAME_CLAIM);
- userNameClaimRegex =
- getOptional(
- configs, OIDC_USERNAME_CLAIM_REGEX_CONFIG_PATH, DEFAULT_OIDC_USERNAME_CLAIM_REGEX);
- scope = getOptional(configs, OIDC_SCOPE_CONFIG_PATH, DEFAULT_OIDC_SCOPE);
- clientName = getOptional(configs, OIDC_CLIENT_NAME_CONFIG_PATH, DEFAULT_OIDC_CLIENT_NAME);
- clientAuthenticationMethod =
- getOptional(
- configs,
- OIDC_CLIENT_AUTHENTICATION_METHOD_CONFIG_PATH,
- DEFAULT_OIDC_CLIENT_AUTHENTICATION_METHOD);
- jitProvisioningEnabled =
- Boolean.parseBoolean(
- getOptional(
- configs,
- OIDC_JIT_PROVISIONING_ENABLED_CONFIG_PATH,
- DEFAULT_OIDC_JIT_PROVISIONING_ENABLED));
- preProvisioningRequired =
- Boolean.parseBoolean(
- getOptional(
- configs,
- OIDC_PRE_PROVISIONING_REQUIRED_CONFIG_PATH,
- DEFAULT_OIDC_PRE_PROVISIONING_REQUIRED));
- extractGroupsEnabled =
- Boolean.parseBoolean(
- getOptional(configs, OIDC_EXTRACT_GROUPS_ENABLED, DEFAULT_OIDC_EXTRACT_GROUPS_ENABLED));
- groupsClaimName =
- getOptional(configs, OIDC_GROUPS_CLAIM_CONFIG_PATH_CONFIG_PATH, DEFAULT_OIDC_GROUPS_CLAIM);
- responseType = getOptional(configs, OIDC_RESPONSE_TYPE);
- responseMode = getOptional(configs, OIDC_RESPONSE_MODE);
- useNonce = getOptional(configs, OIDC_USE_NONCE).map(Boolean::parseBoolean);
- customParamResource = getOptional(configs, OIDC_CUSTOM_PARAM_RESOURCE);
- readTimeout = getOptional(configs, OIDC_READ_TIMEOUT, DEFAULT_OIDC_READ_TIMEOUT);
- extractJwtAccessTokenClaims =
- getOptional(configs, OIDC_EXTRACT_JWT_ACCESS_TOKEN_CLAIMS).map(Boolean::parseBoolean);
- preferredJwsAlgorithm =
- Optional.ofNullable(getOptional(configs, OIDC_PREFERRED_JWS_ALGORITHM, null));
+ public OidcConfigs(Builder builder) {
+ super(builder);
+ this.clientId = builder.clientId;
+ this.clientSecret = builder.clientSecret;
+ this.discoveryUri = builder.discoveryUri;
+ this.userNameClaim = builder.userNameClaim;
+ this.userNameClaimRegex = builder.userNameClaimRegex;
+ this.scope = builder.scope;
+ this.clientName = builder.clientName;
+ this.clientAuthenticationMethod = builder.clientAuthenticationMethod;
+ this.jitProvisioningEnabled = builder.jitProvisioningEnabled;
+ this.preProvisioningRequired = builder.preProvisioningRequired;
+ this.extractGroupsEnabled = builder.extractGroupsEnabled;
+ this.groupsClaimName = builder.groupsClaimName;
+ this.responseType = builder.responseType;
+ this.responseMode = builder.responseMode;
+ this.useNonce = builder.useNonce;
+ this.customParamResource = builder.customParamResource;
+ this.readTimeout = builder.readTimeout;
+ this.extractJwtAccessTokenClaims = builder.extractJwtAccessTokenClaims;
+ this.preferredJwsAlgorithm = builder.preferredJwsAlgorithm;
+ }
+
+ public static class Builder extends SsoConfigs.Builder<Builder> {
+ private String clientId;
+ private String clientSecret;
+ private String discoveryUri;
+ private String userNameClaim = DEFAULT_OIDC_USERNAME_CLAIM;
+ private String userNameClaimRegex = DEFAULT_OIDC_USERNAME_CLAIM_REGEX;
+ private String scope = DEFAULT_OIDC_SCOPE;
+ private String clientName = DEFAULT_OIDC_CLIENT_NAME;
+ private String clientAuthenticationMethod = DEFAULT_OIDC_CLIENT_AUTHENTICATION_METHOD;
+ private boolean jitProvisioningEnabled =
+ Boolean.parseBoolean(DEFAULT_OIDC_JIT_PROVISIONING_ENABLED);
+ private boolean preProvisioningRequired =
+ Boolean.parseBoolean(DEFAULT_OIDC_PRE_PROVISIONING_REQUIRED);
+ private boolean extractGroupsEnabled =
+ Boolean.parseBoolean(DEFAULT_OIDC_EXTRACT_GROUPS_ENABLED);
+ private String groupsClaimName = DEFAULT_OIDC_GROUPS_CLAIM;
+ private Optional<String> responseType = Optional.empty();
+ private Optional<String> responseMode = Optional.empty();
+ private Optional<Boolean> useNonce = Optional.empty();
+ private Optional<String> customParamResource = Optional.empty();
+ private String readTimeout = DEFAULT_OIDC_READ_TIMEOUT;
+ private Optional<Boolean> extractJwtAccessTokenClaims = Optional.empty();
+ private Optional<String> preferredJwsAlgorithm = Optional.empty();
+
+ public Builder from(final com.typesafe.config.Config configs) {
+ super.from(configs);
+ clientId = getRequired(configs, OIDC_CLIENT_ID_CONFIG_PATH);
+ clientSecret = getRequired(configs, OIDC_CLIENT_SECRET_CONFIG_PATH);
+ discoveryUri = getRequired(configs, OIDC_DISCOVERY_URI_CONFIG_PATH);
+ userNameClaim =
+ getOptional(configs, OIDC_USERNAME_CLAIM_CONFIG_PATH, DEFAULT_OIDC_USERNAME_CLAIM);
+ userNameClaimRegex =
+ getOptional(
+ configs, OIDC_USERNAME_CLAIM_REGEX_CONFIG_PATH, DEFAULT_OIDC_USERNAME_CLAIM_REGEX);
+ scope = getOptional(configs, OIDC_SCOPE_CONFIG_PATH, DEFAULT_OIDC_SCOPE);
+ clientName = getOptional(configs, OIDC_CLIENT_NAME_CONFIG_PATH, DEFAULT_OIDC_CLIENT_NAME);
+ clientAuthenticationMethod =
+ getOptional(
+ configs,
+ OIDC_CLIENT_AUTHENTICATION_METHOD_CONFIG_PATH,
+ DEFAULT_OIDC_CLIENT_AUTHENTICATION_METHOD);
+ jitProvisioningEnabled =
+ Boolean.parseBoolean(
+ getOptional(
+ configs,
+ OIDC_JIT_PROVISIONING_ENABLED_CONFIG_PATH,
+ DEFAULT_OIDC_JIT_PROVISIONING_ENABLED));
+ preProvisioningRequired =
+ Boolean.parseBoolean(
+ getOptional(
+ configs,
+ OIDC_PRE_PROVISIONING_REQUIRED_CONFIG_PATH,
+ DEFAULT_OIDC_PRE_PROVISIONING_REQUIRED));
+ extractGroupsEnabled =
+ Boolean.parseBoolean(
+ getOptional(
+ configs, OIDC_EXTRACT_GROUPS_ENABLED, DEFAULT_OIDC_EXTRACT_GROUPS_ENABLED));
+ groupsClaimName =
+ getOptional(
+ configs, OIDC_GROUPS_CLAIM_CONFIG_PATH_CONFIG_PATH, DEFAULT_OIDC_GROUPS_CLAIM);
+ responseType = getOptional(configs, OIDC_RESPONSE_TYPE);
+ responseMode = getOptional(configs, OIDC_RESPONSE_MODE);
+ useNonce = getOptional(configs, OIDC_USE_NONCE).map(Boolean::parseBoolean);
+ customParamResource = getOptional(configs, OIDC_CUSTOM_PARAM_RESOURCE);
+ readTimeout = getOptional(configs, OIDC_READ_TIMEOUT, DEFAULT_OIDC_READ_TIMEOUT);
+ extractJwtAccessTokenClaims =
+ getOptional(configs, OIDC_EXTRACT_JWT_ACCESS_TOKEN_CLAIMS).map(Boolean::parseBoolean);
+ preferredJwsAlgorithm =
+ Optional.ofNullable(getOptional(configs, OIDC_PREFERRED_JWS_ALGORITHM, null));
+ return this;
+ }
+
+ public Builder from(final com.typesafe.config.Config configs, final String ssoSettingsJsonStr) {
+ super.from(ssoSettingsJsonStr);
+ if (jsonNode.has(CLIENT_ID)) {
+ clientId = jsonNode.get(CLIENT_ID).asText();
+ }
+ if (jsonNode.has(CLIENT_SECRET)) {
+ clientSecret = jsonNode.get(CLIENT_SECRET).asText();
+ }
+ if (jsonNode.has(DISCOVERY_URI)) {
+ discoveryUri = jsonNode.get(DISCOVERY_URI).asText();
+ }
+ if (jsonNode.has(USER_NAME_CLAIM)) {
+ userNameClaim = jsonNode.get(USER_NAME_CLAIM).asText();
+ }
+ if (jsonNode.has(USER_NAME_CLAIM_REGEX)) {
+ userNameClaimRegex = jsonNode.get(USER_NAME_CLAIM_REGEX).asText();
+ }
+ if (jsonNode.has(SCOPE)) {
+ scope = jsonNode.get(SCOPE).asText();
+ }
+ if (jsonNode.has(CLIENT_NAME)) {
+ clientName = jsonNode.get(CLIENT_NAME).asText();
+ }
+ if (jsonNode.has(CLIENT_AUTHENTICATION_METHOD)) {
+ clientAuthenticationMethod = jsonNode.get(CLIENT_AUTHENTICATION_METHOD).asText();
+ }
+ if (jsonNode.has(JIT_PROVISIONING_ENABLED)) {
+ jitProvisioningEnabled = jsonNode.get(JIT_PROVISIONING_ENABLED).asBoolean();
+ }
+ if (jsonNode.has(PRE_PROVISIONING_REQUIRED)) {
+ preProvisioningRequired = jsonNode.get(PRE_PROVISIONING_REQUIRED).asBoolean();
+ }
+ if (jsonNode.has(EXTRACT_GROUPS_ENABLED)) {
+ extractGroupsEnabled = jsonNode.get(EXTRACT_GROUPS_ENABLED).asBoolean();
+ }
+ if (jsonNode.has(GROUPS_CLAIM)) {
+ groupsClaimName = jsonNode.get(GROUPS_CLAIM).asText();
+ }
+ if (jsonNode.has(RESPONSE_TYPE)) {
+ responseType = Optional.of(jsonNode.get(RESPONSE_TYPE).asText());
+ }
+ if (jsonNode.has(RESPONSE_MODE)) {
+ responseMode = Optional.of(jsonNode.get(RESPONSE_MODE).asText());
+ }
+ if (jsonNode.has(USE_NONCE)) {
+ useNonce = Optional.of(jsonNode.get(USE_NONCE).asBoolean());
+ }
+ if (jsonNode.has(READ_TIMEOUT)) {
+ readTimeout = jsonNode.get(READ_TIMEOUT).asText();
+ }
+ if (jsonNode.has(EXTRACT_JWT_ACCESS_TOKEN_CLAIMS)) {
+ extractJwtAccessTokenClaims =
+ Optional.of(jsonNode.get(EXTRACT_JWT_ACCESS_TOKEN_CLAIMS).asBoolean());
+ }
+ if (jsonNode.has(OIDC_PREFERRED_JWS_ALGORITHM)) {
+ preferredJwsAlgorithm = Optional.of(jsonNode.get(OIDC_PREFERRED_JWS_ALGORITHM).asText());
+ } else {
+ preferredJwsAlgorithm =
+ Optional.ofNullable(getOptional(configs, OIDC_PREFERRED_JWS_ALGORITHM, null));
+ }
+
+ return this;
+ }
+
+ public OidcConfigs build() {
+ Objects.requireNonNull(_oidcEnabled, "oidcEnabled is required");
+ Objects.requireNonNull(clientId, "clientId is required");
+ Objects.requireNonNull(clientSecret, "clientSecret is required");
+ Objects.requireNonNull(discoveryUri, "discoveryUri is required");
+ Objects.requireNonNull(_authBaseUrl, "authBaseUrl is required");
+
+ return new OidcConfigs(this);
+ }
}
}
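As with SsoConfigs, OidcConfigs can now be assembled from the dynamic settings JSON, and `build()` enforces the required fields (oidcEnabled, clientId, clientSecret, discoveryUri, authBaseUrl). A sketch with a hypothetical payload carrying just those fields; all values are placeholders:

```java
import auth.sso.oidc.OidcConfigs;
import com.typesafe.config.ConfigFactory;

public class OidcConfigsFromJsonSketch {
  public static void main(String[] args) {
    // Placeholder values only; a real payload is produced by the /getSsoSettings endpoint.
    final String ssoSettingsJson =
        "{\"baseUrl\": \"https://datahub.example.com\","
            + " \"oidcEnabled\": true,"
            + " \"clientId\": \"my-client\","
            + " \"clientSecret\": \"my-secret\","
            + " \"discoveryUri\": \"https://idp.example.com/.well-known/openid-configuration\"}";

    // The static Config argument is only consulted for preferredJwsAlgorithm in this path,
    // so an empty config is enough for the sketch.
    OidcConfigs oidcConfigs =
        new OidcConfigs.Builder().from(ConfigFactory.empty(), ssoSettingsJson).build();

    System.out.println(oidcConfigs.getDiscoveryUri());
  }
}
```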
diff --git a/datahub-frontend/app/client/AuthServiceClient.java b/datahub-frontend/app/client/AuthServiceClient.java
index 4d40f45cd09b48..baa992994d8ba6 100644
--- a/datahub-frontend/app/client/AuthServiceClient.java
+++ b/datahub-frontend/app/client/AuthServiceClient.java
@@ -3,6 +3,7 @@
import com.datahub.authentication.Authentication;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.node.ObjectNode;
+import com.google.inject.Inject;
import java.nio.charset.StandardCharsets;
import java.util.Objects;
import javax.annotation.Nonnull;
@@ -47,6 +48,7 @@ public class AuthServiceClient {
private final Authentication systemAuthentication;
private final CloseableHttpClient httpClient;
+ @Inject
public AuthServiceClient(
@Nonnull final String metadataServiceHost,
@Nonnull final Integer metadataServicePort,
diff --git a/datahub-frontend/app/controllers/SsoCallbackController.java b/datahub-frontend/app/controllers/SsoCallbackController.java
index 9f4445b1aa5c7f..fb0c324932b6af 100644
--- a/datahub-frontend/app/controllers/SsoCallbackController.java
+++ b/datahub-frontend/app/controllers/SsoCallbackController.java
@@ -9,11 +9,15 @@
import com.linkedin.entity.client.SystemEntityClient;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.List;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.CompletionStage;
import javax.annotation.Nonnull;
import javax.inject.Inject;
import lombok.extern.slf4j.Slf4j;
+import org.pac4j.core.client.Client;
+import org.pac4j.core.client.Clients;
import org.pac4j.core.config.Config;
import org.pac4j.core.engine.CallbackLogic;
import org.pac4j.core.http.adapter.HttpActionAdapter;
@@ -34,6 +38,7 @@
public class SsoCallbackController extends CallbackController {
private final SsoManager _ssoManager;
+ private final Config _config;
@Inject
public SsoCallbackController(
@@ -41,8 +46,10 @@ public SsoCallbackController(
@Nonnull Authentication systemAuthentication,
@Nonnull SystemEntityClient entityClient,
@Nonnull AuthServiceClient authClient,
+ @Nonnull Config config,
@Nonnull com.typesafe.config.Config configs) {
_ssoManager = ssoManager;
+ _config = config;
setDefaultUrl("/"); // By default, redirects to Home Page on log in.
setSaveInSession(false);
setCallbackLogic(
@@ -126,7 +133,18 @@ public Result perform(
}
private boolean shouldHandleCallback(final String protocol) {
- return _ssoManager.isSsoEnabled()
- && _ssoManager.getSsoProvider().protocol().getCommonName().equals(protocol);
+ if (!_ssoManager.isSsoEnabled()) {
+ return false;
+ }
+ updateConfig();
+ return _ssoManager.getSsoProvider().protocol().getCommonName().equals(protocol);
+ }
+
+ private void updateConfig() {
+ final Clients clients = new Clients();
+ final List<Client> clientList = new ArrayList<>();
+ clientList.add(_ssoManager.getSsoProvider().client());
+ clients.setClients(clientList);
+ _config.setClients(clients);
}
}
diff --git a/datahub-frontend/test/app/ApplicationTest.java b/datahub-frontend/test/app/ApplicationTest.java
index a5da0951d16328..8d80c2cfaa47dc 100644
--- a/datahub-frontend/test/app/ApplicationTest.java
+++ b/datahub-frontend/test/app/ApplicationTest.java
@@ -91,6 +91,9 @@ public int gmsServerPort() {
@BeforeAll
public void init() throws IOException {
_gmsServer = new MockWebServer();
+ _gmsServer.enqueue(new MockResponse().setResponseCode(404)); // dynamic settings - not tested
+ _gmsServer.enqueue(new MockResponse().setResponseCode(404)); // dynamic settings - not tested
+ _gmsServer.enqueue(new MockResponse().setResponseCode(404)); // dynamic settings - not tested
_gmsServer.enqueue(new MockResponse().setBody(String.format("{\"value\":\"%s\"}", TEST_USER)));
_gmsServer.enqueue(
new MockResponse().setBody(String.format("{\"accessToken\":\"%s\"}", TEST_TOKEN)));
diff --git a/datahub-frontend/test/security/OidcConfigurationTest.java b/datahub-frontend/test/security/OidcConfigurationTest.java
index a27a1462a8a277..c1147ae936b3ac 100644
--- a/datahub-frontend/test/security/OidcConfigurationTest.java
+++ b/datahub-frontend/test/security/OidcConfigurationTest.java
@@ -311,7 +311,9 @@ public Config withValue(String path, ConfigValue value) {
public void readTimeoutPropagation() {
CONFIG.withValue(OIDC_READ_TIMEOUT, ConfigValueFactory.fromAnyRef("10000"));
- OidcConfigs oidcConfigs = new OidcConfigs(CONFIG);
+ OidcConfigs.Builder oidcConfigsBuilder = new OidcConfigs.Builder();
+ oidcConfigsBuilder.from(CONFIG);
+ OidcConfigs oidcConfigs = oidcConfigsBuilder.build();
OidcProvider oidcProvider = new OidcProvider(oidcConfigs);
assertEquals(10000, ((OidcClient) oidcProvider.client()).getConfiguration().getReadTimeout());
}
diff --git a/metadata-models/src/main/pegasus/com/linkedin/settings/global/GlobalSettingsInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/settings/global/GlobalSettingsInfo.pdl
index 7d83d333843cc6..91dca1de0c452e 100644
--- a/metadata-models/src/main/pegasus/com/linkedin/settings/global/GlobalSettingsInfo.pdl
+++ b/metadata-models/src/main/pegasus/com/linkedin/settings/global/GlobalSettingsInfo.pdl
@@ -7,6 +7,11 @@ namespace com.linkedin.settings.global
"name": "globalSettingsInfo"
}
record GlobalSettingsInfo {
+
+ /**
+ * SSO integrations between DataHub and identity providers
+ */
+ sso: optional SsoSettings
/**
* Settings related to the Views Feature
*/
diff --git a/metadata-models/src/main/pegasus/com/linkedin/settings/global/OidcSettings.pdl b/metadata-models/src/main/pegasus/com/linkedin/settings/global/OidcSettings.pdl
new file mode 100644
index 00000000000000..d5b23c28cb2279
--- /dev/null
+++ b/metadata-models/src/main/pegasus/com/linkedin/settings/global/OidcSettings.pdl
@@ -0,0 +1,96 @@
+namespace com.linkedin.settings.global
+
+/**
+ * Settings for OIDC SSO integration.
+ */
+record OidcSettings {
+ /**
+ * Whether OIDC SSO is enabled.
+ */
+ enabled: boolean
+
+ /**
+ * Unique client id issued by the identity provider.
+ */
+ clientId: string
+
+ /**
+ * Unique client secret issued by the identity provider.
+ */
+ clientSecret: string
+
+ /**
+ * The IdP OIDC discovery url.
+ */
+ discoveryUri: string
+
+ /**
+ * ADVANCED. The attribute / claim used to derive the DataHub username. Defaults to "preferred_username".
+ */
+ userNameClaim: optional string
+
+ /**
+ * ADVANCED. The regex used to parse the DataHub username from the user name claim. Defaults to (.*) (all).
+ */
+ userNameClaimRegex: optional string
+
+ /**
+ * ADVANCED. String representing the requested scope from the IdP. Defaults to "openid profile email".
+ */
+ scope: optional string
+
+ /**
+ * ADVANCED. Which authentication method to use to pass credentials (clientId and clientSecret) to the token endpoint: Defaults to "client_secret_basic".
+ */
+ clientAuthenticationMethod: optional string
+
+ /**
+ * ADVANCED. Whether DataHub users should be provisioned on login if they do not exist. Defaults to true.
+ */
+ jitProvisioningEnabled: optional boolean
+
+ /**
+ * ADVANCED. Whether the user should already exist in DataHub on login, failing login if they are not. Defaults to false.
+ */
+ preProvisioningRequired: optional boolean
+
+ /**
+ * ADVANCED. Whether groups should be extracted from a claim in the OIDC profile. Only applies if JIT provisioning is enabled. Groups will be created if they do not exist. Defaults to true.
+ */
+ extractGroupsEnabled: optional boolean
+
+ /**
+ * ADVANCED. The OIDC claim to extract groups information from. Defaults to 'groups'.
+ */
+ groupsClaim: optional string
+
+ /**
+ * ADVANCED. Response type.
+ */
+ responseType: optional string
+
+ /**
+ * ADVANCED. Response mode.
+ */
+ responseMode: optional string
+
+ /**
+ * ADVANCED. Use Nonce.
+ */
+ useNonce: optional boolean
+
+ /**
+ * ADVANCED. Read timeout.
+ */
+ readTimeout: optional long
+
+ /**
+ * ADVANCED. Whether to extract claims from JWT access token. Defaults to false.
+ */
+ extractJwtAccessTokenClaims: optional boolean
+
+ /**
+ * ADVANCED. Which jws algorithm to use.
+ */
+ preferredJwsAlgorithm: optional string
+}
\ No newline at end of file
diff --git a/metadata-models/src/main/pegasus/com/linkedin/settings/global/SsoSettings.pdl b/metadata-models/src/main/pegasus/com/linkedin/settings/global/SsoSettings.pdl
new file mode 100644
index 00000000000000..047aa675717ca6
--- /dev/null
+++ b/metadata-models/src/main/pegasus/com/linkedin/settings/global/SsoSettings.pdl
@@ -0,0 +1,16 @@
+namespace com.linkedin.settings.global
+
+/**
+ * SSO Integrations, supported on the UI.
+ */
+record SsoSettings {
+ /**
+ * Auth base URL.
+ */
+ baseUrl: string
+
+ /**
+ * Optional OIDC SSO settings.
+ */
+ oidcSettings: optional OidcSettings
+}
\ No newline at end of file
diff --git a/metadata-models/src/main/pegasus/com/linkedin/settings/global/SsoType.pdl b/metadata-models/src/main/pegasus/com/linkedin/settings/global/SsoType.pdl
new file mode 100644
index 00000000000000..8596ed20b6afdb
--- /dev/null
+++ b/metadata-models/src/main/pegasus/com/linkedin/settings/global/SsoType.pdl
@@ -0,0 +1,11 @@
+namespace com.linkedin.settings.global
+
+/**
+ * Enum to define SSO protocol type.
+ */
+enum SsoType {
+ /**
+ * OIDC SSO is used.
+ */
+ OIDC
+}
\ No newline at end of file
diff --git a/metadata-service/auth-servlet-impl/src/main/java/com/datahub/auth/authentication/AuthServiceController.java b/metadata-service/auth-servlet-impl/src/main/java/com/datahub/auth/authentication/AuthServiceController.java
index 4e9fe3e335dc37..ae04382dcbf5a3 100644
--- a/metadata-service/auth-servlet-impl/src/main/java/com/datahub/auth/authentication/AuthServiceController.java
+++ b/metadata-service/auth-servlet-impl/src/main/java/com/datahub/auth/authentication/AuthServiceController.java
@@ -17,6 +17,12 @@
import com.linkedin.common.urn.CorpuserUrn;
import com.linkedin.common.urn.Urn;
import com.linkedin.gms.factory.config.ConfigurationProvider;
+import com.linkedin.metadata.entity.EntityService;
+import com.linkedin.metadata.secret.SecretService;
+import com.linkedin.settings.global.GlobalSettingsInfo;
+import com.linkedin.settings.global.OidcSettings;
+import com.linkedin.settings.global.SsoSettings;
+import java.util.Objects;
import java.util.concurrent.CompletableFuture;
import javax.annotation.Nullable;
import javax.inject.Inject;
@@ -48,6 +54,25 @@ public class AuthServiceController {
private static final String ARE_NATIVE_USER_CREDENTIALS_RESET_FIELD_NAME =
"areNativeUserCredentialsReset";
private static final String DOES_PASSWORD_MATCH_FIELD_NAME = "doesPasswordMatch";
+ private static final String BASE_URL = "baseUrl";
+ private static final String OIDC_ENABLED = "oidcEnabled";
+ private static final String CLIENT_ID = "clientId";
+ private static final String CLIENT_SECRET = "clientSecret";
+ private static final String DISCOVERY_URI = "discoveryUri";
+ private static final String USER_NAME_CLAIM = "userNameClaim";
+ private static final String USER_NAME_CLAIM_REGEX = "userNameClaimRegex";
+ private static final String SCOPE = "scope";
+ private static final String CLIENT_AUTHENTICATION_METHOD = "clientAuthenticationMethod";
+ private static final String JIT_PROVISIONING_ENABLED = "jitProvisioningEnabled";
+ private static final String PRE_PROVISIONING_REQUIRED = "preProvisioningRequired";
+ private static final String EXTRACT_GROUPS_ENABLED = "extractGroupsEnabled";
+ private static final String GROUPS_CLAIM = "groupsClaim";
+ private static final String RESPONSE_TYPE = "responseType";
+ private static final String RESPONSE_MODE = "responseMode";
+ private static final String USE_NONCE = "useNonce";
+ private static final String READ_TIMEOUT = "readTimeout";
+ private static final String EXTRACT_JWT_ACCESS_TOKEN_CLAIMS = "extractJwtAccessTokenClaims";
+ private static final String PREFERRED_JWS_ALGORITHM = "preferredJwsAlgorithm";
@Inject StatelessTokenService _statelessTokenService;
@@ -59,6 +84,10 @@ public class AuthServiceController {
@Inject NativeUserService _nativeUserService;
+ @Inject EntityService _entityService;
+
+ @Inject SecretService _secretService;
+
@Inject InviteTokenService _inviteTokenService;
@Inject @Nullable TrackingService _trackingService;
@@ -361,6 +390,41 @@ CompletableFuture> track(final HttpEntity httpEnt
});
}
+ /**
+ * Gets possible SSO settings.
+ *
+ * Example Request:
+ *
+ *
+ * <pre>POST /getSsoSettings -H "Authorization: Basic <system-client-id>:<system-client-secret>"</pre>
+ *
+ * Example Response:
+ *
+ *
{ "clientId": "clientId", "clientSecret": "secret", "discoveryUri = "discoveryUri" }
+ */
+ @PostMapping(value = "/getSsoSettings", produces = "application/json;charset=utf-8")
+ CompletableFuture<ResponseEntity<String>> getSsoSettings(final HttpEntity<String> httpEntity) {
+ return CompletableFuture.supplyAsync(
+ () -> {
+ try {
+ GlobalSettingsInfo globalSettingsInfo =
+ (GlobalSettingsInfo)
+ _entityService.getLatestAspect(
+ GLOBAL_SETTINGS_URN, GLOBAL_SETTINGS_INFO_ASPECT_NAME);
+ if (globalSettingsInfo == null || !globalSettingsInfo.hasSso()) {
+ log.debug("There are no SSO settings available");
+ return new ResponseEntity<>(HttpStatus.NOT_FOUND);
+ }
+ SsoSettings ssoSettings =
+ Objects.requireNonNull(globalSettingsInfo.getSso(), "ssoSettings cannot be null");
+ String response = buildSsoSettingsResponse(ssoSettings);
+ return new ResponseEntity<>(response, HttpStatus.OK);
+ } catch (Exception e) {
+ return new ResponseEntity<>(HttpStatus.INTERNAL_SERVER_ERROR);
+ }
+ });
+ }
+
// Currently, only internal system is authorized to generate a token on behalf of a user!
private boolean isAuthorizedToGenerateSessionToken(final String actorId) {
// Verify that the actor is an internal system caller.
@@ -391,4 +455,67 @@ private String buildVerifyNativeUserPasswordResponse(final boolean doesPasswordM
json.put(DOES_PASSWORD_MATCH_FIELD_NAME, doesPasswordMatch);
return json.toString();
}
+
+ private String buildSsoSettingsResponse(final SsoSettings ssoSettings) {
+ String baseUrl = Objects.requireNonNull(ssoSettings.getBaseUrl());
+ JSONObject json = new JSONObject();
+ json.put(BASE_URL, baseUrl);
+
+ if (ssoSettings.hasOidcSettings()) {
+ OidcSettings oidcSettings =
+ Objects.requireNonNull(ssoSettings.getOidcSettings(), "oidcSettings cannot be null");
+ buildOidcSettingsResponse(json, oidcSettings);
+ }
+
+ return json.toString();
+ }
+
+ private void buildOidcSettingsResponse(JSONObject json, final OidcSettings oidcSettings) {
+ json.put(OIDC_ENABLED, oidcSettings.isEnabled());
+ json.put(CLIENT_ID, oidcSettings.getClientId());
+ json.put(CLIENT_SECRET, _secretService.decrypt(oidcSettings.getClientSecret()));
+ json.put(DISCOVERY_URI, oidcSettings.getDiscoveryUri());
+ if (oidcSettings.hasUserNameClaim()) {
+ json.put(USER_NAME_CLAIM, oidcSettings.getUserNameClaim());
+ }
+ if (oidcSettings.hasUserNameClaimRegex()) {
+ json.put(USER_NAME_CLAIM_REGEX, oidcSettings.getUserNameClaimRegex());
+ }
+ if (oidcSettings.hasScope()) {
+ json.put(SCOPE, oidcSettings.getScope());
+ }
+ if (oidcSettings.hasClientAuthenticationMethod()) {
+ json.put(CLIENT_AUTHENTICATION_METHOD, oidcSettings.getClientAuthenticationMethod());
+ }
+ if (oidcSettings.hasJitProvisioningEnabled()) {
+ json.put(JIT_PROVISIONING_ENABLED, oidcSettings.isJitProvisioningEnabled());
+ }
+ if (oidcSettings.hasPreProvisioningRequired()) {
+ json.put(PRE_PROVISIONING_REQUIRED, oidcSettings.isPreProvisioningRequired());
+ }
+ if (oidcSettings.hasExtractGroupsEnabled()) {
+ json.put(EXTRACT_GROUPS_ENABLED, oidcSettings.isExtractGroupsEnabled());
+ }
+ if (oidcSettings.hasGroupsClaim()) {
+ json.put(GROUPS_CLAIM, oidcSettings.getGroupsClaim());
+ }
+ if (oidcSettings.hasResponseType()) {
+ json.put(RESPONSE_TYPE, oidcSettings.getResponseType());
+ }
+ if (oidcSettings.hasResponseMode()) {
+ json.put(RESPONSE_MODE, oidcSettings.getResponseMode());
+ }
+ if (oidcSettings.hasUseNonce()) {
+ json.put(USE_NONCE, oidcSettings.isUseNonce());
+ }
+ if (oidcSettings.hasReadTimeout()) {
+ json.put(READ_TIMEOUT, oidcSettings.getReadTimeout());
+ }
+ if (oidcSettings.hasExtractJwtAccessTokenClaims()) {
+ json.put(EXTRACT_JWT_ACCESS_TOKEN_CLAIMS, oidcSettings.isExtractJwtAccessTokenClaims());
+ }
+ if (oidcSettings.hasPreferredJwsAlgorithm()) {
+ json.put(PREFERRED_JWS_ALGORITHM, oidcSettings.getPreferredJwsAlgorithm());
+ }
+ }
}
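For completeness, the JSON that the frontend receives from `/getSsoSettings` (and feeds into the builders above) looks roughly like the following. This is a hedged sketch only: `org.json.JSONObject` is assumed to be the JSONObject type used elsewhere in this controller, all values are placeholders, and `clientSecret` stands for the decrypted value produced via SecretService:

```java
import org.json.JSONObject;

public class SsoSettingsResponseSketch {
  public static void main(String[] args) {
    JSONObject json = new JSONObject();
    json.put("baseUrl", "https://datahub.example.com");
    json.put("oidcEnabled", true);
    json.put("clientId", "my-client");
    json.put("clientSecret", "decrypted-secret");
    json.put("discoveryUri", "https://idp.example.com/.well-known/openid-configuration");
    // Optional OIDC fields (groupsClaim, responseType, useNonce, ...) are added only when set.
    System.out.println(json.toString());
  }
}
```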
From 19f104a1091127cf33d26637449bc105e0d27a47 Mon Sep 17 00:00:00 2001
From: Dmytro Kulyk <34435869+KulykDmytro@users.noreply.github.com>
Date: Wed, 27 Dec 2023 21:43:28 +0200
Subject: [PATCH 070/540] fix(ingest/glue): glue table partition (Athena v3)
profiling parameters retrieval error (#9521)
Co-authored-by: Harshal Sheth
---
metadata-ingestion/src/datahub/ingestion/source/aws/glue.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py b/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py
index aa7e5aa352a3e2..826c18f69fd013 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py
@@ -852,7 +852,7 @@ def get_profile_if_enabled(
partition_keys = [k["Name"] for k in partition_keys]
for p in partitions:
- table_stats = p["Parameters"]
+ table_stats = p.get("Parameters", {})
column_stats = p["StorageDescriptor"]["Columns"]
# only support single partition key
From c804b3c1aa3bcff14df14731b242cb34c8b40744 Mon Sep 17 00:00:00 2001
From: Ryan Despain <2043940+ryaminal@users.noreply.github.com>
Date: Wed, 27 Dec 2023 12:49:16 -0700
Subject: [PATCH 071/540] fix(ingest/fivetran): handle missing user_id for a
connection (#9460)
Co-authored-by: Harshal Sheth
---
.../src/datahub/ingestion/source/fivetran/data_classes.py | 4 ++--
.../src/datahub/ingestion/source/fivetran/fivetran.py | 2 +-
.../src/datahub/ingestion/source/fivetran/fivetran_log_api.py | 4 +++-
3 files changed, 6 insertions(+), 4 deletions(-)
diff --git a/metadata-ingestion/src/datahub/ingestion/source/fivetran/data_classes.py b/metadata-ingestion/src/datahub/ingestion/source/fivetran/data_classes.py
index 82bb5f3467c2a6..8f779e0cd6df22 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/fivetran/data_classes.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/fivetran/data_classes.py
@@ -1,5 +1,5 @@
from dataclasses import dataclass
-from typing import List
+from typing import List, Optional
@dataclass
@@ -23,7 +23,7 @@ class Connector:
paused: bool
sync_frequency: int
destination_id: str
- user_name: str
+ user_name: Optional[str]
table_lineage: List[TableLineage]
jobs: List["Job"]
diff --git a/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran.py b/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran.py
index 12e362fa8a3e3f..834d9bff0b5cd8 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran.py
@@ -179,7 +179,7 @@ def _generate_datajob_from_connector(self, connector: Connector) -> DataJob:
id=connector.connector_id,
flow_urn=dataflow_urn,
name=connector.connector_name,
- owners={connector.user_name},
+ owners={connector.user_name} if connector.user_name else set(),
)
job_property_bag: Dict[str, str] = {}
diff --git a/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_log_api.py b/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_log_api.py
index d5d146559d9183..5680b10982c498 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_log_api.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_log_api.py
@@ -117,7 +117,9 @@ def _get_jobs_list(self, connector_id: str) -> List[Job]:
)
return jobs
- def _get_user_name(self, user_id: str) -> str:
+ def _get_user_name(self, user_id: Optional[str]) -> Optional[str]:
+ if not user_id:
+ return None
user_details = self._query(FivetranLogQuery.get_user_query(user_id=user_id))[0]
return (
f"{user_details[Constant.GIVEN_NAME]} {user_details[Constant.FAMILY_NAME]}"
From cfb4d2f95f892f71d1a99e8c3b237f7d9a09b712 Mon Sep 17 00:00:00 2001
From: Sumit Patil <91715217+sumitappt@users.noreply.github.com>
Date: Thu, 28 Dec 2023 01:25:37 +0530
Subject: [PATCH 072/540] feat(ui): Allow copying assertion urn from the UI
(#9523)
---
.../tabs/Dataset/Validations/AssertionMenu.tsx | 16 ++++++++++++++++
.../Validations/DatasetAssertionsList.tsx | 17 +++++++++++++++--
2 files changed, 31 insertions(+), 2 deletions(-)
create mode 100644 datahub-web-react/src/app/entity/shared/tabs/Dataset/Validations/AssertionMenu.tsx
diff --git a/datahub-web-react/src/app/entity/shared/tabs/Dataset/Validations/AssertionMenu.tsx b/datahub-web-react/src/app/entity/shared/tabs/Dataset/Validations/AssertionMenu.tsx
new file mode 100644
index 00000000000000..9d8231d4df0e31
--- /dev/null
+++ b/datahub-web-react/src/app/entity/shared/tabs/Dataset/Validations/AssertionMenu.tsx
@@ -0,0 +1,16 @@
+import React from 'react';
+import { Menu } from 'antd';
+import CopyUrnMenuItem from '../../../../../shared/share/items/CopyUrnMenuItem';
+
+
+interface AssertionMenuProps {
+ urn: string;
+}
+
+export default function AssertionMenu({ urn }: AssertionMenuProps) {
+ return (
+ <Menu>
+ <CopyUrnMenuItem key="0" urn={urn} type="Assertion" />
+ </Menu>
+ );
+}
diff --git a/datahub-web-react/src/app/entity/shared/tabs/Dataset/Validations/DatasetAssertionsList.tsx b/datahub-web-react/src/app/entity/shared/tabs/Dataset/Validations/DatasetAssertionsList.tsx
index b0216eec3408e4..05fc2d1c496db1 100644
--- a/datahub-web-react/src/app/entity/shared/tabs/Dataset/Validations/DatasetAssertionsList.tsx
+++ b/datahub-web-react/src/app/entity/shared/tabs/Dataset/Validations/DatasetAssertionsList.tsx
@@ -1,7 +1,7 @@
-import { Button, Empty, Image, message, Modal, Tag, Tooltip, Typography } from 'antd';
+import { Button, Dropdown, Empty, Image, message, Modal, Tag, Tooltip, Typography } from 'antd';
import React from 'react';
import styled from 'styled-components';
-import { DeleteOutlined, DownOutlined, RightOutlined, StopOutlined } from '@ant-design/icons';
+import { DeleteOutlined, DownOutlined, MoreOutlined, RightOutlined, StopOutlined } from '@ant-design/icons';
import { DatasetAssertionDescription } from './DatasetAssertionDescription';
import { StyledTable } from '../../../components/styled/StyledTable';
import { DatasetAssertionDetails } from './DatasetAssertionDetails';
@@ -9,6 +9,7 @@ import { Assertion, AssertionRunStatus } from '../../../../../../types.generated
import { getResultColor, getResultIcon, getResultText } from './assertionUtils';
import { useDeleteAssertionMutation } from '../../../../../../graphql/assertion.generated';
import { capitalizeFirstLetterOnly } from '../../../../../shared/textUtil';
+import AssertionMenu from './AssertionMenu';
const ResultContainer = styled.div`
display: flex;
@@ -30,6 +31,10 @@ const PlatformContainer = styled.div`
margin-right: 8px;
`;
+const StyledMoreOutlined = styled(MoreOutlined)`
+ font-size: 18px;
+`;
+
type Props = {
assertions: Array<Assertion>;
onDelete?: (urn: string) => void;
@@ -141,6 +146,14 @@ export const DatasetAssertionsList = ({ assertions, onDelete }: Props) => {
<Button onClick={() => onDeleteAssertion(record.urn)} type="text" shape="circle" danger>
<DeleteOutlined />
</Button>
+ <Dropdown
+ overlay={
+ <AssertionMenu urn={record.urn} />
+ }
+ trigger={['click']}
+ >
+ <StyledMoreOutlined />
+ </Dropdown>
),
},
From b7a0bbcb3d6000d3d9827ab19f13c3118d0bfc19 Mon Sep 17 00:00:00 2001
From: Fernando Marino`
Date: Thu, 28 Dec 2023 01:24:25 +0100
Subject: [PATCH 073/540] feat(ingest/openapi): support proxies and alternate
auth schemes (#9492)
Co-authored-by: Fernando Marino
Co-authored-by: Harshal Sheth
---
.../src/datahub/ingestion/source/openapi.py | 41 +++++++++++++++----
.../ingestion/source/openapi_parser.py | 26 ++++++++----
2 files changed, 51 insertions(+), 16 deletions(-)
diff --git a/metadata-ingestion/src/datahub/ingestion/source/openapi.py b/metadata-ingestion/src/datahub/ingestion/source/openapi.py
index 3925ba51c16dd9..ad62ef7362aebd 100755
--- a/metadata-ingestion/src/datahub/ingestion/source/openapi.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/openapi.py
@@ -52,6 +52,13 @@ class OpenApiConfig(ConfigModel):
ignore_endpoints: list = Field(default=[], description="")
username: str = Field(default="", description="")
password: str = Field(default="", description="")
+ proxies: Optional[dict] = Field(
+ default=None,
+ description="Eg. "
+ "`{'http': 'http://10.10.1.10:3128', 'https': 'http://10.10.1.10:1080'}`."
+ "If authentication is required, add it to the proxy url directly e.g. "
+ "`http://user:pass@10.10.1.10:3128/`.",
+ )
forced_examples: dict = Field(default={}, description="")
token: Optional[str] = Field(default=None, description="")
get_token: dict = Field(default={}, description="")
@@ -87,9 +94,13 @@ def get_swagger(self) -> Dict:
password=self.password,
tok_url=url4req,
method=self.get_token["request_type"],
+ proxies=self.proxies,
)
sw_dict = get_swag_json(
- self.url, token=self.token, swagger_file=self.swagger_file
+ self.url,
+ token=self.token,
+ swagger_file=self.swagger_file,
+ proxies=self.proxies,
) # load the swagger file
else: # using basic auth for accessing endpoints
@@ -98,6 +109,7 @@ def get_swagger(self) -> Dict:
username=self.username,
password=self.password,
swagger_file=self.swagger_file,
+ proxies=self.proxies,
)
return sw_dict
@@ -258,10 +270,15 @@ def get_workunits_internal(self) -> Iterable[ApiWorkUnit]: # noqa: C901
tot_url = clean_url(config.url + self.url_basepath + endpoint_k)
if config.token:
- response = request_call(tot_url, token=config.token)
+ response = request_call(
+ tot_url, token=config.token, proxies=config.proxies
+ )
else:
response = request_call(
- tot_url, username=config.username, password=config.password
+ tot_url,
+ username=config.username,
+ password=config.password,
+ proxies=config.proxies,
)
if response.status_code == 200:
fields2add, root_dataset_samples[dataset_name] = extract_fields(
@@ -281,10 +298,15 @@ def get_workunits_internal(self) -> Iterable[ApiWorkUnit]: # noqa: C901
url_guess = try_guessing(endpoint_k, root_dataset_samples)
tot_url = clean_url(config.url + self.url_basepath + url_guess)
if config.token:
- response = request_call(tot_url, token=config.token)
+ response = request_call(
+ tot_url, token=config.token, proxies=config.proxies
+ )
else:
response = request_call(
- tot_url, username=config.username, password=config.password
+ tot_url,
+ username=config.username,
+ password=config.password,
+ proxies=config.proxies,
)
if response.status_code == 200:
fields2add, _ = extract_fields(response, dataset_name)
@@ -304,10 +326,15 @@ def get_workunits_internal(self) -> Iterable[ApiWorkUnit]: # noqa: C901
)
tot_url = clean_url(config.url + self.url_basepath + composed_url)
if config.token:
- response = request_call(tot_url, token=config.token)
+ response = request_call(
+ tot_url, token=config.token, proxies=config.proxies
+ )
else:
response = request_call(
- tot_url, username=config.username, password=config.password
+ tot_url,
+ username=config.username,
+ password=config.password,
+ proxies=config.proxies,
)
if response.status_code == 200:
fields2add, _ = extract_fields(response, dataset_name)
diff --git a/metadata-ingestion/src/datahub/ingestion/source/openapi_parser.py b/metadata-ingestion/src/datahub/ingestion/source/openapi_parser.py
index 1ab40bc8be73d4..84bb3ad4526117 100755
--- a/metadata-ingestion/src/datahub/ingestion/source/openapi_parser.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/openapi_parser.py
@@ -51,6 +51,7 @@ def request_call(
token: Optional[str] = None,
username: Optional[str] = None,
password: Optional[str] = None,
+ proxies: Optional[dict] = None,
) -> requests.Response:
headers = {"accept": "application/json"}
@@ -60,8 +61,8 @@ def request_call(
)
elif token is not None:
- headers["Authorization"] = f"Bearer {token}"
- return requests.get(url, headers=headers)
+ headers["Authorization"] = f"{token}"
+ return requests.get(url, proxies=proxies, headers=headers)
else:
return requests.get(url, headers=headers)
@@ -72,12 +73,15 @@ def get_swag_json(
username: Optional[str] = None,
password: Optional[str] = None,
swagger_file: str = "",
+ proxies: Optional[dict] = None,
) -> Dict:
tot_url = url + swagger_file
if token is not None:
- response = request_call(url=tot_url, token=token)
+ response = request_call(url=tot_url, token=token, proxies=proxies)
else:
- response = request_call(url=tot_url, username=username, password=password)
+ response = request_call(
+ url=tot_url, username=username, password=password, proxies=proxies
+ )
if response.status_code != 200:
raise Exception(f"Unable to retrieve {tot_url}, error {response.status_code}")
@@ -251,7 +255,7 @@ def compose_url_attr(raw_url: str, attr_list: list) -> str:
attr_list=["2",])
asd2 == "http://asd.com/2"
"""
- splitted = re.split(r"\{[^}]+\}", raw_url)
+ splitted = re.split(r"\{[^}]+}", raw_url)
if splitted[-1] == "": # it can happen that the last element is empty
splitted = splitted[:-1]
composed_url = ""
@@ -265,7 +269,7 @@ def compose_url_attr(raw_url: str, attr_list: list) -> str:
def maybe_theres_simple_id(url: str) -> str:
- dets = re.findall(r"(\{[^}]+\})", url) # searching the fields between parenthesis
+ dets = re.findall(r"(\{[^}]+})", url) # searching the fields between parenthesis
if len(dets) == 0:
return url
dets_w_id = [det for det in dets if "id" in det] # the fields containing "id"
@@ -349,6 +353,7 @@ def get_tok(
password: str = "",
tok_url: str = "",
method: str = "post",
+ proxies: Optional[dict] = None,
) -> str:
"""
Trying to post username/password to get auth.
@@ -357,12 +362,15 @@ def get_tok(
url4req = url + tok_url
if method == "post":
# this will make a POST call with username and password
- data = {"username": username, "password": password}
+ data = {"username": username, "password": password, "maxDuration": True}
# url2post = url + "api/authenticate/"
- response = requests.post(url4req, data=data)
+ response = requests.post(url4req, proxies=proxies, json=data)
if response.status_code == 200:
cont = json.loads(response.content)
- token = cont["tokens"]["access"]
+ if "token" in cont: # other authentication scheme
+ token = cont["token"]
+ else: # works only for bearer authentication scheme
+ token = f"Bearer {cont['tokens']['access']}"
elif method == "get":
# this will make a GET call with username and password
response = requests.get(url4req)
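The patch above threads an optional `proxies` dict into every outbound `requests` call and accepts two response shapes from the token endpoint: a ready-to-use `token` value, or the older `tokens.access` value that must be prefixed with `Bearer`. A minimal standalone sketch of that logic, with an illustrative endpoint and credentials rather than anything taken from a real API:

```python
from typing import Optional

import requests


def fetch_auth_header(
    token_url: str,
    username: str,
    password: str,
    proxies: Optional[dict] = None,
) -> str:
    # POST the credentials; the proxies dict is forwarded as-is, mirroring get_tok().
    response = requests.post(
        token_url,
        proxies=proxies,
        json={"username": username, "password": password, "maxDuration": True},
    )
    if response.status_code != 200:
        raise Exception(f"Unable to authenticate, error {response.status_code}")
    cont = response.json()
    if "token" in cont:
        # Alternate scheme: the server already returns a complete header value.
        return cont["token"]
    # Bearer scheme: wrap the access token so it can be sent verbatim.
    return f"Bearer {cont['tokens']['access']}"


# Hypothetical usage: the same proxies dict is reused for every later call, and the
# returned value is sent unchanged as the Authorization header (as in request_call).
proxies = {"http": "http://10.10.1.10:3128", "https": "http://10.10.1.10:1080"}
auth = fetch_auth_header("https://api.example.com/api/authenticate/", "user", "pass", proxies)
requests.get(
    "https://api.example.com/v1/items",
    headers={"accept": "application/json", "Authorization": auth},
    proxies=proxies,
)
```

In the source itself, the same `proxies` value originates from the new `OpenApiConfig.proxies` field and is passed through unchanged to `requests`.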
From 754d8814477d050e907aeca6c561d98372b60dc5 Mon Sep 17 00:00:00 2001
From: cburroughs
Date: Wed, 27 Dec 2023 19:33:41 -0500
Subject: [PATCH 074/540] build(ingest/feast): upgrade to latest feast version
(#9439)
---
metadata-ingestion/setup.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py
index 4632c20cd3b969..32d49ffc73fa34 100644
--- a/metadata-ingestion/setup.py
+++ b/metadata-ingestion/setup.py
@@ -316,7 +316,7 @@
# https://github.com/elastic/elasticsearch-py/issues/1639#issuecomment-883587433
"elasticsearch": {"elasticsearch==7.13.4"},
"feast": {
- "feast~=0.31.1",
+ "feast~=0.34.1",
"flask-openid>=1.3.0",
# typeguard 3.x, released on 2023-03-14, seems to cause issues with Feast.
"typeguard<3",
From 9f79f44dd69a5a86864ccc31473305bdf1c2f4bb Mon Sep 17 00:00:00 2001
From: Harshal Sheth
Date: Wed, 27 Dec 2023 20:05:17 -0500
Subject: [PATCH 075/540] build: enable gradle caching (#9525)
---
.github/workflows/airflow-plugin.yml | 1 +
.github/workflows/build-and-test.yml | 1 +
.github/workflows/check-datahub-jars.yml | 1 +
.github/workflows/docker-unified.yml | 27 ++++++++++++---------
.github/workflows/documentation.yml | 1 +
.github/workflows/metadata-ingestion.yml | 1 +
.github/workflows/metadata-io.yml | 2 ++
.github/workflows/metadata-model.yml | 2 ++
.github/workflows/publish-datahub-jars.yml | 2 ++
.github/workflows/spark-smoke-test.yml | 2 ++
gradle.properties | 2 +-
gradle/wrapper/gradle-wrapper.jar | Bin 61624 -> 61608 bytes
gradlew | 4 +--
13 files changed, 32 insertions(+), 14 deletions(-)
diff --git a/.github/workflows/airflow-plugin.yml b/.github/workflows/airflow-plugin.yml
index 70816e5f093d13..97a0da8546ed17 100644
--- a/.github/workflows/airflow-plugin.yml
+++ b/.github/workflows/airflow-plugin.yml
@@ -55,6 +55,7 @@ jobs:
with:
distribution: "zulu"
java-version: 17
+ - uses: gradle/gradle-build-action@v2
- uses: actions/checkout@v3
- uses: actions/setup-python@v4
with:
diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml
index dab64cf2dca5e6..6daf1904ba3ae3 100644
--- a/.github/workflows/build-and-test.yml
+++ b/.github/workflows/build-and-test.yml
@@ -42,6 +42,7 @@ jobs:
with:
distribution: "zulu"
java-version: 17
+ - uses: gradle/gradle-build-action@v2
- uses: actions/setup-python@v4
with:
python-version: "3.10"
diff --git a/.github/workflows/check-datahub-jars.yml b/.github/workflows/check-datahub-jars.yml
index 46d97ffec88618..556cd87f12df04 100644
--- a/.github/workflows/check-datahub-jars.yml
+++ b/.github/workflows/check-datahub-jars.yml
@@ -33,6 +33,7 @@ jobs:
with:
distribution: "zulu"
java-version: 17
+ - uses: gradle/gradle-build-action@v2
- uses: actions/setup-python@v4
with:
python-version: "3.10"
diff --git a/.github/workflows/docker-unified.yml b/.github/workflows/docker-unified.yml
index 7cef38b1cd47ce..454e7661402459 100644
--- a/.github/workflows/docker-unified.yml
+++ b/.github/workflows/docker-unified.yml
@@ -84,6 +84,7 @@ jobs:
with:
distribution: "zulu"
java-version: 17
+ - uses: gradle/gradle-build-action@v2
- name: Check out the repo
uses: hsheth2/sane-checkout-action@v1
- name: Pre-build artifacts for docker image
@@ -145,6 +146,7 @@ jobs:
with:
distribution: "zulu"
java-version: 17
+ - uses: gradle/gradle-build-action@v2
- name: Check out the repo
uses: hsheth2/sane-checkout-action@v1
- name: Pre-build artifacts for docker image
@@ -206,6 +208,7 @@ jobs:
with:
distribution: "zulu"
java-version: 17
+ - uses: gradle/gradle-build-action@v2
- name: Check out the repo
uses: hsheth2/sane-checkout-action@v1
- name: Pre-build artifacts for docker image
@@ -267,6 +270,7 @@ jobs:
with:
distribution: "zulu"
java-version: 17
+ - uses: gradle/gradle-build-action@v2
- name: Check out the repo
uses: hsheth2/sane-checkout-action@v1
- name: Pre-build artifacts for docker image
@@ -328,6 +332,7 @@ jobs:
with:
distribution: "zulu"
java-version: 17
+ - uses: gradle/gradle-build-action@v2
- name: Check out the repo
uses: hsheth2/sane-checkout-action@v1
- name: Pre-build artifacts for docker image
@@ -567,6 +572,7 @@ jobs:
with:
distribution: "zulu"
java-version: 17
+ - uses: gradle/gradle-build-action@v2
- name: Check out the repo
uses: hsheth2/sane-checkout-action@v1
- uses: dorny/paths-filter@v2
@@ -653,6 +659,7 @@ jobs:
with:
distribution: "zulu"
java-version: 17
+ - uses: gradle/gradle-build-action@v2
- name: Check out the repo
uses: hsheth2/sane-checkout-action@v1
- uses: dorny/paths-filter@v2
@@ -731,12 +738,13 @@ jobs:
strategy:
fail-fast: false
matrix:
- test_strategy: [
- "no_cypress_suite0",
- "no_cypress_suite1",
- "cypress_suite1",
- "cypress_rest"
- ]
+ test_strategy:
+ [
+ "no_cypress_suite0",
+ "no_cypress_suite1",
+ "cypress_suite1",
+ "cypress_rest",
+ ]
needs:
[
setup,
@@ -760,6 +768,7 @@ jobs:
with:
distribution: "zulu"
java-version: 17
+ - uses: gradle/gradle-build-action@v2
- uses: actions/setup-python@v4
with:
python-version: "3.10"
@@ -904,11 +913,7 @@ jobs:
deploy_datahub_head:
name: Deploy to Datahub HEAD
runs-on: ubuntu-latest
- needs:
- [
- setup,
- smoke_test
- ]
+ needs: [setup, smoke_test]
steps:
- uses: aws-actions/configure-aws-credentials@v1
if: ${{ needs.setup.outputs.publish != 'false' && github.repository_owner == 'datahub-project' && needs.setup.outputs.repository_name == 'datahub' }}
diff --git a/.github/workflows/documentation.yml b/.github/workflows/documentation.yml
index 29953b8b70d911..e1671cc0219198 100644
--- a/.github/workflows/documentation.yml
+++ b/.github/workflows/documentation.yml
@@ -32,6 +32,7 @@ jobs:
with:
distribution: "zulu"
java-version: 17
+ - uses: gradle/gradle-build-action@v2
- uses: actions/setup-python@v4
with:
python-version: "3.10"
diff --git a/.github/workflows/metadata-ingestion.yml b/.github/workflows/metadata-ingestion.yml
index 4e04fef3b3980b..af73db483f9aeb 100644
--- a/.github/workflows/metadata-ingestion.yml
+++ b/.github/workflows/metadata-ingestion.yml
@@ -49,6 +49,7 @@ jobs:
with:
distribution: "zulu"
java-version: 17
+ - uses: gradle/gradle-build-action@v2
- uses: actions/checkout@v3
- uses: actions/setup-python@v4
with:
diff --git a/.github/workflows/metadata-io.yml b/.github/workflows/metadata-io.yml
index 2188fcb07c77a6..96229642244b67 100644
--- a/.github/workflows/metadata-io.yml
+++ b/.github/workflows/metadata-io.yml
@@ -34,9 +34,11 @@ jobs:
with:
distribution: "zulu"
java-version: 17
+ - uses: gradle/gradle-build-action@v2
- uses: actions/setup-python@v4
with:
python-version: "3.10"
+ cache: "pip"
- name: Gradle build (and test)
# there is some race condition in gradle build, which makes gradle never terminate in ~30% of the runs
# running build first without datahub-web-react:yarnBuild and then with it is 100% stable
diff --git a/.github/workflows/metadata-model.yml b/.github/workflows/metadata-model.yml
index d0112f1b14e7af..265a66aa236ae9 100644
--- a/.github/workflows/metadata-model.yml
+++ b/.github/workflows/metadata-model.yml
@@ -34,10 +34,12 @@ jobs:
with:
distribution: "zulu"
java-version: 17
+ - uses: gradle/gradle-build-action@v2
- uses: actions/checkout@v3
- uses: actions/setup-python@v4
with:
python-version: "3.10"
+ cache: "pip"
- name: Install dependencies
run: ./metadata-ingestion/scripts/install_deps.sh
- name: Run model generation
diff --git a/.github/workflows/publish-datahub-jars.yml b/.github/workflows/publish-datahub-jars.yml
index 24d1c5436b3156..0a311be33cd30a 100644
--- a/.github/workflows/publish-datahub-jars.yml
+++ b/.github/workflows/publish-datahub-jars.yml
@@ -54,9 +54,11 @@ jobs:
with:
distribution: "zulu"
java-version: 17
+ - uses: gradle/gradle-build-action@v2
- uses: actions/setup-python@v4
with:
python-version: "3.10"
+ cache: "pip"
- name: checkout upstream repo
run: |
git remote add upstream https://github.com/datahub-project/datahub.git
diff --git a/.github/workflows/spark-smoke-test.yml b/.github/workflows/spark-smoke-test.yml
index 60e183cce5179c..94692bd3c2336a 100644
--- a/.github/workflows/spark-smoke-test.yml
+++ b/.github/workflows/spark-smoke-test.yml
@@ -35,9 +35,11 @@ jobs:
with:
distribution: "zulu"
java-version: 17
+ - uses: gradle/gradle-build-action@v2
- uses: actions/setup-python@v4
with:
python-version: "3.10"
+ cache: "pip"
- name: Install dependencies
run: ./metadata-ingestion/scripts/install_deps.sh
- name: Remove images
diff --git a/gradle.properties b/gradle.properties
index 1cd349344b432d..f410ff01bf397d 100644
--- a/gradle.properties
+++ b/gradle.properties
@@ -1,7 +1,7 @@
org.gradle.daemon=false
org.gradle.configureondemand=true
org.gradle.parallel=true
-org.gradle.caching=false
+org.gradle.caching=true
# Increase gradle JVM memory to 3GB to allow tests to run locally
org.gradle.jvmargs=-Xmx3000m
diff --git a/gradle/wrapper/gradle-wrapper.jar b/gradle/wrapper/gradle-wrapper.jar
index afba109285af78dbd2a1d187e33ac4f87c76e392..ccebba7710deaf9f98673a68957ea02138b60d0a 100644
GIT binary patch
delta 11632 (base85-encoded binary delta data omitted)
delta 11565 (base85-encoded binary delta data omitted)
Date: Thu, 28 Dec 2023 04:06:41 -0500
Subject: [PATCH 076/540] chore(build): update base-requirements + add script
for regeneration (#9524)
---
.../base-requirements.txt | 317 +++++++++---------
.../regenerate-base-requirements.sh | 37 ++
2 files changed, 195 insertions(+), 159 deletions(-)
create mode 100755 docker/datahub-ingestion-base/regenerate-base-requirements.sh
diff --git a/docker/datahub-ingestion-base/base-requirements.txt b/docker/datahub-ingestion-base/base-requirements.txt
index 141382466ab9f6..90928759027942 100644
--- a/docker/datahub-ingestion-base/base-requirements.txt
+++ b/docker/datahub-ingestion-base/base-requirements.txt
@@ -1,149 +1,147 @@
-# Excluded for slim
-# pyspark==3.0.3
-# pydeequ==1.0.1
-
+# Generated requirements file. Run ./regenerate-base-requirements.sh to regenerate.
acryl-datahub-classify==0.0.8
-acryl-PyHive==0.6.14
-acryl-sqlglot==18.5.2.dev45
+acryl-PyHive==0.6.16
+acryl-sqlglot==20.4.1.dev14
aenum==3.1.15
-aiohttp==3.8.6
+aiohttp==3.9.1
aiosignal==1.3.1
-alembic==1.12.0
+alembic==1.13.1
altair==4.2.0
+annotated-types==0.6.0
anyio==3.7.1
-apache-airflow==2.7.2
-apache-airflow-providers-common-sql==1.7.2
-apache-airflow-providers-ftp==3.5.2
-apache-airflow-providers-http==4.5.2
-apache-airflow-providers-imap==3.3.2
-apache-airflow-providers-sqlite==3.4.3
-apispec==6.3.0
+apache-airflow==2.7.3
+apache-airflow-providers-common-sql==1.9.0
+apache-airflow-providers-ftp==3.7.0
+apache-airflow-providers-http==4.8.0
+apache-airflow-providers-imap==3.5.0
+apache-airflow-providers-sqlite==3.6.0
+apispec==6.3.1
appdirs==1.4.4
appnope==0.1.3
-argcomplete==3.1.2
+argcomplete==3.2.1
argon2-cffi==23.1.0
argon2-cffi-bindings==21.2.0
asgiref==3.7.2
asn1crypto==1.5.1
-asttokens==2.4.0
+asttokens==2.4.1
async-timeout==4.0.3
-asynch==0.2.2
+asynch==0.2.3
attrs==23.1.0
-avro==1.10.2
+avro==1.11.3
avro-gen3==0.7.11
-Babel==2.13.0
-backcall==0.2.0
+Babel==2.14.0
backoff==2.2.1
beautifulsoup4==4.12.2
bleach==6.1.0
-blinker==1.6.3
+blinker==1.7.0
blis==0.7.11
-boto3==1.28.62
-botocore==1.31.62
+boto3==1.34.8
+botocore==1.34.8
bowler==0.9.0
bracex==2.4
cached-property==1.5.2
cachelib==0.9.0
-cachetools==5.3.1
+cachetools==5.3.2
catalogue==2.0.10
-cattrs==23.1.2
-certifi==2023.7.22
+cattrs==23.2.3
+certifi==2023.11.17
cffi==1.16.0
chardet==5.2.0
-charset-normalizer==3.3.0
-ciso8601==2.3.0
+charset-normalizer==3.3.2
+ciso8601==2.3.1
click==8.1.7
click-default-group==1.2.4
click-spinner==0.1.10
clickclick==20.10.2
-clickhouse-cityhash==1.0.2.4
clickhouse-driver==0.2.6
clickhouse-sqlalchemy==0.2.4
-cloudpickle==2.2.1
+cloudpickle==3.0.0
colorama==0.4.6
colorlog==4.8.0
-comm==0.1.4
-confection==0.1.3
-ConfigUpdater==3.1.1
+comm==0.2.0
+confection==0.1.4
+ConfigUpdater==3.2
confluent-kafka==2.3.0
connexion==2.14.2
cron-descriptor==1.4.0
croniter==2.0.1
-cryptography==41.0.4
+cryptography==41.0.7
cx-Oracle==8.3.0
cymem==2.0.8
-dask==2023.9.3
+dask==2023.12.1
databricks-cli==0.18.0
databricks-dbapi==0.6.0
-databricks-sdk==0.10.0
+databricks-sdk==0.15.0
+databricks-sql-connector==2.9.3
debugpy==1.8.0
decorator==5.1.1
defusedxml==0.7.1
-deltalake==0.11.0
+deltalake==0.14.0
Deprecated==1.2.14
dill==0.3.7
dnspython==2.4.2
-docker==6.1.3
+docker==7.0.0
docutils==0.20.1
ecdsa==0.18.0
elasticsearch==7.13.4
email-validator==1.3.1
entrypoints==0.4
et-xmlfile==1.1.0
-exceptiongroup==1.1.3
-executing==2.0.0
-expandvars==0.11.0
-fastapi==0.103.2
-fastavro==1.8.4
-fastjsonschema==2.18.1
+exceptiongroup==1.2.0
+executing==2.0.1
+expandvars==0.12.0
+fastapi==0.108.0
+fastavro==1.9.2
+fastjsonschema==2.19.0
feast==0.31.1
-filelock==3.12.4
+filelock==3.13.1
fissix==21.11.13
Flask==2.2.5
flatdict==4.0.1
-frozenlist==1.4.0
-fsspec==2023.9.2
+frozenlist==1.4.1
+fsspec==2023.12.2
future==0.18.3
-GeoAlchemy2==0.14.1
-gitdb==4.0.10
-GitPython==3.1.37
-google-api-core==2.12.0
-google-auth==2.23.3
-google-cloud-appengine-logging==1.3.2
+GeoAlchemy2==0.14.3
+gitdb==4.0.11
+GitPython==3.1.40
+google-api-core==2.15.0
+google-auth==2.25.2
+google-cloud-appengine-logging==1.4.0
google-cloud-audit-log==0.2.5
-google-cloud-bigquery==3.12.0
-google-cloud-core==2.3.3
+google-cloud-bigquery==3.14.1
+google-cloud-core==2.4.1
google-cloud-datacatalog-lineage==0.2.2
google-cloud-logging==3.5.0
google-crc32c==1.5.0
google-re2==1.1
-google-resumable-media==2.6.0
-googleapis-common-protos==1.60.0
+google-resumable-media==2.7.0
+googleapis-common-protos==1.62.0
gql==3.4.1
graphql-core==3.2.3
graphviz==0.20.1
great-expectations==0.15.50
-greenlet==3.0.0
-grpc-google-iam-v1==0.12.6
-grpcio==1.59.0
-grpcio-reflection==1.59.0
-grpcio-status==1.59.0
-grpcio-tools==1.59.0
+greenlet==3.0.3
+grpc-google-iam-v1==0.13.0
+grpcio==1.60.0
+grpcio-reflection==1.60.0
+grpcio-status==1.60.0
+grpcio-tools==1.60.0
gssapi==1.8.3
gunicorn==21.2.0
h11==0.14.0
-httpcore==0.18.0
-httptools==0.6.0
-httpx==0.25.0
+hdbcli==2.19.20
+httpcore==1.0.2
+httptools==0.6.1
+httpx==0.26.0
humanfriendly==10.0
-idna==3.4
+idna==3.6
ijson==3.2.3
-importlib-metadata==6.8.0
-importlib-resources==6.1.0
+importlib-metadata==6.11.0
+importlib-resources==6.1.1
inflection==0.5.1
ipaddress==1.0.23
ipykernel==6.17.1
-ipython==8.16.1
+ipython==8.19.0
ipython-genutils==0.2.0
ipywidgets==8.1.1
iso3166==2.1.1
@@ -152,34 +150,34 @@ itsdangerous==2.1.2
jedi==0.19.1
Jinja2==3.1.2
jmespath==1.0.1
-JPype1==1.4.1
+JPype1==1.5.0
jsonlines==4.0.0
jsonpatch==1.33
jsonpointer==2.4
jsonref==1.1.0
-jsonschema==4.19.1
-jsonschema-specifications==2023.7.1
+jsonschema==4.20.0
+jsonschema-specifications==2023.12.1
jupyter-server==1.24.0
jupyter_client==7.4.9
jupyter_core==4.12.0
-jupyterlab-pygments==0.2.2
jupyterlab-widgets==3.0.9
+jupyterlab_pygments==0.3.0
langcodes==3.3.0
lark==1.1.4
-lazy-object-proxy==1.9.0
+lazy-object-proxy==1.10.0
leb128==1.0.5
-limits==3.6.0
+limits==3.7.0
linear-tsv==1.1.0
linkify-it-py==2.0.2
-lkml==1.3.1
+lkml==1.3.3
locket==1.0.0
lockfile==0.12.2
looker-sdk==23.0.0
-lxml==4.9.3
+lxml==4.9.4
lz4==4.3.2
-makefun==1.15.1
-Mako==1.2.4
-Markdown==3.5
+makefun==1.15.2
+Mako==1.3.0
+Markdown==3.5.1
markdown-it-py==3.0.0
MarkupSafe==2.1.3
marshmallow==3.20.1
@@ -190,26 +188,26 @@ mdit-py-plugins==0.4.0
mdurl==0.1.2
mistune==3.0.2
mixpanel==4.10.0
-mlflow-skinny==2.7.1
+mlflow-skinny==2.9.2
mmh3==4.0.1
mmhash3==3.0.1
more-itertools==10.1.0
moreorless==0.4.0
-moto==4.2.5
+moto==4.2.12
msal==1.22.0
multidict==6.0.4
murmurhash==1.0.10
-mypy==1.6.0
+mypy==1.8.0
mypy-extensions==1.0.0
nbclassic==1.0.0
nbclient==0.6.3
-nbconvert==7.9.2
+nbconvert==7.13.1
nbformat==5.9.1
nest-asyncio==1.5.8
-networkx==3.1
+networkx==3.2.1
notebook==6.5.6
notebook_shim==0.2.3
-numpy==1.26.0
+numpy==1.26.2
oauthlib==3.2.2
okta==1.7.0
openlineage-airflow==1.2.0
@@ -217,110 +215,107 @@ openlineage-integration-common==1.2.0
openlineage-python==1.2.0
openlineage_sql==1.2.0
openpyxl==3.1.2
-opentelemetry-api==1.20.0
-opentelemetry-exporter-otlp==1.20.0
-opentelemetry-exporter-otlp-proto-common==1.20.0
-opentelemetry-exporter-otlp-proto-grpc==1.20.0
-opentelemetry-exporter-otlp-proto-http==1.20.0
-opentelemetry-proto==1.20.0
-opentelemetry-sdk==1.20.0
-opentelemetry-semantic-conventions==0.41b0
+opentelemetry-api==1.22.0
+opentelemetry-exporter-otlp==1.22.0
+opentelemetry-exporter-otlp-proto-common==1.22.0
+opentelemetry-exporter-otlp-proto-grpc==1.22.0
+opentelemetry-exporter-otlp-proto-http==1.22.0
+opentelemetry-proto==1.22.0
+opentelemetry-sdk==1.22.0
+opentelemetry-semantic-conventions==0.43b0
ordered-set==4.1.0
-oscrypto==1.3.0
packaging==23.2
pandas==1.5.3
pandavro==1.5.2
pandocfilters==1.5.0
-parse==1.19.1
+parse==1.20.0
parso==0.8.3
partd==1.4.1
-pathspec==0.11.2
-pathy==0.10.2
+pathspec==0.12.1
+pathy==0.10.3
pendulum==2.1.2
-pexpect==4.8.0
+pexpect==4.9.0
phonenumbers==8.13.0
-pickleshare==0.7.5
platformdirs==3.11.0
pluggy==1.3.0
preshed==3.0.9
prison==0.2.1
-progressbar2==4.2.0
-prometheus-client==0.17.1
-prompt-toolkit==3.0.39
-proto-plus==1.22.3
-protobuf==4.24.4
-psutil==5.9.5
+progressbar2==4.3.2
+prometheus-client==0.19.0
+prompt-toolkit==3.0.43
+proto-plus==1.23.0
+protobuf==4.25.1
+psutil==5.9.7
psycopg2-binary==2.9.9
ptyprocess==0.7.0
pure-eval==0.2.2
pure-sasl==0.6.2
-py-partiql-parser==0.3.7
+py-partiql-parser==0.5.0
pyarrow==11.0.0
-pyasn1==0.5.0
+pyasn1==0.5.1
pyasn1-modules==0.3.0
-pyathena==2.4.1
-pycountry==22.3.5
+pyathena==2.25.2
+pycountry==23.12.11
pycparser==2.21
pycryptodome==3.19.0
-pycryptodomex==3.19.0
pydantic==1.10.13
+pydantic_core==2.14.6
pydash==7.0.6
-pydruid==0.6.5
-Pygments==2.16.1
+pydruid==0.6.6
+Pygments==2.17.2
pyiceberg==0.4.0
-pymongo==4.5.0
+pymongo==4.6.1
PyMySQL==1.1.0
-pyOpenSSL==23.2.0
+pyOpenSSL==23.3.0
pyparsing==3.0.9
pyspnego==0.10.2
python-daemon==3.0.1
python-dateutil==2.8.2
python-dotenv==1.0.0
python-jose==3.3.0
-python-ldap==3.4.3
+python-ldap==3.4.4
python-nvd3==0.15.0
python-slugify==8.0.1
python-stdnum==1.19
-python-tds==1.13.0
+python-tds==1.14.0
python-utils==3.8.1
python3-openid==3.2.0
pytz==2023.3.post1
pytzdata==2020.1
PyYAML==6.0.1
pyzmq==24.0.1
-ratelimiter==1.2.0.post0
redash-toolbelt==0.1.9
-redshift-connector==2.0.914
-referencing==0.30.2
-regex==2023.10.3
+redshift-connector==2.0.918
+referencing==0.32.0
+regex==2023.12.25
requests==2.31.0
requests-file==1.5.1
requests-gssapi==1.2.3
requests-ntlm==1.2.0
requests-toolbelt==0.10.1
-responses==0.23.3
+responses==0.24.1
rfc3339-validator==0.1.4
rfc3986==2.0.0
-rich==13.6.0
-rich-argparse==1.3.0
-rpds-py==0.10.6
+rich==13.7.0
+rich-argparse==1.4.0
+rpds-py==0.15.2
rsa==4.9
ruamel.yaml==0.17.17
ruamel.yaml.clib==0.2.8
-s3transfer==0.7.0
-schwifty==2023.9.0
-scipy==1.11.3
+s3transfer==0.10.0
+schwifty==2023.11.2
+scipy==1.11.4
scramp==1.4.4
Send2Trash==1.8.2
-sentry-sdk==1.32.0
+sentry-sdk==1.39.1
setproctitle==1.3.3
simple-salesforce==1.12.5
six==1.16.0
smart-open==6.4.0
smmap==5.0.1
sniffio==1.3.0
-snowflake-connector-python==3.2.1
-snowflake-sqlalchemy==1.5.0
+snowflake-connector-python==3.6.0
+snowflake-sqlalchemy==1.5.1
sortedcontainers==2.4.0
soupsieve==2.5
spacy==3.4.3
@@ -328,67 +323,71 @@ spacy-legacy==3.0.12
spacy-loggers==1.0.5
sql-metadata==2.2.2
SQLAlchemy==1.4.44
-sqlalchemy-bigquery==1.8.0
-SQLAlchemy-JSONField==1.0.1.post0
+sqlalchemy-bigquery==1.9.0
+sqlalchemy-hana==1.1.1
+SQLAlchemy-JSONField==1.0.2
sqlalchemy-pytds==0.3.5
sqlalchemy-redshift==0.8.14
SQLAlchemy-Utils==0.41.1
-sqlalchemy2-stubs==0.0.2a35
+sqlalchemy2-stubs==0.0.2a37
sqllineage==1.3.8
sqlparse==0.4.4
srsly==2.4.8
stack-data==0.6.3
-starlette==0.27.0
+starlette==0.32.0.post1
strictyaml==1.7.3
tableauserverclient==0.25
tableschema==1.20.2
tabulate==0.9.0
tabulator==1.53.5
tenacity==8.2.3
-termcolor==2.3.0
-terminado==0.17.1
+teradatasql==20.0.0.2
+teradatasqlalchemy==17.20.0.0
+termcolor==2.4.0
+terminado==0.18.0
text-unidecode==1.3
thinc==8.1.12
-thrift==0.13.0
+thrift==0.16.0
thrift-sasl==0.4.3
tinycss2==1.2.1
toml==0.10.2
tomli==2.0.1
-tomlkit==0.12.1
+tomlkit==0.12.3
toolz==0.12.0
-tornado==6.3.3
+tornado==6.4
tqdm==4.66.1
traitlets==5.2.1.post0
trino==0.327.0
typeguard==2.13.3
typer==0.7.0
-types-PyYAML==6.0.12.12
typing-inspect==0.9.0
-typing_extensions==4.8.0
-tzlocal==5.1
+typing_extensions==4.9.0
+tzlocal==5.2
uc-micro-py==1.0.2
-ujson==5.8.0
+ujson==5.9.0
unicodecsv==0.14.1
-urllib3==1.26.17
-uvicorn==0.23.2
-uvloop==0.17.0
-vertica-python==1.3.5
-vertica-sqlalchemy-dialect==0.0.8
+universal-pathlib==0.1.4
+urllib3==1.26.18
+uvicorn==0.25.0
+uvloop==0.19.0
+vertica-python==1.3.8
+vertica-sqlalchemy-dialect==0.0.8.1
vininfo==1.7.0
volatile==2.1.0
wasabi==0.10.1
-watchfiles==0.20.0
+watchfiles==0.21.0
wcmatch==8.5
-wcwidth==0.2.8
+wcwidth==0.2.12
webencodings==0.5.1
-websocket-client==1.6.4
-websockets==11.0.3
+websocket-client==1.7.0
+websockets==12.0
Werkzeug==2.2.3
widgetsnbextension==4.0.9
-wrapt==1.15.0
-WTForms==3.1.0
+wrapt==1.16.0
+WTForms==3.0.1
xlrd==2.0.1
xmltodict==0.13.0
-yarl==1.9.2
+yarl==1.9.4
zeep==4.2.1
-zstd==1.5.5.1
\ No newline at end of file
+zipp==3.17.0
+zstd==1.5.5.1
diff --git a/docker/datahub-ingestion-base/regenerate-base-requirements.sh b/docker/datahub-ingestion-base/regenerate-base-requirements.sh
new file mode 100755
index 00000000000000..6fb331afa484a3
--- /dev/null
+++ b/docker/datahub-ingestion-base/regenerate-base-requirements.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+# This script is used to regenerate the base-requirements.txt file
+
+set -euxo pipefail
+cd "$( dirname "${BASH_SOURCE[0]}" )"
+
+SCRIPT_NAME=$(basename "$0")
+DATAHUB_DIR=$(pwd)/../..
+
+# Create a virtualenv.
+VENV_DIR=$(mktemp -d)
+python -c "import sys; assert sys.version_info >= (3, 9), 'Python 3.9 or higher is required.'"
+python -m venv $VENV_DIR
+source $VENV_DIR/bin/activate
+pip install --upgrade pip setuptools wheel
+echo "Using virtualenv at $VENV_DIR"
+
+# Install stuff.
+pushd $DATAHUB_DIR/metadata-ingestion
+pip install -e .
+pip install -e '../metadata-ingestion-modules/airflow-plugin/[plugin-v2]'
+pip install -e '.[all]'
+popd
+
+# Generate the requirements file.
+# Removing Flask deps as per https://github.com/datahub-project/datahub/pull/6867/files
+# Removing py4j and PyJWT due to https://github.com/datahub-project/datahub/pull/6868/files
+# Removing pyspark and pydeequ because we don't want them in the slim image; they can be added separately.
+# TODO: It's unclear if these removals are still actually needed.
+echo "# Generated requirements file. Run ./$SCRIPT_NAME to regenerate." > base-requirements.txt
+pip freeze \
+ | grep -v -E "^-e" \
+ | grep -v "Flask-" \
+ | grep -v -E "(py4j|PyJWT)==" \
+ | grep -v -E "(pyspark|pydeequ)==" \
+ >> base-requirements.txt
From 4efa46f8c91dfdedc21b7081143d196c7a0be0da Mon Sep 17 00:00:00 2001
From: gaurav2733 <77378510+gaurav2733@users.noreply.github.com>
Date: Thu, 28 Dec 2023 15:05:14 +0530
Subject: [PATCH 077/540] test(cypress/users): add automatic reset password
test (#9515)
---
.../src/app/identity/user/UserListItem.tsx | 17 ++-
.../app/identity/user/ViewResetTokenModal.tsx | 7 +-
.../cypress/e2e/mutations/add_users.js | 135 +++++++++++++-----
3 files changed, 114 insertions(+), 45 deletions(-)
diff --git a/datahub-web-react/src/app/identity/user/UserListItem.tsx b/datahub-web-react/src/app/identity/user/UserListItem.tsx
index 69b8a6c2d1355f..8ad3d7d93d6573 100644
--- a/datahub-web-react/src/app/identity/user/UserListItem.tsx
+++ b/datahub-web-react/src/app/identity/user/UserListItem.tsx
@@ -98,8 +98,8 @@ export default function UserListItem({ user, canManageUserCredentials, selectRol
{displayName}
-
-
{user.username}
+
+ {user.username}
{userStatus && (
@@ -121,8 +121,12 @@ export default function UserListItem({ user, canManageUserCredentials, selectRol
trigger={['click']}
overlay={
- setIsViewingResetToken(true)}>
- Reset user password
+ setIsViewingResetToken(true)}
+ data-testid="reset-menu-item"
+ >
+ Reset user password
Delete
@@ -130,7 +134,10 @@ export default function UserListItem({ user, canManageUserCredentials, selectRol
}
>
-
+
Generate a new reset link! Note, any old links will cease to be active .
-
+
diff --git a/smoke-test/tests/cypress/cypress/e2e/mutations/add_users.js b/smoke-test/tests/cypress/cypress/e2e/mutations/add_users.js
index e19c6065d42743..ba225ba37884ba 100644
--- a/smoke-test/tests/cypress/cypress/e2e/mutations/add_users.js
+++ b/smoke-test/tests/cypress/cypress/e2e/mutations/add_users.js
@@ -1,47 +1,104 @@
const tryToSignUp = () => {
- let number = Math.floor(Math.random() * 100000);
- let name = `Example Name ${number}`;
- cy.enterTextInTestId("email", `example${number}@example.com`);
- cy.enterTextInTestId("name", name);
- cy.enterTextInTestId("password", "Example password");
- cy.enterTextInTestId("confirmPassword", "Example password");
-
- cy.mouseover("#title").click();
- cy.waitTextVisible("Other").click();
-
- cy.get("[type=submit]").click();
- return name;
+ let number = Math.floor(Math.random() * 100000);
+ let name = `Example Name ${number}`;
+ let email = `example${number}@example.com`;
+ cy.enterTextInTestId("email", email);
+ cy.enterTextInTestId("name", name);
+ cy.enterTextInTestId("password", "Example password");
+ cy.enterTextInTestId("confirmPassword", "Example password");
+
+ cy.mouseover("#title").click();
+ cy.waitTextVisible("Other").click();
+
+ cy.get("[type=submit]").click();
+ return { name, email };
};
describe("add_user", () => {
- it("go to user link and invite a user", () => {
- cy.login();
+ let registeredEmail = "";
+ it("go to user link and invite a user", () => {
+ cy.login();
+
+ cy.visit("/settings/identities/users");
+ cy.waitTextVisible("Invite Users");
+ cy.clickOptionWithText("Invite Users");
+
+ cy.waitTextVisible(/signup\?invite_token=\w{32}/)
+ .then(($elem) => {
+ const inviteLink = $elem.text();
+ cy.log(inviteLink);
cy.visit("/settings/identities/users");
- cy.waitTextVisible("Invite Users");
-
- cy.clickOptionWithText("Invite Users");
-
- cy.waitTextVisible(/signup\?invite_token=\w{32}/).then(($elem) => {
- const inviteLink = $elem.text();
- cy.log(inviteLink);
- cy.visit("/settings/identities/users");
- cy.logout();
- cy.visit(inviteLink);
- let name = tryToSignUp();
- cy.waitTextVisible("Welcome to DataHub");
- cy.hideOnboardingTour();
- cy.waitTextVisible(name);
- }).then(() => {
- cy.logout();
- cy.visit("/signup?invite_token=bad_token");
- tryToSignUp();
- cy.waitTextVisible("Failed to log in! An unexpected error occurred.");
- });
+ cy.logout();
+ cy.visit(inviteLink);
+ const { name, email } = tryToSignUp();
+ registeredEmail = email;
+ cy.waitTextVisible("Welcome to DataHub");
+ cy.hideOnboardingTour();
+ cy.waitTextVisible(name);
+ })
+ .then(() => {
+ cy.logout();
+ cy.visit("/signup?invite_token=bad_token");
+ tryToSignUp();
+ cy.waitTextVisible("Failed to log in! An unexpected error occurred.");
+ });
+ });
+
+ it("Verify you can’t generate a reset password link for a non-native user", () => {
+ cy.login();
+ cy.visit("/settings/identities/users");
+ cy.waitTextVisible("Invite Users");
+ cy.get("[data-testid=userItem-non-native]").first().click();
+ cy.get('[data-testid="reset-menu-item"]').should(
+ "have.attr",
+ "aria-disabled",
+ "true"
+ );
+ });
+
+ it("Generate a reset password link for a native user", () => {
+ cy.login();
+ cy.visit("/settings/identities/users");
+ cy.waitTextVisible("Invite Users");
+ cy.get(`[data-testid="email-native"]`)
+ .contains(registeredEmail)
+ .should("exist")
+ .parents(".ant-list-item")
+ .find('[data-testid="userItem-native"]')
+ .should("be.visible")
+ .click();
+
+ cy.get("[data-testid=resetButton]").first().click();
+ cy.get("[data-testid=refreshButton]").click();
+ cy.waitTextVisible("Generated new link to reset credentials");
+
+ cy.window().then((win) => {
+ cy.stub(win, "prompt");
});
-});
+ cy.get(".ant-typography-copy").should("be.visible").click();
+ cy.get(".ant-modal-close").should("be.visible").click();
-// Verify you can’t generate a reset password link for a non-native user (root, for example)
-// Generate a reset password link for a native user
-// Log out, then verify that using a bad reset token in the URL doesn’t allow you to reset password
-// Use the correct reset link to reset native user credentials
\ No newline at end of file
+ cy.waitTextVisible(/reset\?reset_token=\w{32}/)
+ .then(($elem) => {
+ const inviteLink = $elem.text();
+ cy.logout();
+ cy.visit(inviteLink);
+ cy.enterTextInTestId("email", registeredEmail);
+ cy.enterTextInTestId("password", "Example Reset Password");
+ cy.enterTextInTestId("confirmPassword", "Example Reset Password");
+ cy.get("[type=submit]").click();
+ cy.waitTextVisible("Welcome back");
+ cy.hideOnboardingTour();
+ })
+ .then(() => {
+ cy.logout();
+ cy.visit("/reset?reset_token=bad_token");
+ cy.enterTextInTestId("email", registeredEmail);
+ cy.enterTextInTestId("password", "Example Reset Password");
+ cy.enterTextInTestId("confirmPassword", "Example Reset Password");
+ cy.get("[type=submit]").click();
+ cy.waitTextVisible("Failed to log in!");
+ });
+ });
+});
From 3635c1c2213cfb8421d89b7cc106ab236d72c7ec Mon Sep 17 00:00:00 2001
From: Shubham Jagtap <132359390+shubhamjagtap639@users.noreply.github.com>
Date: Thu, 28 Dec 2023 15:24:26 +0530
Subject: [PATCH 078/540] feat(ingestion/bigquery): Use sqlglot_lineage for
usage and add more perf timers (#9247)
Co-authored-by: Andrew Sikowitz
---
metadata-ingestion/setup.py | 2 -
.../ingestion/source/bigquery_v2/bigquery.py | 22 +-
.../source/bigquery_v2/bigquery_audit.py | 16 +-
.../source/bigquery_v2/bigquery_config.py | 5 +
.../source/bigquery_v2/bigquery_report.py | 12 +-
.../ingestion/source/bigquery_v2/usage.py | 86 ++---
.../datahub/utilities/bigquery_sql_parser.py | 92 -----
.../src/datahub/utilities/sqlglot_lineage.py | 8 +-
.../bigquery/test_bigquery_usage.py | 8 +-
.../tests/unit/test_bigquery_sql_lineage.py | 66 +++-
.../tests/unit/test_bigquery_sql_parser.py | 327 ------------------
.../tests/unit/test_bigquery_usage.py | 14 +-
.../unit/test_bigqueryv2_usage_source.py | 6 +-
13 files changed, 159 insertions(+), 505 deletions(-)
delete mode 100644 metadata-ingestion/src/datahub/utilities/bigquery_sql_parser.py
delete mode 100644 metadata-ingestion/tests/unit/test_bigquery_sql_parser.py
diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py
index 32d49ffc73fa34..8e4791e253c7cf 100644
--- a/metadata-ingestion/setup.py
+++ b/metadata-ingestion/setup.py
@@ -295,8 +295,6 @@
"bigquery": sql_common
| bigquery_common
| {
- # TODO: I doubt we need all three sql parsing libraries.
- *sqllineage_lib,
*sqlglot_lib,
"sqlalchemy-bigquery>=1.4.1",
"google-cloud-datacatalog-lineage==0.2.2",
diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py
index 9813945683289c..3704eae96aece0 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py
@@ -221,6 +221,7 @@ def __init__(self, ctx: PipelineContext, config: BigQueryV2Config):
self.bigquery_data_dictionary = BigQuerySchemaApi(
self.report.schema_api_perf, self.config.get_bigquery_client()
)
+ self.sql_parser_schema_resolver = self._init_schema_resolver()
redundant_lineage_run_skip_handler: Optional[
RedundantLineageRunSkipHandler
@@ -253,6 +254,7 @@ def __init__(self, ctx: PipelineContext, config: BigQueryV2Config):
self.usage_extractor = BigQueryUsageExtractor(
config,
self.report,
+ schema_resolver=self.sql_parser_schema_resolver,
dataset_urn_builder=self.gen_dataset_urn_from_ref,
redundant_run_skip_handler=redundant_usage_run_skip_handler,
)
@@ -283,8 +285,6 @@ def __init__(self, ctx: PipelineContext, config: BigQueryV2Config):
# Maps view ref -> actual sql
self.view_definitions: FileBackedDict[str] = FileBackedDict()
- self.sql_parser_schema_resolver = self._init_schema_resolver()
-
self.add_config_to_report()
atexit.register(cleanup, config)
@@ -371,7 +371,10 @@ def usage_capability_test(
report: BigQueryV2Report,
) -> CapabilityReport:
usage_extractor = BigQueryUsageExtractor(
- connection_conf, report, lambda ref: ""
+ connection_conf,
+ report,
+ schema_resolver=SchemaResolver(platform="bigquery"),
+ dataset_urn_builder=lambda ref: "",
)
for project_id in project_ids:
try:
@@ -447,7 +450,9 @@ def _init_schema_resolver(self) -> SchemaResolver:
self.config.lineage_parse_view_ddl or self.config.lineage_use_sql_parser
)
schema_ingestion_enabled = (
- self.config.include_views and self.config.include_tables
+ self.config.include_schema_metadata
+ and self.config.include_tables
+ and self.config.include_views
)
if schema_resolution_required and not schema_ingestion_enabled:
@@ -545,10 +550,11 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
if not projects:
return
- for project_id in projects:
- self.report.set_ingestion_stage(project_id.id, METADATA_EXTRACTION)
- logger.info(f"Processing project: {project_id.id}")
- yield from self._process_project(project_id)
+ if self.config.include_schema_metadata:
+ for project_id in projects:
+ self.report.set_ingestion_stage(project_id.id, METADATA_EXTRACTION)
+ logger.info(f"Processing project: {project_id.id}")
+ yield from self._process_project(project_id)
if self.config.include_usage_statistics:
yield from self.usage_extractor.get_usage_workunits(
diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit.py
index 55366d6c57cf83..8cef10ca234481 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit.py
@@ -12,6 +12,7 @@
get_first_missing_key,
get_first_missing_key_any,
)
+from datahub.utilities.urns.dataset_urn import DatasetUrn
AuditLogEntry = Any
@@ -178,6 +179,17 @@ def from_string_name(cls, ref: str) -> "BigQueryTableRef":
raise ValueError(f"invalid BigQuery table reference: {ref}")
return cls(BigqueryTableIdentifier(parts[1], parts[3], parts[5]))
+ @classmethod
+ def from_urn(cls, urn: str) -> "BigQueryTableRef":
+ """Raises: ValueError if urn is not a valid BigQuery table URN."""
+ dataset_urn = DatasetUrn.create_from_string(urn)
+ split = dataset_urn.get_dataset_name().rsplit(".", 3)
+ if len(split) == 3:
+ project, dataset, table = split
+ else:
+ _, project, dataset, table = split
+ return cls(BigqueryTableIdentifier(project, dataset, table))
+
def is_temporary_table(self, prefixes: List[str]) -> bool:
for prefix in prefixes:
if self.table_identifier.dataset.startswith(prefix):
@@ -566,7 +578,7 @@ def from_query_event(
query_event: QueryEvent,
debug_include_full_payloads: bool = False,
) -> "ReadEvent":
- readEvent = ReadEvent(
+ return ReadEvent(
actor_email=query_event.actor_email,
timestamp=query_event.timestamp,
resource=read_resource,
@@ -577,8 +589,6 @@ def from_query_event(
from_query=True,
)
- return readEvent
-
@classmethod
def from_exported_bigquery_audit_metadata(
cls, row: BigQueryAuditMetadata, debug_include_full_payloads: bool = False
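For reference, the new `BigQueryTableRef.from_urn` helper can be exercised on its own; a minimal sketch outside the patch, using an illustrative URN that assumes the standard `urn:li:dataset` layout for the bigquery platform:

```python
from datahub.ingestion.source.bigquery_v2.bigquery_audit import BigQueryTableRef

# Illustrative URN whose dataset name is "<project>.<dataset>.<table>".
urn = (
    "urn:li:dataset:(urn:li:dataPlatform:bigquery,"
    "my-project.my_dataset.my_table,PROD)"
)

ref = BigQueryTableRef.from_urn(urn)
print(ref.table_identifier)  # my-project.my_dataset.my_table
```

The four-way `rsplit` presumably guards against dataset names that carry an extra leading component (for example a platform instance); the common case is the three-part form shown here.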
diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py
index c13b08a6d9656b..58f2a600c2ff7d 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py
@@ -94,6 +94,11 @@ class BigQueryV2Config(
description="Regex patterns for project_id to filter in ingestion.",
)
+ include_schema_metadata: bool = Field(
+ default=True,
+ description="Whether to ingest the BigQuery schema, i.e. projects, schemas, tables, and views.",
+ )
+
usage: BigQueryUsageConfig = Field(
default=BigQueryUsageConfig(), description="Usage related configs"
)
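Not part of the patch, but for illustration, the new flag composes with the existing `include_usage_statistics` option to give a usage-only run; a minimal sketch assuming the remaining `BigQueryV2Config` fields keep their defaults and that the pydantic `parse_obj` constructor is sufficient here:

```python
from datahub.ingestion.source.bigquery_v2.bigquery_config import BigQueryV2Config

# Usage-only setup: skip projects/schemas/tables/views, keep usage extraction.
config = BigQueryV2Config.parse_obj(
    {
        "include_schema_metadata": False,
        "include_usage_statistics": True,
    }
)
assert config.include_schema_metadata is False
```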
diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_report.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_report.py
index 9d92b011ee2856..69913b383af874 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_report.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_report.py
@@ -33,6 +33,13 @@ class BigQueryAuditLogApiPerfReport(Report):
list_log_entries: PerfTimer = field(default_factory=PerfTimer)
+@dataclass
+class BigQueryProcessingPerfReport(Report):
+ sql_parsing_sec: PerfTimer = field(default_factory=PerfTimer)
+ store_usage_event_sec: PerfTimer = field(default_factory=PerfTimer)
+ usage_state_size: Optional[str] = None
+
+
@dataclass
class BigQueryV2Report(ProfilingSqlReport, IngestionStageReport, BaseTimeWindowReport):
num_total_lineage_entries: TopKDict[str, int] = field(default_factory=TopKDict)
@@ -120,8 +127,6 @@ class BigQueryV2Report(ProfilingSqlReport, IngestionStageReport, BaseTimeWindowR
read_reasons_stat: Counter[str] = field(default_factory=collections.Counter)
operation_types_stat: Counter[str] = field(default_factory=collections.Counter)
- usage_state_size: Optional[str] = None
-
exclude_empty_projects: Optional[bool] = None
schema_api_perf: BigQuerySchemaApiPerfReport = field(
@@ -130,6 +135,9 @@ class BigQueryV2Report(ProfilingSqlReport, IngestionStageReport, BaseTimeWindowR
audit_log_api_perf: BigQueryAuditLogApiPerfReport = field(
default_factory=BigQueryAuditLogApiPerfReport
)
+ processing_perf: BigQueryProcessingPerfReport = field(
+ default_factory=BigQueryProcessingPerfReport
+ )
lineage_start_time: Optional[datetime] = None
lineage_end_time: Optional[datetime] = None
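As an aside, the new `BigQueryProcessingPerfReport` timers are plain `PerfTimer` fields, so they can be driven as context managers exactly the way the usage extractor does below; a small sketch, with `time.sleep` standing in for real parsing work:

```python
import time

from datahub.ingestion.source.bigquery_v2.bigquery_report import (
    BigQueryProcessingPerfReport,
)

perf = BigQueryProcessingPerfReport()

# Time spent inside the block accumulates on the timer and is serialized
# with the rest of the report.
with perf.sql_parsing_sec:
    time.sleep(0.1)  # stand-in for sqlglot parsing
```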
diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/usage.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/usage.py
index 65b559550ffc59..ccc64184f3346f 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/usage.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/usage.py
@@ -35,7 +35,6 @@
AuditEvent,
AuditLogEntry,
BigQueryAuditMetadata,
- BigqueryTableIdentifier,
BigQueryTableRef,
QueryEvent,
ReadEvent,
@@ -60,9 +59,9 @@
USAGE_EXTRACTION_USAGE_AGGREGATION,
)
from datahub.metadata.schema_classes import OperationClass, OperationTypeClass
-from datahub.utilities.bigquery_sql_parser import BigQuerySQLParser
from datahub.utilities.file_backed_collections import ConnectionWrapper, FileBackedDict
from datahub.utilities.perf_timer import PerfTimer
+from datahub.utilities.sqlglot_lineage import SchemaResolver, sqlglot_lineage
logger: logging.Logger = logging.getLogger(__name__)
@@ -284,7 +283,7 @@ def delete_original_read_events_for_view_query_events(self) -> None:
)
def report_disk_usage(self, report: BigQueryV2Report) -> None:
- report.usage_state_size = str(
+ report.processing_perf.usage_state_size = str(
{
"main": humanfriendly.format_size(os.path.getsize(self.conn.filename)),
"queries": humanfriendly.format_size(
@@ -310,11 +309,14 @@ def __init__(
self,
config: BigQueryV2Config,
report: BigQueryV2Report,
+ *,
+ schema_resolver: SchemaResolver,
dataset_urn_builder: Callable[[BigQueryTableRef], str],
redundant_run_skip_handler: Optional[RedundantUsageRunSkipHandler] = None,
):
self.config: BigQueryV2Config = config
self.report: BigQueryV2Report = report
+ self.schema_resolver = schema_resolver
self.dataset_urn_builder = dataset_urn_builder
# Replace hash of query with uuid if there are hash conflicts
self.uuid_to_query: Dict[str, str] = {}
@@ -415,10 +417,11 @@ def generate_read_events_from_query(
) -> Iterable[AuditEvent]:
try:
tables = self.get_tables_from_query(
- query_event_on_view.project_id,
query_event_on_view.query,
+ default_project=query_event_on_view.project_id,
+ default_dataset=query_event_on_view.default_dataset,
)
- assert tables is not None and len(tables) != 0
+ assert len(tables) != 0
for table in tables:
yield AuditEvent.create(
ReadEvent.from_query_event(table, query_event_on_view)
@@ -462,12 +465,15 @@ def _ingest_events(
self.report.num_view_query_events += 1
for new_event in self.generate_read_events_from_query(query_event):
- num_generated += self._store_usage_event(
- new_event, usage_state, table_refs
- )
- num_aggregated += self._store_usage_event(
- audit_event, usage_state, table_refs
- )
+ with self.report.processing_perf.store_usage_event_sec:
+ num_generated += self._store_usage_event(
+ new_event, usage_state, table_refs
+ )
+ with self.report.processing_perf.store_usage_event_sec:
+ num_aggregated += self._store_usage_event(
+ audit_event, usage_state, table_refs
+ )
+
except Exception as e:
logger.warning(
f"Unable to store usage event {audit_event}", exc_info=True
@@ -905,54 +911,38 @@ def _generate_filter(self, corrected_start_time, corrected_end_time):
)
def get_tables_from_query(
- self, default_project: str, query: str
- ) -> Optional[List[BigQueryTableRef]]:
+ self, query: str, default_project: str, default_dataset: Optional[str] = None
+ ) -> List[BigQueryTableRef]:
"""
This method attempts to parse bigquery objects read in the query
"""
if not query:
- return None
+ return []
- parsed_tables = set()
try:
- parser = BigQuerySQLParser(
- query,
- self.config.sql_parser_use_external_process,
- use_raw_names=self.config.lineage_sql_parser_use_raw_names,
- )
- tables = parser.get_tables()
- except Exception as ex:
+ with self.report.processing_perf.sql_parsing_sec:
+ result = sqlglot_lineage(
+ query,
+ self.schema_resolver,
+ default_db=default_project,
+ default_schema=default_dataset,
+ )
+ except Exception:
logger.debug(
- f"Sql parsing failed on this query on view: {query}. "
- f"Usage won't be added. The error was {ex}."
+ f"Sql parsing failed on this query on view: {query}. Usage won't be added."
)
- return None
+ logger.debug(result.debug_info)
+ return []
- for table in tables:
- parts = table.split(".")
- if len(parts) == 2:
- parsed_tables.add(
- BigQueryTableRef(
- BigqueryTableIdentifier(
- project_id=default_project, dataset=parts[0], table=parts[1]
- )
- ).get_sanitized_table_ref()
- )
- elif len(parts) == 3:
- parsed_tables.add(
- BigQueryTableRef(
- BigqueryTableIdentifier(
- project_id=parts[0], dataset=parts[1], table=parts[2]
- )
- ).get_sanitized_table_ref()
- )
- else:
- logger.debug(
- f"Invalid table identifier {table} when parsing query on view {query}"
- )
+ parsed_table_refs = []
+ for urn in result.in_tables:
+ try:
+ parsed_table_refs.append(BigQueryTableRef.from_urn(urn))
+ except ValueError:
+ logger.debug(f"Invalid urn {urn} when parsing query on view {query}")
self.report.num_view_query_events_failed_table_identification += 1
- return list(parsed_tables)
+ return parsed_table_refs
def _report_error(
self, label: str, e: Exception, group: Optional[str] = None
diff --git a/metadata-ingestion/src/datahub/utilities/bigquery_sql_parser.py b/metadata-ingestion/src/datahub/utilities/bigquery_sql_parser.py
deleted file mode 100644
index 4ad41f1fe23c9a..00000000000000
--- a/metadata-ingestion/src/datahub/utilities/bigquery_sql_parser.py
+++ /dev/null
@@ -1,92 +0,0 @@
-import re
-from typing import List
-
-import sqlparse
-
-from datahub.utilities.sql_parser import SqlLineageSQLParser, SQLParser
-
-
-class BigQuerySQLParser(SQLParser):
- parser: SQLParser
-
- def __init__(
- self,
- sql_query: str,
- use_external_process: bool = False,
- use_raw_names: bool = False,
- ) -> None:
- super().__init__(sql_query)
-
- self._parsed_sql_query = self.parse_sql_query(sql_query)
- self.parser = SqlLineageSQLParser(
- self._parsed_sql_query, use_external_process, use_raw_names
- )
-
- def parse_sql_query(self, sql_query: str) -> str:
- sql_query = BigQuerySQLParser._parse_bigquery_comment_sign(sql_query)
- sql_query = BigQuerySQLParser._escape_keyword_from_as_field_name(sql_query)
- sql_query = BigQuerySQLParser._escape_cte_name_after_keyword_with(sql_query)
-
- sql_query = sqlparse.format(
- sql_query.strip(),
- reindent_aligned=True,
- strip_comments=True,
- )
-
- sql_query = BigQuerySQLParser._escape_table_or_view_name_at_create_statement(
- sql_query
- )
- sql_query = BigQuerySQLParser._escape_object_name_after_keyword_from(sql_query)
- sql_query = BigQuerySQLParser._remove_comma_before_from(sql_query)
-
- return sql_query
-
- @staticmethod
- def _parse_bigquery_comment_sign(sql_query: str) -> str:
- return re.sub(r"#(.*)", r"-- \1", sql_query, flags=re.IGNORECASE)
-
- @staticmethod
- def _escape_keyword_from_as_field_name(sql_query: str) -> str:
- return re.sub(r"(\w*\.from)", r"`\1`", sql_query, flags=re.IGNORECASE)
-
- @staticmethod
- def _escape_cte_name_after_keyword_with(sql_query: str) -> str:
- """
- Escape the first cte name in case it is one of reserved words
- """
- return re.sub(r"(with\s)([^`\s()]+)", r"\1`\2`", sql_query, flags=re.IGNORECASE)
-
- @staticmethod
- def _escape_table_or_view_name_at_create_statement(sql_query: str) -> str:
- """
- Reason: in case table name contains hyphens which breaks sqllineage later on
- """
- return re.sub(
- r"(create.*\s)(table\s|view\s)([^`\s()]+)(?=\sas)",
- r"\1\2`\3`",
- sql_query,
- flags=re.IGNORECASE,
- )
-
- @staticmethod
- def _remove_comma_before_from(sql_query: str) -> str:
- return re.sub(r",(\s*?)(?=from)", r" ", sql_query, flags=re.IGNORECASE)
-
- @staticmethod
- def _escape_object_name_after_keyword_from(sql_query: str) -> str:
- """
- Reason: in case table name contains hyphens which breaks sqllineage later on
- Note: ignore cases of having keyword FROM as part of datetime function EXTRACT
- """
- return re.sub(
- r"(?...)(from\s)([^`\s()]+)",
- r"\1`\2`",
- sql_query,
- flags=re.IGNORECASE,
- )
-
- def get_tables(self) -> List[str]:
- return self.parser.get_tables()
-
- def get_columns(self) -> List[str]:
- return self.parser.get_columns()
diff --git a/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py b/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py
index b43c8de4c8f3d8..0f84871d6c96ac 100644
--- a/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py
+++ b/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py
@@ -333,6 +333,9 @@ def _table_level_lineage(
return tables, modified
+TABLE_CASE_SENSITIVE_PLATFORMS = {"bigquery"}
+
+
class SchemaResolver(Closeable):
def __init__(
self,
@@ -402,7 +405,10 @@ def resolve_table(self, table: _TableName) -> Tuple[str, Optional[SchemaInfo]]:
if schema_info:
return urn_lower, schema_info
- return urn_lower, None
+ if self.platform in TABLE_CASE_SENSITIVE_PLATFORMS:
+ return urn, None
+ else:
+ return urn_lower, None
def _resolve_schema_info(self, urn: str) -> Optional[SchemaInfo]:
if urn in self._schema_cache:
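To see the effect of `TABLE_CASE_SENSITIVE_PLATFORMS`, a short sketch (not part of the patch) that parses a CamelCase BigQuery table with an empty resolver; the expected URN in the comment is approximate:

```python
from datahub.utilities.sqlglot_lineage import SchemaResolver, sqlglot_lineage

resolver = SchemaResolver(platform="bigquery")

result = sqlglot_lineage(
    "SELECT * FROM `project.DataSet.CamelTable`",
    resolver,
)

# With no schema registered, the fallback URN now preserves the original
# casing for BigQuery, e.g. a urn containing project.DataSet.CamelTable
# rather than the lower-cased project.dataset.cameltable.
print(result.in_tables)
```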
diff --git a/metadata-ingestion/tests/performance/bigquery/test_bigquery_usage.py b/metadata-ingestion/tests/performance/bigquery/test_bigquery_usage.py
index bbc3378450bffd..9bbe9c45887a8f 100644
--- a/metadata-ingestion/tests/performance/bigquery/test_bigquery_usage.py
+++ b/metadata-ingestion/tests/performance/bigquery/test_bigquery_usage.py
@@ -14,6 +14,7 @@
from datahub.ingestion.source.bigquery_v2.bigquery_report import BigQueryV2Report
from datahub.ingestion.source.bigquery_v2.usage import BigQueryUsageExtractor
from datahub.utilities.perf_timer import PerfTimer
+from datahub.utilities.sqlglot_lineage import SchemaResolver
from tests.performance.bigquery.bigquery_events import generate_events, ref_from_table
from tests.performance.data_generation import (
NormalDistribution,
@@ -47,7 +48,10 @@ def run_test():
usage_extractor = BigQueryUsageExtractor(
config,
report,
- lambda ref: make_dataset_urn("bigquery", str(ref.table_identifier)),
+ schema_resolver=SchemaResolver(platform="bigquery"),
+ dataset_urn_builder=lambda ref: make_dataset_urn(
+ "bigquery", str(ref.table_identifier)
+ ),
)
report.set_ingestion_stage("All", "Event Generation")
@@ -83,7 +87,7 @@ def run_test():
print(
f"Peak Memory Used: {humanfriendly.format_size(peak_memory_usage - pre_mem_usage)}"
)
- print(f"Disk Used: {report.usage_state_size}")
+ print(f"Disk Used: {report.processing_perf.usage_state_size}")
print(f"Hash collisions: {report.num_usage_query_hash_collisions}")
diff --git a/metadata-ingestion/tests/unit/test_bigquery_sql_lineage.py b/metadata-ingestion/tests/unit/test_bigquery_sql_lineage.py
index f807be747a193a..755e9081dda390 100644
--- a/metadata-ingestion/tests/unit/test_bigquery_sql_lineage.py
+++ b/metadata-ingestion/tests/unit/test_bigquery_sql_lineage.py
@@ -1,4 +1,35 @@
-from datahub.utilities.bigquery_sql_parser import BigQuerySQLParser
+from typing import List
+
+from datahub.ingestion.source.bigquery_v2.bigquery_audit import BigQueryTableRef
+from datahub.utilities.sqlglot_lineage import SchemaResolver, sqlglot_lineage
+
+
+class BigQuerySQLParser:
+ def __init__(self, sql_query: str, schema_resolver: SchemaResolver) -> None:
+ self.result = sqlglot_lineage(sql_query, schema_resolver)
+
+ def get_tables(self) -> List[str]:
+ ans = []
+ for urn in self.result.in_tables:
+ table_ref = BigQueryTableRef.from_urn(urn)
+ ans.append(str(table_ref.table_identifier))
+ return ans
+
+ def get_columns(self) -> List[str]:
+ ans = []
+ for col_info in self.result.column_lineage or []:
+ for col_ref in col_info.upstreams:
+ ans.append(col_ref.column)
+ return ans
+
+
+def test_bigquery_sql_lineage_basic():
+ parser = BigQuerySQLParser(
+ sql_query="""SELECT * FROM project_1.database_1.view_1""",
+ schema_resolver=SchemaResolver(platform="bigquery"),
+ )
+
+ assert parser.get_tables() == ["project_1.database_1.view_1"]
def test_bigquery_sql_lineage_hash_as_comment_sign_is_accepted():
@@ -14,7 +45,8 @@ def test_bigquery_sql_lineage_hash_as_comment_sign_is_accepted():
-- this comment will not break sqllineage either
# this comment will not break sqllineage either
FROM `project.dataset.src_tbl`
- """
+ """,
+ schema_resolver=SchemaResolver(platform="bigquery"),
)
assert parser.get_tables() == ["project.dataset.src_tbl"]
@@ -39,7 +71,7 @@ def test_bigquery_sql_lineage_camel_case_table():
# this comment will not break sqllineage either
FROM `project.dataset.CamelCaseTable`
""",
- use_raw_names=True,
+ schema_resolver=SchemaResolver(platform="bigquery"),
)
assert parser.get_tables() == ["project.dataset.CamelCaseTable"]
@@ -64,7 +96,7 @@ def test_bigquery_sql_lineage_camel_case_dataset():
# this comment will not break sqllineage either
FROM `project.DataSet.table`
""",
- use_raw_names=True,
+ schema_resolver=SchemaResolver(platform="bigquery"),
)
assert parser.get_tables() == ["project.DataSet.table"]
@@ -89,7 +121,7 @@ def test_bigquery_sql_lineage_camel_case_table_and_dataset():
# this comment will not break sqllineage either
FROM `project.DataSet.CamelTable`
""",
- use_raw_names=True,
+ schema_resolver=SchemaResolver(platform="bigquery"),
)
assert parser.get_tables() == ["project.DataSet.CamelTable"]
@@ -117,7 +149,7 @@ def test_bigquery_sql_lineage_camel_case_table_and_dataset_subquery():
SELECT * FROM `project.DataSet.CamelTable`
)
""",
- use_raw_names=True,
+ schema_resolver=SchemaResolver(platform="bigquery"),
)
assert parser.get_tables() == ["project.DataSet.CamelTable"]
@@ -146,7 +178,7 @@ def test_bigquery_sql_lineage_camel_case_table_and_dataset_joins():
LEFT JOIN `project.DataSet3.CamelTable3`
on c.id = b.id
""",
- use_raw_names=True,
+ schema_resolver=SchemaResolver(platform="bigquery"),
)
assert parser.get_tables() == [
@@ -179,7 +211,7 @@ def test_bigquery_sql_lineage_camel_case_table_and_dataset_joins_and_subquery():
LEFT JOIN (SELECT * FROM `project.DataSet3.CamelTable3`) c
ON c.id = b.id
""",
- use_raw_names=True,
+ schema_resolver=SchemaResolver(platform="bigquery"),
)
assert parser.get_tables() == [
@@ -199,7 +231,8 @@ def test_bigquery_sql_lineage_keyword_data_is_accepted():
FROM `project.example_dataset.example_table`
)
SELECT * FROM data
- """
+ """,
+ schema_resolver=SchemaResolver(platform="bigquery"),
)
assert parser.get_tables() == ["project.example_dataset.example_table"]
@@ -213,7 +246,8 @@ def test_bigquery_sql_lineage_keyword_admin_is_accepted():
FROM `project.example_dataset.example_table`
)
SELECT * FROM admin
- """
+ """,
+ schema_resolver=SchemaResolver(platform="bigquery"),
)
assert parser.get_tables() == ["project.example_dataset.example_table"]
@@ -238,7 +272,8 @@ def test_bigquery_sql_lineage_cte_alias_as_keyword_is_accepted():
)
SELECT *
FROM map
- """
+ """,
+ schema_resolver=SchemaResolver(platform="bigquery"),
)
assert parser.get_tables() == [
@@ -255,7 +290,8 @@ def test_bigquery_sql_lineage_create_or_replace_view_name_with_hyphens_is_accept
FROM project.dataset.src_table_a
UNION
SELECT * FROM `project.dataset.src_table_b`
- """
+ """,
+ schema_resolver=SchemaResolver(platform="bigquery"),
)
assert parser.get_tables() == [
@@ -270,7 +306,8 @@ def test_bigquery_sql_lineage_source_table_name_with_hyphens_is_accepted():
CREATE OR REPLACE VIEW `project.dataset.test_view` AS
SELECT *
FROM test-project.dataset.src_table
- """
+ """,
+ schema_resolver=SchemaResolver(platform="bigquery"),
)
assert parser.get_tables() == ["test-project.dataset.src_table"]
@@ -282,7 +319,8 @@ def test_bigquery_sql_lineage_from_as_column_name_is_accepted():
CREATE OR REPLACE VIEW `project.dataset.test_view` AS
SELECT x.from AS col
FROM project.dataset.src_table AS x
- """
+ """,
+ schema_resolver=SchemaResolver(platform="bigquery"),
)
assert parser.get_tables() == ["project.dataset.src_table"]
diff --git a/metadata-ingestion/tests/unit/test_bigquery_sql_parser.py b/metadata-ingestion/tests/unit/test_bigquery_sql_parser.py
deleted file mode 100644
index 2a73bfc5e8b686..00000000000000
--- a/metadata-ingestion/tests/unit/test_bigquery_sql_parser.py
+++ /dev/null
@@ -1,327 +0,0 @@
-import pytest
-
-from datahub.utilities.bigquery_sql_parser import BigQuerySQLParser
-
-
-def test_bigquery_sql_parser_comments_are_removed():
- parser = BigQuerySQLParser(
- sql_query="""
-/*
-HERE IS A STANDARD COMMENT BLOCK
-THIS WILL NOT BREAK sqllineage
-*/
-CREATE OR REPLACE TABLE `project.dataset.test_view` AS
-#This, comment will not break sqllineage
-SELECT foo
--- this comment will not break sqllineage either
-# this comment will not break sqllineage either
- FROM `project.dataset.src_table`
-"""
- )
-
- assert (
- parser._parsed_sql_query
- == """CREATE OR REPLACE TABLE `project.dataset.test_view` AS SELECT foo
- FROM `project.dataset.src_table`"""
- )
-
- assert parser.get_tables() == ["project.dataset.src_table"]
-
-
-def test_bigquery_sql_parser_formats_input_sql():
- parser = BigQuerySQLParser(
- sql_query="""
-CREATE OR REPLACE TABLE `project.dataset.test_view` AS
-SELECT foo FROM `project.dataset.src_table_a` AS a
-INNER JOIN `project.dataset.src_table_b` AS b ON a.key_field = b.key_field
-"""
- )
-
- assert (
- parser._parsed_sql_query
- == """CREATE OR REPLACE TABLE `project.dataset.test_view` AS SELECT foo
- FROM `project.dataset.src_table_a` AS a
- INNER JOIN `project.dataset.src_table_b` AS b
- ON a.key_field = b.key_field"""
- )
-
- assert parser.get_tables() == [
- "project.dataset.src_table_a",
- "project.dataset.src_table_b",
- ]
-
-
-def test_remove_comma_before_from():
- assert (
- BigQuerySQLParser._remove_comma_before_from(
- """
-select a, b,from `project.dataset.table_name_1`
-"""
- )
- == """
-select a, b from `project.dataset.table_name_1`
-"""
- )
-
- assert (
- BigQuerySQLParser._remove_comma_before_from(
- """
-select a, b from `project.dataset.table_name_1`
-"""
- )
- == """
-select a, b from `project.dataset.table_name_1`
-"""
- )
-
- assert (
- BigQuerySQLParser._remove_comma_before_from(
- """
-select
- a,
- b,
-from `project.dataset.table_name_1`
-"""
- )
- == """
-select
- a,
- b from `project.dataset.table_name_1`
-"""
- )
-
-
-def test_bigquery_sql_parser_subquery():
- parser = BigQuerySQLParser(
- sql_query="""
- create or replace table smoke_test_db.table_from_view_and_table
- as (select b.date_utc, v.revenue from smoke_test_db.base_table b, smoke_test_db.view_from_table v
- """
- )
- assert parser.get_tables() == [
- "smoke_test_db.base_table",
- "smoke_test_db.view_from_table",
- ]
-
-
-def test_bigquery_sql_parser_comment_sign_switched_correctly():
- sql_query = BigQuerySQLParser._parse_bigquery_comment_sign(
- """
-#upper comment
-SELECT * FROM hello
-# lower comment
-"""
- )
-
- assert (
- sql_query
- == """
--- upper comment
-SELECT * FROM hello
--- lower comment
-"""
- )
-
-
-def test_bigquery_sql_parser_keyword_from_is_escaped_if_used_as_fieldname():
- sql_query = BigQuerySQLParser._escape_keyword_from_as_field_name(
- """
-SELECT hello.from AS col FROM hello
-"""
- )
-
- assert (
- sql_query
- == """
-SELECT `hello.from` AS col FROM hello
-"""
- )
-
-
-def test_bigquery_sql_parser_first_cte_name_is_escaped():
- sql_query = BigQuerySQLParser._escape_cte_name_after_keyword_with(
- """
-CREATE OR REPLACE VIEW `test_view` AS
-WITH cte_1 AS (
- SELECT * FROM foo
-),
-cte_2 AS (
- SELECT * FROM bar
-)
-SELECT * FROM cte_1 UNION ALL
-SELECT * FROM cte_2
-"""
- )
-
- assert (
- sql_query
- == """
-CREATE OR REPLACE VIEW `test_view` AS
-WITH `cte_1` AS (
- SELECT * FROM foo
-),
-cte_2 AS (
- SELECT * FROM bar
-)
-SELECT * FROM cte_1 UNION ALL
-SELECT * FROM cte_2
-"""
- )
-
-
-def test_bigquery_sql_parser_table_name_is_escaped_at_create_statement():
- sql_query_create = BigQuerySQLParser._escape_table_or_view_name_at_create_statement(
- """
-CREATE TABLE project.dataset.test_table AS
-col_1 STRING,
-col_2 STRING
-"""
- )
-
- sql_query_create_or_replace = BigQuerySQLParser._escape_table_or_view_name_at_create_statement(
- """
-CREATE OR REPLACE TABLE project.dataset.test_table AS
-col_1 STRING,
-col_2 STRING
-"""
- )
-
- assert (
- sql_query_create
- == """
-CREATE TABLE `project.dataset.test_table` AS
-col_1 STRING,
-col_2 STRING
-"""
- )
- assert (
- sql_query_create_or_replace
- == """
-CREATE OR REPLACE TABLE `project.dataset.test_table` AS
-col_1 STRING,
-col_2 STRING
-"""
- )
-
-
-def test_bigquery_sql_parser_view_name_is_escaped_at_create_statement():
- sql_query_create = BigQuerySQLParser._escape_table_or_view_name_at_create_statement(
- """
-CREATE VIEW project.dataset.test_view AS
-SELECT * FROM project.dataset.src_table
-"""
- )
-
- sql_query_create_or_replace = BigQuerySQLParser._escape_table_or_view_name_at_create_statement(
- """
-CREATE OR REPLACE VIEW project.dataset.test_view AS
-SELECT * FROM project.dataset.src_table
-"""
- )
-
- assert (
- sql_query_create
- == """
-CREATE VIEW `project.dataset.test_view` AS
-SELECT * FROM project.dataset.src_table
-"""
- )
- assert (
- sql_query_create_or_replace
- == """
-CREATE OR REPLACE VIEW `project.dataset.test_view` AS
-SELECT * FROM project.dataset.src_table
-"""
- )
-
-
-def test_bigquery_sql_parser_object_name_is_escaped_after_keyword_from():
- sql_query = BigQuerySQLParser._escape_object_name_after_keyword_from(
- """
-CREATE OR REPLACE VIEW `project.dataset.test_view` AS
-SELECT * FROM src-project.dataset.src_table_a UNION ALL
-SELECT * FROM project.dataset.src_table_b
-"""
- )
-
- assert (
- sql_query
- == """
-CREATE OR REPLACE VIEW `project.dataset.test_view` AS
-SELECT * FROM `src-project.dataset.src_table_a` UNION ALL
-SELECT * FROM `project.dataset.src_table_b`
-"""
- )
-
-
-def test_bigquery_sql_parser_field_name_is_not_escaped_after_keyword_from_in_datetime_functions():
- sql_query = BigQuerySQLParser._escape_object_name_after_keyword_from(
- """
-CREATE OR REPLACE VIEW `project.dataset.test_view` AS
-SELECT
-EXTRACT(MICROSECOND FROM time_field) AS col_1,
-EXTRACT(MILLISECOND FROM time_field) AS col_2,
-EXTRACT(SECOND FROM time_field) AS col_3,
-EXTRACT(MINUTE FROM time_field) AS col_4,
-EXTRACT(HOUR FROM time_field) AS col_5,
-EXTRACT(DAYOFWEEK FROM time_field) AS col_6,
-EXTRACT(DAY FROM time_field) AS col_7,
-EXTRACT(DAYOFYEAR FROM time_field) AS col_8,
-EXTRACT(WEEK FROM time_field) AS col_9,
-EXTRACT(WEEK FROM time_field) AS col_10,
-EXTRACT(ISOWEEK FROM time_field) AS col_11,
-EXTRACT(MONTH FROM time_field) AS col_12,
-EXTRACT(QUARTER FROM time_field) AS col_13,
-EXTRACT(YEAR FROM time_field) AS col_14,
-EXTRACT(ISOYEAR FROM time_field) AS col_15,
-EXTRACT(DATE FROM time_field) AS col_16,
-EXTRACT(TIME FROM time_field) AS col_17
-FROM src-project.dataset.src_table_a
-"""
- )
-
- assert (
- sql_query
- == """
-CREATE OR REPLACE VIEW `project.dataset.test_view` AS
-SELECT
-EXTRACT(MICROSECOND FROM time_field) AS col_1,
-EXTRACT(MILLISECOND FROM time_field) AS col_2,
-EXTRACT(SECOND FROM time_field) AS col_3,
-EXTRACT(MINUTE FROM time_field) AS col_4,
-EXTRACT(HOUR FROM time_field) AS col_5,
-EXTRACT(DAYOFWEEK FROM time_field) AS col_6,
-EXTRACT(DAY FROM time_field) AS col_7,
-EXTRACT(DAYOFYEAR FROM time_field) AS col_8,
-EXTRACT(WEEK FROM time_field) AS col_9,
-EXTRACT(WEEK FROM time_field) AS col_10,
-EXTRACT(ISOWEEK FROM time_field) AS col_11,
-EXTRACT(MONTH FROM time_field) AS col_12,
-EXTRACT(QUARTER FROM time_field) AS col_13,
-EXTRACT(YEAR FROM time_field) AS col_14,
-EXTRACT(ISOYEAR FROM time_field) AS col_15,
-EXTRACT(DATE FROM time_field) AS col_16,
-EXTRACT(TIME FROM time_field) AS col_17
-FROM `src-project.dataset.src_table_a`
-"""
- )
-
-
-def test_bigquery_sql_parser_with_semicolon_in_from():
- sql_query = """CREATE VIEW `acryl-staging.smoke_test_db.view_from_table`\nAS select * from smoke_test_db.base_table;"""
-
- table_list = BigQuerySQLParser(sql_query).get_tables()
- table_list.sort()
- assert table_list == ["smoke_test_db.base_table"]
-
-
-@pytest.mark.xfail
-def test_bigquery_sql_parser_with_parenthesis_in_from():
- sql_query = """
- CREATE VIEW `acryl-staging.smoke_test_db.view_from_table` AS
- select * from smoke_test_db.base_table LEFT JOIN UNNEST(my_array) ON day1 = day2;
- """
-
- table_list = BigQuerySQLParser(sql_query).get_tables()
- table_list.sort()
- assert table_list == ["smoke_test_db.base_table"]
diff --git a/metadata-ingestion/tests/unit/test_bigquery_usage.py b/metadata-ingestion/tests/unit/test_bigquery_usage.py
index c0055763bc15b4..664d3112810ff0 100644
--- a/metadata-ingestion/tests/unit/test_bigquery_usage.py
+++ b/metadata-ingestion/tests/unit/test_bigquery_usage.py
@@ -35,6 +35,7 @@
TimeWindowSizeClass,
)
from datahub.testing.compare_metadata_json import diff_metadata_json
+from datahub.utilities.sqlglot_lineage import SchemaResolver
from tests.performance.bigquery.bigquery_events import generate_events, ref_from_table
from tests.performance.data_generation import generate_data, generate_queries
from tests.performance.data_model import Container, FieldAccess, Query, Table, View
@@ -202,7 +203,10 @@ def usage_extractor(config: BigQueryV2Config) -> BigQueryUsageExtractor:
return BigQueryUsageExtractor(
config,
report,
- lambda ref: make_dataset_urn("bigquery", str(ref.table_identifier)),
+ schema_resolver=SchemaResolver(platform="bigquery"),
+ dataset_urn_builder=lambda ref: make_dataset_urn(
+ "bigquery", str(ref.table_identifier)
+ ),
)
@@ -961,21 +965,21 @@ def test_operational_stats(
def test_get_tables_from_query(usage_extractor):
assert usage_extractor.get_tables_from_query(
- PROJECT_1, "SELECT * FROM project-1.database_1.view_1"
+ "SELECT * FROM project-1.database_1.view_1", default_project=PROJECT_1
) == [
BigQueryTableRef(BigqueryTableIdentifier("project-1", "database_1", "view_1"))
]
assert usage_extractor.get_tables_from_query(
- PROJECT_1, "SELECT * FROM database_1.view_1"
+ "SELECT * FROM database_1.view_1", default_project=PROJECT_1
) == [
BigQueryTableRef(BigqueryTableIdentifier("project-1", "database_1", "view_1"))
]
assert sorted(
usage_extractor.get_tables_from_query(
- PROJECT_1,
"SELECT v.id, v.name, v.total, t.name as name1 FROM database_1.view_1 as v inner join database_1.table_1 as t on v.id=t.id",
+ default_project=PROJECT_1,
)
) == [
BigQueryTableRef(BigqueryTableIdentifier("project-1", "database_1", "table_1")),
@@ -984,8 +988,8 @@ def test_get_tables_from_query(usage_extractor):
assert sorted(
usage_extractor.get_tables_from_query(
- PROJECT_1,
"CREATE TABLE database_1.new_table AS SELECT v.id, v.name, v.total, t.name as name1 FROM database_1.view_1 as v inner join database_1.table_1 as t on v.id=t.id",
+ default_project=PROJECT_1,
)
) == [
BigQueryTableRef(BigqueryTableIdentifier("project-1", "database_1", "table_1")),
diff --git a/metadata-ingestion/tests/unit/test_bigqueryv2_usage_source.py b/metadata-ingestion/tests/unit/test_bigqueryv2_usage_source.py
index 44fd840f28d594..25e849a5092938 100644
--- a/metadata-ingestion/tests/unit/test_bigqueryv2_usage_source.py
+++ b/metadata-ingestion/tests/unit/test_bigqueryv2_usage_source.py
@@ -10,6 +10,7 @@
from datahub.ingestion.source.bigquery_v2.bigquery_config import BigQueryV2Config
from datahub.ingestion.source.bigquery_v2.bigquery_report import BigQueryV2Report
from datahub.ingestion.source.bigquery_v2.usage import BigQueryUsageExtractor
+from datahub.utilities.sqlglot_lineage import SchemaResolver
FROZEN_TIME = "2021-07-20 00:00:00"
@@ -114,7 +115,10 @@ def test_bigqueryv2_filters():
corrected_start_time = config.start_time - config.max_query_duration
corrected_end_time = config.end_time + config.max_query_duration
filter: str = BigQueryUsageExtractor(
- config, BigQueryV2Report(), lambda x: ""
+ config,
+ BigQueryV2Report(),
+ schema_resolver=SchemaResolver(platform="bigquery"),
+ dataset_urn_builder=lambda x: "",
)._generate_filter(corrected_start_time, corrected_end_time)
assert filter == expected_filter
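Taken together, the extractor now receives a shared `SchemaResolver` through keyword-only arguments and exposes the keyword-style `get_tables_from_query`; a rough end-to-end sketch outside the patch, assuming a default-constructed config and report are enough for parsing:

```python
from datahub.ingestion.source.bigquery_v2.bigquery_config import BigQueryV2Config
from datahub.ingestion.source.bigquery_v2.bigquery_report import BigQueryV2Report
from datahub.ingestion.source.bigquery_v2.usage import BigQueryUsageExtractor
from datahub.utilities.sqlglot_lineage import SchemaResolver

# schema_resolver and dataset_urn_builder are keyword-only after this patch.
extractor = BigQueryUsageExtractor(
    BigQueryV2Config(),
    BigQueryV2Report(),
    schema_resolver=SchemaResolver(platform="bigquery"),
    dataset_urn_builder=lambda ref: str(ref),
)

tables = extractor.get_tables_from_query(
    "SELECT * FROM dataset_1.table_1",
    default_project="project-1",
)
print(tables)  # one BigQueryTableRef for project-1.dataset_1.table_1
```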
From 60347d6735ea2136d721bbf6644ae82df6519d9c Mon Sep 17 00:00:00 2001
From: Diego Reiriz Cores
Date: Thu, 28 Dec 2023 12:09:10 +0100
Subject: [PATCH 079/540] fix(ingest/mongodb): support disabling
schemaSamplingSize (#9295)
Co-authored-by: Harshal Sheth
---
.../src/datahub/ingestion/source/mongodb.py | 8 +++++---
.../tests/integration/mongodb/test_mongodb.py | 1 +
2 files changed, 6 insertions(+), 3 deletions(-)
diff --git a/metadata-ingestion/src/datahub/ingestion/source/mongodb.py b/metadata-ingestion/src/datahub/ingestion/source/mongodb.py
index 2aa8b1d37d4776..283ab652f23c62 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/mongodb.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/mongodb.py
@@ -102,7 +102,7 @@ class MongoDBConfig(
)
schemaSamplingSize: Optional[PositiveInt] = Field(
default=1000,
- description="Number of documents to use when inferring schema size. If set to `0`, all documents will be scanned.",
+ description="Number of documents to use when inferring schema size. If set to `null`, all documents will be scanned.",
)
useRandomSampling: bool = Field(
default=True,
@@ -225,13 +225,15 @@ def construct_schema_pymongo(
]
if use_random_sampling:
# get sample documents in collection
- aggregations.append({"$sample": {"size": sample_size}})
+ if sample_size:
+ aggregations.append({"$sample": {"size": sample_size}})
documents = collection.aggregate(
aggregations,
allowDiskUse=True,
)
else:
- aggregations.append({"$limit": sample_size})
+ if sample_size:
+ aggregations.append({"$limit": sample_size})
documents = collection.aggregate(aggregations, allowDiskUse=True)
return construct_schema(list(documents), delimiter)
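For context on why the description now says `null` rather than `0`: `schemaSamplingSize` is a `PositiveInt`, so zero never passes validation, and unsetting it is what disables sampling. A small sketch (not part of the patch, assuming the other `MongoDBConfig` fields keep their defaults):

```python
from datahub.ingestion.source.mongodb import MongoDBConfig

# None disables sampling, so every document is scanned for schema inference.
config = MongoDBConfig.parse_obj({"schemaSamplingSize": None})
assert config.schemaSamplingSize is None

try:
    MongoDBConfig.parse_obj({"schemaSamplingSize": 0})
except Exception as err:  # pydantic ValidationError: 0 is not a PositiveInt
    print(f"rejected: {err}")
```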
diff --git a/metadata-ingestion/tests/integration/mongodb/test_mongodb.py b/metadata-ingestion/tests/integration/mongodb/test_mongodb.py
index 56fb471d4c9f1b..0a0ba55ff5b802 100644
--- a/metadata-ingestion/tests/integration/mongodb/test_mongodb.py
+++ b/metadata-ingestion/tests/integration/mongodb/test_mongodb.py
@@ -26,6 +26,7 @@ def test_mongodb_ingest(docker_compose_runner, pytestconfig, tmp_path, mock_time
"password": "examplepass",
"maxDocumentSize": 25000,
"platform_instance": "instance",
+ "schemaSamplingSize": None,
},
},
"sink": {
From 2cd38a469d5ac607bd510a0ca045d151b4657afd Mon Sep 17 00:00:00 2001
From: Tony Ouyang
Date: Thu, 28 Dec 2023 03:09:30 -0800
Subject: [PATCH 080/540] fix(ingest): Fix mongodb ingestion when
platform_instance is missing from recipe (#9486)
Co-authored-by: Harshal Sheth
---
metadata-ingestion/src/datahub/ingestion/source/mongodb.py | 2 ++
1 file changed, 2 insertions(+)
diff --git a/metadata-ingestion/src/datahub/ingestion/source/mongodb.py b/metadata-ingestion/src/datahub/ingestion/source/mongodb.py
index 283ab652f23c62..577da91ee82da9 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/mongodb.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/mongodb.py
@@ -379,6 +379,8 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
platform_instance=self.config.platform_instance,
)
+ # Initialize data_platform_instance with a default value
+ data_platform_instance = None
if self.config.platform_instance:
data_platform_instance = DataPlatformInstanceClass(
platform=make_data_platform_urn(platform),
From e343b69ce4881ceefdf4af0cafea29188092de52 Mon Sep 17 00:00:00 2001
From: Mayuri Nehate <33225191+mayurinehate@users.noreply.github.com>
Date: Thu, 28 Dec 2023 16:50:13 +0530
Subject: [PATCH 081/540] fix(ingest/snowflake): explicit set schema if public
schema is absent (#9526)
---
.../source/snowflake/snowflake_profiler.py | 14 ++++++++++++++
.../ingestion/source/snowflake/snowflake_query.py | 4 ++++
.../source/state/stateful_ingestion_base.py | 2 +-
3 files changed, 19 insertions(+), 1 deletion(-)
diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_profiler.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_profiler.py
index 4bda7da422e9d6..9a37f779bbcd58 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_profiler.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_profiler.py
@@ -24,6 +24,8 @@
logger = logging.getLogger(__name__)
+PUBLIC_SCHEMA = "PUBLIC"
+
class SnowflakeProfiler(GenericProfiler, SnowflakeCommonMixin):
def __init__(
@@ -36,6 +38,7 @@ def __init__(
self.config: SnowflakeV2Config = config
self.report: SnowflakeV2Report = report
self.logger = logger
+ self.database_default_schema: Dict[str, str] = dict()
def get_workunits(
self, database: SnowflakeDatabase, db_tables: Dict[str, List[SnowflakeTable]]
@@ -47,6 +50,10 @@ def get_workunits(
"max_overflow", self.config.profiling.max_workers
)
+ if PUBLIC_SCHEMA not in db_tables:
+ # If the PUBLIC schema is absent, we use any one of the schemas as the default schema
+ self.database_default_schema[database.name] = list(db_tables.keys())[0]
+
profile_requests = []
for schema in database.schemas:
for table in db_tables[schema.name]:
@@ -136,9 +143,16 @@ def get_profiler_instance(
)
def callable_for_db_connection(self, db_name: str) -> Callable:
+ schema_name = self.database_default_schema.get(db_name)
+
def get_db_connection():
conn = self.config.get_connection()
conn.cursor().execute(SnowflakeQuery.use_database(db_name))
+
+ # As mentioned here - https://docs.snowflake.com/en/sql-reference/sql/use-database#usage-notes
+ # no schema is selected if PUBLIC schema is absent. We need to explicitly call `USE SCHEMA <schema_name>`
+ if schema_name:
+ conn.cursor().execute(SnowflakeQuery.use_schema(schema_name))
return conn
return get_db_connection
diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_query.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_query.py
index 267f7cf0749099..724e4392f1d612 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_query.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_query.py
@@ -80,6 +80,10 @@ def show_tags() -> str:
def use_database(db_name: str) -> str:
return f'use database "{db_name}"'
+ @staticmethod
+ def use_schema(schema_name: str) -> str:
+ return f'use schema "{schema_name}"'
+
@staticmethod
def get_databases(db_name: Optional[str]) -> str:
db_clause = f'"{db_name}".' if db_name is not None else ""
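The generated statements themselves are simple quoted identifiers; for illustration (database and schema names are made up):

```python
from datahub.ingestion.source.snowflake.snowflake_query import SnowflakeQuery

print(SnowflakeQuery.use_database("MY_DB"))    # use database "MY_DB"
print(SnowflakeQuery.use_schema("MY_SCHEMA"))  # use schema "MY_SCHEMA"
```

The profiler runs both on each new connection when the database lacks a `PUBLIC` schema, so profiling queries always have an explicit schema in scope.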
diff --git a/metadata-ingestion/src/datahub/ingestion/source/state/stateful_ingestion_base.py b/metadata-ingestion/src/datahub/ingestion/source/state/stateful_ingestion_base.py
index 8a448f40e95b4b..61d39b18f523d2 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/state/stateful_ingestion_base.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/state/stateful_ingestion_base.py
@@ -98,7 +98,7 @@ class StatefulIngestionConfigBase(GenericModel, Generic[CustomConfig]):
)
-class StatefulLineageConfigMixin:
+class StatefulLineageConfigMixin(ConfigModel):
enable_stateful_lineage_ingestion: bool = Field(
default=True,
description="Enable stateful lineage ingestion."
From 4de2c24249697fa68831f880fda216ddb46fba3d Mon Sep 17 00:00:00 2001
From: Sumit Patil <91715217+sumitappt@users.noreply.github.com>
Date: Thu, 28 Dec 2023 21:37:57 +0530
Subject: [PATCH 082/540] style(search): Border is too thick for sidebar
(#9528)
---
.../src/app/search/sidebar/BrowseSidebar.tsx | 9 ++++++---
1 file changed, 6 insertions(+), 3 deletions(-)
diff --git a/datahub-web-react/src/app/search/sidebar/BrowseSidebar.tsx b/datahub-web-react/src/app/search/sidebar/BrowseSidebar.tsx
index c16bcdcaf6c727..1731727c14cfc1 100644
--- a/datahub-web-react/src/app/search/sidebar/BrowseSidebar.tsx
+++ b/datahub-web-react/src/app/search/sidebar/BrowseSidebar.tsx
@@ -9,7 +9,6 @@ import useSidebarEntities from './useSidebarEntities';
import { ANTD_GRAY_V2 } from '../../entity/shared/constants';
import { ProfileSidebarResizer } from '../../entity/shared/containers/profile/sidebar/ProfileSidebarResizer';
-
export const MAX_BROWSER_WIDTH = 500;
export const MIN_BROWSWER_WIDTH = 200;
@@ -18,7 +17,6 @@ export const SidebarWrapper = styled.div<{ visible: boolean; width: number }>`
width: ${(props) => (props.visible ? `${props.width}px` : '0')};
min-width: ${(props) => (props.visible ? `${props.width}px` : '0')};
transition: width 250ms ease-in-out;
- border-right: 1px solid ${(props) => props.theme.styles['border-color-base']};
background-color: ${ANTD_GRAY_V2[1]};
background: white;
`;
@@ -53,7 +51,12 @@ const BrowseSidebar = ({ visible }: Props) => {
return (
<>
-