diff --git a/.github/workflows/check-datahub-jars.yml b/.github/workflows/check-datahub-jars.yml
index 7a49f32729ec1f..dc770f7fc83a61 100644
--- a/.github/workflows/check-datahub-jars.yml
+++ b/.github/workflows/check-datahub-jars.yml
@@ -5,12 +5,12 @@ on:
branches:
- master
paths:
- - "metadata-integration"
+ - "metadata-integration/**"
pull_request:
branches:
- "**"
paths:
- - "metadata-integration"
+ - "metadata-integration/**"
release:
types: [published]
@@ -28,15 +28,22 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: acryldata/sane-checkout-action@v3
+ - uses: actions/setup-python@v5
+ with:
+ python-version: "3.10"
+ - uses: actions/cache@v4
+ with:
+ path: |
+ ~/.cache/uv
+ key: ${{ runner.os }}-uv-${{ hashFiles('**/requirements.txt') }}
+ - name: Install dependencies
+ run: ./metadata-ingestion/scripts/install_deps.sh
- name: Set up JDK 17
uses: actions/setup-java@v4
with:
distribution: "zulu"
java-version: 17
- uses: gradle/actions/setup-gradle@v3
- - uses: actions/setup-python@v5
- with:
- python-version: "3.10"
- name: check ${{ matrix.command }} jar
run: |
./gradlew :metadata-integration:java:${{ matrix.command }}:build --info
diff --git a/build.gradle b/build.gradle
index 6893a2ca93d365..a3d807a7333494 100644
--- a/build.gradle
+++ b/build.gradle
@@ -373,6 +373,7 @@ configure(subprojects.findAll {! it.name.startsWith('spark-lineage')}) {
exclude group: "org.slf4j", module: "slf4j-log4j12"
exclude group: "org.slf4j", module: "slf4j-nop"
exclude group: "org.slf4j", module: "slf4j-ext"
+ exclude group: "org.codehaus.jackson", module: "jackson-mapper-asl"
resolutionStrategy.force externalDependency.antlr4Runtime
resolutionStrategy.force externalDependency.antlr4
diff --git a/datahub-frontend/app/auth/AuthModule.java b/datahub-frontend/app/auth/AuthModule.java
index 7fa99ab3cb2621..b95515684f01fc 100644
--- a/datahub-frontend/app/auth/AuthModule.java
+++ b/datahub-frontend/app/auth/AuthModule.java
@@ -27,6 +27,7 @@
import io.datahubproject.metadata.context.EntityRegistryContext;
import io.datahubproject.metadata.context.OperationContext;
import io.datahubproject.metadata.context.OperationContextConfig;
+import io.datahubproject.metadata.context.RetrieverContext;
import io.datahubproject.metadata.context.SearchContext;
import io.datahubproject.metadata.context.ValidationContext;
import java.nio.charset.StandardCharsets;
@@ -195,6 +196,7 @@ protected OperationContext provideOperationContext(
.searchContext(SearchContext.EMPTY)
.entityRegistryContext(EntityRegistryContext.builder().build(EmptyEntityRegistry.EMPTY))
.validationContext(ValidationContext.builder().alternateValidation(false).build())
+ .retrieverContext(RetrieverContext.EMPTY)
.build(systemAuthentication);
}
diff --git a/datahub-frontend/conf/logback.xml b/datahub-frontend/conf/logback.xml
index 78da231b4a71c5..de37c56cba38a7 100644
--- a/datahub-frontend/conf/logback.xml
+++ b/datahub-frontend/conf/logback.xml
@@ -61,7 +61,7 @@
-
+
diff --git a/datahub-upgrade/build.gradle b/datahub-upgrade/build.gradle
index 372b0eb0570b98..a3b2e9ad6b3e22 100644
--- a/datahub-upgrade/build.gradle
+++ b/datahub-upgrade/build.gradle
@@ -60,7 +60,7 @@ dependencies {
// mock internal schema registry
implementation externalDependency.kafkaAvroSerde
implementation externalDependency.kafkaAvroSerializer
- implementation "org.apache.kafka:kafka_2.12:3.7.1"
+ implementation "org.apache.kafka:kafka_2.13:3.7.2"
implementation externalDependency.slf4jApi
compileOnly externalDependency.lombok
diff --git a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/config/SystemUpdateConfig.java b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/config/SystemUpdateConfig.java
index 661717c6309cfc..fdd84da6044f73 100644
--- a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/config/SystemUpdateConfig.java
+++ b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/config/SystemUpdateConfig.java
@@ -13,6 +13,7 @@
import com.linkedin.gms.factory.kafka.common.TopicConventionFactory;
import com.linkedin.gms.factory.kafka.schemaregistry.InternalSchemaRegistryFactory;
import com.linkedin.gms.factory.search.BaseElasticSearchComponentsFactory;
+import com.linkedin.metadata.aspect.CachingAspectRetriever;
import com.linkedin.metadata.config.kafka.KafkaConfiguration;
import com.linkedin.metadata.dao.producer.KafkaEventProducer;
import com.linkedin.metadata.dao.producer.KafkaHealthChecker;
@@ -186,6 +187,7 @@ protected OperationContext javaSystemOperationContext(
components.getIndexConvention(),
RetrieverContext.builder()
.aspectRetriever(entityServiceAspectRetriever)
+ .cachingAspectRetriever(CachingAspectRetriever.EMPTY)
.graphRetriever(systemGraphRetriever)
.searchRetriever(searchServiceSearchRetriever)
.build(),
diff --git a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/restorebackup/RestoreStorageStep.java b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/restorebackup/RestoreStorageStep.java
index 4d53b603c1eaff..1e5cd6cdb24174 100644
--- a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/restorebackup/RestoreStorageStep.java
+++ b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/restorebackup/RestoreStorageStep.java
@@ -180,7 +180,7 @@ private void readerExecutable(ReaderWrapper reader, UpgradeContext context) {
try {
aspectRecord =
EntityUtils.toSystemAspect(
- context.opContext().getRetrieverContext().get(), aspect.toEntityAspect())
+ context.opContext().getRetrieverContext(), aspect.toEntityAspect())
.get()
.getRecordTemplate();
} catch (Exception e) {
diff --git a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/AbstractMCLStep.java b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/AbstractMCLStep.java
index cd7947ce3c11aa..56feffd211bcd7 100644
--- a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/AbstractMCLStep.java
+++ b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/AbstractMCLStep.java
@@ -113,8 +113,7 @@ public Function executable() {
List<Pair<Future<?>, SystemAspect>> futures;
futures =
EntityUtils.toSystemAspectFromEbeanAspects(
- opContext.getRetrieverContext().get(),
- batch.collect(Collectors.toList()))
+ opContext.getRetrieverContext(), batch.collect(Collectors.toList()))
.stream()
.map(
systemAspect -> {
diff --git a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/bootstrapmcps/BootstrapMCPUtil.java b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/bootstrapmcps/BootstrapMCPUtil.java
index 4cc3edff3eb52d..5b807c6c450afb 100644
--- a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/bootstrapmcps/BootstrapMCPUtil.java
+++ b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/bootstrapmcps/BootstrapMCPUtil.java
@@ -100,8 +100,8 @@ static AspectsBatch generateAspectBatch(
.collect(Collectors.toList());
return AspectsBatchImpl.builder()
- .mcps(mcps, auditStamp, opContext.getRetrieverContext().get())
- .retrieverContext(opContext.getRetrieverContext().get())
+ .mcps(mcps, auditStamp, opContext.getRetrieverContext())
+ .retrieverContext(opContext.getRetrieverContext())
.build();
}
diff --git a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/dataprocessinstances/BackfillDataProcessInstancesHasRunEventsStep.java b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/dataprocessinstances/BackfillDataProcessInstancesHasRunEventsStep.java
index 55cdcae931ab5b..1bdea10123999a 100644
--- a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/dataprocessinstances/BackfillDataProcessInstancesHasRunEventsStep.java
+++ b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/dataprocessinstances/BackfillDataProcessInstancesHasRunEventsStep.java
@@ -2,6 +2,8 @@
import static com.linkedin.metadata.Constants.*;
+import com.fasterxml.jackson.databind.node.JsonNodeFactory;
+import com.fasterxml.jackson.databind.node.ObjectNode;
import com.google.common.base.Throwables;
import com.linkedin.common.urn.Urn;
import com.linkedin.datahub.upgrade.UpgradeContext;
@@ -23,8 +25,6 @@
import java.util.Set;
import java.util.function.Function;
import lombok.extern.slf4j.Slf4j;
-import org.codehaus.jackson.node.JsonNodeFactory;
-import org.codehaus.jackson.node.ObjectNode;
import org.opensearch.action.search.SearchRequest;
import org.opensearch.action.search.SearchResponse;
import org.opensearch.client.RequestOptions;
diff --git a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/schemafield/GenerateSchemaFieldsFromSchemaMetadataStep.java b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/schemafield/GenerateSchemaFieldsFromSchemaMetadataStep.java
index 55bc8edbf6a768..de03538907432f 100644
--- a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/schemafield/GenerateSchemaFieldsFromSchemaMetadataStep.java
+++ b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/schemafield/GenerateSchemaFieldsFromSchemaMetadataStep.java
@@ -168,13 +168,13 @@ public Function executable() {
AspectsBatch aspectsBatch =
AspectsBatchImpl.builder()
- .retrieverContext(opContext.getRetrieverContext().get())
+ .retrieverContext(opContext.getRetrieverContext())
.items(
batch
.flatMap(
ebeanAspectV2 ->
EntityUtils.toSystemAspectFromEbeanAspects(
- opContext.getRetrieverContext().get(),
+ opContext.getRetrieverContext(),
Set.of(ebeanAspectV2))
.stream())
.map(
@@ -189,11 +189,7 @@ public Function executable() {
.auditStamp(systemAspect.getAuditStamp())
.systemMetadata(
withAppSource(systemAspect.getSystemMetadata()))
- .build(
- opContext
- .getRetrieverContext()
- .get()
- .getAspectRetriever()))
+ .build(opContext.getAspectRetriever()))
.collect(Collectors.toList()))
.build();
diff --git a/datahub-upgrade/src/test/java/com/linkedin/datahub/upgrade/schemafield/GenerateSchemaFieldsFromSchemaMetadataStepTest.java b/datahub-upgrade/src/test/java/com/linkedin/datahub/upgrade/schemafield/GenerateSchemaFieldsFromSchemaMetadataStepTest.java
index 3a2728b4e1d3d6..04b1095e770e0e 100644
--- a/datahub-upgrade/src/test/java/com/linkedin/datahub/upgrade/schemafield/GenerateSchemaFieldsFromSchemaMetadataStepTest.java
+++ b/datahub-upgrade/src/test/java/com/linkedin/datahub/upgrade/schemafield/GenerateSchemaFieldsFromSchemaMetadataStepTest.java
@@ -22,7 +22,6 @@
import com.linkedin.upgrade.DataHubUpgradeState;
import io.datahubproject.metadata.context.OperationContext;
import io.datahubproject.metadata.context.RetrieverContext;
-import java.util.Optional;
import java.util.stream.Stream;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
@@ -48,7 +47,7 @@ public void setup() {
step =
new GenerateSchemaFieldsFromSchemaMetadataStep(
mockOpContext, mockEntityService, mockAspectDao, 10, 100, 1000);
- when(mockOpContext.getRetrieverContext()).thenReturn(Optional.of(mockRetrieverContext));
+ when(mockOpContext.getRetrieverContext()).thenReturn(mockRetrieverContext);
}
/** Test to verify the correct step ID is returned. */
diff --git a/datahub-web-react/src/app/entity/schemaField/SchemaFieldPropertiesEntity.tsx b/datahub-web-react/src/app/entity/schemaField/SchemaFieldPropertiesEntity.tsx
index 2c59c476195d0b..fdc0e33d77a057 100644
--- a/datahub-web-react/src/app/entity/schemaField/SchemaFieldPropertiesEntity.tsx
+++ b/datahub-web-react/src/app/entity/schemaField/SchemaFieldPropertiesEntity.tsx
@@ -35,11 +35,9 @@ export class SchemaFieldPropertiesEntity implements Entity {
// Currently unused.
getPathName = () => 'schemaField';
- // Currently unused.
- getEntityName = () => 'schemaField';
+ getEntityName = () => 'Column';
- // Currently unused.
- getCollectionName = () => 'schemaFields';
+ getCollectionName = () => 'Columns';
// Currently unused.
renderProfile = (_: string) => <></>;
diff --git a/datahub-web-react/src/app/entity/shared/components/styled/DeprecationPill.tsx b/datahub-web-react/src/app/entity/shared/components/styled/DeprecationPill.tsx
index 08e9636f760de5..613264709ac23c 100644
--- a/datahub-web-react/src/app/entity/shared/components/styled/DeprecationPill.tsx
+++ b/datahub-web-react/src/app/entity/shared/components/styled/DeprecationPill.tsx
@@ -19,8 +19,6 @@ const DeprecatedContainer = styled.div`
justify-content: center;
align-items: center;
color: #cd0d24;
- margin-left: 0px;
- margin-right: 8px;
padding-top: 8px;
padding-bottom: 8px;
padding-right: 4px;
diff --git a/datahub-web-react/src/app/entity/shared/components/styled/StructuredProperty/MultipleStringInput.tsx b/datahub-web-react/src/app/entity/shared/components/styled/StructuredProperty/MultipleOpenEndedInput.tsx
similarity index 87%
rename from datahub-web-react/src/app/entity/shared/components/styled/StructuredProperty/MultipleStringInput.tsx
rename to datahub-web-react/src/app/entity/shared/components/styled/StructuredProperty/MultipleOpenEndedInput.tsx
index fe6c0bbb99ce22..fe6cd1115419ae 100644
--- a/datahub-web-react/src/app/entity/shared/components/styled/StructuredProperty/MultipleStringInput.tsx
+++ b/datahub-web-react/src/app/entity/shared/components/styled/StructuredProperty/MultipleOpenEndedInput.tsx
@@ -4,6 +4,8 @@ import React from 'react';
import styled from 'styled-components';
import { ANTD_GRAY_V2 } from '../../../constants';
+const MultiStringWrapper = styled.div``;
+
const StyledInput = styled(Input)`
width: 75%;
min-width: 350px;
@@ -29,10 +31,11 @@ const DeleteButton = styled(Button)`
interface Props {
selectedValues: any[];
+ inputType?: string;
updateSelectedValues: (values: any[]) => void;
}
-export default function MultipleStringInput({ selectedValues, updateSelectedValues }: Props) {
+export default function MultipleOpenEndedInput({ selectedValues, updateSelectedValues, inputType = 'text' }: Props) {
function updateInput(text: string, index: number) {
const updatedValues =
selectedValues.length > 0 ? selectedValues.map((value, i) => (i === index ? text : value)) : [text];
@@ -53,14 +56,14 @@ export default function MultipleStringInput({ selectedValues, updateSelectedValu
}
return (
-
+
{selectedValues.length > 1 &&
selectedValues.map((selectedValue, index) => {
const key = `${index}`;
return (
updateInput(e.target.value, index)}
/>
@@ -70,7 +73,7 @@ export default function MultipleStringInput({ selectedValues, updateSelectedValu
})}
{selectedValues.length <= 1 && (
updateInput(e.target.value, 0)}
/>
@@ -78,6 +81,6 @@ export default function MultipleStringInput({ selectedValues, updateSelectedValu
+ Add More
-
+
);
}
diff --git a/datahub-web-react/src/app/entity/shared/components/styled/StructuredProperty/NumberInput.tsx b/datahub-web-react/src/app/entity/shared/components/styled/StructuredProperty/NumberInput.tsx
index c56d85db7ef712..f4cedc4cf80ee5 100644
--- a/datahub-web-react/src/app/entity/shared/components/styled/StructuredProperty/NumberInput.tsx
+++ b/datahub-web-react/src/app/entity/shared/components/styled/StructuredProperty/NumberInput.tsx
@@ -1,7 +1,9 @@
import { Input } from 'antd';
import React, { ChangeEvent } from 'react';
import styled from 'styled-components';
+import { PropertyCardinality } from '@src/types.generated';
import { ANTD_GRAY_V2 } from '../../../constants';
+import MultipleOpenEndedInput from './MultipleOpenEndedInput';
const StyledInput = styled(Input)`
border: 1px solid ${ANTD_GRAY_V2[6]};
@@ -10,15 +12,31 @@ const StyledInput = styled(Input)`
interface Props {
selectedValues: any[];
+ cardinality?: PropertyCardinality | null;
updateSelectedValues: (values: string[] | number[]) => void;
}
-export default function NumberInput({ selectedValues, updateSelectedValues }: Props) {
+export default function NumberInput({ selectedValues, cardinality, updateSelectedValues }: Props) {
function updateInput(event: ChangeEvent<HTMLInputElement>) {
const number = Number(event.target.value);
updateSelectedValues([number]);
}
+ function updateMultipleValues(values: string[] | number[]) {
+ const numbers = values.map((v) => Number(v));
+ updateSelectedValues(numbers);
+ }
+
+ if (cardinality === PropertyCardinality.Multiple) {
+ return (
+ <MultipleOpenEndedInput
+ inputType="number"
+ selectedValues={selectedValues}
+ updateSelectedValues={updateMultipleValues}
+ />
+ );
+ }
+
return (
;
+ return ;
}
- return ;
+ return (
+
+ );
}
diff --git a/datahub-web-react/src/app/entity/shared/components/styled/StructuredProperty/StructuredPropertyInput.tsx b/datahub-web-react/src/app/entity/shared/components/styled/StructuredProperty/StructuredPropertyInput.tsx
index 894a304335b0f6..305347ee0bce80 100644
--- a/datahub-web-react/src/app/entity/shared/components/styled/StructuredProperty/StructuredPropertyInput.tsx
+++ b/datahub-web-react/src/app/entity/shared/components/styled/StructuredProperty/StructuredPropertyInput.tsx
@@ -60,7 +60,11 @@ export default function StructuredPropertyInput({
)}
{!allowedValues && valueType.info.type === StdDataType.Number && (
-
+
)}
{!allowedValues && valueType.info.type === StdDataType.Urn && (
{
+ it('should not return parent rows when there are none', () => {
+ const propertyRows = [
+ { displayName: 'test1', qualifiedName: 'test1' },
+ { displayName: 'test2', qualifiedName: 'test2' },
+ ];
+ expect(identifyAndAddParentRows(propertyRows)).toMatchObject([]);
+ });
+
+ it('should not return parent rows when another row starts with the same letters but is a different token', () => {
+ const propertyRows = [
+ { displayName: 'test1', qualifiedName: 'testing.one' },
+ { displayName: 'test2', qualifiedName: 'testingAgain.two' },
+ ];
+ expect(identifyAndAddParentRows(propertyRows)).toMatchObject([]);
+ });
+
+ it('should return parent rows properly', () => {
+ const propertyRows = [
+ { displayName: 'test1', qualifiedName: 'testing.one' },
+ { displayName: 'test2', qualifiedName: 'testing.two' },
+ { displayName: 'test3', qualifiedName: 'testing.three' },
+ ];
+ expect(identifyAndAddParentRows(propertyRows)).toMatchObject([
+ { displayName: 'testing', qualifiedName: 'testing', childrenCount: 3 },
+ ]);
+ });
+
+ it('should return parent rows properly with multiple layers of nesting', () => {
+ const propertyRows = [
+ { displayName: 'test1', qualifiedName: 'testing.one.two.a.1' },
+ { displayName: 'test1', qualifiedName: 'testing.one.two.a.2' },
+ { displayName: 'test1', qualifiedName: 'testing.one.two.b' },
+ { displayName: 'test1', qualifiedName: 'testing.one.three' },
+ { displayName: 'test2', qualifiedName: 'testing.two.c.d' },
+ { displayName: 'test3', qualifiedName: 'testing.three' },
+ { displayName: 'test3', qualifiedName: 'testParent' },
+ ];
+ expect(identifyAndAddParentRows(propertyRows)).toMatchObject([
+ { displayName: 'testing', qualifiedName: 'testing', isParentRow: true, childrenCount: 6 },
+ { displayName: 'testing.one', qualifiedName: 'testing.one', isParentRow: true, childrenCount: 4 },
+ { displayName: 'testing.one.two', qualifiedName: 'testing.one.two', isParentRow: true, childrenCount: 3 },
+ {
+ displayName: 'testing.one.two.a',
+ qualifiedName: 'testing.one.two.a',
+ isParentRow: true,
+ childrenCount: 2,
+ },
+ ]);
+ });
+
+ it('should return parent rows properly with multiple layers of nesting regardless of order', () => {
+ const propertyRows = [
+ { displayName: 'test1', qualifiedName: 'testing.one.two.a.1' },
+ { displayName: 'test3', qualifiedName: 'testParent' },
+ { displayName: 'test1', qualifiedName: 'testing.one.three' },
+ { displayName: 'test2', qualifiedName: 'testing.two.c.d' },
+ { displayName: 'test1', qualifiedName: 'testing.one.two.b' },
+ { displayName: 'test3', qualifiedName: 'testing.three' },
+ { displayName: 'test1', qualifiedName: 'testing.one.two.a.2' },
+ ];
+ expect(identifyAndAddParentRows(propertyRows)).toMatchObject([
+ { displayName: 'testing', qualifiedName: 'testing', isParentRow: true, childrenCount: 6 },
+ { displayName: 'testing.one', qualifiedName: 'testing.one', isParentRow: true, childrenCount: 4 },
+ { displayName: 'testing.one.two', qualifiedName: 'testing.one.two', isParentRow: true, childrenCount: 3 },
+ {
+ displayName: 'testing.one.two.a',
+ qualifiedName: 'testing.one.two.a',
+ isParentRow: true,
+ childrenCount: 2,
+ },
+ ]);
+ });
+
+ it('should return parent rows properly with simpler layers of nesting', () => {
+ const propertyRows = [
+ { displayName: 'test2', qualifiedName: 'testing.two.c.d' },
+ { displayName: 'test3', qualifiedName: 'testing.three' },
+ { displayName: 'test3', qualifiedName: 'testParent' },
+ ];
+ expect(identifyAndAddParentRows(propertyRows)).toMatchObject([
+ { displayName: 'testing', qualifiedName: 'testing', isParentRow: true, childrenCount: 2 },
+ ]);
+ });
+});
diff --git a/datahub-web-react/src/app/entity/shared/tabs/Properties/useStructuredProperties.tsx b/datahub-web-react/src/app/entity/shared/tabs/Properties/useStructuredProperties.tsx
index 18ee6bb18da3d3..60d0aac30eb4ce 100644
--- a/datahub-web-react/src/app/entity/shared/tabs/Properties/useStructuredProperties.tsx
+++ b/datahub-web-react/src/app/entity/shared/tabs/Properties/useStructuredProperties.tsx
@@ -122,10 +122,10 @@ export function identifyAndAddParentRows(rows?: Array<PropertyRow>): Array<PropertyRow> {
- const currentCount = qualifiedNames.filter((name) => name.startsWith(token)).length;
+ const currentCount = qualifiedNames.filter((name) => name.startsWith(`${token}.`)).length;
- // If we're at the beginning of the path and there is no nesting, break
- if (index === 0 && currentCount === 1) {
+ // If there's only one child, don't nest it
+ if (currentCount === 1) {
break;
}
diff --git a/datahub-web-react/src/app/govern/structuredProperties/AllowedValuesDrawer.tsx b/datahub-web-react/src/app/govern/structuredProperties/AllowedValuesDrawer.tsx
index f1dccb6db0c22c..16c07e8257cd9b 100644
--- a/datahub-web-react/src/app/govern/structuredProperties/AllowedValuesDrawer.tsx
+++ b/datahub-web-react/src/app/govern/structuredProperties/AllowedValuesDrawer.tsx
@@ -127,6 +127,7 @@ const AllowedValuesDrawer = ({
setTimeout(() => scrollToBottom(), 0);
}}
color="violet"
+ type="button"
>
Add
diff --git a/datahub-web-react/src/app/govern/structuredProperties/DisplayPreferences.tsx b/datahub-web-react/src/app/govern/structuredProperties/DisplayPreferences.tsx
index 260c91ef93207c..95823de0f27c40 100644
--- a/datahub-web-react/src/app/govern/structuredProperties/DisplayPreferences.tsx
+++ b/datahub-web-react/src/app/govern/structuredProperties/DisplayPreferences.tsx
@@ -153,7 +153,8 @@ const DisplayPreferences = ({
clickable={false}
/>
is already being shown on asset previews, but only one property is allowed at a time.
- Do you want to replace the current property? This will hide PropVal on all asset previews.
+ Do you want to replace the current property? This will hide {getDisplayName(badgeProperty)}{' '}
+ on all asset previews.
}
/>
diff --git a/datahub-web-react/src/app/govern/structuredProperties/StructuredPropsDrawer.tsx b/datahub-web-react/src/app/govern/structuredProperties/StructuredPropsDrawer.tsx
index 4b2bbaaf96826b..debffeac7d583c 100644
--- a/datahub-web-react/src/app/govern/structuredProperties/StructuredPropsDrawer.tsx
+++ b/datahub-web-react/src/app/govern/structuredProperties/StructuredPropsDrawer.tsx
@@ -192,6 +192,7 @@ const StructuredPropsDrawer = ({
form.validateFields().then(() => {
const createInput = {
...form.getFieldsValue(),
+ qualifiedName: form.getFieldValue('qualifiedName') || undefined,
valueType: valueTypes.find((type) => type.value === form.getFieldValue('valueType'))?.urn,
allowedValues,
cardinality,
diff --git a/datahub-web-react/src/app/govern/structuredProperties/cacheUtils.ts b/datahub-web-react/src/app/govern/structuredProperties/cacheUtils.ts
index 590189d06e6b16..c8052784c6972a 100644
--- a/datahub-web-react/src/app/govern/structuredProperties/cacheUtils.ts
+++ b/datahub-web-react/src/app/govern/structuredProperties/cacheUtils.ts
@@ -17,7 +17,6 @@ const addToCache = (existingProperties, newProperty) => {
allowedValues: newProperty.definition.allowedValues,
created: newProperty.definition.created,
lastModified: newProperty.definition.lastModified,
- filterStatus: newProperty.definition.filterStatus,
},
settings: {
isHidden: newProperty.settings.isHidden,
diff --git a/datahub-web-react/src/app/preview/DefaultPreviewCard.tsx b/datahub-web-react/src/app/preview/DefaultPreviewCard.tsx
index 4c8948a6664e07..a19862e83ae510 100644
--- a/datahub-web-react/src/app/preview/DefaultPreviewCard.tsx
+++ b/datahub-web-react/src/app/preview/DefaultPreviewCard.tsx
@@ -68,6 +68,7 @@ const TitleContainer = styled.div`
const EntityTitleContainer = styled.div`
display: flex;
align-items: center;
+ gap: 8px;
`;
const EntityTitle = styled(Typography.Text)<{ $titleSizePx?: number }>`
@@ -77,7 +78,6 @@ const EntityTitle = styled(Typography.Text)<{ $titleSizePx?: number }>`
}
&&& {
- margin-right 8px;
font-size: ${(props) => props.$titleSizePx || 16}px;
font-weight: 600;
vertical-align: middle;
diff --git a/datahub-web-react/src/graphql/search.graphql b/datahub-web-react/src/graphql/search.graphql
index ce0fde27f4c425..58c9a51f3d7e90 100644
--- a/datahub-web-react/src/graphql/search.graphql
+++ b/datahub-web-react/src/graphql/search.graphql
@@ -963,6 +963,7 @@ fragment facetFields on FacetMetadata {
entity {
urn
type
+ ...entityDisplayNameFields
... on Tag {
name
properties {
diff --git a/docker/kafka-setup/Dockerfile b/docker/kafka-setup/Dockerfile
index a11f823f5efa55..324357b942e8e1 100644
--- a/docker/kafka-setup/Dockerfile
+++ b/docker/kafka-setup/Dockerfile
@@ -22,7 +22,7 @@ ARG ALPINE_REPO_URL
ARG APACHE_DOWNLOAD_URL
ARG GITHUB_REPO_URL
-ENV KAFKA_VERSION=3.7.1
+ENV KAFKA_VERSION=3.7.2
ENV SCALA_VERSION=2.13
LABEL name="kafka" version=${KAFKA_VERSION}
diff --git a/docs-website/vercel-setup.sh b/docs-website/vercel-setup.sh
index 4bb40eaddf4775..e9ba87b75be779 100755
--- a/docs-website/vercel-setup.sh
+++ b/docs-website/vercel-setup.sh
@@ -5,8 +5,8 @@ set -euxo pipefail
./metadata-ingestion/scripts/install_deps.sh
# Set up java version for gradle
-yum install java-17-amazon-corretto -y
-java --version
+yum install java-17-amazon-corretto-devel -y
+javac --version
# Build python from source.
# Amazon Linux 2 has Python 3.8, but its version of OpenSSL is super old and hence it
diff --git a/docs/cli.md b/docs/cli.md
index c633b7f4a38ad3..1c38077d0d12ef 100644
--- a/docs/cli.md
+++ b/docs/cli.md
@@ -115,6 +115,19 @@ datahub ingest -c ./examples/recipes/example_to_datahub_rest.dhub.yaml --dry-run
datahub ingest -c ./examples/recipes/example_to_datahub_rest.dhub.yaml -n
```
+#### ingest --list-source-runs
+
+The `--list-source-runs` option of the `ingest` command lists previous ingestion runs, displaying each run's ID, source name,
+start time, status, and source URN. You can filter the results with the `--urn` option (match a specific source URN) or the
+`--source` option (match by source name; partial or complete matches are supported), as shown below.
+
+```shell
+# List all ingestion runs
+datahub ingest --list-source-runs
+# Filter runs by a source name containing "demo"
+datahub ingest --list-source-runs --source "demo"
+```
+
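A minimal sketch of URN-based filtering, assuming ingestion sources are identified by `dataHubIngestionSource` URNs (the URN value below is illustrative):

```shell
# Filter runs by the URN of a specific ingestion source (illustrative URN)
datahub ingest --list-source-runs --urn "urn:li:dataHubIngestionSource:demo-ingestion-source"
```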
#### ingest --preview
The `--preview` option of the `ingest` command performs all of the ingestion steps, but limits the processing to only the first 10 workunits produced by the source.
diff --git a/docs/how/delete-metadata.md b/docs/how/delete-metadata.md
index f720a66ce57652..e36940bf398356 100644
--- a/docs/how/delete-metadata.md
+++ b/docs/how/delete-metadata.md
@@ -4,7 +4,7 @@
To follow this guide, you'll need the [DataHub CLI](../cli.md).
:::
-There are a two ways to delete metadata from DataHub:
+There are two ways to delete metadata from DataHub:
1. Delete metadata attached to entities by providing a specific urn or filters that identify a set of urns (delete CLI).
2. Delete metadata created by a single ingestion run (rollback).
@@ -233,7 +233,13 @@ To view the ids of the most recent set of ingestion batches, execute
datahub ingest list-runs
```
-That will print out a table of all the runs. Once you have an idea of which run you want to roll back, run
+That will print out a table of all the runs. To see run statuses or to filter runs by URN or source name, run
+
+```shell
+datahub ingest list-source-runs
+```
+
+Once you have an idea of which run you want to roll back, run
```shell
datahub ingest show --run-id
diff --git a/docs/lineage/airflow.md b/docs/lineage/airflow.md
index 72b5cbf57592d3..345213a0672d37 100644
--- a/docs/lineage/airflow.md
+++ b/docs/lineage/airflow.md
@@ -339,6 +339,37 @@ TypeError: on_task_instance_success() missing 3 required positional arguments: '
The solution is to upgrade `acryl-datahub-airflow-plugin>=0.12.0.4` or upgrade `pluggy>=1.2.0`. See this [PR](https://github.com/datahub-project/datahub/pull/9365) for details.
+### Disabling the DataHub Plugin v2
+
+There are two ways to disable the DataHub Plugin v2:
+
+#### 1. Disable via Configuration
+
+Set the `datahub.enabled` configuration property to `False` in the `airflow.cfg` file and restart the Airflow environment to reload the configuration and disable the plugin.
+
+```ini title="airflow.cfg"
+[datahub]
+enabled = False
+```
+
+#### 2. Disable via Airflow Variable (Kill-Switch)
+
+If a restart is not possible and you need a faster way to disable the plugin, you can use the kill-switch. Create and set the `datahub_airflow_plugin_disable_listener` Airflow variable to `true`. This ensures that the listener won't process anything.
+
+##### Command Line
+
+```shell
+airflow variables set datahub_airflow_plugin_disable_listener true
+```
+
+##### Airflow UI
+
+1. Go to Admin -> Variables.
+2. Click the "+" symbol to create a new variable.
+3. Set the key to `datahub_airflow_plugin_disable_listener` and the value to `true`.
+
+This will immediately disable the plugin without requiring a restart.
+
## Compatibility
We no longer officially support Airflow <2.3. However, you can use older versions of `acryl-datahub-airflow-plugin` with older versions of Airflow.
diff --git a/entity-registry/src/main/java/com/linkedin/metadata/aspect/CachingAspectRetriever.java b/entity-registry/src/main/java/com/linkedin/metadata/aspect/CachingAspectRetriever.java
index 77e799f752455c..375dd8cf8911e1 100644
--- a/entity-registry/src/main/java/com/linkedin/metadata/aspect/CachingAspectRetriever.java
+++ b/entity-registry/src/main/java/com/linkedin/metadata/aspect/CachingAspectRetriever.java
@@ -1,4 +1,38 @@
package com.linkedin.metadata.aspect;
+import com.linkedin.common.urn.Urn;
+import com.linkedin.entity.Aspect;
+import com.linkedin.metadata.models.registry.EmptyEntityRegistry;
+import com.linkedin.metadata.models.registry.EntityRegistry;
+import java.util.Collections;
+import java.util.Map;
+import java.util.Set;
+import javax.annotation.Nonnull;
+
/** Responses can be cached based on application.yaml caching configuration for the EntityClient */
-public interface CachingAspectRetriever extends AspectRetriever {}
+public interface CachingAspectRetriever extends AspectRetriever {
+
+ CachingAspectRetriever EMPTY = new EmptyAspectRetriever();
+
+ class EmptyAspectRetriever implements CachingAspectRetriever {
+ @Nonnull
+ @Override
+ public Map<Urn, Map<String, Aspect>> getLatestAspectObjects(
+ Set<Urn> urns, Set<String> aspectNames) {
+ return Collections.emptyMap();
+ }
+
+ @Nonnull
+ @Override
+ public Map<Urn, Map<String, SystemAspect>> getLatestSystemAspects(
+ Map<Urn, Set<String>> urnAspectNames) {
+ return Collections.emptyMap();
+ }
+
+ @Nonnull
+ @Override
+ public EntityRegistry getEntityRegistry() {
+ return EmptyEntityRegistry.EMPTY;
+ }
+ }
+}
diff --git a/entity-registry/src/main/java/com/linkedin/metadata/aspect/GraphRetriever.java b/entity-registry/src/main/java/com/linkedin/metadata/aspect/GraphRetriever.java
index f6858e7da4ba63..30a2c1eb9df8c1 100644
--- a/entity-registry/src/main/java/com/linkedin/metadata/aspect/GraphRetriever.java
+++ b/entity-registry/src/main/java/com/linkedin/metadata/aspect/GraphRetriever.java
@@ -4,6 +4,7 @@
import com.linkedin.metadata.query.filter.Filter;
import com.linkedin.metadata.query.filter.RelationshipFilter;
import com.linkedin.metadata.query.filter.SortCriterion;
+import java.util.Collections;
import java.util.List;
import java.util.function.Function;
import javax.annotation.Nonnull;
@@ -97,4 +98,26 @@ default void consumeRelatedEntities(
}
}
}
+
+ GraphRetriever EMPTY = new EmptyGraphRetriever();
+
+ class EmptyGraphRetriever implements GraphRetriever {
+
+ @Nonnull
+ @Override
+ public RelatedEntitiesScrollResult scrollRelatedEntities(
+ @Nullable List<String> sourceTypes,
+ @Nonnull Filter sourceEntityFilter,
+ @Nullable List<String> destinationTypes,
+ @Nonnull Filter destinationEntityFilter,
+ @Nonnull List<String> relationshipTypes,
+ @Nonnull RelationshipFilter relationshipFilter,
+ @Nonnull List<SortCriterion> sortCriterion,
+ @Nullable String scrollId,
+ int count,
+ @Nullable Long startTimeMillis,
+ @Nullable Long endTimeMillis) {
+ return new RelatedEntitiesScrollResult(0, 0, null, Collections.emptyList());
+ }
+ }
}
diff --git a/entity-registry/src/main/java/com/linkedin/metadata/aspect/patch/builder/DataJobInputOutputPatchBuilder.java b/entity-registry/src/main/java/com/linkedin/metadata/aspect/patch/builder/DataJobInputOutputPatchBuilder.java
index 6fffb17521ddb7..14fc92a1bf3c86 100644
--- a/entity-registry/src/main/java/com/linkedin/metadata/aspect/patch/builder/DataJobInputOutputPatchBuilder.java
+++ b/entity-registry/src/main/java/com/linkedin/metadata/aspect/patch/builder/DataJobInputOutputPatchBuilder.java
@@ -15,6 +15,8 @@
import com.linkedin.metadata.aspect.patch.PatchOperationType;
import com.linkedin.metadata.graph.LineageDirection;
import javax.annotation.Nonnull;
+import javax.annotation.Nullable;
+import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.tuple.ImmutableTriple;
public class DataJobInputOutputPatchBuilder
@@ -24,6 +26,7 @@ public class DataJobInputOutputPatchBuilder
private static final String OUTPUT_DATASET_EDGES_PATH_START = "/outputDatasetEdges/";
private static final String INPUT_DATASET_FIELDS_PATH_START = "/inputDatasetFields/";
private static final String OUTPUT_DATASET_FIELDS_PATH_START = "/outputDatasetFields/";
+ private static final String FINE_GRAINED_PATH_START = "/fineGrainedLineages/";
// Simplified with just Urn
public DataJobInputOutputPatchBuilder addInputDatajobEdge(@Nonnull DataJobUrn dataJobUrn) {
@@ -136,6 +139,103 @@ public DataJobInputOutputPatchBuilder addEdge(
return this;
}
+ /**
+ * Adds a field as a fine grained upstream
+ *
+ * @param upstreamSchemaField a schema field to be marked as upstream, format:
+ * urn:li:schemaField(DATASET_URN, COLUMN NAME)
+ * @param confidenceScore optional, confidence score for the lineage edge. Defaults to 1.0 for
+ * full confidence
+ * @param transformationOperation string operation type that describes the transformation
+ * operation happening in the lineage edge
+ * @param downstreamSchemaField the downstream schema field this upstream is derived from, format:
+ * urn:li:schemaField(DATASET_URN, COLUMN NAME)
+ * @param queryUrn query urn the relationship is derived from
+ * @return this builder
+ */
+ public DataJobInputOutputPatchBuilder addFineGrainedUpstreamField(
+ @Nonnull Urn upstreamSchemaField,
+ @Nullable Float confidenceScore,
+ @Nonnull String transformationOperation,
+ @Nonnull Urn downstreamSchemaField,
+ @Nullable Urn queryUrn) {
+ Float finalConfidenceScore = getConfidenceScoreOrDefault(confidenceScore);
+ String finalQueryUrn;
+ if (queryUrn == null || StringUtils.isBlank(queryUrn.toString())) {
+ finalQueryUrn = "NONE";
+ } else {
+ finalQueryUrn = queryUrn.toString();
+ }
+
+ ObjectNode fineGrainedLineageNode = instance.objectNode();
+ fineGrainedLineageNode.put("confidenceScore", instance.numberNode(finalConfidenceScore));
+ pathValues.add(
+ ImmutableTriple.of(
+ PatchOperationType.ADD.getValue(),
+ FINE_GRAINED_PATH_START
+ + transformationOperation
+ + "/"
+ + encodeValueUrn(downstreamSchemaField)
+ + "/"
+ + finalQueryUrn
+ + "/"
+ + encodeValueUrn(upstreamSchemaField),
+ fineGrainedLineageNode));
+
+ return this;
+ }
+
+ private Float getConfidenceScoreOrDefault(@Nullable Float confidenceScore) {
+ float finalConfidenceScore;
+ if (confidenceScore != null && confidenceScore > 0 && confidenceScore <= 1.0f) {
+ finalConfidenceScore = confidenceScore;
+ } else {
+ finalConfidenceScore = 1.0f;
+ }
+
+ return finalConfidenceScore;
+ }
+
+ /**
+ * Removes a field as a fine grained upstream
+ *
+ * @param upstreamSchemaField a schema field to be marked as upstream, format:
+ * urn:li:schemaField(DATASET_URN, COLUMN NAME)
+ * @param transformationOperation string operation type that describes the transformation
+ * operation happening in the lineage edge
+ * @param downstreamSchemaField the downstream schema field this upstream is derived from, format:
+ * urn:li:schemaField(DATASET_URN, COLUMN NAME)
+ * @param queryUrn query urn the relationship is derived from
+ * @return this builder
+ */
+ public DataJobInputOutputPatchBuilder removeFineGrainedUpstreamField(
+ @Nonnull Urn upstreamSchemaField,
+ @Nonnull String transformationOperation,
+ @Nonnull Urn downstreamSchemaField,
+ @Nullable Urn queryUrn) {
+
+ String finalQueryUrn;
+ if (queryUrn == null || StringUtils.isBlank(queryUrn.toString())) {
+ finalQueryUrn = "NONE";
+ } else {
+ finalQueryUrn = queryUrn.toString();
+ }
+ pathValues.add(
+ ImmutableTriple.of(
+ PatchOperationType.REMOVE.getValue(),
+ FINE_GRAINED_PATH_START
+ + transformationOperation
+ + "/"
+ + encodeValueUrn(downstreamSchemaField)
+ + "/"
+ + finalQueryUrn
+ + "/"
+ + encodeValueUrn(upstreamSchemaField),
+ null));
+
+ return this;
+ }
+
public DataJobInputOutputPatchBuilder removeEdge(
@Nonnull Edge edge, @Nonnull LineageDirection direction) {
String path = getEdgePath(edge, direction);
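To illustrate how the new fine-grained lineage builder methods above might be called, here is a minimal, hypothetical sketch; the data job and schema-field URNs are illustrative, and the `urn(...)` and `build()` calls are assumed to come from the existing patch-builder base class.

```java
import com.linkedin.common.urn.Urn;
import com.linkedin.common.urn.UrnUtils;
import com.linkedin.metadata.aspect.patch.builder.DataJobInputOutputPatchBuilder;
import com.linkedin.mxe.MetadataChangeProposal;

public class FineGrainedLineagePatchSketch {
  public static void main(String[] args) {
    // Illustrative URNs following the urn:li:schemaField:(DATASET_URN, COLUMN_NAME) shape from the Javadoc.
    Urn upstreamField =
        UrnUtils.getUrn(
            "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,fct_users,PROD),user_id)");
    Urn downstreamField =
        UrnUtils.getUrn(
            "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,agg_users,PROD),user_id)");

    DataJobInputOutputPatchBuilder builder = new DataJobInputOutputPatchBuilder();
    // Target data job; urn(...) is assumed to be inherited from the shared patch-builder base class.
    builder.urn(UrnUtils.getUrn("urn:li:dataJob:(urn:li:dataFlow:(airflow,users_dag,PROD),agg_task)"));

    // A null confidence score defaults to 1.0f; a null query urn is keyed under "NONE".
    builder.addFineGrainedUpstreamField(upstreamField, null, "TRANSFORM", downstreamField, null);

    // The companion method removes the same coordinates (shown here only for illustration).
    builder.removeFineGrainedUpstreamField(upstreamField, "TRANSFORM", downstreamField, null);

    // build() is assumed from the base class and yields the MetadataChangeProposal carrying the JSON patch.
    MetadataChangeProposal mcp = builder.build();
    System.out.println(mcp);
  }
}
```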
diff --git a/entity-registry/src/main/java/com/linkedin/metadata/aspect/patch/builder/UpstreamLineagePatchBuilder.java b/entity-registry/src/main/java/com/linkedin/metadata/aspect/patch/builder/UpstreamLineagePatchBuilder.java
index 08182761aeb03f..d0a46a35d51820 100644
--- a/entity-registry/src/main/java/com/linkedin/metadata/aspect/patch/builder/UpstreamLineagePatchBuilder.java
+++ b/entity-registry/src/main/java/com/linkedin/metadata/aspect/patch/builder/UpstreamLineagePatchBuilder.java
@@ -142,7 +142,7 @@ public UpstreamLineagePatchBuilder removeFineGrainedUpstreamField(
FINE_GRAINED_PATH_START
+ transformationOperation
+ "/"
- + downstreamSchemaField
+ + encodeValueUrn(downstreamSchemaField)
+ "/"
+ finalQueryUrn
+ "/"
diff --git a/entity-registry/src/main/java/com/linkedin/metadata/aspect/patch/template/FineGrainedLineageTemplateHelper.java b/entity-registry/src/main/java/com/linkedin/metadata/aspect/patch/template/FineGrainedLineageTemplateHelper.java
new file mode 100644
index 00000000000000..1f6a58c52ba248
--- /dev/null
+++ b/entity-registry/src/main/java/com/linkedin/metadata/aspect/patch/template/FineGrainedLineageTemplateHelper.java
@@ -0,0 +1,282 @@
+package com.linkedin.metadata.aspect.patch.template;
+
+import static com.fasterxml.jackson.databind.node.JsonNodeFactory.*;
+import static com.linkedin.metadata.Constants.*;
+
+import com.fasterxml.jackson.databind.JsonNode;
+import com.fasterxml.jackson.databind.node.ArrayNode;
+import com.fasterxml.jackson.databind.node.ObjectNode;
+import com.google.common.collect.Streams;
+import com.linkedin.common.urn.Urn;
+import com.linkedin.common.urn.UrnUtils;
+import java.util.concurrent.atomic.AtomicReference;
+import java.util.stream.Collectors;
+import javax.annotation.Nullable;
+import org.codehaus.plexus.util.StringUtils;
+
+public class FineGrainedLineageTemplateHelper {
+
+ private static final String FINE_GRAINED_UPSTREAM_TYPE = "upstreamType";
+ private static final String FINE_GRAINED_UPSTREAMS = "upstreams";
+ private static final String FINE_GRAINED_DOWNSTREAM_TYPE = "downstreamType";
+ private static final String FINE_GRAINED_DOWNSTREAMS = "downstreams";
+ private static final String FINE_GRAINED_TRANSFORMATION_OPERATION = "transformOperation";
+ private static final String FINE_GRAINED_CONFIDENCE_SCORE = "confidenceScore";
+ private static final String FINE_GRAINED_QUERY_ID = "query";
+
+ // Template support
+ private static final String NONE_TRANSFORMATION_TYPE = "NONE";
+ private static final Float DEFAULT_CONFIDENCE_SCORE = 1.0f;
+ private static final String DEFAULT_QUERY_ID = "NONE";
+
+ /**
+ * Combines fine grained lineage array into a map using upstream and downstream types as keys,
+ * defaulting when not present. Due to this construction, patches will look like: path:
+ * /fineGrainedLineages/TRANSFORMATION_OPERATION/DOWNSTREAM_FIELD_URN/QUERY_ID/UPSTREAM_FIELD_URN,
+ * op: ADD/REMOVE, value: float (confidenceScore) Due to the way FineGrainedLineage was designed
+ * it doesn't necessarily have a consistent key we can reference, so this specialized method
+ * mimics the arrayFieldToMap of the super class with the specialization that it does not put the
+ * full value of the aspect at the end of the key, just the particular array. This prevents
+ * unintended overwrites through improper MCP construction that is technically allowed by the
+ * schema when combining under fields that form the natural key.
+ *
+ * @param fineGrainedLineages the fine grained lineage array node
+ * @return the modified {@link JsonNode} with array fields transformed to maps
+ */
+ public static JsonNode combineAndTransformFineGrainedLineages(
+ @Nullable JsonNode fineGrainedLineages) {
+ ObjectNode mapNode = instance.objectNode();
+ if (!(fineGrainedLineages instanceof ArrayNode) || fineGrainedLineages.isEmpty()) {
+ return mapNode;
+ }
+ JsonNode lineageCopy = fineGrainedLineages.deepCopy();
+
+ lineageCopy
+ .elements()
+ .forEachRemaining(
+ node -> {
+ JsonNode nodeClone = node.deepCopy();
+ String transformationOperation =
+ nodeClone.has(FINE_GRAINED_TRANSFORMATION_OPERATION)
+ ? nodeClone.get(FINE_GRAINED_TRANSFORMATION_OPERATION).asText()
+ : NONE_TRANSFORMATION_TYPE;
+
+ if (!mapNode.has(transformationOperation)) {
+ mapNode.set(transformationOperation, instance.objectNode());
+ }
+ ObjectNode transformationOperationNode =
+ (ObjectNode) mapNode.get(transformationOperation);
+
+ ArrayNode downstreams =
+ nodeClone.has(FINE_GRAINED_DOWNSTREAMS)
+ ? (ArrayNode) nodeClone.get(FINE_GRAINED_DOWNSTREAMS)
+ : null;
+
+ if (downstreams == null || downstreams.size() != 1) {
+ throw new UnsupportedOperationException(
+ "Patching not supported on fine grained lineages with not"
+ + " exactly one downstream. Current fine grained lineage implementation is downstream derived and "
+ + "patches are keyed on the root of this derivation.");
+ }
+
+ Float confidenceScore =
+ nodeClone.has(FINE_GRAINED_CONFIDENCE_SCORE)
+ ? nodeClone.get(FINE_GRAINED_CONFIDENCE_SCORE).floatValue()
+ : DEFAULT_CONFIDENCE_SCORE;
+
+ String upstreamType =
+ nodeClone.has(FINE_GRAINED_UPSTREAM_TYPE)
+ ? nodeClone.get(FINE_GRAINED_UPSTREAM_TYPE).asText()
+ : null;
+ String downstreamType =
+ nodeClone.has(FINE_GRAINED_DOWNSTREAM_TYPE)
+ ? nodeClone.get(FINE_GRAINED_DOWNSTREAM_TYPE).asText()
+ : null;
+ ArrayNode upstreams =
+ nodeClone.has(FINE_GRAINED_UPSTREAMS)
+ ? (ArrayNode) nodeClone.get(FINE_GRAINED_UPSTREAMS)
+ : null;
+
+ String queryId =
+ nodeClone.has(FINE_GRAINED_QUERY_ID)
+ ? nodeClone.get(FINE_GRAINED_QUERY_ID).asText()
+ : DEFAULT_QUERY_ID;
+
+ if (upstreamType == null) {
+ // Determine default type
+ Urn upstreamUrn =
+ upstreams != null ? UrnUtils.getUrn(upstreams.get(0).asText()) : null;
+ if (upstreamUrn != null
+ && DATASET_ENTITY_NAME.equals(upstreamUrn.getEntityType())) {
+ upstreamType = FINE_GRAINED_LINEAGE_DATASET_TYPE;
+ } else {
+ upstreamType = FINE_GRAINED_LINEAGE_FIELD_SET_TYPE;
+ }
+ }
+
+ if (downstreamType == null) {
+ // Always use FIELD type, only support patches for single field downstream
+ downstreamType = FINE_GRAINED_LINEAGE_FIELD_TYPE;
+ }
+
+ String downstreamRoot = downstreams.get(0).asText();
+ if (!transformationOperationNode.has(downstreamRoot)) {
+ transformationOperationNode.set(downstreamRoot, instance.objectNode());
+ }
+ ObjectNode downstreamRootNode =
+ (ObjectNode) transformationOperationNode.get(downstreamRoot);
+ if (!downstreamRootNode.has(queryId)) {
+ downstreamRootNode.set(queryId, instance.objectNode());
+ }
+ ObjectNode queryNode = (ObjectNode) downstreamRootNode.get(queryId);
+ if (upstreams != null) {
+ addUrnsToParent(
+ queryNode, upstreams, confidenceScore, upstreamType, downstreamType);
+ }
+ });
+ return mapNode;
+ }
+
+ private static void addUrnsToParent(
+ JsonNode parentNode,
+ ArrayNode urnsList,
+ Float confidenceScore,
+ String upstreamType,
+ String downstreamType) {
+ // Will overwrite repeat urns with different confidence scores with the most recently seen
+ ((ObjectNode) parentNode)
+ .setAll(
+ Streams.stream(urnsList.elements())
+ .map(JsonNode::asText)
+ .distinct()
+ .collect(
+ Collectors.toMap(
+ urn -> urn,
+ urn ->
+ mapToLineageValueNode(confidenceScore, upstreamType, downstreamType))));
+ }
+
+ private static JsonNode mapToLineageValueNode(
+ Float confidenceScore, String upstreamType, String downstreamType) {
+ ObjectNode objectNode = instance.objectNode();
+ objectNode.set(FINE_GRAINED_CONFIDENCE_SCORE, instance.numberNode(confidenceScore));
+ objectNode.set(FINE_GRAINED_UPSTREAM_TYPE, instance.textNode(upstreamType));
+ objectNode.set(FINE_GRAINED_DOWNSTREAM_TYPE, instance.textNode(downstreamType));
+ return objectNode;
+ }
+
+ /**
+ * Takes the transformed fine grained lineages map from pre-processing and reconstructs an array
+ * of FineGrainedLineages Avoids producing side effects by copying nodes, use resulting node and
+ * not the original
+ *
+ * @param transformedFineGrainedLineages the transformed fine grained lineage map
+ * @return the modified {@link JsonNode} formatted consistent with the original schema
+ */
+ public static ArrayNode reconstructFineGrainedLineages(JsonNode transformedFineGrainedLineages) {
+ if (transformedFineGrainedLineages instanceof ArrayNode) {
+ // We already have an ArrayNode, no need to transform. This happens during `replace`
+ // operations
+ return (ArrayNode) transformedFineGrainedLineages;
+ }
+ ObjectNode mapNode = (ObjectNode) transformedFineGrainedLineages;
+ ArrayNode fineGrainedLineages = instance.arrayNode();
+
+ mapNode
+ .fieldNames()
+ .forEachRemaining(
+ transformationOperation -> {
+ final ObjectNode transformationOperationNode =
+ (ObjectNode) mapNode.get(transformationOperation);
+ transformationOperationNode
+ .fieldNames()
+ .forEachRemaining(
+ downstreamName -> {
+ final ObjectNode downstreamNode =
+ (ObjectNode) transformationOperationNode.get(downstreamName);
+ downstreamNode
+ .fieldNames()
+ .forEachRemaining(
+ queryId ->
+ buildFineGrainedLineage(
+ downstreamName,
+ downstreamNode,
+ queryId,
+ transformationOperation,
+ fineGrainedLineages));
+ });
+ });
+
+ return fineGrainedLineages;
+ }
+
+ private static void buildFineGrainedLineage(
+ final String downstreamName,
+ final ObjectNode downstreamNode,
+ final String queryId,
+ final String transformationOperation,
+ final ArrayNode fineGrainedLineages) {
+ final ObjectNode fineGrainedLineage = instance.objectNode();
+ final ObjectNode queryNode = (ObjectNode) downstreamNode.get(queryId);
+ if (queryNode.isEmpty()) {
+ // Short circuit if no upstreams left
+ return;
+ }
+ ArrayNode downstream = instance.arrayNode();
+ downstream.add(instance.textNode(downstreamName));
+ // Set defaults, if found in sub nodes override, for confidenceScore take lowest
+ AtomicReference<Float> minimumConfidenceScore = new AtomicReference<>(DEFAULT_CONFIDENCE_SCORE);
+ AtomicReference<String> upstreamType =
+ new AtomicReference<>(FINE_GRAINED_LINEAGE_FIELD_SET_TYPE);
+ AtomicReference<String> downstreamType = new AtomicReference<>(FINE_GRAINED_LINEAGE_FIELD_TYPE);
+ ArrayNode upstreams = instance.arrayNode();
+ queryNode
+ .fieldNames()
+ .forEachRemaining(
+ upstream ->
+ processUpstream(
+ queryNode,
+ upstream,
+ minimumConfidenceScore,
+ upstreamType,
+ downstreamType,
+ upstreams));
+ fineGrainedLineage.set(FINE_GRAINED_DOWNSTREAMS, downstream);
+ fineGrainedLineage.set(FINE_GRAINED_UPSTREAMS, upstreams);
+ if (StringUtils.isNotBlank(queryId) && !DEFAULT_QUERY_ID.equals(queryId)) {
+ fineGrainedLineage.set(FINE_GRAINED_QUERY_ID, instance.textNode(queryId));
+ }
+ fineGrainedLineage.set(FINE_GRAINED_UPSTREAM_TYPE, instance.textNode(upstreamType.get()));
+ fineGrainedLineage.set(FINE_GRAINED_DOWNSTREAM_TYPE, instance.textNode(downstreamType.get()));
+ fineGrainedLineage.set(
+ FINE_GRAINED_CONFIDENCE_SCORE, instance.numberNode(minimumConfidenceScore.get()));
+ fineGrainedLineage.set(
+ FINE_GRAINED_TRANSFORMATION_OPERATION, instance.textNode(transformationOperation));
+ fineGrainedLineages.add(fineGrainedLineage);
+ }
+
+ private static void processUpstream(
+ final ObjectNode queryNode,
+ final String upstream,
+ final AtomicReference<Float> minimumConfidenceScore,
+ final AtomicReference<String> upstreamType,
+ final AtomicReference<String> downstreamType,
+ final ArrayNode upstreams) {
+ final ObjectNode upstreamNode = (ObjectNode) queryNode.get(upstream);
+ if (upstreamNode.has(FINE_GRAINED_CONFIDENCE_SCORE)) {
+ Float scoreValue = upstreamNode.get(FINE_GRAINED_CONFIDENCE_SCORE).floatValue();
+ if (scoreValue <= minimumConfidenceScore.get()) {
+ minimumConfidenceScore.set(scoreValue);
+ }
+ }
+ // Set types to last encountered, should never change, but this at least tries to support
+ // other types being specified.
+ if (upstreamNode.has(FINE_GRAINED_UPSTREAM_TYPE)) {
+ upstreamType.set(upstreamNode.get(FINE_GRAINED_UPSTREAM_TYPE).asText());
+ }
+ if (upstreamNode.has(FINE_GRAINED_DOWNSTREAM_TYPE)) {
+ downstreamType.set(upstreamNode.get(FINE_GRAINED_DOWNSTREAM_TYPE).asText());
+ }
+ upstreams.add(instance.textNode(upstream));
+ }
+}
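To make the keyed patch layout described in the Javadoc above concrete, here is a small self-contained sketch (URNs are illustrative) that pushes one fine-grained lineage entry through the new helper and back:

```java
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.node.ArrayNode;
import com.fasterxml.jackson.databind.node.JsonNodeFactory;
import com.fasterxml.jackson.databind.node.ObjectNode;
import com.linkedin.metadata.aspect.patch.template.FineGrainedLineageTemplateHelper;

public class FineGrainedLineageTemplateSketch {
  public static void main(String[] args) {
    JsonNodeFactory factory = JsonNodeFactory.instance;

    // One fineGrainedLineages entry: exactly one downstream field (required for patching) and one upstream field.
    ObjectNode entry = factory.objectNode();
    entry.put("transformOperation", "TRANSFORM");
    entry.put("confidenceScore", 0.9f);
    entry.set(
        "downstreams",
        factory.arrayNode()
            .add("urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,agg_users,PROD),user_id)"));
    entry.set(
        "upstreams",
        factory.arrayNode()
            .add("urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,fct_users,PROD),user_id)"));

    ArrayNode fineGrainedLineages = factory.arrayNode().add(entry);

    // Keyed as TRANSFORM -> downstream field urn -> query id ("NONE" by default) -> upstream field urn.
    JsonNode keyed =
        FineGrainedLineageTemplateHelper.combineAndTransformFineGrainedLineages(fineGrainedLineages);
    System.out.println(keyed);

    // Reconstructs the original array shape from the keyed map.
    System.out.println(FineGrainedLineageTemplateHelper.reconstructFineGrainedLineages(keyed));
  }
}
```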
diff --git a/entity-registry/src/main/java/com/linkedin/metadata/aspect/patch/template/TemplateUtil.java b/entity-registry/src/main/java/com/linkedin/metadata/aspect/patch/template/TemplateUtil.java
index 2423e37e6d5419..23879ad1c2e353 100644
--- a/entity-registry/src/main/java/com/linkedin/metadata/aspect/patch/template/TemplateUtil.java
+++ b/entity-registry/src/main/java/com/linkedin/metadata/aspect/patch/template/TemplateUtil.java
@@ -84,7 +84,7 @@ public static JsonNode populateTopLevelKeys(JsonNode transformedNode, JsonPatch
// Skip first as it will always be blank due to path starting with /
for (int i = 1; i < endIdx; i++) {
String decodedKey = decodeValue(keys[i]);
- if (parent.get(keys[i]) == null) {
+ if (parent.get(decodedKey) == null) {
((ObjectNode) parent).set(decodedKey, instance.objectNode());
}
parent = parent.get(decodedKey);
diff --git a/entity-registry/src/main/java/com/linkedin/metadata/aspect/patch/template/datajob/DataJobInputOutputTemplate.java b/entity-registry/src/main/java/com/linkedin/metadata/aspect/patch/template/datajob/DataJobInputOutputTemplate.java
index 3d398d97b50c38..ef26eed2f814f8 100644
--- a/entity-registry/src/main/java/com/linkedin/metadata/aspect/patch/template/datajob/DataJobInputOutputTemplate.java
+++ b/entity-registry/src/main/java/com/linkedin/metadata/aspect/patch/template/datajob/DataJobInputOutputTemplate.java
@@ -1,6 +1,10 @@
package com.linkedin.metadata.aspect.patch.template.datajob;
+import static com.fasterxml.jackson.databind.node.JsonNodeFactory.*;
+import static com.linkedin.metadata.Constants.*;
+
import com.fasterxml.jackson.databind.JsonNode;
+import com.fasterxml.jackson.databind.node.ObjectNode;
import com.linkedin.common.DataJobUrnArray;
import com.linkedin.common.DatasetUrnArray;
import com.linkedin.common.EdgeArray;
@@ -9,6 +13,7 @@
import com.linkedin.datajob.DataJobInputOutput;
import com.linkedin.dataset.FineGrainedLineageArray;
import com.linkedin.metadata.aspect.patch.template.ArrayMergingTemplate;
+import com.linkedin.metadata.aspect.patch.template.FineGrainedLineageTemplateHelper;
import java.util.Collections;
import javax.annotation.Nonnull;
@@ -23,6 +28,8 @@ public class DataJobInputOutputTemplate implements ArrayMergingTemplate {
@@ -27,18 +19,6 @@ public class UpstreamLineageTemplate extends CompoundKeyTemplate {
- JsonNode nodeClone = node.deepCopy();
- String transformationOperation =
- nodeClone.has(FINE_GRAINED_TRANSFORMATION_OPERATION)
- ? nodeClone.get(FINE_GRAINED_TRANSFORMATION_OPERATION).asText()
- : NONE_TRANSFORMATION_TYPE;
-
- if (!mapNode.has(transformationOperation)) {
- mapNode.set(transformationOperation, instance.objectNode());
- }
- ObjectNode transformationOperationNode =
- (ObjectNode) mapNode.get(transformationOperation);
-
- ArrayNode downstreams =
- nodeClone.has(FINE_GRAINED_DOWNSTREAMS)
- ? (ArrayNode) nodeClone.get(FINE_GRAINED_DOWNSTREAMS)
- : null;
-
- if (downstreams == null || downstreams.size() != 1) {
- throw new UnsupportedOperationException(
- "Patching not supported on fine grained lineages with not"
- + " exactly one downstream. Current fine grained lineage implementation is downstream derived and "
- + "patches are keyed on the root of this derivation.");
- }
-
- Float confidenceScore =
- nodeClone.has(FINE_GRAINED_CONFIDENCE_SCORE)
- ? nodeClone.get(FINE_GRAINED_CONFIDENCE_SCORE).floatValue()
- : DEFAULT_CONFIDENCE_SCORE;
-
- String upstreamType =
- nodeClone.has(FINE_GRAINED_UPSTREAM_TYPE)
- ? nodeClone.get(FINE_GRAINED_UPSTREAM_TYPE).asText()
- : null;
- String downstreamType =
- nodeClone.has(FINE_GRAINED_DOWNSTREAM_TYPE)
- ? nodeClone.get(FINE_GRAINED_DOWNSTREAM_TYPE).asText()
- : null;
- ArrayNode upstreams =
- nodeClone.has(FINE_GRAINED_UPSTREAMS)
- ? (ArrayNode) nodeClone.get(FINE_GRAINED_UPSTREAMS)
- : null;
-
- String queryId =
- nodeClone.has(FINE_GRAINED_QUERY_ID)
- ? nodeClone.get(FINE_GRAINED_QUERY_ID).asText()
- : DEFAULT_QUERY_ID;
-
- if (upstreamType == null) {
- // Determine default type
- Urn upstreamUrn =
- upstreams != null ? UrnUtils.getUrn(upstreams.get(0).asText()) : null;
- if (upstreamUrn != null
- && DATASET_ENTITY_NAME.equals(upstreamUrn.getEntityType())) {
- upstreamType = FINE_GRAINED_LINEAGE_DATASET_TYPE;
- } else {
- upstreamType = FINE_GRAINED_LINEAGE_FIELD_SET_TYPE;
- }
- }
-
- if (downstreamType == null) {
- // Always use FIELD type, only support patches for single field downstream
- downstreamType = FINE_GRAINED_LINEAGE_FIELD_TYPE;
- }
-
- String downstreamRoot = downstreams.get(0).asText();
- if (!transformationOperationNode.has(downstreamRoot)) {
- transformationOperationNode.set(downstreamRoot, instance.objectNode());
- }
- ObjectNode downstreamRootNode =
- (ObjectNode) transformationOperationNode.get(downstreamRoot);
- if (!downstreamRootNode.has(queryId)) {
- downstreamRootNode.set(queryId, instance.objectNode());
- }
- ObjectNode queryNode = (ObjectNode) downstreamRootNode.get(queryId);
- if (upstreams != null) {
- addUrnsToParent(
- queryNode, upstreams, confidenceScore, upstreamType, downstreamType);
- }
- });
- return mapNode;
- }
-
- private void addUrnsToParent(
- JsonNode parentNode,
- ArrayNode urnsList,
- Float confidenceScore,
- String upstreamType,
- String downstreamType) {
- // Will overwrite repeat urns with different confidence scores with the most recently seen
- ((ObjectNode) parentNode)
- .setAll(
- Streams.stream(urnsList.elements())
- .map(JsonNode::asText)
- .distinct()
- .collect(
- Collectors.toMap(
- urn -> urn,
- urn ->
- mapToLineageValueNode(confidenceScore, upstreamType, downstreamType))));
- }
-
- private JsonNode mapToLineageValueNode(
- Float confidenceScore, String upstreamType, String downstreamType) {
- ObjectNode objectNode = instance.objectNode();
- objectNode.set(FINE_GRAINED_CONFIDENCE_SCORE, instance.numberNode(confidenceScore));
- objectNode.set(FINE_GRAINED_UPSTREAM_TYPE, instance.textNode(upstreamType));
- objectNode.set(FINE_GRAINED_DOWNSTREAM_TYPE, instance.textNode(downstreamType));
- return objectNode;
- }
-
- /**
- * Takes the transformed fine grained lineages map from pre-processing and reconstructs an array
- * of FineGrainedLineages Avoids producing side effects by copying nodes, use resulting node and
- * not the original
- *
- * @param transformedFineGrainedLineages the transformed fine grained lineage map
- * @return the modified {@link JsonNode} formatted consistent with the original schema
- */
- private ArrayNode reconstructFineGrainedLineages(JsonNode transformedFineGrainedLineages) {
- if (transformedFineGrainedLineages instanceof ArrayNode) {
- // We already have an ArrayNode, no need to transform. This happens during `replace`
- // operations
- return (ArrayNode) transformedFineGrainedLineages;
- }
- ObjectNode mapNode = (ObjectNode) transformedFineGrainedLineages;
- ArrayNode fineGrainedLineages = instance.arrayNode();
-
- mapNode
- .fieldNames()
- .forEachRemaining(
- transformationOperation -> {
- final ObjectNode transformationOperationNode =
- (ObjectNode) mapNode.get(transformationOperation);
- transformationOperationNode
- .fieldNames()
- .forEachRemaining(
- downstreamName -> {
- final ObjectNode downstreamNode =
- (ObjectNode) transformationOperationNode.get(downstreamName);
- downstreamNode
- .fieldNames()
- .forEachRemaining(
- queryId ->
- buildFineGrainedLineage(
- downstreamName,
- downstreamNode,
- queryId,
- transformationOperation,
- fineGrainedLineages));
- });
- });
-
- return fineGrainedLineages;
- }
-
- private void buildFineGrainedLineage(
- final String downstreamName,
- final ObjectNode downstreamNode,
- final String queryId,
- final String transformationOperation,
- final ArrayNode fineGrainedLineages) {
- final ObjectNode fineGrainedLineage = instance.objectNode();
- final ObjectNode queryNode = (ObjectNode) downstreamNode.get(queryId);
- if (queryNode.isEmpty()) {
- // Short circuit if no upstreams left
- return;
- }
- ArrayNode downstream = instance.arrayNode();
- downstream.add(instance.textNode(downstreamName));
- // Set defaults, if found in sub nodes override, for confidenceScore take lowest
- AtomicReference<Float> minimumConfidenceScore = new AtomicReference<>(DEFAULT_CONFIDENCE_SCORE);
- AtomicReference<String> upstreamType =
- new AtomicReference<>(FINE_GRAINED_LINEAGE_FIELD_SET_TYPE);
- AtomicReference<String> downstreamType = new AtomicReference<>(FINE_GRAINED_LINEAGE_FIELD_TYPE);
- ArrayNode upstreams = instance.arrayNode();
- queryNode
- .fieldNames()
- .forEachRemaining(
- upstream ->
- processUpstream(
- queryNode,
- upstream,
- minimumConfidenceScore,
- upstreamType,
- downstreamType,
- upstreams));
- fineGrainedLineage.set(FINE_GRAINED_DOWNSTREAMS, downstream);
- fineGrainedLineage.set(FINE_GRAINED_UPSTREAMS, upstreams);
- if (StringUtils.isNotBlank(queryId) && !DEFAULT_QUERY_ID.equals(queryId)) {
- fineGrainedLineage.set(FINE_GRAINED_QUERY_ID, instance.textNode(queryId));
- }
- fineGrainedLineage.set(FINE_GRAINED_UPSTREAM_TYPE, instance.textNode(upstreamType.get()));
- fineGrainedLineage.set(FINE_GRAINED_DOWNSTREAM_TYPE, instance.textNode(downstreamType.get()));
- fineGrainedLineage.set(
- FINE_GRAINED_CONFIDENCE_SCORE, instance.numberNode(minimumConfidenceScore.get()));
- fineGrainedLineage.set(
- FINE_GRAINED_TRANSFORMATION_OPERATION, instance.textNode(transformationOperation));
- fineGrainedLineages.add(fineGrainedLineage);
- }
-
- private void processUpstream(
- final ObjectNode queryNode,
- final String upstream,
- final AtomicReference<Float> minimumConfidenceScore,
- final AtomicReference<String> upstreamType,
- final AtomicReference<String> downstreamType,
- final ArrayNode upstreams) {
- final ObjectNode upstreamNode = (ObjectNode) queryNode.get(upstream);
- if (upstreamNode.has(FINE_GRAINED_CONFIDENCE_SCORE)) {
- Float scoreValue = upstreamNode.get(FINE_GRAINED_CONFIDENCE_SCORE).floatValue();
- if (scoreValue <= minimumConfidenceScore.get()) {
- minimumConfidenceScore.set(scoreValue);
- }
- }
- // Set types to last encountered, should never change, but this at least tries to support
- // other types being specified.
- if (upstreamNode.has(FINE_GRAINED_UPSTREAM_TYPE)) {
- upstreamType.set(upstreamNode.get(FINE_GRAINED_UPSTREAM_TYPE).asText());
- }
- if (upstreamNode.has(FINE_GRAINED_DOWNSTREAM_TYPE)) {
- downstreamType.set(upstreamNode.get(FINE_GRAINED_DOWNSTREAM_TYPE).asText());
- }
- upstreams.add(instance.textNode(upstream));
- }
}
diff --git a/entity-registry/src/main/java/com/linkedin/metadata/entity/SearchRetriever.java b/entity-registry/src/main/java/com/linkedin/metadata/entity/SearchRetriever.java
index eaa106b8d1f638..d4894c97015f8f 100644
--- a/entity-registry/src/main/java/com/linkedin/metadata/entity/SearchRetriever.java
+++ b/entity-registry/src/main/java/com/linkedin/metadata/entity/SearchRetriever.java
@@ -2,6 +2,7 @@
import com.linkedin.metadata.query.filter.Filter;
import com.linkedin.metadata.search.ScrollResult;
+import com.linkedin.metadata.search.SearchEntityArray;
import java.util.List;
import javax.annotation.Nonnull;
import javax.annotation.Nullable;
@@ -21,4 +22,22 @@ ScrollResult scroll(
@Nullable Filter filters,
@Nullable String scrollId,
int count);
+
+ SearchRetriever EMPTY = new EmptySearchRetriever();
+
+ class EmptySearchRetriever implements SearchRetriever {
+
+ @Override
+ public ScrollResult scroll(
+ @Nonnull List<String> entities,
+ @Nullable Filter filters,
+ @Nullable String scrollId,
+ int count) {
+ ScrollResult empty = new ScrollResult();
+ empty.setEntities(new SearchEntityArray());
+ empty.setNumEntities(0);
+ empty.setPageSize(0);
+ return empty;
+ }
+ }
}
diff --git a/entity-registry/src/test/java/com/linkedin/metadata/aspect/patch/template/DataJobInputOutputTemplateTest.java b/entity-registry/src/test/java/com/linkedin/metadata/aspect/patch/template/DataJobInputOutputTemplateTest.java
new file mode 100644
index 00000000000000..d2a26221a3bb9f
--- /dev/null
+++ b/entity-registry/src/test/java/com/linkedin/metadata/aspect/patch/template/DataJobInputOutputTemplateTest.java
@@ -0,0 +1,255 @@
+package com.linkedin.metadata.aspect.patch.template;
+
+import static com.linkedin.metadata.utils.GenericRecordUtils.*;
+import static org.testng.Assert.*;
+
+import com.fasterxml.jackson.core.JsonProcessingException;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.linkedin.common.UrnArray;
+import com.linkedin.common.urn.Urn;
+import com.linkedin.common.urn.UrnUtils;
+import com.linkedin.data.DataMap;
+import com.linkedin.datajob.DataJobInputOutput;
+import com.linkedin.dataset.FineGrainedLineage;
+import com.linkedin.dataset.FineGrainedLineageDownstreamType;
+import com.linkedin.dataset.FineGrainedLineageUpstreamType;
+import com.linkedin.metadata.aspect.patch.template.datajob.DataJobInputOutputTemplate;
+import jakarta.json.Json;
+import jakarta.json.JsonObjectBuilder;
+import jakarta.json.JsonPatch;
+import jakarta.json.JsonPatchBuilder;
+import jakarta.json.JsonValue;
+import org.testng.annotations.Test;
+
+public class DataJobInputOutputTemplateTest {
+ private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
+
+ @Test
+ public void testPatchUpstream() throws Exception {
+ DataJobInputOutputTemplate dataJobInputOutputTemplate = new DataJobInputOutputTemplate();
+ DataJobInputOutput dataJobInputOutput = dataJobInputOutputTemplate.getDefault();
+ JsonPatchBuilder jsonPatchBuilder = Json.createPatchBuilder();
+
+ JsonObjectBuilder fineGrainedLineageNode = Json.createObjectBuilder();
+ JsonValue upstreamConfidenceScore = Json.createValue(1.0f);
+ fineGrainedLineageNode.add("confidenceScore", upstreamConfidenceScore);
+ jsonPatchBuilder.add(
+ "/fineGrainedLineages/CREATE/urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_1,PROD),c1)//urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_2,PROD),c1)",
+ fineGrainedLineageNode.build());
+
+ // Initial population test
+ DataJobInputOutput result =
+ dataJobInputOutputTemplate.applyPatch(dataJobInputOutput, jsonPatchBuilder.build());
+ // Hack because Jackson parses values to doubles instead of floats
+ DataMap dataMap = new DataMap();
+ dataMap.put("confidenceScore", 1.0);
+ FineGrainedLineage fineGrainedLineage = new FineGrainedLineage(dataMap);
+ UrnArray urns = new UrnArray();
+ Urn urn1 =
+ UrnUtils.getUrn(
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_1,PROD),c1)");
+ urns.add(urn1);
+ UrnArray upstreams = new UrnArray();
+ Urn upstreamUrn =
+ UrnUtils.getUrn(
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_2,PROD),c1)");
+ upstreams.add(upstreamUrn);
+ fineGrainedLineage.setDownstreams(urns);
+ fineGrainedLineage.setUpstreams(upstreams);
+ fineGrainedLineage.setTransformOperation("CREATE");
+ fineGrainedLineage.setUpstreamType(FineGrainedLineageUpstreamType.FIELD_SET);
+ fineGrainedLineage.setDownstreamType(FineGrainedLineageDownstreamType.FIELD);
+ assertEquals(result.getFineGrainedLineages().get(0), fineGrainedLineage);
+
+ // Test non-overwrite upstreams and correct confidence score and types w/ overwrite
+ JsonObjectBuilder finegrainedLineageNode2 = Json.createObjectBuilder();
+ finegrainedLineageNode2.add(
+ "upstreamType", Json.createValue(FineGrainedLineageUpstreamType.FIELD_SET.name()));
+ finegrainedLineageNode2.add("confidenceScore", upstreamConfidenceScore);
+ finegrainedLineageNode2.add(
+ "downstreamType", Json.createValue(FineGrainedLineageDownstreamType.FIELD.name()));
+
+ JsonPatchBuilder patchOperations2 = Json.createPatchBuilder();
+ patchOperations2.add(
+ "/fineGrainedLineages/CREATE/urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_2,PROD),c2)/urn:li:query:someQuery/urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_1,PROD),c2)",
+ finegrainedLineageNode2.build());
+
+ JsonValue upstreamConfidenceScore2 = Json.createValue(0.1f);
+ JsonObjectBuilder finegrainedLineageNode3 = Json.createObjectBuilder();
+ finegrainedLineageNode3.add(
+ "upstreamType", Json.createValue(FineGrainedLineageUpstreamType.DATASET.name()));
+ finegrainedLineageNode3.add("confidenceScore", upstreamConfidenceScore2);
+ finegrainedLineageNode3.add(
+ "downstreamType", Json.createValue(FineGrainedLineageDownstreamType.FIELD_SET.name()));
+
+ patchOperations2.add(
+ "/fineGrainedLineages/CREATE/urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_2,PROD),c2)/urn:li:query:someQuery/urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_1,PROD),c2)",
+ finegrainedLineageNode3.build());
+
+ JsonPatch jsonPatch2 = patchOperations2.build();
+
+ DataJobInputOutput result2 = dataJobInputOutputTemplate.applyPatch(result, jsonPatch2);
+ // Hack because Jackson parses values to doubles instead of floats
+ DataMap dataMap2 = new DataMap();
+ dataMap2.put("confidenceScore", 0.1);
+ FineGrainedLineage fineGrainedLineage2 = new FineGrainedLineage(dataMap2);
+ UrnArray urns2 = new UrnArray();
+ Urn urn2 =
+ UrnUtils.getUrn(
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_1,PROD),c2)");
+ urns2.add(urn2);
+ Urn downstreamUrn2 =
+ UrnUtils.getUrn(
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_2,PROD),c2)");
+ UrnArray downstreams2 = new UrnArray();
+ downstreams2.add(downstreamUrn2);
+ fineGrainedLineage2.setUpstreams(urns2);
+ fineGrainedLineage2.setDownstreams(downstreams2);
+ fineGrainedLineage2.setTransformOperation("CREATE");
+ fineGrainedLineage2.setUpstreamType(FineGrainedLineageUpstreamType.DATASET);
+ fineGrainedLineage2.setDownstreamType(FineGrainedLineageDownstreamType.FIELD_SET);
+ fineGrainedLineage2.setQuery(UrnUtils.getUrn("urn:li:query:someQuery"));
+ assertEquals(result2.getFineGrainedLineages().get(1), fineGrainedLineage2);
+
+ // Check different queries
+ JsonObjectBuilder finegrainedLineageNode4 = Json.createObjectBuilder();
+ finegrainedLineageNode4.add(
+ "upstreamType", Json.createValue(FineGrainedLineageUpstreamType.FIELD_SET.name()));
+ finegrainedLineageNode4.add("confidenceScore", upstreamConfidenceScore);
+ finegrainedLineageNode4.add(
+ "downstreamType", Json.createValue(FineGrainedLineageDownstreamType.FIELD.name()));
+
+ JsonPatchBuilder patchOperations3 = Json.createPatchBuilder();
+ patchOperations3.add(
+ "/fineGrainedLineages/CREATE/urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_2,PROD),c2)/urn:li:query:anotherQuery/urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_1,PROD),c2)",
+ finegrainedLineageNode4.build());
+
+ JsonPatch jsonPatch3 = patchOperations3.build();
+ DataJobInputOutput result3 = dataJobInputOutputTemplate.applyPatch(result2, jsonPatch3);
+ // Hack because Jackson parses values to doubles instead of floats
+ DataMap dataMap3 = new DataMap();
+ dataMap3.put("confidenceScore", 1.0);
+ FineGrainedLineage fineGrainedLineage3 = new FineGrainedLineage(dataMap3);
+ UrnArray urns3 = new UrnArray();
+ Urn urn3 =
+ UrnUtils.getUrn(
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_2,PROD),c2)");
+ urns3.add(urn3);
+
+ Urn upstreamUrn3 =
+ UrnUtils.getUrn(
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_1,PROD),c2)");
+ UrnArray upstreamUrns3 = new UrnArray();
+ upstreamUrns3.add(upstreamUrn3);
+ fineGrainedLineage3.setDownstreams(urns3);
+ fineGrainedLineage3.setUpstreams(upstreamUrns3);
+ fineGrainedLineage3.setTransformOperation("CREATE");
+ fineGrainedLineage3.setUpstreamType(FineGrainedLineageUpstreamType.FIELD_SET);
+ fineGrainedLineage3.setDownstreamType(FineGrainedLineageDownstreamType.FIELD);
+ fineGrainedLineage3.setQuery(UrnUtils.getUrn("urn:li:query:anotherQuery"));
+ // Splits into two for different types
+ assertEquals(result3.getFineGrainedLineages().get(2), fineGrainedLineage3);
+
+ // Check different transform types
+ JsonObjectBuilder finegrainedLineageNode5 = Json.createObjectBuilder();
+ finegrainedLineageNode5.add(
+ "upstreamType", Json.createValue(FineGrainedLineageUpstreamType.FIELD_SET.name()));
+ finegrainedLineageNode5.add("confidenceScore", upstreamConfidenceScore);
+ finegrainedLineageNode5.add(
+ "downstreamType", Json.createValue(FineGrainedLineageDownstreamType.FIELD.name()));
+
+ JsonPatchBuilder patchOperations4 = Json.createPatchBuilder();
+ patchOperations4.add(
+ "/fineGrainedLineages/TRANSFORM/urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_2,PROD),c2)/urn:li:query:anotherQuery/urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_1,PROD),c2)",
+ finegrainedLineageNode5.build());
+ JsonPatch jsonPatch4 = patchOperations4.build();
+
+ DataJobInputOutput result4 = dataJobInputOutputTemplate.applyPatch(result3, jsonPatch4);
+ // Hack because Jackson parses values to doubles instead of floats
+ DataMap dataMap4 = new DataMap();
+ dataMap4.put("confidenceScore", 1.0);
+ FineGrainedLineage fineGrainedLineage4 = new FineGrainedLineage(dataMap4);
+ fineGrainedLineage4.setUpstreams(upstreamUrns3);
+ fineGrainedLineage4.setDownstreams(urns3);
+ fineGrainedLineage4.setTransformOperation("TRANSFORM");
+ fineGrainedLineage4.setUpstreamType(FineGrainedLineageUpstreamType.FIELD_SET);
+ fineGrainedLineage4.setDownstreamType(FineGrainedLineageDownstreamType.FIELD);
+ fineGrainedLineage4.setQuery(UrnUtils.getUrn("urn:li:query:anotherQuery"));
+ // New entry in array because of new transformation type
+ assertEquals(result4.getFineGrainedLineages().get(3), fineGrainedLineage4);
+
+ // Remove
+ JsonPatchBuilder removeOperations = Json.createPatchBuilder();
+ removeOperations.remove(
+ "/fineGrainedLineages/CREATE/urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_1,PROD),c1)/NONE/urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_2,PROD),c1)");
+ removeOperations.remove(
+ "/fineGrainedLineages/CREATE/urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_2,PROD),c2)/urn:li:query:someQuery/urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_1,PROD),c2)");
+ removeOperations.remove(
+ "/fineGrainedLineages/CREATE/urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_2,PROD),c2)/urn:li:query:anotherQuery/urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_1,PROD),c2)");
+ removeOperations.remove(
+ "/fineGrainedLineages/TRANSFORM/urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_2,PROD),c2)/urn:li:query:anotherQuery/urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_1,PROD),c2)");
+
+ JsonPatch removePatch = removeOperations.build();
+ DataJobInputOutput finalResult = dataJobInputOutputTemplate.applyPatch(result4, removePatch);
+ assertEquals(finalResult, dataJobInputOutputTemplate.getDefault());
+ }
+
+ @Test
+ public void testPatchWithFieldWithForwardSlash() throws JsonProcessingException {
+
+ String downstreamUrn =
+ "/fineGrainedLineages/CREATE/urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_1,PROD),c1)";
+ String unescapedUpstreamUrn =
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_2,PROD),slash/column)";
+ String escapedUpstreamUrn =
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_2,PROD),slash~1column)";
+ String lineagePath = downstreamUrn + "//" + escapedUpstreamUrn;
+
+ DataJobInputOutputTemplate dataJobInputOutputTemplate = new DataJobInputOutputTemplate();
+ DataJobInputOutput dataJobInputOutput = dataJobInputOutputTemplate.getDefault();
+ JsonPatchBuilder jsonPatchBuilder = Json.createPatchBuilder();
+
+ JsonObjectBuilder fineGrainedLineageNode = Json.createObjectBuilder();
+ JsonValue upstreamConfidenceScore = Json.createValue(1.0f);
+ fineGrainedLineageNode.add("confidenceScore", upstreamConfidenceScore);
+
+ jsonPatchBuilder.add(lineagePath, fineGrainedLineageNode.build());
+
+ // Initial population test
+ DataJobInputOutput result =
+ dataJobInputOutputTemplate.applyPatch(dataJobInputOutput, jsonPatchBuilder.build());
+
+ assertEquals(
+ result.getFineGrainedLineages().get(0).getUpstreams().get(0).toString(),
+ unescapedUpstreamUrn);
+ }
+
+ @Test
+ public void testPatchWithFieldWithTilde() throws JsonProcessingException {
+
+ String downstreamUrn =
+ "/fineGrainedLineages/CREATE/urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_1,PROD),c1)";
+ String unescapedUpstreamUrn =
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_2,PROD),tilde~column)";
+ String escapedUpstreamUrn =
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_2,PROD),tilde~0column)";
+ String lineagePath = downstreamUrn + "//" + escapedUpstreamUrn;
+
+ DataJobInputOutputTemplate dataJobInputOutputTemplate = new DataJobInputOutputTemplate();
+ DataJobInputOutput dataJobInputOutput = dataJobInputOutputTemplate.getDefault();
+ JsonPatchBuilder jsonPatchBuilder = Json.createPatchBuilder();
+
+ JsonObjectBuilder fineGrainedLineageNode = Json.createObjectBuilder();
+ JsonValue upstreamConfidenceScore = Json.createValue(1.0f);
+ fineGrainedLineageNode.add("confidenceScore", upstreamConfidenceScore);
+
+ jsonPatchBuilder.add(lineagePath, fineGrainedLineageNode.build());
+
+ // Initial population test
+ DataJobInputOutput result =
+ dataJobInputOutputTemplate.applyPatch(dataJobInputOutput, jsonPatchBuilder.build());
+ assertEquals(
+ result.getFineGrainedLineages().get(0).getUpstreams().get(0).toString(),
+ unescapedUpstreamUrn);
+ }
+}
diff --git a/entity-registry/src/test/java/com/linkedin/metadata/aspect/patch/template/UpstreamLineageTemplateTest.java b/entity-registry/src/test/java/com/linkedin/metadata/aspect/patch/template/UpstreamLineageTemplateTest.java
index f934dd8961ca37..ab0e7f960251c4 100644
--- a/entity-registry/src/test/java/com/linkedin/metadata/aspect/patch/template/UpstreamLineageTemplateTest.java
+++ b/entity-registry/src/test/java/com/linkedin/metadata/aspect/patch/template/UpstreamLineageTemplateTest.java
@@ -221,6 +221,7 @@ public void testPatchUpstream() throws Exception {
JsonPatch removePatch = removeOperations.build();
UpstreamLineage finalResult = upstreamLineageTemplate.applyPatch(result4, removePatch);
+
assertEquals(finalResult, upstreamLineageTemplate.getDefault());
}
@@ -337,4 +338,39 @@ public void testPatchWithFieldWithTilde() throws JsonProcessingException {
result.getFineGrainedLineages().get(0).getUpstreams().get(0).toString(),
unescapedUpstreamUrn);
}
+
+ @Test
+ public void testPatchRemoveWithFields() throws JsonProcessingException {
+
+ String downstreamUrn =
+ "/fineGrainedLineages/CREATE/urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:s3,~1tmp~1test.parquet,PROD),c1)";
+ String upstreamUrn =
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_2,PROD),c1)";
+ String upstreamUrn2 =
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_2,PROD),c2)";
+
+ String lineagePath1 = downstreamUrn + "/NONE/" + upstreamUrn;
+ String lineagePath2 = downstreamUrn + "/NONE/" + upstreamUrn2;
+
+ UpstreamLineageTemplate upstreamLineageTemplate = new UpstreamLineageTemplate();
+ UpstreamLineage upstreamLineage = upstreamLineageTemplate.getDefault();
+ JsonPatchBuilder jsonPatchBuilder = Json.createPatchBuilder();
+
+ JsonObjectBuilder fineGrainedLineageNode = Json.createObjectBuilder();
+ JsonValue upstreamConfidenceScore = Json.createValue(1.0f);
+ fineGrainedLineageNode.add("confidenceScore", upstreamConfidenceScore);
+
+ jsonPatchBuilder.add(lineagePath1, fineGrainedLineageNode.build());
+ jsonPatchBuilder.add(lineagePath2, fineGrainedLineageNode.build());
+
+ // Initial population test
+ UpstreamLineage result =
+ upstreamLineageTemplate.applyPatch(upstreamLineage, jsonPatchBuilder.build());
+ assertEquals(
+ result.getFineGrainedLineages().get(0).getUpstreams().get(0).toString(), upstreamUrn);
+ assertEquals(
+ result.getFineGrainedLineages().get(0).getUpstreams().get(1).toString(), upstreamUrn2);
+
+ assertEquals(result.getFineGrainedLineages().get(0).getUpstreams().size(), 2);
+ }
}
diff --git a/entity-registry/src/testFixtures/java/com/linkedin/test/metadata/aspect/MockAspectRetriever.java b/entity-registry/src/testFixtures/java/com/linkedin/test/metadata/aspect/MockAspectRetriever.java
index 65705f15022b6b..98a6d59004a92a 100644
--- a/entity-registry/src/testFixtures/java/com/linkedin/test/metadata/aspect/MockAspectRetriever.java
+++ b/entity-registry/src/testFixtures/java/com/linkedin/test/metadata/aspect/MockAspectRetriever.java
@@ -5,7 +5,7 @@
import com.linkedin.data.DataMap;
import com.linkedin.data.template.RecordTemplate;
import com.linkedin.entity.Aspect;
-import com.linkedin.metadata.aspect.AspectRetriever;
+import com.linkedin.metadata.aspect.CachingAspectRetriever;
import com.linkedin.metadata.aspect.SystemAspect;
import com.linkedin.metadata.models.registry.EntityRegistry;
import com.linkedin.mxe.SystemMetadata;
@@ -22,7 +22,7 @@
import javax.annotation.Nonnull;
import org.mockito.Mockito;
-public class MockAspectRetriever implements AspectRetriever {
+public class MockAspectRetriever implements CachingAspectRetriever {
private final Map<Urn, Map<String, Aspect>> data;
private final Map<Urn, Map<String, SystemAspect>> systemData = new HashMap<>();
diff --git a/li-utils/src/main/java/com/linkedin/metadata/Constants.java b/li-utils/src/main/java/com/linkedin/metadata/Constants.java
index ff6a79108600a3..09f873ebf7bc96 100644
--- a/li-utils/src/main/java/com/linkedin/metadata/Constants.java
+++ b/li-utils/src/main/java/com/linkedin/metadata/Constants.java
@@ -409,6 +409,8 @@ public class Constants {
/** User Status */
public static final String CORP_USER_STATUS_ACTIVE = "ACTIVE";
+ public static final String CORP_USER_STATUS_SUSPENDED = "SUSPENDED";
+
/** Task Runs */
public static final String DATA_PROCESS_INSTANCE_ENTITY_NAME = "dataProcessInstance";
diff --git a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/datahub_listener.py b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/datahub_listener.py
index aa7b3108f64f1e..640991a90a1d28 100644
--- a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/datahub_listener.py
+++ b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/datahub_listener.py
@@ -9,6 +9,7 @@
import airflow
import datahub.emitter.mce_builder as builder
+from airflow.models import Variable
from airflow.models.serialized_dag import SerializedDagModel
from datahub.api.entities.datajob import DataJob
from datahub.api.entities.dataprocess.dataprocess_instance import InstanceRunResult
@@ -78,6 +79,8 @@ def hookimpl(f: _F) -> _F: # type: ignore[misc] # noqa: F811
)
_DATAHUB_CLEANUP_DAG = "Datahub_Cleanup"
+KILL_SWITCH_VARIABLE_NAME = "datahub_airflow_plugin_disable_listener"
+
def get_airflow_plugin_listener() -> Optional["DataHubListener"]:
# Using globals instead of functools.lru_cache to make testing easier.
@@ -364,6 +367,12 @@ def _extract_lineage(
redact_with_exclusions(v)
)
+ def check_kill_switch(self):
+ if Variable.get(KILL_SWITCH_VARIABLE_NAME, "false").lower() == "true":
+ logger.debug("DataHub listener disabled by kill switch")
+ return True
+ return False
+
@hookimpl
@run_in_thread
def on_task_instance_running(
@@ -372,6 +381,8 @@ def on_task_instance_running(
task_instance: "TaskInstance",
session: "Session", # This will always be QUEUED
) -> None:
+ if self.check_kill_switch():
+ return
self._set_log_level()
# This if statement mirrors the logic in https://github.com/OpenLineage/OpenLineage/pull/508.
@@ -454,6 +465,9 @@ def on_task_instance_running(
f"DataHub listener finished processing notification about task instance start for {task_instance.task_id}"
)
+ self.materialize_iolets(datajob)
+
+ def materialize_iolets(self, datajob: DataJob) -> None:
if self.config.materialize_iolets:
for outlet in datajob.outlets:
reported_time: int = int(time.time() * 1000)
@@ -541,6 +555,9 @@ def on_task_instance_finish(
def on_task_instance_success(
self, previous_state: None, task_instance: "TaskInstance", session: "Session"
) -> None:
+ if self.check_kill_switch():
+ return
+
self._set_log_level()
logger.debug(
@@ -556,6 +573,9 @@ def on_task_instance_success(
def on_task_instance_failed(
self, previous_state: None, task_instance: "TaskInstance", session: "Session"
) -> None:
+ if self.check_kill_switch():
+ return
+
self._set_log_level()
logger.debug(
@@ -696,6 +716,9 @@ def on_dag_start(self, dag_run: "DagRun") -> None:
@hookimpl
@run_in_thread
def on_dag_run_running(self, dag_run: "DagRun", msg: str) -> None:
+ if self.check_kill_switch():
+ return
+
self._set_log_level()
logger.debug(
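The listener changes above gate every hook on an Airflow Variable kill switch. A minimal sketch of how an operator might flip that switch, assuming only the variable name from this diff (the helper function names are illustrative):

from airflow.models import Variable

KILL_SWITCH_VARIABLE_NAME = "datahub_airflow_plugin_disable_listener"

def disable_datahub_listener() -> None:
    # check_kill_switch() above treats any case-insensitive "true" as disabled
    Variable.set(KILL_SWITCH_VARIABLE_NAME, "true")

def enable_datahub_listener() -> None:
    Variable.set(KILL_SWITCH_VARIABLE_NAME, "false")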
diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py
index 415871d30175f8..c6994dd6d5aa65 100644
--- a/metadata-ingestion/setup.py
+++ b/metadata-ingestion/setup.py
@@ -76,7 +76,7 @@
# now provide prebuilt wheels for most platforms, including M1 Macs and
# Linux aarch64 (e.g. Docker's linux/arm64). Installing confluent_kafka
# from source remains a pain.
- "confluent_kafka>=1.9.0",
+ "confluent_kafka[schemaregistry]>=1.9.0",
# We currently require both Avro libraries. The codegen uses avro-python3 (above)
# schema parsers at runtime for generating and reading JSON into Python objects.
# At the same time, we use Kafka's AvroSerializer, which internally relies on
@@ -101,7 +101,7 @@
# We heavily monkeypatch sqlglot.
# Prior to the patching, we originally maintained an acryl-sqlglot fork:
# https://github.com/tobymao/sqlglot/compare/main...hsheth2:sqlglot:main?expand=1
- "sqlglot[rs]==25.26.0",
+ "sqlglot[rs]==25.32.1",
"patchy==2.8.0",
}
@@ -741,7 +741,7 @@
"hive-metastore = datahub.ingestion.source.sql.hive_metastore:HiveMetastoreSource",
"json-schema = datahub.ingestion.source.schema.json_schema:JsonSchemaSource",
"kafka = datahub.ingestion.source.kafka.kafka:KafkaSource",
- "kafka-connect = datahub.ingestion.source.kafka.kafka_connect:KafkaConnectSource",
+ "kafka-connect = datahub.ingestion.source.kafka_connect.kafka_connect:KafkaConnectSource",
"ldap = datahub.ingestion.source.ldap:LDAPSource",
"looker = datahub.ingestion.source.looker.looker_source:LookerDashboardSource",
"lookml = datahub.ingestion.source.looker.lookml_source:LookMLSource",
diff --git a/metadata-ingestion/sink_docs/metadata-file.md b/metadata-ingestion/sink_docs/metadata-file.md
index 49ca3c75397af4..36c868828070ed 100644
--- a/metadata-ingestion/sink_docs/metadata-file.md
+++ b/metadata-ingestion/sink_docs/metadata-file.md
@@ -25,7 +25,7 @@ source:
sink:
type: file
config:
- path: ./path/to/mce/file.json
+ filename: ./path/to/mce/file.json
```
## Config details
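The doc fix above corrects the file sink's config key from path to filename. A short sketch of the equivalent programmatic recipe, assuming the standard datahub Pipeline API; the demo-data source and output path are placeholders:

from datahub.ingestion.run.pipeline import Pipeline

pipeline = Pipeline.create(
    {
        "source": {"type": "demo-data", "config": {}},  # placeholder source
        "sink": {
            "type": "file",
            # "filename" (not "path") is the file sink's config key
            "config": {"filename": "./path/to/mce/file.json"},
        },
    }
)
pipeline.run()
pipeline.raise_from_status()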
diff --git a/metadata-ingestion/src/datahub/api/entities/structuredproperties/structuredproperties.py b/metadata-ingestion/src/datahub/api/entities/structuredproperties/structuredproperties.py
index fd3fe7ca098ecb..619f69b016262d 100644
--- a/metadata-ingestion/src/datahub/api/entities/structuredproperties/structuredproperties.py
+++ b/metadata-ingestion/src/datahub/api/entities/structuredproperties/structuredproperties.py
@@ -9,27 +9,18 @@
from datahub.configuration.common import ConfigModel
from datahub.emitter.mcp import MetadataChangeProposalWrapper
-from datahub.ingestion.api.global_context import get_graph_context, set_graph_context
-from datahub.ingestion.graph.client import DataHubGraph, get_default_graph
+from datahub.ingestion.graph.client import DataHubGraph
from datahub.metadata.schema_classes import (
PropertyValueClass,
StructuredPropertyDefinitionClass,
)
-from datahub.utilities.urns.urn import Urn
+from datahub.metadata.urns import DataTypeUrn, StructuredPropertyUrn, Urn
+from datahub.utilities.urns._urn_base import URN_TYPES
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
-class StructuredPropertiesConfig:
- """Configuration class to hold the graph client"""
-
- @classmethod
- def get_graph_required(cls) -> DataHubGraph:
- """Get the current graph, falling back to default if none set"""
- return get_graph_context() or get_default_graph()
-
-
class AllowedTypes(Enum):
STRING = "string"
RICH_TEXT = "rich_text"
@@ -51,29 +42,28 @@ class AllowedValue(ConfigModel):
description: Optional[str] = None
-VALID_ENTITY_TYPES_PREFIX_STRING = ", ".join(
- [
- f"urn:li:entityType:datahub.{x}"
- for x in ["dataset", "dashboard", "dataFlow", "schemaField"]
- ]
-)
-VALID_ENTITY_TYPES_STRING = f"Valid entity type urns are {VALID_ENTITY_TYPES_PREFIX_STRING}, etc... Ensure that the entity type is valid."
+VALID_ENTITY_TYPE_URNS = [
+ Urn.make_entity_type_urn(entity_type) for entity_type in URN_TYPES.keys()
+]
+_VALID_ENTITY_TYPES_STRING = f"Valid entity type urns are {', '.join(VALID_ENTITY_TYPE_URNS)}, etc... Ensure that the entity type is valid."
+
+
+def _validate_entity_type_urn(v: str) -> str:
+ urn = Urn.make_entity_type_urn(v)
+ if urn not in VALID_ENTITY_TYPE_URNS:
+ raise ValueError(
+ f"Input {v} is not a valid entity type urn. {_VALID_ENTITY_TYPES_STRING}"
+ )
+ v = str(urn)
+ return v
class TypeQualifierAllowedTypes(ConfigModel):
allowed_types: List[str]
- @validator("allowed_types", each_item=True)
- def validate_allowed_types(cls, v):
- if v:
- graph = StructuredPropertiesConfig.get_graph_required()
- validated_urn = Urn.make_entity_type_urn(v)
- if not graph.exists(validated_urn):
- raise ValueError(
- f"Input {v} is not a valid entity type urn. {VALID_ENTITY_TYPES_STRING}"
- )
- v = str(validated_urn)
- return v
+ _check_allowed_types = validator("allowed_types", each_item=True, allow_reuse=True)(
+ _validate_entity_type_urn
+ )
class StructuredProperties(ConfigModel):
@@ -90,22 +80,42 @@ class StructuredProperties(ConfigModel):
type_qualifier: Optional[TypeQualifierAllowedTypes] = None
immutable: Optional[bool] = False
- @validator("entity_types", each_item=True)
- def validate_entity_types(cls, v):
- if v:
- graph = StructuredPropertiesConfig.get_graph_required()
- validated_urn = Urn.make_entity_type_urn(v)
- if not graph.exists(validated_urn):
- raise ValueError(
- f"Input {v} is not a valid entity type urn. {VALID_ENTITY_TYPES_STRING}"
- )
- v = str(validated_urn)
- return v
+ _check_entity_types = validator("entity_types", each_item=True, allow_reuse=True)(
+ _validate_entity_type_urn
+ )
+
+ @validator("type")
+ def validate_type(cls, v: str) -> str:
+ # This logic is somewhat hacky, since we need to deal with
+ # 1. fully qualified urns
+ # 2. raw data types, that need to get the datahub namespace prefix
+ # While keeping the user-facing interface and error messages clean.
+
+ if not v.startswith("urn:li:") and not v.islower():
+ # Convert to lowercase if needed
+ v = v.lower()
+ logger.warning(
+ f"Structured property type should be lowercase. Updated to {v}"
+ )
+
+ urn = Urn.make_data_type_urn(v)
+
+ # Check if type is allowed
+ data_type_urn = DataTypeUrn.from_string(urn)
+ unqualified_data_type = data_type_urn.id
+ if unqualified_data_type.startswith("datahub."):
+ unqualified_data_type = unqualified_data_type[len("datahub.") :]
+ if not AllowedTypes.check_allowed_type(unqualified_data_type):
+ raise ValueError(
+ f"Type {unqualified_data_type} is not allowed. Allowed types are {AllowedTypes.values()}"
+ )
+
+ return urn
@property
def fqn(self) -> str:
assert self.urn is not None
- id = Urn.create_from_string(self.urn).get_entity_id()[0]
+ id = StructuredPropertyUrn.from_string(self.urn).id
if self.qualified_name is not None:
# ensure that qualified name and ID match
assert (
@@ -122,101 +132,90 @@ def urn_must_be_present(cls, v, values):
return v
@staticmethod
- def create(file: str, graph: Optional[DataHubGraph] = None) -> None:
- with set_graph_context(graph):
- graph = StructuredPropertiesConfig.get_graph_required()
-
- with open(file) as fp:
- structuredproperties: List[dict] = yaml.safe_load(fp)
- for structuredproperty_raw in structuredproperties:
- structuredproperty = StructuredProperties.parse_obj(
- structuredproperty_raw
- )
-
- if not structuredproperty.type.islower():
- structuredproperty.type = structuredproperty.type.lower()
- logger.warning(
- f"Structured property type should be lowercase. Updated to {structuredproperty.type}"
- )
- if not AllowedTypes.check_allowed_type(structuredproperty.type):
- raise ValueError(
- f"Type {structuredproperty.type} is not allowed. Allowed types are {AllowedTypes.values()}"
- )
- mcp = MetadataChangeProposalWrapper(
- entityUrn=structuredproperty.urn,
- aspect=StructuredPropertyDefinitionClass(
- qualifiedName=structuredproperty.fqn,
- valueType=Urn.make_data_type_urn(structuredproperty.type),
- displayName=structuredproperty.display_name,
- description=structuredproperty.description,
- entityTypes=[
- Urn.make_entity_type_urn(entity_type)
- for entity_type in structuredproperty.entity_types or []
- ],
- cardinality=structuredproperty.cardinality,
- immutable=structuredproperty.immutable,
- allowedValues=(
- [
- PropertyValueClass(
- value=v.value, description=v.description
- )
- for v in structuredproperty.allowed_values
- ]
- if structuredproperty.allowed_values
- else None
- ),
- typeQualifier=(
- {
- "allowedTypes": structuredproperty.type_qualifier.allowed_types
- }
- if structuredproperty.type_qualifier
- else None
- ),
- ),
- )
- graph.emit_mcp(mcp)
-
- logger.info(f"Created structured property {structuredproperty.urn}")
-
- @classmethod
- def from_datahub(cls, graph: DataHubGraph, urn: str) -> "StructuredProperties":
- with set_graph_context(graph):
- structured_property: Optional[
- StructuredPropertyDefinitionClass
- ] = graph.get_aspect(urn, StructuredPropertyDefinitionClass)
- if structured_property is None:
- raise Exception(
- "StructuredPropertyDefinition aspect is None. Unable to create structured property."
- )
- return StructuredProperties(
- urn=urn,
- qualified_name=structured_property.qualifiedName,
- display_name=structured_property.displayName,
- type=structured_property.valueType,
- description=structured_property.description,
- entity_types=structured_property.entityTypes,
- cardinality=structured_property.cardinality,
- allowed_values=(
+ def from_yaml(file: str) -> List["StructuredProperties"]:
+ with open(file) as fp:
+ structuredproperties: List[dict] = yaml.safe_load(fp)
+
+ result: List[StructuredProperties] = []
+ for structuredproperty_raw in structuredproperties:
+ result.append(StructuredProperties.parse_obj(structuredproperty_raw))
+ return result
+
+ def generate_mcps(self) -> List[MetadataChangeProposalWrapper]:
+ mcp = MetadataChangeProposalWrapper(
+ entityUrn=self.urn,
+ aspect=StructuredPropertyDefinitionClass(
+ qualifiedName=self.fqn,
+ valueType=Urn.make_data_type_urn(self.type),
+ displayName=self.display_name,
+ description=self.description,
+ entityTypes=[
+ Urn.make_entity_type_urn(entity_type)
+ for entity_type in self.entity_types or []
+ ],
+ cardinality=self.cardinality,
+ immutable=self.immutable,
+ allowedValues=(
[
- AllowedValue(
- value=av.value,
- description=av.description,
- )
- for av in structured_property.allowedValues or []
+ PropertyValueClass(value=v.value, description=v.description)
+ for v in self.allowed_values
]
- if structured_property.allowedValues is not None
+ if self.allowed_values
else None
),
- type_qualifier=(
- {
- "allowed_types": structured_property.typeQualifier.get(
- "allowedTypes"
- )
- }
- if structured_property.typeQualifier
+ typeQualifier=(
+ {"allowedTypes": self.type_qualifier.allowed_types}
+ if self.type_qualifier
else None
),
+ ),
+ )
+ return [mcp]
+
+ @staticmethod
+ def create(file: str, graph: DataHubGraph) -> None:
+ # TODO: Deprecate this method.
+ structuredproperties = StructuredProperties.from_yaml(file)
+ for structuredproperty in structuredproperties:
+ for mcp in structuredproperty.generate_mcps():
+ graph.emit_mcp(mcp)
+
+ logger.info(f"Created structured property {structuredproperty.urn}")
+
+ @classmethod
+ def from_datahub(cls, graph: DataHubGraph, urn: str) -> "StructuredProperties":
+ structured_property: Optional[
+ StructuredPropertyDefinitionClass
+ ] = graph.get_aspect(urn, StructuredPropertyDefinitionClass)
+ if structured_property is None:
+ raise Exception(
+ "StructuredPropertyDefinition aspect is None. Unable to create structured property."
)
+ return StructuredProperties(
+ urn=urn,
+ qualified_name=structured_property.qualifiedName,
+ display_name=structured_property.displayName,
+ type=structured_property.valueType,
+ description=structured_property.description,
+ entity_types=structured_property.entityTypes,
+ cardinality=structured_property.cardinality,
+ allowed_values=(
+ [
+ AllowedValue(
+ value=av.value,
+ description=av.description,
+ )
+ for av in structured_property.allowedValues or []
+ ]
+ if structured_property.allowedValues is not None
+ else None
+ ),
+ type_qualifier=(
+ {"allowed_types": structured_property.typeQualifier.get("allowedTypes")}
+ if structured_property.typeQualifier
+ else None
+ ),
+ )
def to_yaml(
self,
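The refactor above separates YAML parsing (from_yaml) from MCP generation (generate_mcps) so that callers own the graph connection and emission. A minimal sketch of the new flow; the YAML path and server URL are placeholders:

from datahub.api.entities.structuredproperties.structuredproperties import (
    StructuredProperties,
)
from datahub.ingestion.graph.client import DataHubGraph, DatahubClientConfig

graph = DataHubGraph(DatahubClientConfig(server="http://localhost:8080"))
for prop in StructuredProperties.from_yaml("structured_properties.yaml"):
    for mcp in prop.generate_mcps():
        graph.emit_mcp(mcp)  # emission is now the caller's responsibility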
diff --git a/metadata-ingestion/src/datahub/cli/ingest_cli.py b/metadata-ingestion/src/datahub/cli/ingest_cli.py
index 51f095751f7dd9..fcab07a1c2aaf6 100644
--- a/metadata-ingestion/src/datahub/cli/ingest_cli.py
+++ b/metadata-ingestion/src/datahub/cli/ingest_cli.py
@@ -27,6 +27,7 @@
logger = logging.getLogger(__name__)
+INGEST_SRC_TABLE_COLUMNS = ["runId", "source", "startTime", "status", "URN"]
RUNS_TABLE_COLUMNS = ["runId", "rows", "created at"]
RUN_TABLE_COLUMNS = ["urn", "aspect name", "created at"]
@@ -437,6 +438,115 @@ def mcps(path: str) -> None:
sys.exit(ret)
+@ingest.command()
+@click.argument("page_offset", type=int, default=0)
+@click.argument("page_size", type=int, default=100)
+@click.option("--urn", type=str, default=None, help="Filter by ingestion source URN.")
+@click.option(
+ "--source", type=str, default=None, help="Filter by ingestion source name."
+)
+@upgrade.check_upgrade
+@telemetry.with_telemetry()
+def list_source_runs(page_offset: int, page_size: int, urn: str, source: str) -> None:
+ """List ingestion source runs with their details, optionally filtered by URN or source."""
+
+ query = """
+ query listIngestionRuns($input: ListIngestionSourcesInput!) {
+ listIngestionSources(input: $input) {
+ ingestionSources {
+ urn
+ name
+ executions {
+ executionRequests {
+ id
+ result {
+ startTimeMs
+ status
+ }
+ }
+ }
+ }
+ }
+ }
+ """
+
+ # filter by urn and/or source using CONTAINS
+ filters = []
+ if urn:
+ filters.append({"field": "urn", "values": [urn], "condition": "CONTAIN"})
+ if source:
+ filters.append({"field": "name", "values": [source], "condition": "CONTAIN"})
+
+ variables = {
+ "input": {
+ "start": page_offset,
+ "count": page_size,
+ "filters": filters,
+ }
+ }
+
+ client = get_default_graph()
+ session = client._session
+ gms_host = client.config.server
+
+ url = f"{gms_host}/api/graphql"
+ try:
+ response = session.post(url, json={"query": query, "variables": variables})
+ response.raise_for_status()
+ except Exception as e:
+ click.echo(f"Error fetching data: {str(e)}")
+ return
+
+ try:
+ data = response.json()
+ except ValueError:
+ click.echo("Failed to parse JSON response from server.")
+ return
+
+ if not data:
+ click.echo("No response received from the server.")
+ return
+
+ # when urn or source filter does not match, exit gracefully
+ if (
+ not isinstance(data.get("data"), dict)
+ or "listIngestionSources" not in data["data"]
+ ):
+ click.echo("No matching ingestion sources found. Please check your filters.")
+ return
+
+ ingestion_sources = data["data"]["listIngestionSources"]["ingestionSources"]
+ if not ingestion_sources:
+ click.echo("No ingestion sources or executions found.")
+ return
+
+ rows = []
+ for ingestion_source in ingestion_sources:
+ urn = ingestion_source.get("urn", "N/A")
+ name = ingestion_source.get("name", "N/A")
+
+ executions = ingestion_source.get("executions", {}).get("executionRequests", [])
+ for execution in executions:
+ execution_id = execution.get("id", "N/A")
+ start_time = execution.get("result", {}).get("startTimeMs", "N/A")
+ start_time = (
+ datetime.fromtimestamp(start_time / 1000).strftime("%Y-%m-%d %H:%M:%S")
+ if start_time != "N/A"
+ else "N/A"
+ )
+ status = execution.get("result", {}).get("status", "N/A")
+
+ rows.append([execution_id, name, start_time, status, urn])
+
+ click.echo(
+ tabulate(
+ rows,
+ headers=INGEST_SRC_TABLE_COLUMNS,
+ tablefmt="grid",
+ )
+ )
+
+
@ingest.command()
@click.argument("page_offset", type=int, default=0)
@click.argument("page_size", type=int, default=100)
diff --git a/metadata-ingestion/src/datahub/cli/specific/structuredproperties_cli.py b/metadata-ingestion/src/datahub/cli/specific/structuredproperties_cli.py
index 4162d44b9b0ea8..42285cf13a5ddc 100644
--- a/metadata-ingestion/src/datahub/cli/specific/structuredproperties_cli.py
+++ b/metadata-ingestion/src/datahub/cli/specific/structuredproperties_cli.py
@@ -31,7 +31,8 @@ def properties() -> None:
def upsert(file: Path) -> None:
"""Upsert structured properties in DataHub."""
- StructuredProperties.create(str(file))
+ with get_default_graph() as graph:
+ StructuredProperties.create(str(file), graph)
@properties.command(
diff --git a/metadata-ingestion/src/datahub/configuration/git.py b/metadata-ingestion/src/datahub/configuration/git.py
index d237cd9ddd306c..e7e9bfd43adca5 100644
--- a/metadata-ingestion/src/datahub/configuration/git.py
+++ b/metadata-ingestion/src/datahub/configuration/git.py
@@ -24,7 +24,11 @@ class GitReference(ConfigModel):
"main",
description="Branch on which your files live by default. Typically main or master. This can also be a commit hash.",
)
-
+ url_subdir: Optional[str] = Field(
+ default=None,
+ description="Prefix to prepend when generating URLs for files - useful when files are in a subdirectory. "
+ "Only affects URL generation, not git operations.",
+ )
url_template: Optional[str] = Field(
None,
description=f"Template for generating a URL to a file in the repo e.g. '{_GITHUB_URL_TEMPLATE}'. We can infer this for GitHub and GitLab repos, and it is otherwise required."
@@ -68,6 +72,8 @@ def infer_url_template(cls, url_template: Optional[str], values: dict) -> str:
def get_url_for_file_path(self, file_path: str) -> str:
assert self.url_template
+ if self.url_subdir:
+ file_path = f"{self.url_subdir}/{file_path}"
return self.url_template.format(
repo_url=self.repo, branch=self.branch, file_path=file_path
)
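The new url_subdir option above only affects URL generation. A tiny sketch of its effect, assuming a GitHub-style repo whose URL template GitReference infers on its own; the repo, branch, and paths are placeholders:

from datahub.configuration.git import GitReference

ref = GitReference(
    repo="https://github.com/acme/analytics",
    branch="main",
    url_subdir="dbt/models",
)
# The file path is prefixed with url_subdir before templating, yielding roughly
# https://github.com/acme/analytics/blob/main/dbt/models/orders.sql
print(ref.get_url_for_file_path("orders.sql"))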
diff --git a/metadata-ingestion/src/datahub/configuration/source_common.py b/metadata-ingestion/src/datahub/configuration/source_common.py
index 44c737f1bd13d4..8e41e9fb917878 100644
--- a/metadata-ingestion/src/datahub/configuration/source_common.py
+++ b/metadata-ingestion/src/datahub/configuration/source_common.py
@@ -63,3 +63,16 @@ class DatasetLineageProviderConfigBase(EnvConfigMixin):
default=None,
description="A holder for platform -> platform_instance mappings to generate correct dataset urns",
)
+
+
+class PlatformDetail(ConfigModel):
+ platform_instance: Optional[str] = Field(
+ default=None,
+ description="DataHub platform instance name. To generate correct urn for upstream dataset, this should match "
+ "with platform instance name used in ingestion "
+ "recipe of other datahub sources.",
+ )
+ env: str = Field(
+ default=DEFAULT_ENV,
+ description="The environment that all assets produced by DataHub platform ingestion source belong to",
+ )
diff --git a/metadata-ingestion/src/datahub/ingestion/api/source.py b/metadata-ingestion/src/datahub/ingestion/api/source.py
index c80da04e481a9f..c3638635b19aac 100644
--- a/metadata-ingestion/src/datahub/ingestion/api/source.py
+++ b/metadata-ingestion/src/datahub/ingestion/api/source.py
@@ -184,6 +184,7 @@ def infos(self) -> LossyList[StructuredLogEntry]:
@dataclass
class SourceReport(Report):
+ event_not_produced_warn: bool = True
events_produced: int = 0
events_produced_per_sec: int = 0
diff --git a/metadata-ingestion/src/datahub/ingestion/api/source_helpers.py b/metadata-ingestion/src/datahub/ingestion/api/source_helpers.py
index 0c86e1cf47203f..7791ea2797be34 100644
--- a/metadata-ingestion/src/datahub/ingestion/api/source_helpers.py
+++ b/metadata-ingestion/src/datahub/ingestion/api/source_helpers.py
@@ -150,7 +150,7 @@ def auto_workunit_reporter(report: "SourceReport", stream: Iterable[T]) -> Itera
report.report_workunit(wu)
yield wu
- if report.events_produced == 0:
+ if report.event_not_produced_warn and report.events_produced == 0:
report.warning(
title="No metadata was produced by the source",
message="Please check the source configuration, filters, and permissions.",
diff --git a/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_database_reader.py b/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_database_reader.py
index faa281097de4cd..80906ca63115f5 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_database_reader.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_database_reader.py
@@ -147,6 +147,47 @@ def query(self) -> str:
version
"""
+ def execute_server_cursor(
+ self, query: str, params: Dict[str, Any]
+ ) -> Iterable[Dict[str, Any]]:
+ with self.engine.connect() as conn:
+ if self.engine.dialect.name == "postgresql":
+ with conn.begin(): # Transaction required for PostgreSQL server-side cursor
+ conn = conn.execution_options(
+ stream_results=True,
+ yield_per=self.config.database_query_batch_size,
+ )
+ result = conn.execute(query, params)
+ for row in result:
+ yield dict(row)
+ elif self.engine.dialect.name == "mysql": # MySQL
+ import MySQLdb
+
+ with contextlib.closing(
+ conn.connection.cursor(MySQLdb.cursors.SSCursor)
+ ) as cursor:
+ logger.debug(f"Using Cursor type: {cursor.__class__.__name__}")
+ cursor.execute(query, params)
+
+ columns = [desc[0] for desc in cursor.description]
+ while True:
+ rows = cursor.fetchmany(self.config.database_query_batch_size)
+ if not rows:
+ break # Use break instead of return in generator
+ for row in rows:
+ yield dict(zip(columns, row))
+ else:
+ raise ValueError(f"Unsupported dialect: {self.engine.dialect.name}")
+
+ def _get_rows(
+ self, from_createdon: datetime, stop_time: datetime
+ ) -> Iterable[Dict[str, Any]]:
+ params = {
+ "exclude_aspects": list(self.config.exclude_aspects),
+ "since_createdon": from_createdon.strftime(DATETIME_FORMAT),
+ }
+ yield from self.execute_server_cursor(self.query, params)
+
def get_aspects(
self, from_createdon: datetime, stop_time: datetime
) -> Iterable[Tuple[MetadataChangeProposalWrapper, datetime]]:
@@ -159,27 +200,6 @@ def get_aspects(
if mcp:
yield mcp, row["createdon"]
- def _get_rows(
- self, from_createdon: datetime, stop_time: datetime
- ) -> Iterable[Dict[str, Any]]:
- with self.engine.connect() as conn:
- with contextlib.closing(conn.connection.cursor()) as cursor:
- cursor.execute(
- self.query,
- {
- "exclude_aspects": list(self.config.exclude_aspects),
- "since_createdon": from_createdon.strftime(DATETIME_FORMAT),
- },
- )
-
- columns = [desc[0] for desc in cursor.description]
- while True:
- rows = cursor.fetchmany(self.config.database_query_batch_size)
- if not rows:
- return
- for row in rows:
- yield dict(zip(columns, row))
-
def get_soft_deleted_rows(self) -> Iterable[Dict[str, Any]]:
"""
Fetches all soft-deleted entities from the database.
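The reader above now streams rows through a PostgreSQL server-side cursor (SQLAlchemy stream_results/yield_per inside a transaction) or a MySQL SSCursor instead of buffering the whole result set. A standalone sketch of the PostgreSQL path, with the engine URL, query, and batch size as placeholders:

from typing import Any, Dict, Iterable

from sqlalchemy import create_engine, text

def stream_rows(
    url: str, query: str, params: Dict[str, Any], batch_size: int = 1000
) -> Iterable[Dict[str, Any]]:
    engine = create_engine(url)
    with engine.connect() as conn:
        with conn.begin():  # PostgreSQL server-side cursors require an open transaction
            streaming = conn.execution_options(stream_results=True, yield_per=batch_size)
            for row in streaming.execute(text(query), params):
                yield dict(row._mapping)  # dict(row) on older SQLAlchemy versions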
diff --git a/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_source.py b/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_source.py
index 63cea45f75864b..cb72441344088c 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_source.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_source.py
@@ -1,5 +1,5 @@
import logging
-from datetime import datetime, timezone
+from datetime import datetime, timedelta, timezone
from functools import partial
from typing import Dict, Iterable, List, Optional
@@ -26,6 +26,7 @@
StatefulIngestionSourceBase,
)
from datahub.metadata.schema_classes import ChangeTypeClass
+from datahub.utilities.progress_timer import ProgressTimer
logger = logging.getLogger(__name__)
@@ -105,11 +106,17 @@ def _get_database_workunits(
self, from_createdon: datetime, reader: DataHubDatabaseReader
) -> Iterable[MetadataWorkUnit]:
logger.info(f"Fetching database aspects starting from {from_createdon}")
+ progress = ProgressTimer(report_every=timedelta(seconds=60))
mcps = reader.get_aspects(from_createdon, self.report.stop_time)
for i, (mcp, createdon) in enumerate(mcps):
if not self.urn_pattern.allowed(str(mcp.entityUrn)):
continue
+ if progress.should_report():
+ logger.info(
+ f"Ingested {i} database aspects so far, currently at {createdon}"
+ )
+
yield mcp.as_workunit()
self.report.num_database_aspects_ingested += 1
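The loop above uses ProgressTimer to log progress at most once per report interval. A minimal sketch of that pattern, assuming should_report() is just a wall-clock rate limiter (this is not the library implementation):

from datetime import datetime, timedelta

class SimpleProgressTimer:
    def __init__(self, report_every: timedelta) -> None:
        self._report_every = report_every
        self._last_report = datetime.min

    def should_report(self) -> bool:
        now = datetime.now()
        if now - self._last_report >= self._report_every:
            self._last_report = now
            return True
        return False

progress = SimpleProgressTimer(report_every=timedelta(seconds=60))
for i in range(1_000_000):
    if progress.should_report():
        print(f"processed {i} rows so far")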
diff --git a/metadata-ingestion/src/datahub/ingestion/source/gc/datahub_gc.py b/metadata-ingestion/src/datahub/ingestion/source/gc/datahub_gc.py
index 814f65ecb45cf0..4eecbb4d9d7177 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/gc/datahub_gc.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/gc/datahub_gc.py
@@ -65,18 +65,18 @@ class DataHubGcSourceConfig(ConfigModel):
description="Sleep between truncation monitoring.",
)
- dataprocess_cleanup: Optional[DataProcessCleanupConfig] = Field(
- default=None,
+ dataprocess_cleanup: DataProcessCleanupConfig = Field(
+ default_factory=DataProcessCleanupConfig,
description="Configuration for data process cleanup",
)
- soft_deleted_entities_cleanup: Optional[SoftDeletedEntitiesCleanupConfig] = Field(
- default=None,
+ soft_deleted_entities_cleanup: SoftDeletedEntitiesCleanupConfig = Field(
+ default_factory=SoftDeletedEntitiesCleanupConfig,
description="Configuration for soft deleted entities cleanup",
)
- execution_request_cleanup: Optional[DatahubExecutionRequestCleanupConfig] = Field(
- default=None,
+ execution_request_cleanup: DatahubExecutionRequestCleanupConfig = Field(
+ default_factory=DatahubExecutionRequestCleanupConfig,
description="Configuration for execution request cleanup",
)
@@ -108,28 +108,22 @@ def __init__(self, ctx: PipelineContext, config: DataHubGcSourceConfig):
self.ctx = ctx
self.config = config
self.report = DataHubGcSourceReport()
+ self.report.event_not_produced_warn = False
self.graph = ctx.require_graph("The DataHubGc source")
- self.dataprocess_cleanup: Optional[DataProcessCleanup] = None
- self.soft_deleted_entities_cleanup: Optional[SoftDeletedEntitiesCleanup] = None
- self.execution_request_cleanup: Optional[DatahubExecutionRequestCleanup] = None
-
- if self.config.dataprocess_cleanup:
- self.dataprocess_cleanup = DataProcessCleanup(
- ctx, self.config.dataprocess_cleanup, self.report, self.config.dry_run
- )
- if self.config.soft_deleted_entities_cleanup:
- self.soft_deleted_entities_cleanup = SoftDeletedEntitiesCleanup(
- ctx,
- self.config.soft_deleted_entities_cleanup,
- self.report,
- self.config.dry_run,
- )
- if self.config.execution_request_cleanup:
- self.execution_request_cleanup = DatahubExecutionRequestCleanup(
- config=self.config.execution_request_cleanup,
- graph=self.graph,
- report=self.report,
- )
+ self.dataprocess_cleanup = DataProcessCleanup(
+ ctx, self.config.dataprocess_cleanup, self.report, self.config.dry_run
+ )
+ self.soft_deleted_entities_cleanup = SoftDeletedEntitiesCleanup(
+ ctx,
+ self.config.soft_deleted_entities_cleanup,
+ self.report,
+ self.config.dry_run,
+ )
+ self.execution_request_cleanup = DatahubExecutionRequestCleanup(
+ config=self.config.execution_request_cleanup,
+ graph=self.graph,
+ report=self.report,
+ )
@classmethod
def create(cls, config_dict, ctx):
@@ -153,19 +147,19 @@ def get_workunits_internal(
self.truncate_indices()
except Exception as e:
self.report.failure("While trying to truncate indices ", exc=e)
- if self.soft_deleted_entities_cleanup:
+ if self.config.soft_deleted_entities_cleanup.enabled:
try:
self.soft_deleted_entities_cleanup.cleanup_soft_deleted_entities()
except Exception as e:
self.report.failure(
"While trying to cleanup soft deleted entities ", exc=e
)
- if self.execution_request_cleanup:
+ if self.config.execution_request_cleanup.enabled:
try:
self.execution_request_cleanup.run()
except Exception as e:
self.report.failure("While trying to cleanup execution request ", exc=e)
- if self.dataprocess_cleanup:
+ if self.config.dataprocess_cleanup.enabled:
try:
yield from self.dataprocess_cleanup.get_workunits_internal()
except Exception as e:
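With the change above, every cleanup sub-config is always constructed and each task is toggled by its own enabled flag rather than by the presence or absence of its config block. A sketch of what a datahub-gc recipe could look like after this change; the specific values are illustrative:

gc_recipe = {
    "source": {
        "type": "datahub-gc",
        "config": {
            "dry_run": True,
            "dataprocess_cleanup": {"enabled": True, "retention_days": 10},
            "soft_deleted_entities_cleanup": {"enabled": False},
            "execution_request_cleanup": {"enabled": True},
        },
    },
    "sink": {"type": "console"},
}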
diff --git a/metadata-ingestion/src/datahub/ingestion/source/gc/dataprocess_cleanup.py b/metadata-ingestion/src/datahub/ingestion/source/gc/dataprocess_cleanup.py
index 8aacf13cdb00fb..6d16aaab2d7980 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/gc/dataprocess_cleanup.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/gc/dataprocess_cleanup.py
@@ -98,6 +98,9 @@
class DataProcessCleanupConfig(ConfigModel):
+ enabled: bool = Field(
+ default=True, description="Whether to do data process cleanup."
+ )
retention_days: Optional[int] = Field(
10,
description="Number of days to retain metadata in DataHub",
@@ -371,17 +374,26 @@ def get_data_flows(self) -> Iterable[DataFlowEntity]:
previous_scroll_id: Optional[str] = None
while True:
- result = self.ctx.graph.execute_graphql(
- DATAFLOW_QUERY,
- {
- "query": "*",
- "scrollId": scroll_id if scroll_id else None,
- "batchSize": self.config.batch_size,
- },
- )
+ result = None
+ try:
+ result = self.ctx.graph.execute_graphql(
+ DATAFLOW_QUERY,
+ {
+ "query": "*",
+ "scrollId": scroll_id if scroll_id else None,
+ "batchSize": self.config.batch_size,
+ },
+ )
+ except Exception as e:
+ self.report.failure(
+ f"While trying to get dataflows with {scroll_id}", exc=e
+ )
+ break
+
scrollAcrossEntities = result.get("scrollAcrossEntities")
if not scrollAcrossEntities:
raise ValueError("Missing scrollAcrossEntities in response")
+ logger.info(f"Got {scrollAcrossEntities.get('count')} DataFlow entities")
scroll_id = scrollAcrossEntities.get("nextScrollId")
for flow in scrollAcrossEntities.get("searchResults"):
@@ -398,6 +410,8 @@ def get_data_flows(self) -> Iterable[DataFlowEntity]:
previous_scroll_id = scroll_id
def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
+ if not self.config.enabled:
+ return []
assert self.ctx.graph
dataFlows: Dict[str, DataFlowEntity] = {}
@@ -411,14 +425,20 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
deleted_jobs: int = 0
while True:
- result = self.ctx.graph.execute_graphql(
- DATAJOB_QUERY,
- {
- "query": "*",
- "scrollId": scroll_id if scroll_id else None,
- "batchSize": self.config.batch_size,
- },
- )
+ try:
+ result = self.ctx.graph.execute_graphql(
+ DATAJOB_QUERY,
+ {
+ "query": "*",
+ "scrollId": scroll_id if scroll_id else None,
+ "batchSize": self.config.batch_size,
+ },
+ )
+ except Exception as e:
+ self.report.failure(
+ f"While trying to get data jobs with {scroll_id}", exc=e
+ )
+ break
scrollAcrossEntities = result.get("scrollAcrossEntities")
if not scrollAcrossEntities:
raise ValueError("Missing scrollAcrossEntities in response")
diff --git a/metadata-ingestion/src/datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py b/metadata-ingestion/src/datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py
index bb4ab753543b7b..93f004ab675edc 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py
@@ -20,6 +20,9 @@
class SoftDeletedEntitiesCleanupConfig(ConfigModel):
+ enabled: bool = Field(
+ default=True, description="Whether to do soft deletion cleanup."
+ )
retention_days: Optional[int] = Field(
10,
description="Number of days to retain metadata in DataHub",
@@ -156,6 +159,8 @@ def delete_soft_deleted_entity(self, urn: str) -> None:
self.delete_entity(urn)
def cleanup_soft_deleted_entities(self) -> None:
+ if not self.config.enabled:
+ return
assert self.ctx.graph
start_time = time.time()
diff --git a/metadata-ingestion/src/datahub/ingestion/source/kafka/kafka_connect.py b/metadata-ingestion/src/datahub/ingestion/source/kafka/kafka_connect.py
deleted file mode 100644
index 23a99ccb310e13..00000000000000
--- a/metadata-ingestion/src/datahub/ingestion/source/kafka/kafka_connect.py
+++ /dev/null
@@ -1,1468 +0,0 @@
-import logging
-import re
-from dataclasses import dataclass, field
-from typing import Dict, Iterable, List, Optional, Tuple
-
-import jpype
-import jpype.imports
-import requests
-from pydantic.fields import Field
-from sqlalchemy.engine.url import make_url
-
-import datahub.emitter.mce_builder as builder
-import datahub.metadata.schema_classes as models
-from datahub.configuration.common import AllowDenyPattern, ConfigModel
-from datahub.configuration.source_common import (
- DatasetLineageProviderConfigBase,
- PlatformInstanceConfigMixin,
-)
-from datahub.emitter.mcp import MetadataChangeProposalWrapper
-from datahub.ingestion.api.common import PipelineContext
-from datahub.ingestion.api.decorators import (
- SourceCapability,
- SupportStatus,
- capability,
- config_class,
- platform_name,
- support_status,
-)
-from datahub.ingestion.api.source import MetadataWorkUnitProcessor, Source
-from datahub.ingestion.api.workunit import MetadataWorkUnit
-from datahub.ingestion.source.sql.sqlalchemy_uri_mapper import (
- get_platform_from_sqlalchemy_uri,
-)
-from datahub.ingestion.source.state.stale_entity_removal_handler import (
- StaleEntityRemovalHandler,
- StaleEntityRemovalSourceReport,
- StatefulStaleMetadataRemovalConfig,
-)
-from datahub.ingestion.source.state.stateful_ingestion_base import (
- StatefulIngestionConfigBase,
- StatefulIngestionSourceBase,
-)
-
-logger = logging.getLogger(__name__)
-
-KAFKA = "kafka"
-SOURCE = "source"
-SINK = "sink"
-CONNECTOR_CLASS = "connector.class"
-
-
-class ProvidedConfig(ConfigModel):
- provider: str
- path_key: str
- value: str
-
-
-class GenericConnectorConfig(ConfigModel):
- connector_name: str
- source_dataset: str
- source_platform: str
-
-
-class KafkaConnectSourceConfig(
- PlatformInstanceConfigMixin,
- DatasetLineageProviderConfigBase,
- StatefulIngestionConfigBase,
-):
- # See the Connect REST Interface for details
- # https://docs.confluent.io/platform/current/connect/references/restapi.html#
- connect_uri: str = Field(
- default="http://localhost:8083/", description="URI to connect to."
- )
- username: Optional[str] = Field(default=None, description="Kafka Connect username.")
- password: Optional[str] = Field(default=None, description="Kafka Connect password.")
- cluster_name: Optional[str] = Field(
- default="connect-cluster", description="Cluster to ingest from."
- )
- # convert lineage dataset's urns to lowercase
- convert_lineage_urns_to_lowercase: bool = Field(
- default=False,
- description="Whether to convert the urns of ingested lineage dataset to lowercase",
- )
- connector_patterns: AllowDenyPattern = Field(
- default=AllowDenyPattern.allow_all(),
- description="regex patterns for connectors to filter for ingestion.",
- )
- provided_configs: Optional[List[ProvidedConfig]] = Field(
- default=None, description="Provided Configurations"
- )
- connect_to_platform_map: Optional[Dict[str, Dict[str, str]]] = Field(
- default=None,
- description='Platform instance mapping when multiple instances for a platform is available. Entry for a platform should be in either `platform_instance_map` or `connect_to_platform_map`. e.g.`connect_to_platform_map: { "postgres-connector-finance-db": "postgres": "core_finance_instance" }`',
- )
- platform_instance_map: Optional[Dict[str, str]] = Field(
- default=None,
- description='Platform instance mapping to use when constructing URNs. e.g.`platform_instance_map: { "hive": "warehouse" }`',
- )
- generic_connectors: List[GenericConnectorConfig] = Field(
- default=[],
- description="Provide lineage graph for sources connectors other than Confluent JDBC Source Connector, Debezium Source Connector, and Mongo Source Connector",
- )
-
- stateful_ingestion: Optional[StatefulStaleMetadataRemovalConfig] = None
-
-
-@dataclass
-class KafkaConnectSourceReport(StaleEntityRemovalSourceReport):
- connectors_scanned: int = 0
- filtered: List[str] = field(default_factory=list)
-
- def report_connector_scanned(self, connector: str) -> None:
- self.connectors_scanned += 1
-
- def report_dropped(self, connector: str) -> None:
- self.filtered.append(connector)
-
-
-@dataclass
-class KafkaConnectLineage:
- """Class to store Kafka Connect lineage mapping, Each instance is potential DataJob"""
-
- source_platform: str
- target_dataset: str
- target_platform: str
- job_property_bag: Optional[Dict[str, str]] = None
- source_dataset: Optional[str] = None
-
-
-@dataclass
-class ConnectorManifest:
- """Each instance is potential DataFlow"""
-
- name: str
- type: str
- config: Dict
- tasks: Dict
- url: Optional[str] = None
- flow_property_bag: Optional[Dict[str, str]] = None
- lineages: List[KafkaConnectLineage] = field(default_factory=list)
- topic_names: Iterable[str] = field(default_factory=list)
-
-
-def remove_prefix(text: str, prefix: str) -> str:
- if text.startswith(prefix):
- index = len(prefix)
- return text[index:]
- return text
-
-
-def unquote(
- string: str, leading_quote: str = '"', trailing_quote: Optional[str] = None
-) -> str:
- """
- If string starts and ends with a quote, unquote it
- """
- trailing_quote = trailing_quote if trailing_quote else leading_quote
- if string.startswith(leading_quote) and string.endswith(trailing_quote):
- string = string[1:-1]
- return string
-
-
-def get_dataset_name(
- database_name: Optional[str],
- source_table: str,
-) -> str:
- if database_name:
- dataset_name = database_name + "." + source_table
- else:
- dataset_name = source_table
-
- return dataset_name
-
-
-def get_platform_instance(
- config: KafkaConnectSourceConfig, connector_name: str, platform: str
-) -> Optional[str]:
- instance_name = None
- if (
- config.connect_to_platform_map
- and config.connect_to_platform_map.get(connector_name)
- and config.connect_to_platform_map[connector_name].get(platform)
- ):
- instance_name = config.connect_to_platform_map[connector_name][platform]
- if config.platform_instance_map and config.platform_instance_map.get(platform):
- logger.warning(
- f"Same source platform {platform} configured in both platform_instance_map and connect_to_platform_map."
- "Will prefer connector specific platform instance from connect_to_platform_map."
- )
- elif config.platform_instance_map and config.platform_instance_map.get(platform):
- instance_name = config.platform_instance_map[platform]
- logger.info(
- f"Instance name assigned is: {instance_name} for Connector Name {connector_name} and platform {platform}"
- )
- return instance_name
-
-
-@dataclass
-class ConfluentJDBCSourceConnector:
- connector_manifest: ConnectorManifest
- report: KafkaConnectSourceReport
-
- def __init__(
- self,
- connector_manifest: ConnectorManifest,
- config: KafkaConnectSourceConfig,
- report: KafkaConnectSourceReport,
- ) -> None:
- self.connector_manifest = connector_manifest
- self.config = config
- self.report = report
- self._extract_lineages()
-
- REGEXROUTER = "org.apache.kafka.connect.transforms.RegexRouter"
- KNOWN_TOPICROUTING_TRANSFORMS = [REGEXROUTER]
- # https://kafka.apache.org/documentation/#connect_included_transformation
- KAFKA_NONTOPICROUTING_TRANSFORMS = [
- "InsertField",
- "InsertField$Key",
- "InsertField$Value",
- "ReplaceField",
- "ReplaceField$Key",
- "ReplaceField$Value",
- "MaskField",
- "MaskField$Key",
- "MaskField$Value",
- "ValueToKey",
- "ValueToKey$Key",
- "ValueToKey$Value",
- "HoistField",
- "HoistField$Key",
- "HoistField$Value",
- "ExtractField",
- "ExtractField$Key",
- "ExtractField$Value",
- "SetSchemaMetadata",
- "SetSchemaMetadata$Key",
- "SetSchemaMetadata$Value",
- "Flatten",
- "Flatten$Key",
- "Flatten$Value",
- "Cast",
- "Cast$Key",
- "Cast$Value",
- "HeadersFrom",
- "HeadersFrom$Key",
- "HeadersFrom$Value",
- "TimestampConverter",
- "Filter",
- "InsertHeader",
- "DropHeaders",
- ]
- # https://docs.confluent.io/platform/current/connect/transforms/overview.html
- CONFLUENT_NONTOPICROUTING_TRANSFORMS = [
- "Drop",
- "Drop$Key",
- "Drop$Value",
- "Filter",
- "Filter$Key",
- "Filter$Value",
- "TombstoneHandler",
- ]
- KNOWN_NONTOPICROUTING_TRANSFORMS = (
- KAFKA_NONTOPICROUTING_TRANSFORMS
- + [
- f"org.apache.kafka.connect.transforms.{t}"
- for t in KAFKA_NONTOPICROUTING_TRANSFORMS
- ]
- + CONFLUENT_NONTOPICROUTING_TRANSFORMS
- + [
- f"io.confluent.connect.transforms.{t}"
- for t in CONFLUENT_NONTOPICROUTING_TRANSFORMS
- ]
- )
-
- @dataclass
- class JdbcParser:
- db_connection_url: str
- source_platform: str
- database_name: str
- topic_prefix: str
- query: str
- transforms: list
-
- def get_parser(
- self,
- connector_manifest: ConnectorManifest,
- ) -> JdbcParser:
- url = remove_prefix(
- str(connector_manifest.config.get("connection.url")), "jdbc:"
- )
- url_instance = make_url(url)
- source_platform = get_platform_from_sqlalchemy_uri(str(url_instance))
- database_name = url_instance.database
- assert database_name
- db_connection_url = f"{url_instance.drivername}://{url_instance.host}:{url_instance.port}/{database_name}"
-
- topic_prefix = self.connector_manifest.config.get("topic.prefix", None)
-
- query = self.connector_manifest.config.get("query", None)
-
- transform_names = (
- self.connector_manifest.config.get("transforms", "").split(",")
- if self.connector_manifest.config.get("transforms")
- else []
- )
-
- transforms = []
- for name in transform_names:
- transform = {"name": name}
- transforms.append(transform)
- for key in self.connector_manifest.config.keys():
- if key.startswith(f"transforms.{name}."):
- transform[
- key.replace(f"transforms.{name}.", "")
- ] = self.connector_manifest.config[key]
-
- return self.JdbcParser(
- db_connection_url,
- source_platform,
- database_name,
- topic_prefix,
- query,
- transforms,
- )
-
- def default_get_lineages(
- self,
- topic_prefix: str,
- database_name: str,
- source_platform: str,
- topic_names: Optional[Iterable[str]] = None,
- include_source_dataset: bool = True,
- ) -> List[KafkaConnectLineage]:
- lineages: List[KafkaConnectLineage] = []
- if not topic_names:
- topic_names = self.connector_manifest.topic_names
- table_name_tuples: List[Tuple] = self.get_table_names()
- for topic in topic_names:
- # All good for NO_TRANSFORM or (SINGLE_TRANSFORM and KNOWN_NONTOPICROUTING_TRANSFORM) or (not SINGLE_TRANSFORM and all(KNOWN_NONTOPICROUTING_TRANSFORM))
- source_table: str = (
- remove_prefix(topic, topic_prefix) if topic_prefix else topic
- )
- # include schema name for three-level hierarchies
- if has_three_level_hierarchy(source_platform):
- table_name_tuple: Tuple = next(
- iter([t for t in table_name_tuples if t and t[-1] == source_table]),
- (),
- )
- if len(table_name_tuple) > 1:
- source_table = f"{table_name_tuple[-2]}.{source_table}"
- else:
- include_source_dataset = False
- self.report.warning(
- "Could not find schema for table"
- f"{self.connector_manifest.name} : {source_table}",
- )
- dataset_name: str = get_dataset_name(database_name, source_table)
- lineage = KafkaConnectLineage(
- source_dataset=dataset_name if include_source_dataset else None,
- source_platform=source_platform,
- target_dataset=topic,
- target_platform=KAFKA,
- )
- lineages.append(lineage)
- return lineages
-
- def get_table_names(self) -> List[Tuple]:
- sep: str = "."
- leading_quote_char: str = '"'
- trailing_quote_char: str = leading_quote_char
-
- table_ids: List[str] = []
- if self.connector_manifest.tasks:
- table_ids = (
- ",".join(
- [
- task["config"].get("tables")
- for task in self.connector_manifest.tasks
- ]
- )
- ).split(",")
- quote_method = self.connector_manifest.config.get(
- "quote.sql.identifiers", "always"
- )
- if (
- quote_method == "always"
- and table_ids
- and table_ids[0]
- and table_ids[-1]
- ):
- leading_quote_char = table_ids[0][0]
- trailing_quote_char = table_ids[-1][-1]
- # This will only work for single character quotes
- elif self.connector_manifest.config.get("table.whitelist"):
- table_ids = self.connector_manifest.config.get("table.whitelist").split(",") # type: ignore
-
- # List of Tuple containing (schema, table)
- tables: List[Tuple] = [
- (
- (
- unquote(
- table_id.split(sep)[-2], leading_quote_char, trailing_quote_char
- )
- if len(table_id.split(sep)) > 1
- else ""
- ),
- unquote(
- table_id.split(sep)[-1], leading_quote_char, trailing_quote_char
- ),
- )
- for table_id in table_ids
- ]
- return tables
-
- def _extract_lineages(self):
- lineages: List[KafkaConnectLineage] = list()
- parser = self.get_parser(self.connector_manifest)
- source_platform = parser.source_platform
- database_name = parser.database_name
- query = parser.query
- topic_prefix = parser.topic_prefix
- transforms = parser.transforms
- self.connector_manifest.flow_property_bag = self.connector_manifest.config
-
- # Mask/Remove properties that may reveal credentials
- self.connector_manifest.flow_property_bag[
- "connection.url"
- ] = parser.db_connection_url
- if "connection.password" in self.connector_manifest.flow_property_bag:
- del self.connector_manifest.flow_property_bag["connection.password"]
- if "connection.user" in self.connector_manifest.flow_property_bag:
- del self.connector_manifest.flow_property_bag["connection.user"]
-
- logging.debug(
- f"Extracting source platform: {source_platform} and database name: {database_name} from connection url "
- )
-
- if not self.connector_manifest.topic_names:
- self.connector_manifest.lineages = lineages
- return
-
- if query:
- # Lineage source_table can be extracted by parsing query
- for topic in self.connector_manifest.topic_names:
- # default method - as per earlier implementation
- dataset_name: str = get_dataset_name(database_name, topic)
-
- lineage = KafkaConnectLineage(
- source_dataset=None,
- source_platform=source_platform,
- target_dataset=topic,
- target_platform=KAFKA,
- )
- lineages.append(lineage)
- self.report.warning(
- "Could not find input dataset, the connector has query configuration set",
- self.connector_manifest.name,
- )
- self.connector_manifest.lineages = lineages
- return
-
- SINGLE_TRANSFORM = len(transforms) == 1
- NO_TRANSFORM = len(transforms) == 0
- UNKNOWN_TRANSFORM = any(
- [
- transform["type"]
- not in self.KNOWN_TOPICROUTING_TRANSFORMS
- + self.KNOWN_NONTOPICROUTING_TRANSFORMS
- for transform in transforms
- ]
- )
- ALL_TRANSFORMS_NON_TOPICROUTING = all(
- [
- transform["type"] in self.KNOWN_NONTOPICROUTING_TRANSFORMS
- for transform in transforms
- ]
- )
-
- if NO_TRANSFORM or ALL_TRANSFORMS_NON_TOPICROUTING:
- self.connector_manifest.lineages = self.default_get_lineages(
- database_name=database_name,
- source_platform=source_platform,
- topic_prefix=topic_prefix,
- )
- return
-
- if SINGLE_TRANSFORM and transforms[0]["type"] == self.REGEXROUTER:
- tables = self.get_table_names()
- topic_names = list(self.connector_manifest.topic_names)
-
- from java.util.regex import Pattern
-
- for table in tables:
- source_table: str = table[-1]
- topic = topic_prefix + source_table if topic_prefix else source_table
-
- transform_regex = Pattern.compile(transforms[0]["regex"])
- transform_replacement = transforms[0]["replacement"]
-
- matcher = transform_regex.matcher(topic)
- if matcher.matches():
- topic = str(matcher.replaceFirst(transform_replacement))
-
- # Additional check to confirm that the topic present
- # in connector topics
-
- if topic in self.connector_manifest.topic_names:
- # include schema name for three-level hierarchies
- if has_three_level_hierarchy(source_platform) and len(table) > 1:
- source_table = f"{table[-2]}.{table[-1]}"
-
- dataset_name = get_dataset_name(database_name, source_table)
-
- lineage = KafkaConnectLineage(
- source_dataset=dataset_name,
- source_platform=source_platform,
- target_dataset=topic,
- target_platform=KAFKA,
- )
- topic_names.remove(topic)
- lineages.append(lineage)
-
- if topic_names:
- lineages.extend(
- self.default_get_lineages(
- database_name=database_name,
- source_platform=source_platform,
- topic_prefix=topic_prefix,
- topic_names=topic_names,
- include_source_dataset=False,
- )
- )
- self.report.warning(
- "Could not find input dataset for connector topics",
- f"{self.connector_manifest.name} : {topic_names}",
- )
- self.connector_manifest.lineages = lineages
- return
- else:
- include_source_dataset = True
- if SINGLE_TRANSFORM and UNKNOWN_TRANSFORM:
- self.report.warning(
- "Could not find input dataset, connector has unknown transform",
- f"{self.connector_manifest.name} : {transforms[0]['type']}",
- )
- include_source_dataset = False
- if not SINGLE_TRANSFORM and UNKNOWN_TRANSFORM:
- self.report.warning(
- "Could not find input dataset, connector has one or more unknown transforms",
- self.connector_manifest.name,
- )
- include_source_dataset = False
- lineages = self.default_get_lineages(
- database_name=database_name,
- source_platform=source_platform,
- topic_prefix=topic_prefix,
- include_source_dataset=include_source_dataset,
- )
- self.connector_manifest.lineages = lineages
- return
-
-
-@dataclass
-class MongoSourceConnector:
- # https://www.mongodb.com/docs/kafka-connector/current/source-connector/
-
- connector_manifest: ConnectorManifest
-
- def __init__(
- self, connector_manifest: ConnectorManifest, config: KafkaConnectSourceConfig
- ) -> None:
- self.connector_manifest = connector_manifest
- self.config = config
- self._extract_lineages()
-
- @dataclass
- class MongoSourceParser:
- db_connection_url: Optional[str]
- source_platform: str
- database_name: Optional[str]
- topic_prefix: Optional[str]
- transforms: List[str]
-
- def get_parser(
- self,
- connector_manifest: ConnectorManifest,
- ) -> MongoSourceParser:
- parser = self.MongoSourceParser(
- db_connection_url=connector_manifest.config.get("connection.uri"),
- source_platform="mongodb",
- database_name=connector_manifest.config.get("database"),
- topic_prefix=connector_manifest.config.get("topic_prefix"),
- transforms=(
- connector_manifest.config["transforms"].split(",")
- if "transforms" in connector_manifest.config
- else []
- ),
- )
-
- return parser
-
- def _extract_lineages(self):
- lineages: List[KafkaConnectLineage] = list()
- parser = self.get_parser(self.connector_manifest)
- source_platform = parser.source_platform
- topic_naming_pattern = r"mongodb\.(\w+)\.(\w+)"
-
- if not self.connector_manifest.topic_names:
- return lineages
-
- for topic in self.connector_manifest.topic_names:
- found = re.search(re.compile(topic_naming_pattern), topic)
-
- if found:
- table_name = get_dataset_name(found.group(1), found.group(2))
-
- lineage = KafkaConnectLineage(
- source_dataset=table_name,
- source_platform=source_platform,
- target_dataset=topic,
- target_platform=KAFKA,
- )
- lineages.append(lineage)
- self.connector_manifest.lineages = lineages
-
-
-@dataclass
-class DebeziumSourceConnector:
- connector_manifest: ConnectorManifest
- report: KafkaConnectSourceReport
-
- def __init__(
- self,
- connector_manifest: ConnectorManifest,
- config: KafkaConnectSourceConfig,
- report: KafkaConnectSourceReport,
- ) -> None:
- self.connector_manifest = connector_manifest
- self.config = config
- self.report = report
- self._extract_lineages()
-
- @dataclass
- class DebeziumParser:
- source_platform: str
- server_name: Optional[str]
- database_name: Optional[str]
-
- def get_server_name(self, connector_manifest: ConnectorManifest) -> str:
- if "topic.prefix" in connector_manifest.config:
- return connector_manifest.config["topic.prefix"]
- else:
- return connector_manifest.config.get("database.server.name", "")
-
- def get_parser(
- self,
- connector_manifest: ConnectorManifest,
- ) -> DebeziumParser:
- connector_class = connector_manifest.config.get(CONNECTOR_CLASS, "")
-
- if connector_class == "io.debezium.connector.mysql.MySqlConnector":
- parser = self.DebeziumParser(
- source_platform="mysql",
- server_name=self.get_server_name(connector_manifest),
- database_name=None,
- )
- elif connector_class == "MySqlConnector":
- parser = self.DebeziumParser(
- source_platform="mysql",
- server_name=self.get_server_name(connector_manifest),
- database_name=None,
- )
- elif connector_class == "io.debezium.connector.mongodb.MongoDbConnector":
- parser = self.DebeziumParser(
- source_platform="mongodb",
- server_name=self.get_server_name(connector_manifest),
- database_name=None,
- )
- elif connector_class == "io.debezium.connector.postgresql.PostgresConnector":
- parser = self.DebeziumParser(
- source_platform="postgres",
- server_name=self.get_server_name(connector_manifest),
- database_name=connector_manifest.config.get("database.dbname"),
- )
- elif connector_class == "io.debezium.connector.oracle.OracleConnector":
- parser = self.DebeziumParser(
- source_platform="oracle",
- server_name=self.get_server_name(connector_manifest),
- database_name=connector_manifest.config.get("database.dbname"),
- )
- elif connector_class == "io.debezium.connector.sqlserver.SqlServerConnector":
- database_name = connector_manifest.config.get(
- "database.names"
- ) or connector_manifest.config.get("database.dbname")
-
- if "," in str(database_name):
- raise Exception(
- f"Only one database is supported for Debezium's SQL Server connector. Found: {database_name}"
- )
-
- parser = self.DebeziumParser(
- source_platform="mssql",
- server_name=self.get_server_name(connector_manifest),
- database_name=database_name,
- )
- elif connector_class == "io.debezium.connector.db2.Db2Connector":
- parser = self.DebeziumParser(
- source_platform="db2",
- server_name=self.get_server_name(connector_manifest),
- database_name=connector_manifest.config.get("database.dbname"),
- )
- elif connector_class == "io.debezium.connector.vitess.VitessConnector":
- parser = self.DebeziumParser(
- source_platform="vitess",
- server_name=self.get_server_name(connector_manifest),
- database_name=connector_manifest.config.get("vitess.keyspace"),
- )
- else:
- raise ValueError(f"Connector class '{connector_class}' is unknown.")
-
- return parser
-
- def _extract_lineages(self):
- lineages: List[KafkaConnectLineage] = list()
-
- try:
- parser = self.get_parser(self.connector_manifest)
- source_platform = parser.source_platform
- server_name = parser.server_name
- database_name = parser.database_name
- topic_naming_pattern = rf"({server_name})\.(\w+\.\w+)"
-
- if not self.connector_manifest.topic_names:
- return lineages
-
- for topic in self.connector_manifest.topic_names:
- found = re.search(re.compile(topic_naming_pattern), topic)
-
- if found:
- table_name = get_dataset_name(database_name, found.group(2))
-
- lineage = KafkaConnectLineage(
- source_dataset=table_name,
- source_platform=source_platform,
- target_dataset=topic,
- target_platform=KAFKA,
- )
- lineages.append(lineage)
- self.connector_manifest.lineages = lineages
- except Exception as e:
- self.report.warning(
- "Error resolving lineage for connector",
- self.connector_manifest.name,
- exc=e,
- )
-
- return
-
-
-@dataclass
-class BigQuerySinkConnector:
- connector_manifest: ConnectorManifest
- report: KafkaConnectSourceReport
-
- def __init__(
- self, connector_manifest: ConnectorManifest, report: KafkaConnectSourceReport
- ) -> None:
- self.connector_manifest = connector_manifest
- self.report = report
- self._extract_lineages()
-
- @dataclass
- class BQParser:
- project: str
- target_platform: str
- sanitizeTopics: str
- transforms: list
- topicsToTables: Optional[str] = None
- datasets: Optional[str] = None
- defaultDataset: Optional[str] = None
- version: str = "v1"
-
- def get_parser(
- self,
- connector_manifest: ConnectorManifest,
- ) -> BQParser:
- project = connector_manifest.config["project"]
- sanitizeTopics = connector_manifest.config.get("sanitizeTopics", "false")
- transform_names = (
- self.connector_manifest.config.get("transforms", "").split(",")
- if self.connector_manifest.config.get("transforms")
- else []
- )
- transforms = []
- for name in transform_names:
- transform = {"name": name}
- transforms.append(transform)
- for key in self.connector_manifest.config.keys():
- if key.startswith(f"transforms.{name}."):
- transform[
- key.replace(f"transforms.{name}.", "")
- ] = self.connector_manifest.config[key]
-
- if "defaultDataset" in connector_manifest.config:
- defaultDataset = connector_manifest.config["defaultDataset"]
- return self.BQParser(
- project=project,
- defaultDataset=defaultDataset,
- target_platform="bigquery",
- sanitizeTopics=sanitizeTopics.lower() == "true",
- version="v2",
- transforms=transforms,
- )
- else:
- # version 1.6.x and similar configs supported
- datasets = connector_manifest.config["datasets"]
- topicsToTables = connector_manifest.config.get("topicsToTables")
-
- return self.BQParser(
- project=project,
- topicsToTables=topicsToTables,
- datasets=datasets,
- target_platform="bigquery",
- sanitizeTopics=sanitizeTopics.lower() == "true",
- transforms=transforms,
- )
-
- def get_list(self, property: str) -> Iterable[Tuple[str, str]]:
- entries = property.split(",")
- for entry in entries:
- key, val = entry.rsplit("=")
- yield (key.strip(), val.strip())
-
- def get_dataset_for_topic_v1(self, topic: str, parser: BQParser) -> Optional[str]:
- topicregex_dataset_map: Dict[str, str] = dict(self.get_list(parser.datasets)) # type: ignore
- from java.util.regex import Pattern
-
- for pattern, dataset in topicregex_dataset_map.items():
- patternMatcher = Pattern.compile(pattern).matcher(topic)
- if patternMatcher.matches():
- return dataset
- return None
-
- def sanitize_table_name(self, table_name):
- table_name = re.sub("[^a-zA-Z0-9_]", "_", table_name)
- if re.match("^[^a-zA-Z_].*", table_name):
- table_name = "_" + table_name
-
- return table_name
-
- def get_dataset_table_for_topic(
- self, topic: str, parser: BQParser
- ) -> Optional[str]:
- if parser.version == "v2":
- dataset = parser.defaultDataset
- parts = topic.split(":")
- if len(parts) == 2:
- dataset = parts[0]
- table = parts[1]
- else:
- table = parts[0]
- else:
- dataset = self.get_dataset_for_topic_v1(topic, parser)
- if dataset is None:
- return None
-
- table = topic
- if parser.topicsToTables:
- topicregex_table_map: Dict[str, str] = dict(
- self.get_list(parser.topicsToTables) # type: ignore
- )
- from java.util.regex import Pattern
-
- for pattern, tbl in topicregex_table_map.items():
- patternMatcher = Pattern.compile(pattern).matcher(topic)
- if patternMatcher.matches():
- table = tbl
- break
-
- if parser.sanitizeTopics:
- table = self.sanitize_table_name(table)
- return f"{dataset}.{table}"
-
- def apply_transformations(
- self, topic: str, transforms: List[Dict[str, str]]
- ) -> str:
- for transform in transforms:
- if transform["type"] == "org.apache.kafka.connect.transforms.RegexRouter":
- regex = transform["regex"]
- replacement = transform["replacement"]
- pattern = re.compile(regex)
- if pattern.match(topic):
- topic = pattern.sub(replacement, topic, count=1)
- return topic
-
- def _extract_lineages(self):
- lineages: List[KafkaConnectLineage] = list()
- parser = self.get_parser(self.connector_manifest)
- if not parser:
- return lineages
- target_platform = parser.target_platform
- project = parser.project
- transforms = parser.transforms
- self.connector_manifest.flow_property_bag = self.connector_manifest.config
- # Mask/Remove properties that may reveal credentials
- if "keyfile" in self.connector_manifest.flow_property_bag:
- del self.connector_manifest.flow_property_bag["keyfile"]
-
- for topic in self.connector_manifest.topic_names:
- transformed_topic = self.apply_transformations(topic, transforms)
- dataset_table = self.get_dataset_table_for_topic(transformed_topic, parser)
- if dataset_table is None:
- self.report.warning(
- "Could not find target dataset for topic, please check your connector configuration"
- f"{self.connector_manifest.name} : {transformed_topic} ",
- )
- continue
- target_dataset = f"{project}.{dataset_table}"
-
- lineages.append(
- KafkaConnectLineage(
- source_dataset=transformed_topic,
- source_platform=KAFKA,
- target_dataset=target_dataset,
- target_platform=target_platform,
- )
- )
- self.connector_manifest.lineages = lineages
- return
-
-
-@dataclass
-class SnowflakeSinkConnector:
- connector_manifest: ConnectorManifest
- report: KafkaConnectSourceReport
-
- def __init__(
- self, connector_manifest: ConnectorManifest, report: KafkaConnectSourceReport
- ) -> None:
- self.connector_manifest = connector_manifest
- self.report = report
- self._extract_lineages()
-
- @dataclass
- class SnowflakeParser:
- database_name: str
- schema_name: str
- topics_to_tables: Dict[str, str]
-
- def get_table_name_from_topic_name(self, topic_name: str) -> str:
- """
- This function converts the topic name to a valid Snowflake table name using some rules.
- Refer below link for more info
- https://docs.snowflake.com/en/user-guide/kafka-connector-overview#target-tables-for-kafka-topics
- """
- table_name = re.sub("[^a-zA-Z0-9_]", "_", topic_name)
- if re.match("^[^a-zA-Z_].*", table_name):
- table_name = "_" + table_name
- # Connector may append original topic's hash code as suffix for conflict resolution
- # if generated table names for 2 topics are similar. This corner case is not handled here.
- # Note that Snowflake recommends to choose topic names that follow the rules for
- # Snowflake identifier names so this case is not recommended by snowflake.
- return table_name
-
- def get_parser(
- self,
- connector_manifest: ConnectorManifest,
- ) -> SnowflakeParser:
- database_name = connector_manifest.config["snowflake.database.name"]
- schema_name = connector_manifest.config["snowflake.schema.name"]
-
- # Fetch user provided topic to table map
- provided_topics_to_tables: Dict[str, str] = {}
- if connector_manifest.config.get("snowflake.topic2table.map"):
- for each in connector_manifest.config["snowflake.topic2table.map"].split(
- ","
- ):
- topic, table = each.split(":")
- provided_topics_to_tables[topic.strip()] = table.strip()
-
- topics_to_tables: Dict[str, str] = {}
- # Extract lineage for only those topics whose data ingestion started
- for topic in connector_manifest.topic_names:
- if topic in provided_topics_to_tables:
- # If user provided which table to get mapped with this topic
- topics_to_tables[topic] = provided_topics_to_tables[topic]
- else:
- # Else connector converts topic name to a valid Snowflake table name.
- topics_to_tables[topic] = self.get_table_name_from_topic_name(topic)
-
- return self.SnowflakeParser(
- database_name=database_name,
- schema_name=schema_name,
- topics_to_tables=topics_to_tables,
- )
-
- def _extract_lineages(self):
- self.connector_manifest.flow_property_bag = self.connector_manifest.config
-
- # For all snowflake sink connector properties, refer below link
- # https://docs.snowflake.com/en/user-guide/kafka-connector-install#configuring-the-kafka-connector
- # remove private keys, secrets from properties
- secret_properties = [
- "snowflake.private.key",
- "snowflake.private.key.passphrase",
- "value.converter.basic.auth.user.info",
- ]
- for k in secret_properties:
- if k in self.connector_manifest.flow_property_bag:
- del self.connector_manifest.flow_property_bag[k]
-
- lineages: List[KafkaConnectLineage] = list()
- parser = self.get_parser(self.connector_manifest)
-
- for topic, table in parser.topics_to_tables.items():
- target_dataset = f"{parser.database_name}.{parser.schema_name}.{table}"
- lineages.append(
- KafkaConnectLineage(
- source_dataset=topic,
- source_platform=KAFKA,
- target_dataset=target_dataset,
- target_platform="snowflake",
- )
- )
-
- self.connector_manifest.lineages = lineages
- return
-
-
-@dataclass
-class ConfluentS3SinkConnector:
- connector_manifest: ConnectorManifest
-
- def __init__(
- self, connector_manifest: ConnectorManifest, report: KafkaConnectSourceReport
- ) -> None:
- self.connector_manifest = connector_manifest
- self.report = report
- self._extract_lineages()
-
- @dataclass
- class S3SinkParser:
- target_platform: str
- bucket: str
- topics_dir: str
- topics: Iterable[str]
-
- def _get_parser(self, connector_manifest: ConnectorManifest) -> S3SinkParser:
- # https://docs.confluent.io/kafka-connectors/s3-sink/current/configuration_options.html#s3
- bucket = connector_manifest.config.get("s3.bucket.name")
- if not bucket:
- raise ValueError(
- "Could not find 's3.bucket.name' in connector configuration"
- )
-
- # https://docs.confluent.io/kafka-connectors/s3-sink/current/configuration_options.html#storage
- topics_dir = connector_manifest.config.get("topics.dir", "topics")
-
- return self.S3SinkParser(
- target_platform="s3",
- bucket=bucket,
- topics_dir=topics_dir,
- topics=connector_manifest.topic_names,
- )
-
- def _extract_lineages(self):
- self.connector_manifest.flow_property_bag = self.connector_manifest.config
-
- # remove keys, secrets from properties
- secret_properties = [
- "aws.access.key.id",
- "aws.secret.access.key",
- "s3.sse.customer.key",
- "s3.proxy.password",
- ]
- for k in secret_properties:
- if k in self.connector_manifest.flow_property_bag:
- del self.connector_manifest.flow_property_bag[k]
-
- try:
- parser = self._get_parser(self.connector_manifest)
-
- lineages: List[KafkaConnectLineage] = list()
- for topic in parser.topics:
- target_dataset = f"{parser.bucket}/{parser.topics_dir}/{topic}"
-
- lineages.append(
- KafkaConnectLineage(
- source_dataset=topic,
- source_platform="kafka",
- target_dataset=target_dataset,
- target_platform=parser.target_platform,
- )
- )
- self.connector_manifest.lineages = lineages
- except Exception as e:
- self.report.warning(
- "Error resolving lineage for connector",
- self.connector_manifest.name,
- exc=e,
- )
-
- return
-
-
-def transform_connector_config(
- connector_config: Dict, provided_configs: List[ProvidedConfig]
-) -> None:
- """This method will update provided configs in connector config values, if any"""
- lookupsByProvider = {}
- for pconfig in provided_configs:
- lookupsByProvider[f"${{{pconfig.provider}:{pconfig.path_key}}}"] = pconfig.value
- for k, v in connector_config.items():
- for key, value in lookupsByProvider.items():
- if key in v:
- connector_config[k] = connector_config[k].replace(key, value)
-
-
-@platform_name("Kafka Connect")
-@config_class(KafkaConnectSourceConfig)
-@support_status(SupportStatus.CERTIFIED)
-@capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
-@capability(SourceCapability.SCHEMA_METADATA, "Enabled by default")
-@capability(SourceCapability.LINEAGE_COARSE, "Enabled by default")
-class KafkaConnectSource(StatefulIngestionSourceBase):
- config: KafkaConnectSourceConfig
- report: KafkaConnectSourceReport
- platform: str = "kafka-connect"
-
- def __init__(self, config: KafkaConnectSourceConfig, ctx: PipelineContext):
- super().__init__(config, ctx)
- self.config = config
- self.report = KafkaConnectSourceReport()
- self.session = requests.Session()
- self.session.headers.update(
- {
- "Accept": "application/json",
- "Content-Type": "application/json",
- }
- )
-
- # Test the connection
- if self.config.username is not None and self.config.password is not None:
- logger.info(
- f"Connecting to {self.config.connect_uri} with Authentication..."
- )
- self.session.auth = (self.config.username, self.config.password)
-
- test_response = self.session.get(f"{self.config.connect_uri}/connectors")
- test_response.raise_for_status()
- logger.info(f"Connection to {self.config.connect_uri} is ok")
- if not jpype.isJVMStarted():
- jpype.startJVM()
-
- @classmethod
- def create(cls, config_dict: dict, ctx: PipelineContext) -> Source:
- config = KafkaConnectSourceConfig.parse_obj(config_dict)
- return cls(config, ctx)
-
- def get_connectors_manifest(self) -> List[ConnectorManifest]:
- """Get Kafka Connect connectors manifest using REST API.
- Enrich with lineages metadata.
- """
- connectors_manifest = list()
-
- connector_response = self.session.get(
- f"{self.config.connect_uri}/connectors",
- )
-
- payload = connector_response.json()
-
- for connector_name in payload:
- connector_url = f"{self.config.connect_uri}/connectors/{connector_name}"
- connector_manifest = self._get_connector_manifest(
- connector_name, connector_url
- )
- if (
- connector_manifest is None
- or not self.config.connector_patterns.allowed(connector_manifest.name)
- ):
- self.report.report_dropped(connector_name)
- continue
-
- if self.config.provided_configs:
- transform_connector_config(
- connector_manifest.config, self.config.provided_configs
- )
- # Initialize connector lineages
- connector_manifest.lineages = list()
- connector_manifest.url = connector_url
-
- connector_manifest.topic_names = self._get_connector_topics(connector_name)
-
- # Populate Source Connector metadata
- if connector_manifest.type == SOURCE:
- connector_manifest.tasks = self._get_connector_tasks(connector_name)
-
- # JDBC source connector lineages
- if connector_manifest.config.get(CONNECTOR_CLASS).__eq__(
- "io.confluent.connect.jdbc.JdbcSourceConnector"
- ):
- connector_manifest = ConfluentJDBCSourceConnector(
- connector_manifest=connector_manifest,
- config=self.config,
- report=self.report,
- ).connector_manifest
- elif connector_manifest.config.get(CONNECTOR_CLASS, "").startswith(
- "io.debezium.connector"
- ):
- connector_manifest = DebeziumSourceConnector(
- connector_manifest=connector_manifest,
- config=self.config,
- report=self.report,
- ).connector_manifest
- elif (
- connector_manifest.config.get(CONNECTOR_CLASS, "")
- == "com.mongodb.kafka.connect.MongoSourceConnector"
- ):
- connector_manifest = MongoSourceConnector(
- connector_manifest=connector_manifest, config=self.config
- ).connector_manifest
- else:
- # Find the target connector object in the list, or log an error if unknown.
- target_connector = None
- for connector in self.config.generic_connectors:
- if connector.connector_name == connector_manifest.name:
- target_connector = connector
- break
- if not target_connector:
- logger.warning(
- f"Detected undefined connector {connector_manifest.name}, which is not in the customized connector list. Please refer to Kafka Connect ingestion recipe to define this customized connector."
- )
- continue
-
- for topic in connector_manifest.topic_names:
- lineage = KafkaConnectLineage(
- source_dataset=target_connector.source_dataset,
- source_platform=target_connector.source_platform,
- target_dataset=topic,
- target_platform=KAFKA,
- )
-
- connector_manifest.lineages.append(lineage)
-
- if connector_manifest.type == SINK:
- if connector_manifest.config.get(CONNECTOR_CLASS).__eq__(
- "com.wepay.kafka.connect.bigquery.BigQuerySinkConnector"
- ):
- connector_manifest = BigQuerySinkConnector(
- connector_manifest=connector_manifest, report=self.report
- ).connector_manifest
- elif connector_manifest.config.get("connector.class").__eq__(
- "io.confluent.connect.s3.S3SinkConnector"
- ):
- connector_manifest = ConfluentS3SinkConnector(
- connector_manifest=connector_manifest, report=self.report
- ).connector_manifest
- elif connector_manifest.config.get("connector.class").__eq__(
- "com.snowflake.kafka.connector.SnowflakeSinkConnector"
- ):
- connector_manifest = SnowflakeSinkConnector(
- connector_manifest=connector_manifest, report=self.report
- ).connector_manifest
- else:
- self.report.report_dropped(connector_manifest.name)
- logger.warning(
- f"Skipping connector {connector_manifest.name}. Lineage for Connector not yet implemented"
- )
- pass
-
- connectors_manifest.append(connector_manifest)
-
- return connectors_manifest
-
- def _get_connector_manifest(
- self, connector_name: str, connector_url: str
- ) -> Optional[ConnectorManifest]:
- try:
- connector_response = self.session.get(connector_url)
- connector_response.raise_for_status()
- except Exception as e:
- self.report.warning(
- "Failed to get connector details", connector_name, exc=e
- )
- return None
- manifest = connector_response.json()
- connector_manifest = ConnectorManifest(**manifest)
- return connector_manifest
-
- def _get_connector_tasks(self, connector_name: str) -> dict:
- try:
- response = self.session.get(
- f"{self.config.connect_uri}/connectors/{connector_name}/tasks",
- )
- response.raise_for_status()
- except Exception as e:
- self.report.warning(
- "Error getting connector tasks", context=connector_name, exc=e
- )
- return {}
-
- return response.json()
-
- def _get_connector_topics(self, connector_name: str) -> List[str]:
- try:
- response = self.session.get(
- f"{self.config.connect_uri}/connectors/{connector_name}/topics",
- )
- response.raise_for_status()
- except Exception as e:
- self.report.warning(
- "Error getting connector topics", context=connector_name, exc=e
- )
- return []
-
- return response.json()[connector_name]["topics"]
-
- def construct_flow_workunit(self, connector: ConnectorManifest) -> MetadataWorkUnit:
- connector_name = connector.name
- connector_type = connector.type
- connector_class = connector.config.get(CONNECTOR_CLASS)
- flow_property_bag = connector.flow_property_bag
- # connector_url = connector.url # NOTE: this will expose connector credential when used
- flow_urn = builder.make_data_flow_urn(
- self.platform,
- connector_name,
- self.config.env,
- self.config.platform_instance,
- )
-
- return MetadataChangeProposalWrapper(
- entityUrn=flow_urn,
- aspect=models.DataFlowInfoClass(
- name=connector_name,
- description=f"{connector_type.capitalize()} connector using `{connector_class}` plugin.",
- customProperties=flow_property_bag,
- # externalUrl=connector_url, # NOTE: this will expose connector credential when used
- ),
- ).as_workunit()
-
- def construct_job_workunits(
- self, connector: ConnectorManifest
- ) -> Iterable[MetadataWorkUnit]:
- connector_name = connector.name
- flow_urn = builder.make_data_flow_urn(
- self.platform,
- connector_name,
- self.config.env,
- self.config.platform_instance,
- )
-
- lineages = connector.lineages
- if lineages:
- for lineage in lineages:
- source_dataset = lineage.source_dataset
- source_platform = lineage.source_platform
- target_dataset = lineage.target_dataset
- target_platform = lineage.target_platform
- job_property_bag = lineage.job_property_bag
-
- source_platform_instance = get_platform_instance(
- self.config, connector_name, source_platform
- )
- target_platform_instance = get_platform_instance(
- self.config, connector_name, target_platform
- )
-
- job_id = self.get_job_id(lineage, connector, self.config)
- job_urn = builder.make_data_job_urn_with_flow(flow_urn, job_id)
-
- inlets = (
- [
- self.make_lineage_dataset_urn(
- source_platform, source_dataset, source_platform_instance
- )
- ]
- if source_dataset
- else []
- )
- outlets = [
- self.make_lineage_dataset_urn(
- target_platform, target_dataset, target_platform_instance
- )
- ]
-
- yield MetadataChangeProposalWrapper(
- entityUrn=job_urn,
- aspect=models.DataJobInfoClass(
- name=f"{connector_name}:{job_id}",
- type="COMMAND",
- customProperties=job_property_bag,
- ),
- ).as_workunit()
-
- yield MetadataChangeProposalWrapper(
- entityUrn=job_urn,
- aspect=models.DataJobInputOutputClass(
- inputDatasets=inlets,
- outputDatasets=outlets,
- ),
- ).as_workunit()
-
- def get_job_id(
- self,
- lineage: KafkaConnectLineage,
- connector: ConnectorManifest,
- config: KafkaConnectSourceConfig,
- ) -> str:
- connector_class = connector.config.get(CONNECTOR_CLASS)
-
- # Note - This block is only to maintain backward compatibility of Job URN
- if (
- connector_class
- and connector.type == SOURCE
- and (
- "JdbcSourceConnector" in connector_class
- or connector_class.startswith("io.debezium.connector")
- )
- and lineage.source_dataset
- and config.connect_to_platform_map
- and config.connect_to_platform_map.get(connector.name)
- and config.connect_to_platform_map[connector.name].get(
- lineage.source_platform
- )
- ):
- return f"{config.connect_to_platform_map[connector.name][lineage.source_platform]}.{lineage.source_dataset}"
-
- return (
- lineage.source_dataset
- if lineage.source_dataset
- else f"unknown_source.{lineage.target_dataset}"
- )
-
- def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
- return [
- *super().get_workunit_processors(),
- StaleEntityRemovalHandler.create(
- self, self.config, self.ctx
- ).workunit_processor,
- ]
-
- def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
- connectors_manifest = self.get_connectors_manifest()
- for connector in connectors_manifest:
- name = connector.name
-
- yield self.construct_flow_workunit(connector)
- yield from self.construct_job_workunits(connector)
- self.report.report_connector_scanned(name)
-
- def get_report(self) -> KafkaConnectSourceReport:
- return self.report
-
- def make_lineage_dataset_urn(
- self, platform: str, name: str, platform_instance: Optional[str]
- ) -> str:
- if self.config.convert_lineage_urns_to_lowercase:
- name = name.lower()
-
- return builder.make_dataset_urn_with_platform_instance(
- platform, name, platform_instance, self.config.env
- )
-
-
-# TODO: Find a more automated way to discover new platforms with 3 level naming hierarchy.
-def has_three_level_hierarchy(platform: str) -> bool:
- return platform in ["postgres", "trino", "redshift", "snowflake"]
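The monolithic `kafka/kafka_connect.py` module above is removed; the remainder of this diff splits it into a `kafka_connect` package (`common.py` and the new `kafka_connect.py` are shown below, while `sink_connectors.py` and `source_connectors.py` are visible only through their imports). A sketch of the new import surface, based solely on the module paths and names that appear in the added files:

# Shared config, report, and lineage dataclasses now live in common.py.
from datahub.ingestion.source.kafka_connect.common import (
    ConnectorManifest,
    KafkaConnectLineage,
    KafkaConnectSourceConfig,
)

# The source class itself moves to kafka_connect/kafka_connect.py.
from datahub.ingestion.source.kafka_connect.kafka_connect import KafkaConnectSource

# Connector-specific handlers are grouped by direction.
from datahub.ingestion.source.kafka_connect.sink_connectors import SnowflakeSinkConnector
from datahub.ingestion.source.kafka_connect.source_connectors import (
    ConfluentJDBCSourceConnector,
)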
diff --git a/metadata-ingestion/src/datahub/ingestion/source/kafka_connect/__init__.py b/metadata-ingestion/src/datahub/ingestion/source/kafka_connect/__init__.py
new file mode 100644
index 00000000000000..e69de29bb2d1d6
diff --git a/metadata-ingestion/src/datahub/ingestion/source/kafka_connect/common.py b/metadata-ingestion/src/datahub/ingestion/source/kafka_connect/common.py
new file mode 100644
index 00000000000000..36f6a96c0d4080
--- /dev/null
+++ b/metadata-ingestion/src/datahub/ingestion/source/kafka_connect/common.py
@@ -0,0 +1,202 @@
+import logging
+from dataclasses import dataclass, field
+from typing import Dict, Iterable, List, Optional
+
+from pydantic.fields import Field
+
+from datahub.configuration.common import AllowDenyPattern, ConfigModel
+from datahub.configuration.source_common import (
+ DatasetLineageProviderConfigBase,
+ PlatformInstanceConfigMixin,
+)
+from datahub.ingestion.source.state.stale_entity_removal_handler import (
+ StaleEntityRemovalSourceReport,
+ StatefulStaleMetadataRemovalConfig,
+)
+from datahub.ingestion.source.state.stateful_ingestion_base import (
+ StatefulIngestionConfigBase,
+)
+
+logger = logging.getLogger(__name__)
+
+KAFKA = "kafka"
+SOURCE = "source"
+SINK = "sink"
+CONNECTOR_CLASS = "connector.class"
+
+
+class ProvidedConfig(ConfigModel):
+ provider: str
+ path_key: str
+ value: str
+
+
+class GenericConnectorConfig(ConfigModel):
+ connector_name: str
+ source_dataset: str
+ source_platform: str
+
+
+class KafkaConnectSourceConfig(
+ PlatformInstanceConfigMixin,
+ DatasetLineageProviderConfigBase,
+ StatefulIngestionConfigBase,
+):
+ # See the Connect REST Interface for details
+ # https://docs.confluent.io/platform/current/connect/references/restapi.html#
+ connect_uri: str = Field(
+ default="http://localhost:8083/", description="URI to connect to."
+ )
+ username: Optional[str] = Field(default=None, description="Kafka Connect username.")
+ password: Optional[str] = Field(default=None, description="Kafka Connect password.")
+ cluster_name: Optional[str] = Field(
+ default="connect-cluster", description="Cluster to ingest from."
+ )
+ # convert lineage dataset's urns to lowercase
+ convert_lineage_urns_to_lowercase: bool = Field(
+ default=False,
+        description="Whether to convert the urns of ingested lineage datasets to lowercase.",
+ )
+ connector_patterns: AllowDenyPattern = Field(
+ default=AllowDenyPattern.allow_all(),
+        description="Regex patterns to filter connectors for ingestion.",
+ )
+ provided_configs: Optional[List[ProvidedConfig]] = Field(
+ default=None, description="Provided Configurations"
+ )
+ connect_to_platform_map: Optional[Dict[str, Dict[str, str]]] = Field(
+ default=None,
+        description='Platform instance mapping to use when multiple instances of a platform are available. The entry for a platform should be in either `platform_instance_map` or `connect_to_platform_map`. e.g. `connect_to_platform_map: { "postgres-connector-finance-db": { "postgres": "core_finance_instance" } }`',
+ )
+ platform_instance_map: Optional[Dict[str, str]] = Field(
+ default=None,
+ description='Platform instance mapping to use when constructing URNs. e.g.`platform_instance_map: { "hive": "warehouse" }`',
+ )
+ generic_connectors: List[GenericConnectorConfig] = Field(
+ default=[],
+        description="Provide lineage graph for source connectors other than the Confluent JDBC Source Connector, Debezium Source Connector, and Mongo Source Connector.",
+ )
+
+ stateful_ingestion: Optional[StatefulStaleMetadataRemovalConfig] = None
+
+
+@dataclass
+class KafkaConnectSourceReport(StaleEntityRemovalSourceReport):
+ connectors_scanned: int = 0
+ filtered: List[str] = field(default_factory=list)
+
+ def report_connector_scanned(self, connector: str) -> None:
+ self.connectors_scanned += 1
+
+ def report_dropped(self, connector: str) -> None:
+ self.filtered.append(connector)
+
+
+@dataclass
+class KafkaConnectLineage:
+    """Class to store a Kafka Connect lineage mapping. Each instance is a potential DataJob."""
+
+ source_platform: str
+ target_dataset: str
+ target_platform: str
+ job_property_bag: Optional[Dict[str, str]] = None
+ source_dataset: Optional[str] = None
+
+
+@dataclass
+class ConnectorManifest:
+    """Each instance is a potential DataFlow."""
+
+ name: str
+ type: str
+ config: Dict
+ tasks: Dict
+ url: Optional[str] = None
+ flow_property_bag: Optional[Dict[str, str]] = None
+ lineages: List[KafkaConnectLineage] = field(default_factory=list)
+ topic_names: Iterable[str] = field(default_factory=list)
+
+
+def remove_prefix(text: str, prefix: str) -> str:
+ if text.startswith(prefix):
+ index = len(prefix)
+ return text[index:]
+ return text
+
+
+def unquote(
+ string: str, leading_quote: str = '"', trailing_quote: Optional[str] = None
+) -> str:
+ """
+ If string starts and ends with a quote, unquote it
+ """
+ trailing_quote = trailing_quote if trailing_quote else leading_quote
+ if string.startswith(leading_quote) and string.endswith(trailing_quote):
+ string = string[1:-1]
+ return string
+
+
+def get_dataset_name(
+ database_name: Optional[str],
+ source_table: str,
+) -> str:
+ if database_name:
+ dataset_name = database_name + "." + source_table
+ else:
+ dataset_name = source_table
+
+ return dataset_name
+
+
+def get_platform_instance(
+ config: KafkaConnectSourceConfig, connector_name: str, platform: str
+) -> Optional[str]:
+ instance_name = None
+ if (
+ config.connect_to_platform_map
+ and config.connect_to_platform_map.get(connector_name)
+ and config.connect_to_platform_map[connector_name].get(platform)
+ ):
+ instance_name = config.connect_to_platform_map[connector_name][platform]
+ if config.platform_instance_map and config.platform_instance_map.get(platform):
+ logger.warning(
+            f"Same source platform {platform} configured in both platform_instance_map and connect_to_platform_map. "
+            "Will prefer the connector-specific platform instance from connect_to_platform_map."
+ )
+ elif config.platform_instance_map and config.platform_instance_map.get(platform):
+ instance_name = config.platform_instance_map[platform]
+ logger.info(
+        f"Instance name assigned is: {instance_name} for connector {connector_name} and platform {platform}"
+ )
+ return instance_name
+
+
+def transform_connector_config(
+ connector_config: Dict, provided_configs: List[ProvidedConfig]
+) -> None:
+    """Update connector config values with the provided configs, if any."""
+ lookupsByProvider = {}
+ for pconfig in provided_configs:
+ lookupsByProvider[f"${{{pconfig.provider}:{pconfig.path_key}}}"] = pconfig.value
+ for k, v in connector_config.items():
+ for key, value in lookupsByProvider.items():
+ if key in v:
+ connector_config[k] = connector_config[k].replace(key, value)
+
+
+# TODO: Find a more automated way to discover new platforms with 3 level naming hierarchy.
+def has_three_level_hierarchy(platform: str) -> bool:
+ return platform in ["postgres", "trino", "redshift", "snowflake"]
+
+
+@dataclass
+class BaseConnector:
+ connector_manifest: ConnectorManifest
+ config: KafkaConnectSourceConfig
+ report: KafkaConnectSourceReport
+
+ def extract_lineages(self) -> List[KafkaConnectLineage]:
+ return []
+
+ def extract_flow_property_bag(self) -> Optional[Dict[str, str]]:
+ return None
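`BaseConnector` gives every connector handler the same three fields and two overridable hooks. Below is a hypothetical handler built on top of it to illustrate the contract; the class name, the hard-coded platform, and the example dataset are assumptions for this sketch, not part of the diff:

from typing import Dict, List, Optional

from datahub.ingestion.source.kafka_connect.common import (
    KAFKA,
    BaseConnector,
    KafkaConnectLineage,
)


class StaticTopicConnector(BaseConnector):
    """Hypothetical handler that maps every topic to one fixed upstream table."""

    def extract_lineages(self) -> List[KafkaConnectLineage]:
        # One lineage edge per topic reported by the Connect REST API.
        return [
            KafkaConnectLineage(
                source_dataset="demo_db.public.orders",  # assumed upstream table
                source_platform="postgres",
                target_dataset=topic,
                target_platform=KAFKA,
            )
            for topic in self.connector_manifest.topic_names
        ]

    def extract_flow_property_bag(self) -> Optional[Dict[str, str]]:
        # Strip anything that looks like a credential before exposing the config.
        return {
            k: v
            for k, v in self.connector_manifest.config.items()
            if "password" not in k.lower()
        }

Because `BaseConnector` is a dataclass, the generated constructor matches how the new `KafkaConnectSource` instantiates handlers: `StaticTopicConnector(connector_manifest, config, report)`.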
diff --git a/metadata-ingestion/src/datahub/ingestion/source/kafka_connect/kafka_connect.py b/metadata-ingestion/src/datahub/ingestion/source/kafka_connect/kafka_connect.py
new file mode 100644
index 00000000000000..fa6b614c4b52a6
--- /dev/null
+++ b/metadata-ingestion/src/datahub/ingestion/source/kafka_connect/kafka_connect.py
@@ -0,0 +1,367 @@
+import logging
+from typing import Iterable, List, Optional, Type
+
+import jpype
+import jpype.imports
+import requests
+
+import datahub.emitter.mce_builder as builder
+import datahub.metadata.schema_classes as models
+from datahub.emitter.mcp import MetadataChangeProposalWrapper
+from datahub.ingestion.api.common import PipelineContext
+from datahub.ingestion.api.decorators import (
+ SourceCapability,
+ SupportStatus,
+ capability,
+ config_class,
+ platform_name,
+ support_status,
+)
+from datahub.ingestion.api.source import MetadataWorkUnitProcessor, Source
+from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.ingestion.source.kafka_connect.common import (
+ CONNECTOR_CLASS,
+ SINK,
+ SOURCE,
+ BaseConnector,
+ ConnectorManifest,
+ KafkaConnectLineage,
+ KafkaConnectSourceConfig,
+ KafkaConnectSourceReport,
+ get_platform_instance,
+ transform_connector_config,
+)
+from datahub.ingestion.source.kafka_connect.sink_connectors import (
+ BIGQUERY_SINK_CONNECTOR_CLASS,
+ S3_SINK_CONNECTOR_CLASS,
+ SNOWFLAKE_SINK_CONNECTOR_CLASS,
+ BigQuerySinkConnector,
+ ConfluentS3SinkConnector,
+ SnowflakeSinkConnector,
+)
+from datahub.ingestion.source.kafka_connect.source_connectors import (
+ DEBEZIUM_SOURCE_CONNECTOR_PREFIX,
+ JDBC_SOURCE_CONNECTOR_CLASS,
+ MONGO_SOURCE_CONNECTOR_CLASS,
+ ConfigDrivenSourceConnector,
+ ConfluentJDBCSourceConnector,
+ DebeziumSourceConnector,
+ MongoSourceConnector,
+)
+from datahub.ingestion.source.state.stale_entity_removal_handler import (
+ StaleEntityRemovalHandler,
+)
+from datahub.ingestion.source.state.stateful_ingestion_base import (
+ StatefulIngestionSourceBase,
+)
+
+logger = logging.getLogger(__name__)
+
+
+@platform_name("Kafka Connect")
+@config_class(KafkaConnectSourceConfig)
+@support_status(SupportStatus.CERTIFIED)
+@capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
+@capability(SourceCapability.SCHEMA_METADATA, "Enabled by default")
+@capability(SourceCapability.LINEAGE_COARSE, "Enabled by default")
+class KafkaConnectSource(StatefulIngestionSourceBase):
+ config: KafkaConnectSourceConfig
+ report: KafkaConnectSourceReport
+ platform: str = "kafka-connect"
+
+ def __init__(self, config: KafkaConnectSourceConfig, ctx: PipelineContext):
+ super().__init__(config, ctx)
+ self.config = config
+ self.report = KafkaConnectSourceReport()
+ self.session = requests.Session()
+ self.session.headers.update(
+ {
+ "Accept": "application/json",
+ "Content-Type": "application/json",
+ }
+ )
+
+ # Test the connection
+ if self.config.username is not None and self.config.password is not None:
+ logger.info(
+ f"Connecting to {self.config.connect_uri} with Authentication..."
+ )
+ self.session.auth = (self.config.username, self.config.password)
+
+ test_response = self.session.get(f"{self.config.connect_uri}/connectors")
+ test_response.raise_for_status()
+ logger.info(f"Connection to {self.config.connect_uri} is ok")
+ if not jpype.isJVMStarted():
+ jpype.startJVM()
+
+ @classmethod
+ def create(cls, config_dict: dict, ctx: PipelineContext) -> Source:
+ config = KafkaConnectSourceConfig.parse_obj(config_dict)
+ return cls(config, ctx)
+
+ def get_connectors_manifest(self) -> Iterable[ConnectorManifest]:
+        """Get Kafka Connect connector manifests using the REST API.
+        Enrich them with lineage metadata.
+ """
+
+ connector_response = self.session.get(
+ f"{self.config.connect_uri}/connectors",
+ )
+
+ payload = connector_response.json()
+
+ for connector_name in payload:
+ connector_url = f"{self.config.connect_uri}/connectors/{connector_name}"
+ connector_manifest = self._get_connector_manifest(
+ connector_name, connector_url
+ )
+ if (
+ connector_manifest is None
+ or not self.config.connector_patterns.allowed(connector_manifest.name)
+ ):
+ self.report.report_dropped(connector_name)
+ continue
+
+ if self.config.provided_configs:
+ transform_connector_config(
+ connector_manifest.config, self.config.provided_configs
+ )
+ connector_manifest.url = connector_url
+ connector_manifest.topic_names = self._get_connector_topics(connector_name)
+ connector_class_value = connector_manifest.config.get(CONNECTOR_CLASS) or ""
+
+ class_type: Type[BaseConnector] = BaseConnector
+
+ # Populate Source Connector metadata
+ if connector_manifest.type == SOURCE:
+ connector_manifest.tasks = self._get_connector_tasks(connector_name)
+
+ # JDBC source connector lineages
+ if connector_class_value == JDBC_SOURCE_CONNECTOR_CLASS:
+ class_type = ConfluentJDBCSourceConnector
+ elif connector_class_value.startswith(DEBEZIUM_SOURCE_CONNECTOR_PREFIX):
+ class_type = DebeziumSourceConnector
+ elif connector_class_value == MONGO_SOURCE_CONNECTOR_CLASS:
+ class_type = MongoSourceConnector
+ elif any(
+ [
+ connector.connector_name == connector_manifest.name
+ for connector in self.config.generic_connectors
+ ]
+ ):
+ class_type = ConfigDrivenSourceConnector
+ else:
+ self.report.report_dropped(connector_manifest.name)
+ self.report.warning(
+ "Lineage for Source Connector not supported. "
+ "Please refer to Kafka Connect docs to use `generic_connectors` config.",
+ context=f"{connector_manifest.name} of type {connector_class_value}",
+ )
+ continue
+ elif connector_manifest.type == SINK:
+ if connector_class_value == BIGQUERY_SINK_CONNECTOR_CLASS:
+ class_type = BigQuerySinkConnector
+ elif connector_class_value == S3_SINK_CONNECTOR_CLASS:
+ class_type = ConfluentS3SinkConnector
+ elif connector_class_value == SNOWFLAKE_SINK_CONNECTOR_CLASS:
+ class_type = SnowflakeSinkConnector
+ else:
+ self.report.report_dropped(connector_manifest.name)
+ self.report.warning(
+ "Lineage for Sink Connector not supported.",
+ context=f"{connector_manifest.name} of type {connector_class_value}",
+ )
+
+ connector_class = class_type(connector_manifest, self.config, self.report)
+ connector_manifest.lineages = connector_class.extract_lineages()
+ connector_manifest.flow_property_bag = (
+ connector_class.extract_flow_property_bag()
+ )
+
+ yield connector_manifest
+
+ def _get_connector_manifest(
+ self, connector_name: str, connector_url: str
+ ) -> Optional[ConnectorManifest]:
+ try:
+ connector_response = self.session.get(connector_url)
+ connector_response.raise_for_status()
+ except Exception as e:
+ self.report.warning(
+ "Failed to get connector details", connector_name, exc=e
+ )
+ return None
+ manifest = connector_response.json()
+ connector_manifest = ConnectorManifest(**manifest)
+ return connector_manifest
+
+ def _get_connector_tasks(self, connector_name: str) -> dict:
+ try:
+ response = self.session.get(
+ f"{self.config.connect_uri}/connectors/{connector_name}/tasks",
+ )
+ response.raise_for_status()
+ except Exception as e:
+ self.report.warning(
+ "Error getting connector tasks", context=connector_name, exc=e
+ )
+ return {}
+
+ return response.json()
+
+ def _get_connector_topics(self, connector_name: str) -> List[str]:
+ try:
+ response = self.session.get(
+ f"{self.config.connect_uri}/connectors/{connector_name}/topics",
+ )
+ response.raise_for_status()
+ except Exception as e:
+ self.report.warning(
+ "Error getting connector topics", context=connector_name, exc=e
+ )
+ return []
+
+ return response.json()[connector_name]["topics"]
+
+ def construct_flow_workunit(self, connector: ConnectorManifest) -> MetadataWorkUnit:
+ connector_name = connector.name
+ connector_type = connector.type
+ connector_class = connector.config.get(CONNECTOR_CLASS)
+ flow_property_bag = connector.flow_property_bag
+ # connector_url = connector.url # NOTE: this will expose connector credential when used
+ flow_urn = builder.make_data_flow_urn(
+ self.platform,
+ connector_name,
+ self.config.env,
+ self.config.platform_instance,
+ )
+
+ return MetadataChangeProposalWrapper(
+ entityUrn=flow_urn,
+ aspect=models.DataFlowInfoClass(
+ name=connector_name,
+ description=f"{connector_type.capitalize()} connector using `{connector_class}` plugin.",
+ customProperties=flow_property_bag,
+ # externalUrl=connector_url, # NOTE: this will expose connector credential when used
+ ),
+ ).as_workunit()
+
+ def construct_job_workunits(
+ self, connector: ConnectorManifest
+ ) -> Iterable[MetadataWorkUnit]:
+ connector_name = connector.name
+ flow_urn = builder.make_data_flow_urn(
+ self.platform,
+ connector_name,
+ self.config.env,
+ self.config.platform_instance,
+ )
+
+ lineages = connector.lineages
+ if lineages:
+ for lineage in lineages:
+ source_dataset = lineage.source_dataset
+ source_platform = lineage.source_platform
+ target_dataset = lineage.target_dataset
+ target_platform = lineage.target_platform
+ job_property_bag = lineage.job_property_bag
+
+ source_platform_instance = get_platform_instance(
+ self.config, connector_name, source_platform
+ )
+ target_platform_instance = get_platform_instance(
+ self.config, connector_name, target_platform
+ )
+
+ job_id = self.get_job_id(lineage, connector, self.config)
+ job_urn = builder.make_data_job_urn_with_flow(flow_urn, job_id)
+
+ inlets = (
+ [
+ self.make_lineage_dataset_urn(
+ source_platform, source_dataset, source_platform_instance
+ )
+ ]
+ if source_dataset
+ else []
+ )
+ outlets = [
+ self.make_lineage_dataset_urn(
+ target_platform, target_dataset, target_platform_instance
+ )
+ ]
+
+ yield MetadataChangeProposalWrapper(
+ entityUrn=job_urn,
+ aspect=models.DataJobInfoClass(
+ name=f"{connector_name}:{job_id}",
+ type="COMMAND",
+ customProperties=job_property_bag,
+ ),
+ ).as_workunit()
+
+ yield MetadataChangeProposalWrapper(
+ entityUrn=job_urn,
+ aspect=models.DataJobInputOutputClass(
+ inputDatasets=inlets,
+ outputDatasets=outlets,
+ ),
+ ).as_workunit()
+
+ def get_job_id(
+ self,
+ lineage: KafkaConnectLineage,
+ connector: ConnectorManifest,
+ config: KafkaConnectSourceConfig,
+ ) -> str:
+ connector_class = connector.config.get(CONNECTOR_CLASS)
+
+        # Note: this block exists only to maintain backward compatibility of the job URN
+ if (
+ connector_class
+ and connector.type == SOURCE
+ and (
+ "JdbcSourceConnector" in connector_class
+ or connector_class.startswith("io.debezium.connector")
+ )
+ and lineage.source_dataset
+ and config.connect_to_platform_map
+ and config.connect_to_platform_map.get(connector.name)
+ and config.connect_to_platform_map[connector.name].get(
+ lineage.source_platform
+ )
+ ):
+ return f"{config.connect_to_platform_map[connector.name][lineage.source_platform]}.{lineage.source_dataset}"
+
+ return (
+ lineage.source_dataset
+ if lineage.source_dataset
+ else f"unknown_source.{lineage.target_dataset}"
+ )
+
+ def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
+ return [
+ *super().get_workunit_processors(),
+ StaleEntityRemovalHandler.create(
+ self, self.config, self.ctx
+ ).workunit_processor,
+ ]
+
+ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
+ for connector in self.get_connectors_manifest():
+ yield self.construct_flow_workunit(connector)
+ yield from self.construct_job_workunits(connector)
+ self.report.report_connector_scanned(connector.name)
+
+ def get_report(self) -> KafkaConnectSourceReport:
+ return self.report
+
+ def make_lineage_dataset_urn(
+ self, platform: str, name: str, platform_instance: Optional[str]
+ ) -> str:
+ if self.config.convert_lineage_urns_to_lowercase:
+ name = name.lower()
+
+ return builder.make_dataset_urn_with_platform_instance(
+ platform, name, platform_instance, self.config.env
+ )
diff --git a/metadata-ingestion/src/datahub/ingestion/source/kafka_connect/sink_connectors.py b/metadata-ingestion/src/datahub/ingestion/source/kafka_connect/sink_connectors.py
new file mode 100644
index 00000000000000..2790460c8e6019
--- /dev/null
+++ b/metadata-ingestion/src/datahub/ingestion/source/kafka_connect/sink_connectors.py
@@ -0,0 +1,341 @@
+import re
+from dataclasses import dataclass
+from typing import Dict, Iterable, List, Optional, Tuple
+
+from datahub.ingestion.source.kafka_connect.common import (
+ KAFKA,
+ BaseConnector,
+ ConnectorManifest,
+ KafkaConnectLineage,
+)
+
+
+@dataclass
+class ConfluentS3SinkConnector(BaseConnector):
+ @dataclass
+ class S3SinkParser:
+ target_platform: str
+ bucket: str
+ topics_dir: str
+ topics: Iterable[str]
+
+ def _get_parser(self, connector_manifest: ConnectorManifest) -> S3SinkParser:
+ # https://docs.confluent.io/kafka-connectors/s3-sink/current/configuration_options.html#s3
+ bucket = connector_manifest.config.get("s3.bucket.name")
+ if not bucket:
+ raise ValueError(
+ "Could not find 's3.bucket.name' in connector configuration"
+ )
+
+ # https://docs.confluent.io/kafka-connectors/s3-sink/current/configuration_options.html#storage
+ topics_dir = connector_manifest.config.get("topics.dir", "topics")
+
+ return self.S3SinkParser(
+ target_platform="s3",
+ bucket=bucket,
+ topics_dir=topics_dir,
+ topics=connector_manifest.topic_names,
+ )
+
+ def extract_flow_property_bag(self) -> Dict[str, str]:
+ # Mask/Remove properties that may reveal credentials
+ flow_property_bag = {
+ k: v
+ for k, v in self.connector_manifest.config.items()
+ if k
+ not in [
+ "aws.access.key.id",
+ "aws.secret.access.key",
+ "s3.sse.customer.key",
+ "s3.proxy.password",
+ ]
+ }
+ return flow_property_bag
+
+ def extract_lineages(self) -> List[KafkaConnectLineage]:
+ try:
+ parser = self._get_parser(self.connector_manifest)
+
+ lineages: List[KafkaConnectLineage] = list()
+ for topic in parser.topics:
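+                # e.g. bucket "my-bucket", topics.dir "topics", topic "orders" -> "my-bucket/topics/orders"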
+ target_dataset = f"{parser.bucket}/{parser.topics_dir}/{topic}"
+
+ lineages.append(
+ KafkaConnectLineage(
+ source_dataset=topic,
+ source_platform="kafka",
+ target_dataset=target_dataset,
+ target_platform=parser.target_platform,
+ )
+ )
+ return lineages
+ except Exception as e:
+ self.report.warning(
+ "Error resolving lineage for connector",
+ self.connector_manifest.name,
+ exc=e,
+ )
+
+ return []
+
+
+@dataclass
+class SnowflakeSinkConnector(BaseConnector):
+ @dataclass
+ class SnowflakeParser:
+ database_name: str
+ schema_name: str
+ topics_to_tables: Dict[str, str]
+
+ def get_table_name_from_topic_name(self, topic_name: str) -> str:
+ """
+        Convert a topic name to a valid Snowflake table name, following the rules described at:
+ https://docs.snowflake.com/en/user-guide/kafka-connector-overview#target-tables-for-kafka-topics
+ """
+ table_name = re.sub("[^a-zA-Z0-9_]", "_", topic_name)
+ if re.match("^[^a-zA-Z_].*", table_name):
+ table_name = "_" + table_name
+        # The connector may append the original topic's hash code as a suffix when two topics
+        # map to the same generated table name; that corner case is not handled here.
+        # Snowflake recommends topic names that already follow its identifier rules, so this
+        # should rarely occur in practice.
+ return table_name
+
+ def get_parser(
+ self,
+ connector_manifest: ConnectorManifest,
+ ) -> SnowflakeParser:
+ database_name = connector_manifest.config["snowflake.database.name"]
+ schema_name = connector_manifest.config["snowflake.schema.name"]
+
+        # Fetch the user-provided topic-to-table map
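+        # e.g. "snowflake.topic2table.map": "topic1:table1,topic2:table2"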
+ provided_topics_to_tables: Dict[str, str] = {}
+ if connector_manifest.config.get("snowflake.topic2table.map"):
+ for each in connector_manifest.config["snowflake.topic2table.map"].split(
+ ","
+ ):
+ topic, table = each.split(":")
+ provided_topics_to_tables[topic.strip()] = table.strip()
+
+ topics_to_tables: Dict[str, str] = {}
+        # Extract lineage only for topics that the connector has started ingesting
+ for topic in connector_manifest.topic_names:
+ if topic in provided_topics_to_tables:
+                # Use the user-provided table mapping for this topic
+ topics_to_tables[topic] = provided_topics_to_tables[topic]
+ else:
+                # Otherwise the connector derives a valid Snowflake table name from the topic name.
+ topics_to_tables[topic] = self.get_table_name_from_topic_name(topic)
+
+ return self.SnowflakeParser(
+ database_name=database_name,
+ schema_name=schema_name,
+ topics_to_tables=topics_to_tables,
+ )
+
+ def extract_flow_property_bag(self) -> Dict[str, str]:
+ # For all snowflake sink connector properties, refer below link
+ # https://docs.snowflake.com/en/user-guide/kafka-connector-install#configuring-the-kafka-connector
+ # remove private keys, secrets from properties
+ flow_property_bag = {
+ k: v
+ for k, v in self.connector_manifest.config.items()
+ if k
+ not in [
+ "snowflake.private.key",
+ "snowflake.private.key.passphrase",
+ "value.converter.basic.auth.user.info",
+ ]
+ }
+
+ return flow_property_bag
+
+ def extract_lineages(self) -> List[KafkaConnectLineage]:
+ lineages: List[KafkaConnectLineage] = list()
+ parser = self.get_parser(self.connector_manifest)
+
+ for topic, table in parser.topics_to_tables.items():
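+            # Fully qualified Snowflake target table, i.e. "<database>.<schema>.<table>"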
+ target_dataset = f"{parser.database_name}.{parser.schema_name}.{table}"
+ lineages.append(
+ KafkaConnectLineage(
+ source_dataset=topic,
+ source_platform=KAFKA,
+ target_dataset=target_dataset,
+ target_platform="snowflake",
+ )
+ )
+
+ return lineages
+
+
+@dataclass
+class BigQuerySinkConnector(BaseConnector):
+ @dataclass
+ class BQParser:
+ project: str
+ target_platform: str
+        sanitizeTopics: bool
+ transforms: list
+ topicsToTables: Optional[str] = None
+ datasets: Optional[str] = None
+ defaultDataset: Optional[str] = None
+ version: str = "v1"
+
+ def get_parser(
+ self,
+ connector_manifest: ConnectorManifest,
+ ) -> BQParser:
+ project = connector_manifest.config["project"]
+ sanitizeTopics = connector_manifest.config.get("sanitizeTopics", "false")
+ transform_names = (
+ self.connector_manifest.config.get("transforms", "").split(",")
+ if self.connector_manifest.config.get("transforms")
+ else []
+ )
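+        # Kafka Connect configures each transform as "transforms=<name>,..." plus per-transform
+        # keys such as "transforms.<name>.type"; collect those keys into a dict per transform.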
+ transforms = []
+ for name in transform_names:
+ transform = {"name": name}
+ transforms.append(transform)
+ for key in self.connector_manifest.config.keys():
+ if key.startswith(f"transforms.{name}."):
+ transform[
+ key.replace(f"transforms.{name}.", "")
+ ] = self.connector_manifest.config[key]
+
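+        # Configs that set "defaultDataset" correspond to the newer (v2) BigQuery sink connector.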
+ if "defaultDataset" in connector_manifest.config:
+ defaultDataset = connector_manifest.config["defaultDataset"]
+ return self.BQParser(
+ project=project,
+ defaultDataset=defaultDataset,
+ target_platform="bigquery",
+ sanitizeTopics=sanitizeTopics.lower() == "true",
+ version="v2",
+ transforms=transforms,
+ )
+ else:
+            # Older (v1, 1.6.x-style) configs use "datasets" and optionally "topicsToTables"
+ datasets = connector_manifest.config["datasets"]
+ topicsToTables = connector_manifest.config.get("topicsToTables")
+
+ return self.BQParser(
+ project=project,
+ topicsToTables=topicsToTables,
+ datasets=datasets,
+ target_platform="bigquery",
+ sanitizeTopics=sanitizeTopics.lower() == "true",
+ transforms=transforms,
+ )
+
+ def get_list(self, property: str) -> Iterable[Tuple[str, str]]:
+ entries = property.split(",")
+ for entry in entries:
+ key, val = entry.rsplit("=")
+ yield (key.strip(), val.strip())
+
+ def get_dataset_for_topic_v1(self, topic: str, parser: BQParser) -> Optional[str]:
+ topicregex_dataset_map: Dict[str, str] = dict(self.get_list(parser.datasets)) # type: ignore
+ from java.util.regex import Pattern
+
+ for pattern, dataset in topicregex_dataset_map.items():
+ patternMatcher = Pattern.compile(pattern).matcher(topic)
+ if patternMatcher.matches():
+ return dataset
+ return None
+
+ def sanitize_table_name(self, table_name):
+ table_name = re.sub("[^a-zA-Z0-9_]", "_", table_name)
+ if re.match("^[^a-zA-Z_].*", table_name):
+ table_name = "_" + table_name
+
+ return table_name
+
+ def get_dataset_table_for_topic(
+ self, topic: str, parser: BQParser
+ ) -> Optional[str]:
+ if parser.version == "v2":
+ dataset = parser.defaultDataset
+ parts = topic.split(":")
+ if len(parts) == 2:
+ dataset = parts[0]
+ table = parts[1]
+ else:
+ table = parts[0]
+ else:
+ dataset = self.get_dataset_for_topic_v1(topic, parser)
+ if dataset is None:
+ return None
+
+ table = topic
+ if parser.topicsToTables:
+ topicregex_table_map: Dict[str, str] = dict(
+ self.get_list(parser.topicsToTables) # type: ignore
+ )
+ from java.util.regex import Pattern
+
+ for pattern, tbl in topicregex_table_map.items():
+ patternMatcher = Pattern.compile(pattern).matcher(topic)
+ if patternMatcher.matches():
+ table = tbl
+ break
+
+ if parser.sanitizeTopics:
+ table = self.sanitize_table_name(table)
+ return f"{dataset}.{table}"
+
+ def apply_transformations(
+ self, topic: str, transforms: List[Dict[str, str]]
+ ) -> str:
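+        # Only RegexRouter transforms rewrite topic names here; other transform types are ignored.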
+ for transform in transforms:
+ if transform["type"] == "org.apache.kafka.connect.transforms.RegexRouter":
+ regex = transform["regex"]
+ replacement = transform["replacement"]
+ pattern = re.compile(regex)
+ if pattern.match(topic):
+ topic = pattern.sub(replacement, topic, count=1)
+ return topic
+
+ def extract_flow_property_bag(self) -> Dict[str, str]:
+ # Mask/Remove properties that may reveal credentials
+ flow_property_bag = {
+ k: v
+ for k, v in self.connector_manifest.config.items()
+ if k not in ["keyfile"]
+ }
+
+ return flow_property_bag
+
+ def extract_lineages(self) -> List[KafkaConnectLineage]:
+ lineages: List[KafkaConnectLineage] = list()
+ parser = self.get_parser(self.connector_manifest)
+ if not parser:
+ return lineages
+ target_platform = parser.target_platform
+ project = parser.project
+ transforms = parser.transforms
+
+ for topic in self.connector_manifest.topic_names:
+ transformed_topic = self.apply_transformations(topic, transforms)
+ dataset_table = self.get_dataset_table_for_topic(transformed_topic, parser)
+ if dataset_table is None:
+ self.report.warning(
+ "Could not find target dataset for topic, please check your connector configuration"
+ f"{self.connector_manifest.name} : {transformed_topic} ",
+ )
+ continue
+ target_dataset = f"{project}.{dataset_table}"
+
+ lineages.append(
+ KafkaConnectLineage(
+ source_dataset=transformed_topic,
+ source_platform=KAFKA,
+ target_dataset=target_dataset,
+ target_platform=target_platform,
+ )
+ )
+ return lineages
+
+
+BIGQUERY_SINK_CONNECTOR_CLASS = "com.wepay.kafka.connect.bigquery.BigQuerySinkConnector"
+S3_SINK_CONNECTOR_CLASS = "io.confluent.connect.s3.S3SinkConnector"
+SNOWFLAKE_SINK_CONNECTOR_CLASS = "com.snowflake.kafka.connector.SnowflakeSinkConnector"
diff --git a/metadata-ingestion/src/datahub/ingestion/source/kafka_connect/source_connectors.py b/metadata-ingestion/src/datahub/ingestion/source/kafka_connect/source_connectors.py
new file mode 100644
index 00000000000000..7b3b6e551a0a1f
--- /dev/null
+++ b/metadata-ingestion/src/datahub/ingestion/source/kafka_connect/source_connectors.py
@@ -0,0 +1,570 @@
+import logging
+import re
+from dataclasses import dataclass
+from typing import Dict, Iterable, List, Optional, Tuple
+
+from sqlalchemy.engine.url import make_url
+
+from datahub.ingestion.source.kafka_connect.common import (
+ CONNECTOR_CLASS,
+ KAFKA,
+ BaseConnector,
+ ConnectorManifest,
+ KafkaConnectLineage,
+ get_dataset_name,
+ has_three_level_hierarchy,
+ remove_prefix,
+ unquote,
+)
+from datahub.ingestion.source.sql.sqlalchemy_uri_mapper import (
+ get_platform_from_sqlalchemy_uri,
+)
+
+
+@dataclass
+class ConfluentJDBCSourceConnector(BaseConnector):
+ REGEXROUTER = "org.apache.kafka.connect.transforms.RegexRouter"
+ KNOWN_TOPICROUTING_TRANSFORMS = [REGEXROUTER]
+ # https://kafka.apache.org/documentation/#connect_included_transformation
+ KAFKA_NONTOPICROUTING_TRANSFORMS = [
+ "InsertField",
+ "InsertField$Key",
+ "InsertField$Value",
+ "ReplaceField",
+ "ReplaceField$Key",
+ "ReplaceField$Value",
+ "MaskField",
+ "MaskField$Key",
+ "MaskField$Value",
+ "ValueToKey",
+ "ValueToKey$Key",
+ "ValueToKey$Value",
+ "HoistField",
+ "HoistField$Key",
+ "HoistField$Value",
+ "ExtractField",
+ "ExtractField$Key",
+ "ExtractField$Value",
+ "SetSchemaMetadata",
+ "SetSchemaMetadata$Key",
+ "SetSchemaMetadata$Value",
+ "Flatten",
+ "Flatten$Key",
+ "Flatten$Value",
+ "Cast",
+ "Cast$Key",
+ "Cast$Value",
+ "HeadersFrom",
+ "HeadersFrom$Key",
+ "HeadersFrom$Value",
+ "TimestampConverter",
+ "Filter",
+ "InsertHeader",
+ "DropHeaders",
+ ]
+ # https://docs.confluent.io/platform/current/connect/transforms/overview.html
+ CONFLUENT_NONTOPICROUTING_TRANSFORMS = [
+ "Drop",
+ "Drop$Key",
+ "Drop$Value",
+ "Filter",
+ "Filter$Key",
+ "Filter$Value",
+ "TombstoneHandler",
+ ]
+ KNOWN_NONTOPICROUTING_TRANSFORMS = (
+ KAFKA_NONTOPICROUTING_TRANSFORMS
+ + [
+ f"org.apache.kafka.connect.transforms.{t}"
+ for t in KAFKA_NONTOPICROUTING_TRANSFORMS
+ ]
+ + CONFLUENT_NONTOPICROUTING_TRANSFORMS
+ + [
+ f"io.confluent.connect.transforms.{t}"
+ for t in CONFLUENT_NONTOPICROUTING_TRANSFORMS
+ ]
+ )
+
+ @dataclass
+ class JdbcParser:
+ db_connection_url: str
+ source_platform: str
+ database_name: str
+ topic_prefix: str
+ query: str
+ transforms: list
+
+ def get_parser(
+ self,
+ connector_manifest: ConnectorManifest,
+ ) -> JdbcParser:
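+        # Strip the "jdbc:" prefix and parse the remainder as a SQLAlchemy URL, e.g.
+        # "jdbc:postgresql://host:5432/mydb" yields the platform from "postgresql" and database "mydb".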
+ url = remove_prefix(
+ str(connector_manifest.config.get("connection.url")), "jdbc:"
+ )
+ url_instance = make_url(url)
+ source_platform = get_platform_from_sqlalchemy_uri(str(url_instance))
+ database_name = url_instance.database
+ assert database_name
+ db_connection_url = f"{url_instance.drivername}://{url_instance.host}:{url_instance.port}/{database_name}"
+
+ topic_prefix = self.connector_manifest.config.get("topic.prefix", None)
+
+ query = self.connector_manifest.config.get("query", None)
+
+ transform_names = (
+ self.connector_manifest.config.get("transforms", "").split(",")
+ if self.connector_manifest.config.get("transforms")
+ else []
+ )
+
+ transforms = []
+ for name in transform_names:
+ transform = {"name": name}
+ transforms.append(transform)
+ for key in self.connector_manifest.config.keys():
+ if key.startswith(f"transforms.{name}."):
+ transform[
+ key.replace(f"transforms.{name}.", "")
+ ] = self.connector_manifest.config[key]
+
+ return self.JdbcParser(
+ db_connection_url,
+ source_platform,
+ database_name,
+ topic_prefix,
+ query,
+ transforms,
+ )
+
+ def default_get_lineages(
+ self,
+ topic_prefix: str,
+ database_name: str,
+ source_platform: str,
+ topic_names: Optional[Iterable[str]] = None,
+ include_source_dataset: bool = True,
+ ) -> List[KafkaConnectLineage]:
+ lineages: List[KafkaConnectLineage] = []
+ if not topic_names:
+ topic_names = self.connector_manifest.topic_names
+ table_name_tuples: List[Tuple] = self.get_table_names()
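+        # (schema, table) tuples discovered from the task configs or the "table.whitelist" config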
+ for topic in topic_names:
+            # The topic maps directly back to a source table when there are no transforms,
+            # or when every configured transform is a known non-topic-routing transform.
+ source_table: str = (
+ remove_prefix(topic, topic_prefix) if topic_prefix else topic
+ )
+ # include schema name for three-level hierarchies
+ if has_three_level_hierarchy(source_platform):
+ table_name_tuple: Tuple = next(
+ iter([t for t in table_name_tuples if t and t[-1] == source_table]),
+ (),
+ )
+ if len(table_name_tuple) > 1:
+ source_table = f"{table_name_tuple[-2]}.{source_table}"
+ else:
+ include_source_dataset = False
+ self.report.warning(
+ "Could not find schema for table"
+ f"{self.connector_manifest.name} : {source_table}",
+ )
+ dataset_name: str = get_dataset_name(database_name, source_table)
+ lineage = KafkaConnectLineage(
+ source_dataset=dataset_name if include_source_dataset else None,
+ source_platform=source_platform,
+ target_dataset=topic,
+ target_platform=KAFKA,
+ )
+ lineages.append(lineage)
+ return lineages
+
+ def get_table_names(self) -> List[Tuple]:
+ sep: str = "."
+ leading_quote_char: str = '"'
+ trailing_quote_char: str = leading_quote_char
+
+ table_ids: List[str] = []
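+        # Task configs list fully qualified table ids, quoted when "quote.sql.identifiers" is
+        # "always", e.g. '"schema"."table"'; the quote characters are sniffed from the ids below.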
+ if self.connector_manifest.tasks:
+ table_ids = (
+ ",".join(
+ [
+ task["config"].get("tables")
+ for task in self.connector_manifest.tasks
+ ]
+ )
+ ).split(",")
+ quote_method = self.connector_manifest.config.get(
+ "quote.sql.identifiers", "always"
+ )
+ if (
+ quote_method == "always"
+ and table_ids
+ and table_ids[0]
+ and table_ids[-1]
+ ):
+ leading_quote_char = table_ids[0][0]
+ trailing_quote_char = table_ids[-1][-1]
+ # This will only work for single character quotes
+ elif self.connector_manifest.config.get("table.whitelist"):
+ table_ids = self.connector_manifest.config.get("table.whitelist").split(",") # type: ignore
+
+ # List of Tuple containing (schema, table)
+ tables: List[Tuple] = [
+ (
+ (
+ unquote(
+ table_id.split(sep)[-2], leading_quote_char, trailing_quote_char
+ )
+ if len(table_id.split(sep)) > 1
+ else ""
+ ),
+ unquote(
+ table_id.split(sep)[-1], leading_quote_char, trailing_quote_char
+ ),
+ )
+ for table_id in table_ids
+ ]
+ return tables
+
+ def extract_flow_property_bag(self) -> Dict[str, str]:
+ flow_property_bag = {
+ k: v
+ for k, v in self.connector_manifest.config.items()
+ if k not in ["connection.password", "connection.user"]
+ }
+
+ # Mask/Remove properties that may reveal credentials
+ flow_property_bag["connection.url"] = self.get_parser(
+ self.connector_manifest
+ ).db_connection_url
+
+ return flow_property_bag
+
+ def extract_lineages(self) -> List[KafkaConnectLineage]:
+ lineages: List[KafkaConnectLineage] = list()
+ parser = self.get_parser(self.connector_manifest)
+ source_platform = parser.source_platform
+ database_name = parser.database_name
+ query = parser.query
+ topic_prefix = parser.topic_prefix
+ transforms = parser.transforms
+
+ logging.debug(
+ f"Extracting source platform: {source_platform} and database name: {database_name} from connection url "
+ )
+
+ if not self.connector_manifest.topic_names:
+ return lineages
+
+ if query:
+ # Lineage source_table can be extracted by parsing query
+ for topic in self.connector_manifest.topic_names:
+ # default method - as per earlier implementation
+ dataset_name: str = get_dataset_name(database_name, topic)
+
+ lineage = KafkaConnectLineage(
+ source_dataset=None,
+ source_platform=source_platform,
+ target_dataset=topic,
+ target_platform=KAFKA,
+ )
+ lineages.append(lineage)
+ self.report.warning(
+ "Could not find input dataset, the connector has query configuration set",
+ self.connector_manifest.name,
+ )
+ return lineages
+
+ SINGLE_TRANSFORM = len(transforms) == 1
+ NO_TRANSFORM = len(transforms) == 0
+ UNKNOWN_TRANSFORM = any(
+ [
+ transform["type"]
+ not in self.KNOWN_TOPICROUTING_TRANSFORMS
+ + self.KNOWN_NONTOPICROUTING_TRANSFORMS
+ for transform in transforms
+ ]
+ )
+ ALL_TRANSFORMS_NON_TOPICROUTING = all(
+ [
+ transform["type"] in self.KNOWN_NONTOPICROUTING_TRANSFORMS
+ for transform in transforms
+ ]
+ )
+
+ if NO_TRANSFORM or ALL_TRANSFORMS_NON_TOPICROUTING:
+ return self.default_get_lineages(
+ database_name=database_name,
+ source_platform=source_platform,
+ topic_prefix=topic_prefix,
+ )
+
+ if SINGLE_TRANSFORM and transforms[0]["type"] == self.REGEXROUTER:
+ tables = self.get_table_names()
+ topic_names = list(self.connector_manifest.topic_names)
+
+ from java.util.regex import Pattern
+
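+            # Rebuild each table's default topic name ("<topic.prefix><table>"), apply the
+            # RegexRouter to it, and keep the lineage only if the routed topic exists on the connector.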
+ for table in tables:
+ source_table: str = table[-1]
+ topic = topic_prefix + source_table if topic_prefix else source_table
+
+ transform_regex = Pattern.compile(transforms[0]["regex"])
+ transform_replacement = transforms[0]["replacement"]
+
+ matcher = transform_regex.matcher(topic)
+ if matcher.matches():
+ topic = str(matcher.replaceFirst(transform_replacement))
+
+                    # Additional check to confirm that the routed topic is actually
+                    # present in the connector's topic list
+
+ if topic in self.connector_manifest.topic_names:
+ # include schema name for three-level hierarchies
+ if has_three_level_hierarchy(source_platform) and len(table) > 1:
+ source_table = f"{table[-2]}.{table[-1]}"
+
+ dataset_name = get_dataset_name(database_name, source_table)
+
+ lineage = KafkaConnectLineage(
+ source_dataset=dataset_name,
+ source_platform=source_platform,
+ target_dataset=topic,
+ target_platform=KAFKA,
+ )
+ topic_names.remove(topic)
+ lineages.append(lineage)
+
+ if topic_names:
+ lineages.extend(
+ self.default_get_lineages(
+ database_name=database_name,
+ source_platform=source_platform,
+ topic_prefix=topic_prefix,
+ topic_names=topic_names,
+ include_source_dataset=False,
+ )
+ )
+ self.report.warning(
+ "Could not find input dataset for connector topics",
+ f"{self.connector_manifest.name} : {topic_names}",
+ )
+ return lineages
+ else:
+ include_source_dataset = True
+ if SINGLE_TRANSFORM and UNKNOWN_TRANSFORM:
+ self.report.warning(
+ "Could not find input dataset, connector has unknown transform",
+ f"{self.connector_manifest.name} : {transforms[0]['type']}",
+ )
+ include_source_dataset = False
+ if not SINGLE_TRANSFORM and UNKNOWN_TRANSFORM:
+ self.report.warning(
+ "Could not find input dataset, connector has one or more unknown transforms",
+ self.connector_manifest.name,
+ )
+ include_source_dataset = False
+ lineages = self.default_get_lineages(
+ database_name=database_name,
+ source_platform=source_platform,
+ topic_prefix=topic_prefix,
+ include_source_dataset=include_source_dataset,
+ )
+ return lineages
+
+
+@dataclass
+class MongoSourceConnector(BaseConnector):
+ # https://www.mongodb.com/docs/kafka-connector/current/source-connector/
+
+ @dataclass
+ class MongoSourceParser:
+ db_connection_url: Optional[str]
+ source_platform: str
+ database_name: Optional[str]
+ topic_prefix: Optional[str]
+ transforms: List[str]
+
+ def get_parser(
+ self,
+ connector_manifest: ConnectorManifest,
+ ) -> MongoSourceParser:
+ parser = self.MongoSourceParser(
+ db_connection_url=connector_manifest.config.get("connection.uri"),
+ source_platform="mongodb",
+ database_name=connector_manifest.config.get("database"),
+ topic_prefix=connector_manifest.config.get("topic_prefix"),
+ transforms=(
+ connector_manifest.config["transforms"].split(",")
+ if "transforms" in connector_manifest.config
+ else []
+ ),
+ )
+
+ return parser
+
+ def extract_lineages(self) -> List[KafkaConnectLineage]:
+ lineages: List[KafkaConnectLineage] = list()
+ parser = self.get_parser(self.connector_manifest)
+ source_platform = parser.source_platform
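+        # Topics are expected to follow the "mongodb.<database>.<collection>" naming pattern.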
+ topic_naming_pattern = r"mongodb\.(\w+)\.(\w+)"
+
+ if not self.connector_manifest.topic_names:
+ return lineages
+
+ for topic in self.connector_manifest.topic_names:
+ found = re.search(re.compile(topic_naming_pattern), topic)
+
+ if found:
+ table_name = get_dataset_name(found.group(1), found.group(2))
+
+ lineage = KafkaConnectLineage(
+ source_dataset=table_name,
+ source_platform=source_platform,
+ target_dataset=topic,
+ target_platform=KAFKA,
+ )
+ lineages.append(lineage)
+ return lineages
+
+
+@dataclass
+class DebeziumSourceConnector(BaseConnector):
+ @dataclass
+ class DebeziumParser:
+ source_platform: str
+ server_name: Optional[str]
+ database_name: Optional[str]
+
+ def get_server_name(self, connector_manifest: ConnectorManifest) -> str:
+ if "topic.prefix" in connector_manifest.config:
+ return connector_manifest.config["topic.prefix"]
+ else:
+ return connector_manifest.config.get("database.server.name", "")
+
+ def get_parser(
+ self,
+ connector_manifest: ConnectorManifest,
+ ) -> DebeziumParser:
+ connector_class = connector_manifest.config.get(CONNECTOR_CLASS, "")
+
+ if connector_class == "io.debezium.connector.mysql.MySqlConnector":
+ parser = self.DebeziumParser(
+ source_platform="mysql",
+ server_name=self.get_server_name(connector_manifest),
+ database_name=None,
+ )
+ elif connector_class == "MySqlConnector":
+ parser = self.DebeziumParser(
+ source_platform="mysql",
+ server_name=self.get_server_name(connector_manifest),
+ database_name=None,
+ )
+ elif connector_class == "io.debezium.connector.mongodb.MongoDbConnector":
+ parser = self.DebeziumParser(
+ source_platform="mongodb",
+ server_name=self.get_server_name(connector_manifest),
+ database_name=None,
+ )
+ elif connector_class == "io.debezium.connector.postgresql.PostgresConnector":
+ parser = self.DebeziumParser(
+ source_platform="postgres",
+ server_name=self.get_server_name(connector_manifest),
+ database_name=connector_manifest.config.get("database.dbname"),
+ )
+ elif connector_class == "io.debezium.connector.oracle.OracleConnector":
+ parser = self.DebeziumParser(
+ source_platform="oracle",
+ server_name=self.get_server_name(connector_manifest),
+ database_name=connector_manifest.config.get("database.dbname"),
+ )
+ elif connector_class == "io.debezium.connector.sqlserver.SqlServerConnector":
+ database_name = connector_manifest.config.get(
+ "database.names"
+ ) or connector_manifest.config.get("database.dbname")
+
+ if "," in str(database_name):
+ raise Exception(
+ f"Only one database is supported for Debezium's SQL Server connector. Found: {database_name}"
+ )
+
+ parser = self.DebeziumParser(
+ source_platform="mssql",
+ server_name=self.get_server_name(connector_manifest),
+ database_name=database_name,
+ )
+ elif connector_class == "io.debezium.connector.db2.Db2Connector":
+ parser = self.DebeziumParser(
+ source_platform="db2",
+ server_name=self.get_server_name(connector_manifest),
+ database_name=connector_manifest.config.get("database.dbname"),
+ )
+ elif connector_class == "io.debezium.connector.vitess.VitessConnector":
+ parser = self.DebeziumParser(
+ source_platform="vitess",
+ server_name=self.get_server_name(connector_manifest),
+ database_name=connector_manifest.config.get("vitess.keyspace"),
+ )
+ else:
+ raise ValueError(f"Connector class '{connector_class}' is unknown.")
+
+ return parser
+
+ def extract_lineages(self) -> List[KafkaConnectLineage]:
+ lineages: List[KafkaConnectLineage] = list()
+
+ try:
+ parser = self.get_parser(self.connector_manifest)
+ source_platform = parser.source_platform
+ server_name = parser.server_name
+ database_name = parser.database_name
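+            # Debezium topics are named "<server_name>.<schema>.<table>", where server_name comes
+            # from "topic.prefix" (newer versions) or "database.server.name" (older versions).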
+ topic_naming_pattern = rf"({server_name})\.(\w+\.\w+)"
+
+ if not self.connector_manifest.topic_names:
+ return lineages
+
+ for topic in self.connector_manifest.topic_names:
+ found = re.search(re.compile(topic_naming_pattern), topic)
+
+ if found:
+ table_name = get_dataset_name(database_name, found.group(2))
+
+ lineage = KafkaConnectLineage(
+ source_dataset=table_name,
+ source_platform=source_platform,
+ target_dataset=topic,
+ target_platform=KAFKA,
+ )
+ lineages.append(lineage)
+ return lineages
+ except Exception as e:
+ self.report.warning(
+ "Error resolving lineage for connector",
+ self.connector_manifest.name,
+ exc=e,
+ )
+
+ return []
+
+
+@dataclass
+class ConfigDrivenSourceConnector(BaseConnector):
+ def extract_lineages(self) -> List[KafkaConnectLineage]:
+ lineages = []
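+        # generic_connectors config entries map a connector name to a user-provided source platform and dataset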
+ for connector in self.config.generic_connectors:
+ if connector.connector_name == self.connector_manifest.name:
+ target_connector = connector
+ break
+ for topic in self.connector_manifest.topic_names:
+ lineage = KafkaConnectLineage(
+ source_dataset=target_connector.source_dataset,
+ source_platform=target_connector.source_platform,
+ target_dataset=topic,
+ target_platform=KAFKA,
+ )
+ lineages.append(lineage)
+ return lineages
+
+
+JDBC_SOURCE_CONNECTOR_CLASS = "io.confluent.connect.jdbc.JdbcSourceConnector"
+DEBEZIUM_SOURCE_CONNECTOR_PREFIX = "io.debezium.connector"
+MONGO_SOURCE_CONNECTOR_CLASS = "com.mongodb.kafka.connect.MongoSourceConnector"
diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py
index 57a251ef2ed14f..a66962f962255f 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py
@@ -31,6 +31,10 @@
from pydantic.class_validators import validator
import datahub.emitter.mce_builder as builder
+from datahub.api.entities.platformresource.platform_resource import (
+ PlatformResource,
+ PlatformResourceKey,
+)
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.emitter.mcp_builder import ContainerKey, create_embed_mcp
from datahub.ingestion.api.report import Report
@@ -106,7 +110,7 @@
from datahub.utilities.url_util import remove_port_from_url
CORPUSER_DATAHUB = "urn:li:corpuser:datahub"
-
+LOOKER = "looker"
logger = logging.getLogger(__name__)
@@ -1411,6 +1415,7 @@ class LookerDashboardSourceReport(StaleEntityRemovalSourceReport):
resolved_user_ids: int = 0
email_ids_missing: int = 0 # resolved users with missing email addresses
+ looker_user_count: int = 0
_looker_api: Optional[LookerAPI] = None
query_latency: Dict[str, datetime.timedelta] = dataclasses_field(
@@ -1614,9 +1619,21 @@ def get_urn_dashboard_id(self):
class LookerUserRegistry:
looker_api_wrapper: LookerAPI
fields: str = ",".join(["id", "email", "display_name", "first_name", "last_name"])
+ _user_cache: Dict[str, LookerUser] = {}
- def __init__(self, looker_api: LookerAPI):
+ def __init__(self, looker_api: LookerAPI, report: LookerDashboardSourceReport):
self.looker_api_wrapper = looker_api
+ self.report = report
+ self._initialize_user_cache()
+
+ def _initialize_user_cache(self) -> None:
+ raw_users: Sequence[User] = self.looker_api_wrapper.all_users(
+ user_fields=self.fields
+ )
+
+ for raw_user in raw_users:
+ looker_user = LookerUser.create_looker_user(raw_user)
+ self._user_cache[str(looker_user.id)] = looker_user
def get_by_id(self, id_: str) -> Optional[LookerUser]:
if not id_:
@@ -1624,6 +1641,9 @@ def get_by_id(self, id_: str) -> Optional[LookerUser]:
logger.debug(f"Will get user {id_}")
+ if str(id_) in self._user_cache:
+ return self._user_cache.get(str(id_))
+
raw_user: Optional[User] = self.looker_api_wrapper.get_user(
str(id_), user_fields=self.fields
)
@@ -1632,3 +1652,35 @@ def get_by_id(self, id_: str) -> Optional[LookerUser]:
looker_user = LookerUser.create_looker_user(raw_user)
return looker_user
+
+ def to_platform_resource(
+ self, platform_instance: Optional[str]
+ ) -> Iterable[MetadataChangeProposalWrapper]:
+ try:
+ platform_resource_key = PlatformResourceKey(
+ platform=LOOKER,
+ resource_type="USER_ID_MAPPING",
+ platform_instance=platform_instance,
+ primary_key="",
+ )
+
+ # Extract user email mappings
+ user_email_cache = {
+ user_id: user.email
+ for user_id, user in self._user_cache.items()
+ if user.email
+ }
+
+ platform_resource = PlatformResource.create(
+ key=platform_resource_key,
+ value=user_email_cache,
+ )
+
+ self.report.looker_user_count = len(user_email_cache)
+ yield from platform_resource.to_mcps()
+
+ except Exception as exc:
+ self.report.warning(
+ message="Failed to generate platform resource for looker id mappings",
+ exc=exc,
+ )
diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_lib_wrapper.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_lib_wrapper.py
index ab55d4e15e5de4..c3f2a110136c45 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_lib_wrapper.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_lib_wrapper.py
@@ -68,6 +68,7 @@ class LookerAPIStats(BaseModel):
get_look_calls: int = 0
search_looks_calls: int = 0
search_dashboards_calls: int = 0
+ all_user_calls: int = 0
class LookerAPI:
@@ -135,7 +136,7 @@ def get_available_permissions(self) -> Set[str]:
return permissions
- @lru_cache(maxsize=1000)
+ @lru_cache(maxsize=5000)
def get_user(self, id_: str, user_fields: str) -> Optional[User]:
self.client_stats.user_calls += 1
try:
@@ -154,6 +155,17 @@ def get_user(self, id_: str, user_fields: str) -> Optional[User]:
# User not found
return None
+ def all_users(self, user_fields: str) -> Sequence[User]:
+ self.client_stats.all_user_calls += 1
+ try:
+ return self.client.all_users(
+ fields=cast(str, user_fields),
+ transport_options=self.transport_options,
+ )
+ except SDKError as e:
+ logger.warning(f"Failure was {e}")
+ return []
+
def execute_query(self, write_query: WriteQuery) -> List[Dict]:
logger.debug(f"Executing query {write_query}")
self.client_stats.query_calls += 1
diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py
index cd8ccb8217257c..815c5dfb1c0147 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py
@@ -145,7 +145,9 @@ def __init__(self, config: LookerDashboardSourceConfig, ctx: PipelineContext):
self.source_config: LookerDashboardSourceConfig = config
self.reporter: LookerDashboardSourceReport = LookerDashboardSourceReport()
self.looker_api: LookerAPI = LookerAPI(self.source_config)
- self.user_registry: LookerUserRegistry = LookerUserRegistry(self.looker_api)
+ self.user_registry: LookerUserRegistry = LookerUserRegistry(
+ self.looker_api, self.reporter
+ )
self.explore_registry: LookerExploreRegistry = LookerExploreRegistry(
self.looker_api, self.reporter, self.source_config
)
@@ -1673,5 +1675,14 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
yield usage_mcp.as_workunit()
self.reporter.report_stage_end("usage_extraction")
+ # Dump looker user resource mappings.
+ logger.info("Ingesting looker user resource mapping workunits")
+ self.reporter.report_stage_start("user_resource_extraction")
+ yield from auto_workunit(
+ self.user_registry.to_platform_resource(
+ self.source_config.platform_instance
+ )
+ )
+
def get_report(self) -> SourceReport:
return self.reporter
diff --git a/metadata-ingestion/src/datahub/ingestion/source/mlflow.py b/metadata-ingestion/src/datahub/ingestion/source/mlflow.py
index cef6d2b1bb5774..26d160acf330cf 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/mlflow.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/mlflow.py
@@ -38,16 +38,30 @@
class MLflowConfig(EnvConfigMixin):
tracking_uri: Optional[str] = Field(
default=None,
- description="Tracking server URI. If not set, an MLflow default tracking_uri is used (local `mlruns/` directory or `MLFLOW_TRACKING_URI` environment variable)",
+ description=(
+ "Tracking server URI. If not set, an MLflow default tracking_uri is used"
+ " (local `mlruns/` directory or `MLFLOW_TRACKING_URI` environment variable)"
+ ),
)
registry_uri: Optional[str] = Field(
default=None,
- description="Registry server URI. If not set, an MLflow default registry_uri is used (value of tracking_uri or `MLFLOW_REGISTRY_URI` environment variable)",
+ description=(
+ "Registry server URI. If not set, an MLflow default registry_uri is used"
+ " (value of tracking_uri or `MLFLOW_REGISTRY_URI` environment variable)"
+ ),
)
model_name_separator: str = Field(
default="_",
description="A string which separates model name from its version (e.g. model_1 or model-1)",
)
+ base_external_url: Optional[str] = Field(
+ default=None,
+ description=(
+ "Base URL to use when constructing external URLs to MLflow."
+ " If not set, tracking_uri is used if it's an HTTP URL."
+ " If neither is set, external URLs are not generated."
+ ),
+ )
@dataclass
@@ -279,12 +293,23 @@ def _make_ml_model_urn(self, model_version: ModelVersion) -> str:
)
return urn
- def _make_external_url(self, model_version: ModelVersion) -> Union[None, str]:
+ def _get_base_external_url_from_tracking_uri(self) -> Optional[str]:
+ if isinstance(
+ self.client.tracking_uri, str
+ ) and self.client.tracking_uri.startswith("http"):
+ return self.client.tracking_uri
+ else:
+ return None
+
+ def _make_external_url(self, model_version: ModelVersion) -> Optional[str]:
"""
Generate URL for a Model Version to MLflow UI.
"""
- base_uri = self.client.tracking_uri
- if base_uri.startswith("http"):
+ base_uri = (
+ self.config.base_external_url
+ or self._get_base_external_url_from_tracking_uri()
+ )
+ if base_uri:
return f"{base_uri.rstrip('/')}/#/models/{model_version.name}/versions/{model_version.version}"
else:
return None
diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py
index f7458c4eb4d5b5..b49d40a0c7eb6a 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py
@@ -9,7 +9,7 @@
import datahub.emitter.mce_builder as builder
from datahub.configuration.common import AllowDenyPattern, ConfigModel
-from datahub.configuration.source_common import DatasetSourceConfigMixin
+from datahub.configuration.source_common import DatasetSourceConfigMixin, PlatformDetail
from datahub.configuration.validate_field_deprecation import pydantic_field_deprecated
from datahub.ingestion.source.common.subtypes import BIAssetSubTypes
from datahub.ingestion.source.state.stale_entity_removal_handler import (
@@ -232,19 +232,6 @@ def default_for_dataset_type_mapping() -> Dict[str, str]:
return dict_
-class PlatformDetail(ConfigModel):
- platform_instance: Optional[str] = pydantic.Field(
- default=None,
- description="DataHub platform instance name. To generate correct urn for upstream dataset, this should match "
- "with platform instance name used in ingestion "
- "recipe of other datahub sources.",
- )
- env: str = pydantic.Field(
- default=builder.DEFAULT_ENV,
- description="The environment that all assets produced by DataHub platform ingestion source belong to",
- )
-
-
class DataBricksPlatformDetail(PlatformDetail):
"""
metastore is an additional field used in Databricks connector to generate the dataset urn
diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py
index baaa8d5b85ae10..6d51e853a2fb06 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py
@@ -2,8 +2,8 @@
from abc import ABC, abstractmethod
from typing import Union
+from datahub.configuration.source_common import PlatformDetail
from datahub.ingestion.source.powerbi.config import (
- PlatformDetail,
PowerBiDashboardSourceConfig,
PowerBIPlatformDetail,
)
diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/pattern_handler.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/pattern_handler.py
index ffaed79f4e42a6..63520bd731de86 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/pattern_handler.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/pattern_handler.py
@@ -5,13 +5,13 @@
from lark import Tree
+from datahub.configuration.source_common import PlatformDetail
from datahub.emitter import mce_builder as builder
from datahub.ingestion.api.common import PipelineContext
from datahub.ingestion.source.powerbi.config import (
Constant,
DataBricksPlatformDetail,
DataPlatformPair,
- PlatformDetail,
PowerBiDashboardSourceConfig,
PowerBiDashboardSourceReport,
PowerBIPlatformDetail,
diff --git a/metadata-ingestion/src/datahub/ingestion/source/pulsar.py b/metadata-ingestion/src/datahub/ingestion/source/pulsar.py
index 15ee995b2d5fdc..f71949b9eb27f7 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/pulsar.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/pulsar.py
@@ -89,7 +89,16 @@ def __init__(self, schema):
logger.error(f"Invalid JSON schema: {schema_data}. Error: {str(e)}")
avro_schema = {}
- self.schema_name = avro_schema.get("namespace") + "." + avro_schema.get("name")
+ self.schema_name = "null"
+ if avro_schema.get("namespace") and avro_schema.get("name"):
+ self.schema_name = (
+ avro_schema.get("namespace") + "." + avro_schema.get("name")
+ )
+ elif avro_schema.get("namespace"):
+ self.schema_name = avro_schema.get("namespace")
+ elif avro_schema.get("name"):
+ self.schema_name = avro_schema.get("name")
+
self.schema_description = avro_schema.get("doc")
self.schema_type = schema.get("type")
self.schema_str = schema.get("data")
diff --git a/metadata-ingestion/src/datahub/ingestion/source/s3/source.py b/metadata-ingestion/src/datahub/ingestion/source/s3/source.py
index 1863663f98bb24..3ddf47b70cdf80 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/s3/source.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/s3/source.py
@@ -9,6 +9,7 @@
from itertools import groupby
from pathlib import PurePath
from typing import Any, Dict, Iterable, List, Optional, Tuple
+from urllib.parse import urlparse
import smart_open.compression as so_compression
from more_itertools import peekable
@@ -993,9 +994,7 @@ def s3_browser(self, path_spec: PathSpec, sample_size: int) -> Iterable[BrowsePa
folders = []
for dir in dirs_to_process:
logger.info(f"Getting files from folder: {dir}")
- prefix_to_process = dir.rstrip("\\").lstrip(
- self.create_s3_path(bucket_name, "/")
- )
+ prefix_to_process = urlparse(dir).path.lstrip("/")
folders.extend(
self.get_folder_info(
diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py
index 93d84d8b246e51..c769c6705ac3f6 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py
@@ -414,9 +414,13 @@ def _process_upstream_lineage_row(
except Exception as e:
self.report.num_upstream_lineage_edge_parsing_failed += 1
upstream_tables = db_row.get("UPSTREAM_TABLES")
+ downstream_table = db_row.get("DOWNSTREAM_TABLE_NAME")
self.structured_reporter.warning(
"Failed to parse lineage edge",
- context=f"Upstreams: {upstream_tables} Downstreams: {db_row.get('DOWNSTREAM_TABLE_NAME')}",
+ # Tricky: sometimes the full row data is too large, and so the context
+ # message gets truncated. By pulling out the upstreams and downstream
+ # list, we can at least get the important fields if truncation does occur.
+ context=f"Upstreams: {upstream_tables} Downstream: {downstream_table} Full row: {db_row}",
exc=e,
)
return None
diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_query.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_query.py
index 99790de529ac3a..97c398c1962d6b 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_query.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_query.py
@@ -237,6 +237,19 @@ def show_views_for_database(
LIMIT {limit} {from_clause};
"""
+ @staticmethod
+ def get_secure_view_definitions() -> str:
+ # https://docs.snowflake.com/en/sql-reference/account-usage/views
+ return """
+ SELECT
+ TABLE_CATALOG as "TABLE_CATALOG",
+ TABLE_SCHEMA as "TABLE_SCHEMA",
+ TABLE_NAME as "TABLE_NAME",
+ VIEW_DEFINITION as "VIEW_DEFINITION"
+ FROM SNOWFLAKE.ACCOUNT_USAGE.VIEWS
+ WHERE IS_SECURE = 'YES' AND VIEW_DEFINITION !='' AND DELETED IS NULL
+ """
+
@staticmethod
def columns_for_schema(
schema_name: str,
diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema.py
index 5a69b4bb779d72..780effc82b0163 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema.py
@@ -266,6 +266,22 @@ def get_schemas_for_database(self, db_name: str) -> List[SnowflakeSchema]:
snowflake_schemas.append(snowflake_schema)
return snowflake_schemas
+ @serialized_lru_cache(maxsize=1)
+ def get_secure_view_definitions(self) -> Dict[str, Dict[str, Dict[str, str]]]:
+ secure_view_definitions: Dict[str, Dict[str, Dict[str, str]]] = defaultdict(
+ lambda: defaultdict(lambda: defaultdict())
+ )
+ cur = self.connection.query(SnowflakeQuery.get_secure_view_definitions())
+ for view in cur:
+ db_name = view["TABLE_CATALOG"]
+ schema_name = view["TABLE_SCHEMA"]
+ view_name = view["TABLE_NAME"]
+ secure_view_definitions[db_name][schema_name][view_name] = view[
+ "VIEW_DEFINITION"
+ ]
+
+ return secure_view_definitions
+
@serialized_lru_cache(maxsize=1)
def get_tables_for_database(
self, db_name: str
diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py
index 4ceeb8560c1758..bc64693b6a1084 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py
@@ -424,6 +424,10 @@ def _process_schema(
view_identifier = self.identifiers.get_dataset_identifier(
view.name, schema_name, db_name
)
+ if view.is_secure and not view.view_definition:
+ view.view_definition = self.fetch_secure_view_definition(
+ view.name, schema_name, db_name
+ )
if view.view_definition:
self.aggregator.add_view_definition(
view_urn=self.identifiers.gen_dataset_urn(view_identifier),
@@ -449,6 +453,25 @@ def _process_schema(
context=f"{db_name}.{schema_name}",
)
+ def fetch_secure_view_definition(
+ self, table_name: str, schema_name: str, db_name: str
+ ) -> Optional[str]:
+ try:
+ view_definitions = self.data_dictionary.get_secure_view_definitions()
+ return view_definitions[db_name][schema_name][table_name]
+ except Exception as e:
+ if isinstance(e, SnowflakePermissionError):
+ error_msg = (
+ "Failed to get secure views definitions. Please check permissions."
+ )
+ else:
+ error_msg = "Failed to get secure views definitions"
+ self.structured_reporter.warning(
+ error_msg,
+ exc=e,
+ )
+ return None
+
def fetch_views_for_schema(
self, snowflake_schema: SnowflakeSchema, db_name: str, schema_name: str
) -> List[SnowflakeView]:
diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py
index c3a7912c40e8ee..e5883dd0349a3a 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py
@@ -540,6 +540,7 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
identifiers=self.identifiers,
schema_resolver=schema_resolver,
discovered_tables=discovered_datasets,
+ graph=self.ctx.graph,
)
# TODO: This is slightly suboptimal because we create two SqlParsingAggregator instances with different configs
diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py
index 6844b8a425a7b6..6cc2220d90fd93 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py
@@ -111,6 +111,8 @@
tableau_field_to_schema_field,
workbook_graphql_query,
)
+from datahub.ingestion.source.tableau.tableau_server_wrapper import UserInfo
+from datahub.ingestion.source.tableau.tableau_validation import check_user_role
from datahub.metadata.com.linkedin.pegasus2avro.common import (
AuditStamp,
ChangeAuditStamps,
@@ -167,7 +169,7 @@
try:
# On earlier versions of the tableauserverclient, the NonXMLResponseError
- # was thrown when reauthentication was needed. We'll keep both exceptions
+ # was thrown when reauthentication was necessary. We'll keep both exceptions
# around for now, but can remove this in the future.
from tableauserverclient.server.endpoint.exceptions import ( # type: ignore
NotSignedInError,
@@ -632,6 +634,33 @@ class TableauSourceReport(StaleEntityRemovalSourceReport):
num_upstream_table_lineage_failed_parse_sql: int = 0
num_upstream_fine_grained_lineage_failed_parse_sql: int = 0
num_hidden_assets_skipped: int = 0
+ logged_in_user: List[UserInfo] = []
+
+
+def report_user_role(report: TableauSourceReport, server: Server) -> None:
+ title: str = "Insufficient Permissions"
+ message: str = "The user must have the `Site Administrator Explorer` role to perform metadata ingestion."
+ try:
+        # TableauSiteSource is instantiated per site, so we look up the user's details each time;
+        # the site role may differ from one site to another.
+ logged_in_user: UserInfo = UserInfo.from_server(server=server)
+
+ if not logged_in_user.is_site_administrator_explorer():
+ report.warning(
+ title=title,
+ message=message,
+ context=f"user-name={logged_in_user.user_name}, role={logged_in_user.site_role}, site_id={logged_in_user.site_id}",
+ )
+
+ report.logged_in_user.append(logged_in_user)
+
+ except Exception as e:
+ report.warning(
+ title=title,
+ message="Failed to verify the user's role. The user must have `Site Administrator Explorer` role.",
+ context=f"{e}",
+ exc=e,
+ )
@platform_name("Tableau")
@@ -676,6 +705,7 @@ def _authenticate(self, site_content_url: str) -> None:
try:
logger.info(f"Authenticated to Tableau site: '{site_content_url}'")
self.server = self.config.make_tableau_client(site_content_url)
+ report_user_role(report=self.report, server=self.server)
# Note that we're not catching ConfigurationError, since we want that to throw.
except ValueError as e:
self.report.failure(
@@ -689,9 +719,17 @@ def test_connection(config_dict: dict) -> TestConnectionReport:
test_report = TestConnectionReport()
try:
source_config = TableauConfig.parse_obj_allow_extras(config_dict)
- source_config.make_tableau_client(source_config.site)
+
+ server = source_config.make_tableau_client(source_config.site)
+
test_report.basic_connectivity = CapabilityReport(capable=True)
+
+ test_report.capability_report = check_user_role(
+ logged_in_user=UserInfo.from_server(server=server)
+ )
+
except Exception as e:
+ logger.warning(f"{e}", exc_info=e)
test_report.basic_connectivity = CapabilityReport(
capable=False, failure_reason=str(e)
)
@@ -831,6 +869,8 @@ def __init__(
# when emitting custom SQL data sources.
self.custom_sql_ids_being_used: List[str] = []
+ report_user_role(report=report, server=server)
+
@property
def no_env_browse_prefix(self) -> str:
# Prefix to use with browse path (v1)
@@ -1290,7 +1330,6 @@ def get_connection_objects(
page_size = page_size_override or self.config.page_size
filter_pages = get_filter_pages(query_filter, page_size)
-
for filter_page in filter_pages:
has_next_page = 1
current_cursor: Optional[str] = None
diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau_common.py b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau_common.py
index c5d14e0afe15a5..61b56c4bee5bda 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau_common.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau_common.py
@@ -975,15 +975,22 @@ def get_filter_pages(query_filter: dict, page_size: int) -> List[dict]:
# a few ten thousand, then tableau server responds with empty response
# causing below error:
# tableauserverclient.server.endpoint.exceptions.NonXMLResponseError: b''
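+ # Illustrative example: with page_size=2, a filter {c.ID_WITH_IN: ["a", "b", "c"]}
+ # is split into [{c.ID_WITH_IN: ["a", "b"]}, {c.ID_WITH_IN: ["c"]}].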
+
+ # in practice, we only do pagination if len(query_filter.keys()) == 1
+ if len(query_filter.keys()) != 1:
+ return filter_pages
+
+ current_key = list(query_filter.keys())[0]
+
if (
- len(query_filter.keys()) == 1
- and query_filter.get(c.ID_WITH_IN)
- and isinstance(query_filter[c.ID_WITH_IN], list)
+ current_key in [c.ID_WITH_IN, c.PROJECT_NAME_WITH_IN]
+ and query_filter.get(current_key)
+ and isinstance(query_filter[current_key], list)
):
- ids = query_filter[c.ID_WITH_IN]
+ ids = query_filter[current_key]
filter_pages = [
{
- c.ID_WITH_IN: ids[
+ current_key: ids[
start : (
start + page_size if start + page_size < len(ids) else len(ids)
)
diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau_constant.py b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau_constant.py
index d1dd0d92819991..ea0878143ef354 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau_constant.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau_constant.py
@@ -81,3 +81,5 @@
PROJECT = "Project"
SITE = "Site"
IS_UNSUPPORTED_CUSTOM_SQL = "isUnsupportedCustomSql"
+SITE_PERMISSION = "sitePermission"
+SITE_ROLE = "SiteAdministratorExplorer"
diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau_server_wrapper.py b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau_server_wrapper.py
new file mode 100644
index 00000000000000..f309622d12b91b
--- /dev/null
+++ b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau_server_wrapper.py
@@ -0,0 +1,33 @@
+from dataclasses import dataclass
+
+from tableauserverclient import Server, UserItem
+
+from datahub.ingestion.source.tableau import tableau_constant as c
+
+
+@dataclass
+class UserInfo:
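+ """Details of the signed-in Tableau user: name, site role, and site id."""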
+ user_name: str
+ site_role: str
+ site_id: str
+
+ def is_site_administrator_explorer(self):
+ return self.site_role == c.SITE_ROLE
+
+ @staticmethod
+ def from_server(server: Server) -> "UserInfo":
+ assert server.user_id, "Sign in to Tableau before fetching the user details"
+
+ user: UserItem = server.users.get_by_id(server.user_id)
+
+ assert user.site_role, "site_role is not available" # to silence the linter
+
+ assert user.name, "user name is not available" # to silence the linter
+
+ assert server.site_id, "site identifier is not available" # to silence the linter
+
+ return UserInfo(
+ user_name=user.name,
+ site_role=user.site_role,
+ site_id=server.site_id,
+ )
diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau_validation.py b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau_validation.py
new file mode 100644
index 00000000000000..4a703faf6091b3
--- /dev/null
+++ b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau_validation.py
@@ -0,0 +1,48 @@
+import logging
+from typing import Dict, Union
+
+from datahub.ingestion.api.source import CapabilityReport, SourceCapability
+from datahub.ingestion.source.tableau import tableau_constant as c
+from datahub.ingestion.source.tableau.tableau_server_wrapper import UserInfo
+
+logger = logging.getLogger(__name__)
+
+
+def check_user_role(
+ logged_in_user: UserInfo,
+) -> Dict[Union[SourceCapability, str], CapabilityReport]:
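+ """Return a capability report keyed by sitePermission, indicating whether the user holds the `Site Administrator Explorer` role."""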
+ capability_dict: Dict[Union[SourceCapability, str], CapabilityReport] = {
+ c.SITE_PERMISSION: CapabilityReport(
+ capable=True,
+ )
+ }
+
+ failure_reason: str = (
+ "The user does not have the `Site Administrator Explorer` role."
+ )
+
+ mitigation_message_prefix: str = (
+ "Assign `Site Administrator Explorer` role to the user"
+ )
+ mitigation_message_suffix: str = "Refer to the setup guide: https://datahubproject.io/docs/quick-ingestion-guides/tableau/setup"
+
+ try:
+ # TODO: Add check for `Enable Derived Permissions`
+ if not logged_in_user.is_site_administrator_explorer():
+ capability_dict[c.SITE_PERMISSION] = CapabilityReport(
+ capable=False,
+ failure_reason=f"{failure_reason} Their current role is {logged_in_user.site_role}.",
+ mitigation_message=f"{mitigation_message_prefix} `{logged_in_user.user_name}`. {mitigation_message_suffix}",
+ )
+
+ return capability_dict
+
+ except Exception as e:
+ logger.warning(msg=e, exc_info=e)
+ capability_dict[c.SITE_PERMISSION] = CapabilityReport(
+ capable=False,
+ failure_reason="Failed to verify user role.",
+ mitigation_message=f"{mitigation_message_prefix}. {mitigation_message_suffix}", # user is unknown
+ )
+
+ return capability_dict
diff --git a/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py b/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py
index 79ea98d1c7f54e..f81eb291e89e1d 100644
--- a/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py
+++ b/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py
@@ -490,7 +490,7 @@ def __init__(
self._exit_stack.push(self._query_usage_counts)
# Tool Extractor
- self._tool_meta_extractor = ToolMetaExtractor()
+ self._tool_meta_extractor = ToolMetaExtractor.create(graph)
self.report.tool_meta_report = self._tool_meta_extractor.report
def close(self) -> None:
diff --git a/metadata-ingestion/src/datahub/sql_parsing/tool_meta_extractor.py b/metadata-ingestion/src/datahub/sql_parsing/tool_meta_extractor.py
index 0d85002776e5e2..5af9d9d4f0fffc 100644
--- a/metadata-ingestion/src/datahub/sql_parsing/tool_meta_extractor.py
+++ b/metadata-ingestion/src/datahub/sql_parsing/tool_meta_extractor.py
@@ -1,3 +1,4 @@
+import contextlib
import json
import logging
from dataclasses import dataclass, field
@@ -5,8 +6,15 @@
from typing_extensions import Protocol
+from datahub.api.entities.platformresource.platform_resource import (
+ ElasticPlatformResourceQuery,
+ PlatformResource,
+ PlatformResourceSearchFields,
+)
from datahub.ingestion.api.report import Report
+from datahub.ingestion.graph.client import DataHubGraph
from datahub.metadata.urns import CorpGroupUrn, CorpUserUrn
+from datahub.utilities.search_utils import LogicalOperator
from datahub.utilities.stats_collections import int_top_k_dict
UrnStr = str
@@ -31,6 +39,7 @@ def _get_last_line(query: str) -> str:
@dataclass
class ToolMetaExtractorReport(Report):
num_queries_meta_extracted: Dict[str, int] = field(default_factory=int_top_k_dict)
+ failures: List[str] = field(default_factory=list)
class ToolMetaExtractor:
@@ -42,14 +51,81 @@ class ToolMetaExtractor:
by warehouse query logs.
"""
- def __init__(self) -> None:
- self.report = ToolMetaExtractorReport()
+ def __init__(
+ self,
+ report: ToolMetaExtractorReport,
+ looker_user_mapping: Optional[Dict[str, str]] = None,
+ ) -> None:
+ self.report = report
self.known_tool_extractors: List[Tuple[str, Callable[[QueryLog], bool]]] = [
(
"mode",
self._extract_mode_query,
- )
+ ),
+ (
+ "looker",
+ self._extract_looker_query,
+ ),
]
+ # maps user id (as string) to email address
+ self.looker_user_mapping = looker_user_mapping
+
+ @classmethod
+ def create(
+ cls,
+ graph: Optional[DataHubGraph] = None,
+ ) -> "ToolMetaExtractor":
+ report = ToolMetaExtractorReport()
+ looker_user_mapping = None
+ if graph:
+ try:
+ looker_user_mapping = cls.extract_looker_user_mapping_from_graph(
+ graph, report
+ )
+ except Exception as e:
+ report.failures.append(
+ f"Unexpected error during Looker user metadata extraction: {str(e)}"
+ )
+
+ return cls(report, looker_user_mapping)
+
+ @classmethod
+ def extract_looker_user_mapping_from_graph(
+ cls, graph: DataHubGraph, report: ToolMetaExtractorReport
+ ) -> Optional[Dict[str, str]]:
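+ # The mapping is stored as a Looker platform resource of type USER_ID_MAPPING whose
+ # value is a JSON blob mapping Looker user ids to email addresses, e.g. {"1": "user@example.com"}.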
+ looker_user_mapping = None
+ query = (
+ ElasticPlatformResourceQuery.create_from()
+ .group(LogicalOperator.AND)
+ .add_field_match(PlatformResourceSearchFields.PLATFORM, "looker")
+ .add_field_match(
+ PlatformResourceSearchFields.RESOURCE_TYPE,
+ "USER_ID_MAPPING",
+ )
+ .end()
+ )
+ platform_resources = list(
+ PlatformResource.search_by_filters(query=query, graph_client=graph)
+ )
+
+ if len(platform_resources) > 1:
+ report.failures.append(
+ "Looker user metadata extraction failed: found more than one Looker user id mapping."
+ )
+ elif platform_resources:
+ platform_resource = platform_resources[0]
+
+ if (
+ platform_resource
+ and platform_resource.resource_info
+ and platform_resource.resource_info.value
+ ):
+ with contextlib.suppress(ValueError, AssertionError):
+ value = platform_resource.resource_info.value.as_raw_json()
+ if value:
+ looker_user_mapping = value
+
+ return looker_user_mapping
def _extract_mode_query(self, entry: QueryLog) -> bool:
"""
@@ -78,14 +154,49 @@ def _extract_mode_query(self, entry: QueryLog) -> bool:
return True
+ def _extract_looker_query(self, entry: QueryLog) -> bool:
+ """
+ Returns:
+ bool: whether QueryLog entry is that of looker and looker user info
+ is extracted into entry.
+ """
+ if not self.looker_user_mapping:
+ return False
+
+ last_line = _get_last_line(entry.query_text)
+
+ if not (last_line.startswith("--") and "Looker Query Context" in last_line):
+ return False
+
+ start_quote_idx = last_line.find("'")
+ end_quote_idx = last_line.rfind("'")
+ if start_quote_idx == -1 or end_quote_idx == -1:
+ return False
+
+ looker_json_raw = last_line[start_quote_idx + 1 : end_quote_idx]
+ looker_json = json.loads(looker_json_raw)
+
+ user_id = str(looker_json["user_id"])
+ email = self.looker_user_mapping.get(user_id)
+ if not email:
+ return False
+
+ original_user = entry.user
+
+ entry.user = email_to_user_urn(email)
+ entry.extra_info = entry.extra_info or {}
+ entry.extra_info["user_via"] = original_user
+
+ return True
+
def extract_bi_metadata(self, entry: QueryLog) -> bool:
for tool, meta_extractor in self.known_tool_extractors:
try:
if meta_extractor(entry):
self.report.num_queries_meta_extracted[tool] += 1
return True
- except Exception:
- logger.debug("Tool metadata extraction failed with error : {e}")
+ except Exception as e:
+ logger.debug(f"Tool metadata extraction failed with error : {e}")
return False
diff --git a/metadata-ingestion/src/datahub/testing/compare_metadata_json.py b/metadata-ingestion/src/datahub/testing/compare_metadata_json.py
index bedc5bc8fcd5e5..9dbadd4804997d 100644
--- a/metadata-ingestion/src/datahub/testing/compare_metadata_json.py
+++ b/metadata-ingestion/src/datahub/testing/compare_metadata_json.py
@@ -117,7 +117,7 @@ def diff_metadata_json(
ignore_paths: Sequence[str] = (),
ignore_order: bool = True,
) -> Union[DeepDiff, MCPDiff]:
- ignore_paths = (*ignore_paths, *default_exclude_paths, r"root\[\d+].delta_info")
+ ignore_paths = [*ignore_paths, *default_exclude_paths, r"root\[\d+].delta_info"]
try:
if ignore_order:
golden_map = get_aspects_by_urn(golden)
diff --git a/metadata-ingestion/src/datahub/utilities/file_backed_collections.py b/metadata-ingestion/src/datahub/utilities/file_backed_collections.py
index b0f5022446de15..b8c27666d7f538 100644
--- a/metadata-ingestion/src/datahub/utilities/file_backed_collections.py
+++ b/metadata-ingestion/src/datahub/utilities/file_backed_collections.py
@@ -1,6 +1,7 @@
import collections
import gzip
import logging
+import os
import pathlib
import pickle
import shutil
@@ -33,6 +34,14 @@
logger: logging.Logger = logging.getLogger(__name__)
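+# Escape hatch: set OVERRIDE_SQLITE_VERSION_REQ to anything other than "" or "false"
+# (case-insensitive) to skip the SQLite >= 3.24.0 requirement; a plain INSERT/UPDATE
+# fallback is then used instead of ON CONFLICT upserts.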
+OVERRIDE_SQLITE_VERSION_REQUIREMENT_STR = (
+ os.environ.get("OVERRIDE_SQLITE_VERSION_REQ") or ""
+)
+OVERRIDE_SQLITE_VERSION_REQUIREMENT = (
+ OVERRIDE_SQLITE_VERSION_REQUIREMENT_STR
+ and OVERRIDE_SQLITE_VERSION_REQUIREMENT_STR.lower() != "false"
+)
+
_DEFAULT_FILE_NAME = "sqlite.db"
_DEFAULT_TABLE_NAME = "data"
@@ -212,6 +221,7 @@ class FileBackedDict(MutableMapping[str, _VT], Closeable, Generic[_VT]):
_active_object_cache: OrderedDict[str, Tuple[_VT, bool]] = field(
init=False, repr=False
)
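+ # Flipped to False in __post_init__ when the available SQLite build lacks ON CONFLICT support.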
+ _use_sqlite_on_conflict: bool = field(repr=False, default=True)
def __post_init__(self) -> None:
assert (
@@ -232,7 +242,10 @@ def __post_init__(self) -> None:
# We use the ON CONFLICT clause to implement UPSERTs with sqlite.
# This was added in 3.24.0 from 2018-06-04.
# See https://www.sqlite.org/lang_conflict.html
- raise RuntimeError("SQLite version 3.24.0 or later is required")
+ if OVERRIDE_SQLITE_VERSION_REQUIREMENT:
+ self._use_sqlite_on_conflict = False
+ else:
+ raise RuntimeError("SQLite version 3.24.0 or later is required")
# We keep a small cache in memory to avoid having to serialize/deserialize
# data from the database too often. We use an OrderedDict to build
@@ -295,7 +308,7 @@ def _prune_cache(self, num_items_to_prune: int) -> None:
values.append(column_serializer(value))
items_to_write.append(tuple(values))
- if items_to_write:
+ if items_to_write and self._use_sqlite_on_conflict:
# Tricky: By using a INSERT INTO ... ON CONFLICT (key) structure, we can
# ensure that the rowid remains the same if a value is updated but is
# autoincremented when rows are inserted.
@@ -312,6 +325,26 @@ def _prune_cache(self, num_items_to_prune: int) -> None:
""",
items_to_write,
)
+ else:
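+ # Fallback for SQLite builds without ON CONFLICT: attempt a plain INSERT and, if the
+ # key already exists, issue an UPDATE for that row instead.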
+ for item in items_to_write:
+ try:
+ self._conn.execute(
+ f"""INSERT INTO {self.tablename} (
+ key,
+ value
+ {''.join(f', {column_name}' for column_name in self.extra_columns.keys())}
+ )
+ VALUES ({', '.join(['?'] *(2 + len(self.extra_columns)))})""",
+ item,
+ )
+ except sqlite3.IntegrityError:
+ self._conn.execute(
+ f"""UPDATE {self.tablename} SET
+ value = ?
+ {''.join(f', {column_name} = ?' for column_name in self.extra_columns.keys())}
+ WHERE key = ?""",
+ (*item[1:], item[0]),
+ )
def flush(self) -> None:
self._prune_cache(len(self._active_object_cache))
diff --git a/metadata-ingestion/tests/integration/git/test_git_clone.py b/metadata-ingestion/tests/integration/git/test_git_clone.py
index 60cf20fefcbdd1..01e075930998a4 100644
--- a/metadata-ingestion/tests/integration/git/test_git_clone.py
+++ b/metadata-ingestion/tests/integration/git/test_git_clone.py
@@ -1,4 +1,5 @@
import os
+import pathlib
import pytest
from pydantic import SecretStr
@@ -12,7 +13,7 @@
LOOKML_TEST_SSH_KEY = os.environ.get("DATAHUB_LOOKML_GIT_TEST_SSH_KEY")
-def test_base_url_guessing():
+def test_base_url_guessing() -> None:
# Basic GitHub repo.
config = GitInfo(repo="https://github.com/datahub-project/datahub", branch="master")
assert config.repo_ssh_locator == "git@github.com:datahub-project/datahub.git"
@@ -70,7 +71,7 @@ def test_base_url_guessing():
)
-def test_github_branch():
+def test_github_branch() -> None:
config = GitInfo(
repo="owner/repo",
)
@@ -83,11 +84,37 @@ def test_github_branch():
assert config.branch_for_clone == "main"
+def test_url_subdir() -> None:
+ git_ref = GitReference(repo="https://github.com/org/repo", url_subdir="dbt")
+ assert (
+ git_ref.get_url_for_file_path("model.sql")
+ == "https://github.com/org/repo/blob/main/dbt/model.sql"
+ )
+
+ git_ref = GitReference(repo="https://gitlab.com/org/repo", url_subdir="dbt")
+ assert (
+ git_ref.get_url_for_file_path("model.sql")
+ == "https://gitlab.com/org/repo/-/blob/main/dbt/model.sql"
+ )
+
+ git_ref = GitReference(repo="https://github.com/org/repo", url_subdir="")
+ assert (
+ git_ref.get_url_for_file_path("model.sql")
+ == "https://github.com/org/repo/blob/main/model.sql"
+ )
+
+ git_ref = GitReference(repo="https://github.com/org/repo", url_subdir="dbt/models")
+ assert (
+ git_ref.get_url_for_file_path("model.sql")
+ == "https://github.com/org/repo/blob/main/dbt/models/model.sql"
+ )
+
+
def test_sanitize_repo_url() -> None:
assert_doctest(datahub.ingestion.source.git.git_import)
-def test_git_clone_public(tmp_path):
+def test_git_clone_public(tmp_path: pathlib.Path) -> None:
git_clone = GitClone(str(tmp_path))
checkout_dir = git_clone.clone(
ssh_key=None,
@@ -107,7 +134,7 @@ def test_git_clone_public(tmp_path):
LOOKML_TEST_SSH_KEY is None,
reason="DATAHUB_LOOKML_GIT_TEST_SSH_KEY env variable is not configured",
)
-def test_git_clone_private(tmp_path):
+def test_git_clone_private(tmp_path: pathlib.Path) -> None:
git_clone = GitClone(str(tmp_path))
secret_key = SecretStr(LOOKML_TEST_SSH_KEY) if LOOKML_TEST_SSH_KEY else None
diff --git a/metadata-ingestion/tests/integration/kafka/test_kafka.py b/metadata-ingestion/tests/integration/kafka/test_kafka.py
index 0d9a714625e96b..648c4b26b20a76 100644
--- a/metadata-ingestion/tests/integration/kafka/test_kafka.py
+++ b/metadata-ingestion/tests/integration/kafka/test_kafka.py
@@ -102,7 +102,7 @@ def test_kafka_test_connection(mock_kafka_service, config_dict, is_success):
test_connection_helpers.assert_capability_report(
capability_report=report.capability_report,
failure_capabilities={
- SourceCapability.SCHEMA_METADATA: "Failed to establish a new connection"
+ SourceCapability.SCHEMA_METADATA: "[Errno 111] Connection refused"
},
)
diff --git a/metadata-ingestion/tests/integration/looker/golden_looker_mces.json b/metadata-ingestion/tests/integration/looker/golden_looker_mces.json
index a9c445b5986efe..6ae772c134cb32 100644
--- a/metadata-ingestion/tests/integration/looker/golden_looker_mces.json
+++ b/metadata-ingestion/tests/integration/looker/golden_looker_mces.json
@@ -842,6 +842,62 @@
"pipelineName": "stateful-looker-pipeline"
}
},
+{
+ "entityType": "platformResource",
+ "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562",
+ "changeType": "UPSERT",
+ "aspectName": "platformResourceInfo",
+ "aspect": {
+ "json": {
+ "resourceType": "USER_ID_MAPPING",
+ "primaryKey": "",
+ "value": {
+ "blob": "{\"1\": \"test-1@looker.com\", \"2\": \"test-2@looker.com\", \"3\": \"test-3@looker.com\"}",
+ "contentType": "JSON"
+ }
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1586847600000,
+ "runId": "looker-test",
+ "lastRunId": "no-run-id-provided",
+ "pipelineName": "stateful-looker-pipeline"
+ }
+},
+{
+ "entityType": "platformResource",
+ "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562",
+ "changeType": "UPSERT",
+ "aspectName": "dataPlatformInstance",
+ "aspect": {
+ "json": {
+ "platform": "urn:li:dataPlatform:looker"
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1586847600000,
+ "runId": "looker-test",
+ "lastRunId": "no-run-id-provided",
+ "pipelineName": "stateful-looker-pipeline"
+ }
+},
+{
+ "entityType": "platformResource",
+ "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562",
+ "changeType": "UPSERT",
+ "aspectName": "status",
+ "aspect": {
+ "json": {
+ "removed": false
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1586847600000,
+ "runId": "looker-test",
+ "lastRunId": "no-run-id-provided",
+ "pipelineName": "stateful-looker-pipeline"
+ }
+},
{
"entityType": "chart",
"entityUrn": "urn:li:chart:(looker,dashboard_elements.10)",
diff --git a/metadata-ingestion/tests/integration/looker/golden_test_allow_ingest.json b/metadata-ingestion/tests/integration/looker/golden_test_allow_ingest.json
index af9c62a2a41803..d7620980a9cedb 100644
--- a/metadata-ingestion/tests/integration/looker/golden_test_allow_ingest.json
+++ b/metadata-ingestion/tests/integration/looker/golden_test_allow_ingest.json
@@ -497,6 +497,59 @@
"lastRunId": "no-run-id-provided"
}
},
+{
+ "entityType": "platformResource",
+ "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562",
+ "changeType": "UPSERT",
+ "aspectName": "platformResourceInfo",
+ "aspect": {
+ "json": {
+ "resourceType": "USER_ID_MAPPING",
+ "primaryKey": "",
+ "value": {
+ "blob": "{}",
+ "contentType": "JSON"
+ }
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1586847600000,
+ "runId": "looker-test",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "platformResource",
+ "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562",
+ "changeType": "UPSERT",
+ "aspectName": "dataPlatformInstance",
+ "aspect": {
+ "json": {
+ "platform": "urn:li:dataPlatform:looker"
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1586847600000,
+ "runId": "looker-test",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "platformResource",
+ "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562",
+ "changeType": "UPSERT",
+ "aspectName": "status",
+ "aspect": {
+ "json": {
+ "removed": false
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1586847600000,
+ "runId": "looker-test",
+ "lastRunId": "no-run-id-provided"
+ }
+},
{
"entityType": "chart",
"entityUrn": "urn:li:chart:(looker,dashboard_elements.2)",
diff --git a/metadata-ingestion/tests/integration/looker/golden_test_external_project_view_mces.json b/metadata-ingestion/tests/integration/looker/golden_test_external_project_view_mces.json
index b89bc356b48fdc..13963af55bfe56 100644
--- a/metadata-ingestion/tests/integration/looker/golden_test_external_project_view_mces.json
+++ b/metadata-ingestion/tests/integration/looker/golden_test_external_project_view_mces.json
@@ -735,6 +735,59 @@
"lastRunId": "no-run-id-provided"
}
},
+{
+ "entityType": "platformResource",
+ "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562",
+ "changeType": "UPSERT",
+ "aspectName": "platformResourceInfo",
+ "aspect": {
+ "json": {
+ "resourceType": "USER_ID_MAPPING",
+ "primaryKey": "",
+ "value": {
+ "blob": "{}",
+ "contentType": "JSON"
+ }
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1586847600000,
+ "runId": "looker-test",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "platformResource",
+ "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562",
+ "changeType": "UPSERT",
+ "aspectName": "dataPlatformInstance",
+ "aspect": {
+ "json": {
+ "platform": "urn:li:dataPlatform:looker"
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1586847600000,
+ "runId": "looker-test",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "platformResource",
+ "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562",
+ "changeType": "UPSERT",
+ "aspectName": "status",
+ "aspect": {
+ "json": {
+ "removed": false
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1586847600000,
+ "runId": "looker-test",
+ "lastRunId": "no-run-id-provided"
+ }
+},
{
"entityType": "tag",
"entityUrn": "urn:li:tag:Dimension",
diff --git a/metadata-ingestion/tests/integration/looker/golden_test_file_path_ingest.json b/metadata-ingestion/tests/integration/looker/golden_test_file_path_ingest.json
index 810fefd8f6cb85..f11d060102851c 100644
--- a/metadata-ingestion/tests/integration/looker/golden_test_file_path_ingest.json
+++ b/metadata-ingestion/tests/integration/looker/golden_test_file_path_ingest.json
@@ -735,6 +735,59 @@
"lastRunId": "no-run-id-provided"
}
},
+{
+ "entityType": "platformResource",
+ "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562",
+ "changeType": "UPSERT",
+ "aspectName": "platformResourceInfo",
+ "aspect": {
+ "json": {
+ "resourceType": "USER_ID_MAPPING",
+ "primaryKey": "",
+ "value": {
+ "blob": "{\"1\": \"test-1@looker.com\", \"2\": \"test-2@looker.com\", \"3\": \"test-3@looker.com\"}",
+ "contentType": "JSON"
+ }
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1586847600000,
+ "runId": "looker-test",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "platformResource",
+ "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562",
+ "changeType": "UPSERT",
+ "aspectName": "dataPlatformInstance",
+ "aspect": {
+ "json": {
+ "platform": "urn:li:dataPlatform:looker"
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1586847600000,
+ "runId": "looker-test",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "platformResource",
+ "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562",
+ "changeType": "UPSERT",
+ "aspectName": "status",
+ "aspect": {
+ "json": {
+ "removed": false
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1586847600000,
+ "runId": "looker-test",
+ "lastRunId": "no-run-id-provided"
+ }
+},
{
"entityType": "tag",
"entityUrn": "urn:li:tag:Dimension",
diff --git a/metadata-ingestion/tests/integration/looker/golden_test_folder_path_pattern_ingest.json b/metadata-ingestion/tests/integration/looker/golden_test_folder_path_pattern_ingest.json
index 3d78397f54a235..f6e39dd5286cd0 100644
--- a/metadata-ingestion/tests/integration/looker/golden_test_folder_path_pattern_ingest.json
+++ b/metadata-ingestion/tests/integration/looker/golden_test_folder_path_pattern_ingest.json
@@ -828,6 +828,59 @@
"lastRunId": "no-run-id-provided"
}
},
+{
+ "entityType": "platformResource",
+ "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562",
+ "changeType": "UPSERT",
+ "aspectName": "platformResourceInfo",
+ "aspect": {
+ "json": {
+ "resourceType": "USER_ID_MAPPING",
+ "primaryKey": "",
+ "value": {
+ "blob": "{\"1\": \"test-1@looker.com\", \"2\": \"test-2@looker.com\", \"3\": \"test-3@looker.com\"}",
+ "contentType": "JSON"
+ }
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1586847600000,
+ "runId": "looker-test",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "platformResource",
+ "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562",
+ "changeType": "UPSERT",
+ "aspectName": "dataPlatformInstance",
+ "aspect": {
+ "json": {
+ "platform": "urn:li:dataPlatform:looker"
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1586847600000,
+ "runId": "looker-test",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "platformResource",
+ "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562",
+ "changeType": "UPSERT",
+ "aspectName": "status",
+ "aspect": {
+ "json": {
+ "removed": false
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1586847600000,
+ "runId": "looker-test",
+ "lastRunId": "no-run-id-provided"
+ }
+},
{
"entityType": "chart",
"entityUrn": "urn:li:chart:(looker,dashboard_elements.2)",
diff --git a/metadata-ingestion/tests/integration/looker/golden_test_independent_look_ingest.json b/metadata-ingestion/tests/integration/looker/golden_test_independent_look_ingest.json
index 5a540e61e768d7..203bed843155c8 100644
--- a/metadata-ingestion/tests/integration/looker/golden_test_independent_look_ingest.json
+++ b/metadata-ingestion/tests/integration/looker/golden_test_independent_look_ingest.json
@@ -464,6 +464,21 @@
"/Folders/Shared"
]
}
+ },
+ {
+ "com.linkedin.pegasus2avro.common.Ownership": {
+ "owners": [
+ {
+ "owner": "urn:li:corpuser:test-1@looker.com",
+ "type": "DATAOWNER"
+ }
+ ],
+ "ownerTypes": {},
+ "lastModified": {
+ "time": 0,
+ "actor": "urn:li:corpuser:unknown"
+ }
+ }
}
]
}
@@ -708,6 +723,21 @@
"/Folders/Personal"
]
}
+ },
+ {
+ "com.linkedin.pegasus2avro.common.Ownership": {
+ "owners": [
+ {
+ "owner": "urn:li:corpuser:test-2@looker.com",
+ "type": "DATAOWNER"
+ }
+ ],
+ "ownerTypes": {},
+ "lastModified": {
+ "time": 0,
+ "actor": "urn:li:corpuser:unknown"
+ }
+ }
}
]
}
@@ -1108,12 +1138,12 @@
{
"proposedSnapshot": {
"com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": {
- "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)",
+ "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)",
"aspects": [
{
"com.linkedin.pegasus2avro.common.BrowsePaths": {
"paths": [
- "/Explore/sales_model"
+ "/Explore/data"
]
}
},
@@ -1126,12 +1156,12 @@
"com.linkedin.pegasus2avro.dataset.DatasetProperties": {
"customProperties": {
"project": "lkml_samples",
- "model": "sales_model",
+ "model": "data",
"looker.explore.label": "My Explore View",
- "looker.explore.name": "sales_explore",
+ "looker.explore.name": "my_view",
"looker.explore.file": "test_source_file.lkml"
},
- "externalUrl": "https://looker.company.com/explore/sales_model/sales_explore",
+ "externalUrl": "https://looker.company.com/explore/data/my_view",
"name": "My Explore View",
"description": "lorem ipsum",
"tags": []
@@ -1153,7 +1183,7 @@
},
{
"com.linkedin.pegasus2avro.schema.SchemaMetadata": {
- "schemaName": "sales_explore",
+ "schemaName": "my_view",
"platform": "urn:li:dataPlatform:looker",
"version": 0,
"created": {
@@ -1208,7 +1238,7 @@
},
{
"entityType": "dataset",
- "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)",
"changeType": "UPSERT",
"aspectName": "subTypes",
"aspect": {
@@ -1227,12 +1257,12 @@
},
{
"entityType": "dataset",
- "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)",
"changeType": "UPSERT",
"aspectName": "embed",
"aspect": {
"json": {
- "renderUrl": "https://looker.company.com/embed/explore/sales_model/sales_explore"
+ "renderUrl": "https://looker.company.com/embed/explore/data/my_view"
}
},
"systemMetadata": {
@@ -1244,12 +1274,12 @@
},
{
"entityType": "dataset",
- "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)",
"changeType": "UPSERT",
"aspectName": "container",
"aspect": {
"json": {
- "container": "urn:li:container:d38ab60586a6e39b4cf63f14946969c5"
+ "container": "urn:li:container:59a5aa45397364e6882e793f1bc77b42"
}
},
"systemMetadata": {
@@ -1261,7 +1291,7 @@
},
{
"entityType": "dataset",
- "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)",
"changeType": "UPSERT",
"aspectName": "browsePathsV2",
"aspect": {
@@ -1271,8 +1301,8 @@
"id": "Explore"
},
{
- "id": "urn:li:container:d38ab60586a6e39b4cf63f14946969c5",
- "urn": "urn:li:container:d38ab60586a6e39b4cf63f14946969c5"
+ "id": "urn:li:container:59a5aa45397364e6882e793f1bc77b42",
+ "urn": "urn:li:container:59a5aa45397364e6882e793f1bc77b42"
}
]
}
@@ -1287,12 +1317,12 @@
{
"proposedSnapshot": {
"com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": {
- "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)",
+ "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)",
"aspects": [
{
"com.linkedin.pegasus2avro.common.BrowsePaths": {
"paths": [
- "/Explore/data"
+ "/Explore/order_model"
]
}
},
@@ -1305,12 +1335,12 @@
"com.linkedin.pegasus2avro.dataset.DatasetProperties": {
"customProperties": {
"project": "lkml_samples",
- "model": "data",
+ "model": "order_model",
"looker.explore.label": "My Explore View",
- "looker.explore.name": "my_view",
+ "looker.explore.name": "order_explore",
"looker.explore.file": "test_source_file.lkml"
},
- "externalUrl": "https://looker.company.com/explore/data/my_view",
+ "externalUrl": "https://looker.company.com/explore/order_model/order_explore",
"name": "My Explore View",
"description": "lorem ipsum",
"tags": []
@@ -1332,7 +1362,7 @@
},
{
"com.linkedin.pegasus2avro.schema.SchemaMetadata": {
- "schemaName": "my_view",
+ "schemaName": "order_explore",
"platform": "urn:li:dataPlatform:looker",
"version": 0,
"created": {
@@ -1387,7 +1417,7 @@
},
{
"entityType": "dataset",
- "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)",
"changeType": "UPSERT",
"aspectName": "subTypes",
"aspect": {
@@ -1406,12 +1436,12 @@
},
{
"entityType": "dataset",
- "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)",
"changeType": "UPSERT",
"aspectName": "embed",
"aspect": {
"json": {
- "renderUrl": "https://looker.company.com/embed/explore/data/my_view"
+ "renderUrl": "https://looker.company.com/embed/explore/order_model/order_explore"
}
},
"systemMetadata": {
@@ -1423,12 +1453,12 @@
},
{
"entityType": "dataset",
- "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)",
"changeType": "UPSERT",
"aspectName": "container",
"aspect": {
"json": {
- "container": "urn:li:container:59a5aa45397364e6882e793f1bc77b42"
+ "container": "urn:li:container:df4ee66abd19b668c88bfe4408f87e60"
}
},
"systemMetadata": {
@@ -1440,7 +1470,7 @@
},
{
"entityType": "dataset",
- "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)",
"changeType": "UPSERT",
"aspectName": "browsePathsV2",
"aspect": {
@@ -1450,8 +1480,8 @@
"id": "Explore"
},
{
- "id": "urn:li:container:59a5aa45397364e6882e793f1bc77b42",
- "urn": "urn:li:container:59a5aa45397364e6882e793f1bc77b42"
+ "id": "urn:li:container:df4ee66abd19b668c88bfe4408f87e60",
+ "urn": "urn:li:container:df4ee66abd19b668c88bfe4408f87e60"
}
]
}
@@ -1466,12 +1496,12 @@
{
"proposedSnapshot": {
"com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": {
- "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)",
+ "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)",
"aspects": [
{
"com.linkedin.pegasus2avro.common.BrowsePaths": {
"paths": [
- "/Explore/order_model"
+ "/Explore/sales_model"
]
}
},
@@ -1484,12 +1514,12 @@
"com.linkedin.pegasus2avro.dataset.DatasetProperties": {
"customProperties": {
"project": "lkml_samples",
- "model": "order_model",
+ "model": "sales_model",
"looker.explore.label": "My Explore View",
- "looker.explore.name": "order_explore",
+ "looker.explore.name": "sales_explore",
"looker.explore.file": "test_source_file.lkml"
},
- "externalUrl": "https://looker.company.com/explore/order_model/order_explore",
+ "externalUrl": "https://looker.company.com/explore/sales_model/sales_explore",
"name": "My Explore View",
"description": "lorem ipsum",
"tags": []
@@ -1511,7 +1541,7 @@
},
{
"com.linkedin.pegasus2avro.schema.SchemaMetadata": {
- "schemaName": "order_explore",
+ "schemaName": "sales_explore",
"platform": "urn:li:dataPlatform:looker",
"version": 0,
"created": {
@@ -1566,7 +1596,7 @@
},
{
"entityType": "dataset",
- "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)",
"changeType": "UPSERT",
"aspectName": "subTypes",
"aspect": {
@@ -1585,12 +1615,12 @@
},
{
"entityType": "dataset",
- "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)",
"changeType": "UPSERT",
"aspectName": "embed",
"aspect": {
"json": {
- "renderUrl": "https://looker.company.com/embed/explore/order_model/order_explore"
+ "renderUrl": "https://looker.company.com/embed/explore/sales_model/sales_explore"
}
},
"systemMetadata": {
@@ -1602,12 +1632,12 @@
},
{
"entityType": "dataset",
- "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)",
"changeType": "UPSERT",
"aspectName": "container",
"aspect": {
"json": {
- "container": "urn:li:container:df4ee66abd19b668c88bfe4408f87e60"
+ "container": "urn:li:container:d38ab60586a6e39b4cf63f14946969c5"
}
},
"systemMetadata": {
@@ -1619,7 +1649,7 @@
},
{
"entityType": "dataset",
- "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)",
"changeType": "UPSERT",
"aspectName": "browsePathsV2",
"aspect": {
@@ -1629,8 +1659,8 @@
"id": "Explore"
},
{
- "id": "urn:li:container:df4ee66abd19b668c88bfe4408f87e60",
- "urn": "urn:li:container:df4ee66abd19b668c88bfe4408f87e60"
+ "id": "urn:li:container:d38ab60586a6e39b4cf63f14946969c5",
+ "urn": "urn:li:container:d38ab60586a6e39b4cf63f14946969c5"
}
]
}
@@ -1705,6 +1735,62 @@
"pipelineName": "execution-1"
}
},
+{
+ "entityType": "platformResource",
+ "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562",
+ "changeType": "UPSERT",
+ "aspectName": "platformResourceInfo",
+ "aspect": {
+ "json": {
+ "resourceType": "USER_ID_MAPPING",
+ "primaryKey": "",
+ "value": {
+ "blob": "{\"1\": \"test-1@looker.com\", \"2\": \"test-2@looker.com\", \"3\": \"test-3@looker.com\"}",
+ "contentType": "JSON"
+ }
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1586847600000,
+ "runId": "looker-test",
+ "lastRunId": "no-run-id-provided",
+ "pipelineName": "execution-1"
+ }
+},
+{
+ "entityType": "platformResource",
+ "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562",
+ "changeType": "UPSERT",
+ "aspectName": "dataPlatformInstance",
+ "aspect": {
+ "json": {
+ "platform": "urn:li:dataPlatform:looker"
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1586847600000,
+ "runId": "looker-test",
+ "lastRunId": "no-run-id-provided",
+ "pipelineName": "execution-1"
+ }
+},
+{
+ "entityType": "platformResource",
+ "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562",
+ "changeType": "UPSERT",
+ "aspectName": "status",
+ "aspect": {
+ "json": {
+ "removed": false
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1586847600000,
+ "runId": "looker-test",
+ "lastRunId": "no-run-id-provided",
+ "pipelineName": "execution-1"
+ }
+},
{
"entityType": "tag",
"entityUrn": "urn:li:tag:Dimension",
diff --git a/metadata-ingestion/tests/integration/looker/golden_test_ingest.json b/metadata-ingestion/tests/integration/looker/golden_test_ingest.json
index 9ac95b8482a475..87af50f95ed6bb 100644
--- a/metadata-ingestion/tests/integration/looker/golden_test_ingest.json
+++ b/metadata-ingestion/tests/integration/looker/golden_test_ingest.json
@@ -793,6 +793,60 @@
"lastRunId": "no-run-id-provided"
}
},
+{
+ "entityType": "platformResource",
+ "entityUrn": "urn:li:platformResource:8436a2a37c4a7e81fb08c9c8415d2e4b",
+ "changeType": "UPSERT",
+ "aspectName": "platformResourceInfo",
+ "aspect": {
+ "json": {
+ "resourceType": "USER_ID_MAPPING",
+ "primaryKey": "",
+ "value": {
+ "blob": "{}",
+ "contentType": "JSON"
+ }
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1586847600000,
+ "runId": "looker-test",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "platformResource",
+ "entityUrn": "urn:li:platformResource:8436a2a37c4a7e81fb08c9c8415d2e4b",
+ "changeType": "UPSERT",
+ "aspectName": "dataPlatformInstance",
+ "aspect": {
+ "json": {
+ "platform": "urn:li:dataPlatform:looker",
+ "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:looker,ap-south-1)"
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1586847600000,
+ "runId": "looker-test",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "platformResource",
+ "entityUrn": "urn:li:platformResource:8436a2a37c4a7e81fb08c9c8415d2e4b",
+ "changeType": "UPSERT",
+ "aspectName": "status",
+ "aspect": {
+ "json": {
+ "removed": false
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1586847600000,
+ "runId": "looker-test",
+ "lastRunId": "no-run-id-provided"
+ }
+},
{
"entityType": "tag",
"entityUrn": "urn:li:tag:Dimension",
diff --git a/metadata-ingestion/tests/integration/looker/golden_test_ingest_joins.json b/metadata-ingestion/tests/integration/looker/golden_test_ingest_joins.json
index 3a2c6359ea63c2..b990ce7c67dab6 100644
--- a/metadata-ingestion/tests/integration/looker/golden_test_ingest_joins.json
+++ b/metadata-ingestion/tests/integration/looker/golden_test_ingest_joins.json
@@ -759,6 +759,59 @@
"lastRunId": "no-run-id-provided"
}
},
+{
+ "entityType": "platformResource",
+ "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562",
+ "changeType": "UPSERT",
+ "aspectName": "platformResourceInfo",
+ "aspect": {
+ "json": {
+ "resourceType": "USER_ID_MAPPING",
+ "primaryKey": "",
+ "value": {
+ "blob": "{}",
+ "contentType": "JSON"
+ }
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1586847600000,
+ "runId": "looker-test",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "platformResource",
+ "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562",
+ "changeType": "UPSERT",
+ "aspectName": "dataPlatformInstance",
+ "aspect": {
+ "json": {
+ "platform": "urn:li:dataPlatform:looker"
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1586847600000,
+ "runId": "looker-test",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "platformResource",
+ "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562",
+ "changeType": "UPSERT",
+ "aspectName": "status",
+ "aspect": {
+ "json": {
+ "removed": false
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1586847600000,
+ "runId": "looker-test",
+ "lastRunId": "no-run-id-provided"
+ }
+},
{
"entityType": "tag",
"entityUrn": "urn:li:tag:Dimension",
diff --git a/metadata-ingestion/tests/integration/looker/golden_test_ingest_unaliased_joins.json b/metadata-ingestion/tests/integration/looker/golden_test_ingest_unaliased_joins.json
index 007eee348aeaf8..391192b3d16f36 100644
--- a/metadata-ingestion/tests/integration/looker/golden_test_ingest_unaliased_joins.json
+++ b/metadata-ingestion/tests/integration/looker/golden_test_ingest_unaliased_joins.json
@@ -513,6 +513,59 @@
"lastRunId": "no-run-id-provided"
}
},
+{
+ "entityType": "platformResource",
+ "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562",
+ "changeType": "UPSERT",
+ "aspectName": "platformResourceInfo",
+ "aspect": {
+ "json": {
+ "resourceType": "USER_ID_MAPPING",
+ "primaryKey": "",
+ "value": {
+ "blob": "{}",
+ "contentType": "JSON"
+ }
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1586847600000,
+ "runId": "looker-test",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "platformResource",
+ "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562",
+ "changeType": "UPSERT",
+ "aspectName": "dataPlatformInstance",
+ "aspect": {
+ "json": {
+ "platform": "urn:li:dataPlatform:looker"
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1586847600000,
+ "runId": "looker-test",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "platformResource",
+ "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562",
+ "changeType": "UPSERT",
+ "aspectName": "status",
+ "aspect": {
+ "json": {
+ "removed": false
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1586847600000,
+ "runId": "looker-test",
+ "lastRunId": "no-run-id-provided"
+ }
+},
{
"entityType": "chart",
"entityUrn": "urn:li:chart:(looker,dashboard_elements.2)",
diff --git a/metadata-ingestion/tests/integration/looker/golden_test_non_personal_independent_look.json b/metadata-ingestion/tests/integration/looker/golden_test_non_personal_independent_look.json
index 859b9163d7aad6..4909a6af73a225 100644
--- a/metadata-ingestion/tests/integration/looker/golden_test_non_personal_independent_look.json
+++ b/metadata-ingestion/tests/integration/looker/golden_test_non_personal_independent_look.json
@@ -464,6 +464,21 @@
"/Folders/Shared"
]
}
+ },
+ {
+ "com.linkedin.pegasus2avro.common.Ownership": {
+ "owners": [
+ {
+ "owner": "urn:li:corpuser:test-1@looker.com",
+ "type": "DATAOWNER"
+ }
+ ],
+ "ownerTypes": {},
+ "lastModified": {
+ "time": 0,
+ "actor": "urn:li:corpuser:unknown"
+ }
+ }
}
]
}
@@ -1185,6 +1200,62 @@
"pipelineName": "execution-1"
}
},
+{
+ "entityType": "platformResource",
+ "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562",
+ "changeType": "UPSERT",
+ "aspectName": "platformResourceInfo",
+ "aspect": {
+ "json": {
+ "resourceType": "USER_ID_MAPPING",
+ "primaryKey": "",
+ "value": {
+ "blob": "{\"1\": \"test-1@looker.com\", \"2\": \"test-2@looker.com\", \"3\": \"test-3@looker.com\"}",
+ "contentType": "JSON"
+ }
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1586847600000,
+ "runId": "looker-test",
+ "lastRunId": "no-run-id-provided",
+ "pipelineName": "execution-1"
+ }
+},
+{
+ "entityType": "platformResource",
+ "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562",
+ "changeType": "UPSERT",
+ "aspectName": "dataPlatformInstance",
+ "aspect": {
+ "json": {
+ "platform": "urn:li:dataPlatform:looker"
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1586847600000,
+ "runId": "looker-test",
+ "lastRunId": "no-run-id-provided",
+ "pipelineName": "execution-1"
+ }
+},
+{
+ "entityType": "platformResource",
+ "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562",
+ "changeType": "UPSERT",
+ "aspectName": "status",
+ "aspect": {
+ "json": {
+ "removed": false
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1586847600000,
+ "runId": "looker-test",
+ "lastRunId": "no-run-id-provided",
+ "pipelineName": "execution-1"
+ }
+},
{
"entityType": "tag",
"entityUrn": "urn:li:tag:Dimension",
diff --git a/metadata-ingestion/tests/integration/looker/looker_mces_golden_deleted_stateful.json b/metadata-ingestion/tests/integration/looker/looker_mces_golden_deleted_stateful.json
index 8256c984afb274..ddeb5428b1d726 100644
--- a/metadata-ingestion/tests/integration/looker/looker_mces_golden_deleted_stateful.json
+++ b/metadata-ingestion/tests/integration/looker/looker_mces_golden_deleted_stateful.json
@@ -762,6 +762,62 @@
"pipelineName": "stateful-looker-pipeline"
}
},
+{
+ "entityType": "platformResource",
+ "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562",
+ "changeType": "UPSERT",
+ "aspectName": "platformResourceInfo",
+ "aspect": {
+ "json": {
+ "resourceType": "USER_ID_MAPPING",
+ "primaryKey": "",
+ "value": {
+ "blob": "{\"1\": \"test-1@looker.com\", \"2\": \"test-2@looker.com\", \"3\": \"test-3@looker.com\"}",
+ "contentType": "JSON"
+ }
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1586847600000,
+ "runId": "looker-test",
+ "lastRunId": "no-run-id-provided",
+ "pipelineName": "stateful-looker-pipeline"
+ }
+},
+{
+ "entityType": "platformResource",
+ "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562",
+ "changeType": "UPSERT",
+ "aspectName": "dataPlatformInstance",
+ "aspect": {
+ "json": {
+ "platform": "urn:li:dataPlatform:looker"
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1586847600000,
+ "runId": "looker-test",
+ "lastRunId": "no-run-id-provided",
+ "pipelineName": "stateful-looker-pipeline"
+ }
+},
+{
+ "entityType": "platformResource",
+ "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562",
+ "changeType": "UPSERT",
+ "aspectName": "status",
+ "aspect": {
+ "json": {
+ "removed": false
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1586847600000,
+ "runId": "looker-test",
+ "lastRunId": "no-run-id-provided",
+ "pipelineName": "stateful-looker-pipeline"
+ }
+},
{
"entityType": "tag",
"entityUrn": "urn:li:tag:Dimension",
@@ -814,8 +870,8 @@
}
},
{
- "entityType": "dashboard",
- "entityUrn": "urn:li:dashboard:(looker,dashboards.11)",
+ "entityType": "dataset",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,bogus data.explore.my_view,PROD)",
"changeType": "UPSERT",
"aspectName": "status",
"aspect": {
@@ -831,8 +887,8 @@
}
},
{
- "entityType": "chart",
- "entityUrn": "urn:li:chart:(looker,dashboard_elements.10)",
+ "entityType": "dashboard",
+ "entityUrn": "urn:li:dashboard:(looker,dashboards.11)",
"changeType": "UPSERT",
"aspectName": "status",
"aspect": {
@@ -865,8 +921,8 @@
}
},
{
- "entityType": "dataset",
- "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,bogus data.explore.my_view,PROD)",
+ "entityType": "chart",
+ "entityUrn": "urn:li:chart:(looker,dashboard_elements.10)",
"changeType": "UPSERT",
"aspectName": "status",
"aspect": {
diff --git a/metadata-ingestion/tests/integration/looker/looker_mces_usage_history.json b/metadata-ingestion/tests/integration/looker/looker_mces_usage_history.json
index 0b3530f9c24629..594983c8fb0f2a 100644
--- a/metadata-ingestion/tests/integration/looker/looker_mces_usage_history.json
+++ b/metadata-ingestion/tests/integration/looker/looker_mces_usage_history.json
@@ -678,6 +678,59 @@
"lastRunId": "no-run-id-provided"
}
},
+{
+ "entityType": "platformResource",
+ "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562",
+ "changeType": "UPSERT",
+ "aspectName": "platformResourceInfo",
+ "aspect": {
+ "json": {
+ "resourceType": "USER_ID_MAPPING",
+ "primaryKey": "",
+ "value": {
+ "blob": "{\"1\": \"test-1@looker.com\", \"2\": \"test-2@looker.com\", \"3\": \"test-3@looker.com\"}",
+ "contentType": "JSON"
+ }
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1586847600000,
+ "runId": "looker-test",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "platformResource",
+ "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562",
+ "changeType": "UPSERT",
+ "aspectName": "dataPlatformInstance",
+ "aspect": {
+ "json": {
+ "platform": "urn:li:dataPlatform:looker"
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1586847600000,
+ "runId": "looker-test",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "platformResource",
+ "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562",
+ "changeType": "UPSERT",
+ "aspectName": "status",
+ "aspect": {
+ "json": {
+ "removed": false
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1586847600000,
+ "runId": "looker-test",
+ "lastRunId": "no-run-id-provided"
+ }
+},
{
"entityType": "chart",
"entityUrn": "urn:li:chart:(looker,dashboard_elements.2)",
diff --git a/metadata-ingestion/tests/integration/looker/test_looker.py b/metadata-ingestion/tests/integration/looker/test_looker.py
index 8bbf14709ff9fb..a39de8384efb23 100644
--- a/metadata-ingestion/tests/integration/looker/test_looker.py
+++ b/metadata-ingestion/tests/integration/looker/test_looker.py
@@ -83,6 +83,7 @@ def test_looker_ingest(pytestconfig, tmp_path, mock_time):
with mock.patch("looker_sdk.init40") as mock_sdk:
mock_sdk.return_value = mocked_client
setup_mock_dashboard(mocked_client)
+ mocked_client.run_inline_query.side_effect = side_effect_query_inline
setup_mock_explore(mocked_client)
test_resources_dir = pytestconfig.rootpath / "tests/integration/looker"
@@ -319,6 +320,7 @@ def setup_mock_look(mocked_client):
mocked_client.all_looks.return_value = [
Look(
id="1",
+ user_id="1",
title="Outer Look",
description="I am not part of any Dashboard",
query_id="1",
@@ -327,6 +329,7 @@ def setup_mock_look(mocked_client):
Look(
id="2",
title="Personal Look",
+ user_id="2",
description="I am not part of any Dashboard and in personal folder",
query_id="2",
folder=FolderBase(
@@ -561,6 +564,20 @@ def get_user(
mocked_client.user.side_effect = get_user
+def setup_mock_all_user(mocked_client):
+ def all_users(
+ fields: Optional[str] = None,
+ transport_options: Optional[transport.TransportOptions] = None,
+ ) -> List[User]:
+ return [
+ User(id="1", email="test-1@looker.com"),
+ User(id="2", email="test-2@looker.com"),
+ User(id="3", email="test-3@looker.com"),
+ ]
+
+ mocked_client.all_users.side_effect = all_users
+
+
def side_effect_query_inline(
result_format: str, body: WriteQuery, transport_options: Optional[TransportOptions]
) -> str:
@@ -714,6 +731,7 @@ def test_looker_ingest_usage_history(pytestconfig, tmp_path, mock_time):
mocked_client.run_inline_query.side_effect = side_effect_query_inline
setup_mock_explore(mocked_client)
setup_mock_user(mocked_client)
+ setup_mock_all_user(mocked_client)
test_resources_dir = pytestconfig.rootpath / "tests/integration/looker"
@@ -946,6 +964,8 @@ def ingest_independent_looks(
mock_sdk.return_value = mocked_client
setup_mock_dashboard(mocked_client)
setup_mock_explore(mocked_client)
+ setup_mock_user(mocked_client)
+ setup_mock_all_user(mocked_client)
setup_mock_look(mocked_client)
test_resources_dir = pytestconfig.rootpath / "tests/integration/looker"
diff --git a/metadata-ingestion/tests/integration/snowflake/common.py b/metadata-ingestion/tests/integration/snowflake/common.py
index 9e4bb2f0eb634f..862d27186703a8 100644
--- a/metadata-ingestion/tests/integration/snowflake/common.py
+++ b/metadata-ingestion/tests/integration/snowflake/common.py
@@ -14,6 +14,11 @@
NUM_OPS = 10
NUM_USAGE = 0
+
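+# View 1 acts as a secure view in these fixtures: its definition text is omitted from the
+# regular view listing and is returned via get_secure_view_definitions() instead.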
+def is_secure(view_idx):
+ return view_idx == 1
+
+
FROZEN_TIME = "2022-06-07 17:00:00"
large_sql_query = """WITH object_access_history AS
(
@@ -247,9 +252,25 @@ def default_query_results( # noqa: C901
"name": f"VIEW_{view_idx}",
"created_on": datetime(2021, 6, 8, 0, 0, 0, 0),
"comment": "Comment for View",
- "text": f"create view view_{view_idx} as select * from table_{view_idx}",
+ "is_secure": "true" if is_secure(view_idx) else "false",
+ "text": (
+ f"create view view_{view_idx} as select * from table_{view_idx}"
+ if not is_secure(view_idx)
+ else None
+ ),
+ }
+ for view_idx in range(1, num_views + 1)
+ ]
+ elif query == SnowflakeQuery.get_secure_view_definitions():
+ return [
+ {
+ "TABLE_CATALOG": "TEST_DB",
+ "TABLE_SCHEMA": "TEST_SCHEMA",
+ "TABLE_NAME": f"VIEW_{view_idx}",
+ "VIEW_DEFINITION": f"create view view_{view_idx} as select * from table_{view_idx}",
}
for view_idx in range(1, num_views + 1)
+ if is_secure(view_idx)
]
elif query == SnowflakeQuery.columns_for_schema("TEST_SCHEMA", "TEST_DB"):
return [
diff --git a/metadata-ingestion/tests/integration/snowflake/snowflake_golden.json b/metadata-ingestion/tests/integration/snowflake/snowflake_golden.json
index 4415b1ad3e5159..48ec46af069cef 100644
--- a/metadata-ingestion/tests/integration/snowflake/snowflake_golden.json
+++ b/metadata-ingestion/tests/integration/snowflake/snowflake_golden.json
@@ -490,7 +490,9 @@
"aspectName": "datasetProperties",
"aspect": {
"json": {
- "customProperties": {"CLUSTERING_KEY": "LINEAR(COL_1)"},
+ "customProperties": {
+ "CLUSTERING_KEY": "LINEAR(COL_1)"
+ },
"externalUrl": "https://app.snowflake.com/ap-south-1.aws/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/table/TABLE_1/",
"name": "TABLE_1",
"qualifiedName": "TEST_DB.TEST_SCHEMA.TABLE_1",
@@ -789,7 +791,9 @@
"aspectName": "datasetProperties",
"aspect": {
"json": {
- "customProperties": {"CLUSTERING_KEY": "LINEAR(COL_1)"},
+ "customProperties": {
+ "CLUSTERING_KEY": "LINEAR(COL_1)"
+ },
"externalUrl": "https://app.snowflake.com/ap-south-1.aws/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/table/TABLE_2/",
"name": "TABLE_2",
"qualifiedName": "TEST_DB.TEST_SCHEMA.TABLE_2",
@@ -1088,7 +1092,9 @@
"aspectName": "datasetProperties",
"aspect": {
"json": {
- "customProperties": {"CLUSTERING_KEY": "LINEAR(COL_1)"},
+ "customProperties": {
+ "CLUSTERING_KEY": "LINEAR(COL_1)"
+ },
"externalUrl": "https://app.snowflake.com/ap-south-1.aws/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/table/TABLE_3/",
"name": "TABLE_3",
"qualifiedName": "TEST_DB.TEST_SCHEMA.TABLE_3",
@@ -1387,7 +1393,9 @@
"aspectName": "datasetProperties",
"aspect": {
"json": {
- "customProperties": {"CLUSTERING_KEY": "LINEAR(COL_1)"},
+ "customProperties": {
+ "CLUSTERING_KEY": "LINEAR(COL_1)"
+ },
"externalUrl": "https://app.snowflake.com/ap-south-1.aws/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/table/TABLE_4/",
"name": "TABLE_4",
"qualifiedName": "TEST_DB.TEST_SCHEMA.TABLE_4",
@@ -1686,7 +1694,9 @@
"aspectName": "datasetProperties",
"aspect": {
"json": {
- "customProperties": {"CLUSTERING_KEY": "LINEAR(COL_1)"},
+ "customProperties": {
+ "CLUSTERING_KEY": "LINEAR(COL_1)"
+ },
"externalUrl": "https://app.snowflake.com/ap-south-1.aws/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/table/TABLE_5/",
"name": "TABLE_5",
"qualifiedName": "TEST_DB.TEST_SCHEMA.TABLE_5",
@@ -1985,7 +1995,9 @@
"aspectName": "datasetProperties",
"aspect": {
"json": {
- "customProperties": {"CLUSTERING_KEY": "LINEAR(COL_1)"},
+ "customProperties": {
+ "CLUSTERING_KEY": "LINEAR(COL_1)"
+ },
"externalUrl": "https://app.snowflake.com/ap-south-1.aws/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/table/TABLE_6/",
"name": "TABLE_6",
"qualifiedName": "TEST_DB.TEST_SCHEMA.TABLE_6",
@@ -2284,7 +2296,9 @@
"aspectName": "datasetProperties",
"aspect": {
"json": {
- "customProperties": {"CLUSTERING_KEY": "LINEAR(COL_1)"},
+ "customProperties": {
+ "CLUSTERING_KEY": "LINEAR(COL_1)"
+ },
"externalUrl": "https://app.snowflake.com/ap-south-1.aws/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/table/TABLE_7/",
"name": "TABLE_7",
"qualifiedName": "TEST_DB.TEST_SCHEMA.TABLE_7",
@@ -2583,7 +2597,9 @@
"aspectName": "datasetProperties",
"aspect": {
"json": {
- "customProperties": {"CLUSTERING_KEY": "LINEAR(COL_1)"},
+ "customProperties": {
+ "CLUSTERING_KEY": "LINEAR(COL_1)"
+ },
"externalUrl": "https://app.snowflake.com/ap-south-1.aws/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/table/TABLE_8/",
"name": "TABLE_8",
"qualifiedName": "TEST_DB.TEST_SCHEMA.TABLE_8",
@@ -2882,7 +2898,9 @@
"aspectName": "datasetProperties",
"aspect": {
"json": {
- "customProperties": {"CLUSTERING_KEY": "LINEAR(COL_1)"},
+ "customProperties": {
+ "CLUSTERING_KEY": "LINEAR(COL_1)"
+ },
"externalUrl": "https://app.snowflake.com/ap-south-1.aws/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/table/TABLE_9/",
"name": "TABLE_9",
"qualifiedName": "TEST_DB.TEST_SCHEMA.TABLE_9",
@@ -3181,7 +3199,9 @@
"aspectName": "datasetProperties",
"aspect": {
"json": {
- "customProperties": {"CLUSTERING_KEY": "LINEAR(COL_1)"},
+ "customProperties": {
+ "CLUSTERING_KEY": "LINEAR(COL_1)"
+ },
"externalUrl": "https://app.snowflake.com/ap-south-1.aws/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/table/TABLE_10/",
"name": "TABLE_10",
"qualifiedName": "TEST_DB.TEST_SCHEMA.TABLE_10",
@@ -3471,23 +3491,25 @@
"aspectName": "datasetProperties",
"aspect": {
"json": {
- "customProperties": {},
+ "customProperties": {
+ "IS_SECURE": "true"
+ },
"externalUrl": "https://app.snowflake.com/ap-south-1.aws/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/view/VIEW_1/",
"name": "VIEW_1",
"qualifiedName": "TEST_DB.TEST_SCHEMA.VIEW_1",
"description": "Comment for View",
"created": {
- "time": 1623103200000
+ "time": 1623090600000
},
"lastModified": {
- "time": 1623103200000
+ "time": 1623090600000
},
"tags": []
}
},
"systemMetadata": {
"lastObserved": 1615443388097,
- "runId": "snowflake-2023_12_18-10_16_09",
+ "runId": "snowflake-2024_12_16-15_30_20-649nax",
"lastRunId": "no-run-id-provided"
}
},
diff --git a/metadata-ingestion/tests/integration/snowflake/snowflake_privatelink_golden.json b/metadata-ingestion/tests/integration/snowflake/snowflake_privatelink_golden.json
index 3040c6c4e9196f..f22cbd122361dc 100644
--- a/metadata-ingestion/tests/integration/snowflake/snowflake_privatelink_golden.json
+++ b/metadata-ingestion/tests/integration/snowflake/snowflake_privatelink_golden.json
@@ -621,12 +621,17 @@
"op": "add",
"path": "/qualifiedName",
"value": "TEST_DB.TEST_SCHEMA.VIEW_1"
+ },
+ {
+ "op": "add",
+ "path": "/customProperties/IS_SECURE",
+ "value": "true"
}
]
},
"systemMetadata": {
"lastObserved": 1654621200000,
- "runId": "snowflake-2022_06_07-17_00_00-ad3hnf",
+ "runId": "snowflake-2022_06_07-17_00_00-ivthci",
"lastRunId": "no-run-id-provided"
}
},
diff --git a/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py b/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py
index 5b557efdab0bb0..4b2ac96931b950 100644
--- a/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py
+++ b/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py
@@ -7,6 +7,7 @@
import pytest
from freezegun import freeze_time
+from pydantic import ValidationError
from requests.adapters import ConnectionError
from tableauserverclient import PermissionsRule, Server
from tableauserverclient.models import (
@@ -21,7 +22,9 @@
from datahub.emitter.mce_builder import DEFAULT_ENV, make_schema_field_urn
from datahub.emitter.mcp import MetadataChangeProposalWrapper
-from datahub.ingestion.run.pipeline import Pipeline, PipelineContext, PipelineInitError
+from datahub.ingestion.api.source import TestConnectionReport
+from datahub.ingestion.run.pipeline import Pipeline, PipelineContext
+from datahub.ingestion.source.tableau import tableau_constant as c
from datahub.ingestion.source.tableau.tableau import (
TableauConfig,
TableauSiteSource,
@@ -61,6 +64,7 @@
"projects": ["default", "Project 2", "Samples"],
"extract_project_hierarchy": False,
"page_size": 1000,
+ "workbook_page_size": 1000,
"ingest_tags": True,
"ingest_owner": True,
"ingest_tables_external": True,
@@ -571,52 +575,28 @@ def test_extract_all_project(pytestconfig, tmp_path, mock_datahub_graph):
def test_value_error_projects_and_project_pattern(
pytestconfig, tmp_path, mock_datahub_graph
):
- # Ingestion should raise ValueError
- output_file_name: str = "tableau_project_pattern_precedence_mces.json"
- golden_file_name: str = "tableau_project_pattern_precedence_mces_golden.json"
-
new_config = config_source_default.copy()
new_config["projects"] = ["default"]
new_config["project_pattern"] = {"allow": ["^Samples$"]}
with pytest.raises(
- PipelineInitError,
+ ValidationError,
match=r".*projects is deprecated. Please use project_path_pattern only.*",
):
- tableau_ingest_common(
- pytestconfig,
- tmp_path,
- mock_data(),
- golden_file_name,
- output_file_name,
- mock_datahub_graph,
- pipeline_config=new_config,
- )
+ TableauConfig.parse_obj(new_config)
def test_project_pattern_deprecation(pytestconfig, tmp_path, mock_datahub_graph):
- # Ingestion should raise ValueError
- output_file_name: str = "tableau_project_pattern_deprecation_mces.json"
- golden_file_name: str = "tableau_project_pattern_deprecation_mces_golden.json"
-
new_config = config_source_default.copy()
del new_config["projects"]
new_config["project_pattern"] = {"allow": ["^Samples$"]}
new_config["project_path_pattern"] = {"allow": ["^Samples$"]}
with pytest.raises(
- PipelineInitError,
+ ValidationError,
match=r".*project_pattern is deprecated. Please use project_path_pattern only*",
):
- tableau_ingest_common(
- pytestconfig,
- tmp_path,
- mock_data(),
- golden_file_name,
- output_file_name,
- mock_datahub_graph,
- pipeline_config=new_config,
- )
+ TableauConfig.parse_obj(new_config)
def test_project_path_pattern_allow(pytestconfig, tmp_path, mock_datahub_graph):
@@ -674,6 +654,7 @@ def test_tableau_ingest_with_platform_instance(
"platform_instance": "acryl_site1",
"projects": ["default", "Project 2"],
"page_size": 1000,
+ "workbook_page_size": 1000,
"ingest_tags": True,
"ingest_owner": True,
"ingest_tables_external": True,
@@ -1296,31 +1277,21 @@ def test_hidden_asset_tags(pytestconfig, tmp_path, mock_datahub_graph):
@pytest.mark.integration
def test_hidden_assets_without_ingest_tags(pytestconfig, tmp_path, mock_datahub_graph):
enable_logging()
- output_file_name: str = "tableau_hidden_asset_tags_error_mces.json"
- golden_file_name: str = "tableau_hidden_asset_tags_error_mces_golden.json"
new_config = config_source_default.copy()
new_config["tags_for_hidden_assets"] = ["hidden", "private"]
new_config["ingest_tags"] = False
with pytest.raises(
- PipelineInitError,
+ ValidationError,
match=r".*tags_for_hidden_assets is only allowed with ingest_tags enabled.*",
):
- tableau_ingest_common(
- pytestconfig,
- tmp_path,
- mock_data(),
- golden_file_name,
- output_file_name,
- mock_datahub_graph,
- pipeline_config=new_config,
- )
+ TableauConfig.parse_obj(new_config)
@freeze_time(FROZEN_TIME)
@pytest.mark.integration
-def test_permission_mode_switched_error(pytestconfig, tmp_path, mock_datahub_graph):
+def test_permission_warning(pytestconfig, tmp_path, mock_datahub_graph):
with mock.patch(
"datahub.ingestion.source.state_provider.datahub_ingestion_checkpointing_provider.DataHubGraph",
mock_datahub_graph,
@@ -1357,11 +1328,99 @@ def test_permission_mode_switched_error(pytestconfig, tmp_path, mock_datahub_gra
warnings = list(reporter.warnings)
- assert len(warnings) == 1
+ assert len(warnings) == 2
+
+ assert warnings[0].title == "Insufficient Permissions"
- assert warnings[0].title == "Derived Permission Error"
+ assert warnings[1].title == "Derived Permission Error"
- assert warnings[0].message == (
+ assert warnings[1].message == (
"Turn on your derived permissions. See for details "
"https://community.tableau.com/s/question/0D54T00000QnjHbSAJ/how-to-fix-the-permissionsmodeswitched-error"
)
+
+
+@freeze_time(FROZEN_TIME)
+@pytest.mark.integration
+def test_connection_report_test(requests_mock):
+    server_info_response = """
+        <tsResponse xmlns:t="http://tableau.com/api">
+            <t:serverInfo>
+                <t:productVersion build="build_number">foo</t:productVersion>
+                <t:restApiVersion>2.4</t:restApiVersion>
+            </t:serverInfo>
+        </tsResponse>
+    """
+
+ requests_mock.register_uri(
+ "GET",
+ "https://do-not-connect/api/2.4/serverInfo",
+ text=server_info_response,
+ status_code=200,
+ headers={"Content-Type": "application/xml"},
+ )
+
+    signin_response = """
+        <tsResponse xmlns:t="http://tableau.com/api">
+            <t:credentials token="fake_token">
+                <t:site id="fake_site_luid" contentUrl="fake_site_content_url"/>
+                <t:user id="fake_user_id"/>
+            </t:credentials>
+        </tsResponse>
+    """
+
+ requests_mock.register_uri(
+ "POST",
+ "https://do-not-connect/api/2.4/auth/signin",
+ text=signin_response,
+ status_code=200,
+ headers={"Content-Type": "application/xml"},
+ )
+
+    user_by_id_response = """
+        <tsResponse xmlns:t="http://tableau.com/api">
+            <t:user id="fake_user_id" name="foo" siteRole="SiteAdministratorExplorer"/>
+        </tsResponse>
+    """
+
+ requests_mock.register_uri(
+ "GET",
+ "https://do-not-connect/api/2.4/sites/fake_site_luid/users/fake_user_id",
+ text=user_by_id_response,
+ status_code=200,
+ headers={"Content-Type": "application/xml"},
+ )
+
+ report: TestConnectionReport = TableauSource.test_connection(config_source_default)
+
+ assert report
+ assert report.capability_report
+ assert report.capability_report.get(c.SITE_PERMISSION)
+ assert report.capability_report[c.SITE_PERMISSION].capable
+
+ # Role other than SiteAdministratorExplorer
+    user_by_id_response = """
+        <tsResponse xmlns:t="http://tableau.com/api">
+            <t:user id="fake_user_id" name="foo" siteRole="Explorer"/>
+        </tsResponse>
+    """
+
+ requests_mock.register_uri(
+ "GET",
+ "https://do-not-connect/api/2.4/sites/fake_site_luid/users/fake_user_id",
+ text=user_by_id_response,
+ status_code=200,
+ headers={"Content-Type": "application/xml"},
+ )
+
+ report = TableauSource.test_connection(config_source_default)
+
+ assert report
+ assert report.capability_report
+ assert report.capability_report.get(c.SITE_PERMISSION)
+ assert report.capability_report[c.SITE_PERMISSION].capable is False
+ assert (
+ report.capability_report[c.SITE_PERMISSION].failure_reason
+ == "The user does not have the `Site Administrator Explorer` role. Their current role is Explorer."
+ )
diff --git a/metadata-ingestion/tests/unit/api/entities/structuredproperties/__init__.py b/metadata-ingestion/tests/unit/api/entities/structuredproperties/__init__.py
new file mode 100644
index 00000000000000..e69de29bb2d1d6
diff --git a/metadata-ingestion/tests/unit/api/entities/structuredproperties/example_structured_properties_golden.json b/metadata-ingestion/tests/unit/api/entities/structuredproperties/example_structured_properties_golden.json
new file mode 100644
index 00000000000000..29386ece7b0ca1
--- /dev/null
+++ b/metadata-ingestion/tests/unit/api/entities/structuredproperties/example_structured_properties_golden.json
@@ -0,0 +1,194 @@
+[
+{
+ "entityType": "structuredProperty",
+ "entityUrn": "urn:li:structuredProperty:io.acryl.privacy.retentionTime",
+ "changeType": "UPSERT",
+ "aspectName": "propertyDefinition",
+ "aspect": {
+ "json": {
+ "qualifiedName": "io.acryl.privacy.retentionTime",
+ "displayName": "Retention Time",
+ "valueType": "urn:li:dataType:datahub.number",
+ "allowedValues": [
+ {
+ "value": {
+ "string": "30"
+ },
+ "description": "30 days, usually reserved for datasets that are ephemeral and contain pii"
+ },
+ {
+ "value": {
+ "string": "90"
+ },
+ "description": "Use this for datasets that drive monthly reporting but contain pii"
+ },
+ {
+ "value": {
+ "string": "365"
+ },
+ "description": "Use this for non-sensitive data that can be retained for longer"
+ }
+ ],
+ "cardinality": "MULTIPLE",
+ "entityTypes": [
+ "urn:li:entityType:datahub.dataset",
+ "urn:li:entityType:datahub.dataFlow"
+ ],
+ "description": "Retention Time is used to figure out how long to retain records in a dataset",
+ "immutable": false
+ }
+ }
+},
+{
+ "entityType": "structuredProperty",
+ "entityUrn": "urn:li:structuredProperty:io.acryl.dataManagement.replicationSLA",
+ "changeType": "UPSERT",
+ "aspectName": "propertyDefinition",
+ "aspect": {
+ "json": {
+ "qualifiedName": "io.acryl.dataManagement.replicationSLA",
+ "displayName": "Replication SLA",
+ "valueType": "urn:li:dataType:datahub.number",
+ "cardinality": "SINGLE",
+ "entityTypes": [
+ "urn:li:entityType:datahub.dataset"
+ ],
+ "description": "SLA for how long data can be delayed before replicating to the destination cluster",
+ "immutable": false
+ }
+ }
+},
+{
+ "entityType": "structuredProperty",
+ "entityUrn": "urn:li:structuredProperty:io.acryl.dataManagement.deprecationDate",
+ "changeType": "UPSERT",
+ "aspectName": "propertyDefinition",
+ "aspect": {
+ "json": {
+ "qualifiedName": "io.acryl.dataManagement.deprecationDate",
+ "displayName": "Deprecation Date",
+ "valueType": "urn:li:dataType:datahub.date",
+ "cardinality": "SINGLE",
+ "entityTypes": [
+ "urn:li:entityType:datahub.dataset",
+ "urn:li:entityType:datahub.dataFlow",
+ "urn:li:entityType:datahub.dataJob"
+ ],
+ "immutable": false
+ }
+ }
+},
+{
+ "entityType": "structuredProperty",
+ "entityUrn": "urn:li:structuredProperty:io.acryl.dataManagement.steward",
+ "changeType": "UPSERT",
+ "aspectName": "propertyDefinition",
+ "aspect": {
+ "json": {
+ "qualifiedName": "io.acryl.dataManagement.steward",
+ "displayName": "Steward",
+ "valueType": "urn:li:dataType:datahub.urn",
+ "typeQualifier": {
+ "allowedTypes": [
+ "urn:li:entityType:datahub.corpuser",
+ "urn:li:entityType:datahub.corpGroup"
+ ]
+ },
+ "cardinality": "SINGLE",
+ "entityTypes": [
+ "urn:li:entityType:datahub.dataset",
+ "urn:li:entityType:datahub.dataFlow",
+ "urn:li:entityType:datahub.dataJob"
+ ],
+ "immutable": false
+ }
+ }
+},
+{
+ "entityType": "structuredProperty",
+ "entityUrn": "urn:li:structuredProperty:io.acryl.dataManagement.certifier",
+ "changeType": "UPSERT",
+ "aspectName": "propertyDefinition",
+ "aspect": {
+ "json": {
+ "qualifiedName": "io.acryl.dataManagement.certifier",
+ "displayName": "Person Certifying the asset",
+ "valueType": "urn:li:dataType:datahub.urn",
+ "cardinality": "SINGLE",
+ "entityTypes": [
+ "urn:li:entityType:datahub.dataset",
+ "urn:li:entityType:datahub.schemaField"
+ ],
+ "immutable": false
+ }
+ }
+},
+{
+ "entityType": "structuredProperty",
+ "entityUrn": "urn:li:structuredProperty:io.acryl.dataManagement.team",
+ "changeType": "UPSERT",
+ "aspectName": "propertyDefinition",
+ "aspect": {
+ "json": {
+ "qualifiedName": "io.acryl.dataManagement.team",
+ "displayName": "Management team",
+ "valueType": "urn:li:dataType:datahub.string",
+ "cardinality": "SINGLE",
+ "entityTypes": [
+ "urn:li:entityType:datahub.dataset"
+ ],
+ "immutable": false
+ }
+ }
+},
+{
+ "entityType": "structuredProperty",
+ "entityUrn": "urn:li:structuredProperty:projectNames",
+ "changeType": "UPSERT",
+ "aspectName": "propertyDefinition",
+ "aspect": {
+ "json": {
+ "qualifiedName": "projectNames",
+ "displayName": "Project names",
+ "valueType": "urn:li:dataType:datahub.string",
+ "allowedValues": [
+ {
+ "value": {
+ "string": "Tracking"
+ },
+ "description": "test value 1 for project"
+ },
+ {
+ "value": {
+ "string": "DataHub"
+ },
+ "description": "test value 2 for project"
+ }
+ ],
+ "cardinality": "MULTIPLE",
+ "entityTypes": [
+ "urn:li:entityType:datahub.dataset"
+ ],
+ "immutable": false
+ }
+ }
+},
+{
+ "entityType": "structuredProperty",
+ "entityUrn": "urn:li:structuredProperty:namespace",
+ "changeType": "UPSERT",
+ "aspectName": "propertyDefinition",
+ "aspect": {
+ "json": {
+ "qualifiedName": "namespace",
+ "displayName": "Namespace",
+ "valueType": "urn:li:dataType:datahub.string",
+ "cardinality": "SINGLE",
+ "entityTypes": [
+ "urn:li:entityType:datahub.dataset"
+ ],
+ "immutable": false
+ }
+ }
+}
+]
\ No newline at end of file
diff --git a/metadata-ingestion/tests/unit/api/entities/structuredproperties/test_structuredproperties.py b/metadata-ingestion/tests/unit/api/entities/structuredproperties/test_structuredproperties.py
new file mode 100644
index 00000000000000..e96b7c1f98437e
--- /dev/null
+++ b/metadata-ingestion/tests/unit/api/entities/structuredproperties/test_structuredproperties.py
@@ -0,0 +1,38 @@
+import pathlib
+
+import pydantic
+import pytest
+
+from datahub.api.entities.structuredproperties.structuredproperties import (
+ StructuredProperties,
+ TypeQualifierAllowedTypes,
+)
+from tests.test_helpers.mce_helpers import check_goldens_stream
+
+RESOURCE_DIR = pathlib.Path(__file__).parent
+
+
+def test_type_validation() -> None:
+ with pytest.raises(pydantic.ValidationError):
+ TypeQualifierAllowedTypes(allowed_types=["thisdoesnotexist"])
+
+ types = TypeQualifierAllowedTypes(allowed_types=["dataset"])
+ assert types.allowed_types == ["urn:li:entityType:datahub.dataset"]
+
+
+def test_structuredproperties_load(pytestconfig: pytest.Config) -> None:
+ example_properties_file = (
+ pytestconfig.rootpath
+ / "examples/structured_properties/structured_properties.yaml"
+ )
+
+ properties = StructuredProperties.from_yaml(str(example_properties_file))
+ mcps = []
+ for property in properties:
+ mcps.extend(property.generate_mcps())
+
+ check_goldens_stream(
+ pytestconfig,
+ mcps,
+ golden_path=RESOURCE_DIR / "example_structured_properties_golden.json",
+ )
diff --git a/metadata-ingestion/tests/unit/serde/test_codegen.py b/metadata-ingestion/tests/unit/serde/test_codegen.py
index 37ac35586950e1..98d62d5643ff2d 100644
--- a/metadata-ingestion/tests/unit/serde/test_codegen.py
+++ b/metadata-ingestion/tests/unit/serde/test_codegen.py
@@ -18,6 +18,7 @@
UpstreamClass,
_Aspect,
)
+from datahub.utilities.urns._urn_base import URN_TYPES
_UPDATE_ENTITY_REGISTRY = os.getenv("UPDATE_ENTITY_REGISTRY", "false").lower() == "true"
ENTITY_REGISTRY_PATH = pathlib.Path(
@@ -165,3 +166,9 @@ def test_enum_options():
# This is mainly a sanity check to ensure that it doesn't do anything too crazy.
env_options = get_enum_options(FabricTypeClass)
assert "PROD" in env_options
+
+
+def test_urn_types() -> None:
+ assert len(URN_TYPES) > 10
+ for checked_type in ["dataset", "dashboard", "dataFlow", "schemaField"]:
+ assert checked_type in URN_TYPES
diff --git a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_table_rename.json b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_table_rename.json
index 2d32e1328fbb4f..fd8475090f009e 100644
--- a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_table_rename.json
+++ b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_table_rename.json
@@ -185,7 +185,7 @@
"aspect": {
"json": {
"statement": {
- "value": "ALTER TABLE dev.public.foo_staging RENAME TO foo",
+ "value": "ALTER TABLE dev.public.foo_staging RENAME TO foo /* Datahub generated query text-- */",
"language": "SQL"
},
"source": "SYSTEM",
diff --git a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_table_swap.json b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_table_swap.json
index af0fca485777ff..d9d46a4b14a146 100644
--- a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_table_swap.json
+++ b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_table_swap.json
@@ -185,7 +185,7 @@
"aspect": {
"json": {
"statement": {
- "value": "ALTER TABLE dev.public.person_info_swap SWAP WITH dev.public.person_info",
+ "value": "ALTER TABLE dev.public.person_info_swap SWAP WITH dev.public.person_info /* Datahub generated query text-- */",
"language": "SQL"
},
"source": "SYSTEM",
@@ -438,7 +438,7 @@
"aspect": {
"json": {
"statement": {
- "value": "ALTER TABLE dev.public.person_info SWAP WITH dev.public.person_info_swap",
+ "value": "ALTER TABLE dev.public.person_info SWAP WITH dev.public.person_info_swap /* Datahub generated query text-- */",
"language": "SQL"
},
"source": "SYSTEM",
diff --git a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_table_swap_with_temp.json b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_table_swap_with_temp.json
index ceaaf8f6887c7c..b4eaf76a149337 100644
--- a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_table_swap_with_temp.json
+++ b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_table_swap_with_temp.json
@@ -175,7 +175,7 @@
"aspect": {
"json": {
"statement": {
- "value": "CREATE TABLE person_info_swap CLONE person_info;\n\nCREATE TABLE person_info_incremental AS\nSELECT\n *\nFROM person_info_dep;\n\nINSERT INTO person_info_swap\nSELECT\n *\nFROM person_info_incremental;\n\nALTER TABLE dev.public.person_info_swap SWAP WITH dev.public.person_info",
+ "value": "CREATE TABLE person_info_swap CLONE person_info;\n\nCREATE TABLE person_info_incremental AS\nSELECT\n *\nFROM person_info_dep;\n\nINSERT INTO person_info_swap\nSELECT\n *\nFROM person_info_incremental;\n\nALTER TABLE dev.public.person_info_swap SWAP WITH dev.public.person_info /* Datahub generated query text-- */",
"language": "SQL"
},
"source": "SYSTEM",
diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_information_schema_query.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_information_schema_query.json
index f5f573f3d51136..9621b7d1c265b4 100644
--- a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_information_schema_query.json
+++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_information_schema_query.json
@@ -1,7 +1,7 @@
{
"query_type": "SELECT",
"query_type_props": {},
- "query_fingerprint": "c721ce16410601b36e5f32bd9c5c28488500a93e617363739faebfe71496f163",
+ "query_fingerprint": "a204522c98a01568d8575a98a715de98985aeef0e822feb8450153f71891d6c6",
"in_tables": [
"urn:li:dataset:(urn:li:dataPlatform:bigquery,acryl-staging-2.smoke_test_db_4.INFORMATION_SCHEMA.COLUMNS,PROD)",
"urn:li:dataset:(urn:li:dataPlatform:bigquery,acryl-staging-2.smoke_test_db_4.INFORMATION_SCHEMA.COLUMN_FIELD_PATHS,PROD)"
@@ -178,6 +178,6 @@
],
"debug_info": {
"confidence": 0.2,
- "generalized_statement": "SELECT c.table_catalog AS table_catalog, c.table_schema AS table_schema, c.table_name AS table_name, c.column_name AS column_name, c.ordinal_position AS ordinal_position, cfp.field_path AS field_path, c.is_nullable AS is_nullable, CASE WHEN CONTAINS_SUBSTR(cfp.field_path, ?) THEN NULL ELSE c.data_type END AS data_type, description AS comment, c.is_hidden AS is_hidden, c.is_partitioning_column AS is_partitioning_column, c.clustering_ordinal_position AS clustering_ordinal_position FROM `acryl-staging-2`.`smoke_test_db_4`.INFORMATION_SCHEMA.COLUMNS AS c JOIN `acryl-staging-2`.`smoke_test_db_4`.INFORMATION_SCHEMA.COLUMN_FIELD_PATHS AS cfp ON cfp.table_name = c.table_name AND cfp.column_name = c.column_name ORDER BY table_catalog, table_schema, table_name, ordinal_position ASC, data_type DESC"
+ "generalized_statement": "SELECT c.table_catalog AS table_catalog, c.table_schema AS table_schema, c.table_name AS table_name, c.column_name AS column_name, c.ordinal_position AS ordinal_position, cfp.field_path AS field_path, c.is_nullable AS is_nullable, CASE WHEN CONTAINS_SUBSTR(cfp.field_path, ?) THEN NULL ELSE c.data_type END AS data_type, description AS comment, c.is_hidden AS is_hidden, c.is_partitioning_column AS is_partitioning_column, c.clustering_ordinal_position AS clustering_ordinal_position FROM `acryl-staging-2`.`smoke_test_db_4`.`INFORMATION_SCHEMA.COLUMNS` AS c JOIN `acryl-staging-2`.`smoke_test_db_4`.`INFORMATION_SCHEMA.COLUMN_FIELD_PATHS` AS cfp ON cfp.table_name = c.table_name AND cfp.column_name = c.column_name ORDER BY table_catalog, table_schema, table_name, ordinal_position ASC, data_type DESC"
}
}
\ No newline at end of file
diff --git a/metadata-ingestion/tests/unit/sql_parsing/test_tool_meta_extractor.py b/metadata-ingestion/tests/unit/sql_parsing/test_tool_meta_extractor.py
index 6f590b53071467..f6566f007f5e6b 100644
--- a/metadata-ingestion/tests/unit/sql_parsing/test_tool_meta_extractor.py
+++ b/metadata-ingestion/tests/unit/sql_parsing/test_tool_meta_extractor.py
@@ -1,11 +1,14 @@
from datahub.configuration.datetimes import parse_absolute_time
from datahub.metadata.urns import CorpUserUrn
from datahub.sql_parsing.sql_parsing_aggregator import PreparsedQuery
-from datahub.sql_parsing.tool_meta_extractor import ToolMetaExtractor
+from datahub.sql_parsing.tool_meta_extractor import (
+ ToolMetaExtractor,
+ ToolMetaExtractorReport,
+)
def test_extract_mode_metadata() -> None:
- extractor = ToolMetaExtractor()
+ extractor = ToolMetaExtractor(report=ToolMetaExtractorReport())
query = """\
select * from LONG_TAIL_COMPANIONS.ADOPTION.PET_PROFILES
LIMIT 100
@@ -30,8 +33,42 @@ def test_extract_mode_metadata() -> None:
assert extractor.report.num_queries_meta_extracted["mode"] == 1
+def test_extract_looker_metadata() -> None:
+ extractor = ToolMetaExtractor(
+ report=ToolMetaExtractorReport(), looker_user_mapping={"7": "john.doe@xyz.com"}
+ )
+ looker_query = """\
+SELECT
+ all_entities_extended_sibling."ENTITY" AS "all_entities_extended_sibling.entity_type",
+ COUNT(DISTINCT ( all_entities_extended_sibling."URN" )) AS "all_entities_extended_sibling.distinct_count"
+FROM "PUBLIC"."ALL_ENTITIES"
+ AS all_entities_extended_sibling
+GROUP BY
+ 1
+ORDER BY
+ 1
+FETCH NEXT 50 ROWS ONLY
+-- Looker Query Context '{"user_id":7,"history_slug":"264797031bc403cf382cbefbe3700849","instance_slug":"32654f2ffadf10b1949d4009e52fc6a4"}'
+"""
+
+ entry = PreparsedQuery(
+ query_id=None,
+ query_text=looker_query,
+ upstreams=[],
+ downstream=None,
+ column_lineage=None,
+ column_usage=None,
+ inferred_schema=None,
+ user=CorpUserUrn("mode"),
+ timestamp=parse_absolute_time("2021-08-01T01:02:03Z"),
+ )
+ assert extractor.extract_bi_metadata(entry)
+ assert entry.user == CorpUserUrn("john.doe")
+ assert extractor.report.num_queries_meta_extracted["looker"] == 1
+
+
def test_extract_no_metadata() -> None:
- extractor = ToolMetaExtractor()
+ extractor = ToolMetaExtractor(report=ToolMetaExtractorReport())
query = """\
select * from LONG_TAIL_COMPANIONS.ADOPTION.PET_PROFILES
LIMIT 100
@@ -53,3 +90,4 @@ def test_extract_no_metadata() -> None:
assert not extractor.extract_bi_metadata(entry)
assert extractor.report.num_queries_meta_extracted["mode"] == 0
+ assert extractor.report.num_queries_meta_extracted["looker"] == 0
diff --git a/metadata-ingestion/tests/unit/stateful_ingestion/state/test_redundant_run_skip_handler.py b/metadata-ingestion/tests/unit/stateful_ingestion/state/test_redundant_run_skip_handler.py
index 85c86f8d205d9a..5631ad2c69f949 100644
--- a/metadata-ingestion/tests/unit/stateful_ingestion/state/test_redundant_run_skip_handler.py
+++ b/metadata-ingestion/tests/unit/stateful_ingestion/state/test_redundant_run_skip_handler.py
@@ -37,7 +37,11 @@ def stateful_source(mock_datahub_graph: DataHubGraph) -> Iterable[SnowflakeV2Sou
),
)
- with mock.patch("snowflake.connector.connect"):
+ with mock.patch(
+ "datahub.sql_parsing.sql_parsing_aggregator.ToolMetaExtractor.create",
+ ) as mock_checkpoint, mock.patch("snowflake.connector.connect"):
+ mock_checkpoint.return_value = mock.MagicMock()
+
yield SnowflakeV2Source(ctx=ctx, config=config)
diff --git a/metadata-ingestion/tests/unit/test_mlflow_source.py b/metadata-ingestion/tests/unit/test_mlflow_source.py
index d213dd92352e62..e882296b6f331d 100644
--- a/metadata-ingestion/tests/unit/test_mlflow_source.py
+++ b/metadata-ingestion/tests/unit/test_mlflow_source.py
@@ -136,3 +136,16 @@ def test_make_external_link_remote(source, model_version):
url = source._make_external_url(model_version)
assert url == expected_url
+
+
+def test_make_external_link_remote_via_config(source, model_version):
+ custom_base_url = "https://custom-server.org"
+ source.config.base_external_url = custom_base_url
+ source.client = MlflowClient(
+ tracking_uri="https://dummy-mlflow-tracking-server.org"
+ )
+ expected_url = f"{custom_base_url}/#/models/{model_version.name}/versions/{model_version.version}"
+
+ url = source._make_external_url(model_version)
+
+ assert url == expected_url
diff --git a/metadata-ingestion/tests/unit/test_tableau_source.py b/metadata-ingestion/tests/unit/test_tableau_source.py
index c81aa0bd8a1b1a..44e59decaecbd7 100644
--- a/metadata-ingestion/tests/unit/test_tableau_source.py
+++ b/metadata-ingestion/tests/unit/test_tableau_source.py
@@ -182,8 +182,14 @@ def test_get_filter_pages_simple():
assert get_filter_pages(filter_dict, 10) == [filter_dict]
-def test_get_filter_pages_non_id_large_filter_passthrough():
- projects = [f"project{i}" for i in range(20000)]
+def test_get_filter_pages_non_id_large_filter():
+ projects = [f"project{i}" for i in range(10)]
+ filter_dict = {c.PROJECT_NAME_WITH_IN: projects}
+ assert get_filter_pages(filter_dict, 10) == [filter_dict]
+
+
+def test_get_filter_pages_for_single_key():
+ projects = ["project1"]
filter_dict = {c.PROJECT_NAME_WITH_IN: projects}
assert get_filter_pages(filter_dict, 10) == [filter_dict]
diff --git a/metadata-ingestion/tests/unit/utilities/test_file_backed_collections.py b/metadata-ingestion/tests/unit/utilities/test_file_backed_collections.py
index f4062f9a911453..6230c2e37edc6a 100644
--- a/metadata-ingestion/tests/unit/utilities/test_file_backed_collections.py
+++ b/metadata-ingestion/tests/unit/utilities/test_file_backed_collections.py
@@ -15,11 +15,13 @@
)
-def test_file_dict() -> None:
+@pytest.mark.parametrize("use_sqlite_on_conflict", [True, False])
+def test_file_dict(use_sqlite_on_conflict: bool) -> None:
cache = FileBackedDict[int](
tablename="cache",
cache_max_size=10,
cache_eviction_batch_size=10,
+ _use_sqlite_on_conflict=use_sqlite_on_conflict,
)
for i in range(100):
@@ -92,7 +94,8 @@ def test_file_dict() -> None:
cache["a"] = 1
-def test_custom_serde() -> None:
+@pytest.mark.parametrize("use_sqlite_on_conflict", [True, False])
+def test_custom_serde(use_sqlite_on_conflict: bool) -> None:
@dataclass(frozen=True)
class Label:
a: str
@@ -139,6 +142,7 @@ def deserialize(s: str) -> Main:
deserializer=deserialize,
# Disable the in-memory cache to force all reads/writes to the DB.
cache_max_size=0,
+ _use_sqlite_on_conflict=use_sqlite_on_conflict,
)
first = Main(3, {Label("one", 1): 0.1, Label("two", 2): 0.2})
second = Main(-100, {Label("z", 26): 0.26})
@@ -186,7 +190,8 @@ def test_file_dict_stores_counter() -> None:
assert in_memory_counters[i].most_common(2) == cache[str(i)].most_common(2)
-def test_file_dict_ordering() -> None:
+@pytest.mark.parametrize("use_sqlite_on_conflict", [True, False])
+def test_file_dict_ordering(use_sqlite_on_conflict: bool) -> None:
"""
We require that FileBackedDict maintains insertion order, similar to Python's
built-in dict. This test makes one of each and validates that they behave the same.
@@ -196,6 +201,7 @@ def test_file_dict_ordering() -> None:
serializer=str,
deserializer=int,
cache_max_size=1,
+ _use_sqlite_on_conflict=use_sqlite_on_conflict,
)
data = {}
@@ -229,12 +235,14 @@ class Pair:
@pytest.mark.parametrize("cache_max_size", [0, 1, 10])
-def test_custom_column(cache_max_size: int) -> None:
+@pytest.mark.parametrize("use_sqlite_on_conflict", [True, False])
+def test_custom_column(cache_max_size: int, use_sqlite_on_conflict: bool) -> None:
cache = FileBackedDict[Pair](
extra_columns={
"x": lambda m: m.x,
},
cache_max_size=cache_max_size,
+ _use_sqlite_on_conflict=use_sqlite_on_conflict,
)
cache["first"] = Pair(3, "a")
@@ -275,7 +283,8 @@ def test_custom_column(cache_max_size: int) -> None:
]
-def test_shared_connection() -> None:
+@pytest.mark.parametrize("use_sqlite_on_conflict", [True, False])
+def test_shared_connection(use_sqlite_on_conflict: bool) -> None:
with ConnectionWrapper() as connection:
cache1 = FileBackedDict[int](
shared_connection=connection,
@@ -283,6 +292,7 @@ def test_shared_connection() -> None:
extra_columns={
"v": lambda v: v,
},
+ _use_sqlite_on_conflict=use_sqlite_on_conflict,
)
cache2 = FileBackedDict[Pair](
shared_connection=connection,
@@ -291,6 +301,7 @@ def test_shared_connection() -> None:
"x": lambda m: m.x,
"y": lambda m: m.y,
},
+ _use_sqlite_on_conflict=use_sqlite_on_conflict,
)
cache1["a"] = 3
diff --git a/metadata-integration/java/datahub-client/build.gradle b/metadata-integration/java/datahub-client/build.gradle
index cec3164f10d6cc..42861cf235b56f 100644
--- a/metadata-integration/java/datahub-client/build.gradle
+++ b/metadata-integration/java/datahub-client/build.gradle
@@ -95,6 +95,11 @@ test {
finalizedBy jacocoTestReport
}
+// No submodule depends on datahub-schematron:cli, and its tests are the ones
+// that check Python-Java compatibility, so run them as part of this module's tests.
+test.dependsOn tasks.getByPath(":metadata-integration:java:datahub-schematron:cli:test")
+test.dependsOn tasks.getByPath(":metadata-integration:java:datahub-schematron:lib:test")
+
task checkShadowJar(type: Exec) {
commandLine 'sh', '-c', 'scripts/check_jar.sh'
}
diff --git a/metadata-integration/java/datahub-schematron/lib/src/test/java/io/datahubproject/schematron/converters/avro/AvroSchemaConverterTest.java b/metadata-integration/java/datahub-schematron/lib/src/test/java/io/datahubproject/schematron/converters/avro/AvroSchemaConverterTest.java
new file mode 100644
index 00000000000000..d6522c2d84670f
--- /dev/null
+++ b/metadata-integration/java/datahub-schematron/lib/src/test/java/io/datahubproject/schematron/converters/avro/AvroSchemaConverterTest.java
@@ -0,0 +1,942 @@
+package io.datahubproject.schematron.converters.avro;
+
+import static org.testng.Assert.*;
+
+import com.linkedin.common.urn.DataPlatformUrn;
+import com.linkedin.data.template.StringArray;
+import com.linkedin.schema.*;
+import java.io.File;
+import java.io.IOException;
+import java.net.URISyntaxException;
+import java.util.Collections;
+import org.apache.avro.Schema;
+import org.testng.annotations.*;
+
+@Test(groups = "unit")
+class AvroSchemaConverterTest {
+
+ private AvroSchemaConverter avroSchemaConverter = AvroSchemaConverter.builder().build();
+ private DataPlatformUrn dataPlatformUrn =
+ DataPlatformUrn.createFromString("urn:li:dataPlatform:foo");
+
+ AvroSchemaConverterTest() throws URISyntaxException {}
+
+ @Test(groups = "basic")
+ void testPrimitiveTypes() throws IOException {
+ SchemaMetadata schema =
+ avroSchemaConverter.toDataHubSchema(
+ readAvroSchema("primitive_types.avsc"), false, false, dataPlatformUrn, null);
+
+ schema.getFields().forEach(System.out::println);
+
+ assertEquals(schema.getFields().size(), 14);
+
+ assertSchemaField(
+ schema.getFields().get(0),
+ "[version=2.0].[type=PrimitiveType].[type=int].intField",
+ "int",
+ false,
+ false,
+ new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new NumberType())));
+ assertSchemaField(
+ schema.getFields().get(1),
+ "[version=2.0].[type=PrimitiveType].[type=union].intFieldV2",
+ "union",
+ false,
+ false,
+ new SchemaFieldDataType()
+ .setType(
+ SchemaFieldDataType.Type.create(
+ new UnionType()
+ .setNestedTypes(new StringArray(Collections.singletonList("union"))))));
+ assertSchemaField(
+ schema.getFields().get(2),
+ "[version=2.0].[type=PrimitiveType].[type=union].[type=int].intFieldV2",
+ "int",
+ false,
+ false,
+ new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new NumberType())));
+ assertSchemaField(
+ schema.getFields().get(3),
+ "[version=2.0].[type=PrimitiveType].[type=null].nullField",
+ "null",
+ false,
+ false,
+ new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new NullType())));
+ assertSchemaField(
+ schema.getFields().get(4),
+ "[version=2.0].[type=PrimitiveType].[type=union].nullFieldV2",
+ "union",
+ true,
+ false,
+ new SchemaFieldDataType()
+ .setType(
+ SchemaFieldDataType.Type.create(
+ new UnionType()
+ .setNestedTypes(new StringArray(Collections.singletonList("union"))))));
+ assertSchemaField(
+ schema.getFields().get(5),
+ "[version=2.0].[type=PrimitiveType].[type=long].longField",
+ "long",
+ false,
+ false,
+ new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new NumberType())));
+ assertSchemaField(
+ schema.getFields().get(6),
+ "[version=2.0].[type=PrimitiveType].[type=float].floatField",
+ "float",
+ false,
+ false,
+ new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new NumberType())));
+ assertSchemaField(
+ schema.getFields().get(7),
+ "[version=2.0].[type=PrimitiveType].[type=double].doubleField",
+ "double",
+ false,
+ false,
+ new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new NumberType())));
+ assertSchemaField(
+ schema.getFields().get(8),
+ "[version=2.0].[type=PrimitiveType].[type=string].stringField",
+ "string",
+ false,
+ false,
+ new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new StringType())));
+ assertSchemaField(
+ schema.getFields().get(9),
+ "[version=2.0].[type=PrimitiveType].[type=boolean].booleanField",
+ "boolean",
+ false,
+ false,
+ new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new BooleanType())));
+ assertSchemaField(
+ schema.getFields().get(10),
+ "[version=2.0].[type=PrimitiveType].[type=int].nullableIntField",
+ "int",
+ true,
+ false,
+ new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new NumberType())));
+ assertSchemaField(
+ schema.getFields().get(11),
+ "[version=2.0].[type=PrimitiveType].[type=long].nullableLongField",
+ "long",
+ true,
+ false,
+ new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new NumberType())));
+ assertSchemaField(
+ schema.getFields().get(12),
+ "[version=2.0].[type=PrimitiveType].[type=string].nullableStringField",
+ "string",
+ true,
+ false,
+ new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new StringType())));
+ assertSchemaField(
+ schema.getFields().get(13),
+ "[version=2.0].[type=PrimitiveType].[type=enum].status",
+ "Enum",
+ false,
+ false,
+ new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new EnumType())));
+ }
+
+ @Test(groups = "basic")
+ void testComplexMaps() throws IOException {
+ SchemaMetadata schema =
+ avroSchemaConverter.toDataHubSchema(
+ readAvroSchema("complex_maps.avsc"), false, false, dataPlatformUrn, null);
+
+ schema.getFields().forEach(System.out::println);
+
+ assertEquals(schema.getFields().size(), 15);
+
+ assertSchemaField(
+ schema.getFields().get(0),
+ "[version=2.0].[type=MapType].[type=map].mapOfString",
+ "map",
+ false,
+ false,
+ new SchemaFieldDataType()
+ .setType(
+ SchemaFieldDataType.Type.create(
+ new MapType().setKeyType("string").setValueType("string"))));
+ assertSchemaField(
+ schema.getFields().get(1),
+ "[version=2.0].[type=MapType].[type=map].[type=ComplexType].mapOfComplexType",
+ "ComplexType",
+ false,
+ false,
+ new SchemaFieldDataType()
+ .setType(
+ SchemaFieldDataType.Type.create(
+ new MapType().setKeyType("string").setValueType("ComplexType"))));
+ assertSchemaField(
+ schema.getFields().get(2),
+ "[version=2.0].[type=MapType].[type=map].[type=ComplexType].mapOfComplexType.[type=string].field1",
+ "string",
+ false,
+ false,
+ new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new StringType())));
+ assertSchemaField(
+ schema.getFields().get(3),
+ "[version=2.0].[type=MapType].[type=map].[type=ComplexType].mapOfComplexType.[type=int].field2",
+ "int",
+ false,
+ false,
+ new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new NumberType())));
+ assertSchemaField(
+ schema.getFields().get(4),
+ "[version=2.0].[type=MapType].[type=map].[type=union].mapOfNullableString",
+ "union",
+ false,
+ false,
+ new SchemaFieldDataType()
+ .setType(
+ SchemaFieldDataType.Type.create(
+ new MapType().setKeyType("string").setValueType("union"))));
+ assertSchemaField(
+ schema.getFields().get(5),
+ "[version=2.0].[type=MapType].[type=map].[type=union].[type=string].mapOfNullableString",
+ "string",
+ false,
+ false,
+ new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new StringType())));
+ assertSchemaField(
+ schema.getFields().get(6),
+ "[version=2.0].[type=MapType].[type=map].[type=union].mapOfNullableComplexType",
+ "union",
+ false,
+ false,
+ new SchemaFieldDataType()
+ .setType(
+ SchemaFieldDataType.Type.create(
+ new MapType().setKeyType("string").setValueType("union"))));
+ assertSchemaField(
+ schema.getFields().get(7),
+ "[version=2.0].[type=MapType].[type=map].[type=union].[type=ComplexTypeNullable].mapOfNullableComplexType",
+ "ComplexTypeNullable",
+ false,
+ false,
+ new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new RecordType())));
+ assertSchemaField(
+ schema.getFields().get(8),
+ "[version=2.0].[type=MapType].[type=map].[type=union].[type=ComplexTypeNullable].mapOfNullableComplexType.[type=string].field1",
+ "string",
+ false,
+ false,
+ new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new StringType())));
+ assertSchemaField(
+ schema.getFields().get(9),
+ "[version=2.0].[type=MapType].[type=map].[type=union].[type=ComplexTypeNullable].mapOfNullableComplexType.[type=int].field2",
+ "int",
+ false,
+ false,
+ new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new NumberType())));
+ assertSchemaField(
+ schema.getFields().get(10),
+ "[version=2.0].[type=MapType].[type=map].[type=array].mapOfArray",
+ "array(string)",
+ false,
+ false,
+ new SchemaFieldDataType()
+ .setType(
+ SchemaFieldDataType.Type.create(
+ new ArrayType().setNestedType(new StringArray("string")))));
+ assertSchemaField(
+ schema.getFields().get(11),
+ "[version=2.0].[type=MapType].[type=map].[type=map].mapOfMap",
+ "map",
+ false,
+ false,
+ new SchemaFieldDataType()
+ .setType(
+ SchemaFieldDataType.Type.create(
+ new MapType().setKeyType("string").setValueType("int"))));
+ assertSchemaField(
+ schema.getFields().get(12),
+ "[version=2.0].[type=MapType].[type=map].[type=union].mapOfUnion",
+ "union",
+ false,
+ false,
+ new SchemaFieldDataType()
+ .setType(
+ SchemaFieldDataType.Type.create(
+ new MapType().setKeyType("string").setValueType("union"))));
+ assertSchemaField(
+ schema.getFields().get(13),
+ "[version=2.0].[type=MapType].[type=map].[type=union].[type=string].mapOfUnion",
+ "string",
+ false,
+ false,
+ new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new StringType())));
+ assertSchemaField(
+ schema.getFields().get(14),
+ "[version=2.0].[type=MapType].[type=map].[type=union].[type=int].mapOfUnion",
+ "int",
+ false,
+ false,
+ new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new NumberType())));
+ }
+
+ @Test(groups = "basic")
+ void testComplexArrays() throws IOException {
+ SchemaMetadata schema =
+ avroSchemaConverter.toDataHubSchema(
+ readAvroSchema("complex_arrays.avsc"), false, false, dataPlatformUrn, null);
+
+ schema.getFields().forEach(System.out::println);
+
+ assertEquals(schema.getFields().size(), 16);
+
+ assertSchemaField(
+ schema.getFields().get(0),
+ "[version=2.0].[type=ArrayType].[type=array].arrayOfString",
+ "array(string)",
+ false,
+ false,
+ new SchemaFieldDataType()
+ .setType(
+ SchemaFieldDataType.Type.create(
+ new ArrayType().setNestedType(new StringArray("string")))));
+ assertSchemaField(
+ schema.getFields().get(1),
+ "[version=2.0].[type=ArrayType].[type=array].[type=map].arrayOfMap",
+ "map",
+ false,
+ false,
+ new SchemaFieldDataType()
+ .setType(
+ SchemaFieldDataType.Type.create(
+ new MapType().setKeyType("string").setValueType("string"))));
+ assertSchemaField(
+ schema.getFields().get(2),
+ "[version=2.0].[type=ArrayType].[type=array].[type=ComplexType].arrayOfRecord",
+ "ComplexType",
+ false,
+ false,
+ new SchemaFieldDataType()
+ .setType(
+ SchemaFieldDataType.Type.create(
+ new ArrayType().setNestedType(new StringArray("ComplexType")))));
+ assertSchemaField(
+ schema.getFields().get(3),
+ "[version=2.0].[type=ArrayType].[type=array].[type=ComplexType].arrayOfRecord.[type=string].field1",
+ "string",
+ false,
+ false,
+ new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new StringType())));
+ assertSchemaField(
+ schema.getFields().get(4),
+ "[version=2.0].[type=ArrayType].[type=array].[type=ComplexType].arrayOfRecord.[type=int].field2",
+ "int",
+ false,
+ false,
+ new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new NumberType())));
+ assertSchemaField(
+ schema.getFields().get(5),
+ "[version=2.0].[type=ArrayType].[type=array].[type=array].arrayOfArray",
+ "array(string)",
+ false,
+ false,
+ new SchemaFieldDataType()
+ .setType(
+ SchemaFieldDataType.Type.create(
+ new ArrayType().setNestedType(new StringArray("string")))));
+ assertSchemaField(
+ schema.getFields().get(6),
+ "[version=2.0].[type=ArrayType].[type=array].[type=union].arrayOfUnion",
+ "union",
+ false,
+ false,
+ new SchemaFieldDataType()
+ .setType(
+ SchemaFieldDataType.Type.create(
+ new ArrayType().setNestedType(new StringArray("union")))));
+ assertSchemaField(
+ schema.getFields().get(7),
+ "[version=2.0].[type=ArrayType].[type=array].[type=union].[type=string].arrayOfUnion",
+ "string",
+ false,
+ false,
+ new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new StringType())));
+ assertSchemaField(
+ schema.getFields().get(8),
+ "[version=2.0].[type=ArrayType].[type=array].[type=union].[type=int].arrayOfUnion",
+ "int",
+ false,
+ false,
+ new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new NumberType())));
+ assertSchemaField(
+ schema.getFields().get(9),
+ "[version=2.0].[type=ArrayType].[type=array].[type=union].[type=boolean].arrayOfUnion",
+ "boolean",
+ false,
+ false,
+ new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new BooleanType())));
+ assertSchemaField(
+ schema.getFields().get(10),
+ "[version=2.0].[type=ArrayType].[type=array].[type=union].arrayOfNullableString",
+ "union",
+ false,
+ false,
+ new SchemaFieldDataType()
+ .setType(
+ SchemaFieldDataType.Type.create(
+ new ArrayType().setNestedType(new StringArray("union")))));
+ assertSchemaField(
+ schema.getFields().get(11),
+ "[version=2.0].[type=ArrayType].[type=array].[type=union].[type=string].arrayOfNullableString",
+ "string",
+ false,
+ false,
+ new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new StringType())));
+ assertSchemaField(
+ schema.getFields().get(12),
+ "[version=2.0].[type=ArrayType].[type=array].[type=union].arrayOfNullableRecord",
+ "union",
+ false,
+ false,
+ new SchemaFieldDataType()
+ .setType(
+ SchemaFieldDataType.Type.create(
+ new ArrayType().setNestedType(new StringArray("union")))));
+ assertSchemaField(
+ schema.getFields().get(13),
+ "[version=2.0].[type=ArrayType].[type=array].[type=union].[type=ComplexTypeNullable].arrayOfNullableRecord",
+ "ComplexTypeNullable",
+ false,
+ false,
+ new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new RecordType())));
+ assertSchemaField(
+ schema.getFields().get(14),
+ "[version=2.0].[type=ArrayType].[type=array].[type=union].[type=ComplexTypeNullable].arrayOfNullableRecord.[type=string].field1",
+ "string",
+ false,
+ false,
+ new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new StringType())));
+ assertSchemaField(
+ schema.getFields().get(15),
+ "[version=2.0].[type=ArrayType].[type=array].[type=union].[type=ComplexTypeNullable].arrayOfNullableRecord.[type=int].field2",
+ "int",
+ false,
+ false,
+ new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new NumberType())));
+ }
+
+ @Test(groups = "basic")
+ void testComplexStructs() throws IOException {
+ SchemaMetadata schema =
+ avroSchemaConverter.toDataHubSchema(
+ readAvroSchema("complex_structs.avsc"), false, false, dataPlatformUrn, null);
+
+ schema.getFields().forEach(System.out::println);
+
+ assertEquals(schema.getFields().size(), 13);
+
+ assertSchemaField(
+ schema.getFields().get(0),
+ "[version=2.0].[type=StructType].[type=ComplexStruct].structField",
+ "ComplexStruct",
+ false,
+ false,
+ new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new RecordType())));
+ assertSchemaField(
+ schema.getFields().get(1),
+ "[version=2.0].[type=StructType].[type=ComplexStruct].structField.[type=string].fieldString",
+ "string",
+ false,
+ false,
+ new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new StringType())));
+ assertSchemaField(
+ schema.getFields().get(2),
+ "[version=2.0].[type=StructType].[type=ComplexStruct].structField.[type=int].fieldInt",
+ "int",
+ false,
+ false,
+ new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new NumberType())));
+ assertSchemaField(
+ schema.getFields().get(3),
+ "[version=2.0].[type=StructType].[type=ComplexStruct].structField.[type=boolean].fieldBoolean",
+ "boolean",
+ false,
+ false,
+ new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new BooleanType())));
+ assertSchemaField(
+ schema.getFields().get(4),
+ "[version=2.0].[type=StructType].[type=ComplexStruct].structField.[type=map].fieldMap",
+ "map",
+ false,
+ false,
+ new SchemaFieldDataType()
+ .setType(
+ SchemaFieldDataType.Type.create(
+ new MapType().setKeyType("string").setValueType("string"))));
+ assertSchemaField(
+ schema.getFields().get(5),
+ "[version=2.0].[type=StructType].[type=ComplexStruct].structField.[type=NestedRecord].fieldRecord",
+ "NestedRecord",
+ false,
+ false,
+ new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new RecordType())));
+ assertSchemaField(
+ schema.getFields().get(6),
+ "[version=2.0].[type=StructType].[type=ComplexStruct].structField.[type=NestedRecord].fieldRecord.[type=string].nestedField1",
+ "string",
+ false,
+ false,
+ new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new StringType())));
+ assertSchemaField(
+ schema.getFields().get(7),
+ "[version=2.0].[type=StructType].[type=ComplexStruct].structField.[type=NestedRecord].fieldRecord.[type=int].nestedField2",
+ "int",
+ false,
+ false,
+ new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new NumberType())));
+ assertSchemaField(
+ schema.getFields().get(8),
+ "[version=2.0].[type=StructType].[type=ComplexStruct].structField.[type=array].fieldArray",
+ "array(string)",
+ false,
+ false,
+ new SchemaFieldDataType()
+ .setType(
+ SchemaFieldDataType.Type.create(
+ new ArrayType().setNestedType(new StringArray("string")))));
+ assertSchemaField(
+ schema.getFields().get(9),
+ "[version=2.0].[type=StructType].[type=ComplexStruct].structField.[type=union].fieldUnion",
+ "union",
+ true,
+ false,
+ new SchemaFieldDataType()
+ .setType(
+ SchemaFieldDataType.Type.create(
+ new UnionType().setNestedTypes(new StringArray("union")))));
+ assertSchemaField(
+ schema.getFields().get(10),
+ "[version=2.0].[type=StructType].[type=ComplexStruct].structField.[type=union].[type=string].fieldUnion",
+ "string",
+ false,
+ false,
+ new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new StringType())));
+ assertSchemaField(
+ schema.getFields().get(11),
+ "[version=2.0].[type=StructType].[type=ComplexStruct].structField.[type=union].[type=int].fieldUnion",
+ "int",
+ false,
+ false,
+ new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new NumberType())));
+ assertSchemaField(
+ schema.getFields().get(12),
+ "[version=2.0].[type=StructType].[type=ComplexStruct].structField.[type=map].fieldNullableMap",
+ "map",
+ true,
+ false,
+ new SchemaFieldDataType()
+ .setType(
+ SchemaFieldDataType.Type.create(
+ new MapType().setKeyType("string").setValueType("string"))));
+ }
+
+ @Test(groups = "basic")
+ void testComplexUnions() throws IOException {
+ SchemaMetadata schema =
+ avroSchemaConverter.toDataHubSchema(
+ readAvroSchema("complex_unions.avsc"), false, false, dataPlatformUrn, null);
+
+ schema.getFields().forEach(System.out::println);
+
+ assertEquals(schema.getFields().size(), 14);
+
+ assertSchemaField(
+ schema.getFields().get(0),
+ "[version=2.0].[type=UnionType].[type=union].fieldUnionNullablePrimitives",
+ "union",
+ true,
+ false,
+ new SchemaFieldDataType()
+ .setType(
+ SchemaFieldDataType.Type.create(
+ new UnionType().setNestedTypes(new StringArray("union")))));
+ assertSchemaField(
+ schema.getFields().get(1),
+ "[version=2.0].[type=UnionType].[type=union].[type=string].fieldUnionNullablePrimitives",
+ "string",
+ false,
+ false,
+ new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new StringType())));
+ assertSchemaField(
+ schema.getFields().get(2),
+ "[version=2.0].[type=UnionType].[type=union].[type=int].fieldUnionNullablePrimitives",
+ "int",
+ false,
+ false,
+ new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new NumberType())));
+ assertSchemaField(
+ schema.getFields().get(3),
+ "[version=2.0].[type=UnionType].[type=union].[type=boolean].fieldUnionNullablePrimitives",
+ "boolean",
+ false,
+ false,
+ new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new BooleanType())));
+ assertSchemaField(
+ schema.getFields().get(4),
+ "[version=2.0].[type=UnionType].[type=union].fieldUnionComplexTypes",
+ "union",
+ true,
+ false,
+ new SchemaFieldDataType()
+ .setType(
+ SchemaFieldDataType.Type.create(
+ new UnionType().setNestedTypes(new StringArray("union")))));
+ assertSchemaField(
+ schema.getFields().get(5),
+ "[version=2.0].[type=UnionType].[type=union].[type=NestedRecord].fieldUnionComplexTypes",
+ "NestedRecord",
+ false,
+ false,
+ new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new RecordType())));
+ assertSchemaField(
+ schema.getFields().get(6),
+ "[version=2.0].[type=UnionType].[type=union].[type=NestedRecord].fieldUnionComplexTypes.[type=string].nestedField1",
+ "string",
+ false,
+ false,
+ new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new StringType())));
+ assertSchemaField(
+ schema.getFields().get(7),
+ "[version=2.0].[type=UnionType].[type=union].[type=NestedRecord].fieldUnionComplexTypes.[type=int].nestedField2",
+ "int",
+ false,
+ false,
+ new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new NumberType())));
+ assertSchemaField(
+ schema.getFields().get(8),
+ "[version=2.0].[type=UnionType].[type=union].[type=map].fieldUnionComplexTypes",
+ "map",
+ false,
+ false,
+ new SchemaFieldDataType()
+ .setType(
+ SchemaFieldDataType.Type.create(
+ new MapType().setKeyType("string").setValueType("string"))));
+ assertSchemaField(
+ schema.getFields().get(9),
+ "[version=2.0].[type=UnionType].[type=union].fieldUnionPrimitiveAndComplex",
+ "union",
+ true,
+ false,
+ new SchemaFieldDataType()
+ .setType(
+ SchemaFieldDataType.Type.create(
+ new UnionType().setNestedTypes(new StringArray("union")))));
+ assertSchemaField(
+ schema.getFields().get(10),
+ "[version=2.0].[type=UnionType].[type=union].[type=string].fieldUnionPrimitiveAndComplex",
+ "string",
+ false,
+ false,
+ new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new StringType())));
+ assertSchemaField(
+ schema.getFields().get(11),
+ "[version=2.0].[type=UnionType].[type=union].[type=ComplexTypeRecord].fieldUnionPrimitiveAndComplex",
+ "ComplexTypeRecord",
+ false,
+ false,
+ new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new RecordType())));
+ assertSchemaField(
+ schema.getFields().get(12),
+ "[version=2.0].[type=UnionType].[type=union].[type=ComplexTypeRecord].fieldUnionPrimitiveAndComplex.[type=string].complexField1",
+ "string",
+ false,
+ false,
+ new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new StringType())));
+ assertSchemaField(
+ schema.getFields().get(13),
+ "[version=2.0].[type=UnionType].[type=union].[type=ComplexTypeRecord].fieldUnionPrimitiveAndComplex.[type=int].complexField2",
+ "int",
+ false,
+ false,
+ new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new NumberType())));
+ }
+
+ @Test(groups = "basic")
+ void testLogicalTypes() throws IOException {
+ SchemaMetadata schema =
+ avroSchemaConverter.toDataHubSchema(
+ readAvroSchema("logical_types.avsc"), false, false, dataPlatformUrn, null);
+
+ schema.getFields().forEach(System.out::println);
+
+ assertEquals(schema.getFields().size(), 9);
+
+ assertSchemaField(
+ schema.getFields().get(0),
+ "[version=2.0].[type=LogicalTypes].[type=bytes].decimalField",
+ "bytes(decimal)",
+ false,
+ false,
+ new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new NumberType())),
+ "{\"scale\":2,\"logicalType\":\"decimal\",\"precision\":9}");
+ assertSchemaField(
+ schema.getFields().get(1),
+ "[version=2.0].[type=LogicalTypes].[type=bytes].decimalFieldWithoutScale",
+ "bytes(decimal)",
+ false,
+ false,
+ new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new NumberType())),
+ "{\"logicalType\":\"decimal\",\"precision\":9}");
+ assertSchemaField(
+ schema.getFields().get(2),
+ "[version=2.0].[type=LogicalTypes].[type=bytes].decimalFieldWithoutPrecisionAndScale",
+ "bytes",
+ false,
+ false,
+ new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new BytesType())),
+ "{\"logicalType\":\"decimal\"}");
+ assertSchemaField(
+ schema.getFields().get(3),
+ "[version=2.0].[type=LogicalTypes].[type=long].timestampMillisField",
+ "long(timestamp-millis)",
+ false,
+ false,
+ new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new TimeType())),
+ "{\"logicalType\":\"timestamp-millis\"}");
+ assertSchemaField(
+ schema.getFields().get(4),
+ "[version=2.0].[type=LogicalTypes].[type=long].timestampMicrosField",
+ "long(timestamp-micros)",
+ false,
+ false,
+ new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new TimeType())),
+ "{\"logicalType\":\"timestamp-micros\"}");
+ assertSchemaField(
+ schema.getFields().get(5),
+ "[version=2.0].[type=LogicalTypes].[type=int].dateField",
+ "int(date)",
+ false,
+ false,
+ new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new DateType())),
+ "{\"logicalType\":\"date\"}");
+ assertSchemaField(
+ schema.getFields().get(6),
+ "[version=2.0].[type=LogicalTypes].[type=int].timeMillisField",
+ "int(time-millis)",
+ false,
+ false,
+ new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new TimeType())),
+ "{\"logicalType\":\"time-millis\"}");
+ assertSchemaField(
+ schema.getFields().get(7),
+ "[version=2.0].[type=LogicalTypes].[type=long].timeMicrosField",
+ "long(time-micros)",
+ false,
+ false,
+ new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new TimeType())),
+ "{\"logicalType\":\"time-micros\"}");
+ assertSchemaField(
+ schema.getFields().get(8),
+ "[version=2.0].[type=LogicalTypes].[type=string].uuidField",
+ "string(uuid)",
+ false,
+ false,
+ new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new StringType())),
+ "{\"logicalType\":\"uuid\"}");
+ }
+
+ @Test(groups = "basic")
+ void testUsersRecord() throws IOException {
+ // This test case came from the Hudi integration.
+ SchemaMetadata schema =
+ avroSchemaConverter.toDataHubSchema(
+ readAvroSchema("users_record.avsc"), false, false, dataPlatformUrn, null);
+
+ schema.getFields().forEach(System.out::println);
+
+ assertEquals(schema.getFields().size(), 20);
+
+ assertSchemaField(
+ schema.getFields().get(0),
+ "[version=2.0].[type=users_record].[type=string]._hoodie_commit_time",
+ "string",
+ true,
+ false,
+ new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new StringType())));
+ assertSchemaField(
+ schema.getFields().get(1),
+ "[version=2.0].[type=users_record].[type=string]._hoodie_commit_seqno",
+ "string",
+ true,
+ false,
+ new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new StringType())));
+ assertSchemaField(
+ schema.getFields().get(2),
+ "[version=2.0].[type=users_record].[type=string]._hoodie_record_key",
+ "string",
+ true,
+ false,
+ new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new StringType())));
+ assertSchemaField(
+ schema.getFields().get(3),
+ "[version=2.0].[type=users_record].[type=string]._hoodie_partition_path",
+ "string",
+ true,
+ false,
+ new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new StringType())));
+ assertSchemaField(
+ schema.getFields().get(4),
+ "[version=2.0].[type=users_record].[type=string]._hoodie_file_name",
+ "string",
+ true,
+ false,
+ new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new StringType())));
+ assertSchemaField(
+ schema.getFields().get(5),
+ "[version=2.0].[type=users_record].[type=string].user_id",
+ "string",
+ false,
+ false,
+ new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new StringType())));
+ assertSchemaField(
+ schema.getFields().get(6),
+ "[version=2.0].[type=users_record].[type=string].name",
+ "string",
+ true,
+ false,
+ new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new StringType())));
+ assertSchemaField(
+ schema.getFields().get(7),
+ "[version=2.0].[type=users_record].[type=address].address",
+ "address",
+ true,
+ false,
+ new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new RecordType())));
+ assertSchemaField(
+ schema.getFields().get(8),
+ "[version=2.0].[type=users_record].[type=address].address.[type=string].street",
+ "string",
+ true,
+ false,
+ new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new StringType())));
+ assertSchemaField(
+ schema.getFields().get(9),
+ "[version=2.0].[type=users_record].[type=address].address.[type=string].city",
+ "string",
+ true,
+ false,
+ new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new StringType())));
+ assertSchemaField(
+ schema.getFields().get(10),
+ "[version=2.0].[type=users_record].[type=address].address.[type=string].country",
+ "string",
+ true,
+ false,
+ new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new StringType())));
+ assertSchemaField(
+ schema.getFields().get(11),
+ "[version=2.0].[type=users_record].[type=address].address.[type=string].postal_code",
+ "string",
+ true,
+ false,
+ new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new StringType())));
+ assertSchemaField(
+ schema.getFields().get(12),
+ "[version=2.0].[type=users_record].[type=address].address.[type=long].created_at",
+ "long(timestamp-micros)",
+ true,
+ false,
+ new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new TimeType())),
+ "{\"logicalType\":\"timestamp-micros\"}");
+ assertSchemaField(
+ schema.getFields().get(13),
+ "[version=2.0].[type=users_record].[type=contact].contact",
+ "contact",
+ true,
+ false,
+ new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new RecordType())));
+ assertSchemaField(
+ schema.getFields().get(14),
+ "[version=2.0].[type=users_record].[type=contact].contact.[type=string].email",
+ "string",
+ true,
+ false,
+ new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new StringType())));
+ assertSchemaField(
+ schema.getFields().get(15),
+ "[version=2.0].[type=users_record].[type=contact].contact.[type=string].phone",
+ "string",
+ true,
+ false,
+ new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new StringType())));
+ assertSchemaField(
+ schema.getFields().get(16),
+ "[version=2.0].[type=users_record].[type=long].created_at",
+ "long(timestamp-micros)",
+ true,
+ false,
+ new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new TimeType())),
+ "{\"logicalType\":\"timestamp-micros\"}");
+ assertSchemaField(
+ schema.getFields().get(17),
+ "[version=2.0].[type=users_record].[type=long].updated_at",
+ "long(timestamp-micros)",
+ true,
+ false,
+ new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new TimeType())),
+ "{\"logicalType\":\"timestamp-micros\"}");
+ assertSchemaField(
+ schema.getFields().get(18),
+ "[version=2.0].[type=users_record].[type=map].[type=int].props",
+ "int",
+ true,
+ false,
+ new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new NumberType())));
+ assertSchemaField(
+ schema.getFields().get(19),
+ "[version=2.0].[type=users_record].[type=string].country",
+ "string",
+ true,
+ false,
+ new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new StringType())));
+ }
+
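+ /** Overload without jsonProps expectations; delegates to the full assertion below. */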
+ private void assertSchemaField(
+ SchemaField field,
+ String expectedPath,
+ String expectedNativeType,
+ boolean expectedNullable,
+ boolean expectedIsPartOfKey,
+ SchemaFieldDataType expectedType) {
+ assertSchemaField(
+ field,
+ expectedPath,
+ expectedNativeType,
+ expectedNullable,
+ expectedIsPartOfKey,
+ expectedType,
+ null);
+ }
+
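+ /** Full field assertion; jsonProps is only compared when an expected value is provided. */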
+ private void assertSchemaField(
+ SchemaField field,
+ String expectedPath,
+ String expectedNativeType,
+ boolean expectedNullable,
+ boolean expectedIsPartOfKey,
+ SchemaFieldDataType expectedType,
+ String expectedJsonProps) {
+ assertEquals(field.getFieldPath(), expectedPath);
+ assertEquals(field.getNativeDataType(), expectedNativeType);
+ assertEquals(field.isNullable(), expectedNullable);
+ assertEquals(field.isIsPartOfKey(), expectedIsPartOfKey);
+ assertEquals(field.getType(), expectedType);
+ if (expectedJsonProps != null) {
+ assertEquals(field.getJsonProps(), expectedJsonProps);
+ }
+ }
+
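+ /** Loads and parses an Avro schema file from the test resources on the classpath. */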
+ private Schema readAvroSchema(String schemaFileName) throws IOException {
+ String schemaPath = getClass().getClassLoader().getResource(schemaFileName).getPath();
+ File schemaFile = new File(schemaPath);
+ return new Schema.Parser().parse(schemaFile);
+ }
+}
diff --git a/metadata-integration/java/datahub-schematron/lib/src/test/resources/CustomerProfile.avsc b/metadata-integration/java/datahub-schematron/lib/src/test/resources/CustomerProfile.avsc
deleted file mode 100644
index 81f8b0e54b11e0..00000000000000
--- a/metadata-integration/java/datahub-schematron/lib/src/test/resources/CustomerProfile.avsc
+++ /dev/null
@@ -1,456 +0,0 @@
-{
- "type": "record",
- "name": "CustomerProfile",
- "namespace": "com.example.customer",
- "doc": "A complex customer profile schema demonstrating various union types and optional fields",
- "fields": [
- {
- "name": "customerId",
- "type": {
- "type": "string",
- "logicalType": "uuid"
- },
- "doc": "Unique identifier for the customer"
- },
- {
- "name": "identificationDocument",
- "type": [
- "null",
- {
- "type": "record",
- "name": "Passport",
- "fields": [
- {
- "name": "passportNumber",
- "type": "string"
- },
- {
- "name": "expiryDate",
- "type": {
- "type": "long",
- "logicalType": "date"
- }
- }
- ]
- },
- {
- "type": "record",
- "name": "DriversLicense",
- "fields": [
- {
- "name": "licenseNumber",
- "type": "string"
- },
- {
- "name": "state",
- "type": "string"
- },
- {
- "name": "validUntil",
- "type": {
- "type": "long",
- "logicalType": "date"
- }
- }
- ]
- },
- {
- "type": "record",
- "name": "NationalID",
- "fields": [
- {
- "name": "idNumber",
- "type": "string"
- },
- {
- "name": "country",
- "type": "string"
- }
- ]
- }
- ],
- "default": null,
- "doc": "Customer's identification document - can be passport, driver's license, or national ID"
- },
- {
- "name": "contactInfo",
- "type": {
- "type": "record",
- "name": "ContactInformation",
- "fields": [
- {
- "name": "primaryContact",
- "type": [
- {
- "type": "record",
- "name": "EmailContact",
- "fields": [
- {
- "name": "emailAddress",
- "type": "string"
- },
- {
- "name": "isVerified",
- "type": "boolean",
- "default": false
- }
- ]
- },
- {
- "type": "record",
- "name": "PhoneContact",
- "fields": [
- {
- "name": "countryCode",
- "type": "string"
- },
- {
- "name": "number",
- "type": "string"
- },
- {
- "name": "type",
- "type": {
- "type": "enum",
- "name": "PhoneType",
- "symbols": [
- "MOBILE",
- "LANDLINE"
- ]
- }
- }
- ]
- }
- ],
- "doc": "Primary contact method - either email or phone"
- },
- {
- "name": "alternativeContacts",
- "type": {
- "type": "array",
- "items": [
- "null",
- "EmailContact",
- "PhoneContact"
- ]
- },
- "default": [],
- "doc": "List of alternative contact methods"
- }
- ]
- }
- },
- {
- "name": "addresses",
- "type": {
- "type": "array",
- "items": {
- "type": "record",
- "name": "Address",
- "fields": [
- {
- "name": "type",
- "type": {
- "type": "enum",
- "name": "AddressType",
- "symbols": [
- "RESIDENTIAL",
- "BUSINESS",
- "SHIPPING"
- ]
- },
- "default": "RESIDENTIAL"
- },
- {
- "name": "street",
- "type": "string"
- },
- {
- "name": "city",
- "type": "string"
- },
- {
- "name": "state",
- "type": [
- "null",
- "string"
- ],
- "default": null
- },
- {
- "name": "country",
- "type": "string"
- },
- {
- "name": "postalCode",
- "type": [
- "null",
- "string"
- ],
- "default": null
- },
- {
- "name": "validationStatus",
- "type": [
- "null",
- {
- "type": "record",
- "name": "AddressValidation",
- "fields": [
- {
- "name": "isValid",
- "type": "boolean"
- },
- {
- "name": "verificationDate",
- "type": {
- "type": "long",
- "logicalType": "timestamp-millis"
- }
- },
- {
- "name": "verificationMethod",
- "type": {
- "type": "enum",
- "name": "VerificationMethod",
- "symbols": [
- "MANUAL",
- "AUTOMATED"
- ]
- }
- }
- ]
- }
- ],
- "default": null
- }
- ]
- }
- },
- "doc": "Customer's addresses with validation information"
- },
- {
- "name": "preferences",
- "type": {
- "type": "map",
- "values": [
- "null",
- "string",
- "boolean",
- {
- "type": "record",
- "name": "FrequencyPreference",
- "fields": [
- {
- "name": "frequency",
- "type": {
- "type": "enum",
- "name": "Frequency",
- "symbols": [
- "DAILY",
- "WEEKLY",
- "MONTHLY"
- ]
- }
- },
- {
- "name": "enabled",
- "type": "boolean",
- "default": true
- },
- {
- "name": "lastUpdated",
- "type": {
- "type": "long",
- "logicalType": "timestamp-millis"
- }
- }
- ]
- }
- ]
- },
- "doc": "Customer preferences with various possible value types"
- },
- {
- "name": "subscriptionHistory",
- "type": [
- "null",
- {
- "type": "array",
- "items": {
- "type": "record",
- "name": "Subscription",
- "fields": [
- {
- "name": "planName",
- "type": "string"
- },
- {
- "name": "startDate",
- "type": {
- "type": "long",
- "logicalType": "date"
- }
- },
- {
- "name": "endDate",
- "type": [
- "null",
- {
- "type": "long",
- "logicalType": "date"
- }
- ],
- "default": null
- },
- {
- "name": "status",
- "type": {
- "type": "enum",
- "name": "SubscriptionStatus",
- "symbols": [
- "ACTIVE",
- "CANCELLED",
- "EXPIRED",
- "SUSPENDED"
- ]
- }
- },
- {
- "name": "paymentMethod",
- "type": [
- "null",
- {
- "type": "record",
- "name": "PaymentMethod",
- "fields": [
- {
- "name": "type",
- "type": {
- "type": "enum",
- "name": "PaymentType",
- "symbols": [
- "CREDIT_CARD",
- "DEBIT_CARD",
- "BANK_TRANSFER",
- "DIGITAL_WALLET"
- ]
- }
- },
- {
- "name": "lastFourDigits",
- "type": [
- "null",
- "string"
- ],
- "default": null
- },
- {
- "name": "expiryDate",
- "type": [
- "null",
- {
- "type": "long",
- "logicalType": "date"
- }
- ],
- "default": null
- }
- ]
- }
- ],
- "default": null
- }
- ]
- }
- }
- ],
- "default": null,
- "doc": "Historical record of customer subscriptions"
- },
- {
- "name": "metadata",
- "type": {
- "type": "map",
- "values": [
- "null",
- "string",
- "long",
- "boolean",
- {
- "type": "record",
- "name": "MetadataValue",
- "fields": [
- {
- "name": "value",
- "type": [
- "null",
- "string",
- "long",
- "boolean"
- ],
- "default": null
- },
- {
- "name": "timestamp",
- "type": {
- "type": "long",
- "logicalType": "timestamp-millis"
- }
- },
- {
- "name": "source",
- "type": "string"
- }
- ]
- }
- ]
- },
- "doc": "Flexible metadata storage with various possible value types"
- },
- {
- "name": "tags",
- "type": [
- "null",
- {
- "type": "array",
- "items": {
- "type": "record",
- "name": "Tag",
- "fields": [
- {
- "name": "name",
- "type": "string"
- },
- {
- "name": "value",
- "type": [
- "null",
- "string"
- ],
- "default": null
- },
- {
- "name": "score",
- "type": [
- "null",
- "double"
- ],
- "default": null
- },
- {
- "name": "addedAt",
- "type": {
- "type": "long",
- "logicalType": "timestamp-millis"
- }
- }
- ]
- }
- }
- ],
- "default": null,
- "doc": "Optional tags associated with the customer profile"
- }
- ]
-}
\ No newline at end of file
diff --git a/metadata-integration/java/datahub-schematron/lib/src/test/resources/CustomerProfile2.avsc b/metadata-integration/java/datahub-schematron/lib/src/test/resources/CustomerProfile2.avsc
deleted file mode 100644
index b8c7654ea072a2..00000000000000
--- a/metadata-integration/java/datahub-schematron/lib/src/test/resources/CustomerProfile2.avsc
+++ /dev/null
@@ -1,244 +0,0 @@
-{
- "type": "record",
- "name": "CustomerProfile2",
- "namespace": "com.example.customer",
- "doc": "A complex customer profile schema demonstrating various union types and optional fields",
- "fields": [
- {
- "name": "customerId",
- "type": {
- "type": "string",
- "logicalType": "uuid"
- },
- "doc": "Unique identifier for the customer"
- },
- {
- "name": "identificationDocument",
- "type": [
- "null",
- {
- "type": "record",
- "name": "Passport",
- "fields": [
- {
- "name": "passportNumber",
- "type": "string"
- },
- {
- "name": "expiryDate",
- "type": {
- "type": "long",
- "logicalType": "date"
- }
- }
- ]
- },
- {
- "type": "record",
- "name": "DriversLicense",
- "fields": [
- {
- "name": "licenseNumber",
- "type": "string"
- },
- {
- "name": "state",
- "type": "string"
- },
- {
- "name": "validUntil",
- "type": {
- "type": "long",
- "logicalType": "date"
- }
- }
- ]
- },
- {
- "type": "record",
- "name": "NationalID",
- "fields": [
- {
- "name": "idNumber",
- "type": "string"
- },
- {
- "name": "country",
- "type": "string"
- }
- ]
- }
- ],
- "default": null,
- "doc": "Customer's identification document"
- },
- {
- "name": "contactInfo",
- "type": {
- "type": "record",
- "name": "ContactInformation",
- "fields": [
- {
- "name": "primaryEmailContact",
- "type": [
- "null",
- {
- "type": "record",
- "name": "PrimaryEmailContact",
- "fields": [
- {
- "name": "emailAddress",
- "type": "string"
- },
- {
- "name": "isVerified",
- "type": "boolean",
- "default": false
- }
- ]
- }
- ],
- "default": null
- },
- {
- "name": "primaryPhoneContact",
- "type": [
- "null",
- {
- "type": "record",
- "name": "PrimaryPhoneContact",
- "fields": [
- {
- "name": "countryCode",
- "type": "string"
- },
- {
- "name": "number",
- "type": "string"
- },
- {
- "name": "type",
- "type": {
- "type": "enum",
- "name": "PhoneType",
- "symbols": [
- "MOBILE",
- "LANDLINE"
- ]
- }
- }
- ]
- }
- ],
- "default": null
- },
- {
- "name": "alternativeEmailContacts",
- "type": {
- "type": "array",
- "items": {
- "type": "record",
- "name": "AlternativeEmailContact",
- "fields": [
- {
- "name": "emailAddress",
- "type": "string"
- },
- {
- "name": "isVerified",
- "type": "boolean",
- "default": false
- }
- ]
- }
- },
- "default": []
- },
- {
- "name": "alternativePhoneContacts",
- "type": {
- "type": "array",
- "items": {
- "type": "record",
- "name": "AlternativePhoneContact",
- "fields": [
- {
- "name": "countryCode",
- "type": "string"
- },
- {
- "name": "number",
- "type": "string"
- },
- {
- "name": "type",
- "type": "PhoneType"
- }
- ]
- }
- },
- "default": []
- }
- ]
- }
- },
- {
- "name": "preferences",
- "type": {
- "type": "record",
- "name": "Preferences",
- "fields": [
- {
- "name": "simplePreferences",
- "type": {
- "type": "map",
- "values": [
- "null",
- "string",
- "boolean"
- ]
- },
- "default": {}
- },
- {
- "name": "frequencyPreferences",
- "type": {
- "type": "map",
- "values": {
- "type": "record",
- "name": "FrequencyPreference",
- "fields": [
- {
- "name": "frequency",
- "type": {
- "type": "enum",
- "name": "Frequency",
- "symbols": [
- "DAILY",
- "WEEKLY",
- "MONTHLY"
- ]
- }
- },
- {
- "name": "enabled",
- "type": "boolean",
- "default": true
- },
- {
- "name": "lastUpdated",
- "type": {
- "type": "long",
- "logicalType": "timestamp-millis"
- }
- }
- ]
- }
- },
- "default": {}
- }
- ]
- }
- }
- ]
-}
\ No newline at end of file
diff --git a/metadata-integration/java/datahub-schematron/lib/src/test/resources/FlatUser.avsc b/metadata-integration/java/datahub-schematron/lib/src/test/resources/FlatUser.avsc
deleted file mode 100644
index c796878c32ae41..00000000000000
--- a/metadata-integration/java/datahub-schematron/lib/src/test/resources/FlatUser.avsc
+++ /dev/null
@@ -1,45 +0,0 @@
-{
- "type": "record",
- "name": "FlatUser",
- "namespace": "com.example",
- "fields": [
- {
- "name": "id",
- "type": "int",
- "doc": "The unique identifier for a user",
- "default": -1,
- "metadata": {
- "key1": "value1",
- "key2": "value2"
- }
- },
- {
- "name": "username",
- "type": "string",
- "doc": "The username of the user"
- },
- {
- "name": "email",
- "type": "string",
- "doc": "The email of the user"
- },
- {
- "name": "age",
- "type": "int",
- "doc": "The age of the user"
- },
- {
- "name": "isActive",
- "type": "boolean",
- "doc": "Whether the user is active or not"
- },
- {
- "name": "registrationDate",
- "type": {
- "type": "long",
- "logicalType": "timestamp-millis"
- },
- "doc": "The registration date of the user"
- }
- ]
-}
\ No newline at end of file
diff --git a/metadata-integration/java/datahub-schematron/lib/src/test/resources/complex_arrays.avsc b/metadata-integration/java/datahub-schematron/lib/src/test/resources/complex_arrays.avsc
new file mode 100644
index 00000000000000..8e8bcdaa0a7dce
--- /dev/null
+++ b/metadata-integration/java/datahub-schematron/lib/src/test/resources/complex_arrays.avsc
@@ -0,0 +1,87 @@
+{
+ "type": "record",
+ "name": "ArrayType",
+ "fields": [
+ {
+ "name": "arrayOfString",
+ "type": {
+ "type": "array",
+ "items": "string"
+ }
+ },
+ {
+ "name": "arrayOfMap",
+ "type": {
+ "type": "array",
+ "items": {
+ "type": "map",
+ "values": "string"
+ }
+ }
+ },
+ {
+ "name": "arrayOfRecord",
+ "type": {
+ "type": "array",
+ "items": {
+ "type": "record",
+ "name": "ComplexType",
+ "fields": [
+ {
+ "name": "field1",
+ "type": "string"
+ },
+ {
+ "name": "field2",
+ "type": "int"
+ }
+ ]
+ }
+ }
+ },
+ {
+ "name": "arrayOfArray",
+ "type": {
+ "type": "array",
+ "items": {
+ "type": "array",
+ "items": "string"
+ }
+ }
+ },
+ {
+ "name": "arrayOfUnion",
+ "type": {
+ "type": "array",
+ "items": ["string", "int", "boolean"]
+ }
+ },
+ {
+ "name": "arrayOfNullableString",
+ "type": {
+ "type": "array",
+ "items": ["null", "string"]
+ }
+ },
+ {
+ "name": "arrayOfNullableRecord",
+ "type": {
+ "type": "array",
+ "items": ["null", {
+ "type": "record",
+ "name": "ComplexTypeNullable",
+ "fields": [
+ {
+ "name": "field1",
+ "type": "string"
+ },
+ {
+ "name": "field2",
+ "type": "int"
+ }
+ ]
+ }]
+ }
+ }
+ ]
+}
\ No newline at end of file
diff --git a/metadata-integration/java/datahub-schematron/lib/src/test/resources/complex_maps.avsc b/metadata-integration/java/datahub-schematron/lib/src/test/resources/complex_maps.avsc
new file mode 100644
index 00000000000000..baedae1b9dcc15
--- /dev/null
+++ b/metadata-integration/java/datahub-schematron/lib/src/test/resources/complex_maps.avsc
@@ -0,0 +1,87 @@
+{
+ "type": "record",
+ "name": "MapType",
+ "fields": [
+ {
+ "name": "mapOfString",
+ "type": {
+ "type": "map",
+ "values": "string"
+ }
+ },
+ {
+ "name": "mapOfComplexType",
+ "type": {
+ "type": "map",
+ "values": {
+ "type": "record",
+ "name": "ComplexType",
+ "fields": [
+ {
+ "name": "field1",
+ "type": "string"
+ },
+ {
+ "name": "field2",
+ "type": "int"
+ }
+ ]
+ }
+ }
+ },
+ {
+ "name": "mapOfNullableString",
+ "type": {
+ "type": "map",
+ "values": ["null", "string"]
+ }
+ },
+ {
+ "name": "mapOfNullableComplexType",
+ "type": {
+ "type": "map",
+ "values": ["null", {
+ "type": "record",
+ "name": "ComplexTypeNullable",
+ "fields": [
+ {
+ "name": "field1",
+ "type": "string"
+ },
+ {
+ "name": "field2",
+ "type": "int"
+ }
+ ]
+ }]
+ }
+ },
+ {
+ "name": "mapOfArray",
+ "type": {
+ "type": "map",
+ "values": {
+ "type": "array",
+ "items": "string"
+ }
+ }
+ },
+ {
+ "name": "mapOfMap",
+ "type": {
+ "type": "map",
+ "values": {
+ "type": "map",
+ "values": "int"
+ }
+ }
+ },
+ {
+ "name": "mapOfUnion",
+ "type": {
+ "type": "map",
+ "values": ["null", "string", "int"]
+ }
+ }
+ ]
+}
\ No newline at end of file
diff --git a/metadata-integration/java/datahub-schematron/lib/src/test/resources/complex_structs.avsc b/metadata-integration/java/datahub-schematron/lib/src/test/resources/complex_structs.avsc
new file mode 100644
index 00000000000000..7f5824192d3062
--- /dev/null
+++ b/metadata-integration/java/datahub-schematron/lib/src/test/resources/complex_structs.avsc
@@ -0,0 +1,76 @@
+{
+ "type": "record",
+ "name": "StructType",
+ "fields": [
+ {
+ "name": "structField",
+ "type": {
+ "type": "record",
+ "name": "ComplexStruct",
+ "fields": [
+ {
+ "name": "fieldString",
+ "type": "string"
+ },
+ {
+ "name": "fieldInt",
+ "type": "int"
+ },
+ {
+ "name": "fieldBoolean",
+ "type": "boolean"
+ },
+ {
+ "name": "fieldMap",
+ "type": {
+ "type": "map",
+ "values": "string"
+ }
+ },
+ {
+ "name": "fieldRecord",
+ "type": {
+ "type": "record",
+ "name": "NestedRecord",
+ "fields": [
+ {
+ "name": "nestedField1",
+ "type": "string"
+ },
+ {
+ "name": "nestedField2",
+ "type": "int"
+ }
+ ]
+ }
+ },
+ {
+ "name": "fieldArray",
+ "type": {
+ "type": "array",
+ "items": "string"
+ }
+ },
+ {
+ "name": "fieldUnion",
+ "type": [
+ "null",
+ "string",
+ "int"
+ ]
+ },
+ {
+ "name": "fieldNullableMap",
+ "type": [
+ "null",
+ {
+ "type": "map",
+ "values": "string"
+ }
+ ]
+ }
+ ]
+ }
+ }
+ ]
+}
\ No newline at end of file
diff --git a/metadata-integration/java/datahub-schematron/lib/src/test/resources/complex_unions.avsc b/metadata-integration/java/datahub-schematron/lib/src/test/resources/complex_unions.avsc
new file mode 100644
index 00000000000000..1a35f1cfa0e6d6
--- /dev/null
+++ b/metadata-integration/java/datahub-schematron/lib/src/test/resources/complex_unions.avsc
@@ -0,0 +1,60 @@
+{
+ "type": "record",
+ "name": "UnionType",
+ "fields": [
+ {
+ "name": "fieldUnionNullablePrimitives",
+ "type": [
+ "null",
+ "string",
+ "int",
+ "boolean"
+ ]
+ },
+ {
+ "name": "fieldUnionComplexTypes",
+ "type": [
+ "null",
+ {
+ "type": "record",
+ "name": "NestedRecord",
+ "fields": [
+ {
+ "name": "nestedField1",
+ "type": "string"
+ },
+ {
+ "name": "nestedField2",
+ "type": "int"
+ }
+ ]
+ },
+ {
+ "type": "map",
+ "values": "string"
+ }
+ ]
+ },
+ {
+ "name": "fieldUnionPrimitiveAndComplex",
+ "type": [
+ "null",
+ "string",
+ {
+ "type": "record",
+ "name": "ComplexTypeRecord",
+ "fields": [
+ {
+ "name": "complexField1",
+ "type": "string"
+ },
+ {
+ "name": "complexField2",
+ "type": "int"
+ }
+ ]
+ }
+ ]
+ }
+ ]
+}
\ No newline at end of file
diff --git a/metadata-integration/java/datahub-schematron/lib/src/test/resources/logical_types.avsc b/metadata-integration/java/datahub-schematron/lib/src/test/resources/logical_types.avsc
new file mode 100644
index 00000000000000..24919d82149653
--- /dev/null
+++ b/metadata-integration/java/datahub-schematron/lib/src/test/resources/logical_types.avsc
@@ -0,0 +1,72 @@
+{
+ "type": "record",
+ "name": "LogicalTypes",
+ "fields": [
+ {
+ "name": "decimalField",
+ "type": {
+ "type": "bytes",
+ "logicalType": "decimal",
+ "precision": 9,
+ "scale": 2
+ }
+ },
+ {
+ "name": "decimalFieldWithoutScale",
+ "type": {
+ "type": "bytes",
+ "logicalType": "decimal",
+ "precision": 9
+ }
+ },
+ {
+ "name": "decimalFieldWithoutPrecisionAndScale",
+ "type": {
+ "type": "bytes",
+ "logicalType": "decimal"
+ }
+ },
+ {
+ "name": "timestampMillisField",
+ "type": {
+ "type": "long",
+ "logicalType": "timestamp-millis"
+ }
+ },
+ {
+ "name": "timestampMicrosField",
+ "type": {
+ "type": "long",
+ "logicalType": "timestamp-micros"
+ }
+ },
+ {
+ "name": "dateField",
+ "type": {
+ "type": "int",
+ "logicalType": "date"
+ }
+ },
+ {
+ "name": "timeMillisField",
+ "type": {
+ "type": "int",
+ "logicalType": "time-millis"
+ }
+ },
+ {
+ "name": "timeMicrosField",
+ "type": {
+ "type": "long",
+ "logicalType": "time-micros"
+ }
+ },
+ {
+ "name": "uuidField",
+ "type": {
+ "type": "string",
+ "logicalType": "uuid"
+ }
+ }
+ ]
+}
\ No newline at end of file
diff --git a/metadata-integration/java/datahub-schematron/lib/src/test/resources/primitive_types.avsc b/metadata-integration/java/datahub-schematron/lib/src/test/resources/primitive_types.avsc
new file mode 100644
index 00000000000000..c618299748fab1
--- /dev/null
+++ b/metadata-integration/java/datahub-schematron/lib/src/test/resources/primitive_types.avsc
@@ -0,0 +1,62 @@
+{
+ "type": "record",
+ "name": "PrimitiveType",
+ "fields": [
+ {
+ "name": "intField",
+ "type": "int"
+ },
+ {
+ "name": "intFieldV2",
+ "type": ["int"]
+ },
+ {
+ "name": "nullField",
+ "type": "null"
+ },
+ {
+ "name": "nullFieldV2",
+ "type": ["null"]
+ },
+ {
+ "name": "longField",
+ "type": "long"
+ },
+ {
+ "name": "floatField",
+ "type": "float"
+ },
+ {
+ "name": "doubleField",
+ "type": "double"
+ },
+ {
+ "name": "stringField",
+ "type": "string"
+ },
+ {
+ "name": "booleanField",
+ "type": "boolean"
+ },
+ {
+ "name": "nullableIntField",
+ "type": ["null", "int"]
+ },
+ {
+ "name": "nullableLongField",
+ "type": ["null", "long"]
+ },
+ {
+ "name": "nullableStringField",
+ "type": ["null", "string"]
+ },
+ {
+ "name": "status",
+ "type": {
+ "type": "enum",
+ "name": "StatusEnum",
+ "symbols": ["ACTIVE", "INACTIVE", "PENDING"]
+ }
+ }
+ ]
+}
\ No newline at end of file
diff --git a/metadata-integration/java/datahub-schematron/lib/src/test/resources/users_record.avsc b/metadata-integration/java/datahub-schematron/lib/src/test/resources/users_record.avsc
new file mode 100644
index 00000000000000..bd46ae715a4810
--- /dev/null
+++ b/metadata-integration/java/datahub-schematron/lib/src/test/resources/users_record.avsc
@@ -0,0 +1,195 @@
+{
+ "type": "record",
+ "name": "users_record",
+ "namespace": "hoodie.users",
+ "fields": [
+ {
+ "name": "_hoodie_commit_time",
+ "type": [
+ "null",
+ "string"
+ ],
+ "doc": "",
+ "default": null
+ },
+ {
+ "name": "_hoodie_commit_seqno",
+ "type": [
+ "null",
+ "string"
+ ],
+ "doc": "",
+ "default": null
+ },
+ {
+ "name": "_hoodie_record_key",
+ "type": [
+ "null",
+ "string"
+ ],
+ "doc": "",
+ "default": null
+ },
+ {
+ "name": "_hoodie_partition_path",
+ "type": [
+ "null",
+ "string"
+ ],
+ "doc": "",
+ "default": null
+ },
+ {
+ "name": "_hoodie_file_name",
+ "type": [
+ "null",
+ "string"
+ ],
+ "doc": "",
+ "default": null
+ },
+ {
+ "name": "user_id",
+ "type": "string"
+ },
+ {
+ "name": "name",
+ "type": [
+ "null",
+ "string"
+ ],
+ "default": null
+ },
+ {
+ "name": "address",
+ "type": [
+ "null",
+ {
+ "type": "record",
+ "name": "address",
+ "namespace": "hoodie.users.users_record",
+ "fields": [
+ {
+ "name": "street",
+ "type": [
+ "null",
+ "string"
+ ],
+ "default": null
+ },
+ {
+ "name": "city",
+ "type": [
+ "null",
+ "string"
+ ],
+ "default": null
+ },
+ {
+ "name": "country",
+ "type": [
+ "null",
+ "string"
+ ],
+ "default": null
+ },
+ {
+ "name": "postal_code",
+ "type": [
+ "null",
+ "string"
+ ],
+ "default": null
+ },
+ {
+ "name": "created_at",
+ "type": [
+ "null",
+ {
+ "type": "long",
+ "logicalType": "timestamp-micros"
+ }
+ ],
+ "default": null
+ }
+ ]
+ }
+ ],
+ "default": null
+ },
+ {
+ "name": "contact",
+ "type": [
+ "null",
+ {
+ "type": "record",
+ "name": "contact",
+ "namespace": "hoodie.users.users_record",
+ "fields": [
+ {
+ "name": "email",
+ "type": [
+ "null",
+ "string"
+ ],
+ "default": null
+ },
+ {
+ "name": "phone",
+ "type": [
+ "null",
+ "string"
+ ],
+ "default": null
+ }
+ ]
+ }
+ ],
+ "default": null
+ },
+ {
+ "name": "created_at",
+ "type": [
+ "null",
+ {
+ "type": "long",
+ "logicalType": "timestamp-micros"
+ }
+ ],
+ "default": null
+ },
+ {
+ "name": "updated_at",
+ "type": [
+ "null",
+ {
+ "type": "long",
+ "logicalType": "timestamp-micros"
+ }
+ ],
+ "default": null
+ },
+ {
+ "name": "props",
+ "type": [
+ "null",
+ {
+ "type": "map",
+ "values": [
+ "null",
+ "int"
+ ]
+ }
+ ],
+ "default": null
+ },
+ {
+ "name": "country",
+ "type": [
+ "null",
+ "string"
+ ],
+ "default": null
+ }
+ ]
+}
\ No newline at end of file
diff --git a/metadata-io/metadata-io-api/src/test/java/com/linkedin/metadata/entity/ebean/batch/AspectsBatchImplTest.java b/metadata-io/metadata-io-api/src/test/java/com/linkedin/metadata/entity/ebean/batch/AspectsBatchImplTest.java
index 9f57d36f800de3..a3099b9ee21ea4 100644
--- a/metadata-io/metadata-io-api/src/test/java/com/linkedin/metadata/entity/ebean/batch/AspectsBatchImplTest.java
+++ b/metadata-io/metadata-io-api/src/test/java/com/linkedin/metadata/entity/ebean/batch/AspectsBatchImplTest.java
@@ -16,7 +16,7 @@
import com.linkedin.data.schema.annotation.PathSpecBasedSchemaAnnotationVisitor;
import com.linkedin.dataset.DatasetProperties;
import com.linkedin.events.metadata.ChangeType;
-import com.linkedin.metadata.aspect.AspectRetriever;
+import com.linkedin.metadata.aspect.CachingAspectRetriever;
import com.linkedin.metadata.aspect.GraphRetriever;
import com.linkedin.metadata.aspect.batch.MCPItem;
import com.linkedin.metadata.aspect.patch.GenericJsonPatch;
@@ -56,7 +56,7 @@
public class AspectsBatchImplTest {
private EntityRegistry testRegistry;
- private AspectRetriever mockAspectRetriever;
+ private CachingAspectRetriever mockAspectRetriever;
private RetrieverContext retrieverContext;
@BeforeTest
@@ -75,12 +75,12 @@ public void beforeTest() throws EntityRegistryException {
@BeforeMethod
public void setup() {
- this.mockAspectRetriever = mock(AspectRetriever.class);
+ this.mockAspectRetriever = mock(CachingAspectRetriever.class);
when(this.mockAspectRetriever.getEntityRegistry()).thenReturn(testRegistry);
this.retrieverContext =
RetrieverContext.builder()
.searchRetriever(mock(SearchRetriever.class))
- .aspectRetriever(mockAspectRetriever)
+ .cachingAspectRetriever(mockAspectRetriever)
.graphRetriever(mock(GraphRetriever.class))
.build();
}
diff --git a/metadata-io/src/main/java/com/linkedin/metadata/aspect/utils/DefaultAspectsUtil.java b/metadata-io/src/main/java/com/linkedin/metadata/aspect/utils/DefaultAspectsUtil.java
index 99eadd223acd1a..82bc0ae1409c52 100644
--- a/metadata-io/src/main/java/com/linkedin/metadata/aspect/utils/DefaultAspectsUtil.java
+++ b/metadata-io/src/main/java/com/linkedin/metadata/aspect/utils/DefaultAspectsUtil.java
@@ -137,7 +137,7 @@ public static List getAdditionalChanges(
getProposalFromAspectForDefault(
entry.getKey(), entry.getValue(), entityKeyAspect, templateItem),
templateItem.getAuditStamp(),
- opContext.getAspectRetrieverOpt().get()))
+ opContext.getAspectRetriever()))
.filter(Objects::nonNull);
})
.collect(Collectors.toList());
diff --git a/metadata-io/src/main/java/com/linkedin/metadata/client/EntityClientAspectRetriever.java b/metadata-io/src/main/java/com/linkedin/metadata/client/EntityClientAspectRetriever.java
index bba8324d0c5612..669ec751f87c69 100644
--- a/metadata-io/src/main/java/com/linkedin/metadata/client/EntityClientAspectRetriever.java
+++ b/metadata-io/src/main/java/com/linkedin/metadata/client/EntityClientAspectRetriever.java
@@ -35,7 +35,7 @@ public EntityRegistry getEntityRegistry() {
@Override
public Aspect getLatestAspectObject(@Nonnull Urn urn, @Nonnull String aspectName) {
try {
- return entityClient.getLatestAspectObject(systemOperationContext, urn, aspectName);
+ return entityClient.getLatestAspectObject(systemOperationContext, urn, aspectName, false);
} catch (RemoteInvocationException | URISyntaxException e) {
throw new RuntimeException(e);
}
@@ -49,7 +49,7 @@ public Map> getLatestAspectObjects(
return Map.of();
} else {
try {
- return entityClient.getLatestAspects(systemOperationContext, urns, aspectNames);
+ return entityClient.getLatestAspects(systemOperationContext, urns, aspectNames, false);
} catch (RemoteInvocationException | URISyntaxException e) {
throw new RuntimeException(e);
}
@@ -70,7 +70,8 @@ public Map> getLatestSystemAspects(
urnAspectNames.keySet(),
urnAspectNames.values().stream()
.flatMap(Collection::stream)
- .collect(Collectors.toSet()));
+ .collect(Collectors.toSet()),
+ false);
} catch (RemoteInvocationException | URISyntaxException e) {
throw new RuntimeException(e);
}
diff --git a/metadata-io/src/main/java/com/linkedin/metadata/client/JavaEntityClient.java b/metadata-io/src/main/java/com/linkedin/metadata/client/JavaEntityClient.java
index 29faa3955ea662..3d35f5956b0f4f 100644
--- a/metadata-io/src/main/java/com/linkedin/metadata/client/JavaEntityClient.java
+++ b/metadata-io/src/main/java/com/linkedin/metadata/client/JavaEntityClient.java
@@ -106,11 +106,17 @@ public EntityResponse getV2(
@Nonnull OperationContext opContext,
@Nonnull String entityName,
@Nonnull final Urn urn,
- @Nullable final Set aspectNames)
+ @Nullable final Set aspectNames,
+ @Nullable Boolean alwaysIncludeKeyAspect)
throws RemoteInvocationException, URISyntaxException {
final Set projectedAspects =
aspectNames == null ? opContext.getEntityAspectNames(entityName) : aspectNames;
- return entityService.getEntityV2(opContext, entityName, urn, projectedAspects);
+ return entityService.getEntityV2(
+ opContext,
+ entityName,
+ urn,
+ projectedAspects,
+ alwaysIncludeKeyAspect == null || alwaysIncludeKeyAspect);
}
@Override
@@ -126,7 +132,8 @@ public Map batchGetV2(
@Nonnull OperationContext opContext,
@Nonnull String entityName,
@Nonnull Set urns,
- @Nullable Set aspectNames)
+ @Nullable Set aspectNames,
+ @Nullable Boolean alwaysIncludeKeyAspect)
throws RemoteInvocationException, URISyntaxException {
final Set projectedAspects =
aspectNames == null ? opContext.getEntityAspectNames(entityName) : aspectNames;
@@ -139,7 +146,11 @@ public Map batchGetV2(
try {
responseMap.putAll(
entityService.getEntitiesV2(
- opContext, entityName, new HashSet<>(batch), projectedAspects));
+ opContext,
+ entityName,
+ new HashSet<>(batch),
+ projectedAspects,
+ alwaysIncludeKeyAspect == null || alwaysIncludeKeyAspect));
} catch (URISyntaxException e) {
throw new RuntimeException(e);
}
@@ -772,7 +783,7 @@ public List batchIngestProposals(
.mcps(
batch,
auditStamp,
- opContext.getRetrieverContext().get(),
+ opContext.getRetrieverContext(),
opContext.getValidationContext().isAlternateValidation())
.build();
diff --git a/metadata-io/src/main/java/com/linkedin/metadata/client/SystemJavaEntityClient.java b/metadata-io/src/main/java/com/linkedin/metadata/client/SystemJavaEntityClient.java
index eda9b3a880228f..1d2fd422d7f460 100644
--- a/metadata-io/src/main/java/com/linkedin/metadata/client/SystemJavaEntityClient.java
+++ b/metadata-io/src/main/java/com/linkedin/metadata/client/SystemJavaEntityClient.java
@@ -89,6 +89,6 @@ public Map batchGetV2NoCache(
@Nonnull Set urns,
@Nullable Set aspectNames)
throws RemoteInvocationException, URISyntaxException {
- return super.batchGetV2(opContext, entityName, urns, aspectNames);
+ return super.batchGetV2(opContext, entityName, urns, aspectNames, false);
}
}
diff --git a/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityServiceAspectRetriever.java b/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityServiceAspectRetriever.java
index 626a1f72f5fb73..50cf8af30d606a 100644
--- a/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityServiceAspectRetriever.java
+++ b/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityServiceAspectRetriever.java
@@ -5,7 +5,7 @@
import com.linkedin.common.urn.Urn;
import com.linkedin.entity.Aspect;
-import com.linkedin.metadata.aspect.CachingAspectRetriever;
+import com.linkedin.metadata.aspect.AspectRetriever;
import com.linkedin.metadata.aspect.SystemAspect;
import com.linkedin.metadata.models.registry.EntityRegistry;
import io.datahubproject.metadata.context.OperationContext;
@@ -22,7 +22,7 @@
@Getter
@Builder
-public class EntityServiceAspectRetriever implements CachingAspectRetriever {
+public class EntityServiceAspectRetriever implements AspectRetriever {
@Setter private OperationContext systemOperationContext;
private final EntityRegistry entityRegistry;
@@ -46,7 +46,8 @@ public Map> getLatestAspectObjects(
String entityName = urns.stream().findFirst().map(Urn::getEntityType).get();
try {
return entityResponseToAspectMap(
- entityService.getEntitiesV2(systemOperationContext, entityName, urns, aspectNames));
+ entityService.getEntitiesV2(
+ systemOperationContext, entityName, urns, aspectNames, false));
} catch (URISyntaxException e) {
throw new RuntimeException(e);
}
@@ -71,7 +72,8 @@ public Map> getLatestSystemAspects(
urnAspectNames.keySet(),
urnAspectNames.values().stream()
.flatMap(Collection::stream)
- .collect(Collectors.toSet())),
+ .collect(Collectors.toSet()),
+ false),
entityRegistry);
} catch (URISyntaxException e) {
throw new RuntimeException(e);
diff --git a/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityServiceImpl.java b/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityServiceImpl.java
index 6de7784bfbc0ec..8ae09111204cab 100644
--- a/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityServiceImpl.java
+++ b/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityServiceImpl.java
@@ -261,8 +261,7 @@ public Map> getLatestAspects(
}
List systemAspects =
- EntityUtils.toSystemAspects(
- opContext.getRetrieverContext().get(), batchGetResults.values());
+ EntityUtils.toSystemAspects(opContext.getRetrieverContext(), batchGetResults.values());
systemAspects.stream()
// for now, don't add the key aspect here; we have already added it above
@@ -290,8 +289,7 @@ public Map getLatestAspectsForUrn(
Map batchGetResults =
getLatestAspect(opContext, new HashSet<>(Arrays.asList(urn)), aspectNames, forUpdate);
- return EntityUtils.toSystemAspects(
- opContext.getRetrieverContext().get(), batchGetResults.values())
+ return EntityUtils.toSystemAspects(opContext.getRetrieverContext(), batchGetResults.values())
.stream()
.map(
systemAspect -> Pair.of(systemAspect.getAspectName(), systemAspect.getRecordTemplate()))
@@ -335,7 +333,7 @@ public Pair getAspectVersionPair(
final Optional maybeAspect = Optional.ofNullable(aspectDao.getAspect(primaryKey));
return Pair.of(
- EntityUtils.toSystemAspect(opContext.getRetrieverContext().get(), maybeAspect.orElse(null))
+ EntityUtils.toSystemAspect(opContext.getRetrieverContext(), maybeAspect.orElse(null))
.map(SystemAspect::getRecordTemplate)
.orElse(null),
version);
@@ -721,7 +719,7 @@ public ListResult listLatestAspects(
}
return new ListResult<>(
- EntityUtils.toSystemAspects(opContext.getRetrieverContext().get(), entityAspects).stream()
+ EntityUtils.toSystemAspects(opContext.getRetrieverContext(), entityAspects).stream()
.map(SystemAspect::getRecordTemplate)
.collect(Collectors.toList()),
aspectMetadataList.getMetadata(),
@@ -758,12 +756,12 @@ public List ingestAspects(
.recordTemplate(pair.getValue())
.systemMetadata(systemMetadata)
.auditStamp(auditStamp)
- .build(opContext.getAspectRetrieverOpt().get()))
+ .build(opContext.getAspectRetriever()))
.collect(Collectors.toList());
return ingestAspects(
opContext,
AspectsBatchImpl.builder()
- .retrieverContext(opContext.getRetrieverContext().get())
+ .retrieverContext(opContext.getRetrieverContext())
.items(items)
.build(),
true,
@@ -815,13 +813,13 @@ private void processPostCommitMCLSideEffects(
log.debug("Considering {} MCLs post commit side effects.", mcls.size());
List batch =
mcls.stream()
- .map(mcl -> MCLItemImpl.builder().build(mcl, opContext.getAspectRetrieverOpt().get()))
+ .map(mcl -> MCLItemImpl.builder().build(mcl, opContext.getAspectRetriever()))
.collect(Collectors.toList());
Iterable> iterable =
() ->
Iterators.partition(
- AspectsBatch.applyPostMCPSideEffects(batch, opContext.getRetrieverContext().get())
+ AspectsBatch.applyPostMCPSideEffects(batch, opContext.getRetrieverContext())
.iterator(),
MCP_SIDE_EFFECT_KAFKA_BATCH_SIZE);
StreamSupport.stream(iterable.spliterator(), false)
@@ -831,7 +829,7 @@ private void processPostCommitMCLSideEffects(
ingestProposalAsync(
AspectsBatchImpl.builder()
.items(sideEffects)
- .retrieverContext(opContext.getRetrieverContext().get())
+ .retrieverContext(opContext.getRetrieverContext())
.build())
.count();
log.info("Generated {} MCP SideEffects for async processing", count);
@@ -879,8 +877,7 @@ private List ingestAspectsToLocalDB(
aspectDao.getLatestAspects(urnAspects, true);
final Map> batchAspects =
- EntityUtils.toSystemAspects(
- opContext.getRetrieverContext().get(), databaseAspects);
+ EntityUtils.toSystemAspects(opContext.getRetrieverContext(), databaseAspects);
// read #2 (potentially)
final Map> nextVersions =
@@ -903,7 +900,7 @@ private List ingestAspectsToLocalDB(
Map> newLatestAspects =
EntityUtils.toSystemAspects(
- opContext.getRetrieverContext().get(),
+ opContext.getRetrieverContext(),
aspectDao.getLatestAspects(updatedItems.getFirst(), true));
// merge
updatedLatestAspects = AspectsBatch.merge(batchAspects, newLatestAspects);
@@ -941,7 +938,7 @@ private List ingestAspectsToLocalDB(
// do final pre-commit checks with previous aspect value
ValidationExceptionCollection exceptions =
- AspectsBatch.validatePreCommit(changeMCPs, opContext.getRetrieverContext().get());
+ AspectsBatch.validatePreCommit(changeMCPs, opContext.getRetrieverContext());
if (exceptions.hasFatalExceptions()) {
// IF this is a client request/API request we fail the `transaction batch`
@@ -1143,8 +1140,8 @@ public RecordTemplate ingestAspectIfNotPresent(
.recordTemplate(newValue)
.systemMetadata(systemMetadata)
.auditStamp(auditStamp)
- .build(opContext.getAspectRetrieverOpt().get()),
- opContext.getRetrieverContext().get())
+ .build(opContext.getAspectRetriever()),
+ opContext.getRetrieverContext())
.build();
List ingested = ingestAspects(opContext, aspectsBatch, true, false);
@@ -1169,7 +1166,7 @@ public IngestResult ingestProposal(
return ingestProposal(
opContext,
AspectsBatchImpl.builder()
- .mcps(List.of(proposal), auditStamp, opContext.getRetrieverContext().get())
+ .mcps(List.of(proposal), auditStamp, opContext.getRetrieverContext())
.build(),
async)
.stream()
@@ -1246,7 +1243,7 @@ private Stream ingestTimeseriesProposal(
.recordTemplate(
EntityApiUtils.buildKeyAspect(
opContext.getEntityRegistry(), item.getUrn()))
- .build(opContext.getAspectRetrieverOpt().get()))
+ .build(opContext.getAspectRetriever()))
.collect(Collectors.toList());
ingestProposalSync(
@@ -1469,7 +1466,7 @@ public List restoreIndices(
List systemAspects =
EntityUtils.toSystemAspectFromEbeanAspects(
- opContext.getRetrieverContext().get(), batch.collect(Collectors.toList()));
+ opContext.getRetrieverContext(), batch.collect(Collectors.toList()));
RestoreIndicesResult result = restoreIndices(opContext, systemAspects, logger);
result.timeSqlQueryMs = timeSqlQueryMs;
@@ -1513,7 +1510,7 @@ public List restoreIndices(
long startTime = System.currentTimeMillis();
List systemAspects =
EntityUtils.toSystemAspects(
- opContext.getRetrieverContext().get(),
+ opContext.getRetrieverContext(),
getLatestAspect(opContext, entityBatch.getValue(), aspectNames, false).values());
long timeSqlQueryMs = System.currentTimeMillis() - startTime;
@@ -1649,12 +1646,12 @@ private RestoreIndicesResult restoreIndices(
.auditStamp(auditStamp)
.systemMetadata(latestSystemMetadata)
.recordTemplate(EntityApiUtils.buildKeyAspect(opContext.getEntityRegistry(), urn))
- .build(opContext.getAspectRetrieverOpt().get()));
+ .build(opContext.getAspectRetriever()));
Stream defaultAspectsResult =
ingestProposalSync(
opContext,
AspectsBatchImpl.builder()
- .retrieverContext(opContext.getRetrieverContext().get())
+ .retrieverContext(opContext.getRetrieverContext())
.items(keyAspect)
.build());
defaultAspectsCreated += defaultAspectsResult.count();
@@ -1966,7 +1963,7 @@ private void ingestSnapshotUnion(
AspectsBatchImpl aspectsBatch =
AspectsBatchImpl.builder()
- .retrieverContext(opContext.getRetrieverContext().get())
+ .retrieverContext(opContext.getRetrieverContext())
.items(
aspectRecordsToIngest.stream()
.map(
@@ -1977,7 +1974,7 @@ private void ingestSnapshotUnion(
.recordTemplate(pair.getValue())
.auditStamp(auditStamp)
.systemMetadata(systemMetadata)
- .build(opContext.getAspectRetrieverOpt().get()))
+ .build(opContext.getAspectRetriever()))
.collect(Collectors.toList()))
.build();
@@ -2128,7 +2125,7 @@ public RollbackRunResult deleteUrn(@Nonnull OperationContext opContext, Urn urn)
}
SystemMetadata latestKeySystemMetadata =
- EntityUtils.toSystemAspect(opContext.getRetrieverContext().get(), latestKey)
+ EntityUtils.toSystemAspect(opContext.getRetrieverContext(), latestKey)
.map(SystemAspect::getSystemMetadata)
.get();
RollbackResult result =
@@ -2253,11 +2250,11 @@ private RollbackResult deleteAspectWithoutMCL(
.urn(entityUrn)
.aspectName(aspectName)
.auditStamp(auditStamp)
- .build(opContext.getAspectRetrieverOpt().get());
+ .build(opContext.getAspectRetriever());
// Delete validation hooks
ValidationExceptionCollection exceptions =
- AspectsBatch.validateProposed(List.of(deleteItem), opContext.getRetrieverContext().get());
+ AspectsBatch.validateProposed(List.of(deleteItem), opContext.getRetrieverContext());
if (!exceptions.isEmpty()) {
throw new ValidationException(collectMetrics(exceptions).toString());
}
@@ -2271,7 +2268,7 @@ private RollbackResult deleteAspectWithoutMCL(
final EntityAspect.EntitySystemAspect latest =
(EntityAspect.EntitySystemAspect)
EntityUtils.toSystemAspect(
- opContext.getRetrieverContext().get(),
+ opContext.getRetrieverContext(),
aspectDao.getLatestAspect(urn, aspectName, false))
.orElse(null);
@@ -2299,7 +2296,7 @@ private RollbackResult deleteAspectWithoutMCL(
EntityAspect.EntitySystemAspect candidateAspect =
(EntityAspect.EntitySystemAspect)
EntityUtils.toSystemAspect(
- opContext.getRetrieverContext().get(),
+ opContext.getRetrieverContext(),
aspectDao.getAspect(urn, aspectName, maxVersion))
.orElse(null);
SystemMetadata previousSysMetadata =
@@ -2325,13 +2322,9 @@ private RollbackResult deleteAspectWithoutMCL(
.urn(UrnUtils.getUrn(toDelete.getUrn()))
.aspectName(toDelete.getAspect())
.auditStamp(auditStamp)
- .build(
- opContext
- .getRetrieverContext()
- .get()
- .getAspectRetriever()))
+ .build(opContext.getAspectRetriever()))
.collect(Collectors.toList()),
- opContext.getRetrieverContext().get());
+ opContext.getRetrieverContext());
if (!preCommitExceptions.isEmpty()) {
throw new ValidationException(collectMetrics(preCommitExceptions).toString());
}
@@ -2509,7 +2502,7 @@ private Map getEnvelopedAspects(
final Map dbEntries = aspectDao.batchGet(dbKeys, false);
List envelopedAspects =
- EntityUtils.toSystemAspects(opContext.getRetrieverContext().get(), dbEntries.values());
+ EntityUtils.toSystemAspects(opContext.getRetrieverContext(), dbEntries.values());
return envelopedAspects.stream()
.collect(
diff --git a/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityUtils.java b/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityUtils.java
index 3c4109970e9d0b..da48a2b76d6d56 100644
--- a/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityUtils.java
+++ b/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityUtils.java
@@ -72,7 +72,7 @@ public static void ingestChangeProposals(
entityService.ingestProposal(
opContext,
AspectsBatchImpl.builder()
- .mcps(changes, getAuditStamp(actor), opContext.getRetrieverContext().get())
+ .mcps(changes, getAuditStamp(actor), opContext.getRetrieverContext())
.build(),
async);
}
diff --git a/metadata-io/src/main/java/com/linkedin/metadata/entity/cassandra/CassandraRetentionService.java b/metadata-io/src/main/java/com/linkedin/metadata/entity/cassandra/CassandraRetentionService.java
index ccc1910ba5cdbd..c595e3e07b8342 100644
--- a/metadata-io/src/main/java/com/linkedin/metadata/entity/cassandra/CassandraRetentionService.java
+++ b/metadata-io/src/main/java/com/linkedin/metadata/entity/cassandra/CassandraRetentionService.java
@@ -64,7 +64,7 @@ protected AspectsBatch buildAspectsBatch(
List mcps,
@Nonnull AuditStamp auditStamp) {
return AspectsBatchImpl.builder()
- .mcps(mcps, auditStamp, opContext.getRetrieverContext().get())
+ .mcps(mcps, auditStamp, opContext.getRetrieverContext())
.build();
}
diff --git a/metadata-io/src/main/java/com/linkedin/metadata/entity/ebean/EbeanAspectDao.java b/metadata-io/src/main/java/com/linkedin/metadata/entity/ebean/EbeanAspectDao.java
index bd6cc67561b883..ea580a97c51886 100644
--- a/metadata-io/src/main/java/com/linkedin/metadata/entity/ebean/EbeanAspectDao.java
+++ b/metadata-io/src/main/java/com/linkedin/metadata/entity/ebean/EbeanAspectDao.java
@@ -93,8 +93,14 @@ public class EbeanAspectDao implements AspectDao, AspectMigrationsDao {
*/
private final LoadingCache locks;
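+ // Strategy for paged batch gets, taken from EbeanConfiguration: "IN" (the default) issues a
+ // single composite-key IN-clause query, while any other value falls back to the legacy
+ // UNION-of-SELECTs path.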
+ private final String batchGetMethod;
+
public EbeanAspectDao(@Nonnull final Database server, EbeanConfiguration ebeanConfiguration) {
_server = server;
+ this.batchGetMethod =
+ ebeanConfiguration.getBatchGetMethod() != null
+ ? ebeanConfiguration.getBatchGetMethod()
+ : "IN";
if (ebeanConfiguration.getLocking().isEnabled()) {
this.locks =
CacheBuilder.newBuilder()
@@ -371,23 +377,37 @@ private List batchGet(
final int totalPageCount = QueryUtils.getTotalPageCount(keys.size(), keysCount);
final List finalResult =
- batchGetUnion(new ArrayList<>(keys), keysCount, position, forUpdate);
+ batchGetSelectString(new ArrayList<>(keys), keysCount, position, forUpdate);
while (QueryUtils.hasMore(position, keysCount, totalPageCount)) {
position += keysCount;
final List oneStatementResult =
- batchGetUnion(new ArrayList<>(keys), keysCount, position, forUpdate);
+ batchGetSelectString(new ArrayList<>(keys), keysCount, position, forUpdate);
finalResult.addAll(oneStatementResult);
}
return finalResult;
}
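+ /**
+ * Routes a single page of primary keys to the configured batch-get implementation: the
+ * IN-clause variant when batchGetMethod is "IN", otherwise the UNION-of-SELECTs variant.
+ */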
+ @Nonnull
+ private List<EbeanAspectV2> batchGetSelectString(
+ @Nonnull final List<EbeanAspectV2.PrimaryKey> keys,
+ final int keysCount,
+ final int position,
+ boolean forUpdate) {
+
+ if (batchGetMethod.equals("IN")) {
+ return batchGetIn(keys, keysCount, position, forUpdate);
+ }
+
+ return batchGetUnion(keys, keysCount, position, forUpdate);
+ }
+
/**
* Builds a single SELECT statement for batch get, which selects one entity, and then can be
* UNION'd with other SELECT statements.
*/
- private String batchGetSelect(
+ private String batchGetSelectString(
final int selectId,
@Nonnull final String urn,
@Nonnull final String aspect,
@@ -434,7 +454,7 @@ private List batchGetUnion(
final Map params = new HashMap<>();
for (int index = position; index < end; index++) {
sb.append(
- batchGetSelect(
+ batchGetSelectString(
index - position,
keys.get(index).getUrn(),
keys.get(index).getAspect(),
@@ -467,6 +487,65 @@ private List batchGetUnion(
return query.findList();
}
+ @Nonnull
+ private List<EbeanAspectV2> batchGetIn(
+ @Nonnull final List<EbeanAspectV2.PrimaryKey> keys,
+ final int keysCount,
+ final int position,
+ boolean forUpdate) {
+ validateConnection();
+
+ // Build a single SELECT with IN clause using composite key comparison
+ // Query will look like:
+ // SELECT * FROM metadata_aspect_v2 WHERE (urn, aspect, version) IN
+ // (('urn0', 'aspect0', 0), ('urn1', 'aspect1', 1))
+ final StringBuilder sb = new StringBuilder();
+ sb.append(
+ "SELECT urn, aspect, version, metadata, systemMetadata, createdOn, createdBy, createdFor ");
+ sb.append("FROM metadata_aspect_v2 WHERE (urn, aspect, version) IN (");
+
+ final int end = Math.min(keys.size(), position + keysCount);
+ final Map<String, Object> params = new HashMap<>();
+
+ for (int index = position; index < end; index++) {
+ int paramIndex = index - position;
+ String urnParam = "urn" + paramIndex;
+ String aspectParam = "aspect" + paramIndex;
+ String versionParam = "version" + paramIndex;
+
+ params.put(urnParam, keys.get(index).getUrn());
+ params.put(aspectParam, keys.get(index).getAspect());
+ params.put(versionParam, keys.get(index).getVersion());
+
+ sb.append("(:" + urnParam + ", :" + aspectParam + ", :" + versionParam + ")");
+
+ if (index != end - 1) {
+ sb.append(",");
+ }
+ }
+
+ sb.append(")");
+
+ if (forUpdate) {
+ sb.append(" FOR UPDATE");
+ }
+
+ final RawSql rawSql =
+ RawSqlBuilder.parse(sb.toString())
+ .columnMapping(EbeanAspectV2.URN_COLUMN, "key.urn")
+ .columnMapping(EbeanAspectV2.ASPECT_COLUMN, "key.aspect")
+ .columnMapping(EbeanAspectV2.VERSION_COLUMN, "key.version")
+ .create();
+
+ final Query<EbeanAspectV2> query = _server.find(EbeanAspectV2.class).setRawSql(rawSql);
+
+ for (Map.Entry<String, Object> param : params.entrySet()) {
+ query.setParameter(param.getKey(), param.getValue());
+ }
+
+ return query.findList();
+ }
+
@Override
@Nonnull
public ListResult listUrns(
diff --git a/metadata-io/src/main/java/com/linkedin/metadata/entity/ebean/EbeanRetentionService.java b/metadata-io/src/main/java/com/linkedin/metadata/entity/ebean/EbeanRetentionService.java
index 49fa555e006f61..74d0d8b0964de0 100644
--- a/metadata-io/src/main/java/com/linkedin/metadata/entity/ebean/EbeanRetentionService.java
+++ b/metadata-io/src/main/java/com/linkedin/metadata/entity/ebean/EbeanRetentionService.java
@@ -59,7 +59,7 @@ protected AspectsBatch buildAspectsBatch(
List mcps,
@Nonnull AuditStamp auditStamp) {
return AspectsBatchImpl.builder()
- .mcps(mcps, auditStamp, opContext.getRetrieverContext().get())
+ .mcps(mcps, auditStamp, opContext.getRetrieverContext())
.build();
}
diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/filter/BaseQueryFilterRewriter.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/filter/BaseQueryFilterRewriter.java
index 367705d369c7ce..6c5c6243d33620 100644
--- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/filter/BaseQueryFilterRewriter.java
+++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/filter/BaseQueryFilterRewriter.java
@@ -143,7 +143,7 @@ private static QueryBuilder expandTerms(
if (!queryUrns.isEmpty()) {
scrollGraph(
- opContext.getRetrieverContext().get().getGraphRetriever(),
+ opContext.getRetrieverContext().getGraphRetriever(),
queryUrns,
relationshipTypes,
relationshipDirection,
diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/transformer/SearchDocumentTransformer.java b/metadata-io/src/main/java/com/linkedin/metadata/search/transformer/SearchDocumentTransformer.java
index 4bb8e0630de480..b4ad847cb7afc2 100644
--- a/metadata-io/src/main/java/com/linkedin/metadata/search/transformer/SearchDocumentTransformer.java
+++ b/metadata-io/src/main/java/com/linkedin/metadata/search/transformer/SearchDocumentTransformer.java
@@ -437,8 +437,6 @@ private void setStructuredPropertiesSearchValue(
Map> definitions =
opContext
- .getRetrieverContext()
- .get()
.getAspectRetriever()
.getLatestAspectObjects(
propertyMap.keySet(), Set.of(STRUCTURED_PROPERTY_DEFINITION_ASPECT_NAME));
diff --git a/metadata-io/src/main/java/com/linkedin/metadata/service/BusinessAttributeUpdateHookService.java b/metadata-io/src/main/java/com/linkedin/metadata/service/BusinessAttributeUpdateHookService.java
index ad2825ead3d0da..4a692e95346222 100644
--- a/metadata-io/src/main/java/com/linkedin/metadata/service/BusinessAttributeUpdateHookService.java
+++ b/metadata-io/src/main/java/com/linkedin/metadata/service/BusinessAttributeUpdateHookService.java
@@ -112,7 +112,7 @@ private void fetchRelatedEntities(
@Nullable String scrollId,
int consumedEntityCount,
int batchNumber) {
- GraphRetriever graph = opContext.getRetrieverContext().get().getGraphRetriever();
+ GraphRetriever graph = opContext.getRetrieverContext().getGraphRetriever();
    final ArrayList<Future<ExecutionResult>> futureList = new ArrayList<>();
RelatedEntitiesScrollResult result =
graph.scrollRelatedEntities(
@@ -165,7 +165,7 @@ private Callable<ExecutionResult> processBatch(
return () -> {
StopWatch stopWatch = new StopWatch();
stopWatch.start();
- AspectRetriever aspectRetriever = opContext.getAspectRetrieverOpt().get();
+ AspectRetriever aspectRetriever = opContext.getAspectRetriever();
log.info("Batch {} for BA:{} started", batchNumber, entityKey);
ExecutionResult executionResult = new ExecutionResult();
executionResult.setBatchNumber(batchNumber);
diff --git a/metadata-io/src/main/java/com/linkedin/metadata/service/UpdateGraphIndicesService.java b/metadata-io/src/main/java/com/linkedin/metadata/service/UpdateGraphIndicesService.java
index efe073fc00dfdc..4b09bc00efb61a 100644
--- a/metadata-io/src/main/java/com/linkedin/metadata/service/UpdateGraphIndicesService.java
+++ b/metadata-io/src/main/java/com/linkedin/metadata/service/UpdateGraphIndicesService.java
@@ -94,8 +94,7 @@ public UpdateGraphIndicesService(
public void handleChangeEvent(
@Nonnull OperationContext opContext, @Nonnull final MetadataChangeLog event) {
try {
- MCLItemImpl mclItem =
- MCLItemImpl.builder().build(event, opContext.getAspectRetrieverOpt().get());
+ MCLItemImpl mclItem = MCLItemImpl.builder().build(event, opContext.getAspectRetriever());
if (UPDATE_CHANGE_TYPES.contains(event.getChangeType())) {
handleUpdateChangeEvent(opContext, mclItem);
diff --git a/metadata-io/src/main/java/com/linkedin/metadata/service/UpdateIndicesService.java b/metadata-io/src/main/java/com/linkedin/metadata/service/UpdateIndicesService.java
index 187ef3e8c62290..c5fc9ebdac9fa6 100644
--- a/metadata-io/src/main/java/com/linkedin/metadata/service/UpdateIndicesService.java
+++ b/metadata-io/src/main/java/com/linkedin/metadata/service/UpdateIndicesService.java
@@ -121,11 +121,10 @@ public UpdateIndicesService(
public void handleChangeEvent(
@Nonnull OperationContext opContext, @Nonnull final MetadataChangeLog event) {
try {
- MCLItemImpl batch =
- MCLItemImpl.builder().build(event, opContext.getAspectRetrieverOpt().get());
+ MCLItemImpl batch = MCLItemImpl.builder().build(event, opContext.getAspectRetriever());
      Stream<MCLItem> sideEffects =
- AspectsBatch.applyMCLSideEffects(List.of(batch), opContext.getRetrieverContext().get());
+ AspectsBatch.applyMCLSideEffects(List.of(batch), opContext.getRetrieverContext());
for (MCLItem mclItem :
Stream.concat(Stream.of(batch), sideEffects).collect(Collectors.toList())) {
diff --git a/metadata-io/src/test/java/com/linkedin/metadata/AspectIngestionUtils.java b/metadata-io/src/test/java/com/linkedin/metadata/AspectIngestionUtils.java
index 12b12cf105196e..fa6ab7932001b6 100644
--- a/metadata-io/src/test/java/com/linkedin/metadata/AspectIngestionUtils.java
+++ b/metadata-io/src/test/java/com/linkedin/metadata/AspectIngestionUtils.java
@@ -46,12 +46,12 @@ public static Map<Urn, CorpUserKey> ingestCorpUserKeyAspects(
.recordTemplate(aspect)
.auditStamp(AspectGenerationUtils.createAuditStamp())
.systemMetadata(AspectGenerationUtils.createSystemMetadata())
- .build(opContext.getAspectRetrieverOpt().get()));
+ .build(opContext.getAspectRetriever()));
}
entityService.ingestAspects(
opContext,
AspectsBatchImpl.builder()
- .retrieverContext(opContext.getRetrieverContext().get())
+ .retrieverContext(opContext.getRetrieverContext())
.items(items)
.build(),
true,
@@ -83,12 +83,12 @@ public static Map<Urn, CorpUserInfo> ingestCorpUserInfoAspects(
.recordTemplate(aspect)
.auditStamp(AspectGenerationUtils.createAuditStamp())
.systemMetadata(AspectGenerationUtils.createSystemMetadata())
- .build(opContext.getAspectRetrieverOpt().get()));
+ .build(opContext.getAspectRetriever()));
}
entityService.ingestAspects(
opContext,
AspectsBatchImpl.builder()
- .retrieverContext(opContext.getRetrieverContext().get())
+ .retrieverContext(opContext.getRetrieverContext())
.items(items)
.build(),
true,
@@ -121,12 +121,12 @@ public static Map<Urn, ChartInfo> ingestChartInfoAspects(
.recordTemplate(aspect)
.auditStamp(AspectGenerationUtils.createAuditStamp())
.systemMetadata(AspectGenerationUtils.createSystemMetadata())
- .build(opContext.getAspectRetrieverOpt().get()));
+ .build(opContext.getAspectRetriever()));
}
entityService.ingestAspects(
opContext,
AspectsBatchImpl.builder()
- .retrieverContext(opContext.getRetrieverContext().get())
+ .retrieverContext(opContext.getRetrieverContext())
.items(items)
.build(),
true,
diff --git a/metadata-io/src/test/java/com/linkedin/metadata/aspect/hooks/IgnoreUnknownMutatorTest.java b/metadata-io/src/test/java/com/linkedin/metadata/aspect/hooks/IgnoreUnknownMutatorTest.java
index 11a3153abcaeed..19be1eb14667d8 100644
--- a/metadata-io/src/test/java/com/linkedin/metadata/aspect/hooks/IgnoreUnknownMutatorTest.java
+++ b/metadata-io/src/test/java/com/linkedin/metadata/aspect/hooks/IgnoreUnknownMutatorTest.java
@@ -16,7 +16,8 @@
import com.linkedin.data.template.StringMap;
import com.linkedin.dataset.DatasetProperties;
import com.linkedin.events.metadata.ChangeType;
-import com.linkedin.metadata.aspect.AspectRetriever;
+import com.linkedin.metadata.aspect.CachingAspectRetriever;
+import com.linkedin.metadata.aspect.GraphRetriever;
import com.linkedin.metadata.aspect.batch.MCPItem;
import com.linkedin.metadata.aspect.plugins.config.AspectPluginConfig;
import com.linkedin.metadata.entity.SearchRetriever;
@@ -28,7 +29,6 @@
import com.linkedin.mxe.SystemMetadata;
import com.linkedin.test.metadata.aspect.TestEntityRegistry;
import io.datahubproject.metadata.context.RetrieverContext;
-import io.datahubproject.test.metadata.context.TestOperationContexts;
import java.net.URISyntaxException;
import java.nio.charset.StandardCharsets;
import java.util.List;
@@ -53,17 +53,17 @@ public class IgnoreUnknownMutatorTest {
private static final Urn TEST_DATASET_URN =
UrnUtils.getUrn(
"urn:li:dataset:(urn:li:dataPlatform:postgres,calm-pagoda-323403.jaffle_shop.customers,PROD)");
- private AspectRetriever mockAspectRetriever;
+ private CachingAspectRetriever mockAspectRetriever;
private RetrieverContext retrieverContext;
@BeforeMethod
public void setup() {
- mockAspectRetriever = mock(AspectRetriever.class);
+ mockAspectRetriever = mock(CachingAspectRetriever.class);
retrieverContext =
RetrieverContext.builder()
.searchRetriever(mock(SearchRetriever.class))
- .aspectRetriever(mockAspectRetriever)
- .graphRetriever(TestOperationContexts.emptyGraphRetriever)
+ .cachingAspectRetriever(mockAspectRetriever)
+ .graphRetriever(GraphRetriever.EMPTY)
.build();
}
diff --git a/metadata-io/src/test/java/com/linkedin/metadata/aspect/utils/DefaultAspectsUtilTest.java b/metadata-io/src/test/java/com/linkedin/metadata/aspect/utils/DefaultAspectsUtilTest.java
index 04aff4edf456d9..e7ed2671131592 100644
--- a/metadata-io/src/test/java/com/linkedin/metadata/aspect/utils/DefaultAspectsUtilTest.java
+++ b/metadata-io/src/test/java/com/linkedin/metadata/aspect/utils/DefaultAspectsUtilTest.java
@@ -56,8 +56,7 @@ public void testAdditionalChanges() {
DefaultAspectsUtil.getAdditionalChanges(
opContext,
AspectsBatchImpl.builder()
- .mcps(
- List.of(proposal1), new AuditStamp(), opContext.getRetrieverContext().get())
+ .mcps(List.of(proposal1), new AuditStamp(), opContext.getRetrieverContext())
.build()
.getMCPItems(),
entityServiceImpl,
diff --git a/metadata-io/src/test/java/com/linkedin/metadata/dataproducts/sideeffects/DataProductUnsetSideEffectTest.java b/metadata-io/src/test/java/com/linkedin/metadata/dataproducts/sideeffects/DataProductUnsetSideEffectTest.java
index 976b165fea53df..215e1e2431efa0 100644
--- a/metadata-io/src/test/java/com/linkedin/metadata/dataproducts/sideeffects/DataProductUnsetSideEffectTest.java
+++ b/metadata-io/src/test/java/com/linkedin/metadata/dataproducts/sideeffects/DataProductUnsetSideEffectTest.java
@@ -15,7 +15,7 @@
import com.linkedin.dataproduct.DataProductAssociationArray;
import com.linkedin.dataproduct.DataProductProperties;
import com.linkedin.events.metadata.ChangeType;
-import com.linkedin.metadata.aspect.AspectRetriever;
+import com.linkedin.metadata.aspect.CachingAspectRetriever;
import com.linkedin.metadata.aspect.GraphRetriever;
import com.linkedin.metadata.aspect.SystemAspect;
import com.linkedin.metadata.aspect.batch.MCPItem;
@@ -75,12 +75,12 @@ public class DataProductUnsetSideEffectTest {
.build()))
.build();
- private AspectRetriever mockAspectRetriever;
+ private CachingAspectRetriever mockAspectRetriever;
private RetrieverContext retrieverContext;
@BeforeMethod
public void setup() {
- mockAspectRetriever = mock(AspectRetriever.class);
+ mockAspectRetriever = mock(CachingAspectRetriever.class);
when(mockAspectRetriever.getEntityRegistry()).thenReturn(TEST_REGISTRY);
GraphRetriever graphRetriever = mock(GraphRetriever.class);
RelatedEntities relatedEntities =
@@ -139,7 +139,7 @@ public void setup() {
retrieverContext =
RetrieverContext.builder()
.searchRetriever(mock(SearchRetriever.class))
- .aspectRetriever(mockAspectRetriever)
+ .cachingAspectRetriever(mockAspectRetriever)
.graphRetriever(graphRetriever)
.build();
}
diff --git a/metadata-io/src/test/java/com/linkedin/metadata/entity/EbeanEntityServiceTest.java b/metadata-io/src/test/java/com/linkedin/metadata/entity/EbeanEntityServiceTest.java
index 0386031cbcad86..88f84ee94c8ee7 100644
--- a/metadata-io/src/test/java/com/linkedin/metadata/entity/EbeanEntityServiceTest.java
+++ b/metadata-io/src/test/java/com/linkedin/metadata/entity/EbeanEntityServiceTest.java
@@ -19,6 +19,7 @@
import com.linkedin.metadata.AspectGenerationUtils;
import com.linkedin.metadata.Constants;
import com.linkedin.metadata.EbeanTestUtils;
+import com.linkedin.metadata.aspect.GraphRetriever;
import com.linkedin.metadata.config.EbeanConfiguration;
import com.linkedin.metadata.config.PreProcessHooks;
import com.linkedin.metadata.entity.ebean.EbeanAspectDao;
@@ -98,12 +99,15 @@ public void setupTest() {
.entityService(_entityServiceImpl)
.entityRegistry(_testEntityRegistry)
.build())
- .graphRetriever(TestOperationContexts.emptyGraphRetriever)
- .searchRetriever(TestOperationContexts.emptySearchRetriever)
+ .cachingAspectRetriever(
+ TestOperationContexts.emptyActiveUsersAspectRetriever(
+ () -> _testEntityRegistry))
+ .graphRetriever(GraphRetriever.EMPTY)
+ .searchRetriever(SearchRetriever.EMPTY)
.build(),
null,
opContext ->
- ((EntityServiceAspectRetriever) opContext.getAspectRetrieverOpt().get())
+ ((EntityServiceAspectRetriever) opContext.getAspectRetriever())
.setSystemOperationContext(opContext),
null);
}
@@ -152,25 +156,25 @@ public void testIngestListLatestAspects() throws AssertionError {
.recordTemplate(writeAspect1)
.systemMetadata(metadata1)
.auditStamp(TEST_AUDIT_STAMP)
- .build(TestOperationContexts.emptyAspectRetriever(null)),
+ .build(TestOperationContexts.emptyActiveUsersAspectRetriever(null)),
ChangeItemImpl.builder()
.urn(entityUrn2)
.aspectName(aspectName)
.recordTemplate(writeAspect2)
.systemMetadata(metadata1)
.auditStamp(TEST_AUDIT_STAMP)
- .build(TestOperationContexts.emptyAspectRetriever(null)),
+ .build(TestOperationContexts.emptyActiveUsersAspectRetriever(null)),
ChangeItemImpl.builder()
.urn(entityUrn3)
.aspectName(aspectName)
.recordTemplate(writeAspect3)
.systemMetadata(metadata1)
.auditStamp(TEST_AUDIT_STAMP)
- .build(TestOperationContexts.emptyAspectRetriever(null)));
+ .build(TestOperationContexts.emptyActiveUsersAspectRetriever(null)));
_entityServiceImpl.ingestAspects(
opContext,
AspectsBatchImpl.builder()
- .retrieverContext(opContext.getRetrieverContext().get())
+ .retrieverContext(opContext.getRetrieverContext())
.items(items)
.build(),
true,
@@ -230,25 +234,25 @@ public void testIngestListUrns() throws AssertionError {
.recordTemplate(writeAspect1)
.systemMetadata(metadata1)
.auditStamp(TEST_AUDIT_STAMP)
- .build(TestOperationContexts.emptyAspectRetriever(null)),
+ .build(TestOperationContexts.emptyActiveUsersAspectRetriever(null)),
ChangeItemImpl.builder()
.urn(entityUrn2)
.aspectName(aspectName)
.recordTemplate(writeAspect2)
.systemMetadata(metadata1)
.auditStamp(TEST_AUDIT_STAMP)
- .build(TestOperationContexts.emptyAspectRetriever(null)),
+ .build(TestOperationContexts.emptyActiveUsersAspectRetriever(null)),
ChangeItemImpl.builder()
.urn(entityUrn3)
.aspectName(aspectName)
.recordTemplate(writeAspect3)
.systemMetadata(metadata1)
.auditStamp(TEST_AUDIT_STAMP)
- .build(TestOperationContexts.emptyAspectRetriever(null)));
+ .build(TestOperationContexts.emptyActiveUsersAspectRetriever(null)));
_entityServiceImpl.ingestAspects(
opContext,
AspectsBatchImpl.builder()
- .retrieverContext(opContext.getRetrieverContext().get())
+ .retrieverContext(opContext.getRetrieverContext())
.items(items)
.build(),
true,
@@ -310,11 +314,11 @@ public void testSystemMetadataDuplicateKey() throws Exception {
.recordTemplate(new Status().setRemoved(true))
.systemMetadata(systemMetadata)
.auditStamp(TEST_AUDIT_STAMP)
- .build(TestOperationContexts.emptyAspectRetriever(null));
+ .build(TestOperationContexts.emptyActiveUsersAspectRetriever(null));
_entityServiceImpl.ingestAspects(
opContext,
AspectsBatchImpl.builder()
- .retrieverContext(opContext.getRetrieverContext().get())
+ .retrieverContext(opContext.getRetrieverContext())
.items(List.of(item))
.build(),
false,
@@ -356,7 +360,7 @@ public void testSystemMetadataDuplicateKey() throws Exception {
_entityServiceImpl.ingestAspects(
opContext,
AspectsBatchImpl.builder()
- .retrieverContext(opContext.getRetrieverContext().get())
+ .retrieverContext(opContext.getRetrieverContext())
.items(
List.of(
ChangeItemImpl.builder()
@@ -365,7 +369,7 @@ public void testSystemMetadataDuplicateKey() throws Exception {
.recordTemplate(new Status().setRemoved(false))
.systemMetadata(systemMetadata)
.auditStamp(TEST_AUDIT_STAMP)
- .build(TestOperationContexts.emptyAspectRetriever(null))))
+ .build(TestOperationContexts.emptyActiveUsersAspectRetriever(null))))
.build(),
false,
true);
@@ -600,7 +604,7 @@ public void run() {
auditStamp.setTime(System.currentTimeMillis());
AspectsBatchImpl batch =
AspectsBatchImpl.builder()
- .mcps(mcps, auditStamp, operationContext.getRetrieverContext().get())
+ .mcps(mcps, auditStamp, operationContext.getRetrieverContext())
.build();
entityService.ingestProposal(operationContext, batch, false);
}
diff --git a/metadata-io/src/test/java/com/linkedin/metadata/entity/EntityServiceTest.java b/metadata-io/src/test/java/com/linkedin/metadata/entity/EntityServiceTest.java
index 2d59632e6f3c6d..c00632e5cf5424 100644
--- a/metadata-io/src/test/java/com/linkedin/metadata/entity/EntityServiceTest.java
+++ b/metadata-io/src/test/java/com/linkedin/metadata/entity/EntityServiceTest.java
@@ -945,32 +945,32 @@ public void testRollbackAspect() throws AssertionError {
.recordTemplate(writeAspect1)
.systemMetadata(metadata1)
.auditStamp(TEST_AUDIT_STAMP)
- .build(opContext.getAspectRetrieverOpt().get()),
+ .build(opContext.getAspectRetriever()),
ChangeItemImpl.builder()
.urn(entityUrn2)
.aspectName(aspectName)
.recordTemplate(writeAspect2)
.auditStamp(TEST_AUDIT_STAMP)
.systemMetadata(metadata1)
- .build(opContext.getAspectRetrieverOpt().get()),
+ .build(opContext.getAspectRetriever()),
ChangeItemImpl.builder()
.urn(entityUrn3)
.aspectName(aspectName)
.recordTemplate(writeAspect3)
.auditStamp(TEST_AUDIT_STAMP)
.systemMetadata(metadata1)
- .build(opContext.getAspectRetrieverOpt().get()),
+ .build(opContext.getAspectRetriever()),
ChangeItemImpl.builder()
.urn(entityUrn1)
.aspectName(aspectName)
.recordTemplate(writeAspect1Overwrite)
.systemMetadata(metadata2)
.auditStamp(TEST_AUDIT_STAMP)
- .build(opContext.getAspectRetrieverOpt().get()));
+ .build(opContext.getAspectRetriever()));
_entityServiceImpl.ingestAspects(
opContext,
AspectsBatchImpl.builder()
- .retrieverContext(opContext.getRetrieverContext().get())
+ .retrieverContext(opContext.getRetrieverContext())
.items(items)
.build(),
true,
@@ -1037,25 +1037,25 @@ public void testRollbackKey() throws AssertionError {
.recordTemplate(writeAspect1)
.systemMetadata(metadata1)
.auditStamp(TEST_AUDIT_STAMP)
- .build(opContext.getAspectRetrieverOpt().get()),
+ .build(opContext.getAspectRetriever()),
ChangeItemImpl.builder()
.urn(entityUrn1)
.aspectName(keyAspectName)
.recordTemplate(writeKey1)
.systemMetadata(metadata1)
.auditStamp(TEST_AUDIT_STAMP)
- .build(opContext.getAspectRetrieverOpt().get()),
+ .build(opContext.getAspectRetriever()),
ChangeItemImpl.builder()
.urn(entityUrn1)
.aspectName(aspectName)
.recordTemplate(writeAspect1Overwrite)
.systemMetadata(metadata2)
.auditStamp(TEST_AUDIT_STAMP)
- .build(opContext.getAspectRetrieverOpt().get()));
+ .build(opContext.getAspectRetriever()));
_entityServiceImpl.ingestAspects(
opContext,
AspectsBatchImpl.builder()
- .retrieverContext(opContext.getRetrieverContext().get())
+ .retrieverContext(opContext.getRetrieverContext())
.items(items)
.build(),
true,
@@ -1130,39 +1130,39 @@ public void testRollbackUrn() throws AssertionError {
.recordTemplate(writeAspect1)
.systemMetadata(metadata1)
.auditStamp(TEST_AUDIT_STAMP)
- .build(opContext.getAspectRetrieverOpt().get()),
+ .build(opContext.getAspectRetriever()),
ChangeItemImpl.builder()
.urn(entityUrn1)
.aspectName(keyAspectName)
.recordTemplate(writeKey1)
.auditStamp(TEST_AUDIT_STAMP)
.systemMetadata(metadata1)
- .build(opContext.getAspectRetrieverOpt().get()),
+ .build(opContext.getAspectRetriever()),
ChangeItemImpl.builder()
.urn(entityUrn2)
.aspectName(aspectName)
.recordTemplate(writeAspect2)
.auditStamp(TEST_AUDIT_STAMP)
.systemMetadata(metadata1)
- .build(opContext.getAspectRetrieverOpt().get()),
+ .build(opContext.getAspectRetriever()),
ChangeItemImpl.builder()
.urn(entityUrn3)
.aspectName(aspectName)
.recordTemplate(writeAspect3)
.systemMetadata(metadata1)
.auditStamp(TEST_AUDIT_STAMP)
- .build(opContext.getAspectRetrieverOpt().get()),
+ .build(opContext.getAspectRetriever()),
ChangeItemImpl.builder()
.urn(entityUrn1)
.aspectName(aspectName)
.recordTemplate(writeAspect1Overwrite)
.systemMetadata(metadata2)
.auditStamp(TEST_AUDIT_STAMP)
- .build(opContext.getAspectRetrieverOpt().get()));
+ .build(opContext.getAspectRetriever()));
_entityServiceImpl.ingestAspects(
opContext,
AspectsBatchImpl.builder()
- .retrieverContext(opContext.getRetrieverContext().get())
+ .retrieverContext(opContext.getRetrieverContext())
.items(items)
.build(),
true,
@@ -1208,11 +1208,11 @@ public void testIngestGetLatestAspect() throws AssertionError {
.recordTemplate(writeAspect1)
.auditStamp(TEST_AUDIT_STAMP)
.systemMetadata(metadata1)
- .build(opContext.getAspectRetrieverOpt().get()));
+ .build(opContext.getAspectRetriever()));
_entityServiceImpl.ingestAspects(
opContext,
AspectsBatchImpl.builder()
- .retrieverContext(opContext.getRetrieverContext().get())
+ .retrieverContext(opContext.getRetrieverContext())
.items(items)
.build(),
true,
@@ -1264,11 +1264,11 @@ public void testIngestGetLatestAspect() throws AssertionError {
.recordTemplate(writeAspect2)
.auditStamp(TEST_AUDIT_STAMP)
.systemMetadata(metadata2)
- .build(opContext.getAspectRetrieverOpt().get()));
+ .build(opContext.getAspectRetriever()));
_entityServiceImpl.ingestAspects(
opContext,
AspectsBatchImpl.builder()
- .retrieverContext(opContext.getRetrieverContext().get())
+ .retrieverContext(opContext.getRetrieverContext())
.items(items)
.build(),
true,
@@ -1320,11 +1320,11 @@ public void testIngestGetLatestEnvelopedAspect() throws Exception {
.recordTemplate(writeAspect1)
.auditStamp(TEST_AUDIT_STAMP)
.systemMetadata(metadata1)
- .build(opContext.getAspectRetrieverOpt().get()));
+ .build(opContext.getAspectRetriever()));
_entityServiceImpl.ingestAspects(
opContext,
AspectsBatchImpl.builder()
- .retrieverContext(opContext.getRetrieverContext().get())
+ .retrieverContext(opContext.getRetrieverContext())
.items(items)
.build(),
true,
@@ -1347,11 +1347,11 @@ public void testIngestGetLatestEnvelopedAspect() throws Exception {
.recordTemplate(writeAspect2)
.systemMetadata(metadata2)
.auditStamp(TEST_AUDIT_STAMP)
- .build(opContext.getAspectRetrieverOpt().get()));
+ .build(opContext.getAspectRetriever()));
_entityServiceImpl.ingestAspects(
opContext,
AspectsBatchImpl.builder()
- .retrieverContext(opContext.getRetrieverContext().get())
+ .retrieverContext(opContext.getRetrieverContext())
.items(items)
.build(),
true,
@@ -1416,11 +1416,11 @@ public void testIngestSameAspect() throws AssertionError {
.recordTemplate(writeAspect1)
.systemMetadata(metadata1)
.auditStamp(TEST_AUDIT_STAMP)
- .build(opContext.getAspectRetrieverOpt().get()));
+ .build(opContext.getAspectRetriever()));
_entityServiceImpl.ingestAspects(
opContext,
AspectsBatchImpl.builder()
- .retrieverContext(opContext.getRetrieverContext().get())
+ .retrieverContext(opContext.getRetrieverContext())
.items(items)
.build(),
true,
@@ -1472,11 +1472,11 @@ public void testIngestSameAspect() throws AssertionError {
.recordTemplate(writeAspect2)
.systemMetadata(metadata2)
.auditStamp(TEST_AUDIT_STAMP)
- .build(opContext.getAspectRetrieverOpt().get()));
+ .build(opContext.getAspectRetriever()));
_entityServiceImpl.ingestAspects(
opContext,
AspectsBatchImpl.builder()
- .retrieverContext(opContext.getRetrieverContext().get())
+ .retrieverContext(opContext.getRetrieverContext())
.items(items)
.build(),
true,
@@ -1534,46 +1534,46 @@ public void testRetention() throws AssertionError {
.recordTemplate(writeAspect1)
.systemMetadata(AspectGenerationUtils.createSystemMetadata())
.auditStamp(TEST_AUDIT_STAMP)
- .build(opContext.getAspectRetrieverOpt().get()),
+ .build(opContext.getAspectRetriever()),
ChangeItemImpl.builder()
.urn(entityUrn)
.aspectName(aspectName)
.recordTemplate(writeAspect1a)
.systemMetadata(AspectGenerationUtils.createSystemMetadata())
.auditStamp(TEST_AUDIT_STAMP)
- .build(opContext.getAspectRetrieverOpt().get()),
+ .build(opContext.getAspectRetriever()),
ChangeItemImpl.builder()
.urn(entityUrn)
.aspectName(aspectName)
.recordTemplate(writeAspect1b)
.systemMetadata(AspectGenerationUtils.createSystemMetadata())
.auditStamp(TEST_AUDIT_STAMP)
- .build(opContext.getAspectRetrieverOpt().get()),
+ .build(opContext.getAspectRetriever()),
ChangeItemImpl.builder()
.urn(entityUrn)
.aspectName(aspectName2)
.recordTemplate(writeAspect2)
.systemMetadata(AspectGenerationUtils.createSystemMetadata())
.auditStamp(TEST_AUDIT_STAMP)
- .build(opContext.getAspectRetrieverOpt().get()),
+ .build(opContext.getAspectRetriever()),
ChangeItemImpl.builder()
.urn(entityUrn)
.aspectName(aspectName2)
.recordTemplate(writeAspect2a)
.systemMetadata(AspectGenerationUtils.createSystemMetadata())
.auditStamp(TEST_AUDIT_STAMP)
- .build(opContext.getAspectRetrieverOpt().get()),
+ .build(opContext.getAspectRetriever()),
ChangeItemImpl.builder()
.urn(entityUrn)
.aspectName(aspectName2)
.recordTemplate(writeAspect2b)
.systemMetadata(AspectGenerationUtils.createSystemMetadata())
.auditStamp(TEST_AUDIT_STAMP)
- .build(opContext.getAspectRetrieverOpt().get()));
+ .build(opContext.getAspectRetriever()));
_entityServiceImpl.ingestAspects(
opContext,
AspectsBatchImpl.builder()
- .retrieverContext(opContext.getRetrieverContext().get())
+ .retrieverContext(opContext.getRetrieverContext())
.items(items)
.build(),
true,
@@ -1610,18 +1610,18 @@ public void testRetention() throws AssertionError {
.recordTemplate(writeAspect1c)
.systemMetadata(AspectGenerationUtils.createSystemMetadata())
.auditStamp(TEST_AUDIT_STAMP)
- .build(opContext.getAspectRetrieverOpt().get()),
+ .build(opContext.getAspectRetriever()),
ChangeItemImpl.builder()
.urn(entityUrn)
.aspectName(aspectName2)
.recordTemplate(writeAspect2c)
.systemMetadata(AspectGenerationUtils.createSystemMetadata())
.auditStamp(TEST_AUDIT_STAMP)
- .build(opContext.getAspectRetrieverOpt().get()));
+ .build(opContext.getAspectRetriever()));
_entityServiceImpl.ingestAspects(
opContext,
AspectsBatchImpl.builder()
- .retrieverContext(opContext.getRetrieverContext().get())
+ .retrieverContext(opContext.getRetrieverContext())
.items(items)
.build(),
true,
@@ -1982,8 +1982,7 @@ public void testStructuredPropertyIngestProposal() throws Exception {
stream
.map(
entityAspect ->
- EntityUtils.toSystemAspect(
- opContext.getRetrieverContext().get(), entityAspect)
+ EntityUtils.toSystemAspect(opContext.getRetrieverContext(), entityAspect)
.get()
.getAspect(StructuredPropertyDefinition.class))
.collect(Collectors.toSet());
@@ -1995,7 +1994,10 @@ public void testStructuredPropertyIngestProposal() throws Exception {
SystemEntityClient mockSystemEntityClient = Mockito.mock(SystemEntityClient.class);
Mockito.when(
mockSystemEntityClient.getLatestAspectObject(
- any(OperationContext.class), eq(firstPropertyUrn), eq("propertyDefinition")))
+ any(OperationContext.class),
+ eq(firstPropertyUrn),
+ eq("propertyDefinition"),
+ anyBoolean()))
.thenReturn(new com.linkedin.entity.Aspect(structuredPropertyDefinition.data()));
// Add a value for that property
@@ -2062,8 +2064,7 @@ public void testStructuredPropertyIngestProposal() throws Exception {
stream
.map(
entityAspect ->
- EntityUtils.toSystemAspect(
- opContext.getRetrieverContext().get(), entityAspect)
+ EntityUtils.toSystemAspect(opContext.getRetrieverContext(), entityAspect)
.get()
.getAspect(StructuredPropertyDefinition.class))
.collect(Collectors.toSet());
@@ -2074,7 +2075,10 @@ public void testStructuredPropertyIngestProposal() throws Exception {
Mockito.when(
mockSystemEntityClient.getLatestAspectObject(
- any(OperationContext.class), eq(secondPropertyUrn), eq("propertyDefinition")))
+ any(OperationContext.class),
+ eq(secondPropertyUrn),
+ eq("propertyDefinition"),
+ anyBoolean()))
.thenReturn(new com.linkedin.entity.Aspect(secondDefinition.data()));
// Get existing value for first structured property
@@ -2209,7 +2213,7 @@ public void testBatchDuplicate() throws Exception {
.recordTemplate(new Status().setRemoved(true))
.systemMetadata(systemMetadata.copy())
.auditStamp(TEST_AUDIT_STAMP)
- .build(TestOperationContexts.emptyAspectRetriever(null));
+ .build(TestOperationContexts.emptyActiveUsersAspectRetriever(null));
ChangeItemImpl item2 =
ChangeItemImpl.builder()
.urn(entityUrn)
@@ -2217,11 +2221,11 @@ public void testBatchDuplicate() throws Exception {
.recordTemplate(new Status().setRemoved(false))
.systemMetadata(systemMetadata.copy())
.auditStamp(TEST_AUDIT_STAMP)
- .build(TestOperationContexts.emptyAspectRetriever(null));
+ .build(TestOperationContexts.emptyActiveUsersAspectRetriever(null));
_entityServiceImpl.ingestAspects(
opContext,
AspectsBatchImpl.builder()
- .retrieverContext(opContext.getRetrieverContext().get())
+ .retrieverContext(opContext.getRetrieverContext())
.items(List.of(item1, item2))
.build(),
false,
@@ -2269,7 +2273,7 @@ public void testBatchPatchWithTrailingNoOp() throws Exception {
.setTags(new TagAssociationArray(new TagAssociation().setTag(tag1))))
.systemMetadata(systemMetadata.copy())
.auditStamp(TEST_AUDIT_STAMP)
- .build(TestOperationContexts.emptyAspectRetriever(null));
+ .build(TestOperationContexts.emptyActiveUsersAspectRetriever(null));
PatchItemImpl patchAdd2 =
PatchItemImpl.builder()
@@ -2311,7 +2315,7 @@ public void testBatchPatchWithTrailingNoOp() throws Exception {
_entityServiceImpl.ingestAspects(
opContext,
AspectsBatchImpl.builder()
- .retrieverContext(opContext.getRetrieverContext().get())
+ .retrieverContext(opContext.getRetrieverContext())
.items(List.of(initialAspectTag1))
.build(),
false,
@@ -2320,7 +2324,7 @@ public void testBatchPatchWithTrailingNoOp() throws Exception {
_entityServiceImpl.ingestAspects(
opContext,
AspectsBatchImpl.builder()
- .retrieverContext(opContext.getRetrieverContext().get())
+ .retrieverContext(opContext.getRetrieverContext())
.items(List.of(patchAdd2, patchRemoveNonExistent))
.build(),
false,
@@ -2368,7 +2372,7 @@ public void testBatchPatchAdd() throws Exception {
.setTags(new TagAssociationArray(new TagAssociation().setTag(tag1))))
.systemMetadata(systemMetadata.copy())
.auditStamp(TEST_AUDIT_STAMP)
- .build(TestOperationContexts.emptyAspectRetriever(null));
+ .build(TestOperationContexts.emptyActiveUsersAspectRetriever(null));
PatchItemImpl patchAdd3 =
PatchItemImpl.builder()
@@ -2428,7 +2432,7 @@ public void testBatchPatchAdd() throws Exception {
_entityServiceImpl.ingestAspects(
opContext,
AspectsBatchImpl.builder()
- .retrieverContext(opContext.getRetrieverContext().get())
+ .retrieverContext(opContext.getRetrieverContext())
.items(List.of(initialAspectTag1))
.build(),
false,
@@ -2437,7 +2441,7 @@ public void testBatchPatchAdd() throws Exception {
_entityServiceImpl.ingestAspects(
opContext,
AspectsBatchImpl.builder()
- .retrieverContext(opContext.getRetrieverContext().get())
+ .retrieverContext(opContext.getRetrieverContext())
.items(List.of(patchAdd3, patchAdd2, patchAdd1))
.build(),
false,
@@ -2491,7 +2495,7 @@ public void testBatchPatchAddDuplicate() throws Exception {
.recordTemplate(new GlobalTags().setTags(new TagAssociationArray(initialTags)))
.systemMetadata(systemMetadata.copy())
.auditStamp(TEST_AUDIT_STAMP)
- .build(TestOperationContexts.emptyAspectRetriever(null));
+ .build(TestOperationContexts.emptyActiveUsersAspectRetriever(null));
PatchItemImpl patchAdd2 =
PatchItemImpl.builder()
@@ -2516,7 +2520,7 @@ public void testBatchPatchAddDuplicate() throws Exception {
_entityServiceImpl.ingestAspects(
opContext,
AspectsBatchImpl.builder()
- .retrieverContext(opContext.getRetrieverContext().get())
+ .retrieverContext(opContext.getRetrieverContext())
.items(List.of(initialAspectTag1))
.build(),
false,
@@ -2525,7 +2529,7 @@ public void testBatchPatchAddDuplicate() throws Exception {
_entityServiceImpl.ingestAspects(
opContext,
AspectsBatchImpl.builder()
- .retrieverContext(opContext.getRetrieverContext().get())
+ .retrieverContext(opContext.getRetrieverContext())
.items(List.of(patchAdd2, patchAdd2)) // duplicate
.build(),
false,
@@ -2581,7 +2585,7 @@ public void testPatchRemoveNonExistent() throws Exception {
_entityServiceImpl.ingestAspects(
opContext,
AspectsBatchImpl.builder()
- .retrieverContext(opContext.getRetrieverContext().get())
+ .retrieverContext(opContext.getRetrieverContext())
.items(List.of(patchRemove))
.build(),
false,
@@ -2638,7 +2642,7 @@ public void testPatchAddNonExistent() throws Exception {
_entityServiceImpl.ingestAspects(
opContext,
AspectsBatchImpl.builder()
- .retrieverContext(opContext.getRetrieverContext().get())
+ .retrieverContext(opContext.getRetrieverContext())
.items(List.of(patchAdd))
.build(),
false,
diff --git a/metadata-io/src/test/java/com/linkedin/metadata/entity/cassandra/CassandraEntityServiceTest.java b/metadata-io/src/test/java/com/linkedin/metadata/entity/cassandra/CassandraEntityServiceTest.java
index 550f55e6bfd0b9..b4fbfecc9d60d3 100644
--- a/metadata-io/src/test/java/com/linkedin/metadata/entity/cassandra/CassandraEntityServiceTest.java
+++ b/metadata-io/src/test/java/com/linkedin/metadata/entity/cassandra/CassandraEntityServiceTest.java
@@ -10,11 +10,13 @@
import com.linkedin.metadata.AspectGenerationUtils;
import com.linkedin.metadata.AspectIngestionUtils;
import com.linkedin.metadata.CassandraTestUtils;
+import com.linkedin.metadata.aspect.GraphRetriever;
import com.linkedin.metadata.config.PreProcessHooks;
import com.linkedin.metadata.entity.EntityServiceAspectRetriever;
import com.linkedin.metadata.entity.EntityServiceImpl;
import com.linkedin.metadata.entity.EntityServiceTest;
import com.linkedin.metadata.entity.ListResult;
+import com.linkedin.metadata.entity.SearchRetriever;
import com.linkedin.metadata.event.EventProducer;
import com.linkedin.metadata.key.CorpUserKey;
import com.linkedin.metadata.models.registry.EntityRegistryException;
@@ -93,12 +95,15 @@ private void configureComponents() {
.entityService(_entityServiceImpl)
.entityRegistry(_testEntityRegistry)
.build())
- .graphRetriever(TestOperationContexts.emptyGraphRetriever)
- .searchRetriever(TestOperationContexts.emptySearchRetriever)
+ .cachingAspectRetriever(
+ TestOperationContexts.emptyActiveUsersAspectRetriever(
+ () -> _testEntityRegistry))
+ .graphRetriever(GraphRetriever.EMPTY)
+ .searchRetriever(SearchRetriever.EMPTY)
.build(),
null,
opContext ->
- ((EntityServiceAspectRetriever) opContext.getAspectRetrieverOpt().get())
+ ((EntityServiceAspectRetriever) opContext.getAspectRetriever())
.setSystemOperationContext(opContext),
null);
}
diff --git a/metadata-io/src/test/java/com/linkedin/metadata/entity/ebean/batch/ChangeItemImplTest.java b/metadata-io/src/test/java/com/linkedin/metadata/entity/ebean/batch/ChangeItemImplTest.java
index 3f6b301e72aa5a..0a867ae3c8f2e0 100644
--- a/metadata-io/src/test/java/com/linkedin/metadata/entity/ebean/batch/ChangeItemImplTest.java
+++ b/metadata-io/src/test/java/com/linkedin/metadata/entity/ebean/batch/ChangeItemImplTest.java
@@ -26,7 +26,7 @@ public void testBatchDuplicate() throws Exception {
.recordTemplate(new Status().setRemoved(true))
.systemMetadata(systemMetadata.copy())
.auditStamp(TEST_AUDIT_STAMP)
- .build(TestOperationContexts.emptyAspectRetriever(null));
+ .build(TestOperationContexts.emptyActiveUsersAspectRetriever(null));
ChangeItemImpl item2 =
ChangeItemImpl.builder()
.urn(entityUrn)
@@ -34,7 +34,7 @@ public void testBatchDuplicate() throws Exception {
.recordTemplate(new Status().setRemoved(false))
.systemMetadata(systemMetadata.copy())
.auditStamp(TEST_AUDIT_STAMP)
- .build(TestOperationContexts.emptyAspectRetriever(null));
+ .build(TestOperationContexts.emptyActiveUsersAspectRetriever(null));
assertFalse(item1.isDatabaseDuplicateOf(item2));
}
diff --git a/metadata-io/src/test/java/com/linkedin/metadata/recommendation/RecommendationsServiceTest.java b/metadata-io/src/test/java/com/linkedin/metadata/recommendation/RecommendationsServiceTest.java
index ca42f0327c86db..8f68f119cb0b7d 100644
--- a/metadata-io/src/test/java/com/linkedin/metadata/recommendation/RecommendationsServiceTest.java
+++ b/metadata-io/src/test/java/com/linkedin/metadata/recommendation/RecommendationsServiceTest.java
@@ -11,6 +11,7 @@
import com.linkedin.metadata.recommendation.ranker.SimpleRecommendationRanker;
import io.datahubproject.test.metadata.context.TestOperationContexts;
import java.net.URISyntaxException;
+import java.nio.file.AccessDeniedException;
import java.util.List;
import java.util.stream.Collectors;
import org.testng.annotations.Test;
@@ -74,7 +75,7 @@ private List<RecommendationContent> getContentFromUrns(List<Urn> urns) {
}
@Test
- public void testService() throws URISyntaxException {
+ public void testService() throws URISyntaxException, AccessDeniedException {
// Test non-eligible and empty
RecommendationsService service =
new RecommendationsService(ImmutableList.of(nonEligibleSource, emptySource), ranker);
diff --git a/metadata-io/src/test/java/com/linkedin/metadata/schemafields/sideeffects/SchemaFieldSideEffectTest.java b/metadata-io/src/test/java/com/linkedin/metadata/schemafields/sideeffects/SchemaFieldSideEffectTest.java
index 1661f5f02ee593..fa895cb4540117 100644
--- a/metadata-io/src/test/java/com/linkedin/metadata/schemafields/sideeffects/SchemaFieldSideEffectTest.java
+++ b/metadata-io/src/test/java/com/linkedin/metadata/schemafields/sideeffects/SchemaFieldSideEffectTest.java
@@ -21,7 +21,8 @@
import com.linkedin.data.ByteString;
import com.linkedin.entity.Aspect;
import com.linkedin.events.metadata.ChangeType;
-import com.linkedin.metadata.aspect.AspectRetriever;
+import com.linkedin.metadata.aspect.CachingAspectRetriever;
+import com.linkedin.metadata.aspect.GraphRetriever;
import com.linkedin.metadata.aspect.batch.MCLItem;
import com.linkedin.metadata.aspect.batch.MCPItem;
import com.linkedin.metadata.aspect.plugins.config.AspectPluginConfig;
@@ -46,7 +47,6 @@
import com.linkedin.test.metadata.aspect.TestEntityRegistry;
import com.linkedin.test.metadata.aspect.batch.TestMCP;
import io.datahubproject.metadata.context.RetrieverContext;
-import io.datahubproject.test.metadata.context.TestOperationContexts;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
@@ -87,18 +87,18 @@ public class SchemaFieldSideEffectTest {
.build()))
.build();
- private AspectRetriever mockAspectRetriever;
+ private CachingAspectRetriever mockAspectRetriever;
private RetrieverContext retrieverContext;
@BeforeMethod
public void setup() {
- mockAspectRetriever = mock(AspectRetriever.class);
+ mockAspectRetriever = mock(CachingAspectRetriever.class);
when(mockAspectRetriever.getEntityRegistry()).thenReturn(TEST_REGISTRY);
retrieverContext =
RetrieverContext.builder()
.searchRetriever(mock(SearchRetriever.class))
- .aspectRetriever(mockAspectRetriever)
- .graphRetriever(TestOperationContexts.emptyGraphRetriever)
+ .cachingAspectRetriever(mockAspectRetriever)
+ .graphRetriever(GraphRetriever.EMPTY)
.build();
}
diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/query/filter/ContainerExpansionRewriterTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/query/filter/ContainerExpansionRewriterTest.java
index fd768424e13c19..1825b65a18ab19 100644
--- a/metadata-io/src/test/java/com/linkedin/metadata/search/query/filter/ContainerExpansionRewriterTest.java
+++ b/metadata-io/src/test/java/com/linkedin/metadata/search/query/filter/ContainerExpansionRewriterTest.java
@@ -20,6 +20,7 @@
import com.linkedin.metadata.aspect.models.graph.RelatedEntities;
import com.linkedin.metadata.aspect.models.graph.RelatedEntitiesScrollResult;
import com.linkedin.metadata.config.search.QueryFilterRewriterConfiguration;
+import com.linkedin.metadata.entity.SearchRetriever;
import com.linkedin.metadata.models.registry.EntityRegistry;
import com.linkedin.metadata.query.SearchFlags;
import com.linkedin.metadata.query.filter.Condition;
@@ -71,8 +72,10 @@ public void init() {
() ->
io.datahubproject.metadata.context.RetrieverContext.builder()
.aspectRetriever(mockAspectRetriever)
+ .cachingAspectRetriever(
+ TestOperationContexts.emptyActiveUsersAspectRetriever(() -> entityRegistry))
.graphRetriever(mockGraphRetriever)
- .searchRetriever(TestOperationContexts.emptySearchRetriever)
+ .searchRetriever(SearchRetriever.EMPTY)
.build(),
null,
null,
diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/query/filter/DomainExpansionRewriterTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/query/filter/DomainExpansionRewriterTest.java
index 8741e24b1bca50..de375271ed6602 100644
--- a/metadata-io/src/test/java/com/linkedin/metadata/search/query/filter/DomainExpansionRewriterTest.java
+++ b/metadata-io/src/test/java/com/linkedin/metadata/search/query/filter/DomainExpansionRewriterTest.java
@@ -13,13 +13,14 @@
import static org.mockito.Mockito.when;
import static org.testng.Assert.assertEquals;
-import com.linkedin.metadata.aspect.AspectRetriever;
+import com.linkedin.metadata.aspect.CachingAspectRetriever;
import com.linkedin.metadata.aspect.GraphRetriever;
import com.linkedin.metadata.aspect.RetrieverContext;
import com.linkedin.metadata.aspect.models.graph.Edge;
import com.linkedin.metadata.aspect.models.graph.RelatedEntities;
import com.linkedin.metadata.aspect.models.graph.RelatedEntitiesScrollResult;
import com.linkedin.metadata.config.search.QueryFilterRewriterConfiguration;
+import com.linkedin.metadata.entity.SearchRetriever;
import com.linkedin.metadata.models.registry.EntityRegistry;
import com.linkedin.metadata.query.SearchFlags;
import com.linkedin.metadata.query.filter.Condition;
@@ -54,7 +55,7 @@ public class DomainExpansionRewriterTest
@BeforeMethod
public void init() {
EntityRegistry entityRegistry = new TestEntityRegistry();
- AspectRetriever mockAspectRetriever = mock(AspectRetriever.class);
+ CachingAspectRetriever mockAspectRetriever = mock(CachingAspectRetriever.class);
when(mockAspectRetriever.getEntityRegistry()).thenReturn(entityRegistry);
mockGraphRetriever = spy(GraphRetriever.class);
@@ -71,8 +72,10 @@ public void init() {
() ->
io.datahubproject.metadata.context.RetrieverContext.builder()
.aspectRetriever(mockAspectRetriever)
+ .cachingAspectRetriever(
+ TestOperationContexts.emptyActiveUsersAspectRetriever(() -> entityRegistry))
.graphRetriever(mockGraphRetriever)
- .searchRetriever(TestOperationContexts.emptySearchRetriever)
+ .searchRetriever(SearchRetriever.EMPTY)
.build(),
null,
null,
diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/query/request/AggregationQueryBuilderTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/query/request/AggregationQueryBuilderTest.java
index c68997e25bcff7..d6f5f9c3eedbe7 100644
--- a/metadata-io/src/test/java/com/linkedin/metadata/search/query/request/AggregationQueryBuilderTest.java
+++ b/metadata-io/src/test/java/com/linkedin/metadata/search/query/request/AggregationQueryBuilderTest.java
@@ -18,6 +18,7 @@
import com.linkedin.data.template.StringArray;
import com.linkedin.entity.Aspect;
import com.linkedin.metadata.aspect.AspectRetriever;
+import com.linkedin.metadata.aspect.CachingAspectRetriever;
import com.linkedin.metadata.config.search.SearchConfiguration;
import com.linkedin.metadata.models.EntitySpec;
import com.linkedin.metadata.models.annotation.SearchableAnnotation;
@@ -49,8 +50,8 @@
public class AggregationQueryBuilderTest {
- private static AspectRetriever aspectRetriever;
- private static AspectRetriever aspectRetrieverV1;
+ private static CachingAspectRetriever aspectRetriever;
+ private static CachingAspectRetriever aspectRetrieverV1;
private static String DEFAULT_FILTER = "_index";
@BeforeClass
@@ -61,7 +62,7 @@ public void setup() throws RemoteInvocationException, URISyntaxException {
Urn.createFromString("urn:li:structuredProperty:under.scores.and.dots_make_a_mess");
// legacy
- aspectRetriever = mock(AspectRetriever.class);
+ aspectRetriever = mock(CachingAspectRetriever.class);
when(aspectRetriever.getEntityRegistry())
.thenReturn(TestOperationContexts.defaultEntityRegistry());
@@ -106,7 +107,7 @@ public void setup() throws RemoteInvocationException, URISyntaxException {
new Aspect(structPropUnderscoresAndDotsDefinition.data()))));
// V1
- aspectRetrieverV1 = mock(AspectRetriever.class);
+ aspectRetrieverV1 = mock(CachingAspectRetriever.class);
when(aspectRetrieverV1.getEntityRegistry())
.thenReturn(TestOperationContexts.defaultEntityRegistry());
diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/query/request/SearchRequestHandlerTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/query/request/SearchRequestHandlerTest.java
index 393ca3ca5d4a64..e51511699e345a 100644
--- a/metadata-io/src/test/java/com/linkedin/metadata/search/query/request/SearchRequestHandlerTest.java
+++ b/metadata-io/src/test/java/com/linkedin/metadata/search/query/request/SearchRequestHandlerTest.java
@@ -662,6 +662,7 @@ public void testInvalidStructuredProperty() {
TestOperationContexts.systemContextNoSearchAuthorization(
RetrieverContext.builder()
.aspectRetriever(aspectRetriever)
+ .cachingAspectRetriever(TestOperationContexts.emptyActiveUsersAspectRetriever(null))
.graphRetriever(mock(GraphRetriever.class))
.searchRetriever(mock(SearchRetriever.class))
.build());
diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/transformer/SearchDocumentTransformerTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/transformer/SearchDocumentTransformerTest.java
index 2c5bcd1294fa15..65b73b7425b743 100644
--- a/metadata-io/src/test/java/com/linkedin/metadata/search/transformer/SearchDocumentTransformerTest.java
+++ b/metadata-io/src/test/java/com/linkedin/metadata/search/transformer/SearchDocumentTransformerTest.java
@@ -247,6 +247,9 @@ public void testSetSearchableRefValue() throws URISyntaxException, RemoteInvocat
TestOperationContexts.systemContextNoSearchAuthorization(
RetrieverContext.builder()
.aspectRetriever(aspectRetriever)
+ .cachingAspectRetriever(
+ TestOperationContexts.emptyActiveUsersAspectRetriever(
+ () -> TEST_ENTITY_REGISTRY))
.graphRetriever(mock(GraphRetriever.class))
.searchRetriever(mock(SearchRetriever.class))
.build());
@@ -301,6 +304,9 @@ public void testSetSearchableRefValue_RuntimeException()
TestOperationContexts.systemContextNoSearchAuthorization(
RetrieverContext.builder()
.aspectRetriever(aspectRetriever)
+ .cachingAspectRetriever(
+ TestOperationContexts.emptyActiveUsersAspectRetriever(
+ () -> TEST_ENTITY_REGISTRY))
.graphRetriever(mock(GraphRetriever.class))
.searchRetriever(mock(SearchRetriever.class))
.build());
@@ -337,6 +343,9 @@ public void testSetSearchableRefValue_RuntimeException_URNExist()
TestOperationContexts.systemContextNoSearchAuthorization(
RetrieverContext.builder()
.aspectRetriever(aspectRetriever)
+ .cachingAspectRetriever(
+ TestOperationContexts.emptyActiveUsersAspectRetriever(
+ () -> TEST_ENTITY_REGISTRY))
.graphRetriever(mock(GraphRetriever.class))
.searchRetriever(mock(SearchRetriever.class))
.build());
@@ -369,6 +378,9 @@ void testSetSearchableRefValue_WithInvalidURN()
TestOperationContexts.systemContextNoSearchAuthorization(
RetrieverContext.builder()
.aspectRetriever(aspectRetriever)
+ .cachingAspectRetriever(
+ TestOperationContexts.emptyActiveUsersAspectRetriever(
+ () -> TEST_ENTITY_REGISTRY))
.graphRetriever(mock(GraphRetriever.class))
.searchRetriever(mock(SearchRetriever.class))
.build());
diff --git a/metadata-io/src/test/java/com/linkedin/metadata/structuredproperties/hooks/PropertyDefinitionDeleteSideEffectTest.java b/metadata-io/src/test/java/com/linkedin/metadata/structuredproperties/hooks/PropertyDefinitionDeleteSideEffectTest.java
index b1b716c5604816..9a0a82c7f9f49d 100644
--- a/metadata-io/src/test/java/com/linkedin/metadata/structuredproperties/hooks/PropertyDefinitionDeleteSideEffectTest.java
+++ b/metadata-io/src/test/java/com/linkedin/metadata/structuredproperties/hooks/PropertyDefinitionDeleteSideEffectTest.java
@@ -18,7 +18,8 @@
import com.linkedin.common.urn.UrnUtils;
import com.linkedin.entity.Aspect;
import com.linkedin.events.metadata.ChangeType;
-import com.linkedin.metadata.aspect.AspectRetriever;
+import com.linkedin.metadata.aspect.CachingAspectRetriever;
+import com.linkedin.metadata.aspect.GraphRetriever;
import com.linkedin.metadata.aspect.batch.MCPItem;
import com.linkedin.metadata.aspect.batch.PatchMCP;
import com.linkedin.metadata.aspect.plugins.config.AspectPluginConfig;
@@ -36,7 +37,6 @@
import com.linkedin.test.metadata.aspect.TestEntityRegistry;
import com.linkedin.test.metadata.aspect.batch.TestMCL;
import io.datahubproject.metadata.context.RetrieverContext;
-import io.datahubproject.test.metadata.context.TestOperationContexts;
import jakarta.json.Json;
import jakarta.json.JsonPatch;
import java.util.List;
@@ -76,13 +76,13 @@ public class PropertyDefinitionDeleteSideEffectTest {
private static final Urn TEST_DATASET_URN =
UrnUtils.getUrn(
"urn:li:dataset:(urn:li:dataPlatform:postgres,calm-pagoda-323403.jaffle_shop.customers,PROD)");
- private AspectRetriever mockAspectRetriever;
+ private CachingAspectRetriever mockAspectRetriever;
private SearchRetriever mockSearchRetriever;
private RetrieverContext retrieverContext;
@BeforeMethod
public void setup() {
- mockAspectRetriever = mock(AspectRetriever.class);
+ mockAspectRetriever = mock(CachingAspectRetriever.class);
when(mockAspectRetriever.getEntityRegistry()).thenReturn(TEST_REGISTRY);
when(mockAspectRetriever.getLatestAspectObject(
eq(TEST_PROPERTY_URN), eq(STRUCTURED_PROPERTY_DEFINITION_ASPECT_NAME)))
@@ -101,8 +101,8 @@ public void setup() {
retrieverContext =
RetrieverContext.builder()
.searchRetriever(mockSearchRetriever)
- .aspectRetriever(mockAspectRetriever)
- .graphRetriever(TestOperationContexts.emptyGraphRetriever)
+ .cachingAspectRetriever(mockAspectRetriever)
+ .graphRetriever(GraphRetriever.EMPTY)
.build();
}
diff --git a/metadata-io/src/test/java/com/linkedin/metadata/structuredproperties/validators/ShowPropertyAsBadgeValidatorTest.java b/metadata-io/src/test/java/com/linkedin/metadata/structuredproperties/validators/ShowPropertyAsBadgeValidatorTest.java
index 2503faa00f6e71..6e8886f495c95a 100644
--- a/metadata-io/src/test/java/com/linkedin/metadata/structuredproperties/validators/ShowPropertyAsBadgeValidatorTest.java
+++ b/metadata-io/src/test/java/com/linkedin/metadata/structuredproperties/validators/ShowPropertyAsBadgeValidatorTest.java
@@ -58,7 +58,7 @@ public void setup() {
mockGraphRetriever = Mockito.mock(GraphRetriever.class);
retrieverContext =
io.datahubproject.metadata.context.RetrieverContext.builder()
- .aspectRetriever(mockAspectRetriever)
+ .cachingAspectRetriever(mockAspectRetriever)
.searchRetriever(mockSearchRetriever)
.graphRetriever(mockGraphRetriever)
.build();
diff --git a/metadata-io/src/test/java/io/datahubproject/test/DataGenerator.java b/metadata-io/src/test/java/io/datahubproject/test/DataGenerator.java
index 3acd2bf3413578..02cd28eb202e94 100644
--- a/metadata-io/src/test/java/io/datahubproject/test/DataGenerator.java
+++ b/metadata-io/src/test/java/io/datahubproject/test/DataGenerator.java
@@ -171,10 +171,7 @@ public Stream<List<MetadataChangeProposal>> generateMCPs(
DefaultAspectsUtil.getAdditionalChanges(
opContext,
AspectsBatchImpl.builder()
- .mcps(
- List.of(mcp),
- auditStamp,
- opContext.getRetrieverContext().get())
+ .mcps(List.of(mcp), auditStamp, opContext.getRetrieverContext())
.build()
.getMCPItems(),
entityService,
diff --git a/metadata-jobs/mae-consumer/src/test/java/com/linkedin/metadata/kafka/hook/spring/MCLSpringCommonTestConfiguration.java b/metadata-jobs/mae-consumer/src/test/java/com/linkedin/metadata/kafka/hook/spring/MCLSpringCommonTestConfiguration.java
index cf9d73dfa729be..f16c9dbd82e749 100644
--- a/metadata-jobs/mae-consumer/src/test/java/com/linkedin/metadata/kafka/hook/spring/MCLSpringCommonTestConfiguration.java
+++ b/metadata-jobs/mae-consumer/src/test/java/com/linkedin/metadata/kafka/hook/spring/MCLSpringCommonTestConfiguration.java
@@ -20,7 +20,6 @@
import com.linkedin.metadata.utils.elasticsearch.IndexConvention;
import io.datahubproject.metadata.context.OperationContext;
import io.datahubproject.metadata.context.OperationContextConfig;
-import io.datahubproject.metadata.context.RetrieverContext;
import io.datahubproject.metadata.context.ServicesRegistryContext;
import io.datahubproject.metadata.context.ValidationContext;
import io.datahubproject.test.metadata.context.TestOperationContexts;
@@ -95,7 +94,7 @@ public OperationContext operationContext(
entityRegistry,
mock(ServicesRegistryContext.class),
indexConvention,
- mock(RetrieverContext.class),
+ TestOperationContexts.emptyActiveUsersRetrieverContext(() -> entityRegistry),
mock(ValidationContext.class));
}
diff --git a/metadata-jobs/pe-consumer/src/test/java/com/datahub/event/hook/BusinessAttributeUpdateHookTest.java b/metadata-jobs/pe-consumer/src/test/java/com/datahub/event/hook/BusinessAttributeUpdateHookTest.java
index 47740b02d6166c..65ee6b8591f489 100644
--- a/metadata-jobs/pe-consumer/src/test/java/com/datahub/event/hook/BusinessAttributeUpdateHookTest.java
+++ b/metadata-jobs/pe-consumer/src/test/java/com/datahub/event/hook/BusinessAttributeUpdateHookTest.java
@@ -93,8 +93,6 @@ public void testMCLOnBusinessAttributeUpdate() throws Exception {
new RelatedEntity(BUSINESS_ATTRIBUTE_OF, SCHEMA_FIELD_URN.toString())));
when(opContext
- .getRetrieverContext()
- .get()
.getAspectRetriever()
.getLatestAspectObjects(
eq(Set.of(SCHEMA_FIELD_URN)), eq(Set.of(BUSINESS_ATTRIBUTE_ASPECT))))
@@ -108,7 +106,7 @@ public void testMCLOnBusinessAttributeUpdate() throws Exception {
// verify
// page 1
- Mockito.verify(opContext.getRetrieverContext().get().getGraphRetriever(), Mockito.times(1))
+ Mockito.verify(opContext.getRetrieverContext().getGraphRetriever(), Mockito.times(1))
.scrollRelatedEntities(
isNull(),
any(Filter.class),
@@ -122,7 +120,7 @@ public void testMCLOnBusinessAttributeUpdate() throws Exception {
isNull(),
isNull());
// page 2
- Mockito.verify(opContext.getRetrieverContext().get().getGraphRetriever(), Mockito.times(1))
+ Mockito.verify(opContext.getRetrieverContext().getGraphRetriever(), Mockito.times(1))
.scrollRelatedEntities(
isNull(),
any(Filter.class),
@@ -136,7 +134,7 @@ public void testMCLOnBusinessAttributeUpdate() throws Exception {
isNull(),
isNull());
- Mockito.verifyNoMoreInteractions(opContext.getRetrieverContext().get().getGraphRetriever());
+ Mockito.verifyNoMoreInteractions(opContext.getRetrieverContext().getGraphRetriever());
// 2 pages = 2 ingest proposals
Mockito.verify(mockUpdateIndicesService, Mockito.times(2))
@@ -152,8 +150,8 @@ private void testMCLOnInvalidCategory() throws Exception {
businessAttributeServiceHook.handleChangeEvent(opContext, platformEvent);
// verify
- Mockito.verifyNoInteractions(opContext.getRetrieverContext().get().getGraphRetriever());
- Mockito.verifyNoInteractions(opContext.getAspectRetrieverOpt().get());
+ Mockito.verifyNoInteractions(opContext.getRetrieverContext().getGraphRetriever());
+ Mockito.verifyNoInteractions(opContext.getAspectRetriever());
Mockito.verifyNoInteractions(mockUpdateIndicesService);
}
@@ -226,13 +224,15 @@ private OperationContext mockOperationContextWithGraph(List graph
RetrieverContext mockRetrieverContext = mock(RetrieverContext.class);
when(mockRetrieverContext.getAspectRetriever()).thenReturn(mock(AspectRetriever.class));
+ when(mockRetrieverContext.getCachingAspectRetriever())
+ .thenReturn(TestOperationContexts.emptyActiveUsersAspectRetriever(null));
when(mockRetrieverContext.getGraphRetriever()).thenReturn(graphRetriever);
OperationContext opContext =
TestOperationContexts.systemContextNoSearchAuthorization(mockRetrieverContext);
// reset mock for test
- reset(opContext.getAspectRetrieverOpt().get());
+ reset(opContext.getAspectRetriever());
if (!graphEdges.isEmpty()) {
diff --git a/metadata-models/src/main/pegasus/com/linkedin/platformresource/PlatformResourceType.pdl b/metadata-models/src/main/pegasus/com/linkedin/platformresource/PlatformResourceType.pdl
index 2f36eda9141abb..1a1dbea4359fbd 100644
--- a/metadata-models/src/main/pegasus/com/linkedin/platformresource/PlatformResourceType.pdl
+++ b/metadata-models/src/main/pegasus/com/linkedin/platformresource/PlatformResourceType.pdl
@@ -9,9 +9,13 @@ enum PlatformResourceType {
/**
* e.g. a Slack member resource, Looker user resource, etc.
*/
- USER_INFO,
+ USER_INFO,
/**
* e.g. a Slack channel
*/
CONVERSATION
+ /**
+ * e.g. Looker mapping of all user ids
+ */
+ USER_ID_MAPPING
}
diff --git a/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/ActorContext.java b/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/ActorContext.java
index e65bf22991736d..c08b7fad4dee32 100644
--- a/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/ActorContext.java
+++ b/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/ActorContext.java
@@ -1,12 +1,23 @@
package io.datahubproject.metadata.context;
+import static com.linkedin.metadata.Constants.CORP_USER_KEY_ASPECT_NAME;
+import static com.linkedin.metadata.Constants.CORP_USER_STATUS_ASPECT_NAME;
+import static com.linkedin.metadata.Constants.CORP_USER_STATUS_SUSPENDED;
+import static com.linkedin.metadata.Constants.STATUS_ASPECT_NAME;
+import static com.linkedin.metadata.Constants.SYSTEM_ACTOR;
+
import com.datahub.authentication.Authentication;
+import com.linkedin.common.Status;
import com.linkedin.common.urn.Urn;
import com.linkedin.common.urn.UrnUtils;
+import com.linkedin.entity.Aspect;
+import com.linkedin.identity.CorpUserStatus;
+import com.linkedin.metadata.aspect.AspectRetriever;
import com.linkedin.metadata.authorization.PoliciesConfig;
import com.linkedin.policy.DataHubPolicyInfo;
import java.util.Collection;
import java.util.Collections;
+import java.util.Map;
import java.util.Optional;
import java.util.Set;
import lombok.Builder;
@@ -48,6 +59,43 @@ public Urn getActorUrn() {
return UrnUtils.getUrn(authentication.getActor().toUrnStr());
}
+ /**
+   * An actor is considered active if the user is not hard-deleted, not soft-deleted, and not suspended.
+   *
+   * @param aspectRetriever aspect retriever, ideally the SystemEntityClient-backed one for caching
+ * @return active status
+ */
+ public boolean isActive(AspectRetriever aspectRetriever) {
+ // system cannot be disabled
+ if (SYSTEM_ACTOR.equals(authentication.getActor().toUrnStr())) {
+ return true;
+ }
+
+ Urn selfUrn = UrnUtils.getUrn(authentication.getActor().toUrnStr());
+    Map<Urn, Map<String, Aspect>> urnAspectMap =
+ aspectRetriever.getLatestAspectObjects(
+ Set.of(selfUrn),
+ Set.of(STATUS_ASPECT_NAME, CORP_USER_STATUS_ASPECT_NAME, CORP_USER_KEY_ASPECT_NAME));
+
+    Map<String, Aspect> aspectMap = urnAspectMap.getOrDefault(selfUrn, Map.of());
+
+ if (!aspectMap.containsKey(CORP_USER_KEY_ASPECT_NAME)) {
+ // user is hard deleted
+ return false;
+ }
+
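+    // missing aspects default to an active user: no Status aspect means not removed, no CorpUserStatus aspect means not suspended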
+ Status status =
+ Optional.ofNullable(aspectMap.get(STATUS_ASPECT_NAME))
+ .map(a -> new Status(a.data()))
+ .orElse(new Status().setRemoved(false));
+ CorpUserStatus corpUserStatus =
+ Optional.ofNullable(aspectMap.get(CORP_USER_STATUS_ASPECT_NAME))
+ .map(a -> new CorpUserStatus(a.data()))
+ .orElse(new CorpUserStatus().setStatus(""));
+
+ return !status.isRemoved() && !CORP_USER_STATUS_SUSPENDED.equals(corpUserStatus.getStatus());
+ }
+
/**
* The current implementation creates a cache entry unique for the set of policies.
*
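A minimal sketch of the rules the new isActive check encodes, assuming a seedable test retriever; actorContextFor, retrieverWith, and retrieverWithout are hypothetical helpers, not part of this change:

    ActorContext ctx = actorContextFor("urn:li:corpuser:jdoe");                 // hypothetical factory
    assertFalse(ctx.isActive(retrieverWithout(CORP_USER_KEY_ASPECT_NAME)));     // hard-deleted user
    assertFalse(ctx.isActive(retrieverWith(new Status().setRemoved(true))));    // soft-deleted user
    assertFalse(ctx.isActive(retrieverWith(
        new CorpUserStatus().setStatus(CORP_USER_STATUS_SUSPENDED))));          // suspended user
    assertTrue(ctx.isActive(retrieverWith(/* corpUserKey aspect only */)));     // active user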
diff --git a/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/OperationContext.java b/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/OperationContext.java
index 9a058c526647c2..9158129235b39e 100644
--- a/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/OperationContext.java
+++ b/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/OperationContext.java
@@ -16,6 +16,8 @@
import com.linkedin.metadata.query.SearchFlags;
import com.linkedin.metadata.utils.AuditStampUtils;
import com.linkedin.metadata.utils.elasticsearch.IndexConvention;
+import io.datahubproject.metadata.exception.ActorAccessException;
+import io.datahubproject.metadata.exception.OperationContextException;
import java.util.Collection;
import java.util.Objects;
import java.util.Optional;
@@ -63,6 +65,24 @@ public static OperationContext asSession(
@Nonnull Authorizer authorizer,
@Nonnull Authentication sessionAuthentication,
boolean allowSystemAuthentication) {
+ return OperationContext.asSession(
+ systemOperationContext,
+ requestContext,
+ authorizer,
+ sessionAuthentication,
+ allowSystemAuthentication,
+ false);
+ }
+
+ @Nonnull
+ public static OperationContext asSession(
+ OperationContext systemOperationContext,
+ @Nonnull RequestContext requestContext,
+ @Nonnull Authorizer authorizer,
+ @Nonnull Authentication sessionAuthentication,
+ boolean allowSystemAuthentication,
+ boolean skipCache)
+ throws ActorAccessException {
return systemOperationContext.toBuilder()
.operationContextConfig(
// update allowed system authentication
@@ -72,7 +92,7 @@ public static OperationContext asSession(
.authorizationContext(AuthorizationContext.builder().authorizer(authorizer).build())
.requestContext(requestContext)
.validationContext(systemOperationContext.getValidationContext())
- .build(sessionAuthentication);
+ .build(sessionAuthentication, skipCache);
}
/**
@@ -85,10 +105,14 @@ public static OperationContext asSession(
public static OperationContext withSearchFlags(
OperationContext opContext, Function<SearchFlags, SearchFlags> flagDefaults) {
- return opContext.toBuilder()
- // update search flags for the request's session
- .searchContext(opContext.getSearchContext().withFlagDefaults(flagDefaults))
- .build(opContext.getSessionActorContext());
+ try {
+ return opContext.toBuilder()
+ // update search flags for the request's session
+ .searchContext(opContext.getSearchContext().withFlagDefaults(flagDefaults))
+ .build(opContext.getSessionActorContext(), false);
+ } catch (OperationContextException e) {
+ throw new RuntimeException(e);
+ }
}
/**
@@ -101,10 +125,14 @@ public static OperationContext withSearchFlags(
public static OperationContext withLineageFlags(
OperationContext opContext, Function<LineageFlags, LineageFlags> flagDefaults) {
- return opContext.toBuilder()
- // update lineage flags for the request's session
- .searchContext(opContext.getSearchContext().withLineageFlagDefaults(flagDefaults))
- .build(opContext.getSessionActorContext());
+ try {
+ return opContext.toBuilder()
+ // update lineage flags for the request's session
+ .searchContext(opContext.getSearchContext().withLineageFlagDefaults(flagDefaults))
+ .build(opContext.getSessionActorContext(), false);
+ } catch (OperationContextException e) {
+ throw new RuntimeException(e);
+ }
}
/**
@@ -155,18 +183,22 @@ public static OperationContext asSystem(
? SearchContext.EMPTY
: SearchContext.builder().indexConvention(indexConvention).build();
- return OperationContext.builder()
- .operationContextConfig(systemConfig)
- .systemActorContext(systemActorContext)
- .searchContext(systemSearchContext)
- .entityRegistryContext(EntityRegistryContext.builder().build(entityRegistry))
- .servicesRegistryContext(servicesRegistryContext)
- // Authorizer.EMPTY doesn't actually apply to system auth
- .authorizationContext(AuthorizationContext.builder().authorizer(Authorizer.EMPTY).build())
- .retrieverContext(retrieverContext)
- .objectMapperContext(objectMapperContext)
- .validationContext(validationContext)
- .build(systemAuthentication);
+ try {
+ return OperationContext.builder()
+ .operationContextConfig(systemConfig)
+ .systemActorContext(systemActorContext)
+ .searchContext(systemSearchContext)
+ .entityRegistryContext(EntityRegistryContext.builder().build(entityRegistry))
+ .servicesRegistryContext(servicesRegistryContext)
+ // Authorizer.EMPTY doesn't actually apply to system auth
+ .authorizationContext(AuthorizationContext.builder().authorizer(Authorizer.EMPTY).build())
+ .retrieverContext(retrieverContext)
+ .objectMapperContext(objectMapperContext)
+ .validationContext(validationContext)
+ .build(systemAuthentication, false);
+ } catch (OperationContextException e) {
+ throw new RuntimeException(e);
+ }
}
@Nonnull private final OperationContextConfig operationContextConfig;
@@ -177,7 +209,7 @@ public static OperationContext asSystem(
@Nonnull private final EntityRegistryContext entityRegistryContext;
@Nullable private final ServicesRegistryContext servicesRegistryContext;
@Nullable private final RequestContext requestContext;
- @Nullable private final RetrieverContext retrieverContext;
+ @Nonnull private final RetrieverContext retrieverContext;
@Nonnull private final ObjectMapperContext objectMapperContext;
@Nonnull private final ValidationContext validationContext;
@@ -194,13 +226,15 @@ public OperationContext withLineageFlags(
public OperationContext asSession(
@Nonnull RequestContext requestContext,
@Nonnull Authorizer authorizer,
- @Nonnull Authentication sessionAuthentication) {
+ @Nonnull Authentication sessionAuthentication)
+ throws ActorAccessException {
return OperationContext.asSession(
this,
requestContext,
authorizer,
sessionAuthentication,
- getOperationContextConfig().isAllowSystemAuthentication());
+ getOperationContextConfig().isAllowSystemAuthentication(),
+ false);
}
@Nonnull
@@ -284,17 +318,9 @@ public AuditStamp getAuditStamp() {
return getAuditStamp(null);
}
- public Optional<RetrieverContext> getRetrieverContext() {
- return Optional.ofNullable(retrieverContext);
- }
-
- @Nullable
+ @Nonnull
public AspectRetriever getAspectRetriever() {
- return getAspectRetrieverOpt().orElse(null);
- }
-
- public Optional<AspectRetriever> getAspectRetrieverOpt() {
- return getRetrieverContext().map(RetrieverContext::getAspectRetriever);
+ return retrieverContext.getAspectRetriever();
}
/**
@@ -336,10 +362,7 @@ public String getGlobalContextId() {
? EmptyContext.EMPTY
: getServicesRegistryContext())
.add(getRequestContext() == null ? EmptyContext.EMPTY : getRequestContext())
- .add(
- getRetrieverContext().isPresent()
- ? getRetrieverContext().get()
- : EmptyContext.EMPTY)
+ .add(getRetrieverContext())
.add(getObjectMapperContext())
.build()
.stream()
@@ -364,10 +387,7 @@ public String getSearchContextId() {
getServicesRegistryContext() == null
? EmptyContext.EMPTY
: getServicesRegistryContext())
- .add(
- getRetrieverContext().isPresent()
- ? getRetrieverContext().get()
- : EmptyContext.EMPTY)
+ .add(getRetrieverContext())
.build()
.stream()
.map(ContextInterface::getCacheKeyComponent)
@@ -438,6 +458,12 @@ public static class OperationContextBuilder {
@Nonnull
public OperationContext build(@Nonnull Authentication sessionAuthentication) {
+ return build(sessionAuthentication, false);
+ }
+
+ @Nonnull
+ public OperationContext build(
+ @Nonnull Authentication sessionAuthentication, boolean skipCache) {
final Urn actorUrn = UrnUtils.getUrn(sessionAuthentication.getActor().toUrnStr());
final ActorContext sessionActor =
ActorContext.builder()
@@ -451,11 +477,20 @@ public OperationContext build(@Nonnull Authentication sessionAuthentication) {
.policyInfoSet(this.authorizationContext.getAuthorizer().getActorPolicies(actorUrn))
.groupMembership(this.authorizationContext.getAuthorizer().getActorGroups(actorUrn))
.build();
- return build(sessionActor);
+ return build(sessionActor, skipCache);
}
@Nonnull
- public OperationContext build(@Nonnull ActorContext sessionActor) {
+ public OperationContext build(@Nonnull ActorContext sessionActor, boolean skipCache) {
+ AspectRetriever retriever =
+ skipCache
+ ? this.retrieverContext.getAspectRetriever()
+ : this.retrieverContext.getCachingAspectRetriever();
+
+ if (!sessionActor.isActive(retriever)) {
+ throw new ActorAccessException("Actor is not active");
+ }
+
return new OperationContext(
this.operationContextConfig,
sessionActor,
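Because the builder now rejects inactive actors, asSession gains an explicit failure path. A minimal call-site sketch, assuming a request handler that maps the failure to an auth error (names are illustrative):

    try {
      OperationContext sessionContext =
          systemOperationContext.asSession(requestContext, authorizer, sessionAuthentication);
      // continue handling the request with sessionContext
    } catch (ActorAccessException e) {
      // the actor is hard-deleted, soft-deleted, or suspended; respond with 401/403
    }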
diff --git a/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/RetrieverContext.java b/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/RetrieverContext.java
index 9337fbfe3bb003..9afc4138810bb2 100644
--- a/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/RetrieverContext.java
+++ b/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/RetrieverContext.java
@@ -1,8 +1,10 @@
package io.datahubproject.metadata.context;
import com.linkedin.metadata.aspect.AspectRetriever;
+import com.linkedin.metadata.aspect.CachingAspectRetriever;
import com.linkedin.metadata.aspect.GraphRetriever;
import com.linkedin.metadata.entity.SearchRetriever;
+import java.util.Objects;
import java.util.Optional;
import javax.annotation.Nonnull;
import lombok.Builder;
@@ -15,10 +17,37 @@ public class RetrieverContext
@Nonnull private final GraphRetriever graphRetriever;
@Nonnull private final AspectRetriever aspectRetriever;
+ @Nonnull private final CachingAspectRetriever cachingAspectRetriever;
@Nonnull private final SearchRetriever searchRetriever;
@Override
public Optional getCacheKeyComponent() {
return Optional.empty();
}
+
+ public static class RetrieverContextBuilder {
+ public RetrieverContext build() {
+ if (this.aspectRetriever == null && this.cachingAspectRetriever != null) {
+ this.aspectRetriever = this.cachingAspectRetriever;
+ }
+
+ if (this.cachingAspectRetriever == null
+ && this.aspectRetriever instanceof CachingAspectRetriever) {
+ this.cachingAspectRetriever = (CachingAspectRetriever) this.aspectRetriever;
+ }
+
+ return new RetrieverContext(
+ this.graphRetriever,
+ Objects.requireNonNull(this.aspectRetriever),
+ Objects.requireNonNull(this.cachingAspectRetriever),
+ this.searchRetriever);
+ }
+ }
+
+ public static final RetrieverContext EMPTY =
+ RetrieverContext.builder()
+ .graphRetriever(GraphRetriever.EMPTY)
+ .searchRetriever(SearchRetriever.EMPTY)
+ .cachingAspectRetriever(CachingAspectRetriever.EMPTY)
+ .build();
}
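A minimal sketch of the builder defaulting above: supplying only a caching retriever satisfies both @Nonnull retriever fields, since the plain aspectRetriever falls back to it (myCachingRetriever is a placeholder instance):

    RetrieverContext ctx =
        RetrieverContext.builder()
            .graphRetriever(GraphRetriever.EMPTY)
            .searchRetriever(SearchRetriever.EMPTY)
            .cachingAspectRetriever(myCachingRetriever)
            .build();
    // ctx.getAspectRetriever() and ctx.getCachingAspectRetriever() both return myCachingRetriever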
diff --git a/metadata-operation-context/src/main/java/io/datahubproject/metadata/exception/ActorAccessException.java b/metadata-operation-context/src/main/java/io/datahubproject/metadata/exception/ActorAccessException.java
new file mode 100644
index 00000000000000..bca2594b96430e
--- /dev/null
+++ b/metadata-operation-context/src/main/java/io/datahubproject/metadata/exception/ActorAccessException.java
@@ -0,0 +1,7 @@
+package io.datahubproject.metadata.exception;
+
+public class ActorAccessException extends OperationContextException {
+ public ActorAccessException(String string) {
+ super(string);
+ }
+}
diff --git a/metadata-operation-context/src/main/java/io/datahubproject/metadata/exception/OperationContextException.java b/metadata-operation-context/src/main/java/io/datahubproject/metadata/exception/OperationContextException.java
new file mode 100644
index 00000000000000..1aac8dc3e60ec9
--- /dev/null
+++ b/metadata-operation-context/src/main/java/io/datahubproject/metadata/exception/OperationContextException.java
@@ -0,0 +1,9 @@
+package io.datahubproject.metadata.exception;
+
+public class OperationContextException extends RuntimeException {
+ public OperationContextException(String message) {
+ super(message);
+ }
+
+ public OperationContextException() {}
+}
diff --git a/metadata-operation-context/src/main/java/io/datahubproject/test/metadata/context/TestOperationContexts.java b/metadata-operation-context/src/main/java/io/datahubproject/test/metadata/context/TestOperationContexts.java
index 42de6b7398c616..4abfbb196f067c 100644
--- a/metadata-operation-context/src/main/java/io/datahubproject/test/metadata/context/TestOperationContexts.java
+++ b/metadata-operation-context/src/main/java/io/datahubproject/test/metadata/context/TestOperationContexts.java
@@ -8,21 +8,17 @@
import com.linkedin.common.urn.Urn;
import com.linkedin.data.schema.annotation.PathSpecBasedSchemaAnnotationVisitor;
import com.linkedin.entity.Aspect;
+import com.linkedin.identity.CorpUserInfo;
+import com.linkedin.metadata.Constants;
import com.linkedin.metadata.aspect.AspectRetriever;
+import com.linkedin.metadata.aspect.CachingAspectRetriever;
import com.linkedin.metadata.aspect.GraphRetriever;
-import com.linkedin.metadata.aspect.SystemAspect;
-import com.linkedin.metadata.aspect.models.graph.RelatedEntitiesScrollResult;
import com.linkedin.metadata.entity.SearchRetriever;
import com.linkedin.metadata.models.registry.ConfigEntityRegistry;
import com.linkedin.metadata.models.registry.EntityRegistry;
import com.linkedin.metadata.models.registry.EntityRegistryException;
import com.linkedin.metadata.models.registry.MergedEntityRegistry;
import com.linkedin.metadata.models.registry.SnapshotEntityRegistry;
-import com.linkedin.metadata.query.filter.Filter;
-import com.linkedin.metadata.query.filter.RelationshipFilter;
-import com.linkedin.metadata.query.filter.SortCriterion;
-import com.linkedin.metadata.search.ScrollResult;
-import com.linkedin.metadata.search.SearchEntityArray;
import com.linkedin.metadata.snapshot.Snapshot;
import com.linkedin.metadata.utils.elasticsearch.IndexConvention;
import com.linkedin.metadata.utils.elasticsearch.IndexConventionImpl;
@@ -32,15 +28,14 @@
import io.datahubproject.metadata.context.RetrieverContext;
import io.datahubproject.metadata.context.ServicesRegistryContext;
import io.datahubproject.metadata.context.ValidationContext;
-import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.function.Consumer;
import java.util.function.Supplier;
+import java.util.stream.Collectors;
import javax.annotation.Nonnull;
import javax.annotation.Nullable;
-import lombok.Builder;
/**
* Useful for testing. If the defaults are not sufficient, try using the .toBuilder() and replacing
@@ -81,26 +76,53 @@ public static EntityRegistry defaultEntityRegistry() {
return defaultEntityRegistryInstance;
}
- public static AspectRetriever emptyAspectRetriever(
+ public static RetrieverContext emptyActiveUsersRetrieverContext(
@Nullable Supplier<EntityRegistry> entityRegistrySupplier) {
- return new EmptyAspectRetriever(
- () ->
- Optional.ofNullable(entityRegistrySupplier)
- .map(Supplier::get)
- .orElse(defaultEntityRegistry()));
- }
- public static GraphRetriever emptyGraphRetriever = new EmptyGraphRetriever();
- public static SearchRetriever emptySearchRetriever = new EmptySearchRetriever();
+ return RetrieverContext.builder()
+ .cachingAspectRetriever(emptyActiveUsersAspectRetriever(entityRegistrySupplier))
+ .graphRetriever(GraphRetriever.EMPTY)
+ .searchRetriever(SearchRetriever.EMPTY)
+ .build();
+ }
- public static RetrieverContext emptyRetrieverContext(
+ public static CachingAspectRetriever emptyActiveUsersAspectRetriever(
@Nullable Supplier<EntityRegistry> entityRegistrySupplier) {
- return RetrieverContext.builder()
- .aspectRetriever(emptyAspectRetriever(entityRegistrySupplier))
- .graphRetriever(emptyGraphRetriever)
- .searchRetriever(emptySearchRetriever)
- .build();
+ return new CachingAspectRetriever.EmptyAspectRetriever() {
+
+ @Nonnull
+ @Override
+ public Map<Urn, Map<String, Aspect>> getLatestAspectObjects(
+ Set<Urn> urns, Set<String> aspectNames) {
+ if (urns.stream().allMatch(urn -> urn.toString().startsWith("urn:li:corpuser:"))
+ && aspectNames.contains(Constants.CORP_USER_KEY_ASPECT_NAME)) {
+ return urns.stream()
+ .map(
+ urn ->
+ Map.entry(
+ urn,
+ Map.of(
+ Constants.CORP_USER_KEY_ASPECT_NAME,
+ new Aspect(
+ new CorpUserInfo()
+ .setActive(true)
+ .setEmail(urn.getId())
+ .setDisplayName(urn.getId())
+ .data()))))
+ .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));
+ }
+ return super.getLatestAspectObjects(urns, aspectNames);
+ }
+
+ @Nonnull
+ @Override
+ public EntityRegistry getEntityRegistry() {
+ return Optional.ofNullable(entityRegistrySupplier)
+ .map(Supplier::get)
+ .orElse(defaultEntityRegistry());
+ }
+ };
}
public static OperationContext systemContextNoSearchAuthorization(
@@ -140,8 +162,10 @@ public static OperationContext systemContextNoSearchAuthorization(
RetrieverContext retrieverContext =
RetrieverContext.builder()
.aspectRetriever(aspectRetriever)
- .graphRetriever(emptyGraphRetriever)
- .searchRetriever(emptySearchRetriever)
+ .cachingAspectRetriever(
+ emptyActiveUsersAspectRetriever(() -> aspectRetriever.getEntityRegistry()))
+ .graphRetriever(GraphRetriever.EMPTY)
+ .searchRetriever(SearchRetriever.EMPTY)
.build();
return systemContextNoSearchAuthorization(
() -> retrieverContext.getAspectRetriever().getEntityRegistry(),
@@ -208,7 +232,7 @@ public static OperationContext systemContext(
RetrieverContext retrieverContext =
Optional.ofNullable(retrieverContextSupplier)
.map(Supplier::get)
- .orElse(emptyRetrieverContext(entityRegistrySupplier));
+ .orElse(emptyActiveUsersRetrieverContext(entityRegistrySupplier));
EntityRegistry entityRegistry =
Optional.ofNullable(entityRegistrySupplier)
@@ -298,66 +322,5 @@ public static OperationContext userContextNoSearchAuthorization(
.asSession(requestContext, Authorizer.EMPTY, TEST_USER_AUTH);
}
- @Builder
- public static class EmptyAspectRetriever implements AspectRetriever {
- private final Supplier<EntityRegistry> entityRegistrySupplier;
-
- @Nonnull
- @Override
- public Map<Urn, Map<String, Aspect>> getLatestAspectObjects(
- Set<Urn> urns, Set<String> aspectNames) {
- return Map.of();
- }
-
- @Nonnull
- @Override
- public Map<Urn, Map<String, SystemAspect>> getLatestSystemAspects(
- Map<Urn, Set<String>> urnAspectNames) {
- return Map.of();
- }
-
- @Nonnull
- @Override
- public EntityRegistry getEntityRegistry() {
- return entityRegistrySupplier.get();
- }
- }
-
- public static class EmptyGraphRetriever implements GraphRetriever {
-
- @Nonnull
- @Override
- public RelatedEntitiesScrollResult scrollRelatedEntities(
- @Nullable List<String> sourceTypes,
- @Nonnull Filter sourceEntityFilter,
- @Nullable List<String> destinationTypes,
- @Nonnull Filter destinationEntityFilter,
- @Nonnull List<String> relationshipTypes,
- @Nonnull RelationshipFilter relationshipFilter,
- @Nonnull List<SortCriterion> sortCriterion,
- @Nullable String scrollId,
- int count,
- @Nullable Long startTimeMillis,
- @Nullable Long endTimeMillis) {
- return new RelatedEntitiesScrollResult(0, 0, null, List.of());
- }
- }
-
- public static class EmptySearchRetriever implements SearchRetriever {
-
- @Override
- public ScrollResult scroll(
- @Nonnull List<String> entities,
- @Nullable Filter filters,
- @Nullable String scrollId,
- int count) {
- ScrollResult empty = new ScrollResult();
- empty.setEntities(new SearchEntityArray());
- empty.setNumEntities(0);
- empty.setPageSize(0);
- return empty;
- }
- }
-
private TestOperationContexts() {}
}
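With the empty retrievers replaced by the "active users" variants, tests that previously mocked the retriever context can use the helper so the session actor passes the activity check. A minimal usage sketch:

    OperationContext opContext =
        TestOperationContexts.systemContextNoSearchAuthorization(
            TestOperationContexts.emptyActiveUsersRetrieverContext(null));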
diff --git a/metadata-operation-context/src/test/java/io/datahubproject/metadata/context/OperationContextTest.java b/metadata-operation-context/src/test/java/io/datahubproject/metadata/context/OperationContextTest.java
index 3e092e20127ee5..f77b244d8f2d86 100644
--- a/metadata-operation-context/src/test/java/io/datahubproject/metadata/context/OperationContextTest.java
+++ b/metadata-operation-context/src/test/java/io/datahubproject/metadata/context/OperationContextTest.java
@@ -8,6 +8,7 @@
import com.datahub.authentication.Authentication;
import com.datahub.plugins.auth.authorization.Authorizer;
import com.linkedin.metadata.models.registry.EntityRegistry;
+import io.datahubproject.test.metadata.context.TestOperationContexts;
import org.testng.annotations.Test;
public class OperationContextTest {
@@ -25,7 +26,7 @@ public void testSystemPrivilegeEscalation() {
mock(EntityRegistry.class),
mock(ServicesRegistryContext.class),
null,
- mock(RetrieverContext.class),
+ TestOperationContexts.emptyActiveUsersRetrieverContext(null),
mock(ValidationContext.class));
OperationContext opContext =
diff --git a/metadata-service/auth-impl/src/main/java/com/datahub/authentication/token/StatefulTokenService.java b/metadata-service/auth-impl/src/main/java/com/datahub/authentication/token/StatefulTokenService.java
index 6724f35d840adb..a9871f1ed99482 100644
--- a/metadata-service/auth-impl/src/main/java/com/datahub/authentication/token/StatefulTokenService.java
+++ b/metadata-service/auth-impl/src/main/java/com/datahub/authentication/token/StatefulTokenService.java
@@ -145,7 +145,7 @@ public String generateAccessToken(
_entityService.ingestProposal(
systemOperationContext,
AspectsBatchImpl.builder()
- .mcps(List.of(proposal), auditStamp, systemOperationContext.getRetrieverContext().get())
+ .mcps(List.of(proposal), auditStamp, systemOperationContext.getRetrieverContext())
.build(),
false);
diff --git a/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/EbeanConfiguration.java b/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/EbeanConfiguration.java
index 47b406e695a3fb..6eb31e14a2d3b0 100644
--- a/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/EbeanConfiguration.java
+++ b/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/EbeanConfiguration.java
@@ -23,6 +23,7 @@ public class EbeanConfiguration {
private boolean autoCreateDdl;
private boolean postgresUseIamAuth;
private LockingConfiguration locking;
+ private String batchGetMethod;
public static final EbeanConfiguration testDefault =
EbeanConfiguration.builder().locking(LockingConfiguration.testDefault).build();
diff --git a/metadata-service/configuration/src/main/resources/application.yaml b/metadata-service/configuration/src/main/resources/application.yaml
index 9348416606d0a9..b997bc108e4ba1 100644
--- a/metadata-service/configuration/src/main/resources/application.yaml
+++ b/metadata-service/configuration/src/main/resources/application.yaml
@@ -164,6 +164,7 @@ ebean:
waitTimeoutMillis: ${EBEAN_WAIT_TIMEOUT_MILLIS:1000}
autoCreateDdl: ${EBEAN_AUTOCREATE:false}
postgresUseIamAuth: ${EBEAN_POSTGRES_USE_AWS_IAM_AUTH:false}
+ batchGetMethod: ${EBEAN_BATCH_GET_METHOD:IN} # Alternative UNION
locking:
enabled: ${EBEAN_LOCKING_ENABLED:false}
durationSeconds: ${EBEAN_LOCKING_DURATION_SECONDS:60}
@@ -522,12 +523,12 @@ cache:
entityAspectTTLSeconds:
# cache user aspects for 20s
corpuser:
- corpUserKey: 20
+ corpUserKey: 300 # 5 min
corpUserInfo: 20
corpUserEditableInfo: 20
- corpUserStatus: 20
+ corpUserStatus: 300 # 5 min
globalTags: 20
- status: 20
+ status: 300 # 5 min
corpUserCredentials: 20
corpUserSettings: 20
roleMembership: 20
@@ -561,7 +562,7 @@ springdoc.api-docs.groups.enabled: true
forms:
hook:
- enabled: { $FORMS_HOOK_ENABLED:true }
+ enabled: ${FORMS_HOOK_ENABLED:true}
consumerGroupSuffix: ${FORMS_HOOK_CONSUMER_GROUP_SUFFIX:}
businessAttribute:
diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/context/SystemOperationContextFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/context/SystemOperationContextFactory.java
index f5235dc3682fce..3e2823591e168c 100644
--- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/context/SystemOperationContextFactory.java
+++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/context/SystemOperationContextFactory.java
@@ -45,7 +45,8 @@ protected OperationContext javaSystemOperationContext(
@Nonnull final SearchService searchService,
@Qualifier("baseElasticSearchComponents")
BaseElasticSearchComponentsFactory.BaseElasticSearchComponents components,
- @Nonnull final ConfigurationProvider configurationProvider) {
+ @Nonnull final ConfigurationProvider configurationProvider,
+ @Qualifier("systemEntityClient") @Nonnull final SystemEntityClient systemEntityClient) {
EntityServiceAspectRetriever entityServiceAspectRetriever =
EntityServiceAspectRetriever.builder()
@@ -53,6 +54,9 @@ protected OperationContext javaSystemOperationContext(
.entityService(entityService)
.build();
+ EntityClientAspectRetriever entityClientAspectRetriever =
+ EntityClientAspectRetriever.builder().entityClient(systemEntityClient).build();
+
SystemGraphRetriever systemGraphRetriever =
SystemGraphRetriever.builder().graphService(graphService).build();
@@ -68,6 +72,7 @@ protected OperationContext javaSystemOperationContext(
components.getIndexConvention(),
RetrieverContext.builder()
.aspectRetriever(entityServiceAspectRetriever)
+ .cachingAspectRetriever(entityClientAspectRetriever)
.graphRetriever(systemGraphRetriever)
.searchRetriever(searchServiceSearchRetriever)
.build(),
@@ -76,6 +81,7 @@ protected OperationContext javaSystemOperationContext(
configurationProvider.getFeatureFlags().isAlternateMCPValidation())
.build());
+ entityClientAspectRetriever.setSystemOperationContext(systemOperationContext);
entityServiceAspectRetriever.setSystemOperationContext(systemOperationContext);
systemGraphRetriever.setSystemOperationContext(systemOperationContext);
searchServiceSearchRetriever.setSystemOperationContext(systemOperationContext);
@@ -104,7 +110,7 @@ protected OperationContext restliSystemOperationContext(
BaseElasticSearchComponentsFactory.BaseElasticSearchComponents components,
@Nonnull final ConfigurationProvider configurationProvider) {
- EntityClientAspectRetriever entityServiceAspectRetriever =
+ EntityClientAspectRetriever entityClientAspectRetriever =
EntityClientAspectRetriever.builder().entityClient(systemEntityClient).build();
SystemGraphRetriever systemGraphRetriever =
@@ -121,7 +127,7 @@ protected OperationContext restliSystemOperationContext(
ServicesRegistryContext.builder().restrictedService(restrictedService).build(),
components.getIndexConvention(),
RetrieverContext.builder()
- .aspectRetriever(entityServiceAspectRetriever)
+ .cachingAspectRetriever(entityClientAspectRetriever)
.graphRetriever(systemGraphRetriever)
.searchRetriever(searchServiceSearchRetriever)
.build(),
@@ -130,7 +136,7 @@ protected OperationContext restliSystemOperationContext(
configurationProvider.getFeatureFlags().isAlternateMCPValidation())
.build());
- entityServiceAspectRetriever.setSystemOperationContext(systemOperationContext);
+ entityClientAspectRetriever.setSystemOperationContext(systemOperationContext);
systemGraphRetriever.setSystemOperationContext(systemOperationContext);
searchServiceSearchRetriever.setSystemOperationContext(systemOperationContext);
diff --git a/metadata-service/factories/src/main/java/com/linkedin/metadata/boot/steps/IngestDataPlatformInstancesStep.java b/metadata-service/factories/src/main/java/com/linkedin/metadata/boot/steps/IngestDataPlatformInstancesStep.java
index 22ce06a5984ea6..c04dd25ccd4ac9 100644
--- a/metadata-service/factories/src/main/java/com/linkedin/metadata/boot/steps/IngestDataPlatformInstancesStep.java
+++ b/metadata-service/factories/src/main/java/com/linkedin/metadata/boot/steps/IngestDataPlatformInstancesStep.java
@@ -84,14 +84,14 @@ public void execute(@Nonnull OperationContext systemOperationContext) throws Exc
.aspectName(DATA_PLATFORM_INSTANCE_ASPECT_NAME)
.recordTemplate(dataPlatformInstance.get())
.auditStamp(aspectAuditStamp)
- .build(systemOperationContext.getAspectRetrieverOpt().get()));
+ .build(systemOperationContext.getAspectRetriever()));
}
}
_entityService.ingestAspects(
systemOperationContext,
AspectsBatchImpl.builder()
- .retrieverContext(systemOperationContext.getRetrieverContext().get())
+ .retrieverContext(systemOperationContext.getRetrieverContext())
.items(items)
.build(),
true,
diff --git a/metadata-service/factories/src/main/java/com/linkedin/metadata/boot/steps/IngestPoliciesStep.java b/metadata-service/factories/src/main/java/com/linkedin/metadata/boot/steps/IngestPoliciesStep.java
index eb6bfe17ac198e..dac2879487469c 100644
--- a/metadata-service/factories/src/main/java/com/linkedin/metadata/boot/steps/IngestPoliciesStep.java
+++ b/metadata-service/factories/src/main/java/com/linkedin/metadata/boot/steps/IngestPoliciesStep.java
@@ -225,7 +225,7 @@ private void ingestPolicy(
new AuditStamp()
.setActor(Urn.createFromString(Constants.SYSTEM_ACTOR))
.setTime(System.currentTimeMillis()),
- systemOperationContext.getRetrieverContext().get())
+ systemOperationContext.getRetrieverContext())
.build(),
false);
}
diff --git a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/GlobalControllerExceptionHandler.java b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/config/GlobalControllerExceptionHandler.java
similarity index 81%
rename from metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/GlobalControllerExceptionHandler.java
rename to metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/config/GlobalControllerExceptionHandler.java
index ba0a426fa20e89..c756827cad56ba 100644
--- a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/GlobalControllerExceptionHandler.java
+++ b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/config/GlobalControllerExceptionHandler.java
@@ -1,9 +1,11 @@
-package io.datahubproject.openapi;
+package io.datahubproject.openapi.config;
import com.linkedin.metadata.dao.throttle.APIThrottleException;
+import io.datahubproject.metadata.exception.ActorAccessException;
import io.datahubproject.openapi.exception.InvalidUrnException;
import io.datahubproject.openapi.exception.UnauthorizedException;
import java.util.Map;
+import javax.annotation.PostConstruct;
import lombok.extern.slf4j.Slf4j;
import org.springframework.beans.ConversionNotSupportedException;
import org.springframework.core.Ordered;
@@ -19,6 +21,11 @@
@ControllerAdvice
public class GlobalControllerExceptionHandler extends DefaultHandlerExceptionResolver {
+ @PostConstruct
+ public void init() {
+ log.info("GlobalControllerExceptionHandler initialized");
+ }
+
public GlobalControllerExceptionHandler() {
setOrder(Ordered.HIGHEST_PRECEDENCE);
setWarnLogCategory(getClass().getName());
@@ -52,4 +59,9 @@ public static ResponseEntity