diff --git a/.github/workflows/check-datahub-jars.yml b/.github/workflows/check-datahub-jars.yml index 7a49f32729ec1f..dc770f7fc83a61 100644 --- a/.github/workflows/check-datahub-jars.yml +++ b/.github/workflows/check-datahub-jars.yml @@ -5,12 +5,12 @@ on: branches: - master paths: - - "metadata-integration" + - "metadata-integration/**" pull_request: branches: - "**" paths: - - "metadata-integration" + - "metadata-integration/**" release: types: [published] @@ -28,15 +28,22 @@ jobs: runs-on: ubuntu-latest steps: - uses: acryldata/sane-checkout-action@v3 + - uses: actions/setup-python@v5 + with: + python-version: "3.10" + - uses: actions/cache@v4 + with: + path: | + ~/.cache/uv + key: ${{ runner.os }}-uv-${{ hashFiles('**/requirements.txt') }} + - name: Install dependencies + run: ./metadata-ingestion/scripts/install_deps.sh - name: Set up JDK 17 uses: actions/setup-java@v4 with: distribution: "zulu" java-version: 17 - uses: gradle/actions/setup-gradle@v3 - - uses: actions/setup-python@v5 - with: - python-version: "3.10" - name: check ${{ matrix.command }} jar run: | ./gradlew :metadata-integration:java:${{ matrix.command }}:build --info diff --git a/build.gradle b/build.gradle index 6893a2ca93d365..a3d807a7333494 100644 --- a/build.gradle +++ b/build.gradle @@ -373,6 +373,7 @@ configure(subprojects.findAll {! it.name.startsWith('spark-lineage')}) { exclude group: "org.slf4j", module: "slf4j-log4j12" exclude group: "org.slf4j", module: "slf4j-nop" exclude group: "org.slf4j", module: "slf4j-ext" + exclude group: "org.codehaus.jackson", module: "jackson-mapper-asl" resolutionStrategy.force externalDependency.antlr4Runtime resolutionStrategy.force externalDependency.antlr4 diff --git a/datahub-frontend/app/auth/AuthModule.java b/datahub-frontend/app/auth/AuthModule.java index 7fa99ab3cb2621..b95515684f01fc 100644 --- a/datahub-frontend/app/auth/AuthModule.java +++ b/datahub-frontend/app/auth/AuthModule.java @@ -27,6 +27,7 @@ import io.datahubproject.metadata.context.EntityRegistryContext; import io.datahubproject.metadata.context.OperationContext; import io.datahubproject.metadata.context.OperationContextConfig; +import io.datahubproject.metadata.context.RetrieverContext; import io.datahubproject.metadata.context.SearchContext; import io.datahubproject.metadata.context.ValidationContext; import java.nio.charset.StandardCharsets; @@ -195,6 +196,7 @@ protected OperationContext provideOperationContext( .searchContext(SearchContext.EMPTY) .entityRegistryContext(EntityRegistryContext.builder().build(EmptyEntityRegistry.EMPTY)) .validationContext(ValidationContext.builder().alternateValidation(false).build()) + .retrieverContext(RetrieverContext.EMPTY) .build(systemAuthentication); } diff --git a/datahub-frontend/conf/logback.xml b/datahub-frontend/conf/logback.xml index 78da231b4a71c5..de37c56cba38a7 100644 --- a/datahub-frontend/conf/logback.xml +++ b/datahub-frontend/conf/logback.xml @@ -61,7 +61,7 @@ - + diff --git a/datahub-upgrade/build.gradle b/datahub-upgrade/build.gradle index 372b0eb0570b98..a3b2e9ad6b3e22 100644 --- a/datahub-upgrade/build.gradle +++ b/datahub-upgrade/build.gradle @@ -60,7 +60,7 @@ dependencies { // mock internal schema registry implementation externalDependency.kafkaAvroSerde implementation externalDependency.kafkaAvroSerializer - implementation "org.apache.kafka:kafka_2.12:3.7.1" + implementation "org.apache.kafka:kafka_2.13:3.7.2" implementation externalDependency.slf4jApi compileOnly externalDependency.lombok diff --git 
a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/config/SystemUpdateConfig.java b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/config/SystemUpdateConfig.java index 661717c6309cfc..fdd84da6044f73 100644 --- a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/config/SystemUpdateConfig.java +++ b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/config/SystemUpdateConfig.java @@ -13,6 +13,7 @@ import com.linkedin.gms.factory.kafka.common.TopicConventionFactory; import com.linkedin.gms.factory.kafka.schemaregistry.InternalSchemaRegistryFactory; import com.linkedin.gms.factory.search.BaseElasticSearchComponentsFactory; +import com.linkedin.metadata.aspect.CachingAspectRetriever; import com.linkedin.metadata.config.kafka.KafkaConfiguration; import com.linkedin.metadata.dao.producer.KafkaEventProducer; import com.linkedin.metadata.dao.producer.KafkaHealthChecker; @@ -186,6 +187,7 @@ protected OperationContext javaSystemOperationContext( components.getIndexConvention(), RetrieverContext.builder() .aspectRetriever(entityServiceAspectRetriever) + .cachingAspectRetriever(CachingAspectRetriever.EMPTY) .graphRetriever(systemGraphRetriever) .searchRetriever(searchServiceSearchRetriever) .build(), diff --git a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/restorebackup/RestoreStorageStep.java b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/restorebackup/RestoreStorageStep.java index 4d53b603c1eaff..1e5cd6cdb24174 100644 --- a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/restorebackup/RestoreStorageStep.java +++ b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/restorebackup/RestoreStorageStep.java @@ -180,7 +180,7 @@ private void readerExecutable(ReaderWrapper reader, UpgradeContext context) { try { aspectRecord = EntityUtils.toSystemAspect( - context.opContext().getRetrieverContext().get(), aspect.toEntityAspect()) + context.opContext().getRetrieverContext(), aspect.toEntityAspect()) .get() .getRecordTemplate(); } catch (Exception e) { diff --git a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/AbstractMCLStep.java b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/AbstractMCLStep.java index cd7947ce3c11aa..56feffd211bcd7 100644 --- a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/AbstractMCLStep.java +++ b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/AbstractMCLStep.java @@ -113,8 +113,7 @@ public Function executable() { List, SystemAspect>> futures; futures = EntityUtils.toSystemAspectFromEbeanAspects( - opContext.getRetrieverContext().get(), - batch.collect(Collectors.toList())) + opContext.getRetrieverContext(), batch.collect(Collectors.toList())) .stream() .map( systemAspect -> { diff --git a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/bootstrapmcps/BootstrapMCPUtil.java b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/bootstrapmcps/BootstrapMCPUtil.java index 4cc3edff3eb52d..5b807c6c450afb 100644 --- a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/bootstrapmcps/BootstrapMCPUtil.java +++ b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/bootstrapmcps/BootstrapMCPUtil.java @@ -100,8 +100,8 @@ static AspectsBatch generateAspectBatch( .collect(Collectors.toList()); return AspectsBatchImpl.builder() - .mcps(mcps, auditStamp, opContext.getRetrieverContext().get()) - .retrieverContext(opContext.getRetrieverContext().get()) + .mcps(mcps, auditStamp, 
opContext.getRetrieverContext()) + .retrieverContext(opContext.getRetrieverContext()) .build(); } diff --git a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/dataprocessinstances/BackfillDataProcessInstancesHasRunEventsStep.java b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/dataprocessinstances/BackfillDataProcessInstancesHasRunEventsStep.java index 55cdcae931ab5b..1bdea10123999a 100644 --- a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/dataprocessinstances/BackfillDataProcessInstancesHasRunEventsStep.java +++ b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/dataprocessinstances/BackfillDataProcessInstancesHasRunEventsStep.java @@ -2,6 +2,8 @@ import static com.linkedin.metadata.Constants.*; +import com.fasterxml.jackson.databind.node.JsonNodeFactory; +import com.fasterxml.jackson.databind.node.ObjectNode; import com.google.common.base.Throwables; import com.linkedin.common.urn.Urn; import com.linkedin.datahub.upgrade.UpgradeContext; @@ -23,8 +25,6 @@ import java.util.Set; import java.util.function.Function; import lombok.extern.slf4j.Slf4j; -import org.codehaus.jackson.node.JsonNodeFactory; -import org.codehaus.jackson.node.ObjectNode; import org.opensearch.action.search.SearchRequest; import org.opensearch.action.search.SearchResponse; import org.opensearch.client.RequestOptions; diff --git a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/schemafield/GenerateSchemaFieldsFromSchemaMetadataStep.java b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/schemafield/GenerateSchemaFieldsFromSchemaMetadataStep.java index 55bc8edbf6a768..de03538907432f 100644 --- a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/schemafield/GenerateSchemaFieldsFromSchemaMetadataStep.java +++ b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/schemafield/GenerateSchemaFieldsFromSchemaMetadataStep.java @@ -168,13 +168,13 @@ public Function executable() { AspectsBatch aspectsBatch = AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items( batch .flatMap( ebeanAspectV2 -> EntityUtils.toSystemAspectFromEbeanAspects( - opContext.getRetrieverContext().get(), + opContext.getRetrieverContext(), Set.of(ebeanAspectV2)) .stream()) .map( @@ -189,11 +189,7 @@ public Function executable() { .auditStamp(systemAspect.getAuditStamp()) .systemMetadata( withAppSource(systemAspect.getSystemMetadata())) - .build( - opContext - .getRetrieverContext() - .get() - .getAspectRetriever())) + .build(opContext.getAspectRetriever())) .collect(Collectors.toList())) .build(); diff --git a/datahub-upgrade/src/test/java/com/linkedin/datahub/upgrade/schemafield/GenerateSchemaFieldsFromSchemaMetadataStepTest.java b/datahub-upgrade/src/test/java/com/linkedin/datahub/upgrade/schemafield/GenerateSchemaFieldsFromSchemaMetadataStepTest.java index 3a2728b4e1d3d6..04b1095e770e0e 100644 --- a/datahub-upgrade/src/test/java/com/linkedin/datahub/upgrade/schemafield/GenerateSchemaFieldsFromSchemaMetadataStepTest.java +++ b/datahub-upgrade/src/test/java/com/linkedin/datahub/upgrade/schemafield/GenerateSchemaFieldsFromSchemaMetadataStepTest.java @@ -22,7 +22,6 @@ import com.linkedin.upgrade.DataHubUpgradeState; import io.datahubproject.metadata.context.OperationContext; import io.datahubproject.metadata.context.RetrieverContext; -import java.util.Optional; import java.util.stream.Stream; import 
org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; @@ -48,7 +47,7 @@ public void setup() { step = new GenerateSchemaFieldsFromSchemaMetadataStep( mockOpContext, mockEntityService, mockAspectDao, 10, 100, 1000); - when(mockOpContext.getRetrieverContext()).thenReturn(Optional.of(mockRetrieverContext)); + when(mockOpContext.getRetrieverContext()).thenReturn(mockRetrieverContext); } /** Test to verify the correct step ID is returned. */ diff --git a/datahub-web-react/src/app/entity/schemaField/SchemaFieldPropertiesEntity.tsx b/datahub-web-react/src/app/entity/schemaField/SchemaFieldPropertiesEntity.tsx index 2c59c476195d0b..fdc0e33d77a057 100644 --- a/datahub-web-react/src/app/entity/schemaField/SchemaFieldPropertiesEntity.tsx +++ b/datahub-web-react/src/app/entity/schemaField/SchemaFieldPropertiesEntity.tsx @@ -35,11 +35,9 @@ export class SchemaFieldPropertiesEntity implements Entity { // Currently unused. getPathName = () => 'schemaField'; - // Currently unused. - getEntityName = () => 'schemaField'; + getEntityName = () => 'Column'; - // Currently unused. - getCollectionName = () => 'schemaFields'; + getCollectionName = () => 'Columns'; // Currently unused. renderProfile = (_: string) => <>; diff --git a/datahub-web-react/src/app/entity/shared/components/styled/DeprecationPill.tsx b/datahub-web-react/src/app/entity/shared/components/styled/DeprecationPill.tsx index 08e9636f760de5..613264709ac23c 100644 --- a/datahub-web-react/src/app/entity/shared/components/styled/DeprecationPill.tsx +++ b/datahub-web-react/src/app/entity/shared/components/styled/DeprecationPill.tsx @@ -19,8 +19,6 @@ const DeprecatedContainer = styled.div` justify-content: center; align-items: center; color: #cd0d24; - margin-left: 0px; - margin-right: 8px; padding-top: 8px; padding-bottom: 8px; padding-right: 4px; diff --git a/datahub-web-react/src/app/entity/shared/components/styled/StructuredProperty/MultipleStringInput.tsx b/datahub-web-react/src/app/entity/shared/components/styled/StructuredProperty/MultipleOpenEndedInput.tsx similarity index 87% rename from datahub-web-react/src/app/entity/shared/components/styled/StructuredProperty/MultipleStringInput.tsx rename to datahub-web-react/src/app/entity/shared/components/styled/StructuredProperty/MultipleOpenEndedInput.tsx index fe6c0bbb99ce22..fe6cd1115419ae 100644 --- a/datahub-web-react/src/app/entity/shared/components/styled/StructuredProperty/MultipleStringInput.tsx +++ b/datahub-web-react/src/app/entity/shared/components/styled/StructuredProperty/MultipleOpenEndedInput.tsx @@ -4,6 +4,8 @@ import React from 'react'; import styled from 'styled-components'; import { ANTD_GRAY_V2 } from '../../../constants'; +const MultiStringWrapper = styled.div``; + const StyledInput = styled(Input)` width: 75%; min-width: 350px; @@ -29,10 +31,11 @@ const DeleteButton = styled(Button)` interface Props { selectedValues: any[]; + inputType?: string; updateSelectedValues: (values: any[]) => void; } -export default function MultipleStringInput({ selectedValues, updateSelectedValues }: Props) { +export default function MultipleOpenEndedInput({ selectedValues, updateSelectedValues, inputType = 'text' }: Props) { function updateInput(text: string, index: number) { const updatedValues = selectedValues.length > 0 ? selectedValues.map((value, i) => (i === index ? text : value)) : [text]; @@ -53,14 +56,14 @@ export default function MultipleStringInput({ selectedValues, updateSelectedValu } return ( -
+ {selectedValues.length > 1 && selectedValues.map((selectedValue, index) => { const key = `${index}`; return ( updateInput(e.target.value, index)} /> @@ -70,7 +73,7 @@ export default function MultipleStringInput({ selectedValues, updateSelectedValu })} {selectedValues.length <= 1 && ( updateInput(e.target.value, 0)} /> @@ -78,6 +81,6 @@ export default function MultipleStringInput({ selectedValues, updateSelectedValu + Add More -
+ ); } diff --git a/datahub-web-react/src/app/entity/shared/components/styled/StructuredProperty/NumberInput.tsx b/datahub-web-react/src/app/entity/shared/components/styled/StructuredProperty/NumberInput.tsx index c56d85db7ef712..f4cedc4cf80ee5 100644 --- a/datahub-web-react/src/app/entity/shared/components/styled/StructuredProperty/NumberInput.tsx +++ b/datahub-web-react/src/app/entity/shared/components/styled/StructuredProperty/NumberInput.tsx @@ -1,7 +1,9 @@ import { Input } from 'antd'; import React, { ChangeEvent } from 'react'; import styled from 'styled-components'; +import { PropertyCardinality } from '@src/types.generated'; import { ANTD_GRAY_V2 } from '../../../constants'; +import MultipleOpenEndedInput from './MultipleOpenEndedInput'; const StyledInput = styled(Input)` border: 1px solid ${ANTD_GRAY_V2[6]}; @@ -10,15 +12,31 @@ const StyledInput = styled(Input)` interface Props { selectedValues: any[]; + cardinality?: PropertyCardinality | null; updateSelectedValues: (values: string[] | number[]) => void; } -export default function NumberInput({ selectedValues, updateSelectedValues }: Props) { +export default function NumberInput({ selectedValues, cardinality, updateSelectedValues }: Props) { function updateInput(event: ChangeEvent) { const number = Number(event.target.value); updateSelectedValues([number]); } + function updateMultipleValues(values: string[] | number[]) { + const numbers = values.map((v) => Number(v)); + updateSelectedValues(numbers); + } + + if (cardinality === PropertyCardinality.Multiple) { + return ( + + ); + } + return ( ; + return ; } - return ; + return ( + + ); } diff --git a/datahub-web-react/src/app/entity/shared/components/styled/StructuredProperty/StructuredPropertyInput.tsx b/datahub-web-react/src/app/entity/shared/components/styled/StructuredProperty/StructuredPropertyInput.tsx index 894a304335b0f6..305347ee0bce80 100644 --- a/datahub-web-react/src/app/entity/shared/components/styled/StructuredProperty/StructuredPropertyInput.tsx +++ b/datahub-web-react/src/app/entity/shared/components/styled/StructuredProperty/StructuredPropertyInput.tsx @@ -60,7 +60,11 @@ export default function StructuredPropertyInput({ )} {!allowedValues && valueType.info.type === StdDataType.Number && ( - + )} {!allowedValues && valueType.info.type === StdDataType.Urn && ( { + it('should not return parent rows when there are none', () => { + const propertyRows = [ + { displayName: 'test1', qualifiedName: 'test1' }, + { displayName: 'test2', qualifiedName: 'test2' }, + ]; + expect(identifyAndAddParentRows(propertyRows)).toMatchObject([]); + }); + + it('should not return parent rows when another row starts with the same letters but is a different token', () => { + const propertyRows = [ + { displayName: 'test1', qualifiedName: 'testing.one' }, + { displayName: 'test2', qualifiedName: 'testingAgain.two' }, + ]; + expect(identifyAndAddParentRows(propertyRows)).toMatchObject([]); + }); + + it('should return parent rows properly', () => { + const propertyRows = [ + { displayName: 'test1', qualifiedName: 'testing.one' }, + { displayName: 'test2', qualifiedName: 'testing.two' }, + { displayName: 'test3', qualifiedName: 'testing.three' }, + ]; + expect(identifyAndAddParentRows(propertyRows)).toMatchObject([ + { displayName: 'testing', qualifiedName: 'testing', childrenCount: 3 }, + ]); + }); + + it('should return parent rows properly with multiple layers of nesting', () => { + const propertyRows = [ + { displayName: 'test1', qualifiedName: 'testing.one.two.a.1' }, + { displayName: 
'test1', qualifiedName: 'testing.one.two.a.2' }, + { displayName: 'test1', qualifiedName: 'testing.one.two.b' }, + { displayName: 'test1', qualifiedName: 'testing.one.three' }, + { displayName: 'test2', qualifiedName: 'testing.two.c.d' }, + { displayName: 'test3', qualifiedName: 'testing.three' }, + { displayName: 'test3', qualifiedName: 'testParent' }, + ]; + expect(identifyAndAddParentRows(propertyRows)).toMatchObject([ + { displayName: 'testing', qualifiedName: 'testing', isParentRow: true, childrenCount: 6 }, + { displayName: 'testing.one', qualifiedName: 'testing.one', isParentRow: true, childrenCount: 4 }, + { displayName: 'testing.one.two', qualifiedName: 'testing.one.two', isParentRow: true, childrenCount: 3 }, + { + displayName: 'testing.one.two.a', + qualifiedName: 'testing.one.two.a', + isParentRow: true, + childrenCount: 2, + }, + ]); + }); + + it('should return parent rows properly with multiple layers of nesting regardless of order', () => { + const propertyRows = [ + { displayName: 'test1', qualifiedName: 'testing.one.two.a.1' }, + { displayName: 'test3', qualifiedName: 'testParent' }, + { displayName: 'test1', qualifiedName: 'testing.one.three' }, + { displayName: 'test2', qualifiedName: 'testing.two.c.d' }, + { displayName: 'test1', qualifiedName: 'testing.one.two.b' }, + { displayName: 'test3', qualifiedName: 'testing.three' }, + { displayName: 'test1', qualifiedName: 'testing.one.two.a.2' }, + ]; + expect(identifyAndAddParentRows(propertyRows)).toMatchObject([ + { displayName: 'testing', qualifiedName: 'testing', isParentRow: true, childrenCount: 6 }, + { displayName: 'testing.one', qualifiedName: 'testing.one', isParentRow: true, childrenCount: 4 }, + { displayName: 'testing.one.two', qualifiedName: 'testing.one.two', isParentRow: true, childrenCount: 3 }, + { + displayName: 'testing.one.two.a', + qualifiedName: 'testing.one.two.a', + isParentRow: true, + childrenCount: 2, + }, + ]); + }); + + it('should return parent rows properly with simpler layers of nesting', () => { + const propertyRows = [ + { displayName: 'test2', qualifiedName: 'testing.two.c.d' }, + { displayName: 'test3', qualifiedName: 'testing.three' }, + { displayName: 'test3', qualifiedName: 'testParent' }, + ]; + expect(identifyAndAddParentRows(propertyRows)).toMatchObject([ + { displayName: 'testing', qualifiedName: 'testing', isParentRow: true, childrenCount: 2 }, + ]); + }); +}); diff --git a/datahub-web-react/src/app/entity/shared/tabs/Properties/useStructuredProperties.tsx b/datahub-web-react/src/app/entity/shared/tabs/Properties/useStructuredProperties.tsx index 18ee6bb18da3d3..60d0aac30eb4ce 100644 --- a/datahub-web-react/src/app/entity/shared/tabs/Properties/useStructuredProperties.tsx +++ b/datahub-web-react/src/app/entity/shared/tabs/Properties/useStructuredProperties.tsx @@ -122,10 +122,10 @@ export function identifyAndAddParentRows(rows?: Array): Array name.startsWith(token)).length; + const currentCount = qualifiedNames.filter((name) => name.startsWith(`${token}.`)).length; - // If we're at the beginning of the path and there is no nesting, break - if (index === 0 && currentCount === 1) { + // If there's only one child, don't nest it + if (currentCount === 1) { break; } diff --git a/datahub-web-react/src/app/govern/structuredProperties/AllowedValuesDrawer.tsx b/datahub-web-react/src/app/govern/structuredProperties/AllowedValuesDrawer.tsx index f1dccb6db0c22c..16c07e8257cd9b 100644 --- a/datahub-web-react/src/app/govern/structuredProperties/AllowedValuesDrawer.tsx +++ 
b/datahub-web-react/src/app/govern/structuredProperties/AllowedValuesDrawer.tsx @@ -127,6 +127,7 @@ const AllowedValuesDrawer = ({ setTimeout(() => scrollToBottom(), 0); }} color="violet" + type="button" > Add diff --git a/datahub-web-react/src/app/govern/structuredProperties/DisplayPreferences.tsx b/datahub-web-react/src/app/govern/structuredProperties/DisplayPreferences.tsx index 260c91ef93207c..95823de0f27c40 100644 --- a/datahub-web-react/src/app/govern/structuredProperties/DisplayPreferences.tsx +++ b/datahub-web-react/src/app/govern/structuredProperties/DisplayPreferences.tsx @@ -153,7 +153,8 @@ const DisplayPreferences = ({ clickable={false} />  is already being shown on asset previews, but only one property is allowed at a time. - Do you want to replace the current property? This will hide PropVal on all asset previews. + Do you want to replace the current property? This will hide {getDisplayName(badgeProperty)}{' '} + on all asset previews.
} /> diff --git a/datahub-web-react/src/app/govern/structuredProperties/StructuredPropsDrawer.tsx b/datahub-web-react/src/app/govern/structuredProperties/StructuredPropsDrawer.tsx index 4b2bbaaf96826b..debffeac7d583c 100644 --- a/datahub-web-react/src/app/govern/structuredProperties/StructuredPropsDrawer.tsx +++ b/datahub-web-react/src/app/govern/structuredProperties/StructuredPropsDrawer.tsx @@ -192,6 +192,7 @@ const StructuredPropsDrawer = ({ form.validateFields().then(() => { const createInput = { ...form.getFieldsValue(), + qualifiedName: form.getFieldValue('qualifiedName') || undefined, valueType: valueTypes.find((type) => type.value === form.getFieldValue('valueType'))?.urn, allowedValues, cardinality, diff --git a/datahub-web-react/src/app/govern/structuredProperties/cacheUtils.ts b/datahub-web-react/src/app/govern/structuredProperties/cacheUtils.ts index 590189d06e6b16..c8052784c6972a 100644 --- a/datahub-web-react/src/app/govern/structuredProperties/cacheUtils.ts +++ b/datahub-web-react/src/app/govern/structuredProperties/cacheUtils.ts @@ -17,7 +17,6 @@ const addToCache = (existingProperties, newProperty) => { allowedValues: newProperty.definition.allowedValues, created: newProperty.definition.created, lastModified: newProperty.definition.lastModified, - filterStatus: newProperty.definition.filterStatus, }, settings: { isHidden: newProperty.settings.isHidden, diff --git a/datahub-web-react/src/app/preview/DefaultPreviewCard.tsx b/datahub-web-react/src/app/preview/DefaultPreviewCard.tsx index 4c8948a6664e07..a19862e83ae510 100644 --- a/datahub-web-react/src/app/preview/DefaultPreviewCard.tsx +++ b/datahub-web-react/src/app/preview/DefaultPreviewCard.tsx @@ -68,6 +68,7 @@ const TitleContainer = styled.div` const EntityTitleContainer = styled.div` display: flex; align-items: center; + gap: 8px; `; const EntityTitle = styled(Typography.Text)<{ $titleSizePx?: number }>` @@ -77,7 +78,6 @@ const EntityTitle = styled(Typography.Text)<{ $titleSizePx?: number }>` } &&& { - margin-right 8px; font-size: ${(props) => props.$titleSizePx || 16}px; font-weight: 600; vertical-align: middle; diff --git a/datahub-web-react/src/graphql/search.graphql b/datahub-web-react/src/graphql/search.graphql index ce0fde27f4c425..58c9a51f3d7e90 100644 --- a/datahub-web-react/src/graphql/search.graphql +++ b/datahub-web-react/src/graphql/search.graphql @@ -963,6 +963,7 @@ fragment facetFields on FacetMetadata { entity { urn type + ...entityDisplayNameFields ... on Tag { name properties { diff --git a/docker/kafka-setup/Dockerfile b/docker/kafka-setup/Dockerfile index a11f823f5efa55..324357b942e8e1 100644 --- a/docker/kafka-setup/Dockerfile +++ b/docker/kafka-setup/Dockerfile @@ -22,7 +22,7 @@ ARG ALPINE_REPO_URL ARG APACHE_DOWNLOAD_URL ARG GITHUB_REPO_URL -ENV KAFKA_VERSION=3.7.1 +ENV KAFKA_VERSION=3.7.2 ENV SCALA_VERSION=2.13 LABEL name="kafka" version=${KAFKA_VERSION} diff --git a/docs-website/vercel-setup.sh b/docs-website/vercel-setup.sh index 4bb40eaddf4775..e9ba87b75be779 100755 --- a/docs-website/vercel-setup.sh +++ b/docs-website/vercel-setup.sh @@ -5,8 +5,8 @@ set -euxo pipefail ./metadata-ingestion/scripts/install_deps.sh # Set up java version for gradle -yum install java-17-amazon-corretto -y -java --version +yum install java-17-amazon-corretto-devel -y +javac --version # Build python from source. 
# Amazon Linux 2 has Python 3.8, but it's version of OpenSSL is super old and hence it diff --git a/docs/cli.md b/docs/cli.md index c633b7f4a38ad3..1c38077d0d12ef 100644 --- a/docs/cli.md +++ b/docs/cli.md @@ -115,6 +115,19 @@ datahub ingest -c ./examples/recipes/example_to_datahub_rest.dhub.yaml --dry-run datahub ingest -c ./examples/recipes/example_to_datahub_rest.dhub.yaml -n ``` +#### ingest --list-source-runs + +The `--list-source-runs` option of the `ingest` command lists the previous runs, displaying their run ID, source name, +start time, status, and source URN. This command allows you to filter results using the `--urn` option for URN-based +filtering or the `--source` option to filter by source name (partial or complete matches are supported). + +```shell +# List all ingestion runs +datahub ingest --list-source-runs +# Filter runs by a source name containing "demo" +datahub ingest --list-source-runs --source "demo" +``` + #### ingest --preview The `--preview` option of the `ingest` command performs all of the ingestion steps, but limits the processing to only the first 10 workunits produced by the source. diff --git a/docs/how/delete-metadata.md b/docs/how/delete-metadata.md index f720a66ce57652..e36940bf398356 100644 --- a/docs/how/delete-metadata.md +++ b/docs/how/delete-metadata.md @@ -4,7 +4,7 @@ To follow this guide, you'll need the [DataHub CLI](../cli.md). ::: -There are a two ways to delete metadata from DataHub: +There are two ways to delete metadata from DataHub: 1. Delete metadata attached to entities by providing a specific urn or filters that identify a set of urns (delete CLI). 2. Delete metadata created by a single ingestion run (rollback). @@ -233,7 +233,13 @@ To view the ids of the most recent set of ingestion batches, execute datahub ingest list-runs ``` -That will print out a table of all the runs. Once you have an idea of which run you want to roll back, run +That will print out a table of all the runs. To see run statuses or to filter runs by URN or source, run + +```shell +datahub ingest list-source-runs +``` + +Once you have an idea of which run you want to roll back, run ```shell datahub ingest show --run-id <run_id> diff --git a/docs/lineage/airflow.md b/docs/lineage/airflow.md index 72b5cbf57592d3..345213a0672d37 100644 --- a/docs/lineage/airflow.md +++ b/docs/lineage/airflow.md @@ -339,6 +339,37 @@ TypeError: on_task_instance_success() missing 3 required positional arguments: ' The solution is to upgrade `acryl-datahub-airflow-plugin>=0.12.0.4` or upgrade `pluggy>=1.2.0`. See this [PR](https://github.com/datahub-project/datahub/pull/9365) for details. +### Disabling the DataHub Plugin v2 + +There are two ways to disable the DataHub Plugin v2: + +#### 1. Disable via Configuration + +Set the `datahub.enabled` configuration property to `False` in the `airflow.cfg` file and restart the Airflow environment to reload the configuration and disable the plugin. + +```ini title="airflow.cfg" +[datahub] +enabled = False +``` + +#### 2. Disable via Airflow Variable (Kill-Switch) + +If a restart is not possible and you need a faster way to disable the plugin, you can use the kill-switch. Create and set the `datahub_airflow_plugin_disable_listener` Airflow variable to `true`. This ensures that the listener won't process anything. + +#### Command Line + +```shell +airflow variables set datahub_airflow_plugin_disable_listener true +``` + +#### Airflow UI + +1. Go to Admin -> Variables. +2. Click the "+" symbol to create a new variable. +3. 
Set the key to `datahub_airflow_plugin_disable_listener` and the value to `true`. + +This will immediately disable the plugin without requiring a restart. + ## Compatibility We no longer officially support Airflow <2.3. However, you can use older versions of `acryl-datahub-airflow-plugin` with older versions of Airflow. diff --git a/entity-registry/src/main/java/com/linkedin/metadata/aspect/CachingAspectRetriever.java b/entity-registry/src/main/java/com/linkedin/metadata/aspect/CachingAspectRetriever.java index 77e799f752455c..375dd8cf8911e1 100644 --- a/entity-registry/src/main/java/com/linkedin/metadata/aspect/CachingAspectRetriever.java +++ b/entity-registry/src/main/java/com/linkedin/metadata/aspect/CachingAspectRetriever.java @@ -1,4 +1,38 @@ package com.linkedin.metadata.aspect; +import com.linkedin.common.urn.Urn; +import com.linkedin.entity.Aspect; +import com.linkedin.metadata.models.registry.EmptyEntityRegistry; +import com.linkedin.metadata.models.registry.EntityRegistry; +import java.util.Collections; +import java.util.Map; +import java.util.Set; +import javax.annotation.Nonnull; + /** Responses can be cached based on application.yaml caching configuration for the EntityClient */ -public interface CachingAspectRetriever extends AspectRetriever {} +public interface CachingAspectRetriever extends AspectRetriever { + + CachingAspectRetriever EMPTY = new EmptyAspectRetriever(); + + class EmptyAspectRetriever implements CachingAspectRetriever { + @Nonnull + @Override + public Map> getLatestAspectObjects( + Set urns, Set aspectNames) { + return Collections.emptyMap(); + } + + @Nonnull + @Override + public Map> getLatestSystemAspects( + Map> urnAspectNames) { + return Collections.emptyMap(); + } + + @Nonnull + @Override + public EntityRegistry getEntityRegistry() { + return EmptyEntityRegistry.EMPTY; + } + } +} diff --git a/entity-registry/src/main/java/com/linkedin/metadata/aspect/GraphRetriever.java b/entity-registry/src/main/java/com/linkedin/metadata/aspect/GraphRetriever.java index f6858e7da4ba63..30a2c1eb9df8c1 100644 --- a/entity-registry/src/main/java/com/linkedin/metadata/aspect/GraphRetriever.java +++ b/entity-registry/src/main/java/com/linkedin/metadata/aspect/GraphRetriever.java @@ -4,6 +4,7 @@ import com.linkedin.metadata.query.filter.Filter; import com.linkedin.metadata.query.filter.RelationshipFilter; import com.linkedin.metadata.query.filter.SortCriterion; +import java.util.Collections; import java.util.List; import java.util.function.Function; import javax.annotation.Nonnull; @@ -97,4 +98,26 @@ default void consumeRelatedEntities( } } } + + GraphRetriever EMPTY = new EmptyGraphRetriever(); + + class EmptyGraphRetriever implements GraphRetriever { + + @Nonnull + @Override + public RelatedEntitiesScrollResult scrollRelatedEntities( + @Nullable List sourceTypes, + @Nonnull Filter sourceEntityFilter, + @Nullable List destinationTypes, + @Nonnull Filter destinationEntityFilter, + @Nonnull List relationshipTypes, + @Nonnull RelationshipFilter relationshipFilter, + @Nonnull List sortCriterion, + @Nullable String scrollId, + int count, + @Nullable Long startTimeMillis, + @Nullable Long endTimeMillis) { + return new RelatedEntitiesScrollResult(0, 0, null, Collections.emptyList()); + } + } } diff --git a/entity-registry/src/main/java/com/linkedin/metadata/aspect/patch/builder/DataJobInputOutputPatchBuilder.java b/entity-registry/src/main/java/com/linkedin/metadata/aspect/patch/builder/DataJobInputOutputPatchBuilder.java index 6fffb17521ddb7..14fc92a1bf3c86 100644 --- 
a/entity-registry/src/main/java/com/linkedin/metadata/aspect/patch/builder/DataJobInputOutputPatchBuilder.java +++ b/entity-registry/src/main/java/com/linkedin/metadata/aspect/patch/builder/DataJobInputOutputPatchBuilder.java @@ -15,6 +15,8 @@ import com.linkedin.metadata.aspect.patch.PatchOperationType; import com.linkedin.metadata.graph.LineageDirection; import javax.annotation.Nonnull; +import javax.annotation.Nullable; +import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.tuple.ImmutableTriple; public class DataJobInputOutputPatchBuilder @@ -24,6 +26,7 @@ public class DataJobInputOutputPatchBuilder private static final String OUTPUT_DATASET_EDGES_PATH_START = "/outputDatasetEdges/"; private static final String INPUT_DATASET_FIELDS_PATH_START = "/inputDatasetFields/"; private static final String OUTPUT_DATASET_FIELDS_PATH_START = "/outputDatasetFields/"; + private static final String FINE_GRAINED_PATH_START = "/fineGrainedLineages/"; // Simplified with just Urn public DataJobInputOutputPatchBuilder addInputDatajobEdge(@Nonnull DataJobUrn dataJobUrn) { @@ -136,6 +139,103 @@ public DataJobInputOutputPatchBuilder addEdge( return this; } + /** + * Adds a field as a fine grained upstream + * + * @param upstreamSchemaField a schema field to be marked as upstream, format: + * urn:li:schemaField(DATASET_URN, COLUMN NAME) + * @param confidenceScore optional, confidence score for the lineage edge. Defaults to 1.0 for + * full confidence + * @param transformationOperation string operation type that describes the transformation + * operation happening in the lineage edge + * @param downstreamSchemaField the downstream schema field this upstream is derived from, format: + * urn:li:schemaField(DATASET_URN, COLUMN NAME) + * @param queryUrn query urn the relationship is derived from + * @return this builder + */ + public DataJobInputOutputPatchBuilder addFineGrainedUpstreamField( + @Nonnull Urn upstreamSchemaField, + @Nullable Float confidenceScore, + @Nonnull String transformationOperation, + @Nonnull Urn downstreamSchemaField, + @Nullable Urn queryUrn) { + Float finalConfidenceScore = getConfidenceScoreOrDefault(confidenceScore); + String finalQueryUrn; + if (queryUrn == null || StringUtils.isBlank(queryUrn.toString())) { + finalQueryUrn = "NONE"; + } else { + finalQueryUrn = queryUrn.toString(); + } + + ObjectNode fineGrainedLineageNode = instance.objectNode(); + fineGrainedLineageNode.put("confidenceScore", instance.numberNode(finalConfidenceScore)); + pathValues.add( + ImmutableTriple.of( + PatchOperationType.ADD.getValue(), + FINE_GRAINED_PATH_START + + transformationOperation + + "/" + + encodeValueUrn(downstreamSchemaField) + + "/" + + finalQueryUrn + + "/" + + encodeValueUrn(upstreamSchemaField), + fineGrainedLineageNode)); + + return this; + } + + private Float getConfidenceScoreOrDefault(@Nullable Float confidenceScore) { + float finalConfidenceScore; + if (confidenceScore != null && confidenceScore > 0 && confidenceScore <= 1.0f) { + finalConfidenceScore = confidenceScore; + } else { + finalConfidenceScore = 1.0f; + } + + return finalConfidenceScore; + } + + /** + * Removes a field as a fine grained upstream + * + * @param upstreamSchemaField a schema field to be marked as upstream, format: + * urn:li:schemaField(DATASET_URN, COLUMN NAME) + * @param transformationOperation string operation type that describes the transformation + * operation happening in the lineage edge + * @param downstreamSchemaField the downstream schema field this upstream is derived from, 
format: + * urn:li:schemaField(DATASET_URN, COLUMN NAME) + * @param queryUrn query urn the relationship is derived from + * @return this builder + */ + public DataJobInputOutputPatchBuilder removeFineGrainedUpstreamField( + @Nonnull Urn upstreamSchemaField, + @Nonnull String transformationOperation, + @Nonnull Urn downstreamSchemaField, + @Nullable Urn queryUrn) { + + String finalQueryUrn; + if (queryUrn == null || StringUtils.isBlank(queryUrn.toString())) { + finalQueryUrn = "NONE"; + } else { + finalQueryUrn = queryUrn.toString(); + } + pathValues.add( + ImmutableTriple.of( + PatchOperationType.REMOVE.getValue(), + FINE_GRAINED_PATH_START + + transformationOperation + + "/" + + encodeValueUrn(downstreamSchemaField) + + "/" + + finalQueryUrn + + "/" + + encodeValueUrn(upstreamSchemaField), + null)); + + return this; + } + public DataJobInputOutputPatchBuilder removeEdge( @Nonnull Edge edge, @Nonnull LineageDirection direction) { String path = getEdgePath(edge, direction); diff --git a/entity-registry/src/main/java/com/linkedin/metadata/aspect/patch/builder/UpstreamLineagePatchBuilder.java b/entity-registry/src/main/java/com/linkedin/metadata/aspect/patch/builder/UpstreamLineagePatchBuilder.java index 08182761aeb03f..d0a46a35d51820 100644 --- a/entity-registry/src/main/java/com/linkedin/metadata/aspect/patch/builder/UpstreamLineagePatchBuilder.java +++ b/entity-registry/src/main/java/com/linkedin/metadata/aspect/patch/builder/UpstreamLineagePatchBuilder.java @@ -142,7 +142,7 @@ public UpstreamLineagePatchBuilder removeFineGrainedUpstreamField( FINE_GRAINED_PATH_START + transformationOperation + "/" - + downstreamSchemaField + + encodeValueUrn(downstreamSchemaField) + "/" + finalQueryUrn + "/" diff --git a/entity-registry/src/main/java/com/linkedin/metadata/aspect/patch/template/FineGrainedLineageTemplateHelper.java b/entity-registry/src/main/java/com/linkedin/metadata/aspect/patch/template/FineGrainedLineageTemplateHelper.java new file mode 100644 index 00000000000000..1f6a58c52ba248 --- /dev/null +++ b/entity-registry/src/main/java/com/linkedin/metadata/aspect/patch/template/FineGrainedLineageTemplateHelper.java @@ -0,0 +1,282 @@ +package com.linkedin.metadata.aspect.patch.template; + +import static com.fasterxml.jackson.databind.node.JsonNodeFactory.*; +import static com.linkedin.metadata.Constants.*; + +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.node.ArrayNode; +import com.fasterxml.jackson.databind.node.ObjectNode; +import com.google.common.collect.Streams; +import com.linkedin.common.urn.Urn; +import com.linkedin.common.urn.UrnUtils; +import java.util.concurrent.atomic.AtomicReference; +import java.util.stream.Collectors; +import javax.annotation.Nullable; +import org.codehaus.plexus.util.StringUtils; + +public class FineGrainedLineageTemplateHelper { + + private static final String FINE_GRAINED_UPSTREAM_TYPE = "upstreamType"; + private static final String FINE_GRAINED_UPSTREAMS = "upstreams"; + private static final String FINE_GRAINED_DOWNSTREAM_TYPE = "downstreamType"; + private static final String FINE_GRAINED_DOWNSTREAMS = "downstreams"; + private static final String FINE_GRAINED_TRANSFORMATION_OPERATION = "transformOperation"; + private static final String FINE_GRAINED_CONFIDENCE_SCORE = "confidenceScore"; + private static final String FINE_GRAINED_QUERY_ID = "query"; + + // Template support + private static final String NONE_TRANSFORMATION_TYPE = "NONE"; + private static final Float DEFAULT_CONFIDENCE_SCORE = 1.0f; + private static 
final String DEFAULT_QUERY_ID = "NONE"; + + /** + * Combines fine grained lineage array into a map using upstream and downstream types as keys, + * defaulting when not present. Due to this construction, patches will look like: path: + * /fineGrainedLineages/TRANSFORMATION_OPERATION/DOWNSTREAM_FIELD_URN/QUERY_ID/UPSTREAM_FIELD_URN, + * op: ADD/REMOVE, value: float (confidenceScore) Due to the way FineGrainedLineage was designed + * it doesn't necessarily have a consistent key we can reference, so this specialized method + * mimics the arrayFieldToMap of the super class with the specialization that it does not put the + * full value of the aspect at the end of the key, just the particular array. This prevents + * unintended overwrites through improper MCP construction that is technically allowed by the + * schema when combining under fields that form the natural key. + * + * @param fineGrainedLineages the fine grained lineage array node + * @return the modified {@link JsonNode} with array fields transformed to maps + */ + public static JsonNode combineAndTransformFineGrainedLineages( + @Nullable JsonNode fineGrainedLineages) { + ObjectNode mapNode = instance.objectNode(); + if (!(fineGrainedLineages instanceof ArrayNode) || fineGrainedLineages.isEmpty()) { + return mapNode; + } + JsonNode lineageCopy = fineGrainedLineages.deepCopy(); + + lineageCopy + .elements() + .forEachRemaining( + node -> { + JsonNode nodeClone = node.deepCopy(); + String transformationOperation = + nodeClone.has(FINE_GRAINED_TRANSFORMATION_OPERATION) + ? nodeClone.get(FINE_GRAINED_TRANSFORMATION_OPERATION).asText() + : NONE_TRANSFORMATION_TYPE; + + if (!mapNode.has(transformationOperation)) { + mapNode.set(transformationOperation, instance.objectNode()); + } + ObjectNode transformationOperationNode = + (ObjectNode) mapNode.get(transformationOperation); + + ArrayNode downstreams = + nodeClone.has(FINE_GRAINED_DOWNSTREAMS) + ? (ArrayNode) nodeClone.get(FINE_GRAINED_DOWNSTREAMS) + : null; + + if (downstreams == null || downstreams.size() != 1) { + throw new UnsupportedOperationException( + "Patching not supported on fine grained lineages with not" + + " exactly one downstream. Current fine grained lineage implementation is downstream derived and " + + "patches are keyed on the root of this derivation."); + } + + Float confidenceScore = + nodeClone.has(FINE_GRAINED_CONFIDENCE_SCORE) + ? nodeClone.get(FINE_GRAINED_CONFIDENCE_SCORE).floatValue() + : DEFAULT_CONFIDENCE_SCORE; + + String upstreamType = + nodeClone.has(FINE_GRAINED_UPSTREAM_TYPE) + ? nodeClone.get(FINE_GRAINED_UPSTREAM_TYPE).asText() + : null; + String downstreamType = + nodeClone.has(FINE_GRAINED_DOWNSTREAM_TYPE) + ? nodeClone.get(FINE_GRAINED_DOWNSTREAM_TYPE).asText() + : null; + ArrayNode upstreams = + nodeClone.has(FINE_GRAINED_UPSTREAMS) + ? (ArrayNode) nodeClone.get(FINE_GRAINED_UPSTREAMS) + : null; + + String queryId = + nodeClone.has(FINE_GRAINED_QUERY_ID) + ? nodeClone.get(FINE_GRAINED_QUERY_ID).asText() + : DEFAULT_QUERY_ID; + + if (upstreamType == null) { + // Determine default type + Urn upstreamUrn = + upstreams != null ? 
UrnUtils.getUrn(upstreams.get(0).asText()) : null; + if (upstreamUrn != null + && DATASET_ENTITY_NAME.equals(upstreamUrn.getEntityType())) { + upstreamType = FINE_GRAINED_LINEAGE_DATASET_TYPE; + } else { + upstreamType = FINE_GRAINED_LINEAGE_FIELD_SET_TYPE; + } + } + + if (downstreamType == null) { + // Always use FIELD type, only support patches for single field downstream + downstreamType = FINE_GRAINED_LINEAGE_FIELD_TYPE; + } + + String downstreamRoot = downstreams.get(0).asText(); + if (!transformationOperationNode.has(downstreamRoot)) { + transformationOperationNode.set(downstreamRoot, instance.objectNode()); + } + ObjectNode downstreamRootNode = + (ObjectNode) transformationOperationNode.get(downstreamRoot); + if (!downstreamRootNode.has(queryId)) { + downstreamRootNode.set(queryId, instance.objectNode()); + } + ObjectNode queryNode = (ObjectNode) downstreamRootNode.get(queryId); + if (upstreams != null) { + addUrnsToParent( + queryNode, upstreams, confidenceScore, upstreamType, downstreamType); + } + }); + return mapNode; + } + + private static void addUrnsToParent( + JsonNode parentNode, + ArrayNode urnsList, + Float confidenceScore, + String upstreamType, + String downstreamType) { + // Will overwrite repeat urns with different confidence scores with the most recently seen + ((ObjectNode) parentNode) + .setAll( + Streams.stream(urnsList.elements()) + .map(JsonNode::asText) + .distinct() + .collect( + Collectors.toMap( + urn -> urn, + urn -> + mapToLineageValueNode(confidenceScore, upstreamType, downstreamType)))); + } + + private static JsonNode mapToLineageValueNode( + Float confidenceScore, String upstreamType, String downstreamType) { + ObjectNode objectNode = instance.objectNode(); + objectNode.set(FINE_GRAINED_CONFIDENCE_SCORE, instance.numberNode(confidenceScore)); + objectNode.set(FINE_GRAINED_UPSTREAM_TYPE, instance.textNode(upstreamType)); + objectNode.set(FINE_GRAINED_DOWNSTREAM_TYPE, instance.textNode(downstreamType)); + return objectNode; + } + + /** + * Takes the transformed fine grained lineages map from pre-processing and reconstructs an array + * of FineGrainedLineages Avoids producing side effects by copying nodes, use resulting node and + * not the original + * + * @param transformedFineGrainedLineages the transformed fine grained lineage map + * @return the modified {@link JsonNode} formatted consistent with the original schema + */ + public static ArrayNode reconstructFineGrainedLineages(JsonNode transformedFineGrainedLineages) { + if (transformedFineGrainedLineages instanceof ArrayNode) { + // We already have an ArrayNode, no need to transform. 
This happens during `replace` + // operations + return (ArrayNode) transformedFineGrainedLineages; + } + ObjectNode mapNode = (ObjectNode) transformedFineGrainedLineages; + ArrayNode fineGrainedLineages = instance.arrayNode(); + + mapNode + .fieldNames() + .forEachRemaining( + transformationOperation -> { + final ObjectNode transformationOperationNode = + (ObjectNode) mapNode.get(transformationOperation); + transformationOperationNode + .fieldNames() + .forEachRemaining( + downstreamName -> { + final ObjectNode downstreamNode = + (ObjectNode) transformationOperationNode.get(downstreamName); + downstreamNode + .fieldNames() + .forEachRemaining( + queryId -> + buildFineGrainedLineage( + downstreamName, + downstreamNode, + queryId, + transformationOperation, + fineGrainedLineages)); + }); + }); + + return fineGrainedLineages; + } + + private static void buildFineGrainedLineage( + final String downstreamName, + final ObjectNode downstreamNode, + final String queryId, + final String transformationOperation, + final ArrayNode fineGrainedLineages) { + final ObjectNode fineGrainedLineage = instance.objectNode(); + final ObjectNode queryNode = (ObjectNode) downstreamNode.get(queryId); + if (queryNode.isEmpty()) { + // Short circuit if no upstreams left + return; + } + ArrayNode downstream = instance.arrayNode(); + downstream.add(instance.textNode(downstreamName)); + // Set defaults, if found in sub nodes override, for confidenceScore take lowest + AtomicReference minimumConfidenceScore = new AtomicReference<>(DEFAULT_CONFIDENCE_SCORE); + AtomicReference upstreamType = + new AtomicReference<>(FINE_GRAINED_LINEAGE_FIELD_SET_TYPE); + AtomicReference downstreamType = new AtomicReference<>(FINE_GRAINED_LINEAGE_FIELD_TYPE); + ArrayNode upstreams = instance.arrayNode(); + queryNode + .fieldNames() + .forEachRemaining( + upstream -> + processUpstream( + queryNode, + upstream, + minimumConfidenceScore, + upstreamType, + downstreamType, + upstreams)); + fineGrainedLineage.set(FINE_GRAINED_DOWNSTREAMS, downstream); + fineGrainedLineage.set(FINE_GRAINED_UPSTREAMS, upstreams); + if (StringUtils.isNotBlank(queryId) && !DEFAULT_QUERY_ID.equals(queryId)) { + fineGrainedLineage.set(FINE_GRAINED_QUERY_ID, instance.textNode(queryId)); + } + fineGrainedLineage.set(FINE_GRAINED_UPSTREAM_TYPE, instance.textNode(upstreamType.get())); + fineGrainedLineage.set(FINE_GRAINED_DOWNSTREAM_TYPE, instance.textNode(downstreamType.get())); + fineGrainedLineage.set( + FINE_GRAINED_CONFIDENCE_SCORE, instance.numberNode(minimumConfidenceScore.get())); + fineGrainedLineage.set( + FINE_GRAINED_TRANSFORMATION_OPERATION, instance.textNode(transformationOperation)); + fineGrainedLineages.add(fineGrainedLineage); + } + + private static void processUpstream( + final ObjectNode queryNode, + final String upstream, + final AtomicReference minimumConfidenceScore, + final AtomicReference upstreamType, + final AtomicReference downstreamType, + final ArrayNode upstreams) { + final ObjectNode upstreamNode = (ObjectNode) queryNode.get(upstream); + if (upstreamNode.has(FINE_GRAINED_CONFIDENCE_SCORE)) { + Float scoreValue = upstreamNode.get(FINE_GRAINED_CONFIDENCE_SCORE).floatValue(); + if (scoreValue <= minimumConfidenceScore.get()) { + minimumConfidenceScore.set(scoreValue); + } + } + // Set types to last encountered, should never change, but this at least tries to support + // other types being specified. 
+ if (upstreamNode.has(FINE_GRAINED_UPSTREAM_TYPE)) { + upstreamType.set(upstreamNode.get(FINE_GRAINED_UPSTREAM_TYPE).asText()); + } + if (upstreamNode.has(FINE_GRAINED_DOWNSTREAM_TYPE)) { + downstreamType.set(upstreamNode.get(FINE_GRAINED_DOWNSTREAM_TYPE).asText()); + } + upstreams.add(instance.textNode(upstream)); + } +} diff --git a/entity-registry/src/main/java/com/linkedin/metadata/aspect/patch/template/TemplateUtil.java b/entity-registry/src/main/java/com/linkedin/metadata/aspect/patch/template/TemplateUtil.java index 2423e37e6d5419..23879ad1c2e353 100644 --- a/entity-registry/src/main/java/com/linkedin/metadata/aspect/patch/template/TemplateUtil.java +++ b/entity-registry/src/main/java/com/linkedin/metadata/aspect/patch/template/TemplateUtil.java @@ -84,7 +84,7 @@ public static JsonNode populateTopLevelKeys(JsonNode transformedNode, JsonPatch // Skip first as it will always be blank due to path starting with / for (int i = 1; i < endIdx; i++) { String decodedKey = decodeValue(keys[i]); - if (parent.get(keys[i]) == null) { + if (parent.get(decodedKey) == null) { ((ObjectNode) parent).set(decodedKey, instance.objectNode()); } parent = parent.get(decodedKey); diff --git a/entity-registry/src/main/java/com/linkedin/metadata/aspect/patch/template/datajob/DataJobInputOutputTemplate.java b/entity-registry/src/main/java/com/linkedin/metadata/aspect/patch/template/datajob/DataJobInputOutputTemplate.java index 3d398d97b50c38..ef26eed2f814f8 100644 --- a/entity-registry/src/main/java/com/linkedin/metadata/aspect/patch/template/datajob/DataJobInputOutputTemplate.java +++ b/entity-registry/src/main/java/com/linkedin/metadata/aspect/patch/template/datajob/DataJobInputOutputTemplate.java @@ -1,6 +1,10 @@ package com.linkedin.metadata.aspect.patch.template.datajob; +import static com.fasterxml.jackson.databind.node.JsonNodeFactory.*; +import static com.linkedin.metadata.Constants.*; + import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.node.ObjectNode; import com.linkedin.common.DataJobUrnArray; import com.linkedin.common.DatasetUrnArray; import com.linkedin.common.EdgeArray; @@ -9,6 +13,7 @@ import com.linkedin.datajob.DataJobInputOutput; import com.linkedin.dataset.FineGrainedLineageArray; import com.linkedin.metadata.aspect.patch.template.ArrayMergingTemplate; +import com.linkedin.metadata.aspect.patch.template.FineGrainedLineageTemplateHelper; import java.util.Collections; import javax.annotation.Nonnull; @@ -23,6 +28,8 @@ public class DataJobInputOutputTemplate implements ArrayMergingTemplate { @@ -27,18 +19,6 @@ public class UpstreamLineageTemplate extends CompoundKeyTemplate { - JsonNode nodeClone = node.deepCopy(); - String transformationOperation = - nodeClone.has(FINE_GRAINED_TRANSFORMATION_OPERATION) - ? nodeClone.get(FINE_GRAINED_TRANSFORMATION_OPERATION).asText() - : NONE_TRANSFORMATION_TYPE; - - if (!mapNode.has(transformationOperation)) { - mapNode.set(transformationOperation, instance.objectNode()); - } - ObjectNode transformationOperationNode = - (ObjectNode) mapNode.get(transformationOperation); - - ArrayNode downstreams = - nodeClone.has(FINE_GRAINED_DOWNSTREAMS) - ? (ArrayNode) nodeClone.get(FINE_GRAINED_DOWNSTREAMS) - : null; - - if (downstreams == null || downstreams.size() != 1) { - throw new UnsupportedOperationException( - "Patching not supported on fine grained lineages with not" - + " exactly one downstream. 
Current fine grained lineage implementation is downstream derived and " - + "patches are keyed on the root of this derivation."); - } - - Float confidenceScore = - nodeClone.has(FINE_GRAINED_CONFIDENCE_SCORE) - ? nodeClone.get(FINE_GRAINED_CONFIDENCE_SCORE).floatValue() - : DEFAULT_CONFIDENCE_SCORE; - - String upstreamType = - nodeClone.has(FINE_GRAINED_UPSTREAM_TYPE) - ? nodeClone.get(FINE_GRAINED_UPSTREAM_TYPE).asText() - : null; - String downstreamType = - nodeClone.has(FINE_GRAINED_DOWNSTREAM_TYPE) - ? nodeClone.get(FINE_GRAINED_DOWNSTREAM_TYPE).asText() - : null; - ArrayNode upstreams = - nodeClone.has(FINE_GRAINED_UPSTREAMS) - ? (ArrayNode) nodeClone.get(FINE_GRAINED_UPSTREAMS) - : null; - - String queryId = - nodeClone.has(FINE_GRAINED_QUERY_ID) - ? nodeClone.get(FINE_GRAINED_QUERY_ID).asText() - : DEFAULT_QUERY_ID; - - if (upstreamType == null) { - // Determine default type - Urn upstreamUrn = - upstreams != null ? UrnUtils.getUrn(upstreams.get(0).asText()) : null; - if (upstreamUrn != null - && DATASET_ENTITY_NAME.equals(upstreamUrn.getEntityType())) { - upstreamType = FINE_GRAINED_LINEAGE_DATASET_TYPE; - } else { - upstreamType = FINE_GRAINED_LINEAGE_FIELD_SET_TYPE; - } - } - - if (downstreamType == null) { - // Always use FIELD type, only support patches for single field downstream - downstreamType = FINE_GRAINED_LINEAGE_FIELD_TYPE; - } - - String downstreamRoot = downstreams.get(0).asText(); - if (!transformationOperationNode.has(downstreamRoot)) { - transformationOperationNode.set(downstreamRoot, instance.objectNode()); - } - ObjectNode downstreamRootNode = - (ObjectNode) transformationOperationNode.get(downstreamRoot); - if (!downstreamRootNode.has(queryId)) { - downstreamRootNode.set(queryId, instance.objectNode()); - } - ObjectNode queryNode = (ObjectNode) downstreamRootNode.get(queryId); - if (upstreams != null) { - addUrnsToParent( - queryNode, upstreams, confidenceScore, upstreamType, downstreamType); - } - }); - return mapNode; - } - - private void addUrnsToParent( - JsonNode parentNode, - ArrayNode urnsList, - Float confidenceScore, - String upstreamType, - String downstreamType) { - // Will overwrite repeat urns with different confidence scores with the most recently seen - ((ObjectNode) parentNode) - .setAll( - Streams.stream(urnsList.elements()) - .map(JsonNode::asText) - .distinct() - .collect( - Collectors.toMap( - urn -> urn, - urn -> - mapToLineageValueNode(confidenceScore, upstreamType, downstreamType)))); - } - - private JsonNode mapToLineageValueNode( - Float confidenceScore, String upstreamType, String downstreamType) { - ObjectNode objectNode = instance.objectNode(); - objectNode.set(FINE_GRAINED_CONFIDENCE_SCORE, instance.numberNode(confidenceScore)); - objectNode.set(FINE_GRAINED_UPSTREAM_TYPE, instance.textNode(upstreamType)); - objectNode.set(FINE_GRAINED_DOWNSTREAM_TYPE, instance.textNode(downstreamType)); - return objectNode; - } - - /** - * Takes the transformed fine grained lineages map from pre-processing and reconstructs an array - * of FineGrainedLineages Avoids producing side effects by copying nodes, use resulting node and - * not the original - * - * @param transformedFineGrainedLineages the transformed fine grained lineage map - * @return the modified {@link JsonNode} formatted consistent with the original schema - */ - private ArrayNode reconstructFineGrainedLineages(JsonNode transformedFineGrainedLineages) { - if (transformedFineGrainedLineages instanceof ArrayNode) { - // We already have an ArrayNode, no need to transform. 
This happens during `replace` - // operations - return (ArrayNode) transformedFineGrainedLineages; - } - ObjectNode mapNode = (ObjectNode) transformedFineGrainedLineages; - ArrayNode fineGrainedLineages = instance.arrayNode(); - - mapNode - .fieldNames() - .forEachRemaining( - transformationOperation -> { - final ObjectNode transformationOperationNode = - (ObjectNode) mapNode.get(transformationOperation); - transformationOperationNode - .fieldNames() - .forEachRemaining( - downstreamName -> { - final ObjectNode downstreamNode = - (ObjectNode) transformationOperationNode.get(downstreamName); - downstreamNode - .fieldNames() - .forEachRemaining( - queryId -> - buildFineGrainedLineage( - downstreamName, - downstreamNode, - queryId, - transformationOperation, - fineGrainedLineages)); - }); - }); - - return fineGrainedLineages; - } - - private void buildFineGrainedLineage( - final String downstreamName, - final ObjectNode downstreamNode, - final String queryId, - final String transformationOperation, - final ArrayNode fineGrainedLineages) { - final ObjectNode fineGrainedLineage = instance.objectNode(); - final ObjectNode queryNode = (ObjectNode) downstreamNode.get(queryId); - if (queryNode.isEmpty()) { - // Short circuit if no upstreams left - return; - } - ArrayNode downstream = instance.arrayNode(); - downstream.add(instance.textNode(downstreamName)); - // Set defaults, if found in sub nodes override, for confidenceScore take lowest - AtomicReference minimumConfidenceScore = new AtomicReference<>(DEFAULT_CONFIDENCE_SCORE); - AtomicReference upstreamType = - new AtomicReference<>(FINE_GRAINED_LINEAGE_FIELD_SET_TYPE); - AtomicReference downstreamType = new AtomicReference<>(FINE_GRAINED_LINEAGE_FIELD_TYPE); - ArrayNode upstreams = instance.arrayNode(); - queryNode - .fieldNames() - .forEachRemaining( - upstream -> - processUpstream( - queryNode, - upstream, - minimumConfidenceScore, - upstreamType, - downstreamType, - upstreams)); - fineGrainedLineage.set(FINE_GRAINED_DOWNSTREAMS, downstream); - fineGrainedLineage.set(FINE_GRAINED_UPSTREAMS, upstreams); - if (StringUtils.isNotBlank(queryId) && !DEFAULT_QUERY_ID.equals(queryId)) { - fineGrainedLineage.set(FINE_GRAINED_QUERY_ID, instance.textNode(queryId)); - } - fineGrainedLineage.set(FINE_GRAINED_UPSTREAM_TYPE, instance.textNode(upstreamType.get())); - fineGrainedLineage.set(FINE_GRAINED_DOWNSTREAM_TYPE, instance.textNode(downstreamType.get())); - fineGrainedLineage.set( - FINE_GRAINED_CONFIDENCE_SCORE, instance.numberNode(minimumConfidenceScore.get())); - fineGrainedLineage.set( - FINE_GRAINED_TRANSFORMATION_OPERATION, instance.textNode(transformationOperation)); - fineGrainedLineages.add(fineGrainedLineage); - } - - private void processUpstream( - final ObjectNode queryNode, - final String upstream, - final AtomicReference minimumConfidenceScore, - final AtomicReference upstreamType, - final AtomicReference downstreamType, - final ArrayNode upstreams) { - final ObjectNode upstreamNode = (ObjectNode) queryNode.get(upstream); - if (upstreamNode.has(FINE_GRAINED_CONFIDENCE_SCORE)) { - Float scoreValue = upstreamNode.get(FINE_GRAINED_CONFIDENCE_SCORE).floatValue(); - if (scoreValue <= minimumConfidenceScore.get()) { - minimumConfidenceScore.set(scoreValue); - } - } - // Set types to last encountered, should never change, but this at least tries to support - // other types being specified. 
- if (upstreamNode.has(FINE_GRAINED_UPSTREAM_TYPE)) { - upstreamType.set(upstreamNode.get(FINE_GRAINED_UPSTREAM_TYPE).asText()); - } - if (upstreamNode.has(FINE_GRAINED_DOWNSTREAM_TYPE)) { - downstreamType.set(upstreamNode.get(FINE_GRAINED_DOWNSTREAM_TYPE).asText()); - } - upstreams.add(instance.textNode(upstream)); - } } diff --git a/entity-registry/src/main/java/com/linkedin/metadata/entity/SearchRetriever.java b/entity-registry/src/main/java/com/linkedin/metadata/entity/SearchRetriever.java index eaa106b8d1f638..d4894c97015f8f 100644 --- a/entity-registry/src/main/java/com/linkedin/metadata/entity/SearchRetriever.java +++ b/entity-registry/src/main/java/com/linkedin/metadata/entity/SearchRetriever.java @@ -2,6 +2,7 @@ import com.linkedin.metadata.query.filter.Filter; import com.linkedin.metadata.search.ScrollResult; +import com.linkedin.metadata.search.SearchEntityArray; import java.util.List; import javax.annotation.Nonnull; import javax.annotation.Nullable; @@ -21,4 +22,22 @@ ScrollResult scroll( @Nullable Filter filters, @Nullable String scrollId, int count); + + SearchRetriever EMPTY = new EmptySearchRetriever(); + + class EmptySearchRetriever implements SearchRetriever { + + @Override + public ScrollResult scroll( + @Nonnull List entities, + @Nullable Filter filters, + @Nullable String scrollId, + int count) { + ScrollResult empty = new ScrollResult(); + empty.setEntities(new SearchEntityArray()); + empty.setNumEntities(0); + empty.setPageSize(0); + return empty; + } + } } diff --git a/entity-registry/src/test/java/com/linkedin/metadata/aspect/patch/template/DataJobInputOutputTemplateTest.java b/entity-registry/src/test/java/com/linkedin/metadata/aspect/patch/template/DataJobInputOutputTemplateTest.java new file mode 100644 index 00000000000000..d2a26221a3bb9f --- /dev/null +++ b/entity-registry/src/test/java/com/linkedin/metadata/aspect/patch/template/DataJobInputOutputTemplateTest.java @@ -0,0 +1,255 @@ +package com.linkedin.metadata.aspect.patch.template; + +import static com.linkedin.metadata.utils.GenericRecordUtils.*; +import static org.testng.Assert.*; + +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.linkedin.common.UrnArray; +import com.linkedin.common.urn.Urn; +import com.linkedin.common.urn.UrnUtils; +import com.linkedin.data.DataMap; +import com.linkedin.datajob.DataJobInputOutput; +import com.linkedin.dataset.FineGrainedLineage; +import com.linkedin.dataset.FineGrainedLineageDownstreamType; +import com.linkedin.dataset.FineGrainedLineageUpstreamType; +import com.linkedin.metadata.aspect.patch.template.datajob.DataJobInputOutputTemplate; +import jakarta.json.Json; +import jakarta.json.JsonObjectBuilder; +import jakarta.json.JsonPatch; +import jakarta.json.JsonPatchBuilder; +import jakarta.json.JsonValue; +import org.testng.annotations.Test; + +public class DataJobInputOutputTemplateTest { + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + @Test + public void testPatchUpstream() throws Exception { + DataJobInputOutputTemplate dataJobInputOutputTemplate = new DataJobInputOutputTemplate(); + DataJobInputOutput dataJobInputOutput = dataJobInputOutputTemplate.getDefault(); + JsonPatchBuilder jsonPatchBuilder = Json.createPatchBuilder(); + + JsonObjectBuilder fineGrainedLineageNode = Json.createObjectBuilder(); + JsonValue upstreamConfidenceScore = Json.createValue(1.0f); + fineGrainedLineageNode.add("confidenceScore", upstreamConfidenceScore); + jsonPatchBuilder.add( + 
"/fineGrainedLineages/CREATE/urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_1,PROD),c1)//urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_2,PROD),c1)", + fineGrainedLineageNode.build()); + + // Initial population test + DataJobInputOutput result = + dataJobInputOutputTemplate.applyPatch(dataJobInputOutput, jsonPatchBuilder.build()); + // Hack because Jackson parses values to doubles instead of floats + DataMap dataMap = new DataMap(); + dataMap.put("confidenceScore", 1.0); + FineGrainedLineage fineGrainedLineage = new FineGrainedLineage(dataMap); + UrnArray urns = new UrnArray(); + Urn urn1 = + UrnUtils.getUrn( + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_1,PROD),c1)"); + urns.add(urn1); + UrnArray upstreams = new UrnArray(); + Urn upstreamUrn = + UrnUtils.getUrn( + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_2,PROD),c1)"); + upstreams.add(upstreamUrn); + fineGrainedLineage.setDownstreams(urns); + fineGrainedLineage.setUpstreams(upstreams); + fineGrainedLineage.setTransformOperation("CREATE"); + fineGrainedLineage.setUpstreamType(FineGrainedLineageUpstreamType.FIELD_SET); + fineGrainedLineage.setDownstreamType(FineGrainedLineageDownstreamType.FIELD); + assertEquals(result.getFineGrainedLineages().get(0), fineGrainedLineage); + + // Test non-overwrite upstreams and correct confidence score and types w/ overwrite + JsonObjectBuilder finegrainedLineageNode2 = Json.createObjectBuilder(); + finegrainedLineageNode2.add( + "upstreamType", Json.createValue(FineGrainedLineageUpstreamType.FIELD_SET.name())); + finegrainedLineageNode2.add("confidenceScore", upstreamConfidenceScore); + finegrainedLineageNode2.add( + "downstreamType", Json.createValue(FineGrainedLineageDownstreamType.FIELD.name())); + + JsonPatchBuilder patchOperations2 = Json.createPatchBuilder(); + patchOperations2.add( + "/fineGrainedLineages/CREATE/urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_2,PROD),c2)/urn:li:query:someQuery/urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_1,PROD),c2)", + finegrainedLineageNode2.build()); + + JsonValue upstreamConfidenceScore2 = Json.createValue(0.1f); + JsonObjectBuilder finegrainedLineageNode3 = Json.createObjectBuilder(); + finegrainedLineageNode3.add( + "upstreamType", Json.createValue(FineGrainedLineageUpstreamType.DATASET.name())); + finegrainedLineageNode3.add("confidenceScore", upstreamConfidenceScore2); + finegrainedLineageNode3.add( + "downstreamType", Json.createValue(FineGrainedLineageDownstreamType.FIELD_SET.name())); + + patchOperations2.add( + "/fineGrainedLineages/CREATE/urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_2,PROD),c2)/urn:li:query:someQuery/urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_1,PROD),c2)", + finegrainedLineageNode3.build()); + + JsonPatch jsonPatch2 = patchOperations2.build(); + + DataJobInputOutput result2 = dataJobInputOutputTemplate.applyPatch(result, jsonPatch2); + // Hack because Jackson parses values to doubles instead of floats + DataMap dataMap2 = new DataMap(); + dataMap2.put("confidenceScore", 0.1); + FineGrainedLineage fineGrainedLineage2 = new FineGrainedLineage(dataMap2); + UrnArray urns2 = new UrnArray(); + Urn urn2 = + UrnUtils.getUrn( + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_1,PROD),c2)"); + urns2.add(urn2); + Urn downstreamUrn2 = + 
UrnUtils.getUrn( + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_2,PROD),c2)"); + UrnArray downstreams2 = new UrnArray(); + downstreams2.add(downstreamUrn2); + fineGrainedLineage2.setUpstreams(urns2); + fineGrainedLineage2.setDownstreams(downstreams2); + fineGrainedLineage2.setTransformOperation("CREATE"); + fineGrainedLineage2.setUpstreamType(FineGrainedLineageUpstreamType.DATASET); + fineGrainedLineage2.setDownstreamType(FineGrainedLineageDownstreamType.FIELD_SET); + fineGrainedLineage2.setQuery(UrnUtils.getUrn("urn:li:query:someQuery")); + assertEquals(result2.getFineGrainedLineages().get(1), fineGrainedLineage2); + + // Check different queries + JsonObjectBuilder finegrainedLineageNode4 = Json.createObjectBuilder(); + finegrainedLineageNode4.add( + "upstreamType", Json.createValue(FineGrainedLineageUpstreamType.FIELD_SET.name())); + finegrainedLineageNode4.add("confidenceScore", upstreamConfidenceScore); + finegrainedLineageNode4.add( + "downstreamType", Json.createValue(FineGrainedLineageDownstreamType.FIELD.name())); + + JsonPatchBuilder patchOperations3 = Json.createPatchBuilder(); + patchOperations3.add( + "/fineGrainedLineages/CREATE/urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_2,PROD),c2)/urn:li:query:anotherQuery/urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_1,PROD),c2)", + finegrainedLineageNode4.build()); + + JsonPatch jsonPatch3 = patchOperations3.build(); + DataJobInputOutput result3 = dataJobInputOutputTemplate.applyPatch(result2, jsonPatch3); + // Hack because Jackson parses values to doubles instead of floats + DataMap dataMap3 = new DataMap(); + dataMap3.put("confidenceScore", 1.0); + FineGrainedLineage fineGrainedLineage3 = new FineGrainedLineage(dataMap3); + UrnArray urns3 = new UrnArray(); + Urn urn3 = + UrnUtils.getUrn( + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_2,PROD),c2)"); + urns3.add(urn3); + + Urn upstreamUrn3 = + UrnUtils.getUrn( + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_1,PROD),c2)"); + UrnArray upstreamUrns3 = new UrnArray(); + upstreamUrns3.add(upstreamUrn3); + fineGrainedLineage3.setDownstreams(urns3); + fineGrainedLineage3.setUpstreams(upstreamUrns3); + fineGrainedLineage3.setTransformOperation("CREATE"); + fineGrainedLineage3.setUpstreamType(FineGrainedLineageUpstreamType.FIELD_SET); + fineGrainedLineage3.setDownstreamType(FineGrainedLineageDownstreamType.FIELD); + fineGrainedLineage3.setQuery(UrnUtils.getUrn("urn:li:query:anotherQuery")); + // Splits into two for different types + assertEquals(result3.getFineGrainedLineages().get(2), fineGrainedLineage3); + + // Check different transform types + JsonObjectBuilder finegrainedLineageNode5 = Json.createObjectBuilder(); + finegrainedLineageNode5.add( + "upstreamType", Json.createValue(FineGrainedLineageUpstreamType.FIELD_SET.name())); + finegrainedLineageNode5.add("confidenceScore", upstreamConfidenceScore); + finegrainedLineageNode5.add( + "downstreamType", Json.createValue(FineGrainedLineageDownstreamType.FIELD.name())); + + JsonPatchBuilder patchOperations4 = Json.createPatchBuilder(); + patchOperations4.add( + "/fineGrainedLineages/TRANSFORM/urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_2,PROD),c2)/urn:li:query:anotherQuery/urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_1,PROD),c2)", + finegrainedLineageNode5.build()); + JsonPatch jsonPatch4 = 
patchOperations4.build(); + + DataJobInputOutput result4 = dataJobInputOutputTemplate.applyPatch(result3, jsonPatch4); + // Hack because Jackson parses values to doubles instead of floats + DataMap dataMap4 = new DataMap(); + dataMap4.put("confidenceScore", 1.0); + FineGrainedLineage fineGrainedLineage4 = new FineGrainedLineage(dataMap4); + fineGrainedLineage4.setUpstreams(upstreamUrns3); + fineGrainedLineage4.setDownstreams(urns3); + fineGrainedLineage4.setTransformOperation("TRANSFORM"); + fineGrainedLineage4.setUpstreamType(FineGrainedLineageUpstreamType.FIELD_SET); + fineGrainedLineage4.setDownstreamType(FineGrainedLineageDownstreamType.FIELD); + fineGrainedLineage4.setQuery(UrnUtils.getUrn("urn:li:query:anotherQuery")); + // New entry in array because of new transformation type + assertEquals(result4.getFineGrainedLineages().get(3), fineGrainedLineage4); + + // Remove + JsonPatchBuilder removeOperations = Json.createPatchBuilder(); + removeOperations.remove( + "/fineGrainedLineages/CREATE/urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_1,PROD),c1)/NONE/urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_2,PROD),c1)"); + removeOperations.remove( + "/fineGrainedLineages/CREATE/urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_2,PROD),c2)/urn:li:query:someQuery/urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_1,PROD),c2)"); + removeOperations.remove( + "/fineGrainedLineages/CREATE/urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_2,PROD),c2)/urn:li:query:anotherQuery/urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_1,PROD),c2)"); + removeOperations.remove( + "/fineGrainedLineages/TRANSFORM/urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_2,PROD),c2)/urn:li:query:anotherQuery/urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_1,PROD),c2)"); + + JsonPatch removePatch = removeOperations.build(); + DataJobInputOutput finalResult = dataJobInputOutputTemplate.applyPatch(result4, removePatch); + assertEquals(finalResult, dataJobInputOutputTemplate.getDefault()); + } + + @Test + public void testPatchWithFieldWithForwardSlash() throws JsonProcessingException { + + String downstreamUrn = + "/fineGrainedLineages/CREATE/urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_1,PROD),c1)"; + String unescapedUpstreamUrn = + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_2,PROD),slash/column)"; + String escapedUpstreamUrn = + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_2,PROD),slash~1column)"; + String lineagePath = downstreamUrn + "//" + escapedUpstreamUrn; + + DataJobInputOutputTemplate dataJobInputOutputTemplate = new DataJobInputOutputTemplate(); + DataJobInputOutput dataJobInputOutput = dataJobInputOutputTemplate.getDefault(); + JsonPatchBuilder jsonPatchBuilder = Json.createPatchBuilder(); + + JsonObjectBuilder fineGrainedLineageNode = Json.createObjectBuilder(); + JsonValue upstreamConfidenceScore = Json.createValue(1.0f); + fineGrainedLineageNode.add("confidenceScore", upstreamConfidenceScore); + + jsonPatchBuilder.add(lineagePath, fineGrainedLineageNode.build()); + + // Initial population test + DataJobInputOutput result = + dataJobInputOutputTemplate.applyPatch(dataJobInputOutput, jsonPatchBuilder.build()); + + assertEquals( + 
result.getFineGrainedLineages().get(0).getUpstreams().get(0).toString(), + unescapedUpstreamUrn); + } + + @Test + public void testPatchWithFieldWithTilde() throws JsonProcessingException { + + String downstreamUrn = + "/fineGrainedLineages/CREATE/urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_1,PROD),c1)"; + String unescapedUpstreamUrn = + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_2,PROD),tilde~column)"; + String escapedUpstreamUrn = + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_2,PROD),tilde~0column)"; + String lineagePath = downstreamUrn + "//" + escapedUpstreamUrn; + + DataJobInputOutputTemplate dataJobInputOutputTemplate = new DataJobInputOutputTemplate(); + DataJobInputOutput dataJobInputOutput = dataJobInputOutputTemplate.getDefault(); + JsonPatchBuilder jsonPatchBuilder = Json.createPatchBuilder(); + + JsonObjectBuilder fineGrainedLineageNode = Json.createObjectBuilder(); + JsonValue upstreamConfidenceScore = Json.createValue(1.0f); + fineGrainedLineageNode.add("confidenceScore", upstreamConfidenceScore); + + jsonPatchBuilder.add(lineagePath, fineGrainedLineageNode.build()); + + // Initial population test + DataJobInputOutput result = + dataJobInputOutputTemplate.applyPatch(dataJobInputOutput, jsonPatchBuilder.build()); + assertEquals( + result.getFineGrainedLineages().get(0).getUpstreams().get(0).toString(), + unescapedUpstreamUrn); + } +} diff --git a/entity-registry/src/test/java/com/linkedin/metadata/aspect/patch/template/UpstreamLineageTemplateTest.java b/entity-registry/src/test/java/com/linkedin/metadata/aspect/patch/template/UpstreamLineageTemplateTest.java index f934dd8961ca37..ab0e7f960251c4 100644 --- a/entity-registry/src/test/java/com/linkedin/metadata/aspect/patch/template/UpstreamLineageTemplateTest.java +++ b/entity-registry/src/test/java/com/linkedin/metadata/aspect/patch/template/UpstreamLineageTemplateTest.java @@ -221,6 +221,7 @@ public void testPatchUpstream() throws Exception { JsonPatch removePatch = removeOperations.build(); UpstreamLineage finalResult = upstreamLineageTemplate.applyPatch(result4, removePatch); + assertEquals(finalResult, upstreamLineageTemplate.getDefault()); } @@ -337,4 +338,39 @@ public void testPatchWithFieldWithTilde() throws JsonProcessingException { result.getFineGrainedLineages().get(0).getUpstreams().get(0).toString(), unescapedUpstreamUrn); } + + @Test + public void testPatchRemoveWithFields() throws JsonProcessingException { + + String downstreamUrn = + "/fineGrainedLineages/CREATE/urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:s3,~1tmp~1test.parquet,PROD),c1)"; + String upstreamUrn = + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_2,PROD),c1)"; + String upstreamUrn2 = + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_2,PROD),c2)"; + + String lineagePath1 = downstreamUrn + "/NONE/" + upstreamUrn; + String lineagePath2 = downstreamUrn + "/NONE/" + upstreamUrn2; + + UpstreamLineageTemplate upstreamLineageTemplate = new UpstreamLineageTemplate(); + UpstreamLineage upstreamLineage = upstreamLineageTemplate.getDefault(); + JsonPatchBuilder jsonPatchBuilder = Json.createPatchBuilder(); + + JsonObjectBuilder fineGrainedLineageNode = Json.createObjectBuilder(); + JsonValue upstreamConfidenceScore = Json.createValue(1.0f); + fineGrainedLineageNode.add("confidenceScore", upstreamConfidenceScore); + + jsonPatchBuilder.add(lineagePath1, 
fineGrainedLineageNode.build()); + jsonPatchBuilder.add(lineagePath2, fineGrainedLineageNode.build()); + + // Initial population test + UpstreamLineage result = + upstreamLineageTemplate.applyPatch(upstreamLineage, jsonPatchBuilder.build()); + assertEquals( + result.getFineGrainedLineages().get(0).getUpstreams().get(0).toString(), upstreamUrn); + assertEquals( + result.getFineGrainedLineages().get(0).getUpstreams().get(1).toString(), upstreamUrn2); + + assertEquals(result.getFineGrainedLineages().get(0).getUpstreams().size(), 2); + } } diff --git a/entity-registry/src/testFixtures/java/com/linkedin/test/metadata/aspect/MockAspectRetriever.java b/entity-registry/src/testFixtures/java/com/linkedin/test/metadata/aspect/MockAspectRetriever.java index 65705f15022b6b..98a6d59004a92a 100644 --- a/entity-registry/src/testFixtures/java/com/linkedin/test/metadata/aspect/MockAspectRetriever.java +++ b/entity-registry/src/testFixtures/java/com/linkedin/test/metadata/aspect/MockAspectRetriever.java @@ -5,7 +5,7 @@ import com.linkedin.data.DataMap; import com.linkedin.data.template.RecordTemplate; import com.linkedin.entity.Aspect; -import com.linkedin.metadata.aspect.AspectRetriever; +import com.linkedin.metadata.aspect.CachingAspectRetriever; import com.linkedin.metadata.aspect.SystemAspect; import com.linkedin.metadata.models.registry.EntityRegistry; import com.linkedin.mxe.SystemMetadata; @@ -22,7 +22,7 @@ import javax.annotation.Nonnull; import org.mockito.Mockito; -public class MockAspectRetriever implements AspectRetriever { +public class MockAspectRetriever implements CachingAspectRetriever { private final Map> data; private final Map> systemData = new HashMap<>(); diff --git a/li-utils/src/main/java/com/linkedin/metadata/Constants.java b/li-utils/src/main/java/com/linkedin/metadata/Constants.java index ff6a79108600a3..09f873ebf7bc96 100644 --- a/li-utils/src/main/java/com/linkedin/metadata/Constants.java +++ b/li-utils/src/main/java/com/linkedin/metadata/Constants.java @@ -409,6 +409,8 @@ public class Constants { /** User Status */ public static final String CORP_USER_STATUS_ACTIVE = "ACTIVE"; + public static final String CORP_USER_STATUS_SUSPENDED = "SUSPENDED"; + /** Task Runs */ public static final String DATA_PROCESS_INSTANCE_ENTITY_NAME = "dataProcessInstance"; diff --git a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/datahub_listener.py b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/datahub_listener.py index aa7b3108f64f1e..640991a90a1d28 100644 --- a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/datahub_listener.py +++ b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/datahub_listener.py @@ -9,6 +9,7 @@ import airflow import datahub.emitter.mce_builder as builder +from airflow.models import Variable from airflow.models.serialized_dag import SerializedDagModel from datahub.api.entities.datajob import DataJob from datahub.api.entities.dataprocess.dataprocess_instance import InstanceRunResult @@ -78,6 +79,8 @@ def hookimpl(f: _F) -> _F: # type: ignore[misc] # noqa: F811 ) _DATAHUB_CLEANUP_DAG = "Datahub_Cleanup" +KILL_SWITCH_VARIABLE_NAME = "datahub_airflow_plugin_disable_listener" + def get_airflow_plugin_listener() -> Optional["DataHubListener"]: # Using globals instead of functools.lru_cache to make testing easier. 
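Note on the kill-switch hunks that follow: the task-instance and DAG-run hooks in the Airflow listener now consult an Airflow Variable before doing any work, so operators can disable the plugin without redeploying it. A minimal sketch of the same check outside the plugin, assuming only the Variable name introduced by this patch (the helper name is illustrative, not part of the patch):

from airflow.models import Variable

KILL_SWITCH_VARIABLE_NAME = "datahub_airflow_plugin_disable_listener"


def listener_disabled() -> bool:
    # Variable.get returns the supplied default when the Variable is unset,
    # so the listener stays enabled unless the switch is explicitly "true".
    return Variable.get(KILL_SWITCH_VARIABLE_NAME, "false").lower() == "true"

Setting the Variable (for example with `airflow variables set datahub_airflow_plugin_disable_listener true`) makes each of the patched hooks below return early.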
@@ -364,6 +367,12 @@ def _extract_lineage( redact_with_exclusions(v) ) + def check_kill_switch(self): + if Variable.get(KILL_SWITCH_VARIABLE_NAME, "false").lower() == "true": + logger.debug("DataHub listener disabled by kill switch") + return True + return False + @hookimpl @run_in_thread def on_task_instance_running( @@ -372,6 +381,8 @@ def on_task_instance_running( task_instance: "TaskInstance", session: "Session", # This will always be QUEUED ) -> None: + if self.check_kill_switch(): + return self._set_log_level() # This if statement mirrors the logic in https://github.com/OpenLineage/OpenLineage/pull/508. @@ -454,6 +465,9 @@ def on_task_instance_running( f"DataHub listener finished processing notification about task instance start for {task_instance.task_id}" ) + self.materialize_iolets(datajob) + + def materialize_iolets(self, datajob: DataJob) -> None: if self.config.materialize_iolets: for outlet in datajob.outlets: reported_time: int = int(time.time() * 1000) @@ -541,6 +555,9 @@ def on_task_instance_finish( def on_task_instance_success( self, previous_state: None, task_instance: "TaskInstance", session: "Session" ) -> None: + if self.check_kill_switch(): + return + self._set_log_level() logger.debug( @@ -556,6 +573,9 @@ def on_task_instance_success( def on_task_instance_failed( self, previous_state: None, task_instance: "TaskInstance", session: "Session" ) -> None: + if self.check_kill_switch(): + return + self._set_log_level() logger.debug( @@ -696,6 +716,9 @@ def on_dag_start(self, dag_run: "DagRun") -> None: @hookimpl @run_in_thread def on_dag_run_running(self, dag_run: "DagRun", msg: str) -> None: + if self.check_kill_switch(): + return + self._set_log_level() logger.debug( diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index 415871d30175f8..c6994dd6d5aa65 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -76,7 +76,7 @@ # now provide prebuilt wheels for most platforms, including M1 Macs and # Linux aarch64 (e.g. Docker's linux/arm64). Installing confluent_kafka # from source remains a pain. - "confluent_kafka>=1.9.0", + "confluent_kafka[schemaregistry]>=1.9.0", # We currently require both Avro libraries. The codegen uses avro-python3 (above) # schema parsers at runtime for generating and reading JSON into Python objects. # At the same time, we use Kafka's AvroSerializer, which internally relies on @@ -101,7 +101,7 @@ # We heavily monkeypatch sqlglot. 
# Prior to the patching, we originally maintained an acryl-sqlglot fork: # https://github.com/tobymao/sqlglot/compare/main...hsheth2:sqlglot:main?expand=1 - "sqlglot[rs]==25.26.0", + "sqlglot[rs]==25.32.1", "patchy==2.8.0", } @@ -741,7 +741,7 @@ "hive-metastore = datahub.ingestion.source.sql.hive_metastore:HiveMetastoreSource", "json-schema = datahub.ingestion.source.schema.json_schema:JsonSchemaSource", "kafka = datahub.ingestion.source.kafka.kafka:KafkaSource", - "kafka-connect = datahub.ingestion.source.kafka.kafka_connect:KafkaConnectSource", + "kafka-connect = datahub.ingestion.source.kafka_connect.kafka_connect:KafkaConnectSource", "ldap = datahub.ingestion.source.ldap:LDAPSource", "looker = datahub.ingestion.source.looker.looker_source:LookerDashboardSource", "lookml = datahub.ingestion.source.looker.lookml_source:LookMLSource", diff --git a/metadata-ingestion/sink_docs/metadata-file.md b/metadata-ingestion/sink_docs/metadata-file.md index 49ca3c75397af4..36c868828070ed 100644 --- a/metadata-ingestion/sink_docs/metadata-file.md +++ b/metadata-ingestion/sink_docs/metadata-file.md @@ -25,7 +25,7 @@ source: sink: type: file config: - path: ./path/to/mce/file.json + filename: ./path/to/mce/file.json ``` ## Config details diff --git a/metadata-ingestion/src/datahub/api/entities/structuredproperties/structuredproperties.py b/metadata-ingestion/src/datahub/api/entities/structuredproperties/structuredproperties.py index fd3fe7ca098ecb..619f69b016262d 100644 --- a/metadata-ingestion/src/datahub/api/entities/structuredproperties/structuredproperties.py +++ b/metadata-ingestion/src/datahub/api/entities/structuredproperties/structuredproperties.py @@ -9,27 +9,18 @@ from datahub.configuration.common import ConfigModel from datahub.emitter.mcp import MetadataChangeProposalWrapper -from datahub.ingestion.api.global_context import get_graph_context, set_graph_context -from datahub.ingestion.graph.client import DataHubGraph, get_default_graph +from datahub.ingestion.graph.client import DataHubGraph from datahub.metadata.schema_classes import ( PropertyValueClass, StructuredPropertyDefinitionClass, ) -from datahub.utilities.urns.urn import Urn +from datahub.metadata.urns import DataTypeUrn, StructuredPropertyUrn, Urn +from datahub.utilities.urns._urn_base import URN_TYPES logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) -class StructuredPropertiesConfig: - """Configuration class to hold the graph client""" - - @classmethod - def get_graph_required(cls) -> DataHubGraph: - """Get the current graph, falling back to default if none set""" - return get_graph_context() or get_default_graph() - - class AllowedTypes(Enum): STRING = "string" RICH_TEXT = "rich_text" @@ -51,29 +42,28 @@ class AllowedValue(ConfigModel): description: Optional[str] = None -VALID_ENTITY_TYPES_PREFIX_STRING = ", ".join( - [ - f"urn:li:entityType:datahub.{x}" - for x in ["dataset", "dashboard", "dataFlow", "schemaField"] - ] -) -VALID_ENTITY_TYPES_STRING = f"Valid entity type urns are {VALID_ENTITY_TYPES_PREFIX_STRING}, etc... Ensure that the entity type is valid." +VALID_ENTITY_TYPE_URNS = [ + Urn.make_entity_type_urn(entity_type) for entity_type in URN_TYPES.keys() +] +_VALID_ENTITY_TYPES_STRING = f"Valid entity type urns are {', '.join(VALID_ENTITY_TYPE_URNS)}, etc... Ensure that the entity type is valid." + + +def _validate_entity_type_urn(v: str) -> str: + urn = Urn.make_entity_type_urn(v) + if urn not in VALID_ENTITY_TYPE_URNS: + raise ValueError( + f"Input {v} is not a valid entity type urn. 
{_VALID_ENTITY_TYPES_STRING}" + ) + v = str(urn) + return v class TypeQualifierAllowedTypes(ConfigModel): allowed_types: List[str] - @validator("allowed_types", each_item=True) - def validate_allowed_types(cls, v): - if v: - graph = StructuredPropertiesConfig.get_graph_required() - validated_urn = Urn.make_entity_type_urn(v) - if not graph.exists(validated_urn): - raise ValueError( - f"Input {v} is not a valid entity type urn. {VALID_ENTITY_TYPES_STRING}" - ) - v = str(validated_urn) - return v + _check_allowed_types = validator("allowed_types", each_item=True, allow_reuse=True)( + _validate_entity_type_urn + ) class StructuredProperties(ConfigModel): @@ -90,22 +80,42 @@ class StructuredProperties(ConfigModel): type_qualifier: Optional[TypeQualifierAllowedTypes] = None immutable: Optional[bool] = False - @validator("entity_types", each_item=True) - def validate_entity_types(cls, v): - if v: - graph = StructuredPropertiesConfig.get_graph_required() - validated_urn = Urn.make_entity_type_urn(v) - if not graph.exists(validated_urn): - raise ValueError( - f"Input {v} is not a valid entity type urn. {VALID_ENTITY_TYPES_STRING}" - ) - v = str(validated_urn) - return v + _check_entity_types = validator("entity_types", each_item=True, allow_reuse=True)( + _validate_entity_type_urn + ) + + @validator("type") + def validate_type(cls, v: str) -> str: + # This logic is somewhat hacky, since we need to deal with + # 1. fully qualified urns + # 2. raw data types, that need to get the datahub namespace prefix + # While keeping the user-facing interface and error messages clean. + + if not v.startswith("urn:li:") and not v.islower(): + # Convert to lowercase if needed + v = v.lower() + logger.warning( + f"Structured property type should be lowercase. Updated to {v}" + ) + + urn = Urn.make_data_type_urn(v) + + # Check if type is allowed + data_type_urn = DataTypeUrn.from_string(urn) + unqualified_data_type = data_type_urn.id + if unqualified_data_type.startswith("datahub."): + unqualified_data_type = unqualified_data_type[len("datahub.") :] + if not AllowedTypes.check_allowed_type(unqualified_data_type): + raise ValueError( + f"Type {unqualified_data_type} is not allowed. Allowed types are {AllowedTypes.values()}" + ) + + return urn @property def fqn(self) -> str: assert self.urn is not None - id = Urn.create_from_string(self.urn).get_entity_id()[0] + id = StructuredPropertyUrn.from_string(self.urn).id if self.qualified_name is not None: # ensure that qualified name and ID match assert ( @@ -122,101 +132,90 @@ def urn_must_be_present(cls, v, values): return v @staticmethod - def create(file: str, graph: Optional[DataHubGraph] = None) -> None: - with set_graph_context(graph): - graph = StructuredPropertiesConfig.get_graph_required() - - with open(file) as fp: - structuredproperties: List[dict] = yaml.safe_load(fp) - for structuredproperty_raw in structuredproperties: - structuredproperty = StructuredProperties.parse_obj( - structuredproperty_raw - ) - - if not structuredproperty.type.islower(): - structuredproperty.type = structuredproperty.type.lower() - logger.warning( - f"Structured property type should be lowercase. Updated to {structuredproperty.type}" - ) - if not AllowedTypes.check_allowed_type(structuredproperty.type): - raise ValueError( - f"Type {structuredproperty.type} is not allowed. 
Allowed types are {AllowedTypes.values()}" - ) - mcp = MetadataChangeProposalWrapper( - entityUrn=structuredproperty.urn, - aspect=StructuredPropertyDefinitionClass( - qualifiedName=structuredproperty.fqn, - valueType=Urn.make_data_type_urn(structuredproperty.type), - displayName=structuredproperty.display_name, - description=structuredproperty.description, - entityTypes=[ - Urn.make_entity_type_urn(entity_type) - for entity_type in structuredproperty.entity_types or [] - ], - cardinality=structuredproperty.cardinality, - immutable=structuredproperty.immutable, - allowedValues=( - [ - PropertyValueClass( - value=v.value, description=v.description - ) - for v in structuredproperty.allowed_values - ] - if structuredproperty.allowed_values - else None - ), - typeQualifier=( - { - "allowedTypes": structuredproperty.type_qualifier.allowed_types - } - if structuredproperty.type_qualifier - else None - ), - ), - ) - graph.emit_mcp(mcp) - - logger.info(f"Created structured property {structuredproperty.urn}") - - @classmethod - def from_datahub(cls, graph: DataHubGraph, urn: str) -> "StructuredProperties": - with set_graph_context(graph): - structured_property: Optional[ - StructuredPropertyDefinitionClass - ] = graph.get_aspect(urn, StructuredPropertyDefinitionClass) - if structured_property is None: - raise Exception( - "StructuredPropertyDefinition aspect is None. Unable to create structured property." - ) - return StructuredProperties( - urn=urn, - qualified_name=structured_property.qualifiedName, - display_name=structured_property.displayName, - type=structured_property.valueType, - description=structured_property.description, - entity_types=structured_property.entityTypes, - cardinality=structured_property.cardinality, - allowed_values=( + def from_yaml(file: str) -> List["StructuredProperties"]: + with open(file) as fp: + structuredproperties: List[dict] = yaml.safe_load(fp) + + result: List[StructuredProperties] = [] + for structuredproperty_raw in structuredproperties: + result.append(StructuredProperties.parse_obj(structuredproperty_raw)) + return result + + def generate_mcps(self) -> List[MetadataChangeProposalWrapper]: + mcp = MetadataChangeProposalWrapper( + entityUrn=self.urn, + aspect=StructuredPropertyDefinitionClass( + qualifiedName=self.fqn, + valueType=Urn.make_data_type_urn(self.type), + displayName=self.display_name, + description=self.description, + entityTypes=[ + Urn.make_entity_type_urn(entity_type) + for entity_type in self.entity_types or [] + ], + cardinality=self.cardinality, + immutable=self.immutable, + allowedValues=( [ - AllowedValue( - value=av.value, - description=av.description, - ) - for av in structured_property.allowedValues or [] + PropertyValueClass(value=v.value, description=v.description) + for v in self.allowed_values ] - if structured_property.allowedValues is not None + if self.allowed_values else None ), - type_qualifier=( - { - "allowed_types": structured_property.typeQualifier.get( - "allowedTypes" - ) - } - if structured_property.typeQualifier + typeQualifier=( + {"allowedTypes": self.type_qualifier.allowed_types} + if self.type_qualifier else None ), + ), + ) + return [mcp] + + @staticmethod + def create(file: str, graph: DataHubGraph) -> None: + # TODO: Deprecate this method. 
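# Editorial sketch (not part of this patch): with the refactor above, YAML loading and
# MCP generation are reusable on their own, and create() stays only as a thin wrapper.
# The file path below is a hypothetical example.
from datahub.api.entities.structuredproperties.structuredproperties import (
    StructuredProperties,
)
from datahub.ingestion.graph.client import get_default_graph

with get_default_graph() as graph:
    for prop in StructuredProperties.from_yaml("structured_properties.yaml"):
        for mcp in prop.generate_mcps():
            graph.emit_mcp(mcp)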
+ structuredproperties = StructuredProperties.from_yaml(file) + for structuredproperty in structuredproperties: + for mcp in structuredproperty.generate_mcps(): + graph.emit_mcp(mcp) + + logger.info(f"Created structured property {structuredproperty.urn}") + + @classmethod + def from_datahub(cls, graph: DataHubGraph, urn: str) -> "StructuredProperties": + structured_property: Optional[ + StructuredPropertyDefinitionClass + ] = graph.get_aspect(urn, StructuredPropertyDefinitionClass) + if structured_property is None: + raise Exception( + "StructuredPropertyDefinition aspect is None. Unable to create structured property." ) + return StructuredProperties( + urn=urn, + qualified_name=structured_property.qualifiedName, + display_name=structured_property.displayName, + type=structured_property.valueType, + description=structured_property.description, + entity_types=structured_property.entityTypes, + cardinality=structured_property.cardinality, + allowed_values=( + [ + AllowedValue( + value=av.value, + description=av.description, + ) + for av in structured_property.allowedValues or [] + ] + if structured_property.allowedValues is not None + else None + ), + type_qualifier=( + {"allowed_types": structured_property.typeQualifier.get("allowedTypes")} + if structured_property.typeQualifier + else None + ), + ) def to_yaml( self, diff --git a/metadata-ingestion/src/datahub/cli/ingest_cli.py b/metadata-ingestion/src/datahub/cli/ingest_cli.py index 51f095751f7dd9..fcab07a1c2aaf6 100644 --- a/metadata-ingestion/src/datahub/cli/ingest_cli.py +++ b/metadata-ingestion/src/datahub/cli/ingest_cli.py @@ -27,6 +27,7 @@ logger = logging.getLogger(__name__) +INGEST_SRC_TABLE_COLUMNS = ["runId", "source", "startTime", "status", "URN"] RUNS_TABLE_COLUMNS = ["runId", "rows", "created at"] RUN_TABLE_COLUMNS = ["urn", "aspect name", "created at"] @@ -437,6 +438,115 @@ def mcps(path: str) -> None: sys.exit(ret) +@ingest.command() +@click.argument("page_offset", type=int, default=0) +@click.argument("page_size", type=int, default=100) +@click.option("--urn", type=str, default=None, help="Filter by ingestion source URN.") +@click.option( + "--source", type=str, default=None, help="Filter by ingestion source name." +) +@upgrade.check_upgrade +@telemetry.with_telemetry() +def list_source_runs(page_offset: int, page_size: int, urn: str, source: str) -> None: + """List ingestion source runs with their details, optionally filtered by URN or source.""" + + query = """ + query listIngestionRuns($input: ListIngestionSourcesInput!) 
{ + listIngestionSources(input: $input) { + ingestionSources { + urn + name + executions { + executionRequests { + id + result { + startTimeMs + status + } + } + } + } + } + } + """ + + # filter by urn and/or source using CONTAINS + filters = [] + if urn: + filters.append({"field": "urn", "values": [urn], "condition": "CONTAIN"}) + if source: + filters.append({"field": "name", "values": [source], "condition": "CONTAIN"}) + + variables = { + "input": { + "start": page_offset, + "count": page_size, + "filters": filters, + } + } + + client = get_default_graph() + session = client._session + gms_host = client.config.server + + url = f"{gms_host}/api/graphql" + try: + response = session.post(url, json={"query": query, "variables": variables}) + response.raise_for_status() + except Exception as e: + click.echo(f"Error fetching data: {str(e)}") + return + + try: + data = response.json() + except ValueError: + click.echo("Failed to parse JSON response from server.") + return + + if not data: + click.echo("No response received from the server.") + return + + # when urn or source filter does not match, exit gracefully + if ( + not isinstance(data.get("data"), dict) + or "listIngestionSources" not in data["data"] + ): + click.echo("No matching ingestion sources found. Please check your filters.") + return + + ingestion_sources = data["data"]["listIngestionSources"]["ingestionSources"] + if not ingestion_sources: + click.echo("No ingestion sources or executions found.") + return + + rows = [] + for ingestion_source in ingestion_sources: + urn = ingestion_source.get("urn", "N/A") + name = ingestion_source.get("name", "N/A") + + executions = ingestion_source.get("executions", {}).get("executionRequests", []) + for execution in executions: + execution_id = execution.get("id", "N/A") + start_time = execution.get("result", {}).get("startTimeMs", "N/A") + start_time = ( + datetime.fromtimestamp(start_time / 1000).strftime("%Y-%m-%d %H:%M:%S") + if start_time != "N/A" + else "N/A" + ) + status = execution.get("result", {}).get("status", "N/A") + + rows.append([execution_id, name, start_time, status, urn]) + + click.echo( + tabulate( + rows, + headers=INGEST_SRC_TABLE_COLUMNS, + tablefmt="grid", + ) + ) + + @ingest.command() @click.argument("page_offset", type=int, default=0) @click.argument("page_size", type=int, default=100) diff --git a/metadata-ingestion/src/datahub/cli/specific/structuredproperties_cli.py b/metadata-ingestion/src/datahub/cli/specific/structuredproperties_cli.py index 4162d44b9b0ea8..42285cf13a5ddc 100644 --- a/metadata-ingestion/src/datahub/cli/specific/structuredproperties_cli.py +++ b/metadata-ingestion/src/datahub/cli/specific/structuredproperties_cli.py @@ -31,7 +31,8 @@ def properties() -> None: def upsert(file: Path) -> None: """Upsert structured properties in DataHub.""" - StructuredProperties.create(str(file)) + with get_default_graph() as graph: + StructuredProperties.create(str(file), graph) @properties.command( diff --git a/metadata-ingestion/src/datahub/configuration/git.py b/metadata-ingestion/src/datahub/configuration/git.py index d237cd9ddd306c..e7e9bfd43adca5 100644 --- a/metadata-ingestion/src/datahub/configuration/git.py +++ b/metadata-ingestion/src/datahub/configuration/git.py @@ -24,7 +24,11 @@ class GitReference(ConfigModel): "main", description="Branch on which your files live by default. Typically main or master. 
This can also be a commit hash.", ) - + url_subdir: Optional[str] = Field( + default=None, + description="Prefix to prepend when generating URLs for files - useful when files are in a subdirectory. " + "Only affects URL generation, not git operations.", + ) url_template: Optional[str] = Field( None, description=f"Template for generating a URL to a file in the repo e.g. '{_GITHUB_URL_TEMPLATE}'. We can infer this for GitHub and GitLab repos, and it is otherwise required." @@ -68,6 +72,8 @@ def infer_url_template(cls, url_template: Optional[str], values: dict) -> str: def get_url_for_file_path(self, file_path: str) -> str: assert self.url_template + if self.url_subdir: + file_path = f"{self.url_subdir}/{file_path}" return self.url_template.format( repo_url=self.repo, branch=self.branch, file_path=file_path ) diff --git a/metadata-ingestion/src/datahub/configuration/source_common.py b/metadata-ingestion/src/datahub/configuration/source_common.py index 44c737f1bd13d4..8e41e9fb917878 100644 --- a/metadata-ingestion/src/datahub/configuration/source_common.py +++ b/metadata-ingestion/src/datahub/configuration/source_common.py @@ -63,3 +63,16 @@ class DatasetLineageProviderConfigBase(EnvConfigMixin): default=None, description="A holder for platform -> platform_instance mappings to generate correct dataset urns", ) + + +class PlatformDetail(ConfigModel): + platform_instance: Optional[str] = Field( + default=None, + description="DataHub platform instance name. To generate correct urn for upstream dataset, this should match " + "with platform instance name used in ingestion " + "recipe of other datahub sources.", + ) + env: str = Field( + default=DEFAULT_ENV, + description="The environment that all assets produced by DataHub platform ingestion source belong to", + ) diff --git a/metadata-ingestion/src/datahub/ingestion/api/source.py b/metadata-ingestion/src/datahub/ingestion/api/source.py index c80da04e481a9f..c3638635b19aac 100644 --- a/metadata-ingestion/src/datahub/ingestion/api/source.py +++ b/metadata-ingestion/src/datahub/ingestion/api/source.py @@ -184,6 +184,7 @@ def infos(self) -> LossyList[StructuredLogEntry]: @dataclass class SourceReport(Report): + event_not_produced_warn: bool = True events_produced: int = 0 events_produced_per_sec: int = 0 diff --git a/metadata-ingestion/src/datahub/ingestion/api/source_helpers.py b/metadata-ingestion/src/datahub/ingestion/api/source_helpers.py index 0c86e1cf47203f..7791ea2797be34 100644 --- a/metadata-ingestion/src/datahub/ingestion/api/source_helpers.py +++ b/metadata-ingestion/src/datahub/ingestion/api/source_helpers.py @@ -150,7 +150,7 @@ def auto_workunit_reporter(report: "SourceReport", stream: Iterable[T]) -> Itera report.report_workunit(wu) yield wu - if report.events_produced == 0: + if report.event_not_produced_warn and report.events_produced == 0: report.warning( title="No metadata was produced by the source", message="Please check the source configuration, filters, and permissions.", diff --git a/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_database_reader.py b/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_database_reader.py index faa281097de4cd..80906ca63115f5 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_database_reader.py +++ b/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_database_reader.py @@ -147,6 +147,47 @@ def query(self) -> str: version """ + def execute_server_cursor( + self, query: str, params: Dict[str, Any] + ) -> Iterable[Dict[str, Any]]: + 
with self.engine.connect() as conn: + if self.engine.dialect.name == "postgresql": + with conn.begin(): # Transaction required for PostgreSQL server-side cursor + conn = conn.execution_options( + stream_results=True, + yield_per=self.config.database_query_batch_size, + ) + result = conn.execute(query, params) + for row in result: + yield dict(row) + elif self.engine.dialect.name == "mysql": # MySQL + import MySQLdb + + with contextlib.closing( + conn.connection.cursor(MySQLdb.cursors.SSCursor) + ) as cursor: + logger.debug(f"Using Cursor type: {cursor.__class__.__name__}") + cursor.execute(query, params) + + columns = [desc[0] for desc in cursor.description] + while True: + rows = cursor.fetchmany(self.config.database_query_batch_size) + if not rows: + break # Use break instead of return in generator + for row in rows: + yield dict(zip(columns, row)) + else: + raise ValueError(f"Unsupported dialect: {self.engine.dialect.name}") + + def _get_rows( + self, from_createdon: datetime, stop_time: datetime + ) -> Iterable[Dict[str, Any]]: + params = { + "exclude_aspects": list(self.config.exclude_aspects), + "since_createdon": from_createdon.strftime(DATETIME_FORMAT), + } + yield from self.execute_server_cursor(self.query, params) + def get_aspects( self, from_createdon: datetime, stop_time: datetime ) -> Iterable[Tuple[MetadataChangeProposalWrapper, datetime]]: @@ -159,27 +200,6 @@ def get_aspects( if mcp: yield mcp, row["createdon"] - def _get_rows( - self, from_createdon: datetime, stop_time: datetime - ) -> Iterable[Dict[str, Any]]: - with self.engine.connect() as conn: - with contextlib.closing(conn.connection.cursor()) as cursor: - cursor.execute( - self.query, - { - "exclude_aspects": list(self.config.exclude_aspects), - "since_createdon": from_createdon.strftime(DATETIME_FORMAT), - }, - ) - - columns = [desc[0] for desc in cursor.description] - while True: - rows = cursor.fetchmany(self.config.database_query_batch_size) - if not rows: - return - for row in rows: - yield dict(zip(columns, row)) - def get_soft_deleted_rows(self) -> Iterable[Dict[str, Any]]: """ Fetches all soft-deleted entities from the database. 
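The hunk above replaces the buffered client-side cursor with dialect-aware server-side streaming: PostgreSQL uses `stream_results`/`yield_per` inside an explicit transaction, while MySQL uses `MySQLdb.cursors.SSCursor` with `fetchmany`. A minimal sketch of the PostgreSQL branch against a standalone SQLAlchemy engine, where the connection string, query, and batch size are placeholders:

from sqlalchemy import create_engine, text

engine = create_engine("postgresql+psycopg2://user:pass@localhost:5432/datahub")

with engine.connect() as conn:
    # Server-side cursors in PostgreSQL require an open transaction.
    with conn.begin():
        streaming_conn = conn.execution_options(stream_results=True, yield_per=2000)
        result = streaming_conn.execute(
            text("SELECT urn, aspect, createdon FROM metadata_aspect_v2")
        )
        for row in result:
            # Rows arrive in batches of yield_per instead of all at once.
            print(row.urn)

The MySQL branch in the hunk achieves the same effect with an unbuffered `SSCursor` and repeated `fetchmany` calls.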
diff --git a/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_source.py b/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_source.py index 63cea45f75864b..cb72441344088c 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_source.py @@ -1,5 +1,5 @@ import logging -from datetime import datetime, timezone +from datetime import datetime, timedelta, timezone from functools import partial from typing import Dict, Iterable, List, Optional @@ -26,6 +26,7 @@ StatefulIngestionSourceBase, ) from datahub.metadata.schema_classes import ChangeTypeClass +from datahub.utilities.progress_timer import ProgressTimer logger = logging.getLogger(__name__) @@ -105,11 +106,17 @@ def _get_database_workunits( self, from_createdon: datetime, reader: DataHubDatabaseReader ) -> Iterable[MetadataWorkUnit]: logger.info(f"Fetching database aspects starting from {from_createdon}") + progress = ProgressTimer(report_every=timedelta(seconds=60)) mcps = reader.get_aspects(from_createdon, self.report.stop_time) for i, (mcp, createdon) in enumerate(mcps): if not self.urn_pattern.allowed(str(mcp.entityUrn)): continue + if progress.should_report(): + logger.info( + f"Ingested {i} database aspects so far, currently at {createdon}" + ) + yield mcp.as_workunit() self.report.num_database_aspects_ingested += 1 diff --git a/metadata-ingestion/src/datahub/ingestion/source/gc/datahub_gc.py b/metadata-ingestion/src/datahub/ingestion/source/gc/datahub_gc.py index 814f65ecb45cf0..4eecbb4d9d7177 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/gc/datahub_gc.py +++ b/metadata-ingestion/src/datahub/ingestion/source/gc/datahub_gc.py @@ -65,18 +65,18 @@ class DataHubGcSourceConfig(ConfigModel): description="Sleep between truncation monitoring.", ) - dataprocess_cleanup: Optional[DataProcessCleanupConfig] = Field( - default=None, + dataprocess_cleanup: DataProcessCleanupConfig = Field( + default_factory=DataProcessCleanupConfig, description="Configuration for data process cleanup", ) - soft_deleted_entities_cleanup: Optional[SoftDeletedEntitiesCleanupConfig] = Field( - default=None, + soft_deleted_entities_cleanup: SoftDeletedEntitiesCleanupConfig = Field( + default_factory=SoftDeletedEntitiesCleanupConfig, description="Configuration for soft deleted entities cleanup", ) - execution_request_cleanup: Optional[DatahubExecutionRequestCleanupConfig] = Field( - default=None, + execution_request_cleanup: DatahubExecutionRequestCleanupConfig = Field( + default_factory=DatahubExecutionRequestCleanupConfig, description="Configuration for execution request cleanup", ) @@ -108,28 +108,22 @@ def __init__(self, ctx: PipelineContext, config: DataHubGcSourceConfig): self.ctx = ctx self.config = config self.report = DataHubGcSourceReport() + self.report.event_not_produced_warn = False self.graph = ctx.require_graph("The DataHubGc source") - self.dataprocess_cleanup: Optional[DataProcessCleanup] = None - self.soft_deleted_entities_cleanup: Optional[SoftDeletedEntitiesCleanup] = None - self.execution_request_cleanup: Optional[DatahubExecutionRequestCleanup] = None - - if self.config.dataprocess_cleanup: - self.dataprocess_cleanup = DataProcessCleanup( - ctx, self.config.dataprocess_cleanup, self.report, self.config.dry_run - ) - if self.config.soft_deleted_entities_cleanup: - self.soft_deleted_entities_cleanup = SoftDeletedEntitiesCleanup( - ctx, - self.config.soft_deleted_entities_cleanup, - self.report, - 
self.config.dry_run, - ) - if self.config.execution_request_cleanup: - self.execution_request_cleanup = DatahubExecutionRequestCleanup( - config=self.config.execution_request_cleanup, - graph=self.graph, - report=self.report, - ) + self.dataprocess_cleanup = DataProcessCleanup( + ctx, self.config.dataprocess_cleanup, self.report, self.config.dry_run + ) + self.soft_deleted_entities_cleanup = SoftDeletedEntitiesCleanup( + ctx, + self.config.soft_deleted_entities_cleanup, + self.report, + self.config.dry_run, + ) + self.execution_request_cleanup = DatahubExecutionRequestCleanup( + config=self.config.execution_request_cleanup, + graph=self.graph, + report=self.report, + ) @classmethod def create(cls, config_dict, ctx): @@ -153,19 +147,19 @@ def get_workunits_internal( self.truncate_indices() except Exception as e: self.report.failure("While trying to truncate indices ", exc=e) - if self.soft_deleted_entities_cleanup: + if self.config.soft_deleted_entities_cleanup.enabled: try: self.soft_deleted_entities_cleanup.cleanup_soft_deleted_entities() except Exception as e: self.report.failure( "While trying to cleanup soft deleted entities ", exc=e ) - if self.execution_request_cleanup: + if self.config.execution_request_cleanup.enabled: try: self.execution_request_cleanup.run() except Exception as e: self.report.failure("While trying to cleanup execution request ", exc=e) - if self.dataprocess_cleanup: + if self.config.dataprocess_cleanup.enabled: try: yield from self.dataprocess_cleanup.get_workunits_internal() except Exception as e: diff --git a/metadata-ingestion/src/datahub/ingestion/source/gc/dataprocess_cleanup.py b/metadata-ingestion/src/datahub/ingestion/source/gc/dataprocess_cleanup.py index 8aacf13cdb00fb..6d16aaab2d7980 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/gc/dataprocess_cleanup.py +++ b/metadata-ingestion/src/datahub/ingestion/source/gc/dataprocess_cleanup.py @@ -98,6 +98,9 @@ class DataProcessCleanupConfig(ConfigModel): + enabled: bool = Field( + default=True, description="Whether to do data process cleanup." 
+ ) retention_days: Optional[int] = Field( 10, description="Number of days to retain metadata in DataHub", @@ -371,17 +374,26 @@ def get_data_flows(self) -> Iterable[DataFlowEntity]: previous_scroll_id: Optional[str] = None while True: - result = self.ctx.graph.execute_graphql( - DATAFLOW_QUERY, - { - "query": "*", - "scrollId": scroll_id if scroll_id else None, - "batchSize": self.config.batch_size, - }, - ) + result = None + try: + result = self.ctx.graph.execute_graphql( + DATAFLOW_QUERY, + { + "query": "*", + "scrollId": scroll_id if scroll_id else None, + "batchSize": self.config.batch_size, + }, + ) + except Exception as e: + self.report.failure( + f"While trying to get dataflows with {scroll_id}", exc=e + ) + break + scrollAcrossEntities = result.get("scrollAcrossEntities") if not scrollAcrossEntities: raise ValueError("Missing scrollAcrossEntities in response") + logger.info(f"Got {scrollAcrossEntities.get('count')} DataFlow entities") scroll_id = scrollAcrossEntities.get("nextScrollId") for flow in scrollAcrossEntities.get("searchResults"): @@ -398,6 +410,8 @@ def get_data_flows(self) -> Iterable[DataFlowEntity]: previous_scroll_id = scroll_id def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: + if not self.config.enabled: + return [] assert self.ctx.graph dataFlows: Dict[str, DataFlowEntity] = {} @@ -411,14 +425,20 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: deleted_jobs: int = 0 while True: - result = self.ctx.graph.execute_graphql( - DATAJOB_QUERY, - { - "query": "*", - "scrollId": scroll_id if scroll_id else None, - "batchSize": self.config.batch_size, - }, - ) + try: + result = self.ctx.graph.execute_graphql( + DATAJOB_QUERY, + { + "query": "*", + "scrollId": scroll_id if scroll_id else None, + "batchSize": self.config.batch_size, + }, + ) + except Exception as e: + self.report.failure( + f"While trying to get data jobs with {scroll_id}", exc=e + ) + break scrollAcrossEntities = result.get("scrollAcrossEntities") if not scrollAcrossEntities: raise ValueError("Missing scrollAcrossEntities in response") diff --git a/metadata-ingestion/src/datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py b/metadata-ingestion/src/datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py index bb4ab753543b7b..93f004ab675edc 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +++ b/metadata-ingestion/src/datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py @@ -20,6 +20,9 @@ class SoftDeletedEntitiesCleanupConfig(ConfigModel): + enabled: bool = Field( + default=True, description="Whether to do soft deletion cleanup." 
+ ) retention_days: Optional[int] = Field( 10, description="Number of days to retain metadata in DataHub", @@ -156,6 +159,8 @@ def delete_soft_deleted_entity(self, urn: str) -> None: self.delete_entity(urn) def cleanup_soft_deleted_entities(self) -> None: + if not self.config.enabled: + return assert self.ctx.graph start_time = time.time() diff --git a/metadata-ingestion/src/datahub/ingestion/source/kafka/kafka_connect.py b/metadata-ingestion/src/datahub/ingestion/source/kafka/kafka_connect.py deleted file mode 100644 index 23a99ccb310e13..00000000000000 --- a/metadata-ingestion/src/datahub/ingestion/source/kafka/kafka_connect.py +++ /dev/null @@ -1,1468 +0,0 @@ -import logging -import re -from dataclasses import dataclass, field -from typing import Dict, Iterable, List, Optional, Tuple - -import jpype -import jpype.imports -import requests -from pydantic.fields import Field -from sqlalchemy.engine.url import make_url - -import datahub.emitter.mce_builder as builder -import datahub.metadata.schema_classes as models -from datahub.configuration.common import AllowDenyPattern, ConfigModel -from datahub.configuration.source_common import ( - DatasetLineageProviderConfigBase, - PlatformInstanceConfigMixin, -) -from datahub.emitter.mcp import MetadataChangeProposalWrapper -from datahub.ingestion.api.common import PipelineContext -from datahub.ingestion.api.decorators import ( - SourceCapability, - SupportStatus, - capability, - config_class, - platform_name, - support_status, -) -from datahub.ingestion.api.source import MetadataWorkUnitProcessor, Source -from datahub.ingestion.api.workunit import MetadataWorkUnit -from datahub.ingestion.source.sql.sqlalchemy_uri_mapper import ( - get_platform_from_sqlalchemy_uri, -) -from datahub.ingestion.source.state.stale_entity_removal_handler import ( - StaleEntityRemovalHandler, - StaleEntityRemovalSourceReport, - StatefulStaleMetadataRemovalConfig, -) -from datahub.ingestion.source.state.stateful_ingestion_base import ( - StatefulIngestionConfigBase, - StatefulIngestionSourceBase, -) - -logger = logging.getLogger(__name__) - -KAFKA = "kafka" -SOURCE = "source" -SINK = "sink" -CONNECTOR_CLASS = "connector.class" - - -class ProvidedConfig(ConfigModel): - provider: str - path_key: str - value: str - - -class GenericConnectorConfig(ConfigModel): - connector_name: str - source_dataset: str - source_platform: str - - -class KafkaConnectSourceConfig( - PlatformInstanceConfigMixin, - DatasetLineageProviderConfigBase, - StatefulIngestionConfigBase, -): - # See the Connect REST Interface for details - # https://docs.confluent.io/platform/current/connect/references/restapi.html# - connect_uri: str = Field( - default="http://localhost:8083/", description="URI to connect to." - ) - username: Optional[str] = Field(default=None, description="Kafka Connect username.") - password: Optional[str] = Field(default=None, description="Kafka Connect password.") - cluster_name: Optional[str] = Field( - default="connect-cluster", description="Cluster to ingest from." 
- ) - # convert lineage dataset's urns to lowercase - convert_lineage_urns_to_lowercase: bool = Field( - default=False, - description="Whether to convert the urns of ingested lineage dataset to lowercase", - ) - connector_patterns: AllowDenyPattern = Field( - default=AllowDenyPattern.allow_all(), - description="regex patterns for connectors to filter for ingestion.", - ) - provided_configs: Optional[List[ProvidedConfig]] = Field( - default=None, description="Provided Configurations" - ) - connect_to_platform_map: Optional[Dict[str, Dict[str, str]]] = Field( - default=None, - description='Platform instance mapping when multiple instances for a platform is available. Entry for a platform should be in either `platform_instance_map` or `connect_to_platform_map`. e.g.`connect_to_platform_map: { "postgres-connector-finance-db": "postgres": "core_finance_instance" }`', - ) - platform_instance_map: Optional[Dict[str, str]] = Field( - default=None, - description='Platform instance mapping to use when constructing URNs. e.g.`platform_instance_map: { "hive": "warehouse" }`', - ) - generic_connectors: List[GenericConnectorConfig] = Field( - default=[], - description="Provide lineage graph for sources connectors other than Confluent JDBC Source Connector, Debezium Source Connector, and Mongo Source Connector", - ) - - stateful_ingestion: Optional[StatefulStaleMetadataRemovalConfig] = None - - -@dataclass -class KafkaConnectSourceReport(StaleEntityRemovalSourceReport): - connectors_scanned: int = 0 - filtered: List[str] = field(default_factory=list) - - def report_connector_scanned(self, connector: str) -> None: - self.connectors_scanned += 1 - - def report_dropped(self, connector: str) -> None: - self.filtered.append(connector) - - -@dataclass -class KafkaConnectLineage: - """Class to store Kafka Connect lineage mapping, Each instance is potential DataJob""" - - source_platform: str - target_dataset: str - target_platform: str - job_property_bag: Optional[Dict[str, str]] = None - source_dataset: Optional[str] = None - - -@dataclass -class ConnectorManifest: - """Each instance is potential DataFlow""" - - name: str - type: str - config: Dict - tasks: Dict - url: Optional[str] = None - flow_property_bag: Optional[Dict[str, str]] = None - lineages: List[KafkaConnectLineage] = field(default_factory=list) - topic_names: Iterable[str] = field(default_factory=list) - - -def remove_prefix(text: str, prefix: str) -> str: - if text.startswith(prefix): - index = len(prefix) - return text[index:] - return text - - -def unquote( - string: str, leading_quote: str = '"', trailing_quote: Optional[str] = None -) -> str: - """ - If string starts and ends with a quote, unquote it - """ - trailing_quote = trailing_quote if trailing_quote else leading_quote - if string.startswith(leading_quote) and string.endswith(trailing_quote): - string = string[1:-1] - return string - - -def get_dataset_name( - database_name: Optional[str], - source_table: str, -) -> str: - if database_name: - dataset_name = database_name + "." 
+ source_table - else: - dataset_name = source_table - - return dataset_name - - -def get_platform_instance( - config: KafkaConnectSourceConfig, connector_name: str, platform: str -) -> Optional[str]: - instance_name = None - if ( - config.connect_to_platform_map - and config.connect_to_platform_map.get(connector_name) - and config.connect_to_platform_map[connector_name].get(platform) - ): - instance_name = config.connect_to_platform_map[connector_name][platform] - if config.platform_instance_map and config.platform_instance_map.get(platform): - logger.warning( - f"Same source platform {platform} configured in both platform_instance_map and connect_to_platform_map." - "Will prefer connector specific platform instance from connect_to_platform_map." - ) - elif config.platform_instance_map and config.platform_instance_map.get(platform): - instance_name = config.platform_instance_map[platform] - logger.info( - f"Instance name assigned is: {instance_name} for Connector Name {connector_name} and platform {platform}" - ) - return instance_name - - -@dataclass -class ConfluentJDBCSourceConnector: - connector_manifest: ConnectorManifest - report: KafkaConnectSourceReport - - def __init__( - self, - connector_manifest: ConnectorManifest, - config: KafkaConnectSourceConfig, - report: KafkaConnectSourceReport, - ) -> None: - self.connector_manifest = connector_manifest - self.config = config - self.report = report - self._extract_lineages() - - REGEXROUTER = "org.apache.kafka.connect.transforms.RegexRouter" - KNOWN_TOPICROUTING_TRANSFORMS = [REGEXROUTER] - # https://kafka.apache.org/documentation/#connect_included_transformation - KAFKA_NONTOPICROUTING_TRANSFORMS = [ - "InsertField", - "InsertField$Key", - "InsertField$Value", - "ReplaceField", - "ReplaceField$Key", - "ReplaceField$Value", - "MaskField", - "MaskField$Key", - "MaskField$Value", - "ValueToKey", - "ValueToKey$Key", - "ValueToKey$Value", - "HoistField", - "HoistField$Key", - "HoistField$Value", - "ExtractField", - "ExtractField$Key", - "ExtractField$Value", - "SetSchemaMetadata", - "SetSchemaMetadata$Key", - "SetSchemaMetadata$Value", - "Flatten", - "Flatten$Key", - "Flatten$Value", - "Cast", - "Cast$Key", - "Cast$Value", - "HeadersFrom", - "HeadersFrom$Key", - "HeadersFrom$Value", - "TimestampConverter", - "Filter", - "InsertHeader", - "DropHeaders", - ] - # https://docs.confluent.io/platform/current/connect/transforms/overview.html - CONFLUENT_NONTOPICROUTING_TRANSFORMS = [ - "Drop", - "Drop$Key", - "Drop$Value", - "Filter", - "Filter$Key", - "Filter$Value", - "TombstoneHandler", - ] - KNOWN_NONTOPICROUTING_TRANSFORMS = ( - KAFKA_NONTOPICROUTING_TRANSFORMS - + [ - f"org.apache.kafka.connect.transforms.{t}" - for t in KAFKA_NONTOPICROUTING_TRANSFORMS - ] - + CONFLUENT_NONTOPICROUTING_TRANSFORMS - + [ - f"io.confluent.connect.transforms.{t}" - for t in CONFLUENT_NONTOPICROUTING_TRANSFORMS - ] - ) - - @dataclass - class JdbcParser: - db_connection_url: str - source_platform: str - database_name: str - topic_prefix: str - query: str - transforms: list - - def get_parser( - self, - connector_manifest: ConnectorManifest, - ) -> JdbcParser: - url = remove_prefix( - str(connector_manifest.config.get("connection.url")), "jdbc:" - ) - url_instance = make_url(url) - source_platform = get_platform_from_sqlalchemy_uri(str(url_instance)) - database_name = url_instance.database - assert database_name - db_connection_url = f"{url_instance.drivername}://{url_instance.host}:{url_instance.port}/{database_name}" - - topic_prefix = 
self.connector_manifest.config.get("topic.prefix", None) - - query = self.connector_manifest.config.get("query", None) - - transform_names = ( - self.connector_manifest.config.get("transforms", "").split(",") - if self.connector_manifest.config.get("transforms") - else [] - ) - - transforms = [] - for name in transform_names: - transform = {"name": name} - transforms.append(transform) - for key in self.connector_manifest.config.keys(): - if key.startswith(f"transforms.{name}."): - transform[ - key.replace(f"transforms.{name}.", "") - ] = self.connector_manifest.config[key] - - return self.JdbcParser( - db_connection_url, - source_platform, - database_name, - topic_prefix, - query, - transforms, - ) - - def default_get_lineages( - self, - topic_prefix: str, - database_name: str, - source_platform: str, - topic_names: Optional[Iterable[str]] = None, - include_source_dataset: bool = True, - ) -> List[KafkaConnectLineage]: - lineages: List[KafkaConnectLineage] = [] - if not topic_names: - topic_names = self.connector_manifest.topic_names - table_name_tuples: List[Tuple] = self.get_table_names() - for topic in topic_names: - # All good for NO_TRANSFORM or (SINGLE_TRANSFORM and KNOWN_NONTOPICROUTING_TRANSFORM) or (not SINGLE_TRANSFORM and all(KNOWN_NONTOPICROUTING_TRANSFORM)) - source_table: str = ( - remove_prefix(topic, topic_prefix) if topic_prefix else topic - ) - # include schema name for three-level hierarchies - if has_three_level_hierarchy(source_platform): - table_name_tuple: Tuple = next( - iter([t for t in table_name_tuples if t and t[-1] == source_table]), - (), - ) - if len(table_name_tuple) > 1: - source_table = f"{table_name_tuple[-2]}.{source_table}" - else: - include_source_dataset = False - self.report.warning( - "Could not find schema for table" - f"{self.connector_manifest.name} : {source_table}", - ) - dataset_name: str = get_dataset_name(database_name, source_table) - lineage = KafkaConnectLineage( - source_dataset=dataset_name if include_source_dataset else None, - source_platform=source_platform, - target_dataset=topic, - target_platform=KAFKA, - ) - lineages.append(lineage) - return lineages - - def get_table_names(self) -> List[Tuple]: - sep: str = "." 
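# A minimal sketch (with made-up config values) of how the JDBC source lineage code
# above turns a Kafka topic back into a source dataset name: strip the connector's
# topic.prefix, optionally prepend the schema for three-level platforms, then join
# with the database name. The helpers come from the kafka_connect.common module
# added later in this diff; the topic, prefix and database names are illustrative.
from datahub.ingestion.source.kafka_connect.common import (
    get_dataset_name,
    has_three_level_hierarchy,
    remove_prefix,
)

topic = "finance-customers"        # topic written by the connector
topic_prefix = "finance-"          # connector's topic.prefix
database_name = "core_db"          # parsed from connection.url
source_platform = "postgres"

source_table = remove_prefix(topic, topic_prefix)      # "customers"
if has_three_level_hierarchy(source_platform):
    # for postgres/trino/redshift/snowflake the schema (looked up from the task
    # config in the real code) is prepended, e.g. "public.customers"
    source_table = f"public.{source_table}"
print(get_dataset_name(database_name, source_table))   # core_db.public.customers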
- leading_quote_char: str = '"' - trailing_quote_char: str = leading_quote_char - - table_ids: List[str] = [] - if self.connector_manifest.tasks: - table_ids = ( - ",".join( - [ - task["config"].get("tables") - for task in self.connector_manifest.tasks - ] - ) - ).split(",") - quote_method = self.connector_manifest.config.get( - "quote.sql.identifiers", "always" - ) - if ( - quote_method == "always" - and table_ids - and table_ids[0] - and table_ids[-1] - ): - leading_quote_char = table_ids[0][0] - trailing_quote_char = table_ids[-1][-1] - # This will only work for single character quotes - elif self.connector_manifest.config.get("table.whitelist"): - table_ids = self.connector_manifest.config.get("table.whitelist").split(",") # type: ignore - - # List of Tuple containing (schema, table) - tables: List[Tuple] = [ - ( - ( - unquote( - table_id.split(sep)[-2], leading_quote_char, trailing_quote_char - ) - if len(table_id.split(sep)) > 1 - else "" - ), - unquote( - table_id.split(sep)[-1], leading_quote_char, trailing_quote_char - ), - ) - for table_id in table_ids - ] - return tables - - def _extract_lineages(self): - lineages: List[KafkaConnectLineage] = list() - parser = self.get_parser(self.connector_manifest) - source_platform = parser.source_platform - database_name = parser.database_name - query = parser.query - topic_prefix = parser.topic_prefix - transforms = parser.transforms - self.connector_manifest.flow_property_bag = self.connector_manifest.config - - # Mask/Remove properties that may reveal credentials - self.connector_manifest.flow_property_bag[ - "connection.url" - ] = parser.db_connection_url - if "connection.password" in self.connector_manifest.flow_property_bag: - del self.connector_manifest.flow_property_bag["connection.password"] - if "connection.user" in self.connector_manifest.flow_property_bag: - del self.connector_manifest.flow_property_bag["connection.user"] - - logging.debug( - f"Extracting source platform: {source_platform} and database name: {database_name} from connection url " - ) - - if not self.connector_manifest.topic_names: - self.connector_manifest.lineages = lineages - return - - if query: - # Lineage source_table can be extracted by parsing query - for topic in self.connector_manifest.topic_names: - # default method - as per earlier implementation - dataset_name: str = get_dataset_name(database_name, topic) - - lineage = KafkaConnectLineage( - source_dataset=None, - source_platform=source_platform, - target_dataset=topic, - target_platform=KAFKA, - ) - lineages.append(lineage) - self.report.warning( - "Could not find input dataset, the connector has query configuration set", - self.connector_manifest.name, - ) - self.connector_manifest.lineages = lineages - return - - SINGLE_TRANSFORM = len(transforms) == 1 - NO_TRANSFORM = len(transforms) == 0 - UNKNOWN_TRANSFORM = any( - [ - transform["type"] - not in self.KNOWN_TOPICROUTING_TRANSFORMS - + self.KNOWN_NONTOPICROUTING_TRANSFORMS - for transform in transforms - ] - ) - ALL_TRANSFORMS_NON_TOPICROUTING = all( - [ - transform["type"] in self.KNOWN_NONTOPICROUTING_TRANSFORMS - for transform in transforms - ] - ) - - if NO_TRANSFORM or ALL_TRANSFORMS_NON_TOPICROUTING: - self.connector_manifest.lineages = self.default_get_lineages( - database_name=database_name, - source_platform=source_platform, - topic_prefix=topic_prefix, - ) - return - - if SINGLE_TRANSFORM and transforms[0]["type"] == self.REGEXROUTER: - tables = self.get_table_names() - topic_names = list(self.connector_manifest.topic_names) - - from 
java.util.regex import Pattern - - for table in tables: - source_table: str = table[-1] - topic = topic_prefix + source_table if topic_prefix else source_table - - transform_regex = Pattern.compile(transforms[0]["regex"]) - transform_replacement = transforms[0]["replacement"] - - matcher = transform_regex.matcher(topic) - if matcher.matches(): - topic = str(matcher.replaceFirst(transform_replacement)) - - # Additional check to confirm that the topic present - # in connector topics - - if topic in self.connector_manifest.topic_names: - # include schema name for three-level hierarchies - if has_three_level_hierarchy(source_platform) and len(table) > 1: - source_table = f"{table[-2]}.{table[-1]}" - - dataset_name = get_dataset_name(database_name, source_table) - - lineage = KafkaConnectLineage( - source_dataset=dataset_name, - source_platform=source_platform, - target_dataset=topic, - target_platform=KAFKA, - ) - topic_names.remove(topic) - lineages.append(lineage) - - if topic_names: - lineages.extend( - self.default_get_lineages( - database_name=database_name, - source_platform=source_platform, - topic_prefix=topic_prefix, - topic_names=topic_names, - include_source_dataset=False, - ) - ) - self.report.warning( - "Could not find input dataset for connector topics", - f"{self.connector_manifest.name} : {topic_names}", - ) - self.connector_manifest.lineages = lineages - return - else: - include_source_dataset = True - if SINGLE_TRANSFORM and UNKNOWN_TRANSFORM: - self.report.warning( - "Could not find input dataset, connector has unknown transform", - f"{self.connector_manifest.name} : {transforms[0]['type']}", - ) - include_source_dataset = False - if not SINGLE_TRANSFORM and UNKNOWN_TRANSFORM: - self.report.warning( - "Could not find input dataset, connector has one or more unknown transforms", - self.connector_manifest.name, - ) - include_source_dataset = False - lineages = self.default_get_lineages( - database_name=database_name, - source_platform=source_platform, - topic_prefix=topic_prefix, - include_source_dataset=include_source_dataset, - ) - self.connector_manifest.lineages = lineages - return - - -@dataclass -class MongoSourceConnector: - # https://www.mongodb.com/docs/kafka-connector/current/source-connector/ - - connector_manifest: ConnectorManifest - - def __init__( - self, connector_manifest: ConnectorManifest, config: KafkaConnectSourceConfig - ) -> None: - self.connector_manifest = connector_manifest - self.config = config - self._extract_lineages() - - @dataclass - class MongoSourceParser: - db_connection_url: Optional[str] - source_platform: str - database_name: Optional[str] - topic_prefix: Optional[str] - transforms: List[str] - - def get_parser( - self, - connector_manifest: ConnectorManifest, - ) -> MongoSourceParser: - parser = self.MongoSourceParser( - db_connection_url=connector_manifest.config.get("connection.uri"), - source_platform="mongodb", - database_name=connector_manifest.config.get("database"), - topic_prefix=connector_manifest.config.get("topic_prefix"), - transforms=( - connector_manifest.config["transforms"].split(",") - if "transforms" in connector_manifest.config - else [] - ), - ) - - return parser - - def _extract_lineages(self): - lineages: List[KafkaConnectLineage] = list() - parser = self.get_parser(self.connector_manifest) - source_platform = parser.source_platform - topic_naming_pattern = r"mongodb\.(\w+)\.(\w+)" - - if not self.connector_manifest.topic_names: - return lineages - - for topic in self.connector_manifest.topic_names: - found = 
re.search(re.compile(topic_naming_pattern), topic) - - if found: - table_name = get_dataset_name(found.group(1), found.group(2)) - - lineage = KafkaConnectLineage( - source_dataset=table_name, - source_platform=source_platform, - target_dataset=topic, - target_platform=KAFKA, - ) - lineages.append(lineage) - self.connector_manifest.lineages = lineages - - -@dataclass -class DebeziumSourceConnector: - connector_manifest: ConnectorManifest - report: KafkaConnectSourceReport - - def __init__( - self, - connector_manifest: ConnectorManifest, - config: KafkaConnectSourceConfig, - report: KafkaConnectSourceReport, - ) -> None: - self.connector_manifest = connector_manifest - self.config = config - self.report = report - self._extract_lineages() - - @dataclass - class DebeziumParser: - source_platform: str - server_name: Optional[str] - database_name: Optional[str] - - def get_server_name(self, connector_manifest: ConnectorManifest) -> str: - if "topic.prefix" in connector_manifest.config: - return connector_manifest.config["topic.prefix"] - else: - return connector_manifest.config.get("database.server.name", "") - - def get_parser( - self, - connector_manifest: ConnectorManifest, - ) -> DebeziumParser: - connector_class = connector_manifest.config.get(CONNECTOR_CLASS, "") - - if connector_class == "io.debezium.connector.mysql.MySqlConnector": - parser = self.DebeziumParser( - source_platform="mysql", - server_name=self.get_server_name(connector_manifest), - database_name=None, - ) - elif connector_class == "MySqlConnector": - parser = self.DebeziumParser( - source_platform="mysql", - server_name=self.get_server_name(connector_manifest), - database_name=None, - ) - elif connector_class == "io.debezium.connector.mongodb.MongoDbConnector": - parser = self.DebeziumParser( - source_platform="mongodb", - server_name=self.get_server_name(connector_manifest), - database_name=None, - ) - elif connector_class == "io.debezium.connector.postgresql.PostgresConnector": - parser = self.DebeziumParser( - source_platform="postgres", - server_name=self.get_server_name(connector_manifest), - database_name=connector_manifest.config.get("database.dbname"), - ) - elif connector_class == "io.debezium.connector.oracle.OracleConnector": - parser = self.DebeziumParser( - source_platform="oracle", - server_name=self.get_server_name(connector_manifest), - database_name=connector_manifest.config.get("database.dbname"), - ) - elif connector_class == "io.debezium.connector.sqlserver.SqlServerConnector": - database_name = connector_manifest.config.get( - "database.names" - ) or connector_manifest.config.get("database.dbname") - - if "," in str(database_name): - raise Exception( - f"Only one database is supported for Debezium's SQL Server connector. 
Found: {database_name}" - ) - - parser = self.DebeziumParser( - source_platform="mssql", - server_name=self.get_server_name(connector_manifest), - database_name=database_name, - ) - elif connector_class == "io.debezium.connector.db2.Db2Connector": - parser = self.DebeziumParser( - source_platform="db2", - server_name=self.get_server_name(connector_manifest), - database_name=connector_manifest.config.get("database.dbname"), - ) - elif connector_class == "io.debezium.connector.vitess.VitessConnector": - parser = self.DebeziumParser( - source_platform="vitess", - server_name=self.get_server_name(connector_manifest), - database_name=connector_manifest.config.get("vitess.keyspace"), - ) - else: - raise ValueError(f"Connector class '{connector_class}' is unknown.") - - return parser - - def _extract_lineages(self): - lineages: List[KafkaConnectLineage] = list() - - try: - parser = self.get_parser(self.connector_manifest) - source_platform = parser.source_platform - server_name = parser.server_name - database_name = parser.database_name - topic_naming_pattern = rf"({server_name})\.(\w+\.\w+)" - - if not self.connector_manifest.topic_names: - return lineages - - for topic in self.connector_manifest.topic_names: - found = re.search(re.compile(topic_naming_pattern), topic) - - if found: - table_name = get_dataset_name(database_name, found.group(2)) - - lineage = KafkaConnectLineage( - source_dataset=table_name, - source_platform=source_platform, - target_dataset=topic, - target_platform=KAFKA, - ) - lineages.append(lineage) - self.connector_manifest.lineages = lineages - except Exception as e: - self.report.warning( - "Error resolving lineage for connector", - self.connector_manifest.name, - exc=e, - ) - - return - - -@dataclass -class BigQuerySinkConnector: - connector_manifest: ConnectorManifest - report: KafkaConnectSourceReport - - def __init__( - self, connector_manifest: ConnectorManifest, report: KafkaConnectSourceReport - ) -> None: - self.connector_manifest = connector_manifest - self.report = report - self._extract_lineages() - - @dataclass - class BQParser: - project: str - target_platform: str - sanitizeTopics: str - transforms: list - topicsToTables: Optional[str] = None - datasets: Optional[str] = None - defaultDataset: Optional[str] = None - version: str = "v1" - - def get_parser( - self, - connector_manifest: ConnectorManifest, - ) -> BQParser: - project = connector_manifest.config["project"] - sanitizeTopics = connector_manifest.config.get("sanitizeTopics", "false") - transform_names = ( - self.connector_manifest.config.get("transforms", "").split(",") - if self.connector_manifest.config.get("transforms") - else [] - ) - transforms = [] - for name in transform_names: - transform = {"name": name} - transforms.append(transform) - for key in self.connector_manifest.config.keys(): - if key.startswith(f"transforms.{name}."): - transform[ - key.replace(f"transforms.{name}.", "") - ] = self.connector_manifest.config[key] - - if "defaultDataset" in connector_manifest.config: - defaultDataset = connector_manifest.config["defaultDataset"] - return self.BQParser( - project=project, - defaultDataset=defaultDataset, - target_platform="bigquery", - sanitizeTopics=sanitizeTopics.lower() == "true", - version="v2", - transforms=transforms, - ) - else: - # version 1.6.x and similar configs supported - datasets = connector_manifest.config["datasets"] - topicsToTables = connector_manifest.config.get("topicsToTables") - - return self.BQParser( - project=project, - topicsToTables=topicsToTables, - 
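# A small, self-contained illustration of the Debezium topic-to-table matching used
# above, written with Python's re and invented server/database names: topics look
# like "<server_name>.<schema>.<table>", and group(2) becomes the source table that
# get_dataset_name joins with the configured database name.
import re

server_name = "pg-server-1"        # topic.prefix / database.server.name
database_name = "core_db"          # database.dbname for postgres
topic_naming_pattern = rf"({server_name})\.(\w+\.\w+)"

topic = "pg-server-1.public.customers"
found = re.search(topic_naming_pattern, topic)
if found:
    print(f"{database_name}.{found.group(2)}")   # core_db.public.customers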
datasets=datasets, - target_platform="bigquery", - sanitizeTopics=sanitizeTopics.lower() == "true", - transforms=transforms, - ) - - def get_list(self, property: str) -> Iterable[Tuple[str, str]]: - entries = property.split(",") - for entry in entries: - key, val = entry.rsplit("=") - yield (key.strip(), val.strip()) - - def get_dataset_for_topic_v1(self, topic: str, parser: BQParser) -> Optional[str]: - topicregex_dataset_map: Dict[str, str] = dict(self.get_list(parser.datasets)) # type: ignore - from java.util.regex import Pattern - - for pattern, dataset in topicregex_dataset_map.items(): - patternMatcher = Pattern.compile(pattern).matcher(topic) - if patternMatcher.matches(): - return dataset - return None - - def sanitize_table_name(self, table_name): - table_name = re.sub("[^a-zA-Z0-9_]", "_", table_name) - if re.match("^[^a-zA-Z_].*", table_name): - table_name = "_" + table_name - - return table_name - - def get_dataset_table_for_topic( - self, topic: str, parser: BQParser - ) -> Optional[str]: - if parser.version == "v2": - dataset = parser.defaultDataset - parts = topic.split(":") - if len(parts) == 2: - dataset = parts[0] - table = parts[1] - else: - table = parts[0] - else: - dataset = self.get_dataset_for_topic_v1(topic, parser) - if dataset is None: - return None - - table = topic - if parser.topicsToTables: - topicregex_table_map: Dict[str, str] = dict( - self.get_list(parser.topicsToTables) # type: ignore - ) - from java.util.regex import Pattern - - for pattern, tbl in topicregex_table_map.items(): - patternMatcher = Pattern.compile(pattern).matcher(topic) - if patternMatcher.matches(): - table = tbl - break - - if parser.sanitizeTopics: - table = self.sanitize_table_name(table) - return f"{dataset}.{table}" - - def apply_transformations( - self, topic: str, transforms: List[Dict[str, str]] - ) -> str: - for transform in transforms: - if transform["type"] == "org.apache.kafka.connect.transforms.RegexRouter": - regex = transform["regex"] - replacement = transform["replacement"] - pattern = re.compile(regex) - if pattern.match(topic): - topic = pattern.sub(replacement, topic, count=1) - return topic - - def _extract_lineages(self): - lineages: List[KafkaConnectLineage] = list() - parser = self.get_parser(self.connector_manifest) - if not parser: - return lineages - target_platform = parser.target_platform - project = parser.project - transforms = parser.transforms - self.connector_manifest.flow_property_bag = self.connector_manifest.config - # Mask/Remove properties that may reveal credentials - if "keyfile" in self.connector_manifest.flow_property_bag: - del self.connector_manifest.flow_property_bag["keyfile"] - - for topic in self.connector_manifest.topic_names: - transformed_topic = self.apply_transformations(topic, transforms) - dataset_table = self.get_dataset_table_for_topic(transformed_topic, parser) - if dataset_table is None: - self.report.warning( - "Could not find target dataset for topic, please check your connector configuration" - f"{self.connector_manifest.name} : {transformed_topic} ", - ) - continue - target_dataset = f"{project}.{dataset_table}" - - lineages.append( - KafkaConnectLineage( - source_dataset=transformed_topic, - source_platform=KAFKA, - target_dataset=target_dataset, - target_platform=target_platform, - ) - ) - self.connector_manifest.lineages = lineages - return - - -@dataclass -class SnowflakeSinkConnector: - connector_manifest: ConnectorManifest - report: KafkaConnectSourceReport - - def __init__( - self, connector_manifest: 
ConnectorManifest, report: KafkaConnectSourceReport - ) -> None: - self.connector_manifest = connector_manifest - self.report = report - self._extract_lineages() - - @dataclass - class SnowflakeParser: - database_name: str - schema_name: str - topics_to_tables: Dict[str, str] - - def get_table_name_from_topic_name(self, topic_name: str) -> str: - """ - This function converts the topic name to a valid Snowflake table name using some rules. - Refer below link for more info - https://docs.snowflake.com/en/user-guide/kafka-connector-overview#target-tables-for-kafka-topics - """ - table_name = re.sub("[^a-zA-Z0-9_]", "_", topic_name) - if re.match("^[^a-zA-Z_].*", table_name): - table_name = "_" + table_name - # Connector may append original topic's hash code as suffix for conflict resolution - # if generated table names for 2 topics are similar. This corner case is not handled here. - # Note that Snowflake recommends to choose topic names that follow the rules for - # Snowflake identifier names so this case is not recommended by snowflake. - return table_name - - def get_parser( - self, - connector_manifest: ConnectorManifest, - ) -> SnowflakeParser: - database_name = connector_manifest.config["snowflake.database.name"] - schema_name = connector_manifest.config["snowflake.schema.name"] - - # Fetch user provided topic to table map - provided_topics_to_tables: Dict[str, str] = {} - if connector_manifest.config.get("snowflake.topic2table.map"): - for each in connector_manifest.config["snowflake.topic2table.map"].split( - "," - ): - topic, table = each.split(":") - provided_topics_to_tables[topic.strip()] = table.strip() - - topics_to_tables: Dict[str, str] = {} - # Extract lineage for only those topics whose data ingestion started - for topic in connector_manifest.topic_names: - if topic in provided_topics_to_tables: - # If user provided which table to get mapped with this topic - topics_to_tables[topic] = provided_topics_to_tables[topic] - else: - # Else connector converts topic name to a valid Snowflake table name. 
- topics_to_tables[topic] = self.get_table_name_from_topic_name(topic) - - return self.SnowflakeParser( - database_name=database_name, - schema_name=schema_name, - topics_to_tables=topics_to_tables, - ) - - def _extract_lineages(self): - self.connector_manifest.flow_property_bag = self.connector_manifest.config - - # For all snowflake sink connector properties, refer below link - # https://docs.snowflake.com/en/user-guide/kafka-connector-install#configuring-the-kafka-connector - # remove private keys, secrets from properties - secret_properties = [ - "snowflake.private.key", - "snowflake.private.key.passphrase", - "value.converter.basic.auth.user.info", - ] - for k in secret_properties: - if k in self.connector_manifest.flow_property_bag: - del self.connector_manifest.flow_property_bag[k] - - lineages: List[KafkaConnectLineage] = list() - parser = self.get_parser(self.connector_manifest) - - for topic, table in parser.topics_to_tables.items(): - target_dataset = f"{parser.database_name}.{parser.schema_name}.{table}" - lineages.append( - KafkaConnectLineage( - source_dataset=topic, - source_platform=KAFKA, - target_dataset=target_dataset, - target_platform="snowflake", - ) - ) - - self.connector_manifest.lineages = lineages - return - - -@dataclass -class ConfluentS3SinkConnector: - connector_manifest: ConnectorManifest - - def __init__( - self, connector_manifest: ConnectorManifest, report: KafkaConnectSourceReport - ) -> None: - self.connector_manifest = connector_manifest - self.report = report - self._extract_lineages() - - @dataclass - class S3SinkParser: - target_platform: str - bucket: str - topics_dir: str - topics: Iterable[str] - - def _get_parser(self, connector_manifest: ConnectorManifest) -> S3SinkParser: - # https://docs.confluent.io/kafka-connectors/s3-sink/current/configuration_options.html#s3 - bucket = connector_manifest.config.get("s3.bucket.name") - if not bucket: - raise ValueError( - "Could not find 's3.bucket.name' in connector configuration" - ) - - # https://docs.confluent.io/kafka-connectors/s3-sink/current/configuration_options.html#storage - topics_dir = connector_manifest.config.get("topics.dir", "topics") - - return self.S3SinkParser( - target_platform="s3", - bucket=bucket, - topics_dir=topics_dir, - topics=connector_manifest.topic_names, - ) - - def _extract_lineages(self): - self.connector_manifest.flow_property_bag = self.connector_manifest.config - - # remove keys, secrets from properties - secret_properties = [ - "aws.access.key.id", - "aws.secret.access.key", - "s3.sse.customer.key", - "s3.proxy.password", - ] - for k in secret_properties: - if k in self.connector_manifest.flow_property_bag: - del self.connector_manifest.flow_property_bag[k] - - try: - parser = self._get_parser(self.connector_manifest) - - lineages: List[KafkaConnectLineage] = list() - for topic in parser.topics: - target_dataset = f"{parser.bucket}/{parser.topics_dir}/{topic}" - - lineages.append( - KafkaConnectLineage( - source_dataset=topic, - source_platform="kafka", - target_dataset=target_dataset, - target_platform=parser.target_platform, - ) - ) - self.connector_manifest.lineages = lineages - except Exception as e: - self.report.warning( - "Error resolving lineage for connector", - self.connector_manifest.name, - exc=e, - ) - - return - - -def transform_connector_config( - connector_config: Dict, provided_configs: List[ProvidedConfig] -) -> None: - """This method will update provided configs in connector config values, if any""" - lookupsByProvider = {} - for pconfig in 
provided_configs: - lookupsByProvider[f"${{{pconfig.provider}:{pconfig.path_key}}}"] = pconfig.value - for k, v in connector_config.items(): - for key, value in lookupsByProvider.items(): - if key in v: - connector_config[k] = connector_config[k].replace(key, value) - - -@platform_name("Kafka Connect") -@config_class(KafkaConnectSourceConfig) -@support_status(SupportStatus.CERTIFIED) -@capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default") -@capability(SourceCapability.SCHEMA_METADATA, "Enabled by default") -@capability(SourceCapability.LINEAGE_COARSE, "Enabled by default") -class KafkaConnectSource(StatefulIngestionSourceBase): - config: KafkaConnectSourceConfig - report: KafkaConnectSourceReport - platform: str = "kafka-connect" - - def __init__(self, config: KafkaConnectSourceConfig, ctx: PipelineContext): - super().__init__(config, ctx) - self.config = config - self.report = KafkaConnectSourceReport() - self.session = requests.Session() - self.session.headers.update( - { - "Accept": "application/json", - "Content-Type": "application/json", - } - ) - - # Test the connection - if self.config.username is not None and self.config.password is not None: - logger.info( - f"Connecting to {self.config.connect_uri} with Authentication..." - ) - self.session.auth = (self.config.username, self.config.password) - - test_response = self.session.get(f"{self.config.connect_uri}/connectors") - test_response.raise_for_status() - logger.info(f"Connection to {self.config.connect_uri} is ok") - if not jpype.isJVMStarted(): - jpype.startJVM() - - @classmethod - def create(cls, config_dict: dict, ctx: PipelineContext) -> Source: - config = KafkaConnectSourceConfig.parse_obj(config_dict) - return cls(config, ctx) - - def get_connectors_manifest(self) -> List[ConnectorManifest]: - """Get Kafka Connect connectors manifest using REST API. - Enrich with lineages metadata. 
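# The manifest-building logic above (and its replacement later in this diff) drives
# everything off the Kafka Connect REST API. A rough sketch of the calls involved,
# assuming a worker running at the default connect_uri; the endpoint paths mirror
# the ones used in this file.
import requests

connect_uri = "http://localhost:8083"
session = requests.Session()
session.headers.update({"Accept": "application/json", "Content-Type": "application/json"})

names = session.get(f"{connect_uri}/connectors").json()   # e.g. ["my-jdbc-source", ...]
for name in names:
    manifest = session.get(f"{connect_uri}/connectors/{name}").json()  # name/type/config/tasks
    topics = session.get(f"{connect_uri}/connectors/{name}/topics").json()[name]["topics"]
    tasks = session.get(f"{connect_uri}/connectors/{name}/tasks").json()
    print(name, manifest["type"], topics, len(tasks))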
- """ - connectors_manifest = list() - - connector_response = self.session.get( - f"{self.config.connect_uri}/connectors", - ) - - payload = connector_response.json() - - for connector_name in payload: - connector_url = f"{self.config.connect_uri}/connectors/{connector_name}" - connector_manifest = self._get_connector_manifest( - connector_name, connector_url - ) - if ( - connector_manifest is None - or not self.config.connector_patterns.allowed(connector_manifest.name) - ): - self.report.report_dropped(connector_name) - continue - - if self.config.provided_configs: - transform_connector_config( - connector_manifest.config, self.config.provided_configs - ) - # Initialize connector lineages - connector_manifest.lineages = list() - connector_manifest.url = connector_url - - connector_manifest.topic_names = self._get_connector_topics(connector_name) - - # Populate Source Connector metadata - if connector_manifest.type == SOURCE: - connector_manifest.tasks = self._get_connector_tasks(connector_name) - - # JDBC source connector lineages - if connector_manifest.config.get(CONNECTOR_CLASS).__eq__( - "io.confluent.connect.jdbc.JdbcSourceConnector" - ): - connector_manifest = ConfluentJDBCSourceConnector( - connector_manifest=connector_manifest, - config=self.config, - report=self.report, - ).connector_manifest - elif connector_manifest.config.get(CONNECTOR_CLASS, "").startswith( - "io.debezium.connector" - ): - connector_manifest = DebeziumSourceConnector( - connector_manifest=connector_manifest, - config=self.config, - report=self.report, - ).connector_manifest - elif ( - connector_manifest.config.get(CONNECTOR_CLASS, "") - == "com.mongodb.kafka.connect.MongoSourceConnector" - ): - connector_manifest = MongoSourceConnector( - connector_manifest=connector_manifest, config=self.config - ).connector_manifest - else: - # Find the target connector object in the list, or log an error if unknown. - target_connector = None - for connector in self.config.generic_connectors: - if connector.connector_name == connector_manifest.name: - target_connector = connector - break - if not target_connector: - logger.warning( - f"Detected undefined connector {connector_manifest.name}, which is not in the customized connector list. Please refer to Kafka Connect ingestion recipe to define this customized connector." - ) - continue - - for topic in connector_manifest.topic_names: - lineage = KafkaConnectLineage( - source_dataset=target_connector.source_dataset, - source_platform=target_connector.source_platform, - target_dataset=topic, - target_platform=KAFKA, - ) - - connector_manifest.lineages.append(lineage) - - if connector_manifest.type == SINK: - if connector_manifest.config.get(CONNECTOR_CLASS).__eq__( - "com.wepay.kafka.connect.bigquery.BigQuerySinkConnector" - ): - connector_manifest = BigQuerySinkConnector( - connector_manifest=connector_manifest, report=self.report - ).connector_manifest - elif connector_manifest.config.get("connector.class").__eq__( - "io.confluent.connect.s3.S3SinkConnector" - ): - connector_manifest = ConfluentS3SinkConnector( - connector_manifest=connector_manifest, report=self.report - ).connector_manifest - elif connector_manifest.config.get("connector.class").__eq__( - "com.snowflake.kafka.connector.SnowflakeSinkConnector" - ): - connector_manifest = SnowflakeSinkConnector( - connector_manifest=connector_manifest, report=self.report - ).connector_manifest - else: - self.report.report_dropped(connector_manifest.name) - logger.warning( - f"Skipping connector {connector_manifest.name}. 
Lineage for Connector not yet implemented" - ) - pass - - connectors_manifest.append(connector_manifest) - - return connectors_manifest - - def _get_connector_manifest( - self, connector_name: str, connector_url: str - ) -> Optional[ConnectorManifest]: - try: - connector_response = self.session.get(connector_url) - connector_response.raise_for_status() - except Exception as e: - self.report.warning( - "Failed to get connector details", connector_name, exc=e - ) - return None - manifest = connector_response.json() - connector_manifest = ConnectorManifest(**manifest) - return connector_manifest - - def _get_connector_tasks(self, connector_name: str) -> dict: - try: - response = self.session.get( - f"{self.config.connect_uri}/connectors/{connector_name}/tasks", - ) - response.raise_for_status() - except Exception as e: - self.report.warning( - "Error getting connector tasks", context=connector_name, exc=e - ) - return {} - - return response.json() - - def _get_connector_topics(self, connector_name: str) -> List[str]: - try: - response = self.session.get( - f"{self.config.connect_uri}/connectors/{connector_name}/topics", - ) - response.raise_for_status() - except Exception as e: - self.report.warning( - "Error getting connector topics", context=connector_name, exc=e - ) - return [] - - return response.json()[connector_name]["topics"] - - def construct_flow_workunit(self, connector: ConnectorManifest) -> MetadataWorkUnit: - connector_name = connector.name - connector_type = connector.type - connector_class = connector.config.get(CONNECTOR_CLASS) - flow_property_bag = connector.flow_property_bag - # connector_url = connector.url # NOTE: this will expose connector credential when used - flow_urn = builder.make_data_flow_urn( - self.platform, - connector_name, - self.config.env, - self.config.platform_instance, - ) - - return MetadataChangeProposalWrapper( - entityUrn=flow_urn, - aspect=models.DataFlowInfoClass( - name=connector_name, - description=f"{connector_type.capitalize()} connector using `{connector_class}` plugin.", - customProperties=flow_property_bag, - # externalUrl=connector_url, # NOTE: this will expose connector credential when used - ), - ).as_workunit() - - def construct_job_workunits( - self, connector: ConnectorManifest - ) -> Iterable[MetadataWorkUnit]: - connector_name = connector.name - flow_urn = builder.make_data_flow_urn( - self.platform, - connector_name, - self.config.env, - self.config.platform_instance, - ) - - lineages = connector.lineages - if lineages: - for lineage in lineages: - source_dataset = lineage.source_dataset - source_platform = lineage.source_platform - target_dataset = lineage.target_dataset - target_platform = lineage.target_platform - job_property_bag = lineage.job_property_bag - - source_platform_instance = get_platform_instance( - self.config, connector_name, source_platform - ) - target_platform_instance = get_platform_instance( - self.config, connector_name, target_platform - ) - - job_id = self.get_job_id(lineage, connector, self.config) - job_urn = builder.make_data_job_urn_with_flow(flow_urn, job_id) - - inlets = ( - [ - self.make_lineage_dataset_urn( - source_platform, source_dataset, source_platform_instance - ) - ] - if source_dataset - else [] - ) - outlets = [ - self.make_lineage_dataset_urn( - target_platform, target_dataset, target_platform_instance - ) - ] - - yield MetadataChangeProposalWrapper( - entityUrn=job_urn, - aspect=models.DataJobInfoClass( - name=f"{connector_name}:{job_id}", - type="COMMAND", - 
customProperties=job_property_bag, - ), - ).as_workunit() - - yield MetadataChangeProposalWrapper( - entityUrn=job_urn, - aspect=models.DataJobInputOutputClass( - inputDatasets=inlets, - outputDatasets=outlets, - ), - ).as_workunit() - - def get_job_id( - self, - lineage: KafkaConnectLineage, - connector: ConnectorManifest, - config: KafkaConnectSourceConfig, - ) -> str: - connector_class = connector.config.get(CONNECTOR_CLASS) - - # Note - This block is only to maintain backward compatibility of Job URN - if ( - connector_class - and connector.type == SOURCE - and ( - "JdbcSourceConnector" in connector_class - or connector_class.startswith("io.debezium.connector") - ) - and lineage.source_dataset - and config.connect_to_platform_map - and config.connect_to_platform_map.get(connector.name) - and config.connect_to_platform_map[connector.name].get( - lineage.source_platform - ) - ): - return f"{config.connect_to_platform_map[connector.name][lineage.source_platform]}.{lineage.source_dataset}" - - return ( - lineage.source_dataset - if lineage.source_dataset - else f"unknown_source.{lineage.target_dataset}" - ) - - def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: - return [ - *super().get_workunit_processors(), - StaleEntityRemovalHandler.create( - self, self.config, self.ctx - ).workunit_processor, - ] - - def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: - connectors_manifest = self.get_connectors_manifest() - for connector in connectors_manifest: - name = connector.name - - yield self.construct_flow_workunit(connector) - yield from self.construct_job_workunits(connector) - self.report.report_connector_scanned(name) - - def get_report(self) -> KafkaConnectSourceReport: - return self.report - - def make_lineage_dataset_urn( - self, platform: str, name: str, platform_instance: Optional[str] - ) -> str: - if self.config.convert_lineage_urns_to_lowercase: - name = name.lower() - - return builder.make_dataset_urn_with_platform_instance( - platform, name, platform_instance, self.config.env - ) - - -# TODO: Find a more automated way to discover new platforms with 3 level naming hierarchy. 
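# A brief sketch of how the lineage endpoints above become dataset URNs. The builder
# call matches the one in make_lineage_dataset_urn; the platform instance and env
# values are made up for illustration.
import datahub.emitter.mce_builder as builder

platform = "postgres"
name = "core_db.public.customers"
platform_instance = "core_finance_instance"   # resolved via connect_to_platform_map
env = "PROD"

# convert_lineage_urns_to_lowercase would lower-case `name` first if enabled
urn = builder.make_dataset_urn_with_platform_instance(platform, name, platform_instance, env)
print(urn)
# -> urn:li:dataset:(urn:li:dataPlatform:postgres,core_finance_instance.core_db.public.customers,PROD)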
-def has_three_level_hierarchy(platform: str) -> bool: - return platform in ["postgres", "trino", "redshift", "snowflake"] diff --git a/metadata-ingestion/src/datahub/ingestion/source/kafka_connect/__init__.py b/metadata-ingestion/src/datahub/ingestion/source/kafka_connect/__init__.py new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/metadata-ingestion/src/datahub/ingestion/source/kafka_connect/common.py b/metadata-ingestion/src/datahub/ingestion/source/kafka_connect/common.py new file mode 100644 index 00000000000000..36f6a96c0d4080 --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/kafka_connect/common.py @@ -0,0 +1,202 @@ +import logging +from dataclasses import dataclass, field +from typing import Dict, Iterable, List, Optional + +from pydantic.fields import Field + +from datahub.configuration.common import AllowDenyPattern, ConfigModel +from datahub.configuration.source_common import ( + DatasetLineageProviderConfigBase, + PlatformInstanceConfigMixin, +) +from datahub.ingestion.source.state.stale_entity_removal_handler import ( + StaleEntityRemovalSourceReport, + StatefulStaleMetadataRemovalConfig, +) +from datahub.ingestion.source.state.stateful_ingestion_base import ( + StatefulIngestionConfigBase, +) + +logger = logging.getLogger(__name__) + +KAFKA = "kafka" +SOURCE = "source" +SINK = "sink" +CONNECTOR_CLASS = "connector.class" + + +class ProvidedConfig(ConfigModel): + provider: str + path_key: str + value: str + + +class GenericConnectorConfig(ConfigModel): + connector_name: str + source_dataset: str + source_platform: str + + +class KafkaConnectSourceConfig( + PlatformInstanceConfigMixin, + DatasetLineageProviderConfigBase, + StatefulIngestionConfigBase, +): + # See the Connect REST Interface for details + # https://docs.confluent.io/platform/current/connect/references/restapi.html# + connect_uri: str = Field( + default="http://localhost:8083/", description="URI to connect to." + ) + username: Optional[str] = Field(default=None, description="Kafka Connect username.") + password: Optional[str] = Field(default=None, description="Kafka Connect password.") + cluster_name: Optional[str] = Field( + default="connect-cluster", description="Cluster to ingest from." + ) + # convert lineage dataset's urns to lowercase + convert_lineage_urns_to_lowercase: bool = Field( + default=False, + description="Whether to convert the urns of ingested lineage dataset to lowercase", + ) + connector_patterns: AllowDenyPattern = Field( + default=AllowDenyPattern.allow_all(), + description="regex patterns for connectors to filter for ingestion.", + ) + provided_configs: Optional[List[ProvidedConfig]] = Field( + default=None, description="Provided Configurations" + ) + connect_to_platform_map: Optional[Dict[str, Dict[str, str]]] = Field( + default=None, + description='Platform instance mapping when multiple instances for a platform is available. Entry for a platform should be in either `platform_instance_map` or `connect_to_platform_map`. e.g.`connect_to_platform_map: { "postgres-connector-finance-db": "postgres": "core_finance_instance" }`', + ) + platform_instance_map: Optional[Dict[str, str]] = Field( + default=None, + description='Platform instance mapping to use when constructing URNs. 
e.g.`platform_instance_map: { "hive": "warehouse" }`', + ) + generic_connectors: List[GenericConnectorConfig] = Field( + default=[], + description="Provide lineage graph for sources connectors other than Confluent JDBC Source Connector, Debezium Source Connector, and Mongo Source Connector", + ) + + stateful_ingestion: Optional[StatefulStaleMetadataRemovalConfig] = None + + +@dataclass +class KafkaConnectSourceReport(StaleEntityRemovalSourceReport): + connectors_scanned: int = 0 + filtered: List[str] = field(default_factory=list) + + def report_connector_scanned(self, connector: str) -> None: + self.connectors_scanned += 1 + + def report_dropped(self, connector: str) -> None: + self.filtered.append(connector) + + +@dataclass +class KafkaConnectLineage: + """Class to store Kafka Connect lineage mapping, Each instance is potential DataJob""" + + source_platform: str + target_dataset: str + target_platform: str + job_property_bag: Optional[Dict[str, str]] = None + source_dataset: Optional[str] = None + + +@dataclass +class ConnectorManifest: + """Each instance is potential DataFlow""" + + name: str + type: str + config: Dict + tasks: Dict + url: Optional[str] = None + flow_property_bag: Optional[Dict[str, str]] = None + lineages: List[KafkaConnectLineage] = field(default_factory=list) + topic_names: Iterable[str] = field(default_factory=list) + + +def remove_prefix(text: str, prefix: str) -> str: + if text.startswith(prefix): + index = len(prefix) + return text[index:] + return text + + +def unquote( + string: str, leading_quote: str = '"', trailing_quote: Optional[str] = None +) -> str: + """ + If string starts and ends with a quote, unquote it + """ + trailing_quote = trailing_quote if trailing_quote else leading_quote + if string.startswith(leading_quote) and string.endswith(trailing_quote): + string = string[1:-1] + return string + + +def get_dataset_name( + database_name: Optional[str], + source_table: str, +) -> str: + if database_name: + dataset_name = database_name + "." + source_table + else: + dataset_name = source_table + + return dataset_name + + +def get_platform_instance( + config: KafkaConnectSourceConfig, connector_name: str, platform: str +) -> Optional[str]: + instance_name = None + if ( + config.connect_to_platform_map + and config.connect_to_platform_map.get(connector_name) + and config.connect_to_platform_map[connector_name].get(platform) + ): + instance_name = config.connect_to_platform_map[connector_name][platform] + if config.platform_instance_map and config.platform_instance_map.get(platform): + logger.warning( + f"Same source platform {platform} configured in both platform_instance_map and connect_to_platform_map." + "Will prefer connector specific platform instance from connect_to_platform_map." 
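# A minimal sketch of the precedence implemented in get_platform_instance: a
# connector-specific entry in connect_to_platform_map wins over the global
# platform_instance_map. The connector names and instance values are illustrative,
# assuming the default connection settings are sufficient to build the config.
from datahub.ingestion.source.kafka_connect.common import (
    KafkaConnectSourceConfig,
    get_platform_instance,
)

config = KafkaConnectSourceConfig.parse_obj(
    {
        "connect_uri": "http://localhost:8083/",
        "platform_instance_map": {"postgres": "warehouse"},
        "connect_to_platform_map": {
            "postgres-connector-finance-db": {"postgres": "core_finance_instance"}
        },
    }
)

# connector-specific mapping takes precedence -> "core_finance_instance"
print(get_platform_instance(config, "postgres-connector-finance-db", "postgres"))
# no connector-specific entry, falls back to platform_instance_map -> "warehouse"
print(get_platform_instance(config, "some-other-connector", "postgres"))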
+ ) + elif config.platform_instance_map and config.platform_instance_map.get(platform): + instance_name = config.platform_instance_map[platform] + logger.info( + f"Instance name assigned is: {instance_name} for Connector Name {connector_name} and platform {platform}" + ) + return instance_name + + +def transform_connector_config( + connector_config: Dict, provided_configs: List[ProvidedConfig] +) -> None: + """This method will update provided configs in connector config values, if any""" + lookupsByProvider = {} + for pconfig in provided_configs: + lookupsByProvider[f"${{{pconfig.provider}:{pconfig.path_key}}}"] = pconfig.value + for k, v in connector_config.items(): + for key, value in lookupsByProvider.items(): + if key in v: + connector_config[k] = connector_config[k].replace(key, value) + + +# TODO: Find a more automated way to discover new platforms with 3 level naming hierarchy. +def has_three_level_hierarchy(platform: str) -> bool: + return platform in ["postgres", "trino", "redshift", "snowflake"] + + +@dataclass +class BaseConnector: + connector_manifest: ConnectorManifest + config: KafkaConnectSourceConfig + report: KafkaConnectSourceReport + + def extract_lineages(self) -> List[KafkaConnectLineage]: + return [] + + def extract_flow_property_bag(self) -> Optional[Dict[str, str]]: + return None diff --git a/metadata-ingestion/src/datahub/ingestion/source/kafka_connect/kafka_connect.py b/metadata-ingestion/src/datahub/ingestion/source/kafka_connect/kafka_connect.py new file mode 100644 index 00000000000000..fa6b614c4b52a6 --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/kafka_connect/kafka_connect.py @@ -0,0 +1,367 @@ +import logging +from typing import Iterable, List, Optional, Type + +import jpype +import jpype.imports +import requests + +import datahub.emitter.mce_builder as builder +import datahub.metadata.schema_classes as models +from datahub.emitter.mcp import MetadataChangeProposalWrapper +from datahub.ingestion.api.common import PipelineContext +from datahub.ingestion.api.decorators import ( + SourceCapability, + SupportStatus, + capability, + config_class, + platform_name, + support_status, +) +from datahub.ingestion.api.source import MetadataWorkUnitProcessor, Source +from datahub.ingestion.api.workunit import MetadataWorkUnit +from datahub.ingestion.source.kafka_connect.common import ( + CONNECTOR_CLASS, + SINK, + SOURCE, + BaseConnector, + ConnectorManifest, + KafkaConnectLineage, + KafkaConnectSourceConfig, + KafkaConnectSourceReport, + get_platform_instance, + transform_connector_config, +) +from datahub.ingestion.source.kafka_connect.sink_connectors import ( + BIGQUERY_SINK_CONNECTOR_CLASS, + S3_SINK_CONNECTOR_CLASS, + SNOWFLAKE_SINK_CONNECTOR_CLASS, + BigQuerySinkConnector, + ConfluentS3SinkConnector, + SnowflakeSinkConnector, +) +from datahub.ingestion.source.kafka_connect.source_connectors import ( + DEBEZIUM_SOURCE_CONNECTOR_PREFIX, + JDBC_SOURCE_CONNECTOR_CLASS, + MONGO_SOURCE_CONNECTOR_CLASS, + ConfigDrivenSourceConnector, + ConfluentJDBCSourceConnector, + DebeziumSourceConnector, + MongoSourceConnector, +) +from datahub.ingestion.source.state.stale_entity_removal_handler import ( + StaleEntityRemovalHandler, +) +from datahub.ingestion.source.state.stateful_ingestion_base import ( + StatefulIngestionSourceBase, +) + +logger = logging.getLogger(__name__) + + +@platform_name("Kafka Connect") +@config_class(KafkaConnectSourceConfig) +@support_status(SupportStatus.CERTIFIED) +@capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default") 
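# The refactor routes every connector through the BaseConnector seam defined above:
# the dispatcher instantiates a subclass with (manifest, config, report) and asks it
# for lineages and a sanitized flow property bag. A hypothetical handler might look
# like the sketch below; the "acme" platform and its config keys are invented purely
# for illustration.
from typing import Dict, List, Optional

from datahub.ingestion.source.kafka_connect.common import (
    KAFKA,
    BaseConnector,
    KafkaConnectLineage,
)


class AcmeSinkConnector(BaseConnector):
    def extract_lineages(self) -> List[KafkaConnectLineage]:
        target = self.connector_manifest.config.get("acme.table", "unknown")
        return [
            KafkaConnectLineage(
                source_dataset=topic,
                source_platform=KAFKA,
                target_dataset=target,
                target_platform="acme",
            )
            for topic in self.connector_manifest.topic_names
        ]

    def extract_flow_property_bag(self) -> Optional[Dict[str, str]]:
        # drop anything credential-like before it lands on the DataFlow properties
        return {
            k: v
            for k, v in self.connector_manifest.config.items()
            if k != "acme.api.key"
        }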
+@capability(SourceCapability.SCHEMA_METADATA, "Enabled by default") +@capability(SourceCapability.LINEAGE_COARSE, "Enabled by default") +class KafkaConnectSource(StatefulIngestionSourceBase): + config: KafkaConnectSourceConfig + report: KafkaConnectSourceReport + platform: str = "kafka-connect" + + def __init__(self, config: KafkaConnectSourceConfig, ctx: PipelineContext): + super().__init__(config, ctx) + self.config = config + self.report = KafkaConnectSourceReport() + self.session = requests.Session() + self.session.headers.update( + { + "Accept": "application/json", + "Content-Type": "application/json", + } + ) + + # Test the connection + if self.config.username is not None and self.config.password is not None: + logger.info( + f"Connecting to {self.config.connect_uri} with Authentication..." + ) + self.session.auth = (self.config.username, self.config.password) + + test_response = self.session.get(f"{self.config.connect_uri}/connectors") + test_response.raise_for_status() + logger.info(f"Connection to {self.config.connect_uri} is ok") + if not jpype.isJVMStarted(): + jpype.startJVM() + + @classmethod + def create(cls, config_dict: dict, ctx: PipelineContext) -> Source: + config = KafkaConnectSourceConfig.parse_obj(config_dict) + return cls(config, ctx) + + def get_connectors_manifest(self) -> Iterable[ConnectorManifest]: + """Get Kafka Connect connectors manifest using REST API. + Enrich with lineages metadata. + """ + + connector_response = self.session.get( + f"{self.config.connect_uri}/connectors", + ) + + payload = connector_response.json() + + for connector_name in payload: + connector_url = f"{self.config.connect_uri}/connectors/{connector_name}" + connector_manifest = self._get_connector_manifest( + connector_name, connector_url + ) + if ( + connector_manifest is None + or not self.config.connector_patterns.allowed(connector_manifest.name) + ): + self.report.report_dropped(connector_name) + continue + + if self.config.provided_configs: + transform_connector_config( + connector_manifest.config, self.config.provided_configs + ) + connector_manifest.url = connector_url + connector_manifest.topic_names = self._get_connector_topics(connector_name) + connector_class_value = connector_manifest.config.get(CONNECTOR_CLASS) or "" + + class_type: Type[BaseConnector] = BaseConnector + + # Populate Source Connector metadata + if connector_manifest.type == SOURCE: + connector_manifest.tasks = self._get_connector_tasks(connector_name) + + # JDBC source connector lineages + if connector_class_value == JDBC_SOURCE_CONNECTOR_CLASS: + class_type = ConfluentJDBCSourceConnector + elif connector_class_value.startswith(DEBEZIUM_SOURCE_CONNECTOR_PREFIX): + class_type = DebeziumSourceConnector + elif connector_class_value == MONGO_SOURCE_CONNECTOR_CLASS: + class_type = MongoSourceConnector + elif any( + [ + connector.connector_name == connector_manifest.name + for connector in self.config.generic_connectors + ] + ): + class_type = ConfigDrivenSourceConnector + else: + self.report.report_dropped(connector_manifest.name) + self.report.warning( + "Lineage for Source Connector not supported. 
" + "Please refer to Kafka Connect docs to use `generic_connectors` config.", + context=f"{connector_manifest.name} of type {connector_class_value}", + ) + continue + elif connector_manifest.type == SINK: + if connector_class_value == BIGQUERY_SINK_CONNECTOR_CLASS: + class_type = BigQuerySinkConnector + elif connector_class_value == S3_SINK_CONNECTOR_CLASS: + class_type = ConfluentS3SinkConnector + elif connector_class_value == SNOWFLAKE_SINK_CONNECTOR_CLASS: + class_type = SnowflakeSinkConnector + else: + self.report.report_dropped(connector_manifest.name) + self.report.warning( + "Lineage for Sink Connector not supported.", + context=f"{connector_manifest.name} of type {connector_class_value}", + ) + + connector_class = class_type(connector_manifest, self.config, self.report) + connector_manifest.lineages = connector_class.extract_lineages() + connector_manifest.flow_property_bag = ( + connector_class.extract_flow_property_bag() + ) + + yield connector_manifest + + def _get_connector_manifest( + self, connector_name: str, connector_url: str + ) -> Optional[ConnectorManifest]: + try: + connector_response = self.session.get(connector_url) + connector_response.raise_for_status() + except Exception as e: + self.report.warning( + "Failed to get connector details", connector_name, exc=e + ) + return None + manifest = connector_response.json() + connector_manifest = ConnectorManifest(**manifest) + return connector_manifest + + def _get_connector_tasks(self, connector_name: str) -> dict: + try: + response = self.session.get( + f"{self.config.connect_uri}/connectors/{connector_name}/tasks", + ) + response.raise_for_status() + except Exception as e: + self.report.warning( + "Error getting connector tasks", context=connector_name, exc=e + ) + return {} + + return response.json() + + def _get_connector_topics(self, connector_name: str) -> List[str]: + try: + response = self.session.get( + f"{self.config.connect_uri}/connectors/{connector_name}/topics", + ) + response.raise_for_status() + except Exception as e: + self.report.warning( + "Error getting connector topics", context=connector_name, exc=e + ) + return [] + + return response.json()[connector_name]["topics"] + + def construct_flow_workunit(self, connector: ConnectorManifest) -> MetadataWorkUnit: + connector_name = connector.name + connector_type = connector.type + connector_class = connector.config.get(CONNECTOR_CLASS) + flow_property_bag = connector.flow_property_bag + # connector_url = connector.url # NOTE: this will expose connector credential when used + flow_urn = builder.make_data_flow_urn( + self.platform, + connector_name, + self.config.env, + self.config.platform_instance, + ) + + return MetadataChangeProposalWrapper( + entityUrn=flow_urn, + aspect=models.DataFlowInfoClass( + name=connector_name, + description=f"{connector_type.capitalize()} connector using `{connector_class}` plugin.", + customProperties=flow_property_bag, + # externalUrl=connector_url, # NOTE: this will expose connector credential when used + ), + ).as_workunit() + + def construct_job_workunits( + self, connector: ConnectorManifest + ) -> Iterable[MetadataWorkUnit]: + connector_name = connector.name + flow_urn = builder.make_data_flow_urn( + self.platform, + connector_name, + self.config.env, + self.config.platform_instance, + ) + + lineages = connector.lineages + if lineages: + for lineage in lineages: + source_dataset = lineage.source_dataset + source_platform = lineage.source_platform + target_dataset = lineage.target_dataset + target_platform = 
lineage.target_platform + job_property_bag = lineage.job_property_bag + + source_platform_instance = get_platform_instance( + self.config, connector_name, source_platform + ) + target_platform_instance = get_platform_instance( + self.config, connector_name, target_platform + ) + + job_id = self.get_job_id(lineage, connector, self.config) + job_urn = builder.make_data_job_urn_with_flow(flow_urn, job_id) + + inlets = ( + [ + self.make_lineage_dataset_urn( + source_platform, source_dataset, source_platform_instance + ) + ] + if source_dataset + else [] + ) + outlets = [ + self.make_lineage_dataset_urn( + target_platform, target_dataset, target_platform_instance + ) + ] + + yield MetadataChangeProposalWrapper( + entityUrn=job_urn, + aspect=models.DataJobInfoClass( + name=f"{connector_name}:{job_id}", + type="COMMAND", + customProperties=job_property_bag, + ), + ).as_workunit() + + yield MetadataChangeProposalWrapper( + entityUrn=job_urn, + aspect=models.DataJobInputOutputClass( + inputDatasets=inlets, + outputDatasets=outlets, + ), + ).as_workunit() + + def get_job_id( + self, + lineage: KafkaConnectLineage, + connector: ConnectorManifest, + config: KafkaConnectSourceConfig, + ) -> str: + connector_class = connector.config.get(CONNECTOR_CLASS) + + # Note - This block is only to maintain backward compatibility of Job URN + if ( + connector_class + and connector.type == SOURCE + and ( + "JdbcSourceConnector" in connector_class + or connector_class.startswith("io.debezium.connector") + ) + and lineage.source_dataset + and config.connect_to_platform_map + and config.connect_to_platform_map.get(connector.name) + and config.connect_to_platform_map[connector.name].get( + lineage.source_platform + ) + ): + return f"{config.connect_to_platform_map[connector.name][lineage.source_platform]}.{lineage.source_dataset}" + + return ( + lineage.source_dataset + if lineage.source_dataset + else f"unknown_source.{lineage.target_dataset}" + ) + + def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: + return [ + *super().get_workunit_processors(), + StaleEntityRemovalHandler.create( + self, self.config, self.ctx + ).workunit_processor, + ] + + def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: + for connector in self.get_connectors_manifest(): + yield self.construct_flow_workunit(connector) + yield from self.construct_job_workunits(connector) + self.report.report_connector_scanned(connector.name) + + def get_report(self) -> KafkaConnectSourceReport: + return self.report + + def make_lineage_dataset_urn( + self, platform: str, name: str, platform_instance: Optional[str] + ) -> str: + if self.config.convert_lineage_urns_to_lowercase: + name = name.lower() + + return builder.make_dataset_urn_with_platform_instance( + platform, name, platform_instance, self.config.env + ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/kafka_connect/sink_connectors.py b/metadata-ingestion/src/datahub/ingestion/source/kafka_connect/sink_connectors.py new file mode 100644 index 00000000000000..2790460c8e6019 --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/kafka_connect/sink_connectors.py @@ -0,0 +1,341 @@ +import re +from dataclasses import dataclass +from typing import Dict, Iterable, List, Optional, Tuple + +from datahub.ingestion.source.kafka_connect.common import ( + KAFKA, + BaseConnector, + ConnectorManifest, + KafkaConnectLineage, +) + + +@dataclass +class ConfluentS3SinkConnector(BaseConnector): + @dataclass + class S3SinkParser: + target_platform: str + 
bucket: str + topics_dir: str + topics: Iterable[str] + + def _get_parser(self, connector_manifest: ConnectorManifest) -> S3SinkParser: + # https://docs.confluent.io/kafka-connectors/s3-sink/current/configuration_options.html#s3 + bucket = connector_manifest.config.get("s3.bucket.name") + if not bucket: + raise ValueError( + "Could not find 's3.bucket.name' in connector configuration" + ) + + # https://docs.confluent.io/kafka-connectors/s3-sink/current/configuration_options.html#storage + topics_dir = connector_manifest.config.get("topics.dir", "topics") + + return self.S3SinkParser( + target_platform="s3", + bucket=bucket, + topics_dir=topics_dir, + topics=connector_manifest.topic_names, + ) + + def extract_flow_property_bag(self) -> Dict[str, str]: + # Mask/Remove properties that may reveal credentials + flow_property_bag = { + k: v + for k, v in self.connector_manifest.config.items() + if k + not in [ + "aws.access.key.id", + "aws.secret.access.key", + "s3.sse.customer.key", + "s3.proxy.password", + ] + } + return flow_property_bag + + def extract_lineages(self) -> List[KafkaConnectLineage]: + try: + parser = self._get_parser(self.connector_manifest) + + lineages: List[KafkaConnectLineage] = list() + for topic in parser.topics: + target_dataset = f"{parser.bucket}/{parser.topics_dir}/{topic}" + + lineages.append( + KafkaConnectLineage( + source_dataset=topic, + source_platform="kafka", + target_dataset=target_dataset, + target_platform=parser.target_platform, + ) + ) + return lineages + except Exception as e: + self.report.warning( + "Error resolving lineage for connector", + self.connector_manifest.name, + exc=e, + ) + + return [] + + +@dataclass +class SnowflakeSinkConnector(BaseConnector): + @dataclass + class SnowflakeParser: + database_name: str + schema_name: str + topics_to_tables: Dict[str, str] + + def get_table_name_from_topic_name(self, topic_name: str) -> str: + """ + This function converts the topic name to a valid Snowflake table name using some rules. + Refer below link for more info + https://docs.snowflake.com/en/user-guide/kafka-connector-overview#target-tables-for-kafka-topics + """ + table_name = re.sub("[^a-zA-Z0-9_]", "_", topic_name) + if re.match("^[^a-zA-Z_].*", table_name): + table_name = "_" + table_name + # Connector may append original topic's hash code as suffix for conflict resolution + # if generated table names for 2 topics are similar. This corner case is not handled here. + # Note that Snowflake recommends to choose topic names that follow the rules for + # Snowflake identifier names so this case is not recommended by snowflake. 
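+        # Illustrative examples of the conversion above:
+        #   "orders-topic.v1" -> "orders_topic_v1"
+        #   "123_metrics"     -> "_123_metrics"  (leading non-letter gets an underscore prefix)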
+ return table_name + + def get_parser( + self, + connector_manifest: ConnectorManifest, + ) -> SnowflakeParser: + database_name = connector_manifest.config["snowflake.database.name"] + schema_name = connector_manifest.config["snowflake.schema.name"] + + # Fetch user provided topic to table map + provided_topics_to_tables: Dict[str, str] = {} + if connector_manifest.config.get("snowflake.topic2table.map"): + for each in connector_manifest.config["snowflake.topic2table.map"].split( + "," + ): + topic, table = each.split(":") + provided_topics_to_tables[topic.strip()] = table.strip() + + topics_to_tables: Dict[str, str] = {} + # Extract lineage for only those topics whose data ingestion started + for topic in connector_manifest.topic_names: + if topic in provided_topics_to_tables: + # If user provided which table to get mapped with this topic + topics_to_tables[topic] = provided_topics_to_tables[topic] + else: + # Else connector converts topic name to a valid Snowflake table name. + topics_to_tables[topic] = self.get_table_name_from_topic_name(topic) + + return self.SnowflakeParser( + database_name=database_name, + schema_name=schema_name, + topics_to_tables=topics_to_tables, + ) + + def extract_flow_property_bag(self) -> Dict[str, str]: + # For all snowflake sink connector properties, refer below link + # https://docs.snowflake.com/en/user-guide/kafka-connector-install#configuring-the-kafka-connector + # remove private keys, secrets from properties + flow_property_bag = { + k: v + for k, v in self.connector_manifest.config.items() + if k + not in [ + "snowflake.private.key", + "snowflake.private.key.passphrase", + "value.converter.basic.auth.user.info", + ] + } + + return flow_property_bag + + def extract_lineages(self) -> List[KafkaConnectLineage]: + lineages: List[KafkaConnectLineage] = list() + parser = self.get_parser(self.connector_manifest) + + for topic, table in parser.topics_to_tables.items(): + target_dataset = f"{parser.database_name}.{parser.schema_name}.{table}" + lineages.append( + KafkaConnectLineage( + source_dataset=topic, + source_platform=KAFKA, + target_dataset=target_dataset, + target_platform="snowflake", + ) + ) + + return lineages + + +@dataclass +class BigQuerySinkConnector(BaseConnector): + @dataclass + class BQParser: + project: str + target_platform: str + sanitizeTopics: str + transforms: list + topicsToTables: Optional[str] = None + datasets: Optional[str] = None + defaultDataset: Optional[str] = None + version: str = "v1" + + def get_parser( + self, + connector_manifest: ConnectorManifest, + ) -> BQParser: + project = connector_manifest.config["project"] + sanitizeTopics = connector_manifest.config.get("sanitizeTopics", "false") + transform_names = ( + self.connector_manifest.config.get("transforms", "").split(",") + if self.connector_manifest.config.get("transforms") + else [] + ) + transforms = [] + for name in transform_names: + transform = {"name": name} + transforms.append(transform) + for key in self.connector_manifest.config.keys(): + if key.startswith(f"transforms.{name}."): + transform[ + key.replace(f"transforms.{name}.", "") + ] = self.connector_manifest.config[key] + + if "defaultDataset" in connector_manifest.config: + defaultDataset = connector_manifest.config["defaultDataset"] + return self.BQParser( + project=project, + defaultDataset=defaultDataset, + target_platform="bigquery", + sanitizeTopics=sanitizeTopics.lower() == "true", + version="v2", + transforms=transforms, + ) + else: + # version 1.6.x and similar configs supported + datasets = 
connector_manifest.config["datasets"] + topicsToTables = connector_manifest.config.get("topicsToTables") + + return self.BQParser( + project=project, + topicsToTables=topicsToTables, + datasets=datasets, + target_platform="bigquery", + sanitizeTopics=sanitizeTopics.lower() == "true", + transforms=transforms, + ) + + def get_list(self, property: str) -> Iterable[Tuple[str, str]]: + entries = property.split(",") + for entry in entries: + key, val = entry.rsplit("=") + yield (key.strip(), val.strip()) + + def get_dataset_for_topic_v1(self, topic: str, parser: BQParser) -> Optional[str]: + topicregex_dataset_map: Dict[str, str] = dict(self.get_list(parser.datasets)) # type: ignore + from java.util.regex import Pattern + + for pattern, dataset in topicregex_dataset_map.items(): + patternMatcher = Pattern.compile(pattern).matcher(topic) + if patternMatcher.matches(): + return dataset + return None + + def sanitize_table_name(self, table_name): + table_name = re.sub("[^a-zA-Z0-9_]", "_", table_name) + if re.match("^[^a-zA-Z_].*", table_name): + table_name = "_" + table_name + + return table_name + + def get_dataset_table_for_topic( + self, topic: str, parser: BQParser + ) -> Optional[str]: + if parser.version == "v2": + dataset = parser.defaultDataset + parts = topic.split(":") + if len(parts) == 2: + dataset = parts[0] + table = parts[1] + else: + table = parts[0] + else: + dataset = self.get_dataset_for_topic_v1(topic, parser) + if dataset is None: + return None + + table = topic + if parser.topicsToTables: + topicregex_table_map: Dict[str, str] = dict( + self.get_list(parser.topicsToTables) # type: ignore + ) + from java.util.regex import Pattern + + for pattern, tbl in topicregex_table_map.items(): + patternMatcher = Pattern.compile(pattern).matcher(topic) + if patternMatcher.matches(): + table = tbl + break + + if parser.sanitizeTopics: + table = self.sanitize_table_name(table) + return f"{dataset}.{table}" + + def apply_transformations( + self, topic: str, transforms: List[Dict[str, str]] + ) -> str: + for transform in transforms: + if transform["type"] == "org.apache.kafka.connect.transforms.RegexRouter": + regex = transform["regex"] + replacement = transform["replacement"] + pattern = re.compile(regex) + if pattern.match(topic): + topic = pattern.sub(replacement, topic, count=1) + return topic + + def extract_flow_property_bag(self) -> Dict[str, str]: + # Mask/Remove properties that may reveal credentials + flow_property_bag = { + k: v + for k, v in self.connector_manifest.config.items() + if k not in ["keyfile"] + } + + return flow_property_bag + + def extract_lineages(self) -> List[KafkaConnectLineage]: + lineages: List[KafkaConnectLineage] = list() + parser = self.get_parser(self.connector_manifest) + if not parser: + return lineages + target_platform = parser.target_platform + project = parser.project + transforms = parser.transforms + + for topic in self.connector_manifest.topic_names: + transformed_topic = self.apply_transformations(topic, transforms) + dataset_table = self.get_dataset_table_for_topic(transformed_topic, parser) + if dataset_table is None: + self.report.warning( + "Could not find target dataset for topic, please check your connector configuration" + f"{self.connector_manifest.name} : {transformed_topic} ", + ) + continue + target_dataset = f"{project}.{dataset_table}" + + lineages.append( + KafkaConnectLineage( + source_dataset=transformed_topic, + source_platform=KAFKA, + target_dataset=target_dataset, + target_platform=target_platform, + ) + ) + return lineages + + 
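+# Fully qualified connector class names, matched against a connector's reported
+# "connector.class" config value to pick one of the sink handlers defined above.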
+BIGQUERY_SINK_CONNECTOR_CLASS = "com.wepay.kafka.connect.bigquery.BigQuerySinkConnector" +S3_SINK_CONNECTOR_CLASS = "io.confluent.connect.s3.S3SinkConnector" +SNOWFLAKE_SINK_CONNECTOR_CLASS = "com.snowflake.kafka.connector.SnowflakeSinkConnector" diff --git a/metadata-ingestion/src/datahub/ingestion/source/kafka_connect/source_connectors.py b/metadata-ingestion/src/datahub/ingestion/source/kafka_connect/source_connectors.py new file mode 100644 index 00000000000000..7b3b6e551a0a1f --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/kafka_connect/source_connectors.py @@ -0,0 +1,570 @@ +import logging +import re +from dataclasses import dataclass +from typing import Dict, Iterable, List, Optional, Tuple + +from sqlalchemy.engine.url import make_url + +from datahub.ingestion.source.kafka_connect.common import ( + CONNECTOR_CLASS, + KAFKA, + BaseConnector, + ConnectorManifest, + KafkaConnectLineage, + get_dataset_name, + has_three_level_hierarchy, + remove_prefix, + unquote, +) +from datahub.ingestion.source.sql.sqlalchemy_uri_mapper import ( + get_platform_from_sqlalchemy_uri, +) + + +@dataclass +class ConfluentJDBCSourceConnector(BaseConnector): + REGEXROUTER = "org.apache.kafka.connect.transforms.RegexRouter" + KNOWN_TOPICROUTING_TRANSFORMS = [REGEXROUTER] + # https://kafka.apache.org/documentation/#connect_included_transformation + KAFKA_NONTOPICROUTING_TRANSFORMS = [ + "InsertField", + "InsertField$Key", + "InsertField$Value", + "ReplaceField", + "ReplaceField$Key", + "ReplaceField$Value", + "MaskField", + "MaskField$Key", + "MaskField$Value", + "ValueToKey", + "ValueToKey$Key", + "ValueToKey$Value", + "HoistField", + "HoistField$Key", + "HoistField$Value", + "ExtractField", + "ExtractField$Key", + "ExtractField$Value", + "SetSchemaMetadata", + "SetSchemaMetadata$Key", + "SetSchemaMetadata$Value", + "Flatten", + "Flatten$Key", + "Flatten$Value", + "Cast", + "Cast$Key", + "Cast$Value", + "HeadersFrom", + "HeadersFrom$Key", + "HeadersFrom$Value", + "TimestampConverter", + "Filter", + "InsertHeader", + "DropHeaders", + ] + # https://docs.confluent.io/platform/current/connect/transforms/overview.html + CONFLUENT_NONTOPICROUTING_TRANSFORMS = [ + "Drop", + "Drop$Key", + "Drop$Value", + "Filter", + "Filter$Key", + "Filter$Value", + "TombstoneHandler", + ] + KNOWN_NONTOPICROUTING_TRANSFORMS = ( + KAFKA_NONTOPICROUTING_TRANSFORMS + + [ + f"org.apache.kafka.connect.transforms.{t}" + for t in KAFKA_NONTOPICROUTING_TRANSFORMS + ] + + CONFLUENT_NONTOPICROUTING_TRANSFORMS + + [ + f"io.confluent.connect.transforms.{t}" + for t in CONFLUENT_NONTOPICROUTING_TRANSFORMS + ] + ) + + @dataclass + class JdbcParser: + db_connection_url: str + source_platform: str + database_name: str + topic_prefix: str + query: str + transforms: list + + def get_parser( + self, + connector_manifest: ConnectorManifest, + ) -> JdbcParser: + url = remove_prefix( + str(connector_manifest.config.get("connection.url")), "jdbc:" + ) + url_instance = make_url(url) + source_platform = get_platform_from_sqlalchemy_uri(str(url_instance)) + database_name = url_instance.database + assert database_name + db_connection_url = f"{url_instance.drivername}://{url_instance.host}:{url_instance.port}/{database_name}" + + topic_prefix = self.connector_manifest.config.get("topic.prefix", None) + + query = self.connector_manifest.config.get("query", None) + + transform_names = ( + self.connector_manifest.config.get("transforms", "").split(",") + if self.connector_manifest.config.get("transforms") + else [] + ) + + transforms = [] + 
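+        # Kafka Connect flattens Single Message Transform (SMT) settings into config keys
+        # of the form "transforms.<name>.<property>"; the loop below regroups them per
+        # transform, e.g. transforms=Router plus transforms.Router.type/.regex/.replacement
+        # becomes [{"name": "Router", "type": ..., "regex": ..., "replacement": ...}].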
for name in transform_names: + transform = {"name": name} + transforms.append(transform) + for key in self.connector_manifest.config.keys(): + if key.startswith(f"transforms.{name}."): + transform[ + key.replace(f"transforms.{name}.", "") + ] = self.connector_manifest.config[key] + + return self.JdbcParser( + db_connection_url, + source_platform, + database_name, + topic_prefix, + query, + transforms, + ) + + def default_get_lineages( + self, + topic_prefix: str, + database_name: str, + source_platform: str, + topic_names: Optional[Iterable[str]] = None, + include_source_dataset: bool = True, + ) -> List[KafkaConnectLineage]: + lineages: List[KafkaConnectLineage] = [] + if not topic_names: + topic_names = self.connector_manifest.topic_names + table_name_tuples: List[Tuple] = self.get_table_names() + for topic in topic_names: + # All good for NO_TRANSFORM or (SINGLE_TRANSFORM and KNOWN_NONTOPICROUTING_TRANSFORM) or (not SINGLE_TRANSFORM and all(KNOWN_NONTOPICROUTING_TRANSFORM)) + source_table: str = ( + remove_prefix(topic, topic_prefix) if topic_prefix else topic + ) + # include schema name for three-level hierarchies + if has_three_level_hierarchy(source_platform): + table_name_tuple: Tuple = next( + iter([t for t in table_name_tuples if t and t[-1] == source_table]), + (), + ) + if len(table_name_tuple) > 1: + source_table = f"{table_name_tuple[-2]}.{source_table}" + else: + include_source_dataset = False + self.report.warning( + "Could not find schema for table" + f"{self.connector_manifest.name} : {source_table}", + ) + dataset_name: str = get_dataset_name(database_name, source_table) + lineage = KafkaConnectLineage( + source_dataset=dataset_name if include_source_dataset else None, + source_platform=source_platform, + target_dataset=topic, + target_platform=KAFKA, + ) + lineages.append(lineage) + return lineages + + def get_table_names(self) -> List[Tuple]: + sep: str = "." 
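+        # Table identifiers are taken from the running tasks' "tables" config (or from
+        # "table.whitelist") and may be quoted and schema-qualified; the parsing below
+        # turns e.g. '"PUBLIC"."ORDERS"' into the (schema, table) tuple ("PUBLIC", "ORDERS").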
+ leading_quote_char: str = '"' + trailing_quote_char: str = leading_quote_char + + table_ids: List[str] = [] + if self.connector_manifest.tasks: + table_ids = ( + ",".join( + [ + task["config"].get("tables") + for task in self.connector_manifest.tasks + ] + ) + ).split(",") + quote_method = self.connector_manifest.config.get( + "quote.sql.identifiers", "always" + ) + if ( + quote_method == "always" + and table_ids + and table_ids[0] + and table_ids[-1] + ): + leading_quote_char = table_ids[0][0] + trailing_quote_char = table_ids[-1][-1] + # This will only work for single character quotes + elif self.connector_manifest.config.get("table.whitelist"): + table_ids = self.connector_manifest.config.get("table.whitelist").split(",") # type: ignore + + # List of Tuple containing (schema, table) + tables: List[Tuple] = [ + ( + ( + unquote( + table_id.split(sep)[-2], leading_quote_char, trailing_quote_char + ) + if len(table_id.split(sep)) > 1 + else "" + ), + unquote( + table_id.split(sep)[-1], leading_quote_char, trailing_quote_char + ), + ) + for table_id in table_ids + ] + return tables + + def extract_flow_property_bag(self) -> Dict[str, str]: + flow_property_bag = { + k: v + for k, v in self.connector_manifest.config.items() + if k not in ["connection.password", "connection.user"] + } + + # Mask/Remove properties that may reveal credentials + flow_property_bag["connection.url"] = self.get_parser( + self.connector_manifest + ).db_connection_url + + return flow_property_bag + + def extract_lineages(self) -> List[KafkaConnectLineage]: + lineages: List[KafkaConnectLineage] = list() + parser = self.get_parser(self.connector_manifest) + source_platform = parser.source_platform + database_name = parser.database_name + query = parser.query + topic_prefix = parser.topic_prefix + transforms = parser.transforms + + logging.debug( + f"Extracting source platform: {source_platform} and database name: {database_name} from connection url " + ) + + if not self.connector_manifest.topic_names: + return lineages + + if query: + # Lineage source_table can be extracted by parsing query + for topic in self.connector_manifest.topic_names: + # default method - as per earlier implementation + dataset_name: str = get_dataset_name(database_name, topic) + + lineage = KafkaConnectLineage( + source_dataset=None, + source_platform=source_platform, + target_dataset=topic, + target_platform=KAFKA, + ) + lineages.append(lineage) + self.report.warning( + "Could not find input dataset, the connector has query configuration set", + self.connector_manifest.name, + ) + return lineages + + SINGLE_TRANSFORM = len(transforms) == 1 + NO_TRANSFORM = len(transforms) == 0 + UNKNOWN_TRANSFORM = any( + [ + transform["type"] + not in self.KNOWN_TOPICROUTING_TRANSFORMS + + self.KNOWN_NONTOPICROUTING_TRANSFORMS + for transform in transforms + ] + ) + ALL_TRANSFORMS_NON_TOPICROUTING = all( + [ + transform["type"] in self.KNOWN_NONTOPICROUTING_TRANSFORMS + for transform in transforms + ] + ) + + if NO_TRANSFORM or ALL_TRANSFORMS_NON_TOPICROUTING: + return self.default_get_lineages( + database_name=database_name, + source_platform=source_platform, + topic_prefix=topic_prefix, + ) + + if SINGLE_TRANSFORM and transforms[0]["type"] == self.REGEXROUTER: + tables = self.get_table_names() + topic_names = list(self.connector_manifest.topic_names) + + from java.util.regex import Pattern + + for table in tables: + source_table: str = table[-1] + topic = topic_prefix + source_table if topic_prefix else source_table + + transform_regex = 
Pattern.compile(transforms[0]["regex"]) + transform_replacement = transforms[0]["replacement"] + + matcher = transform_regex.matcher(topic) + if matcher.matches(): + topic = str(matcher.replaceFirst(transform_replacement)) + + # Additional check to confirm that the topic present + # in connector topics + + if topic in self.connector_manifest.topic_names: + # include schema name for three-level hierarchies + if has_three_level_hierarchy(source_platform) and len(table) > 1: + source_table = f"{table[-2]}.{table[-1]}" + + dataset_name = get_dataset_name(database_name, source_table) + + lineage = KafkaConnectLineage( + source_dataset=dataset_name, + source_platform=source_platform, + target_dataset=topic, + target_platform=KAFKA, + ) + topic_names.remove(topic) + lineages.append(lineage) + + if topic_names: + lineages.extend( + self.default_get_lineages( + database_name=database_name, + source_platform=source_platform, + topic_prefix=topic_prefix, + topic_names=topic_names, + include_source_dataset=False, + ) + ) + self.report.warning( + "Could not find input dataset for connector topics", + f"{self.connector_manifest.name} : {topic_names}", + ) + return lineages + else: + include_source_dataset = True + if SINGLE_TRANSFORM and UNKNOWN_TRANSFORM: + self.report.warning( + "Could not find input dataset, connector has unknown transform", + f"{self.connector_manifest.name} : {transforms[0]['type']}", + ) + include_source_dataset = False + if not SINGLE_TRANSFORM and UNKNOWN_TRANSFORM: + self.report.warning( + "Could not find input dataset, connector has one or more unknown transforms", + self.connector_manifest.name, + ) + include_source_dataset = False + lineages = self.default_get_lineages( + database_name=database_name, + source_platform=source_platform, + topic_prefix=topic_prefix, + include_source_dataset=include_source_dataset, + ) + return lineages + + +@dataclass +class MongoSourceConnector(BaseConnector): + # https://www.mongodb.com/docs/kafka-connector/current/source-connector/ + + @dataclass + class MongoSourceParser: + db_connection_url: Optional[str] + source_platform: str + database_name: Optional[str] + topic_prefix: Optional[str] + transforms: List[str] + + def get_parser( + self, + connector_manifest: ConnectorManifest, + ) -> MongoSourceParser: + parser = self.MongoSourceParser( + db_connection_url=connector_manifest.config.get("connection.uri"), + source_platform="mongodb", + database_name=connector_manifest.config.get("database"), + topic_prefix=connector_manifest.config.get("topic_prefix"), + transforms=( + connector_manifest.config["transforms"].split(",") + if "transforms" in connector_manifest.config + else [] + ), + ) + + return parser + + def extract_lineages(self) -> List[KafkaConnectLineage]: + lineages: List[KafkaConnectLineage] = list() + parser = self.get_parser(self.connector_manifest) + source_platform = parser.source_platform + topic_naming_pattern = r"mongodb\.(\w+)\.(\w+)" + + if not self.connector_manifest.topic_names: + return lineages + + for topic in self.connector_manifest.topic_names: + found = re.search(re.compile(topic_naming_pattern), topic) + + if found: + table_name = get_dataset_name(found.group(1), found.group(2)) + + lineage = KafkaConnectLineage( + source_dataset=table_name, + source_platform=source_platform, + target_dataset=topic, + target_platform=KAFKA, + ) + lineages.append(lineage) + return lineages + + +@dataclass +class DebeziumSourceConnector(BaseConnector): + @dataclass + class DebeziumParser: + source_platform: str + server_name: 
Optional[str] + database_name: Optional[str] + + def get_server_name(self, connector_manifest: ConnectorManifest) -> str: + if "topic.prefix" in connector_manifest.config: + return connector_manifest.config["topic.prefix"] + else: + return connector_manifest.config.get("database.server.name", "") + + def get_parser( + self, + connector_manifest: ConnectorManifest, + ) -> DebeziumParser: + connector_class = connector_manifest.config.get(CONNECTOR_CLASS, "") + + if connector_class == "io.debezium.connector.mysql.MySqlConnector": + parser = self.DebeziumParser( + source_platform="mysql", + server_name=self.get_server_name(connector_manifest), + database_name=None, + ) + elif connector_class == "MySqlConnector": + parser = self.DebeziumParser( + source_platform="mysql", + server_name=self.get_server_name(connector_manifest), + database_name=None, + ) + elif connector_class == "io.debezium.connector.mongodb.MongoDbConnector": + parser = self.DebeziumParser( + source_platform="mongodb", + server_name=self.get_server_name(connector_manifest), + database_name=None, + ) + elif connector_class == "io.debezium.connector.postgresql.PostgresConnector": + parser = self.DebeziumParser( + source_platform="postgres", + server_name=self.get_server_name(connector_manifest), + database_name=connector_manifest.config.get("database.dbname"), + ) + elif connector_class == "io.debezium.connector.oracle.OracleConnector": + parser = self.DebeziumParser( + source_platform="oracle", + server_name=self.get_server_name(connector_manifest), + database_name=connector_manifest.config.get("database.dbname"), + ) + elif connector_class == "io.debezium.connector.sqlserver.SqlServerConnector": + database_name = connector_manifest.config.get( + "database.names" + ) or connector_manifest.config.get("database.dbname") + + if "," in str(database_name): + raise Exception( + f"Only one database is supported for Debezium's SQL Server connector. 
Found: {database_name}" + ) + + parser = self.DebeziumParser( + source_platform="mssql", + server_name=self.get_server_name(connector_manifest), + database_name=database_name, + ) + elif connector_class == "io.debezium.connector.db2.Db2Connector": + parser = self.DebeziumParser( + source_platform="db2", + server_name=self.get_server_name(connector_manifest), + database_name=connector_manifest.config.get("database.dbname"), + ) + elif connector_class == "io.debezium.connector.vitess.VitessConnector": + parser = self.DebeziumParser( + source_platform="vitess", + server_name=self.get_server_name(connector_manifest), + database_name=connector_manifest.config.get("vitess.keyspace"), + ) + else: + raise ValueError(f"Connector class '{connector_class}' is unknown.") + + return parser + + def extract_lineages(self) -> List[KafkaConnectLineage]: + lineages: List[KafkaConnectLineage] = list() + + try: + parser = self.get_parser(self.connector_manifest) + source_platform = parser.source_platform + server_name = parser.server_name + database_name = parser.database_name + topic_naming_pattern = rf"({server_name})\.(\w+\.\w+)" + + if not self.connector_manifest.topic_names: + return lineages + + for topic in self.connector_manifest.topic_names: + found = re.search(re.compile(topic_naming_pattern), topic) + + if found: + table_name = get_dataset_name(database_name, found.group(2)) + + lineage = KafkaConnectLineage( + source_dataset=table_name, + source_platform=source_platform, + target_dataset=topic, + target_platform=KAFKA, + ) + lineages.append(lineage) + return lineages + except Exception as e: + self.report.warning( + "Error resolving lineage for connector", + self.connector_manifest.name, + exc=e, + ) + + return [] + + +@dataclass +class ConfigDrivenSourceConnector(BaseConnector): + def extract_lineages(self) -> List[KafkaConnectLineage]: + lineages = [] + for connector in self.config.generic_connectors: + if connector.connector_name == self.connector_manifest.name: + target_connector = connector + break + for topic in self.connector_manifest.topic_names: + lineage = KafkaConnectLineage( + source_dataset=target_connector.source_dataset, + source_platform=target_connector.source_platform, + target_dataset=topic, + target_platform=KAFKA, + ) + lineages.append(lineage) + return lineages + + +JDBC_SOURCE_CONNECTOR_CLASS = "io.confluent.connect.jdbc.JdbcSourceConnector" +DEBEZIUM_SOURCE_CONNECTOR_PREFIX = "io.debezium.connector" +MONGO_SOURCE_CONNECTOR_CLASS = "com.mongodb.kafka.connect.MongoSourceConnector" diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py index 57a251ef2ed14f..a66962f962255f 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py @@ -31,6 +31,10 @@ from pydantic.class_validators import validator import datahub.emitter.mce_builder as builder +from datahub.api.entities.platformresource.platform_resource import ( + PlatformResource, + PlatformResourceKey, +) from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.emitter.mcp_builder import ContainerKey, create_embed_mcp from datahub.ingestion.api.report import Report @@ -106,7 +110,7 @@ from datahub.utilities.url_util import remove_port_from_url CORPUSER_DATAHUB = "urn:li:corpuser:datahub" - +LOOKER = "looker" logger = logging.getLogger(__name__) @@ -1411,6 +1415,7 @@ class 
LookerDashboardSourceReport(StaleEntityRemovalSourceReport): resolved_user_ids: int = 0 email_ids_missing: int = 0 # resolved users with missing email addresses + looker_user_count: int = 0 _looker_api: Optional[LookerAPI] = None query_latency: Dict[str, datetime.timedelta] = dataclasses_field( @@ -1614,9 +1619,21 @@ def get_urn_dashboard_id(self): class LookerUserRegistry: looker_api_wrapper: LookerAPI fields: str = ",".join(["id", "email", "display_name", "first_name", "last_name"]) + _user_cache: Dict[str, LookerUser] = {} - def __init__(self, looker_api: LookerAPI): + def __init__(self, looker_api: LookerAPI, report: LookerDashboardSourceReport): self.looker_api_wrapper = looker_api + self.report = report + self._initialize_user_cache() + + def _initialize_user_cache(self) -> None: + raw_users: Sequence[User] = self.looker_api_wrapper.all_users( + user_fields=self.fields + ) + + for raw_user in raw_users: + looker_user = LookerUser.create_looker_user(raw_user) + self._user_cache[str(looker_user.id)] = looker_user def get_by_id(self, id_: str) -> Optional[LookerUser]: if not id_: @@ -1624,6 +1641,9 @@ def get_by_id(self, id_: str) -> Optional[LookerUser]: logger.debug(f"Will get user {id_}") + if str(id_) in self._user_cache: + return self._user_cache.get(str(id_)) + raw_user: Optional[User] = self.looker_api_wrapper.get_user( str(id_), user_fields=self.fields ) @@ -1632,3 +1652,35 @@ def get_by_id(self, id_: str) -> Optional[LookerUser]: looker_user = LookerUser.create_looker_user(raw_user) return looker_user + + def to_platform_resource( + self, platform_instance: Optional[str] + ) -> Iterable[MetadataChangeProposalWrapper]: + try: + platform_resource_key = PlatformResourceKey( + platform=LOOKER, + resource_type="USER_ID_MAPPING", + platform_instance=platform_instance, + primary_key="", + ) + + # Extract user email mappings + user_email_cache = { + user_id: user.email + for user_id, user in self._user_cache.items() + if user.email + } + + platform_resource = PlatformResource.create( + key=platform_resource_key, + value=user_email_cache, + ) + + self.report.looker_user_count = len(user_email_cache) + yield from platform_resource.to_mcps() + + except Exception as exc: + self.report.warning( + message="Failed to generate platform resource for looker id mappings", + exc=exc, + ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_lib_wrapper.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_lib_wrapper.py index ab55d4e15e5de4..c3f2a110136c45 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_lib_wrapper.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_lib_wrapper.py @@ -68,6 +68,7 @@ class LookerAPIStats(BaseModel): get_look_calls: int = 0 search_looks_calls: int = 0 search_dashboards_calls: int = 0 + all_user_calls: int = 0 class LookerAPI: @@ -135,7 +136,7 @@ def get_available_permissions(self) -> Set[str]: return permissions - @lru_cache(maxsize=1000) + @lru_cache(maxsize=5000) def get_user(self, id_: str, user_fields: str) -> Optional[User]: self.client_stats.user_calls += 1 try: @@ -154,6 +155,17 @@ def get_user(self, id_: str, user_fields: str) -> Optional[User]: # User not found return None + def all_users(self, user_fields: str) -> Sequence[User]: + self.client_stats.all_user_calls += 1 + try: + return self.client.all_users( + fields=cast(str, user_fields), + transport_options=self.transport_options, + ) + except SDKError as e: + logger.warning(f"Failure was {e}") + return [] + def 
execute_query(self, write_query: WriteQuery) -> List[Dict]: logger.debug(f"Executing query {write_query}") self.client_stats.query_calls += 1 diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py index cd8ccb8217257c..815c5dfb1c0147 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py @@ -145,7 +145,9 @@ def __init__(self, config: LookerDashboardSourceConfig, ctx: PipelineContext): self.source_config: LookerDashboardSourceConfig = config self.reporter: LookerDashboardSourceReport = LookerDashboardSourceReport() self.looker_api: LookerAPI = LookerAPI(self.source_config) - self.user_registry: LookerUserRegistry = LookerUserRegistry(self.looker_api) + self.user_registry: LookerUserRegistry = LookerUserRegistry( + self.looker_api, self.reporter + ) self.explore_registry: LookerExploreRegistry = LookerExploreRegistry( self.looker_api, self.reporter, self.source_config ) @@ -1673,5 +1675,14 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: yield usage_mcp.as_workunit() self.reporter.report_stage_end("usage_extraction") + # Dump looker user resource mappings. + logger.info("Ingesting looker user resource mapping workunits") + self.reporter.report_stage_start("user_resource_extraction") + yield from auto_workunit( + self.user_registry.to_platform_resource( + self.source_config.platform_instance + ) + ) + def get_report(self) -> SourceReport: return self.reporter diff --git a/metadata-ingestion/src/datahub/ingestion/source/mlflow.py b/metadata-ingestion/src/datahub/ingestion/source/mlflow.py index cef6d2b1bb5774..26d160acf330cf 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/mlflow.py +++ b/metadata-ingestion/src/datahub/ingestion/source/mlflow.py @@ -38,16 +38,30 @@ class MLflowConfig(EnvConfigMixin): tracking_uri: Optional[str] = Field( default=None, - description="Tracking server URI. If not set, an MLflow default tracking_uri is used (local `mlruns/` directory or `MLFLOW_TRACKING_URI` environment variable)", + description=( + "Tracking server URI. If not set, an MLflow default tracking_uri is used" + " (local `mlruns/` directory or `MLFLOW_TRACKING_URI` environment variable)" + ), ) registry_uri: Optional[str] = Field( default=None, - description="Registry server URI. If not set, an MLflow default registry_uri is used (value of tracking_uri or `MLFLOW_REGISTRY_URI` environment variable)", + description=( + "Registry server URI. If not set, an MLflow default registry_uri is used" + " (value of tracking_uri or `MLFLOW_REGISTRY_URI` environment variable)" + ), ) model_name_separator: str = Field( default="_", description="A string which separates model name from its version (e.g. model_1 or model-1)", ) + base_external_url: Optional[str] = Field( + default=None, + description=( + "Base URL to use when constructing external URLs to MLflow." + " If not set, tracking_uri is used if it's an HTTP URL." + " If neither is set, external URLs are not generated." 
+ ), + ) @dataclass @@ -279,12 +293,23 @@ def _make_ml_model_urn(self, model_version: ModelVersion) -> str: ) return urn - def _make_external_url(self, model_version: ModelVersion) -> Union[None, str]: + def _get_base_external_url_from_tracking_uri(self) -> Optional[str]: + if isinstance( + self.client.tracking_uri, str + ) and self.client.tracking_uri.startswith("http"): + return self.client.tracking_uri + else: + return None + + def _make_external_url(self, model_version: ModelVersion) -> Optional[str]: """ Generate URL for a Model Version to MLflow UI. """ - base_uri = self.client.tracking_uri - if base_uri.startswith("http"): + base_uri = ( + self.config.base_external_url + or self._get_base_external_url_from_tracking_uri() + ) + if base_uri: return f"{base_uri.rstrip('/')}/#/models/{model_version.name}/versions/{model_version.version}" else: return None diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py index f7458c4eb4d5b5..b49d40a0c7eb6a 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py @@ -9,7 +9,7 @@ import datahub.emitter.mce_builder as builder from datahub.configuration.common import AllowDenyPattern, ConfigModel -from datahub.configuration.source_common import DatasetSourceConfigMixin +from datahub.configuration.source_common import DatasetSourceConfigMixin, PlatformDetail from datahub.configuration.validate_field_deprecation import pydantic_field_deprecated from datahub.ingestion.source.common.subtypes import BIAssetSubTypes from datahub.ingestion.source.state.stale_entity_removal_handler import ( @@ -232,19 +232,6 @@ def default_for_dataset_type_mapping() -> Dict[str, str]: return dict_ -class PlatformDetail(ConfigModel): - platform_instance: Optional[str] = pydantic.Field( - default=None, - description="DataHub platform instance name. 
To generate correct urn for upstream dataset, this should match " - "with platform instance name used in ingestion " - "recipe of other datahub sources.", - ) - env: str = pydantic.Field( - default=builder.DEFAULT_ENV, - description="The environment that all assets produced by DataHub platform ingestion source belong to", - ) - - class DataBricksPlatformDetail(PlatformDetail): """ metastore is an additional field used in Databricks connector to generate the dataset urn diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py index baaa8d5b85ae10..6d51e853a2fb06 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py @@ -2,8 +2,8 @@ from abc import ABC, abstractmethod from typing import Union +from datahub.configuration.source_common import PlatformDetail from datahub.ingestion.source.powerbi.config import ( - PlatformDetail, PowerBiDashboardSourceConfig, PowerBIPlatformDetail, ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/pattern_handler.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/pattern_handler.py index ffaed79f4e42a6..63520bd731de86 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/pattern_handler.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/pattern_handler.py @@ -5,13 +5,13 @@ from lark import Tree +from datahub.configuration.source_common import PlatformDetail from datahub.emitter import mce_builder as builder from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.source.powerbi.config import ( Constant, DataBricksPlatformDetail, DataPlatformPair, - PlatformDetail, PowerBiDashboardSourceConfig, PowerBiDashboardSourceReport, PowerBIPlatformDetail, diff --git a/metadata-ingestion/src/datahub/ingestion/source/pulsar.py b/metadata-ingestion/src/datahub/ingestion/source/pulsar.py index 15ee995b2d5fdc..f71949b9eb27f7 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/pulsar.py +++ b/metadata-ingestion/src/datahub/ingestion/source/pulsar.py @@ -89,7 +89,16 @@ def __init__(self, schema): logger.error(f"Invalid JSON schema: {schema_data}. Error: {str(e)}") avro_schema = {} - self.schema_name = avro_schema.get("namespace") + "." + avro_schema.get("name") + self.schema_name = "null" + if avro_schema.get("namespace") and avro_schema.get("name"): + self.schema_name = ( + avro_schema.get("namespace") + "." 
+ avro_schema.get("name") + ) + elif avro_schema.get("namespace"): + self.schema_name = avro_schema.get("namespace") + elif avro_schema.get("name"): + self.schema_name = avro_schema.get("name") + self.schema_description = avro_schema.get("doc") self.schema_type = schema.get("type") self.schema_str = schema.get("data") diff --git a/metadata-ingestion/src/datahub/ingestion/source/s3/source.py b/metadata-ingestion/src/datahub/ingestion/source/s3/source.py index 1863663f98bb24..3ddf47b70cdf80 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/s3/source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/s3/source.py @@ -9,6 +9,7 @@ from itertools import groupby from pathlib import PurePath from typing import Any, Dict, Iterable, List, Optional, Tuple +from urllib.parse import urlparse import smart_open.compression as so_compression from more_itertools import peekable @@ -993,9 +994,7 @@ def s3_browser(self, path_spec: PathSpec, sample_size: int) -> Iterable[BrowsePa folders = [] for dir in dirs_to_process: logger.info(f"Getting files from folder: {dir}") - prefix_to_process = dir.rstrip("\\").lstrip( - self.create_s3_path(bucket_name, "/") - ) + prefix_to_process = urlparse(dir).path.lstrip("/") folders.extend( self.get_folder_info( diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py index 93d84d8b246e51..c769c6705ac3f6 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py @@ -414,9 +414,13 @@ def _process_upstream_lineage_row( except Exception as e: self.report.num_upstream_lineage_edge_parsing_failed += 1 upstream_tables = db_row.get("UPSTREAM_TABLES") + downstream_table = db_row.get("DOWNSTREAM_TABLE_NAME") self.structured_reporter.warning( "Failed to parse lineage edge", - context=f"Upstreams: {upstream_tables} Downstreams: {db_row.get('DOWNSTREAM_TABLE_NAME')}", + # Tricky: sometimes the full row data is too large, and so the context + # message gets truncated. By pulling out the upstreams and downstream + # list, we can at least get the important fields if truncation does occur. 
+ context=f"Upstreams: {upstream_tables} Downstream: {downstream_table} Full row: {db_row}", exc=e, ) return None diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_query.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_query.py index 99790de529ac3a..97c398c1962d6b 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_query.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_query.py @@ -237,6 +237,19 @@ def show_views_for_database( LIMIT {limit} {from_clause}; """ + @staticmethod + def get_secure_view_definitions() -> str: + # https://docs.snowflake.com/en/sql-reference/account-usage/views + return """ + SELECT + TABLE_CATALOG as "TABLE_CATALOG", + TABLE_SCHEMA as "TABLE_SCHEMA", + TABLE_NAME as "TABLE_NAME", + VIEW_DEFINITION as "VIEW_DEFINITION" + FROM SNOWFLAKE.ACCOUNT_USAGE.VIEWS + WHERE IS_SECURE = 'YES' AND VIEW_DEFINITION !='' AND DELETED IS NULL + """ + @staticmethod def columns_for_schema( schema_name: str, diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema.py index 5a69b4bb779d72..780effc82b0163 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema.py @@ -266,6 +266,22 @@ def get_schemas_for_database(self, db_name: str) -> List[SnowflakeSchema]: snowflake_schemas.append(snowflake_schema) return snowflake_schemas + @serialized_lru_cache(maxsize=1) + def get_secure_view_definitions(self) -> Dict[str, Dict[str, Dict[str, str]]]: + secure_view_definitions: Dict[str, Dict[str, Dict[str, str]]] = defaultdict( + lambda: defaultdict(lambda: defaultdict()) + ) + cur = self.connection.query(SnowflakeQuery.get_secure_view_definitions()) + for view in cur: + db_name = view["TABLE_CATALOG"] + schema_name = view["TABLE_SCHEMA"] + view_name = view["TABLE_NAME"] + secure_view_definitions[db_name][schema_name][view_name] = view[ + "VIEW_DEFINITION" + ] + + return secure_view_definitions + @serialized_lru_cache(maxsize=1) def get_tables_for_database( self, db_name: str diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py index 4ceeb8560c1758..bc64693b6a1084 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py @@ -424,6 +424,10 @@ def _process_schema( view_identifier = self.identifiers.get_dataset_identifier( view.name, schema_name, db_name ) + if view.is_secure and not view.view_definition: + view.view_definition = self.fetch_secure_view_definition( + view.name, schema_name, db_name + ) if view.view_definition: self.aggregator.add_view_definition( view_urn=self.identifiers.gen_dataset_urn(view_identifier), @@ -449,6 +453,25 @@ def _process_schema( context=f"{db_name}.{schema_name}", ) + def fetch_secure_view_definition( + self, table_name: str, schema_name: str, db_name: str + ) -> Optional[str]: + try: + view_definitions = self.data_dictionary.get_secure_view_definitions() + return view_definitions[db_name][schema_name][table_name] + except Exception as e: + if isinstance(e, SnowflakePermissionError): + error_msg = ( + "Failed to get secure views definitions. Please check permissions." 
+ ) + else: + error_msg = "Failed to get secure views definitions" + self.structured_reporter.warning( + error_msg, + exc=e, + ) + return None + def fetch_views_for_schema( self, snowflake_schema: SnowflakeSchema, db_name: str, schema_name: str ) -> List[SnowflakeView]: diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py index c3a7912c40e8ee..e5883dd0349a3a 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py @@ -540,6 +540,7 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: identifiers=self.identifiers, schema_resolver=schema_resolver, discovered_tables=discovered_datasets, + graph=self.ctx.graph, ) # TODO: This is slightly suboptimal because we create two SqlParsingAggregator instances with different configs diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py index 6844b8a425a7b6..6cc2220d90fd93 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py +++ b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py @@ -111,6 +111,8 @@ tableau_field_to_schema_field, workbook_graphql_query, ) +from datahub.ingestion.source.tableau.tableau_server_wrapper import UserInfo +from datahub.ingestion.source.tableau.tableau_validation import check_user_role from datahub.metadata.com.linkedin.pegasus2avro.common import ( AuditStamp, ChangeAuditStamps, @@ -167,7 +169,7 @@ try: # On earlier versions of the tableauserverclient, the NonXMLResponseError - # was thrown when reauthentication was needed. We'll keep both exceptions + # was thrown when reauthentication was necessary. We'll keep both exceptions # around for now, but can remove this in the future. from tableauserverclient.server.endpoint.exceptions import ( # type: ignore NotSignedInError, @@ -632,6 +634,33 @@ class TableauSourceReport(StaleEntityRemovalSourceReport): num_upstream_table_lineage_failed_parse_sql: int = 0 num_upstream_fine_grained_lineage_failed_parse_sql: int = 0 num_hidden_assets_skipped: int = 0 + logged_in_user: List[UserInfo] = [] + + +def report_user_role(report: TableauSourceReport, server: Server) -> None: + title: str = "Insufficient Permissions" + message: str = "The user must have the `Site Administrator Explorer` role to perform metadata ingestion." + try: + # TableauSiteSource instance is per site, so each time we need to find-out user detail + # the site-role might be different on another site + logged_in_user: UserInfo = UserInfo.from_server(server=server) + + if not logged_in_user.is_site_administrator_explorer(): + report.warning( + title=title, + message=message, + context=f"user-name={logged_in_user.user_name}, role={logged_in_user.site_role}, site_id={logged_in_user.site_id}", + ) + + report.logged_in_user.append(logged_in_user) + + except Exception as e: + report.warning( + title=title, + message="Failed to verify the user's role. 
The user must have `Site Administrator Explorer` role.", + context=f"{e}", + exc=e, + ) @platform_name("Tableau") @@ -676,6 +705,7 @@ def _authenticate(self, site_content_url: str) -> None: try: logger.info(f"Authenticated to Tableau site: '{site_content_url}'") self.server = self.config.make_tableau_client(site_content_url) + report_user_role(report=self.report, server=self.server) # Note that we're not catching ConfigurationError, since we want that to throw. except ValueError as e: self.report.failure( @@ -689,9 +719,17 @@ def test_connection(config_dict: dict) -> TestConnectionReport: test_report = TestConnectionReport() try: source_config = TableauConfig.parse_obj_allow_extras(config_dict) - source_config.make_tableau_client(source_config.site) + + server = source_config.make_tableau_client(source_config.site) + test_report.basic_connectivity = CapabilityReport(capable=True) + + test_report.capability_report = check_user_role( + logged_in_user=UserInfo.from_server(server=server) + ) + except Exception as e: + logger.warning(f"{e}", exc_info=e) test_report.basic_connectivity = CapabilityReport( capable=False, failure_reason=str(e) ) @@ -831,6 +869,8 @@ def __init__( # when emitting custom SQL data sources. self.custom_sql_ids_being_used: List[str] = [] + report_user_role(report=report, server=server) + @property def no_env_browse_prefix(self) -> str: # Prefix to use with browse path (v1) @@ -1290,7 +1330,6 @@ def get_connection_objects( page_size = page_size_override or self.config.page_size filter_pages = get_filter_pages(query_filter, page_size) - for filter_page in filter_pages: has_next_page = 1 current_cursor: Optional[str] = None diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau_common.py b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau_common.py index c5d14e0afe15a5..61b56c4bee5bda 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau_common.py @@ -975,15 +975,22 @@ def get_filter_pages(query_filter: dict, page_size: int) -> List[dict]: # a few ten thousand, then tableau server responds with empty response # causing below error: # tableauserverclient.server.endpoint.exceptions.NonXMLResponseError: b'' + + # in practice, we only do pagination if len(query_filter.keys()) == 1 + if len(query_filter.keys()) != 1: + return filter_pages + + current_key = (list(query_filter.keys()))[0] + if ( - len(query_filter.keys()) == 1 - and query_filter.get(c.ID_WITH_IN) - and isinstance(query_filter[c.ID_WITH_IN], list) + current_key in [c.ID_WITH_IN, c.PROJECT_NAME_WITH_IN] + and query_filter.get(current_key) + and isinstance(query_filter[current_key], list) ): - ids = query_filter[c.ID_WITH_IN] + ids = query_filter[current_key] filter_pages = [ { - c.ID_WITH_IN: ids[ + current_key: ids[ start : ( start + page_size if start + page_size < len(ids) else len(ids) ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau_constant.py b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau_constant.py index d1dd0d92819991..ea0878143ef354 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau_constant.py +++ b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau_constant.py @@ -81,3 +81,5 @@ PROJECT = "Project" SITE = "Site" IS_UNSUPPORTED_CUSTOM_SQL = "isUnsupportedCustomSql" +SITE_PERMISSION = "sitePermission" +SITE_ROLE = "SiteAdministratorExplorer" diff --git 
a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau_server_wrapper.py b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau_server_wrapper.py new file mode 100644 index 00000000000000..f309622d12b91b --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau_server_wrapper.py @@ -0,0 +1,33 @@ +from dataclasses import dataclass + +from tableauserverclient import Server, UserItem + +from datahub.ingestion.source.tableau import tableau_constant as c + + +@dataclass +class UserInfo: + user_name: str + site_role: str + site_id: str + + def is_site_administrator_explorer(self): + return self.site_role == c.SITE_ROLE + + @staticmethod + def from_server(server: Server) -> "UserInfo": + assert server.user_id, "make the connection with tableau" + + user: UserItem = server.users.get_by_id(server.user_id) + + assert user.site_role, "site_role is not available" # to silent the lint + + assert user.name, "user name is not available" # to silent the lint + + assert server.site_id, "site identifier is not available" # to silent the lint + + return UserInfo( + user_name=user.name, + site_role=user.site_role, + site_id=server.site_id, + ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau_validation.py b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau_validation.py new file mode 100644 index 00000000000000..4a703faf6091b3 --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau_validation.py @@ -0,0 +1,48 @@ +import logging +from typing import Dict, Union + +from datahub.ingestion.api.source import CapabilityReport, SourceCapability +from datahub.ingestion.source.tableau import tableau_constant as c +from datahub.ingestion.source.tableau.tableau_server_wrapper import UserInfo + +logger = logging.getLogger(__name__) + + +def check_user_role( + logged_in_user: UserInfo, +) -> Dict[Union[SourceCapability, str], CapabilityReport]: + capability_dict: Dict[Union[SourceCapability, str], CapabilityReport] = { + c.SITE_PERMISSION: CapabilityReport( + capable=True, + ) + } + + failure_reason: str = ( + "The user does not have the `Site Administrator Explorer` role." + ) + + mitigation_message_prefix: str = ( + "Assign `Site Administrator Explorer` role to the user" + ) + mitigation_message_suffix: str = "Refer to the setup guide: https://datahubproject.io/docs/quick-ingestion-guides/tableau/setup" + + try: + # TODO: Add check for `Enable Derived Permissions` + if not logged_in_user.is_site_administrator_explorer(): + capability_dict[c.SITE_PERMISSION] = CapabilityReport( + capable=False, + failure_reason=f"{failure_reason} Their current role is {logged_in_user.site_role}.", + mitigation_message=f"{mitigation_message_prefix} `{logged_in_user.user_name}`. {mitigation_message_suffix}", + ) + + return capability_dict + + except Exception as e: + logger.warning(msg=e, exc_info=e) + capability_dict[c.SITE_PERMISSION] = CapabilityReport( + capable=False, + failure_reason="Failed to verify user role.", + mitigation_message=f"{mitigation_message_prefix}. 
{mitigation_message_suffix}", # user is unknown + ) + + return capability_dict diff --git a/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py b/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py index 79ea98d1c7f54e..f81eb291e89e1d 100644 --- a/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py +++ b/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py @@ -490,7 +490,7 @@ def __init__( self._exit_stack.push(self._query_usage_counts) # Tool Extractor - self._tool_meta_extractor = ToolMetaExtractor() + self._tool_meta_extractor = ToolMetaExtractor.create(graph) self.report.tool_meta_report = self._tool_meta_extractor.report def close(self) -> None: diff --git a/metadata-ingestion/src/datahub/sql_parsing/tool_meta_extractor.py b/metadata-ingestion/src/datahub/sql_parsing/tool_meta_extractor.py index 0d85002776e5e2..5af9d9d4f0fffc 100644 --- a/metadata-ingestion/src/datahub/sql_parsing/tool_meta_extractor.py +++ b/metadata-ingestion/src/datahub/sql_parsing/tool_meta_extractor.py @@ -1,3 +1,4 @@ +import contextlib import json import logging from dataclasses import dataclass, field @@ -5,8 +6,15 @@ from typing_extensions import Protocol +from datahub.api.entities.platformresource.platform_resource import ( + ElasticPlatformResourceQuery, + PlatformResource, + PlatformResourceSearchFields, +) from datahub.ingestion.api.report import Report +from datahub.ingestion.graph.client import DataHubGraph from datahub.metadata.urns import CorpGroupUrn, CorpUserUrn +from datahub.utilities.search_utils import LogicalOperator from datahub.utilities.stats_collections import int_top_k_dict UrnStr = str @@ -31,6 +39,7 @@ def _get_last_line(query: str) -> str: @dataclass class ToolMetaExtractorReport(Report): num_queries_meta_extracted: Dict[str, int] = field(default_factory=int_top_k_dict) + failures: List[str] = field(default_factory=list) class ToolMetaExtractor: @@ -42,14 +51,81 @@ class ToolMetaExtractor: by warehouse query logs. 
""" - def __init__(self) -> None: - self.report = ToolMetaExtractorReport() + def __init__( + self, + report: ToolMetaExtractorReport, + looker_user_mapping: Optional[Dict[str, str]] = None, + ) -> None: + self.report = report self.known_tool_extractors: List[Tuple[str, Callable[[QueryLog], bool]]] = [ ( "mode", self._extract_mode_query, - ) + ), + ( + "looker", + self._extract_looker_query, + ), ] + # maps user id (as string) to email address + self.looker_user_mapping = looker_user_mapping + + @classmethod + def create( + cls, + graph: Optional[DataHubGraph] = None, + ) -> "ToolMetaExtractor": + report = ToolMetaExtractorReport() + looker_user_mapping = None + if graph: + try: + looker_user_mapping = cls.extract_looker_user_mapping_from_graph( + graph, report + ) + except Exception as e: + report.failures.append( + f"Unexpected error during Looker user metadata extraction: {str(e)}" + ) + + return cls(report, looker_user_mapping) + + @classmethod + def extract_looker_user_mapping_from_graph( + cls, graph: DataHubGraph, report: ToolMetaExtractorReport + ) -> Optional[Dict[str, str]]: + looker_user_mapping = None + query = ( + ElasticPlatformResourceQuery.create_from() + .group(LogicalOperator.AND) + .add_field_match(PlatformResourceSearchFields.PLATFORM, "looker") + .add_field_match( + PlatformResourceSearchFields.RESOURCE_TYPE, + "USER_ID_MAPPING", + ) + .end() + ) + platform_resources = list( + PlatformResource.search_by_filters(query=query, graph_client=graph) + ) + + if len(platform_resources) > 1: + report.failures.append( + "Looker user metadata extraction failed. Found more than one looker user id mappings." + ) + else: + platform_resource = platform_resources[0] + + if ( + platform_resource + and platform_resource.resource_info + and platform_resource.resource_info.value + ): + with contextlib.suppress(ValueError, AssertionError): + value = platform_resource.resource_info.value.as_raw_json() + if value: + looker_user_mapping = value + + return looker_user_mapping def _extract_mode_query(self, entry: QueryLog) -> bool: """ @@ -78,14 +154,49 @@ def _extract_mode_query(self, entry: QueryLog) -> bool: return True + def _extract_looker_query(self, entry: QueryLog) -> bool: + """ + Returns: + bool: whether QueryLog entry is that of looker and looker user info + is extracted into entry. 
+ """ + if not self.looker_user_mapping: + return False + + last_line = _get_last_line(entry.query_text) + + if not (last_line.startswith("--") and "Looker Query Context" in last_line): + return False + + start_quote_idx = last_line.index("'") + end_quote_idx = last_line.rindex("'") + if start_quote_idx == -1 or end_quote_idx == -1: + return False + + looker_json_raw = last_line[start_quote_idx + 1 : end_quote_idx] + looker_json = json.loads(looker_json_raw) + + user_id = str(looker_json["user_id"]) + email = self.looker_user_mapping.get(user_id) + if not email: + return False + + original_user = entry.user + + entry.user = email_to_user_urn(email) + entry.extra_info = entry.extra_info or {} + entry.extra_info["user_via"] = original_user + + return True + def extract_bi_metadata(self, entry: QueryLog) -> bool: for tool, meta_extractor in self.known_tool_extractors: try: if meta_extractor(entry): self.report.num_queries_meta_extracted[tool] += 1 return True - except Exception: - logger.debug("Tool metadata extraction failed with error : {e}") + except Exception as e: + logger.debug(f"Tool metadata extraction failed with error : {e}") return False diff --git a/metadata-ingestion/src/datahub/testing/compare_metadata_json.py b/metadata-ingestion/src/datahub/testing/compare_metadata_json.py index bedc5bc8fcd5e5..9dbadd4804997d 100644 --- a/metadata-ingestion/src/datahub/testing/compare_metadata_json.py +++ b/metadata-ingestion/src/datahub/testing/compare_metadata_json.py @@ -117,7 +117,7 @@ def diff_metadata_json( ignore_paths: Sequence[str] = (), ignore_order: bool = True, ) -> Union[DeepDiff, MCPDiff]: - ignore_paths = (*ignore_paths, *default_exclude_paths, r"root\[\d+].delta_info") + ignore_paths = [*ignore_paths, *default_exclude_paths, r"root\[\d+].delta_info"] try: if ignore_order: golden_map = get_aspects_by_urn(golden) diff --git a/metadata-ingestion/src/datahub/utilities/file_backed_collections.py b/metadata-ingestion/src/datahub/utilities/file_backed_collections.py index b0f5022446de15..b8c27666d7f538 100644 --- a/metadata-ingestion/src/datahub/utilities/file_backed_collections.py +++ b/metadata-ingestion/src/datahub/utilities/file_backed_collections.py @@ -1,6 +1,7 @@ import collections import gzip import logging +import os import pathlib import pickle import shutil @@ -33,6 +34,14 @@ logger: logging.Logger = logging.getLogger(__name__) +OVERRIDE_SQLITE_VERSION_REQUIREMENT_STR = ( + os.environ.get("OVERRIDE_SQLITE_VERSION_REQ") or "" +) +OVERRIDE_SQLITE_VERSION_REQUIREMENT = ( + OVERRIDE_SQLITE_VERSION_REQUIREMENT_STR + and OVERRIDE_SQLITE_VERSION_REQUIREMENT_STR.lower() != "false" +) + _DEFAULT_FILE_NAME = "sqlite.db" _DEFAULT_TABLE_NAME = "data" @@ -212,6 +221,7 @@ class FileBackedDict(MutableMapping[str, _VT], Closeable, Generic[_VT]): _active_object_cache: OrderedDict[str, Tuple[_VT, bool]] = field( init=False, repr=False ) + _use_sqlite_on_conflict: bool = field(repr=False, default=True) def __post_init__(self) -> None: assert ( @@ -232,7 +242,10 @@ def __post_init__(self) -> None: # We use the ON CONFLICT clause to implement UPSERTs with sqlite. # This was added in 3.24.0 from 2018-06-04. # See https://www.sqlite.org/lang_conflict.html - raise RuntimeError("SQLite version 3.24.0 or later is required") + if OVERRIDE_SQLITE_VERSION_REQUIREMENT: + self.use_sqlite_on_conflict = False + else: + raise RuntimeError("SQLite version 3.24.0 or later is required") # We keep a small cache in memory to avoid having to serialize/deserialize # data from the database too often. 
We use an OrderedDict to build @@ -295,7 +308,7 @@ def _prune_cache(self, num_items_to_prune: int) -> None: values.append(column_serializer(value)) items_to_write.append(tuple(values)) - if items_to_write: + if items_to_write and self._use_sqlite_on_conflict: # Tricky: By using a INSERT INTO ... ON CONFLICT (key) structure, we can # ensure that the rowid remains the same if a value is updated but is # autoincremented when rows are inserted. @@ -312,6 +325,26 @@ def _prune_cache(self, num_items_to_prune: int) -> None: """, items_to_write, ) + else: + for item in items_to_write: + try: + self._conn.execute( + f"""INSERT INTO {self.tablename} ( + key, + value + {''.join(f', {column_name}' for column_name in self.extra_columns.keys())} + ) + VALUES ({', '.join(['?'] *(2 + len(self.extra_columns)))})""", + item, + ) + except sqlite3.IntegrityError: + self._conn.execute( + f"""UPDATE {self.tablename} SET + value = ? + {''.join(f', {column_name} = ?' for column_name in self.extra_columns.keys())} + WHERE key = ?""", + (*item[1:], item[0]), + ) def flush(self) -> None: self._prune_cache(len(self._active_object_cache)) diff --git a/metadata-ingestion/tests/integration/git/test_git_clone.py b/metadata-ingestion/tests/integration/git/test_git_clone.py index 60cf20fefcbdd1..01e075930998a4 100644 --- a/metadata-ingestion/tests/integration/git/test_git_clone.py +++ b/metadata-ingestion/tests/integration/git/test_git_clone.py @@ -1,4 +1,5 @@ import os +import pathlib import pytest from pydantic import SecretStr @@ -12,7 +13,7 @@ LOOKML_TEST_SSH_KEY = os.environ.get("DATAHUB_LOOKML_GIT_TEST_SSH_KEY") -def test_base_url_guessing(): +def test_base_url_guessing() -> None: # Basic GitHub repo. config = GitInfo(repo="https://github.com/datahub-project/datahub", branch="master") assert config.repo_ssh_locator == "git@github.com:datahub-project/datahub.git" @@ -70,7 +71,7 @@ def test_base_url_guessing(): ) -def test_github_branch(): +def test_github_branch() -> None: config = GitInfo( repo="owner/repo", ) @@ -83,11 +84,37 @@ def test_github_branch(): assert config.branch_for_clone == "main" +def test_url_subdir() -> None: + git_ref = GitReference(repo="https://github.com/org/repo", url_subdir="dbt") + assert ( + git_ref.get_url_for_file_path("model.sql") + == "https://github.com/org/repo/blob/main/dbt/model.sql" + ) + + git_ref = GitReference(repo="https://gitlab.com/org/repo", url_subdir="dbt") + assert ( + git_ref.get_url_for_file_path("model.sql") + == "https://gitlab.com/org/repo/-/blob/main/dbt/model.sql" + ) + + git_ref = GitReference(repo="https://github.com/org/repo", url_subdir="") + assert ( + git_ref.get_url_for_file_path("model.sql") + == "https://github.com/org/repo/blob/main/model.sql" + ) + + git_ref = GitReference(repo="https://github.com/org/repo", url_subdir="dbt/models") + assert ( + git_ref.get_url_for_file_path("model.sql") + == "https://github.com/org/repo/blob/main/dbt/models/model.sql" + ) + + def test_sanitize_repo_url() -> None: assert_doctest(datahub.ingestion.source.git.git_import) -def test_git_clone_public(tmp_path): +def test_git_clone_public(tmp_path: pathlib.Path) -> None: git_clone = GitClone(str(tmp_path)) checkout_dir = git_clone.clone( ssh_key=None, @@ -107,7 +134,7 @@ def test_git_clone_public(tmp_path): LOOKML_TEST_SSH_KEY is None, reason="DATAHUB_LOOKML_GIT_TEST_SSH_KEY env variable is not configured", ) -def test_git_clone_private(tmp_path): +def test_git_clone_private(tmp_path: pathlib.Path) -> None: git_clone = GitClone(str(tmp_path)) secret_key = 
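The `FileBackedDict` change keeps the single-statement `INSERT ... ON CONFLICT` upsert as the default write path and only falls back to a plain INSERT followed by an UPDATE on `IntegrityError` when the `OVERRIDE_SQLITE_VERSION_REQ` escape hatch is set for SQLite builds older than 3.24.0. A small sketch of the two equivalent write paths against a throwaway table (table and column names are illustrative; the upsert form needs SQLite >= 3.24.0):

    import sqlite3

    conn = sqlite3.connect(":memory:")
    conn.execute("CREATE TABLE data (key TEXT PRIMARY KEY, value BLOB)")

    def write_upsert(key: str, value: bytes) -> None:
        # Single-statement upsert; keeps the existing rowid when the key is already present.
        conn.execute(
            "INSERT INTO data (key, value) VALUES (?, ?) "
            "ON CONFLICT (key) DO UPDATE SET value = excluded.value",
            (key, value),
        )

    def write_fallback(key: str, value: bytes) -> None:
        # Portable fallback: try the insert, and update in place if the key already exists.
        try:
            conn.execute("INSERT INTO data (key, value) VALUES (?, ?)", (key, value))
        except sqlite3.IntegrityError:
            conn.execute("UPDATE data SET value = ? WHERE key = ?", (value, key))

    write_upsert("a", b"1")
    write_fallback("a", b"2")
    assert conn.execute("SELECT value FROM data WHERE key = 'a'").fetchone()[0] == b"2"
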
SecretStr(LOOKML_TEST_SSH_KEY) if LOOKML_TEST_SSH_KEY else None diff --git a/metadata-ingestion/tests/integration/kafka/test_kafka.py b/metadata-ingestion/tests/integration/kafka/test_kafka.py index 0d9a714625e96b..648c4b26b20a76 100644 --- a/metadata-ingestion/tests/integration/kafka/test_kafka.py +++ b/metadata-ingestion/tests/integration/kafka/test_kafka.py @@ -102,7 +102,7 @@ def test_kafka_test_connection(mock_kafka_service, config_dict, is_success): test_connection_helpers.assert_capability_report( capability_report=report.capability_report, failure_capabilities={ - SourceCapability.SCHEMA_METADATA: "Failed to establish a new connection" + SourceCapability.SCHEMA_METADATA: "[Errno 111] Connection refused" }, ) diff --git a/metadata-ingestion/tests/integration/looker/golden_looker_mces.json b/metadata-ingestion/tests/integration/looker/golden_looker_mces.json index a9c445b5986efe..6ae772c134cb32 100644 --- a/metadata-ingestion/tests/integration/looker/golden_looker_mces.json +++ b/metadata-ingestion/tests/integration/looker/golden_looker_mces.json @@ -842,6 +842,62 @@ "pipelineName": "stateful-looker-pipeline" } }, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "platformResourceInfo", + "aspect": { + "json": { + "resourceType": "USER_ID_MAPPING", + "primaryKey": "", + "value": { + "blob": "{\"1\": \"test-1@looker.com\", \"2\": \"test-2@looker.com\", \"3\": \"test-3@looker.com\"}", + "contentType": "JSON" + } + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided", + "pipelineName": "stateful-looker-pipeline" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:looker" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided", + "pipelineName": "stateful-looker-pipeline" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided", + "pipelineName": "stateful-looker-pipeline" + } +}, { "entityType": "chart", "entityUrn": "urn:li:chart:(looker,dashboard_elements.10)", diff --git a/metadata-ingestion/tests/integration/looker/golden_test_allow_ingest.json b/metadata-ingestion/tests/integration/looker/golden_test_allow_ingest.json index af9c62a2a41803..d7620980a9cedb 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_allow_ingest.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_allow_ingest.json @@ -497,6 +497,59 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "platformResourceInfo", + "aspect": { + "json": { + "resourceType": "USER_ID_MAPPING", + "primaryKey": "", + "value": { + "blob": "{}", + "contentType": "JSON" + } + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "platformResource", + 
"entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:looker" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "chart", "entityUrn": "urn:li:chart:(looker,dashboard_elements.2)", diff --git a/metadata-ingestion/tests/integration/looker/golden_test_external_project_view_mces.json b/metadata-ingestion/tests/integration/looker/golden_test_external_project_view_mces.json index b89bc356b48fdc..13963af55bfe56 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_external_project_view_mces.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_external_project_view_mces.json @@ -735,6 +735,59 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "platformResourceInfo", + "aspect": { + "json": { + "resourceType": "USER_ID_MAPPING", + "primaryKey": "", + "value": { + "blob": "{}", + "contentType": "JSON" + } + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:looker" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "tag", "entityUrn": "urn:li:tag:Dimension", diff --git a/metadata-ingestion/tests/integration/looker/golden_test_file_path_ingest.json b/metadata-ingestion/tests/integration/looker/golden_test_file_path_ingest.json index 810fefd8f6cb85..f11d060102851c 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_file_path_ingest.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_file_path_ingest.json @@ -735,6 +735,59 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "platformResourceInfo", + "aspect": { + "json": { + "resourceType": "USER_ID_MAPPING", + "primaryKey": "", + "value": { + "blob": "{\"1\": \"test-1@looker.com\", \"2\": \"test-2@looker.com\", \"3\": \"test-3@looker.com\"}", + "contentType": "JSON" + } + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "platformResource", + "entityUrn": 
"urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:looker" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "tag", "entityUrn": "urn:li:tag:Dimension", diff --git a/metadata-ingestion/tests/integration/looker/golden_test_folder_path_pattern_ingest.json b/metadata-ingestion/tests/integration/looker/golden_test_folder_path_pattern_ingest.json index 3d78397f54a235..f6e39dd5286cd0 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_folder_path_pattern_ingest.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_folder_path_pattern_ingest.json @@ -828,6 +828,59 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "platformResourceInfo", + "aspect": { + "json": { + "resourceType": "USER_ID_MAPPING", + "primaryKey": "", + "value": { + "blob": "{\"1\": \"test-1@looker.com\", \"2\": \"test-2@looker.com\", \"3\": \"test-3@looker.com\"}", + "contentType": "JSON" + } + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:looker" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "chart", "entityUrn": "urn:li:chart:(looker,dashboard_elements.2)", diff --git a/metadata-ingestion/tests/integration/looker/golden_test_independent_look_ingest.json b/metadata-ingestion/tests/integration/looker/golden_test_independent_look_ingest.json index 5a540e61e768d7..203bed843155c8 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_independent_look_ingest.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_independent_look_ingest.json @@ -464,6 +464,21 @@ "/Folders/Shared" ] } + }, + { + "com.linkedin.pegasus2avro.common.Ownership": { + "owners": [ + { + "owner": "urn:li:corpuser:test-1@looker.com", + "type": "DATAOWNER" + } + ], + "ownerTypes": {}, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } } ] } @@ -708,6 +723,21 @@ "/Folders/Personal" ] } + }, + { + "com.linkedin.pegasus2avro.common.Ownership": { + "owners": [ + { + "owner": "urn:li:corpuser:test-2@looker.com", + "type": "DATAOWNER" + } + ], + "ownerTypes": {}, + "lastModified": { + "time": 0, + "actor": 
"urn:li:corpuser:unknown" + } + } } ] } @@ -1108,12 +1138,12 @@ { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.common.BrowsePaths": { "paths": [ - "/Explore/sales_model" + "/Explore/data" ] } }, @@ -1126,12 +1156,12 @@ "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { "project": "lkml_samples", - "model": "sales_model", + "model": "data", "looker.explore.label": "My Explore View", - "looker.explore.name": "sales_explore", + "looker.explore.name": "my_view", "looker.explore.file": "test_source_file.lkml" }, - "externalUrl": "https://looker.company.com/explore/sales_model/sales_explore", + "externalUrl": "https://looker.company.com/explore/data/my_view", "name": "My Explore View", "description": "lorem ipsum", "tags": [] @@ -1153,7 +1183,7 @@ }, { "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "sales_explore", + "schemaName": "my_view", "platform": "urn:li:dataPlatform:looker", "version": 0, "created": { @@ -1208,7 +1238,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -1227,12 +1257,12 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)", "changeType": "UPSERT", "aspectName": "embed", "aspect": { "json": { - "renderUrl": "https://looker.company.com/embed/explore/sales_model/sales_explore" + "renderUrl": "https://looker.company.com/embed/explore/data/my_view" } }, "systemMetadata": { @@ -1244,12 +1274,12 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:d38ab60586a6e39b4cf63f14946969c5" + "container": "urn:li:container:59a5aa45397364e6882e793f1bc77b42" } }, "systemMetadata": { @@ -1261,7 +1291,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { @@ -1271,8 +1301,8 @@ "id": "Explore" }, { - "id": "urn:li:container:d38ab60586a6e39b4cf63f14946969c5", - "urn": "urn:li:container:d38ab60586a6e39b4cf63f14946969c5" + "id": "urn:li:container:59a5aa45397364e6882e793f1bc77b42", + "urn": "urn:li:container:59a5aa45397364e6882e793f1bc77b42" } ] } @@ -1287,12 +1317,12 @@ { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.common.BrowsePaths": { "paths": [ - "/Explore/data" + "/Explore/order_model" ] } }, @@ -1305,12 +1335,12 @@ 
"com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { "project": "lkml_samples", - "model": "data", + "model": "order_model", "looker.explore.label": "My Explore View", - "looker.explore.name": "my_view", + "looker.explore.name": "order_explore", "looker.explore.file": "test_source_file.lkml" }, - "externalUrl": "https://looker.company.com/explore/data/my_view", + "externalUrl": "https://looker.company.com/explore/order_model/order_explore", "name": "My Explore View", "description": "lorem ipsum", "tags": [] @@ -1332,7 +1362,7 @@ }, { "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "my_view", + "schemaName": "order_explore", "platform": "urn:li:dataPlatform:looker", "version": 0, "created": { @@ -1387,7 +1417,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -1406,12 +1436,12 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)", "changeType": "UPSERT", "aspectName": "embed", "aspect": { "json": { - "renderUrl": "https://looker.company.com/embed/explore/data/my_view" + "renderUrl": "https://looker.company.com/embed/explore/order_model/order_explore" } }, "systemMetadata": { @@ -1423,12 +1453,12 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:59a5aa45397364e6882e793f1bc77b42" + "container": "urn:li:container:df4ee66abd19b668c88bfe4408f87e60" } }, "systemMetadata": { @@ -1440,7 +1470,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { @@ -1450,8 +1480,8 @@ "id": "Explore" }, { - "id": "urn:li:container:59a5aa45397364e6882e793f1bc77b42", - "urn": "urn:li:container:59a5aa45397364e6882e793f1bc77b42" + "id": "urn:li:container:df4ee66abd19b668c88bfe4408f87e60", + "urn": "urn:li:container:df4ee66abd19b668c88bfe4408f87e60" } ] } @@ -1466,12 +1496,12 @@ { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.common.BrowsePaths": { "paths": [ - "/Explore/order_model" + "/Explore/sales_model" ] } }, @@ -1484,12 +1514,12 @@ "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { "project": "lkml_samples", - "model": "order_model", + "model": "sales_model", "looker.explore.label": "My Explore View", - "looker.explore.name": "order_explore", + "looker.explore.name": "sales_explore", "looker.explore.file": "test_source_file.lkml" }, - "externalUrl": "https://looker.company.com/explore/order_model/order_explore", + "externalUrl": 
"https://looker.company.com/explore/sales_model/sales_explore", "name": "My Explore View", "description": "lorem ipsum", "tags": [] @@ -1511,7 +1541,7 @@ }, { "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "order_explore", + "schemaName": "sales_explore", "platform": "urn:li:dataPlatform:looker", "version": 0, "created": { @@ -1566,7 +1596,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -1585,12 +1615,12 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)", "changeType": "UPSERT", "aspectName": "embed", "aspect": { "json": { - "renderUrl": "https://looker.company.com/embed/explore/order_model/order_explore" + "renderUrl": "https://looker.company.com/embed/explore/sales_model/sales_explore" } }, "systemMetadata": { @@ -1602,12 +1632,12 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:df4ee66abd19b668c88bfe4408f87e60" + "container": "urn:li:container:d38ab60586a6e39b4cf63f14946969c5" } }, "systemMetadata": { @@ -1619,7 +1649,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { @@ -1629,8 +1659,8 @@ "id": "Explore" }, { - "id": "urn:li:container:df4ee66abd19b668c88bfe4408f87e60", - "urn": "urn:li:container:df4ee66abd19b668c88bfe4408f87e60" + "id": "urn:li:container:d38ab60586a6e39b4cf63f14946969c5", + "urn": "urn:li:container:d38ab60586a6e39b4cf63f14946969c5" } ] } @@ -1705,6 +1735,62 @@ "pipelineName": "execution-1" } }, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "platformResourceInfo", + "aspect": { + "json": { + "resourceType": "USER_ID_MAPPING", + "primaryKey": "", + "value": { + "blob": "{\"1\": \"test-1@looker.com\", \"2\": \"test-2@looker.com\", \"3\": \"test-3@looker.com\"}", + "contentType": "JSON" + } + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided", + "pipelineName": "execution-1" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:looker" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided", + "pipelineName": "execution-1" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + 
"systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided", + "pipelineName": "execution-1" + } +}, { "entityType": "tag", "entityUrn": "urn:li:tag:Dimension", diff --git a/metadata-ingestion/tests/integration/looker/golden_test_ingest.json b/metadata-ingestion/tests/integration/looker/golden_test_ingest.json index 9ac95b8482a475..87af50f95ed6bb 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_ingest.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_ingest.json @@ -793,6 +793,60 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:8436a2a37c4a7e81fb08c9c8415d2e4b", + "changeType": "UPSERT", + "aspectName": "platformResourceInfo", + "aspect": { + "json": { + "resourceType": "USER_ID_MAPPING", + "primaryKey": "", + "value": { + "blob": "{}", + "contentType": "JSON" + } + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:8436a2a37c4a7e81fb08c9c8415d2e4b", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:looker", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:looker,ap-south-1)" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:8436a2a37c4a7e81fb08c9c8415d2e4b", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "tag", "entityUrn": "urn:li:tag:Dimension", diff --git a/metadata-ingestion/tests/integration/looker/golden_test_ingest_joins.json b/metadata-ingestion/tests/integration/looker/golden_test_ingest_joins.json index 3a2c6359ea63c2..b990ce7c67dab6 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_ingest_joins.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_ingest_joins.json @@ -759,6 +759,59 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "platformResourceInfo", + "aspect": { + "json": { + "resourceType": "USER_ID_MAPPING", + "primaryKey": "", + "value": { + "blob": "{}", + "contentType": "JSON" + } + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:looker" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, 
{ "entityType": "tag", "entityUrn": "urn:li:tag:Dimension", diff --git a/metadata-ingestion/tests/integration/looker/golden_test_ingest_unaliased_joins.json b/metadata-ingestion/tests/integration/looker/golden_test_ingest_unaliased_joins.json index 007eee348aeaf8..391192b3d16f36 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_ingest_unaliased_joins.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_ingest_unaliased_joins.json @@ -513,6 +513,59 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "platformResourceInfo", + "aspect": { + "json": { + "resourceType": "USER_ID_MAPPING", + "primaryKey": "", + "value": { + "blob": "{}", + "contentType": "JSON" + } + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:looker" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "chart", "entityUrn": "urn:li:chart:(looker,dashboard_elements.2)", diff --git a/metadata-ingestion/tests/integration/looker/golden_test_non_personal_independent_look.json b/metadata-ingestion/tests/integration/looker/golden_test_non_personal_independent_look.json index 859b9163d7aad6..4909a6af73a225 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_non_personal_independent_look.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_non_personal_independent_look.json @@ -464,6 +464,21 @@ "/Folders/Shared" ] } + }, + { + "com.linkedin.pegasus2avro.common.Ownership": { + "owners": [ + { + "owner": "urn:li:corpuser:test-1@looker.com", + "type": "DATAOWNER" + } + ], + "ownerTypes": {}, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } } ] } @@ -1185,6 +1200,62 @@ "pipelineName": "execution-1" } }, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "platformResourceInfo", + "aspect": { + "json": { + "resourceType": "USER_ID_MAPPING", + "primaryKey": "", + "value": { + "blob": "{\"1\": \"test-1@looker.com\", \"2\": \"test-2@looker.com\", \"3\": \"test-3@looker.com\"}", + "contentType": "JSON" + } + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided", + "pipelineName": "execution-1" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:looker" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided", + 
"pipelineName": "execution-1" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided", + "pipelineName": "execution-1" + } +}, { "entityType": "tag", "entityUrn": "urn:li:tag:Dimension", diff --git a/metadata-ingestion/tests/integration/looker/looker_mces_golden_deleted_stateful.json b/metadata-ingestion/tests/integration/looker/looker_mces_golden_deleted_stateful.json index 8256c984afb274..ddeb5428b1d726 100644 --- a/metadata-ingestion/tests/integration/looker/looker_mces_golden_deleted_stateful.json +++ b/metadata-ingestion/tests/integration/looker/looker_mces_golden_deleted_stateful.json @@ -762,6 +762,62 @@ "pipelineName": "stateful-looker-pipeline" } }, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "platformResourceInfo", + "aspect": { + "json": { + "resourceType": "USER_ID_MAPPING", + "primaryKey": "", + "value": { + "blob": "{\"1\": \"test-1@looker.com\", \"2\": \"test-2@looker.com\", \"3\": \"test-3@looker.com\"}", + "contentType": "JSON" + } + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided", + "pipelineName": "stateful-looker-pipeline" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:looker" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided", + "pipelineName": "stateful-looker-pipeline" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided", + "pipelineName": "stateful-looker-pipeline" + } +}, { "entityType": "tag", "entityUrn": "urn:li:tag:Dimension", @@ -814,8 +870,8 @@ } }, { - "entityType": "dashboard", - "entityUrn": "urn:li:dashboard:(looker,dashboards.11)", + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,bogus data.explore.my_view,PROD)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -831,8 +887,8 @@ } }, { - "entityType": "chart", - "entityUrn": "urn:li:chart:(looker,dashboard_elements.10)", + "entityType": "dashboard", + "entityUrn": "urn:li:dashboard:(looker,dashboards.11)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -865,8 +921,8 @@ } }, { - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,bogus data.explore.my_view,PROD)", + "entityType": "chart", + "entityUrn": "urn:li:chart:(looker,dashboard_elements.10)", "changeType": "UPSERT", "aspectName": "status", "aspect": { diff --git a/metadata-ingestion/tests/integration/looker/looker_mces_usage_history.json b/metadata-ingestion/tests/integration/looker/looker_mces_usage_history.json index 0b3530f9c24629..594983c8fb0f2a 100644 --- 
a/metadata-ingestion/tests/integration/looker/looker_mces_usage_history.json +++ b/metadata-ingestion/tests/integration/looker/looker_mces_usage_history.json @@ -678,6 +678,59 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "platformResourceInfo", + "aspect": { + "json": { + "resourceType": "USER_ID_MAPPING", + "primaryKey": "", + "value": { + "blob": "{\"1\": \"test-1@looker.com\", \"2\": \"test-2@looker.com\", \"3\": \"test-3@looker.com\"}", + "contentType": "JSON" + } + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:looker" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "chart", "entityUrn": "urn:li:chart:(looker,dashboard_elements.2)", diff --git a/metadata-ingestion/tests/integration/looker/test_looker.py b/metadata-ingestion/tests/integration/looker/test_looker.py index 8bbf14709ff9fb..a39de8384efb23 100644 --- a/metadata-ingestion/tests/integration/looker/test_looker.py +++ b/metadata-ingestion/tests/integration/looker/test_looker.py @@ -83,6 +83,7 @@ def test_looker_ingest(pytestconfig, tmp_path, mock_time): with mock.patch("looker_sdk.init40") as mock_sdk: mock_sdk.return_value = mocked_client setup_mock_dashboard(mocked_client) + mocked_client.run_inline_query.side_effect = side_effect_query_inline setup_mock_explore(mocked_client) test_resources_dir = pytestconfig.rootpath / "tests/integration/looker" @@ -319,6 +320,7 @@ def setup_mock_look(mocked_client): mocked_client.all_looks.return_value = [ Look( id="1", + user_id="1", title="Outer Look", description="I am not part of any Dashboard", query_id="1", @@ -327,6 +329,7 @@ def setup_mock_look(mocked_client): Look( id="2", title="Personal Look", + user_id="2", description="I am not part of any Dashboard and in personal folder", query_id="2", folder=FolderBase( @@ -561,6 +564,20 @@ def get_user( mocked_client.user.side_effect = get_user +def setup_mock_all_user(mocked_client): + def all_users( + fields: Optional[str] = None, + transport_options: Optional[transport.TransportOptions] = None, + ) -> List[User]: + return [ + User(id="1", email="test-1@looker.com"), + User(id="2", email="test-2@looker.com"), + User(id="3", email="test-3@looker.com"), + ] + + mocked_client.all_users.side_effect = all_users + + def side_effect_query_inline( result_format: str, body: WriteQuery, transport_options: Optional[TransportOptions] ) -> str: @@ -714,6 +731,7 @@ def test_looker_ingest_usage_history(pytestconfig, tmp_path, mock_time): mocked_client.run_inline_query.side_effect = side_effect_query_inline setup_mock_explore(mocked_client) setup_mock_user(mocked_client) + setup_mock_all_user(mocked_client) test_resources_dir = 
pytestconfig.rootpath / "tests/integration/looker" @@ -946,6 +964,8 @@ def ingest_independent_looks( mock_sdk.return_value = mocked_client setup_mock_dashboard(mocked_client) setup_mock_explore(mocked_client) + setup_mock_user(mocked_client) + setup_mock_all_user(mocked_client) setup_mock_look(mocked_client) test_resources_dir = pytestconfig.rootpath / "tests/integration/looker" diff --git a/metadata-ingestion/tests/integration/snowflake/common.py b/metadata-ingestion/tests/integration/snowflake/common.py index 9e4bb2f0eb634f..862d27186703a8 100644 --- a/metadata-ingestion/tests/integration/snowflake/common.py +++ b/metadata-ingestion/tests/integration/snowflake/common.py @@ -14,6 +14,11 @@ NUM_OPS = 10 NUM_USAGE = 0 + +def is_secure(view_idx): + return view_idx == 1 + + FROZEN_TIME = "2022-06-07 17:00:00" large_sql_query = """WITH object_access_history AS ( @@ -247,9 +252,25 @@ def default_query_results( # noqa: C901 "name": f"VIEW_{view_idx}", "created_on": datetime(2021, 6, 8, 0, 0, 0, 0), "comment": "Comment for View", - "text": f"create view view_{view_idx} as select * from table_{view_idx}", + "is_secure": "true" if is_secure(view_idx) else "false", + "text": ( + f"create view view_{view_idx} as select * from table_{view_idx}" + if not is_secure(view_idx) + else None + ), + } + for view_idx in range(1, num_views + 1) + ] + elif query == SnowflakeQuery.get_secure_view_definitions(): + return [ + { + "TABLE_CATALOG": "TEST_DB", + "TABLE_SCHEMA": "TEST_SCHEMA", + "TABLE_NAME": f"VIEW_{view_idx}", + "VIEW_DEFINITION": f"create view view_{view_idx} as select * from table_{view_idx}", } for view_idx in range(1, num_views + 1) + if is_secure(view_idx) ] elif query == SnowflakeQuery.columns_for_schema("TEST_SCHEMA", "TEST_DB"): return [ diff --git a/metadata-ingestion/tests/integration/snowflake/snowflake_golden.json b/metadata-ingestion/tests/integration/snowflake/snowflake_golden.json index 4415b1ad3e5159..48ec46af069cef 100644 --- a/metadata-ingestion/tests/integration/snowflake/snowflake_golden.json +++ b/metadata-ingestion/tests/integration/snowflake/snowflake_golden.json @@ -490,7 +490,9 @@ "aspectName": "datasetProperties", "aspect": { "json": { - "customProperties": {"CLUSTERING_KEY": "LINEAR(COL_1)"}, + "customProperties": { + "CLUSTERING_KEY": "LINEAR(COL_1)" + }, "externalUrl": "https://app.snowflake.com/ap-south-1.aws/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/table/TABLE_1/", "name": "TABLE_1", "qualifiedName": "TEST_DB.TEST_SCHEMA.TABLE_1", @@ -789,7 +791,9 @@ "aspectName": "datasetProperties", "aspect": { "json": { - "customProperties": {"CLUSTERING_KEY": "LINEAR(COL_1)"}, + "customProperties": { + "CLUSTERING_KEY": "LINEAR(COL_1)" + }, "externalUrl": "https://app.snowflake.com/ap-south-1.aws/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/table/TABLE_2/", "name": "TABLE_2", "qualifiedName": "TEST_DB.TEST_SCHEMA.TABLE_2", @@ -1088,7 +1092,9 @@ "aspectName": "datasetProperties", "aspect": { "json": { - "customProperties": {"CLUSTERING_KEY": "LINEAR(COL_1)"}, + "customProperties": { + "CLUSTERING_KEY": "LINEAR(COL_1)" + }, "externalUrl": "https://app.snowflake.com/ap-south-1.aws/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/table/TABLE_3/", "name": "TABLE_3", "qualifiedName": "TEST_DB.TEST_SCHEMA.TABLE_3", @@ -1387,7 +1393,9 @@ "aspectName": "datasetProperties", "aspect": { "json": { - "customProperties": {"CLUSTERING_KEY": "LINEAR(COL_1)"}, + "customProperties": { + "CLUSTERING_KEY": "LINEAR(COL_1)" + }, "externalUrl": 
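The new Snowflake test fixtures encode the secure-view behavior: `SHOW VIEWS` now reports `is_secure` and returns no `text` for secure views, so their DDL has to be backfilled from the separate `get_secure_view_definitions()` query. A rough sketch of that merge, following the row shapes used in the fixtures (the function name is illustrative):

    from typing import Dict, Iterable, Optional

    def merge_view_definitions(
        show_views_rows: Iterable[dict],
        secure_definition_rows: Iterable[dict],
        db: str,
        schema: str,
    ) -> Dict[str, Optional[str]]:
        # DDL for secure views, keyed by (catalog, schema, view name).
        secure_ddl = {
            (r["TABLE_CATALOG"], r["TABLE_SCHEMA"], r["TABLE_NAME"]): r["VIEW_DEFINITION"]
            for r in secure_definition_rows
        }
        definitions: Dict[str, Optional[str]] = {}
        for row in show_views_rows:
            # Fall back to the secure-view definition when SHOW VIEWS returned no text.
            definitions[row["name"]] = row["text"] or secure_ddl.get((db, schema, row["name"]))
        return definitions
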
"https://app.snowflake.com/ap-south-1.aws/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/table/TABLE_4/", "name": "TABLE_4", "qualifiedName": "TEST_DB.TEST_SCHEMA.TABLE_4", @@ -1686,7 +1694,9 @@ "aspectName": "datasetProperties", "aspect": { "json": { - "customProperties": {"CLUSTERING_KEY": "LINEAR(COL_1)"}, + "customProperties": { + "CLUSTERING_KEY": "LINEAR(COL_1)" + }, "externalUrl": "https://app.snowflake.com/ap-south-1.aws/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/table/TABLE_5/", "name": "TABLE_5", "qualifiedName": "TEST_DB.TEST_SCHEMA.TABLE_5", @@ -1985,7 +1995,9 @@ "aspectName": "datasetProperties", "aspect": { "json": { - "customProperties": {"CLUSTERING_KEY": "LINEAR(COL_1)"}, + "customProperties": { + "CLUSTERING_KEY": "LINEAR(COL_1)" + }, "externalUrl": "https://app.snowflake.com/ap-south-1.aws/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/table/TABLE_6/", "name": "TABLE_6", "qualifiedName": "TEST_DB.TEST_SCHEMA.TABLE_6", @@ -2284,7 +2296,9 @@ "aspectName": "datasetProperties", "aspect": { "json": { - "customProperties": {"CLUSTERING_KEY": "LINEAR(COL_1)"}, + "customProperties": { + "CLUSTERING_KEY": "LINEAR(COL_1)" + }, "externalUrl": "https://app.snowflake.com/ap-south-1.aws/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/table/TABLE_7/", "name": "TABLE_7", "qualifiedName": "TEST_DB.TEST_SCHEMA.TABLE_7", @@ -2583,7 +2597,9 @@ "aspectName": "datasetProperties", "aspect": { "json": { - "customProperties": {"CLUSTERING_KEY": "LINEAR(COL_1)"}, + "customProperties": { + "CLUSTERING_KEY": "LINEAR(COL_1)" + }, "externalUrl": "https://app.snowflake.com/ap-south-1.aws/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/table/TABLE_8/", "name": "TABLE_8", "qualifiedName": "TEST_DB.TEST_SCHEMA.TABLE_8", @@ -2882,7 +2898,9 @@ "aspectName": "datasetProperties", "aspect": { "json": { - "customProperties": {"CLUSTERING_KEY": "LINEAR(COL_1)"}, + "customProperties": { + "CLUSTERING_KEY": "LINEAR(COL_1)" + }, "externalUrl": "https://app.snowflake.com/ap-south-1.aws/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/table/TABLE_9/", "name": "TABLE_9", "qualifiedName": "TEST_DB.TEST_SCHEMA.TABLE_9", @@ -3181,7 +3199,9 @@ "aspectName": "datasetProperties", "aspect": { "json": { - "customProperties": {"CLUSTERING_KEY": "LINEAR(COL_1)"}, + "customProperties": { + "CLUSTERING_KEY": "LINEAR(COL_1)" + }, "externalUrl": "https://app.snowflake.com/ap-south-1.aws/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/table/TABLE_10/", "name": "TABLE_10", "qualifiedName": "TEST_DB.TEST_SCHEMA.TABLE_10", @@ -3471,23 +3491,25 @@ "aspectName": "datasetProperties", "aspect": { "json": { - "customProperties": {}, + "customProperties": { + "IS_SECURE": "true" + }, "externalUrl": "https://app.snowflake.com/ap-south-1.aws/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/view/VIEW_1/", "name": "VIEW_1", "qualifiedName": "TEST_DB.TEST_SCHEMA.VIEW_1", "description": "Comment for View", "created": { - "time": 1623103200000 + "time": 1623090600000 }, "lastModified": { - "time": 1623103200000 + "time": 1623090600000 }, "tags": [] } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "snowflake-2023_12_18-10_16_09", + "runId": "snowflake-2024_12_16-15_30_20-649nax", "lastRunId": "no-run-id-provided" } }, diff --git a/metadata-ingestion/tests/integration/snowflake/snowflake_privatelink_golden.json b/metadata-ingestion/tests/integration/snowflake/snowflake_privatelink_golden.json index 3040c6c4e9196f..f22cbd122361dc 100644 --- 
a/metadata-ingestion/tests/integration/snowflake/snowflake_privatelink_golden.json +++ b/metadata-ingestion/tests/integration/snowflake/snowflake_privatelink_golden.json @@ -621,12 +621,17 @@ "op": "add", "path": "/qualifiedName", "value": "TEST_DB.TEST_SCHEMA.VIEW_1" + }, + { + "op": "add", + "path": "/customProperties/IS_SECURE", + "value": "true" } ] }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00-ad3hnf", + "runId": "snowflake-2022_06_07-17_00_00-ivthci", "lastRunId": "no-run-id-provided" } }, diff --git a/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py b/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py index 5b557efdab0bb0..4b2ac96931b950 100644 --- a/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py +++ b/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py @@ -7,6 +7,7 @@ import pytest from freezegun import freeze_time +from pydantic import ValidationError from requests.adapters import ConnectionError from tableauserverclient import PermissionsRule, Server from tableauserverclient.models import ( @@ -21,7 +22,9 @@ from datahub.emitter.mce_builder import DEFAULT_ENV, make_schema_field_urn from datahub.emitter.mcp import MetadataChangeProposalWrapper -from datahub.ingestion.run.pipeline import Pipeline, PipelineContext, PipelineInitError +from datahub.ingestion.api.source import TestConnectionReport +from datahub.ingestion.run.pipeline import Pipeline, PipelineContext +from datahub.ingestion.source.tableau import tableau_constant as c from datahub.ingestion.source.tableau.tableau import ( TableauConfig, TableauSiteSource, @@ -61,6 +64,7 @@ "projects": ["default", "Project 2", "Samples"], "extract_project_hierarchy": False, "page_size": 1000, + "workbook_page_size": 1000, "ingest_tags": True, "ingest_owner": True, "ingest_tables_external": True, @@ -571,52 +575,28 @@ def test_extract_all_project(pytestconfig, tmp_path, mock_datahub_graph): def test_value_error_projects_and_project_pattern( pytestconfig, tmp_path, mock_datahub_graph ): - # Ingestion should raise ValueError - output_file_name: str = "tableau_project_pattern_precedence_mces.json" - golden_file_name: str = "tableau_project_pattern_precedence_mces_golden.json" - new_config = config_source_default.copy() new_config["projects"] = ["default"] new_config["project_pattern"] = {"allow": ["^Samples$"]} with pytest.raises( - PipelineInitError, + ValidationError, match=r".*projects is deprecated. Please use project_path_pattern only.*", ): - tableau_ingest_common( - pytestconfig, - tmp_path, - mock_data(), - golden_file_name, - output_file_name, - mock_datahub_graph, - pipeline_config=new_config, - ) + TableauConfig.parse_obj(new_config) def test_project_pattern_deprecation(pytestconfig, tmp_path, mock_datahub_graph): - # Ingestion should raise ValueError - output_file_name: str = "tableau_project_pattern_deprecation_mces.json" - golden_file_name: str = "tableau_project_pattern_deprecation_mces_golden.json" - new_config = config_source_default.copy() del new_config["projects"] new_config["project_pattern"] = {"allow": ["^Samples$"]} new_config["project_path_pattern"] = {"allow": ["^Samples$"]} with pytest.raises( - PipelineInitError, + ValidationError, match=r".*project_pattern is deprecated. 
Please use project_path_pattern only*", ): - tableau_ingest_common( - pytestconfig, - tmp_path, - mock_data(), - golden_file_name, - output_file_name, - mock_datahub_graph, - pipeline_config=new_config, - ) + TableauConfig.parse_obj(new_config) def test_project_path_pattern_allow(pytestconfig, tmp_path, mock_datahub_graph): @@ -674,6 +654,7 @@ def test_tableau_ingest_with_platform_instance( "platform_instance": "acryl_site1", "projects": ["default", "Project 2"], "page_size": 1000, + "workbook_page_size": 1000, "ingest_tags": True, "ingest_owner": True, "ingest_tables_external": True, @@ -1296,31 +1277,21 @@ def test_hidden_asset_tags(pytestconfig, tmp_path, mock_datahub_graph): @pytest.mark.integration def test_hidden_assets_without_ingest_tags(pytestconfig, tmp_path, mock_datahub_graph): enable_logging() - output_file_name: str = "tableau_hidden_asset_tags_error_mces.json" - golden_file_name: str = "tableau_hidden_asset_tags_error_mces_golden.json" new_config = config_source_default.copy() new_config["tags_for_hidden_assets"] = ["hidden", "private"] new_config["ingest_tags"] = False with pytest.raises( - PipelineInitError, + ValidationError, match=r".*tags_for_hidden_assets is only allowed with ingest_tags enabled.*", ): - tableau_ingest_common( - pytestconfig, - tmp_path, - mock_data(), - golden_file_name, - output_file_name, - mock_datahub_graph, - pipeline_config=new_config, - ) + TableauConfig.parse_obj(new_config) @freeze_time(FROZEN_TIME) @pytest.mark.integration -def test_permission_mode_switched_error(pytestconfig, tmp_path, mock_datahub_graph): +def test_permission_warning(pytestconfig, tmp_path, mock_datahub_graph): with mock.patch( "datahub.ingestion.source.state_provider.datahub_ingestion_checkpointing_provider.DataHubGraph", mock_datahub_graph, @@ -1357,11 +1328,99 @@ def test_permission_mode_switched_error(pytestconfig, tmp_path, mock_datahub_gra warnings = list(reporter.warnings) - assert len(warnings) == 1 + assert len(warnings) == 2 + + assert warnings[0].title == "Insufficient Permissions" - assert warnings[0].title == "Derived Permission Error" + assert warnings[1].title == "Derived Permission Error" - assert warnings[0].message == ( + assert warnings[1].message == ( "Turn on your derived permissions. 
See for details " "https://community.tableau.com/s/question/0D54T00000QnjHbSAJ/how-to-fix-the-permissionsmodeswitched-error" ) + + +@freeze_time(FROZEN_TIME) +@pytest.mark.integration +def test_connection_report_test(requests_mock): + server_info_response = """ + + + foo + 2.4 + + + + """ + + requests_mock.register_uri( + "GET", + "https://do-not-connect/api/2.4/serverInfo", + text=server_info_response, + status_code=200, + headers={"Content-Type": "application/xml"}, + ) + + signin_response = """ + + + + + + + """ + + requests_mock.register_uri( + "POST", + "https://do-not-connect/api/2.4/auth/signin", + text=signin_response, + status_code=200, + headers={"Content-Type": "application/xml"}, + ) + + user_by_id_response = """ + + + + """ + + requests_mock.register_uri( + "GET", + "https://do-not-connect/api/2.4/sites/fake_site_luid/users/fake_user_id", + text=user_by_id_response, + status_code=200, + headers={"Content-Type": "application/xml"}, + ) + + report: TestConnectionReport = TableauSource.test_connection(config_source_default) + + assert report + assert report.capability_report + assert report.capability_report.get(c.SITE_PERMISSION) + assert report.capability_report[c.SITE_PERMISSION].capable + + # Role other than SiteAdministratorExplorer + user_by_id_response = """ + + + + """ + + requests_mock.register_uri( + "GET", + "https://do-not-connect/api/2.4/sites/fake_site_luid/users/fake_user_id", + text=user_by_id_response, + status_code=200, + headers={"Content-Type": "application/xml"}, + ) + + report = TableauSource.test_connection(config_source_default) + + assert report + assert report.capability_report + assert report.capability_report.get(c.SITE_PERMISSION) + assert report.capability_report[c.SITE_PERMISSION].capable is False + assert ( + report.capability_report[c.SITE_PERMISSION].failure_reason + == "The user does not have the `Site Administrator Explorer` role. Their current role is Explorer." 
+ ) diff --git a/metadata-ingestion/tests/unit/api/entities/structuredproperties/__init__.py b/metadata-ingestion/tests/unit/api/entities/structuredproperties/__init__.py new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/metadata-ingestion/tests/unit/api/entities/structuredproperties/example_structured_properties_golden.json b/metadata-ingestion/tests/unit/api/entities/structuredproperties/example_structured_properties_golden.json new file mode 100644 index 00000000000000..29386ece7b0ca1 --- /dev/null +++ b/metadata-ingestion/tests/unit/api/entities/structuredproperties/example_structured_properties_golden.json @@ -0,0 +1,194 @@ +[ +{ + "entityType": "structuredProperty", + "entityUrn": "urn:li:structuredProperty:io.acryl.privacy.retentionTime", + "changeType": "UPSERT", + "aspectName": "propertyDefinition", + "aspect": { + "json": { + "qualifiedName": "io.acryl.privacy.retentionTime", + "displayName": "Retention Time", + "valueType": "urn:li:dataType:datahub.number", + "allowedValues": [ + { + "value": { + "string": "30" + }, + "description": "30 days, usually reserved for datasets that are ephemeral and contain pii" + }, + { + "value": { + "string": "90" + }, + "description": "Use this for datasets that drive monthly reporting but contain pii" + }, + { + "value": { + "string": "365" + }, + "description": "Use this for non-sensitive data that can be retained for longer" + } + ], + "cardinality": "MULTIPLE", + "entityTypes": [ + "urn:li:entityType:datahub.dataset", + "urn:li:entityType:datahub.dataFlow" + ], + "description": "Retention Time is used to figure out how long to retain records in a dataset", + "immutable": false + } + } +}, +{ + "entityType": "structuredProperty", + "entityUrn": "urn:li:structuredProperty:io.acryl.dataManagement.replicationSLA", + "changeType": "UPSERT", + "aspectName": "propertyDefinition", + "aspect": { + "json": { + "qualifiedName": "io.acryl.dataManagement.replicationSLA", + "displayName": "Replication SLA", + "valueType": "urn:li:dataType:datahub.number", + "cardinality": "SINGLE", + "entityTypes": [ + "urn:li:entityType:datahub.dataset" + ], + "description": "SLA for how long data can be delayed before replicating to the destination cluster", + "immutable": false + } + } +}, +{ + "entityType": "structuredProperty", + "entityUrn": "urn:li:structuredProperty:io.acryl.dataManagement.deprecationDate", + "changeType": "UPSERT", + "aspectName": "propertyDefinition", + "aspect": { + "json": { + "qualifiedName": "io.acryl.dataManagement.deprecationDate", + "displayName": "Deprecation Date", + "valueType": "urn:li:dataType:datahub.date", + "cardinality": "SINGLE", + "entityTypes": [ + "urn:li:entityType:datahub.dataset", + "urn:li:entityType:datahub.dataFlow", + "urn:li:entityType:datahub.dataJob" + ], + "immutable": false + } + } +}, +{ + "entityType": "structuredProperty", + "entityUrn": "urn:li:structuredProperty:io.acryl.dataManagement.steward", + "changeType": "UPSERT", + "aspectName": "propertyDefinition", + "aspect": { + "json": { + "qualifiedName": "io.acryl.dataManagement.steward", + "displayName": "Steward", + "valueType": "urn:li:dataType:datahub.urn", + "typeQualifier": { + "allowedTypes": [ + "urn:li:entityType:datahub.corpuser", + "urn:li:entityType:datahub.corpGroup" + ] + }, + "cardinality": "SINGLE", + "entityTypes": [ + "urn:li:entityType:datahub.dataset", + "urn:li:entityType:datahub.dataFlow", + "urn:li:entityType:datahub.dataJob" + ], + "immutable": false + } + } +}, +{ + "entityType": "structuredProperty", + "entityUrn": 
"urn:li:structuredProperty:io.acryl.dataManagement.certifier", + "changeType": "UPSERT", + "aspectName": "propertyDefinition", + "aspect": { + "json": { + "qualifiedName": "io.acryl.dataManagement.certifier", + "displayName": "Person Certifying the asset", + "valueType": "urn:li:dataType:datahub.urn", + "cardinality": "SINGLE", + "entityTypes": [ + "urn:li:entityType:datahub.dataset", + "urn:li:entityType:datahub.schemaField" + ], + "immutable": false + } + } +}, +{ + "entityType": "structuredProperty", + "entityUrn": "urn:li:structuredProperty:io.acryl.dataManagement.team", + "changeType": "UPSERT", + "aspectName": "propertyDefinition", + "aspect": { + "json": { + "qualifiedName": "io.acryl.dataManagement.team", + "displayName": "Management team", + "valueType": "urn:li:dataType:datahub.string", + "cardinality": "SINGLE", + "entityTypes": [ + "urn:li:entityType:datahub.dataset" + ], + "immutable": false + } + } +}, +{ + "entityType": "structuredProperty", + "entityUrn": "urn:li:structuredProperty:projectNames", + "changeType": "UPSERT", + "aspectName": "propertyDefinition", + "aspect": { + "json": { + "qualifiedName": "projectNames", + "displayName": "Project names", + "valueType": "urn:li:dataType:datahub.string", + "allowedValues": [ + { + "value": { + "string": "Tracking" + }, + "description": "test value 1 for project" + }, + { + "value": { + "string": "DataHub" + }, + "description": "test value 2 for project" + } + ], + "cardinality": "MULTIPLE", + "entityTypes": [ + "urn:li:entityType:datahub.dataset" + ], + "immutable": false + } + } +}, +{ + "entityType": "structuredProperty", + "entityUrn": "urn:li:structuredProperty:namespace", + "changeType": "UPSERT", + "aspectName": "propertyDefinition", + "aspect": { + "json": { + "qualifiedName": "namespace", + "displayName": "Namespace", + "valueType": "urn:li:dataType:datahub.string", + "cardinality": "SINGLE", + "entityTypes": [ + "urn:li:entityType:datahub.dataset" + ], + "immutable": false + } + } +} +] \ No newline at end of file diff --git a/metadata-ingestion/tests/unit/api/entities/structuredproperties/test_structuredproperties.py b/metadata-ingestion/tests/unit/api/entities/structuredproperties/test_structuredproperties.py new file mode 100644 index 00000000000000..e96b7c1f98437e --- /dev/null +++ b/metadata-ingestion/tests/unit/api/entities/structuredproperties/test_structuredproperties.py @@ -0,0 +1,38 @@ +import pathlib + +import pydantic +import pytest + +from datahub.api.entities.structuredproperties.structuredproperties import ( + StructuredProperties, + TypeQualifierAllowedTypes, +) +from tests.test_helpers.mce_helpers import check_goldens_stream + +RESOURCE_DIR = pathlib.Path(__file__).parent + + +def test_type_validation() -> None: + with pytest.raises(pydantic.ValidationError): + TypeQualifierAllowedTypes(allowed_types=["thisdoesnotexist"]) + + types = TypeQualifierAllowedTypes(allowed_types=["dataset"]) + assert types.allowed_types == ["urn:li:entityType:datahub.dataset"] + + +def test_structuredproperties_load(pytestconfig: pytest.Config) -> None: + example_properties_file = ( + pytestconfig.rootpath + / "examples/structured_properties/structured_properties.yaml" + ) + + properties = StructuredProperties.from_yaml(str(example_properties_file)) + mcps = [] + for property in properties: + mcps.extend(property.generate_mcps()) + + check_goldens_stream( + pytestconfig, + mcps, + golden_path=RESOURCE_DIR / "example_structured_properties_golden.json", + ) diff --git a/metadata-ingestion/tests/unit/serde/test_codegen.py 
b/metadata-ingestion/tests/unit/serde/test_codegen.py index 37ac35586950e1..98d62d5643ff2d 100644 --- a/metadata-ingestion/tests/unit/serde/test_codegen.py +++ b/metadata-ingestion/tests/unit/serde/test_codegen.py @@ -18,6 +18,7 @@ UpstreamClass, _Aspect, ) +from datahub.utilities.urns._urn_base import URN_TYPES _UPDATE_ENTITY_REGISTRY = os.getenv("UPDATE_ENTITY_REGISTRY", "false").lower() == "true" ENTITY_REGISTRY_PATH = pathlib.Path( @@ -165,3 +166,9 @@ def test_enum_options(): # This is mainly a sanity check to ensure that it doesn't do anything too crazy. env_options = get_enum_options(FabricTypeClass) assert "PROD" in env_options + + +def test_urn_types() -> None: + assert len(URN_TYPES) > 10 + for checked_type in ["dataset", "dashboard", "dataFlow", "schemaField"]: + assert checked_type in URN_TYPES diff --git a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_table_rename.json b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_table_rename.json index 2d32e1328fbb4f..fd8475090f009e 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_table_rename.json +++ b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_table_rename.json @@ -185,7 +185,7 @@ "aspect": { "json": { "statement": { - "value": "ALTER TABLE dev.public.foo_staging RENAME TO foo", + "value": "ALTER TABLE dev.public.foo_staging RENAME TO foo /* Datahub generated query text-- */", "language": "SQL" }, "source": "SYSTEM", diff --git a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_table_swap.json b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_table_swap.json index af0fca485777ff..d9d46a4b14a146 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_table_swap.json +++ b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_table_swap.json @@ -185,7 +185,7 @@ "aspect": { "json": { "statement": { - "value": "ALTER TABLE dev.public.person_info_swap SWAP WITH dev.public.person_info", + "value": "ALTER TABLE dev.public.person_info_swap SWAP WITH dev.public.person_info /* Datahub generated query text-- */", "language": "SQL" }, "source": "SYSTEM", @@ -438,7 +438,7 @@ "aspect": { "json": { "statement": { - "value": "ALTER TABLE dev.public.person_info SWAP WITH dev.public.person_info_swap", + "value": "ALTER TABLE dev.public.person_info SWAP WITH dev.public.person_info_swap /* Datahub generated query text-- */", "language": "SQL" }, "source": "SYSTEM", diff --git a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_table_swap_with_temp.json b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_table_swap_with_temp.json index ceaaf8f6887c7c..b4eaf76a149337 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_table_swap_with_temp.json +++ b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_table_swap_with_temp.json @@ -175,7 +175,7 @@ "aspect": { "json": { "statement": { - "value": "CREATE TABLE person_info_swap CLONE person_info;\n\nCREATE TABLE person_info_incremental AS\nSELECT\n *\nFROM person_info_dep;\n\nINSERT INTO person_info_swap\nSELECT\n *\nFROM person_info_incremental;\n\nALTER TABLE dev.public.person_info_swap SWAP WITH dev.public.person_info", + "value": "CREATE TABLE person_info_swap CLONE person_info;\n\nCREATE TABLE person_info_incremental AS\nSELECT\n *\nFROM person_info_dep;\n\nINSERT INTO person_info_swap\nSELECT\n *\nFROM person_info_incremental;\n\nALTER TABLE dev.public.person_info_swap SWAP 
WITH dev.public.person_info /* Datahub generated query text-- */", "language": "SQL" }, "source": "SYSTEM", diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_information_schema_query.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_information_schema_query.json index f5f573f3d51136..9621b7d1c265b4 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_information_schema_query.json +++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_information_schema_query.json @@ -1,7 +1,7 @@ { "query_type": "SELECT", "query_type_props": {}, - "query_fingerprint": "c721ce16410601b36e5f32bd9c5c28488500a93e617363739faebfe71496f163", + "query_fingerprint": "a204522c98a01568d8575a98a715de98985aeef0e822feb8450153f71891d6c6", "in_tables": [ "urn:li:dataset:(urn:li:dataPlatform:bigquery,acryl-staging-2.smoke_test_db_4.INFORMATION_SCHEMA.COLUMNS,PROD)", "urn:li:dataset:(urn:li:dataPlatform:bigquery,acryl-staging-2.smoke_test_db_4.INFORMATION_SCHEMA.COLUMN_FIELD_PATHS,PROD)" @@ -178,6 +178,6 @@ ], "debug_info": { "confidence": 0.2, - "generalized_statement": "SELECT c.table_catalog AS table_catalog, c.table_schema AS table_schema, c.table_name AS table_name, c.column_name AS column_name, c.ordinal_position AS ordinal_position, cfp.field_path AS field_path, c.is_nullable AS is_nullable, CASE WHEN CONTAINS_SUBSTR(cfp.field_path, ?) THEN NULL ELSE c.data_type END AS data_type, description AS comment, c.is_hidden AS is_hidden, c.is_partitioning_column AS is_partitioning_column, c.clustering_ordinal_position AS clustering_ordinal_position FROM `acryl-staging-2`.`smoke_test_db_4`.INFORMATION_SCHEMA.COLUMNS AS c JOIN `acryl-staging-2`.`smoke_test_db_4`.INFORMATION_SCHEMA.COLUMN_FIELD_PATHS AS cfp ON cfp.table_name = c.table_name AND cfp.column_name = c.column_name ORDER BY table_catalog, table_schema, table_name, ordinal_position ASC, data_type DESC" + "generalized_statement": "SELECT c.table_catalog AS table_catalog, c.table_schema AS table_schema, c.table_name AS table_name, c.column_name AS column_name, c.ordinal_position AS ordinal_position, cfp.field_path AS field_path, c.is_nullable AS is_nullable, CASE WHEN CONTAINS_SUBSTR(cfp.field_path, ?) 
THEN NULL ELSE c.data_type END AS data_type, description AS comment, c.is_hidden AS is_hidden, c.is_partitioning_column AS is_partitioning_column, c.clustering_ordinal_position AS clustering_ordinal_position FROM `acryl-staging-2`.`smoke_test_db_4`.`INFORMATION_SCHEMA.COLUMNS` AS c JOIN `acryl-staging-2`.`smoke_test_db_4`.`INFORMATION_SCHEMA.COLUMN_FIELD_PATHS` AS cfp ON cfp.table_name = c.table_name AND cfp.column_name = c.column_name ORDER BY table_catalog, table_schema, table_name, ordinal_position ASC, data_type DESC" } } \ No newline at end of file diff --git a/metadata-ingestion/tests/unit/sql_parsing/test_tool_meta_extractor.py b/metadata-ingestion/tests/unit/sql_parsing/test_tool_meta_extractor.py index 6f590b53071467..f6566f007f5e6b 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/test_tool_meta_extractor.py +++ b/metadata-ingestion/tests/unit/sql_parsing/test_tool_meta_extractor.py @@ -1,11 +1,14 @@ from datahub.configuration.datetimes import parse_absolute_time from datahub.metadata.urns import CorpUserUrn from datahub.sql_parsing.sql_parsing_aggregator import PreparsedQuery -from datahub.sql_parsing.tool_meta_extractor import ToolMetaExtractor +from datahub.sql_parsing.tool_meta_extractor import ( + ToolMetaExtractor, + ToolMetaExtractorReport, +) def test_extract_mode_metadata() -> None: - extractor = ToolMetaExtractor() + extractor = ToolMetaExtractor(report=ToolMetaExtractorReport()) query = """\ select * from LONG_TAIL_COMPANIONS.ADOPTION.PET_PROFILES LIMIT 100 @@ -30,8 +33,42 @@ def test_extract_mode_metadata() -> None: assert extractor.report.num_queries_meta_extracted["mode"] == 1 +def test_extract_looker_metadata() -> None: + extractor = ToolMetaExtractor( + report=ToolMetaExtractorReport(), looker_user_mapping={"7": "john.doe@xyz.com"} + ) + looker_query = """\ +SELECT + all_entities_extended_sibling."ENTITY" AS "all_entities_extended_sibling.entity_type", + COUNT(DISTINCT ( all_entities_extended_sibling."URN" )) AS "all_entities_extended_sibling.distinct_count" +FROM "PUBLIC"."ALL_ENTITIES" + AS all_entities_extended_sibling +GROUP BY + 1 +ORDER BY + 1 +FETCH NEXT 50 ROWS ONLY +-- Looker Query Context '{"user_id":7,"history_slug":"264797031bc403cf382cbefbe3700849","instance_slug":"32654f2ffadf10b1949d4009e52fc6a4"}' +""" + + entry = PreparsedQuery( + query_id=None, + query_text=looker_query, + upstreams=[], + downstream=None, + column_lineage=None, + column_usage=None, + inferred_schema=None, + user=CorpUserUrn("mode"), + timestamp=parse_absolute_time("2021-08-01T01:02:03Z"), + ) + assert extractor.extract_bi_metadata(entry) + assert entry.user == CorpUserUrn("john.doe") + assert extractor.report.num_queries_meta_extracted["looker"] == 1 + + def test_extract_no_metadata() -> None: - extractor = ToolMetaExtractor() + extractor = ToolMetaExtractor(report=ToolMetaExtractorReport()) query = """\ select * from LONG_TAIL_COMPANIONS.ADOPTION.PET_PROFILES LIMIT 100 @@ -53,3 +90,4 @@ def test_extract_no_metadata() -> None: assert not extractor.extract_bi_metadata(entry) assert extractor.report.num_queries_meta_extracted["mode"] == 0 + assert extractor.report.num_queries_meta_extracted["looker"] == 0 diff --git a/metadata-ingestion/tests/unit/stateful_ingestion/state/test_redundant_run_skip_handler.py b/metadata-ingestion/tests/unit/stateful_ingestion/state/test_redundant_run_skip_handler.py index 85c86f8d205d9a..5631ad2c69f949 100644 --- a/metadata-ingestion/tests/unit/stateful_ingestion/state/test_redundant_run_skip_handler.py +++ 
b/metadata-ingestion/tests/unit/stateful_ingestion/state/test_redundant_run_skip_handler.py @@ -37,7 +37,11 @@ def stateful_source(mock_datahub_graph: DataHubGraph) -> Iterable[SnowflakeV2Sou ), ) - with mock.patch("snowflake.connector.connect"): + with mock.patch( + "datahub.sql_parsing.sql_parsing_aggregator.ToolMetaExtractor.create", + ) as mock_checkpoint, mock.patch("snowflake.connector.connect"): + mock_checkpoint.return_value = mock.MagicMock() + yield SnowflakeV2Source(ctx=ctx, config=config) diff --git a/metadata-ingestion/tests/unit/test_mlflow_source.py b/metadata-ingestion/tests/unit/test_mlflow_source.py index d213dd92352e62..e882296b6f331d 100644 --- a/metadata-ingestion/tests/unit/test_mlflow_source.py +++ b/metadata-ingestion/tests/unit/test_mlflow_source.py @@ -136,3 +136,16 @@ def test_make_external_link_remote(source, model_version): url = source._make_external_url(model_version) assert url == expected_url + + +def test_make_external_link_remote_via_config(source, model_version): + custom_base_url = "https://custom-server.org" + source.config.base_external_url = custom_base_url + source.client = MlflowClient( + tracking_uri="https://dummy-mlflow-tracking-server.org" + ) + expected_url = f"{custom_base_url}/#/models/{model_version.name}/versions/{model_version.version}" + + url = source._make_external_url(model_version) + + assert url == expected_url diff --git a/metadata-ingestion/tests/unit/test_tableau_source.py b/metadata-ingestion/tests/unit/test_tableau_source.py index c81aa0bd8a1b1a..44e59decaecbd7 100644 --- a/metadata-ingestion/tests/unit/test_tableau_source.py +++ b/metadata-ingestion/tests/unit/test_tableau_source.py @@ -182,8 +182,14 @@ def test_get_filter_pages_simple(): assert get_filter_pages(filter_dict, 10) == [filter_dict] -def test_get_filter_pages_non_id_large_filter_passthrough(): - projects = [f"project{i}" for i in range(20000)] +def test_get_filter_pages_non_id_large_filter(): + projects = [f"project{i}" for i in range(10)] + filter_dict = {c.PROJECT_NAME_WITH_IN: projects} + assert get_filter_pages(filter_dict, 10) == [filter_dict] + + +def test_get_filter_pages_for_single_key(): + projects = ["project1"] filter_dict = {c.PROJECT_NAME_WITH_IN: projects} assert get_filter_pages(filter_dict, 10) == [filter_dict] diff --git a/metadata-ingestion/tests/unit/utilities/test_file_backed_collections.py b/metadata-ingestion/tests/unit/utilities/test_file_backed_collections.py index f4062f9a911453..6230c2e37edc6a 100644 --- a/metadata-ingestion/tests/unit/utilities/test_file_backed_collections.py +++ b/metadata-ingestion/tests/unit/utilities/test_file_backed_collections.py @@ -15,11 +15,13 @@ ) -def test_file_dict() -> None: +@pytest.mark.parametrize("use_sqlite_on_conflict", [True, False]) +def test_file_dict(use_sqlite_on_conflict: bool) -> None: cache = FileBackedDict[int]( tablename="cache", cache_max_size=10, cache_eviction_batch_size=10, + _use_sqlite_on_conflict=use_sqlite_on_conflict, ) for i in range(100): @@ -92,7 +94,8 @@ def test_file_dict() -> None: cache["a"] = 1 -def test_custom_serde() -> None: +@pytest.mark.parametrize("use_sqlite_on_conflict", [True, False]) +def test_custom_serde(use_sqlite_on_conflict: bool) -> None: @dataclass(frozen=True) class Label: a: str @@ -139,6 +142,7 @@ def deserialize(s: str) -> Main: deserializer=deserialize, # Disable the in-memory cache to force all reads/writes to the DB. 
cache_max_size=0, + _use_sqlite_on_conflict=use_sqlite_on_conflict, ) first = Main(3, {Label("one", 1): 0.1, Label("two", 2): 0.2}) second = Main(-100, {Label("z", 26): 0.26}) @@ -186,7 +190,8 @@ def test_file_dict_stores_counter() -> None: assert in_memory_counters[i].most_common(2) == cache[str(i)].most_common(2) -def test_file_dict_ordering() -> None: +@pytest.mark.parametrize("use_sqlite_on_conflict", [True, False]) +def test_file_dict_ordering(use_sqlite_on_conflict: bool) -> None: """ We require that FileBackedDict maintains insertion order, similar to Python's built-in dict. This test makes one of each and validates that they behave the same. @@ -196,6 +201,7 @@ def test_file_dict_ordering() -> None: serializer=str, deserializer=int, cache_max_size=1, + _use_sqlite_on_conflict=use_sqlite_on_conflict, ) data = {} @@ -229,12 +235,14 @@ class Pair: @pytest.mark.parametrize("cache_max_size", [0, 1, 10]) -def test_custom_column(cache_max_size: int) -> None: +@pytest.mark.parametrize("use_sqlite_on_conflict", [True, False]) +def test_custom_column(cache_max_size: int, use_sqlite_on_conflict: bool) -> None: cache = FileBackedDict[Pair]( extra_columns={ "x": lambda m: m.x, }, cache_max_size=cache_max_size, + _use_sqlite_on_conflict=use_sqlite_on_conflict, ) cache["first"] = Pair(3, "a") @@ -275,7 +283,8 @@ def test_custom_column(cache_max_size: int) -> None: ] -def test_shared_connection() -> None: +@pytest.mark.parametrize("use_sqlite_on_conflict", [True, False]) +def test_shared_connection(use_sqlite_on_conflict: bool) -> None: with ConnectionWrapper() as connection: cache1 = FileBackedDict[int]( shared_connection=connection, @@ -283,6 +292,7 @@ extra_columns={ "v": lambda v: v, }, + _use_sqlite_on_conflict=use_sqlite_on_conflict, ) cache2 = FileBackedDict[Pair]( shared_connection=connection, @@ -291,6 +301,7 @@ "x": lambda m: m.x, "y": lambda m: m.y, }, + _use_sqlite_on_conflict=use_sqlite_on_conflict, ) cache1["a"] = 3 diff --git a/metadata-integration/java/datahub-client/build.gradle b/metadata-integration/java/datahub-client/build.gradle index cec3164f10d6cc..42861cf235b56f 100644 --- a/metadata-integration/java/datahub-client/build.gradle +++ b/metadata-integration/java/datahub-client/build.gradle @@ -95,6 +95,11 @@ test { finalizedBy jacocoTestReport } +// no submodule depends on datahub-schematron:cli, +// and the tests there are the ones that check python-java compatibility +test.dependsOn tasks.getByPath(":metadata-integration:java:datahub-schematron:cli:test") +test.dependsOn tasks.getByPath(":metadata-integration:java:datahub-schematron:lib:test") + task checkShadowJar(type: Exec) { commandLine 'sh', '-c', 'scripts/check_jar.sh' } diff --git a/metadata-integration/java/datahub-schematron/lib/src/test/java/io/datahubproject/schematron/converters/avro/AvroSchemaConverterTest.java b/metadata-integration/java/datahub-schematron/lib/src/test/java/io/datahubproject/schematron/converters/avro/AvroSchemaConverterTest.java new file mode 100644 index 00000000000000..d6522c2d84670f --- /dev/null +++ b/metadata-integration/java/datahub-schematron/lib/src/test/java/io/datahubproject/schematron/converters/avro/AvroSchemaConverterTest.java @@ -0,0 +1,942 @@ +package io.datahubproject.schematron.converters.avro; + +import static org.testng.Assert.*; + +import com.linkedin.common.urn.DataPlatformUrn; +import com.linkedin.data.template.StringArray; +import com.linkedin.schema.*; +import java.io.File; +import 
java.io.IOException; +import java.net.URISyntaxException; +import java.util.Collections; +import org.apache.avro.Schema; +import org.testng.annotations.*; + +@Test(groups = "unit") +class AvroSchemaConverterTest { + + private AvroSchemaConverter avroSchemaConverter = AvroSchemaConverter.builder().build(); + private DataPlatformUrn dataPlatformUrn = + DataPlatformUrn.createFromString("urn:li:dataPlatform:foo"); + + AvroSchemaConverterTest() throws URISyntaxException {} + + @Test(groups = "basic") + void testPrimitiveTypes() throws IOException { + SchemaMetadata schema = + avroSchemaConverter.toDataHubSchema( + readAvroSchema("primitive_types.avsc"), false, false, dataPlatformUrn, null); + + schema.getFields().forEach(System.out::println); + + assertEquals(schema.getFields().size(), 14); + + assertSchemaField( + schema.getFields().get(0), + "[version=2.0].[type=PrimitiveType].[type=int].intField", + "int", + false, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new NumberType()))); + assertSchemaField( + schema.getFields().get(1), + "[version=2.0].[type=PrimitiveType].[type=union].intFieldV2", + "union", + false, + false, + new SchemaFieldDataType() + .setType( + SchemaFieldDataType.Type.create( + new UnionType() + .setNestedTypes(new StringArray(Collections.singletonList("union")))))); + assertSchemaField( + schema.getFields().get(2), + "[version=2.0].[type=PrimitiveType].[type=union].[type=int].intFieldV2", + "int", + false, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new NumberType()))); + assertSchemaField( + schema.getFields().get(3), + "[version=2.0].[type=PrimitiveType].[type=null].nullField", + "null", + false, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new NullType()))); + assertSchemaField( + schema.getFields().get(4), + "[version=2.0].[type=PrimitiveType].[type=union].nullFieldV2", + "union", + true, + false, + new SchemaFieldDataType() + .setType( + SchemaFieldDataType.Type.create( + new UnionType() + .setNestedTypes(new StringArray(Collections.singletonList("union")))))); + assertSchemaField( + schema.getFields().get(5), + "[version=2.0].[type=PrimitiveType].[type=long].longField", + "long", + false, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new NumberType()))); + assertSchemaField( + schema.getFields().get(6), + "[version=2.0].[type=PrimitiveType].[type=float].floatField", + "float", + false, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new NumberType()))); + assertSchemaField( + schema.getFields().get(7), + "[version=2.0].[type=PrimitiveType].[type=double].doubleField", + "double", + false, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new NumberType()))); + assertSchemaField( + schema.getFields().get(8), + "[version=2.0].[type=PrimitiveType].[type=string].stringField", + "string", + false, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new StringType()))); + assertSchemaField( + schema.getFields().get(9), + "[version=2.0].[type=PrimitiveType].[type=boolean].booleanField", + "boolean", + false, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new BooleanType()))); + assertSchemaField( + schema.getFields().get(10), + "[version=2.0].[type=PrimitiveType].[type=int].nullableIntField", + "int", + true, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new NumberType()))); + assertSchemaField( + 
schema.getFields().get(11), + "[version=2.0].[type=PrimitiveType].[type=long].nullableLongField", + "long", + true, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new NumberType()))); + assertSchemaField( + schema.getFields().get(12), + "[version=2.0].[type=PrimitiveType].[type=string].nullableStringField", + "string", + true, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new StringType()))); + assertSchemaField( + schema.getFields().get(13), + "[version=2.0].[type=PrimitiveType].[type=enum].status", + "Enum", + false, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new EnumType()))); + } + + @Test(groups = "basic") + void testComplexMaps() throws IOException { + SchemaMetadata schema = + avroSchemaConverter.toDataHubSchema( + readAvroSchema("complex_maps.avsc"), false, false, dataPlatformUrn, null); + + schema.getFields().forEach(System.out::println); + + assertEquals(schema.getFields().size(), 15); + + assertSchemaField( + schema.getFields().get(0), + "[version=2.0].[type=MapType].[type=map].mapOfString", + "map", + false, + false, + new SchemaFieldDataType() + .setType( + SchemaFieldDataType.Type.create( + new MapType().setKeyType("string").setValueType("string")))); + assertSchemaField( + schema.getFields().get(1), + "[version=2.0].[type=MapType].[type=map].[type=ComplexType].mapOfComplexType", + "ComplexType", + false, + false, + new SchemaFieldDataType() + .setType( + SchemaFieldDataType.Type.create( + new MapType().setKeyType("string").setValueType("ComplexType")))); + assertSchemaField( + schema.getFields().get(2), + "[version=2.0].[type=MapType].[type=map].[type=ComplexType].mapOfComplexType.[type=string].field1", + "string", + false, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new StringType()))); + assertSchemaField( + schema.getFields().get(3), + "[version=2.0].[type=MapType].[type=map].[type=ComplexType].mapOfComplexType.[type=int].field2", + "int", + false, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new NumberType()))); + assertSchemaField( + schema.getFields().get(4), + "[version=2.0].[type=MapType].[type=map].[type=union].mapOfNullableString", + "union", + false, + false, + new SchemaFieldDataType() + .setType( + SchemaFieldDataType.Type.create( + new MapType().setKeyType("string").setValueType("union")))); + assertSchemaField( + schema.getFields().get(5), + "[version=2.0].[type=MapType].[type=map].[type=union].[type=string].mapOfNullableString", + "string", + false, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new StringType()))); + assertSchemaField( + schema.getFields().get(6), + "[version=2.0].[type=MapType].[type=map].[type=union].mapOfNullableComplexType", + "union", + false, + false, + new SchemaFieldDataType() + .setType( + SchemaFieldDataType.Type.create( + new MapType().setKeyType("string").setValueType("union")))); + assertSchemaField( + schema.getFields().get(7), + "[version=2.0].[type=MapType].[type=map].[type=union].[type=ComplexTypeNullable].mapOfNullableComplexType", + "ComplexTypeNullable", + false, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new RecordType()))); + assertSchemaField( + schema.getFields().get(8), + "[version=2.0].[type=MapType].[type=map].[type=union].[type=ComplexTypeNullable].mapOfNullableComplexType.[type=string].field1", + "string", + false, + false, + new 
SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new StringType()))); + assertSchemaField( + schema.getFields().get(9), + "[version=2.0].[type=MapType].[type=map].[type=union].[type=ComplexTypeNullable].mapOfNullableComplexType.[type=int].field2", + "int", + false, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new NumberType()))); + assertSchemaField( + schema.getFields().get(10), + "[version=2.0].[type=MapType].[type=map].[type=array].mapOfArray", + "array(string)", + false, + false, + new SchemaFieldDataType() + .setType( + SchemaFieldDataType.Type.create( + new ArrayType().setNestedType(new StringArray("string"))))); + assertSchemaField( + schema.getFields().get(11), + "[version=2.0].[type=MapType].[type=map].[type=map].mapOfMap", + "map", + false, + false, + new SchemaFieldDataType() + .setType( + SchemaFieldDataType.Type.create( + new MapType().setKeyType("string").setValueType("int")))); + assertSchemaField( + schema.getFields().get(12), + "[version=2.0].[type=MapType].[type=map].[type=union].mapOfUnion", + "union", + false, + false, + new SchemaFieldDataType() + .setType( + SchemaFieldDataType.Type.create( + new MapType().setKeyType("string").setValueType("union")))); + assertSchemaField( + schema.getFields().get(13), + "[version=2.0].[type=MapType].[type=map].[type=union].[type=string].mapOfUnion", + "string", + false, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new StringType()))); + assertSchemaField( + schema.getFields().get(14), + "[version=2.0].[type=MapType].[type=map].[type=union].[type=int].mapOfUnion", + "int", + false, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new NumberType()))); + } + + @Test(groups = "basic") + void testComplexArrays() throws IOException { + SchemaMetadata schema = + avroSchemaConverter.toDataHubSchema( + readAvroSchema("complex_arrays.avsc"), false, false, dataPlatformUrn, null); + + schema.getFields().forEach(System.out::println); + + assertEquals(schema.getFields().size(), 16); + + assertSchemaField( + schema.getFields().get(0), + "[version=2.0].[type=ArrayType].[type=array].arrayOfString", + "array(string)", + false, + false, + new SchemaFieldDataType() + .setType( + SchemaFieldDataType.Type.create( + new ArrayType().setNestedType(new StringArray("string"))))); + assertSchemaField( + schema.getFields().get(1), + "[version=2.0].[type=ArrayType].[type=array].[type=map].arrayOfMap", + "map", + false, + false, + new SchemaFieldDataType() + .setType( + SchemaFieldDataType.Type.create( + new MapType().setKeyType("string").setValueType("string")))); + assertSchemaField( + schema.getFields().get(2), + "[version=2.0].[type=ArrayType].[type=array].[type=ComplexType].arrayOfRecord", + "ComplexType", + false, + false, + new SchemaFieldDataType() + .setType( + SchemaFieldDataType.Type.create( + new ArrayType().setNestedType(new StringArray("ComplexType"))))); + assertSchemaField( + schema.getFields().get(3), + "[version=2.0].[type=ArrayType].[type=array].[type=ComplexType].arrayOfRecord.[type=string].field1", + "string", + false, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new StringType()))); + assertSchemaField( + schema.getFields().get(4), + "[version=2.0].[type=ArrayType].[type=array].[type=ComplexType].arrayOfRecord.[type=int].field2", + "int", + false, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new NumberType()))); + assertSchemaField( + schema.getFields().get(5), + 
"[version=2.0].[type=ArrayType].[type=array].[type=array].arrayOfArray", + "array(string)", + false, + false, + new SchemaFieldDataType() + .setType( + SchemaFieldDataType.Type.create( + new ArrayType().setNestedType(new StringArray("string"))))); + assertSchemaField( + schema.getFields().get(6), + "[version=2.0].[type=ArrayType].[type=array].[type=union].arrayOfUnion", + "union", + false, + false, + new SchemaFieldDataType() + .setType( + SchemaFieldDataType.Type.create( + new ArrayType().setNestedType(new StringArray("union"))))); + assertSchemaField( + schema.getFields().get(7), + "[version=2.0].[type=ArrayType].[type=array].[type=union].[type=string].arrayOfUnion", + "string", + false, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new StringType()))); + assertSchemaField( + schema.getFields().get(8), + "[version=2.0].[type=ArrayType].[type=array].[type=union].[type=int].arrayOfUnion", + "int", + false, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new NumberType()))); + assertSchemaField( + schema.getFields().get(9), + "[version=2.0].[type=ArrayType].[type=array].[type=union].[type=boolean].arrayOfUnion", + "boolean", + false, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new BooleanType()))); + assertSchemaField( + schema.getFields().get(10), + "[version=2.0].[type=ArrayType].[type=array].[type=union].arrayOfNullableString", + "union", + false, + false, + new SchemaFieldDataType() + .setType( + SchemaFieldDataType.Type.create( + new ArrayType().setNestedType(new StringArray("union"))))); + assertSchemaField( + schema.getFields().get(11), + "[version=2.0].[type=ArrayType].[type=array].[type=union].[type=string].arrayOfNullableString", + "string", + false, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new StringType()))); + assertSchemaField( + schema.getFields().get(12), + "[version=2.0].[type=ArrayType].[type=array].[type=union].arrayOfNullableRecord", + "union", + false, + false, + new SchemaFieldDataType() + .setType( + SchemaFieldDataType.Type.create( + new ArrayType().setNestedType(new StringArray("union"))))); + assertSchemaField( + schema.getFields().get(13), + "[version=2.0].[type=ArrayType].[type=array].[type=union].[type=ComplexTypeNullable].arrayOfNullableRecord", + "ComplexTypeNullable", + false, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new RecordType()))); + assertSchemaField( + schema.getFields().get(14), + "[version=2.0].[type=ArrayType].[type=array].[type=union].[type=ComplexTypeNullable].arrayOfNullableRecord.[type=string].field1", + "string", + false, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new StringType()))); + assertSchemaField( + schema.getFields().get(15), + "[version=2.0].[type=ArrayType].[type=array].[type=union].[type=ComplexTypeNullable].arrayOfNullableRecord.[type=int].field2", + "int", + false, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new NumberType()))); + } + + @Test(groups = "basic") + void testComplexStructs() throws IOException { + SchemaMetadata schema = + avroSchemaConverter.toDataHubSchema( + readAvroSchema("complex_structs.avsc"), false, false, dataPlatformUrn, null); + + schema.getFields().forEach(System.out::println); + + assertEquals(schema.getFields().size(), 13); + + assertSchemaField( + schema.getFields().get(0), + "[version=2.0].[type=StructType].[type=ComplexStruct].structField", + "ComplexStruct", + false, + 
false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new RecordType()))); + assertSchemaField( + schema.getFields().get(1), + "[version=2.0].[type=StructType].[type=ComplexStruct].structField.[type=string].fieldString", + "string", + false, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new StringType()))); + assertSchemaField( + schema.getFields().get(2), + "[version=2.0].[type=StructType].[type=ComplexStruct].structField.[type=int].fieldInt", + "int", + false, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new NumberType()))); + assertSchemaField( + schema.getFields().get(3), + "[version=2.0].[type=StructType].[type=ComplexStruct].structField.[type=boolean].fieldBoolean", + "boolean", + false, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new BooleanType()))); + assertSchemaField( + schema.getFields().get(4), + "[version=2.0].[type=StructType].[type=ComplexStruct].structField.[type=map].fieldMap", + "map", + false, + false, + new SchemaFieldDataType() + .setType( + SchemaFieldDataType.Type.create( + new MapType().setKeyType("string").setValueType("string")))); + assertSchemaField( + schema.getFields().get(5), + "[version=2.0].[type=StructType].[type=ComplexStruct].structField.[type=NestedRecord].fieldRecord", + "NestedRecord", + false, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new RecordType()))); + assertSchemaField( + schema.getFields().get(6), + "[version=2.0].[type=StructType].[type=ComplexStruct].structField.[type=NestedRecord].fieldRecord.[type=string].nestedField1", + "string", + false, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new StringType()))); + assertSchemaField( + schema.getFields().get(7), + "[version=2.0].[type=StructType].[type=ComplexStruct].structField.[type=NestedRecord].fieldRecord.[type=int].nestedField2", + "int", + false, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new NumberType()))); + assertSchemaField( + schema.getFields().get(8), + "[version=2.0].[type=StructType].[type=ComplexStruct].structField.[type=array].fieldArray", + "array(string)", + false, + false, + new SchemaFieldDataType() + .setType( + SchemaFieldDataType.Type.create( + new ArrayType().setNestedType(new StringArray("string"))))); + assertSchemaField( + schema.getFields().get(9), + "[version=2.0].[type=StructType].[type=ComplexStruct].structField.[type=union].fieldUnion", + "union", + true, + false, + new SchemaFieldDataType() + .setType( + SchemaFieldDataType.Type.create( + new UnionType().setNestedTypes(new StringArray("union"))))); + assertSchemaField( + schema.getFields().get(10), + "[version=2.0].[type=StructType].[type=ComplexStruct].structField.[type=union].[type=string].fieldUnion", + "string", + false, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new StringType()))); + assertSchemaField( + schema.getFields().get(11), + "[version=2.0].[type=StructType].[type=ComplexStruct].structField.[type=union].[type=int].fieldUnion", + "int", + false, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new NumberType()))); + assertSchemaField( + schema.getFields().get(12), + "[version=2.0].[type=StructType].[type=ComplexStruct].structField.[type=map].fieldNullableMap", + "map", + true, + false, + new SchemaFieldDataType() + .setType( + SchemaFieldDataType.Type.create( + new MapType().setKeyType("string").setValueType("string")))); + 
} + + @Test(groups = "basic") + void testComplexUnions() throws IOException { + SchemaMetadata schema = + avroSchemaConverter.toDataHubSchema( + readAvroSchema("complex_unions.avsc"), false, false, dataPlatformUrn, null); + + schema.getFields().forEach(System.out::println); + + assertEquals(schema.getFields().size(), 14); + + assertSchemaField( + schema.getFields().get(0), + "[version=2.0].[type=UnionType].[type=union].fieldUnionNullablePrimitives", + "union", + true, + false, + new SchemaFieldDataType() + .setType( + SchemaFieldDataType.Type.create( + new UnionType().setNestedTypes(new StringArray("union"))))); + assertSchemaField( + schema.getFields().get(1), + "[version=2.0].[type=UnionType].[type=union].[type=string].fieldUnionNullablePrimitives", + "string", + false, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new StringType()))); + assertSchemaField( + schema.getFields().get(2), + "[version=2.0].[type=UnionType].[type=union].[type=int].fieldUnionNullablePrimitives", + "int", + false, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new NumberType()))); + assertSchemaField( + schema.getFields().get(3), + "[version=2.0].[type=UnionType].[type=union].[type=boolean].fieldUnionNullablePrimitives", + "boolean", + false, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new BooleanType()))); + assertSchemaField( + schema.getFields().get(4), + "[version=2.0].[type=UnionType].[type=union].fieldUnionComplexTypes", + "union", + true, + false, + new SchemaFieldDataType() + .setType( + SchemaFieldDataType.Type.create( + new UnionType().setNestedTypes(new StringArray("union"))))); + assertSchemaField( + schema.getFields().get(5), + "[version=2.0].[type=UnionType].[type=union].[type=NestedRecord].fieldUnionComplexTypes", + "NestedRecord", + false, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new RecordType()))); + assertSchemaField( + schema.getFields().get(6), + "[version=2.0].[type=UnionType].[type=union].[type=NestedRecord].fieldUnionComplexTypes.[type=string].nestedField1", + "string", + false, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new StringType()))); + assertSchemaField( + schema.getFields().get(7), + "[version=2.0].[type=UnionType].[type=union].[type=NestedRecord].fieldUnionComplexTypes.[type=int].nestedField2", + "int", + false, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new NumberType()))); + assertSchemaField( + schema.getFields().get(8), + "[version=2.0].[type=UnionType].[type=union].[type=map].fieldUnionComplexTypes", + "map", + false, + false, + new SchemaFieldDataType() + .setType( + SchemaFieldDataType.Type.create( + new MapType().setKeyType("string").setValueType("string")))); + assertSchemaField( + schema.getFields().get(9), + "[version=2.0].[type=UnionType].[type=union].fieldUnionPrimitiveAndComplex", + "union", + true, + false, + new SchemaFieldDataType() + .setType( + SchemaFieldDataType.Type.create( + new UnionType().setNestedTypes(new StringArray("union"))))); + assertSchemaField( + schema.getFields().get(10), + "[version=2.0].[type=UnionType].[type=union].[type=string].fieldUnionPrimitiveAndComplex", + "string", + false, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new StringType()))); + assertSchemaField( + schema.getFields().get(11), + "[version=2.0].[type=UnionType].[type=union].[type=ComplexTypeRecord].fieldUnionPrimitiveAndComplex", + 
"ComplexTypeRecord", + false, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new RecordType()))); + assertSchemaField( + schema.getFields().get(12), + "[version=2.0].[type=UnionType].[type=union].[type=ComplexTypeRecord].fieldUnionPrimitiveAndComplex.[type=string].complexField1", + "string", + false, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new StringType()))); + assertSchemaField( + schema.getFields().get(13), + "[version=2.0].[type=UnionType].[type=union].[type=ComplexTypeRecord].fieldUnionPrimitiveAndComplex.[type=int].complexField2", + "int", + false, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new NumberType()))); + } + + @Test(groups = "basic") + void testLogicalTypes() throws IOException { + SchemaMetadata schema = + avroSchemaConverter.toDataHubSchema( + readAvroSchema("logical_types.avsc"), false, false, dataPlatformUrn, null); + + schema.getFields().forEach(System.out::println); + + assertEquals(schema.getFields().size(), 9); + + assertSchemaField( + schema.getFields().get(0), + "[version=2.0].[type=LogicalTypes].[type=bytes].decimalField", + "bytes(decimal)", + false, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new NumberType())), + "{\"scale\":2,\"logicalType\":\"decimal\",\"precision\":9}"); + assertSchemaField( + schema.getFields().get(1), + "[version=2.0].[type=LogicalTypes].[type=bytes].decimalFieldWithoutScale", + "bytes(decimal)", + false, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new NumberType())), + "{\"logicalType\":\"decimal\",\"precision\":9}"); + assertSchemaField( + schema.getFields().get(2), + "[version=2.0].[type=LogicalTypes].[type=bytes].decimalFieldWithoutPrecisionAndScale", + "bytes", + false, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new BytesType())), + "{\"logicalType\":\"decimal\"}"); + assertSchemaField( + schema.getFields().get(3), + "[version=2.0].[type=LogicalTypes].[type=long].timestampMillisField", + "long(timestamp-millis)", + false, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new TimeType())), + "{\"logicalType\":\"timestamp-millis\"}"); + assertSchemaField( + schema.getFields().get(4), + "[version=2.0].[type=LogicalTypes].[type=long].timestampMicrosField", + "long(timestamp-micros)", + false, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new TimeType())), + "{\"logicalType\":\"timestamp-micros\"}"); + assertSchemaField( + schema.getFields().get(5), + "[version=2.0].[type=LogicalTypes].[type=int].dateField", + "int(date)", + false, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new DateType())), + "{\"logicalType\":\"date\"}"); + assertSchemaField( + schema.getFields().get(6), + "[version=2.0].[type=LogicalTypes].[type=int].timeMillisField", + "int(time-millis)", + false, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new TimeType())), + "{\"logicalType\":\"time-millis\"}"); + assertSchemaField( + schema.getFields().get(7), + "[version=2.0].[type=LogicalTypes].[type=long].timeMicrosField", + "long(time-micros)", + false, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new TimeType())), + "{\"logicalType\":\"time-micros\"}"); + assertSchemaField( + schema.getFields().get(8), + "[version=2.0].[type=LogicalTypes].[type=string].uuidField", + "string(uuid)", + false, + false, + new 
SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new StringType())), + "{\"logicalType\":\"uuid\"}"); + } + + @Test(groups = "basic") + void testUsersRecord() throws IOException { + // this is a test case got during the Hudi integration + SchemaMetadata schema = + avroSchemaConverter.toDataHubSchema( + readAvroSchema("users_record.avsc"), false, false, dataPlatformUrn, null); + + schema.getFields().forEach(System.out::println); + + assertEquals(schema.getFields().size(), 20); + + assertSchemaField( + schema.getFields().get(0), + "[version=2.0].[type=users_record].[type=string]._hoodie_commit_time", + "string", + true, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new StringType()))); + assertSchemaField( + schema.getFields().get(1), + "[version=2.0].[type=users_record].[type=string]._hoodie_commit_seqno", + "string", + true, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new StringType()))); + assertSchemaField( + schema.getFields().get(2), + "[version=2.0].[type=users_record].[type=string]._hoodie_record_key", + "string", + true, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new StringType()))); + assertSchemaField( + schema.getFields().get(3), + "[version=2.0].[type=users_record].[type=string]._hoodie_partition_path", + "string", + true, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new StringType()))); + assertSchemaField( + schema.getFields().get(4), + "[version=2.0].[type=users_record].[type=string]._hoodie_file_name", + "string", + true, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new StringType()))); + assertSchemaField( + schema.getFields().get(5), + "[version=2.0].[type=users_record].[type=string].user_id", + "string", + false, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new StringType()))); + assertSchemaField( + schema.getFields().get(6), + "[version=2.0].[type=users_record].[type=string].name", + "string", + true, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new StringType()))); + assertSchemaField( + schema.getFields().get(7), + "[version=2.0].[type=users_record].[type=address].address", + "address", + true, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new RecordType()))); + assertSchemaField( + schema.getFields().get(8), + "[version=2.0].[type=users_record].[type=address].address.[type=string].street", + "string", + true, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new StringType()))); + assertSchemaField( + schema.getFields().get(9), + "[version=2.0].[type=users_record].[type=address].address.[type=string].city", + "string", + true, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new StringType()))); + assertSchemaField( + schema.getFields().get(10), + "[version=2.0].[type=users_record].[type=address].address.[type=string].country", + "string", + true, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new StringType()))); + assertSchemaField( + schema.getFields().get(11), + "[version=2.0].[type=users_record].[type=address].address.[type=string].postal_code", + "string", + true, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new StringType()))); + assertSchemaField( + schema.getFields().get(12), + "[version=2.0].[type=users_record].[type=address].address.[type=long].created_at", + 
"long(timestamp-micros)", + true, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new TimeType())), + "{\"logicalType\":\"timestamp-micros\"}"); + assertSchemaField( + schema.getFields().get(13), + "[version=2.0].[type=users_record].[type=contact].contact", + "contact", + true, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new RecordType()))); + assertSchemaField( + schema.getFields().get(14), + "[version=2.0].[type=users_record].[type=contact].contact.[type=string].email", + "string", + true, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new StringType()))); + assertSchemaField( + schema.getFields().get(15), + "[version=2.0].[type=users_record].[type=contact].contact.[type=string].phone", + "string", + true, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new StringType()))); + assertSchemaField( + schema.getFields().get(16), + "[version=2.0].[type=users_record].[type=long].created_at", + "long(timestamp-micros)", + true, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new TimeType())), + "{\"logicalType\":\"timestamp-micros\"}"); + assertSchemaField( + schema.getFields().get(17), + "[version=2.0].[type=users_record].[type=long].updated_at", + "long(timestamp-micros)", + true, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new TimeType())), + "{\"logicalType\":\"timestamp-micros\"}"); + assertSchemaField( + schema.getFields().get(18), + "[version=2.0].[type=users_record].[type=map].[type=int].props", + "int", + true, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new NumberType()))); + assertSchemaField( + schema.getFields().get(19), + "[version=2.0].[type=users_record].[type=string].country", + "string", + true, + false, + new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new StringType()))); + } + + private void assertSchemaField( + SchemaField field, + String expectedPath, + String expectedNativeType, + boolean expectedNullable, + boolean expectedIsPartOfKey, + SchemaFieldDataType expectedType) { + assertSchemaField( + field, + expectedPath, + expectedNativeType, + expectedNullable, + expectedIsPartOfKey, + expectedType, + null); + } + + private void assertSchemaField( + SchemaField field, + String expectedPath, + String expectedNativeType, + boolean expectedNullable, + boolean expectedIsPartOfKey, + SchemaFieldDataType expectedType, + String expectedJsonProps) { + assertEquals(field.getFieldPath(), expectedPath); + assertEquals(field.getNativeDataType(), expectedNativeType); + assertEquals(field.isNullable(), expectedNullable); + assertEquals(field.isIsPartOfKey(), expectedIsPartOfKey); + assertEquals(field.getType(), expectedType); + if (expectedJsonProps != null) { + assertEquals(field.getJsonProps(), expectedJsonProps); + } + } + + private Schema readAvroSchema(String schemaFileName) throws IOException { + String schemaPath = getClass().getClassLoader().getResource(schemaFileName).getPath(); + File schemaFile = new File(schemaPath); + return new Schema.Parser().parse(schemaFile); + } +} diff --git a/metadata-integration/java/datahub-schematron/lib/src/test/resources/CustomerProfile.avsc b/metadata-integration/java/datahub-schematron/lib/src/test/resources/CustomerProfile.avsc deleted file mode 100644 index 81f8b0e54b11e0..00000000000000 --- a/metadata-integration/java/datahub-schematron/lib/src/test/resources/CustomerProfile.avsc +++ /dev/null @@ -1,456 +0,0 @@ 
-{ - "type": "record", - "name": "CustomerProfile", - "namespace": "com.example.customer", - "doc": "A complex customer profile schema demonstrating various union types and optional fields", - "fields": [ - { - "name": "customerId", - "type": { - "type": "string", - "logicalType": "uuid" - }, - "doc": "Unique identifier for the customer" - }, - { - "name": "identificationDocument", - "type": [ - "null", - { - "type": "record", - "name": "Passport", - "fields": [ - { - "name": "passportNumber", - "type": "string" - }, - { - "name": "expiryDate", - "type": { - "type": "long", - "logicalType": "date" - } - } - ] - }, - { - "type": "record", - "name": "DriversLicense", - "fields": [ - { - "name": "licenseNumber", - "type": "string" - }, - { - "name": "state", - "type": "string" - }, - { - "name": "validUntil", - "type": { - "type": "long", - "logicalType": "date" - } - } - ] - }, - { - "type": "record", - "name": "NationalID", - "fields": [ - { - "name": "idNumber", - "type": "string" - }, - { - "name": "country", - "type": "string" - } - ] - } - ], - "default": null, - "doc": "Customer's identification document - can be passport, driver's license, or national ID" - }, - { - "name": "contactInfo", - "type": { - "type": "record", - "name": "ContactInformation", - "fields": [ - { - "name": "primaryContact", - "type": [ - { - "type": "record", - "name": "EmailContact", - "fields": [ - { - "name": "emailAddress", - "type": "string" - }, - { - "name": "isVerified", - "type": "boolean", - "default": false - } - ] - }, - { - "type": "record", - "name": "PhoneContact", - "fields": [ - { - "name": "countryCode", - "type": "string" - }, - { - "name": "number", - "type": "string" - }, - { - "name": "type", - "type": { - "type": "enum", - "name": "PhoneType", - "symbols": [ - "MOBILE", - "LANDLINE" - ] - } - } - ] - } - ], - "doc": "Primary contact method - either email or phone" - }, - { - "name": "alternativeContacts", - "type": { - "type": "array", - "items": [ - "null", - "EmailContact", - "PhoneContact" - ] - }, - "default": [], - "doc": "List of alternative contact methods" - } - ] - } - }, - { - "name": "addresses", - "type": { - "type": "array", - "items": { - "type": "record", - "name": "Address", - "fields": [ - { - "name": "type", - "type": { - "type": "enum", - "name": "AddressType", - "symbols": [ - "RESIDENTIAL", - "BUSINESS", - "SHIPPING" - ] - }, - "default": "RESIDENTIAL" - }, - { - "name": "street", - "type": "string" - }, - { - "name": "city", - "type": "string" - }, - { - "name": "state", - "type": [ - "null", - "string" - ], - "default": null - }, - { - "name": "country", - "type": "string" - }, - { - "name": "postalCode", - "type": [ - "null", - "string" - ], - "default": null - }, - { - "name": "validationStatus", - "type": [ - "null", - { - "type": "record", - "name": "AddressValidation", - "fields": [ - { - "name": "isValid", - "type": "boolean" - }, - { - "name": "verificationDate", - "type": { - "type": "long", - "logicalType": "timestamp-millis" - } - }, - { - "name": "verificationMethod", - "type": { - "type": "enum", - "name": "VerificationMethod", - "symbols": [ - "MANUAL", - "AUTOMATED" - ] - } - } - ] - } - ], - "default": null - } - ] - } - }, - "doc": "Customer's addresses with validation information" - }, - { - "name": "preferences", - "type": { - "type": "map", - "values": [ - "null", - "string", - "boolean", - { - "type": "record", - "name": "FrequencyPreference", - "fields": [ - { - "name": "frequency", - "type": { - "type": "enum", - "name": "Frequency", - 
"symbols": [ - "DAILY", - "WEEKLY", - "MONTHLY" - ] - } - }, - { - "name": "enabled", - "type": "boolean", - "default": true - }, - { - "name": "lastUpdated", - "type": { - "type": "long", - "logicalType": "timestamp-millis" - } - } - ] - } - ] - }, - "doc": "Customer preferences with various possible value types" - }, - { - "name": "subscriptionHistory", - "type": [ - "null", - { - "type": "array", - "items": { - "type": "record", - "name": "Subscription", - "fields": [ - { - "name": "planName", - "type": "string" - }, - { - "name": "startDate", - "type": { - "type": "long", - "logicalType": "date" - } - }, - { - "name": "endDate", - "type": [ - "null", - { - "type": "long", - "logicalType": "date" - } - ], - "default": null - }, - { - "name": "status", - "type": { - "type": "enum", - "name": "SubscriptionStatus", - "symbols": [ - "ACTIVE", - "CANCELLED", - "EXPIRED", - "SUSPENDED" - ] - } - }, - { - "name": "paymentMethod", - "type": [ - "null", - { - "type": "record", - "name": "PaymentMethod", - "fields": [ - { - "name": "type", - "type": { - "type": "enum", - "name": "PaymentType", - "symbols": [ - "CREDIT_CARD", - "DEBIT_CARD", - "BANK_TRANSFER", - "DIGITAL_WALLET" - ] - } - }, - { - "name": "lastFourDigits", - "type": [ - "null", - "string" - ], - "default": null - }, - { - "name": "expiryDate", - "type": [ - "null", - { - "type": "long", - "logicalType": "date" - } - ], - "default": null - } - ] - } - ], - "default": null - } - ] - } - } - ], - "default": null, - "doc": "Historical record of customer subscriptions" - }, - { - "name": "metadata", - "type": { - "type": "map", - "values": [ - "null", - "string", - "long", - "boolean", - { - "type": "record", - "name": "MetadataValue", - "fields": [ - { - "name": "value", - "type": [ - "null", - "string", - "long", - "boolean" - ], - "default": null - }, - { - "name": "timestamp", - "type": { - "type": "long", - "logicalType": "timestamp-millis" - } - }, - { - "name": "source", - "type": "string" - } - ] - } - ] - }, - "doc": "Flexible metadata storage with various possible value types" - }, - { - "name": "tags", - "type": [ - "null", - { - "type": "array", - "items": { - "type": "record", - "name": "Tag", - "fields": [ - { - "name": "name", - "type": "string" - }, - { - "name": "value", - "type": [ - "null", - "string" - ], - "default": null - }, - { - "name": "score", - "type": [ - "null", - "double" - ], - "default": null - }, - { - "name": "addedAt", - "type": { - "type": "long", - "logicalType": "timestamp-millis" - } - } - ] - } - } - ], - "default": null, - "doc": "Optional tags associated with the customer profile" - } - ] -} \ No newline at end of file diff --git a/metadata-integration/java/datahub-schematron/lib/src/test/resources/CustomerProfile2.avsc b/metadata-integration/java/datahub-schematron/lib/src/test/resources/CustomerProfile2.avsc deleted file mode 100644 index b8c7654ea072a2..00000000000000 --- a/metadata-integration/java/datahub-schematron/lib/src/test/resources/CustomerProfile2.avsc +++ /dev/null @@ -1,244 +0,0 @@ -{ - "type": "record", - "name": "CustomerProfile2", - "namespace": "com.example.customer", - "doc": "A complex customer profile schema demonstrating various union types and optional fields", - "fields": [ - { - "name": "customerId", - "type": { - "type": "string", - "logicalType": "uuid" - }, - "doc": "Unique identifier for the customer" - }, - { - "name": "identificationDocument", - "type": [ - "null", - { - "type": "record", - "name": "Passport", - "fields": [ - { - "name": "passportNumber", - 
"type": "string" - }, - { - "name": "expiryDate", - "type": { - "type": "long", - "logicalType": "date" - } - } - ] - }, - { - "type": "record", - "name": "DriversLicense", - "fields": [ - { - "name": "licenseNumber", - "type": "string" - }, - { - "name": "state", - "type": "string" - }, - { - "name": "validUntil", - "type": { - "type": "long", - "logicalType": "date" - } - } - ] - }, - { - "type": "record", - "name": "NationalID", - "fields": [ - { - "name": "idNumber", - "type": "string" - }, - { - "name": "country", - "type": "string" - } - ] - } - ], - "default": null, - "doc": "Customer's identification document" - }, - { - "name": "contactInfo", - "type": { - "type": "record", - "name": "ContactInformation", - "fields": [ - { - "name": "primaryEmailContact", - "type": [ - "null", - { - "type": "record", - "name": "PrimaryEmailContact", - "fields": [ - { - "name": "emailAddress", - "type": "string" - }, - { - "name": "isVerified", - "type": "boolean", - "default": false - } - ] - } - ], - "default": null - }, - { - "name": "primaryPhoneContact", - "type": [ - "null", - { - "type": "record", - "name": "PrimaryPhoneContact", - "fields": [ - { - "name": "countryCode", - "type": "string" - }, - { - "name": "number", - "type": "string" - }, - { - "name": "type", - "type": { - "type": "enum", - "name": "PhoneType", - "symbols": [ - "MOBILE", - "LANDLINE" - ] - } - } - ] - } - ], - "default": null - }, - { - "name": "alternativeEmailContacts", - "type": { - "type": "array", - "items": { - "type": "record", - "name": "AlternativeEmailContact", - "fields": [ - { - "name": "emailAddress", - "type": "string" - }, - { - "name": "isVerified", - "type": "boolean", - "default": false - } - ] - } - }, - "default": [] - }, - { - "name": "alternativePhoneContacts", - "type": { - "type": "array", - "items": { - "type": "record", - "name": "AlternativePhoneContact", - "fields": [ - { - "name": "countryCode", - "type": "string" - }, - { - "name": "number", - "type": "string" - }, - { - "name": "type", - "type": "PhoneType" - } - ] - } - }, - "default": [] - } - ] - } - }, - { - "name": "preferences", - "type": { - "type": "record", - "name": "Preferences", - "fields": [ - { - "name": "simplePreferences", - "type": { - "type": "map", - "values": [ - "null", - "string", - "boolean" - ] - }, - "default": {} - }, - { - "name": "frequencyPreferences", - "type": { - "type": "map", - "values": { - "type": "record", - "name": "FrequencyPreference", - "fields": [ - { - "name": "frequency", - "type": { - "type": "enum", - "name": "Frequency", - "symbols": [ - "DAILY", - "WEEKLY", - "MONTHLY" - ] - } - }, - { - "name": "enabled", - "type": "boolean", - "default": true - }, - { - "name": "lastUpdated", - "type": { - "type": "long", - "logicalType": "timestamp-millis" - } - } - ] - } - }, - "default": {} - } - ] - } - } - ] -} \ No newline at end of file diff --git a/metadata-integration/java/datahub-schematron/lib/src/test/resources/FlatUser.avsc b/metadata-integration/java/datahub-schematron/lib/src/test/resources/FlatUser.avsc deleted file mode 100644 index c796878c32ae41..00000000000000 --- a/metadata-integration/java/datahub-schematron/lib/src/test/resources/FlatUser.avsc +++ /dev/null @@ -1,45 +0,0 @@ -{ - "type": "record", - "name": "FlatUser", - "namespace": "com.example", - "fields": [ - { - "name": "id", - "type": "int", - "doc": "The unique identifier for a user", - "default": -1, - "metadata": { - "key1": "value1", - "key2": "value2" - } - }, - { - "name": "username", - "type": "string", - "doc": "The 
username of the user" - }, - { - "name": "email", - "type": "string", - "doc": "The email of the user" - }, - { - "name": "age", - "type": "int", - "doc": "The age of the user" - }, - { - "name": "isActive", - "type": "boolean", - "doc": "Whether the user is active or not" - }, - { - "name": "registrationDate", - "type": { - "type": "long", - "logicalType": "timestamp-millis" - }, - "doc": "The registration date of the user" - } - ] -} \ No newline at end of file diff --git a/metadata-integration/java/datahub-schematron/lib/src/test/resources/complex_arrays.avsc b/metadata-integration/java/datahub-schematron/lib/src/test/resources/complex_arrays.avsc new file mode 100644 index 00000000000000..8e8bcdaa0a7dce --- /dev/null +++ b/metadata-integration/java/datahub-schematron/lib/src/test/resources/complex_arrays.avsc @@ -0,0 +1,87 @@ +{ + "type": "record", + "name": "ArrayType", + "fields": [ + { + "name": "arrayOfString", + "type": { + "type": "array", + "items": "string" + } + }, + { + "name": "arrayOfMap", + "type": { + "type": "array", + "items": { + "type": "map", + "values": "string" + } + } + }, + { + "name": "arrayOfRecord", + "type": { + "type": "array", + "items": { + "type": "record", + "name": "ComplexType", + "fields": [ + { + "name": "field1", + "type": "string" + }, + { + "name": "field2", + "type": "int" + } + ] + } + } + }, + { + "name": "arrayOfArray", + "type": { + "type": "array", + "items": { + "type": "array", + "items": "string" + } + } + }, + { + "name": "arrayOfUnion", + "type": { + "type": "array", + "items": ["string", "int", "boolean"] + } + }, + { + "name": "arrayOfNullableString", + "type": { + "type": "array", + "items": ["null", "string"] + } + }, + { + "name": "arrayOfNullableRecord", + "type": { + "type": "array", + "items": ["null", { + "type": "record", + "name": "ComplexTypeNullable", + "fields": [ + { + "name": "field1", + "type": "string" + }, + { + "name": "field2", + "type": "int" + } + ] + }] + } + } + ] +} \ No newline at end of file diff --git a/metadata-integration/java/datahub-schematron/lib/src/test/resources/complex_maps.avsc b/metadata-integration/java/datahub-schematron/lib/src/test/resources/complex_maps.avsc new file mode 100644 index 00000000000000..baedae1b9dcc15 --- /dev/null +++ b/metadata-integration/java/datahub-schematron/lib/src/test/resources/complex_maps.avsc @@ -0,0 +1,87 @@ +{ + "type": "record", + "name": "MapType", + "fields": [ + { + "name": "mapOfString", + "type": { + "type": "map", + "values": "string" + } + }, + { + "name": "mapOfComplexType", + "type": { + "type": "map", + "values": { + "type": "record", + "name": "ComplexType", + "fields": [ + { + "name": "field1", + "type": "string" + }, + { + "name": "field2", + "type": "int" + } + ] + } + } + }, + { + "name": "mapOfNullableString", + "type": { + "type": "map", + "values": ["null", "string"] + } + }, + { + "name": "mapOfNullableComplexType", + "type": { + "type": "map", + "values": ["null", { + "type": "record", + "name": "ComplexTypeNullable", + "fields": [ + { + "name": "field1", + "type": "string" + }, + { + "name": "field2", + "type": "int" + } + ] + }] + } + }, + { + "name": "mapOfArray", + "type": { + "type": "map", + "values": { + "type": "array", + "items": "string" + } + } + }, + { + "name": "mapOfMap", + "type": { + "type": "map", + "values": { + "type": "map", + "values": "int" + } + } + }, + { + "name": "mapOfUnion", + "type": { + "type": "map", + "values": ["null", "string", "int"] + } + } + ] +} \ No newline at end of file diff --git 
a/metadata-integration/java/datahub-schematron/lib/src/test/resources/complex_structs.avsc b/metadata-integration/java/datahub-schematron/lib/src/test/resources/complex_structs.avsc new file mode 100644 index 00000000000000..7f5824192d3062 --- /dev/null +++ b/metadata-integration/java/datahub-schematron/lib/src/test/resources/complex_structs.avsc @@ -0,0 +1,76 @@ +{ + "type": "record", + "name": "StructType", + "fields": [ + { + "name": "structField", + "type": { + "type": "record", + "name": "ComplexStruct", + "fields": [ + { + "name": "fieldString", + "type": "string" + }, + { + "name": "fieldInt", + "type": "int" + }, + { + "name": "fieldBoolean", + "type": "boolean" + }, + { + "name": "fieldMap", + "type": { + "type": "map", + "values": "string" + } + }, + { + "name": "fieldRecord", + "type": { + "type": "record", + "name": "NestedRecord", + "fields": [ + { + "name": "nestedField1", + "type": "string" + }, + { + "name": "nestedField2", + "type": "int" + } + ] + } + }, + { + "name": "fieldArray", + "type": { + "type": "array", + "items": "string" + } + }, + { + "name": "fieldUnion", + "type": [ + "null", + "string", + "int" + ] + }, + { + "name": "fieldNullableMap", + "type": [ + "null", + { + "type": "map", + "values": "string" + } + ] + } + ] + } + } + ] +} \ No newline at end of file diff --git a/metadata-integration/java/datahub-schematron/lib/src/test/resources/complex_unions.avsc b/metadata-integration/java/datahub-schematron/lib/src/test/resources/complex_unions.avsc new file mode 100644 index 00000000000000..1a35f1cfa0e6d6 --- /dev/null +++ b/metadata-integration/java/datahub-schematron/lib/src/test/resources/complex_unions.avsc @@ -0,0 +1,60 @@ +{ + "type": "record", + "name": "UnionType", + "fields": [ + { + "name": "fieldUnionNullablePrimitives", + "type": [ + "null", + "string", + "int", + "boolean" + ] + }, + { + "name": "fieldUnionComplexTypes", + "type": [ + "null", + { + "type": "record", + "name": "NestedRecord", + "fields": [ + { + "name": "nestedField1", + "type": "string" + }, + { + "name": "nestedField2", + "type": "int" + } + ] + }, + { + "type": "map", + "values": "string" + } + ] + }, + { + "name": "fieldUnionPrimitiveAndComplex", + "type": [ + "null", + "string", + { + "type": "record", + "name": "ComplexTypeRecord", + "fields": [ + { + "name": "complexField1", + "type": "string" + }, + { + "name": "complexField2", + "type": "int" + } + ] + } + ] + } + ] +} \ No newline at end of file diff --git a/metadata-integration/java/datahub-schematron/lib/src/test/resources/logical_types.avsc b/metadata-integration/java/datahub-schematron/lib/src/test/resources/logical_types.avsc new file mode 100644 index 00000000000000..24919d82149653 --- /dev/null +++ b/metadata-integration/java/datahub-schematron/lib/src/test/resources/logical_types.avsc @@ -0,0 +1,72 @@ +{ + "type": "record", + "name": "LogicalTypes", + "fields": [ + { + "name": "decimalField", + "type": { + "type": "bytes", + "logicalType": "decimal", + "precision": 9, + "scale": 2 + } + }, + { + "name": "decimalFieldWithoutScale", + "type": { + "type": "bytes", + "logicalType": "decimal", + "precision": 9 + } + }, + { + "name": "decimalFieldWithoutPrecisionAndScale", + "type": { + "type": "bytes", + "logicalType": "decimal" + } + }, + { + "name": "timestampMillisField", + "type": { + "type": "long", + "logicalType": "timestamp-millis" + } + }, + { + "name": "timestampMicrosField", + "type": { + "type": "long", + "logicalType": "timestamp-micros" + } + }, + { + "name": "dateField", + "type": { + "type": "int", + 
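The logical_types.avsc fixture added here exercises Avro's logical type annotations, including decimal declarations that omit scale or precision. A hedged aside on how those annotations surface through the stock Avro API (the class below is illustrative and not part of the patch): Schema#getLogicalType returns the annotation, and for decimals the scale defaults to 0 when the .avsc leaves it out.

    import org.apache.avro.LogicalType;
    import org.apache.avro.LogicalTypes;
    import org.apache.avro.Schema;

    public final class DecimalIntrospection {
      // Returns a printable description of a field schema, unpacking decimal precision/scale.
      static String describe(Schema fieldSchema) {
        LogicalType logicalType = fieldSchema.getLogicalType();
        if (logicalType instanceof LogicalTypes.Decimal) {
          LogicalTypes.Decimal decimal = (LogicalTypes.Decimal) logicalType;
          // For declarations like decimalFieldWithoutScale, getScale() reports the default of 0.
          return "decimal(" + decimal.getPrecision() + ", " + decimal.getScale() + ")";
        }
        return logicalType == null
            ? fieldSchema.getType().toString()
            : fieldSchema.getType() + " (" + logicalType.getName() + ")";
      }
    }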
"logicalType": "date" + } + }, + { + "name": "timeMillisField", + "type": { + "type": "int", + "logicalType": "time-millis" + } + }, + { + "name": "timeMicrosField", + "type": { + "type": "long", + "logicalType": "time-micros" + } + }, + { + "name": "uuidField", + "type": { + "type": "string", + "logicalType": "uuid" + } + } + ] +} \ No newline at end of file diff --git a/metadata-integration/java/datahub-schematron/lib/src/test/resources/primitive_types.avsc b/metadata-integration/java/datahub-schematron/lib/src/test/resources/primitive_types.avsc new file mode 100644 index 00000000000000..c618299748fab1 --- /dev/null +++ b/metadata-integration/java/datahub-schematron/lib/src/test/resources/primitive_types.avsc @@ -0,0 +1,62 @@ +{ + "type": "record", + "name": "PrimitiveType", + "fields": [ + { + "name": "intField", + "type": "int" + }, + { + "name": "intFieldV2", + "type": ["int"] + }, + { + "name": "nullField", + "type": "null" + }, + { + "name": "nullFieldV2", + "type": ["null"] + }, + { + "name": "longField", + "type": "long" + }, + { + "name": "floatField", + "type": "float" + }, + { + "name": "doubleField", + "type": "double" + }, + { + "name": "stringField", + "type": "string" + }, + { + "name": "booleanField", + "type": "boolean" + }, + { + "name": "nullableIntField", + "type": ["null", "int"] + }, + { + "name": "nullableLongField", + "type": ["null", "long"] + }, + { + "name": "nullableStringField", + "type": ["null", "string"] + }, + { + "name": "status", + "type": { + "type": "enum", + "name": "StatusEnum", + "symbols": ["ACTIVE", "INACTIVE", "PENDING"] + } + } + ] +} \ No newline at end of file diff --git a/metadata-integration/java/datahub-schematron/lib/src/test/resources/users_record.avsc b/metadata-integration/java/datahub-schematron/lib/src/test/resources/users_record.avsc new file mode 100644 index 00000000000000..bd46ae715a4810 --- /dev/null +++ b/metadata-integration/java/datahub-schematron/lib/src/test/resources/users_record.avsc @@ -0,0 +1,195 @@ +{ + "type": "record", + "name": "users_record", + "namespace": "hoodie.users", + "fields": [ + { + "name": "_hoodie_commit_time", + "type": [ + "null", + "string" + ], + "doc": "", + "default": null + }, + { + "name": "_hoodie_commit_seqno", + "type": [ + "null", + "string" + ], + "doc": "", + "default": null + }, + { + "name": "_hoodie_record_key", + "type": [ + "null", + "string" + ], + "doc": "", + "default": null + }, + { + "name": "_hoodie_partition_path", + "type": [ + "null", + "string" + ], + "doc": "", + "default": null + }, + { + "name": "_hoodie_file_name", + "type": [ + "null", + "string" + ], + "doc": "", + "default": null + }, + { + "name": "user_id", + "type": "string" + }, + { + "name": "name", + "type": [ + "null", + "string" + ], + "default": null + }, + { + "name": "address", + "type": [ + "null", + { + "type": "record", + "name": "address", + "namespace": "hoodie.users.users_record", + "fields": [ + { + "name": "street", + "type": [ + "null", + "string" + ], + "default": null + }, + { + "name": "city", + "type": [ + "null", + "string" + ], + "default": null + }, + { + "name": "country", + "type": [ + "null", + "string" + ], + "default": null + }, + { + "name": "postal_code", + "type": [ + "null", + "string" + ], + "default": null + }, + { + "name": "created_at", + "type": [ + "null", + { + "type": "long", + "logicalType": "timestamp-micros" + } + ], + "default": null + } + ] + } + ], + "default": null + }, + { + "name": "contact", + "type": [ + "null", + { + "type": "record", + "name": "contact", + 
"namespace": "hoodie.users.users_record", + "fields": [ + { + "name": "email", + "type": [ + "null", + "string" + ], + "default": null + }, + { + "name": "phone", + "type": [ + "null", + "string" + ], + "default": null + } + ] + } + ], + "default": null + }, + { + "name": "created_at", + "type": [ + "null", + { + "type": "long", + "logicalType": "timestamp-micros" + } + ], + "default": null + }, + { + "name": "updated_at", + "type": [ + "null", + { + "type": "long", + "logicalType": "timestamp-micros" + } + ], + "default": null + }, + { + "name": "props", + "type": [ + "null", + { + "type": "map", + "values": [ + "null", + "int" + ] + } + ], + "default": null + }, + { + "name": "country", + "type": [ + "null", + "string" + ], + "default": null + } + ] +} \ No newline at end of file diff --git a/metadata-io/metadata-io-api/src/test/java/com/linkedin/metadata/entity/ebean/batch/AspectsBatchImplTest.java b/metadata-io/metadata-io-api/src/test/java/com/linkedin/metadata/entity/ebean/batch/AspectsBatchImplTest.java index 9f57d36f800de3..a3099b9ee21ea4 100644 --- a/metadata-io/metadata-io-api/src/test/java/com/linkedin/metadata/entity/ebean/batch/AspectsBatchImplTest.java +++ b/metadata-io/metadata-io-api/src/test/java/com/linkedin/metadata/entity/ebean/batch/AspectsBatchImplTest.java @@ -16,7 +16,7 @@ import com.linkedin.data.schema.annotation.PathSpecBasedSchemaAnnotationVisitor; import com.linkedin.dataset.DatasetProperties; import com.linkedin.events.metadata.ChangeType; -import com.linkedin.metadata.aspect.AspectRetriever; +import com.linkedin.metadata.aspect.CachingAspectRetriever; import com.linkedin.metadata.aspect.GraphRetriever; import com.linkedin.metadata.aspect.batch.MCPItem; import com.linkedin.metadata.aspect.patch.GenericJsonPatch; @@ -56,7 +56,7 @@ public class AspectsBatchImplTest { private EntityRegistry testRegistry; - private AspectRetriever mockAspectRetriever; + private CachingAspectRetriever mockAspectRetriever; private RetrieverContext retrieverContext; @BeforeTest @@ -75,12 +75,12 @@ public void beforeTest() throws EntityRegistryException { @BeforeMethod public void setup() { - this.mockAspectRetriever = mock(AspectRetriever.class); + this.mockAspectRetriever = mock(CachingAspectRetriever.class); when(this.mockAspectRetriever.getEntityRegistry()).thenReturn(testRegistry); this.retrieverContext = RetrieverContext.builder() .searchRetriever(mock(SearchRetriever.class)) - .aspectRetriever(mockAspectRetriever) + .cachingAspectRetriever(mockAspectRetriever) .graphRetriever(mock(GraphRetriever.class)) .build(); } diff --git a/metadata-io/src/main/java/com/linkedin/metadata/aspect/utils/DefaultAspectsUtil.java b/metadata-io/src/main/java/com/linkedin/metadata/aspect/utils/DefaultAspectsUtil.java index 99eadd223acd1a..82bc0ae1409c52 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/aspect/utils/DefaultAspectsUtil.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/aspect/utils/DefaultAspectsUtil.java @@ -137,7 +137,7 @@ public static List getAdditionalChanges( getProposalFromAspectForDefault( entry.getKey(), entry.getValue(), entityKeyAspect, templateItem), templateItem.getAuditStamp(), - opContext.getAspectRetrieverOpt().get())) + opContext.getAspectRetriever())) .filter(Objects::nonNull); }) .collect(Collectors.toList()); diff --git a/metadata-io/src/main/java/com/linkedin/metadata/client/EntityClientAspectRetriever.java b/metadata-io/src/main/java/com/linkedin/metadata/client/EntityClientAspectRetriever.java index bba8324d0c5612..669ec751f87c69 100644 --- 
a/metadata-io/src/main/java/com/linkedin/metadata/client/EntityClientAspectRetriever.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/client/EntityClientAspectRetriever.java @@ -35,7 +35,7 @@ public EntityRegistry getEntityRegistry() { @Override public Aspect getLatestAspectObject(@Nonnull Urn urn, @Nonnull String aspectName) { try { - return entityClient.getLatestAspectObject(systemOperationContext, urn, aspectName); + return entityClient.getLatestAspectObject(systemOperationContext, urn, aspectName, false); } catch (RemoteInvocationException | URISyntaxException e) { throw new RuntimeException(e); } @@ -49,7 +49,7 @@ public Map> getLatestAspectObjects( return Map.of(); } else { try { - return entityClient.getLatestAspects(systemOperationContext, urns, aspectNames); + return entityClient.getLatestAspects(systemOperationContext, urns, aspectNames, false); } catch (RemoteInvocationException | URISyntaxException e) { throw new RuntimeException(e); } @@ -70,7 +70,8 @@ public Map> getLatestSystemAspects( urnAspectNames.keySet(), urnAspectNames.values().stream() .flatMap(Collection::stream) - .collect(Collectors.toSet())); + .collect(Collectors.toSet()), + false); } catch (RemoteInvocationException | URISyntaxException e) { throw new RuntimeException(e); } diff --git a/metadata-io/src/main/java/com/linkedin/metadata/client/JavaEntityClient.java b/metadata-io/src/main/java/com/linkedin/metadata/client/JavaEntityClient.java index 29faa3955ea662..3d35f5956b0f4f 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/client/JavaEntityClient.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/client/JavaEntityClient.java @@ -106,11 +106,17 @@ public EntityResponse getV2( @Nonnull OperationContext opContext, @Nonnull String entityName, @Nonnull final Urn urn, - @Nullable final Set aspectNames) + @Nullable final Set aspectNames, + @Nullable Boolean alwaysIncludeKeyAspect) throws RemoteInvocationException, URISyntaxException { final Set projectedAspects = aspectNames == null ? opContext.getEntityAspectNames(entityName) : aspectNames; - return entityService.getEntityV2(opContext, entityName, urn, projectedAspects); + return entityService.getEntityV2( + opContext, + entityName, + urn, + projectedAspects, + alwaysIncludeKeyAspect == null || alwaysIncludeKeyAspect); } @Override @@ -126,7 +132,8 @@ public Map batchGetV2( @Nonnull OperationContext opContext, @Nonnull String entityName, @Nonnull Set urns, - @Nullable Set aspectNames) + @Nullable Set aspectNames, + @Nullable Boolean alwaysIncludeKeyAspect) throws RemoteInvocationException, URISyntaxException { final Set projectedAspects = aspectNames == null ? 
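JavaEntityClient#getV2 and batchGetV2 gain a @Nullable Boolean alwaysIncludeKeyAspect parameter, and the expression alwaysIncludeKeyAspect == null || alwaysIncludeKeyAspect resolves an unspecified value to true, which presumably preserves the previous behaviour for callers that pass null. A small sketch of that defaulting (class and method names are illustrative):

    public final class KeyAspectDefaulting {
      // null and Boolean.TRUE both resolve to "include the key aspect"; only an explicit false opts out.
      static boolean resolve(Boolean alwaysIncludeKeyAspect) {
        return alwaysIncludeKeyAspect == null || alwaysIncludeKeyAspect;
      }

      public static void main(String[] args) {
        System.out.println(resolve(null));  // true
        System.out.println(resolve(true));  // true
        System.out.println(resolve(false)); // false
      }
    }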
opContext.getEntityAspectNames(entityName) : aspectNames; @@ -139,7 +146,11 @@ public Map batchGetV2( try { responseMap.putAll( entityService.getEntitiesV2( - opContext, entityName, new HashSet<>(batch), projectedAspects)); + opContext, + entityName, + new HashSet<>(batch), + projectedAspects, + alwaysIncludeKeyAspect == null || alwaysIncludeKeyAspect)); } catch (URISyntaxException e) { throw new RuntimeException(e); } @@ -772,7 +783,7 @@ public List batchIngestProposals( .mcps( batch, auditStamp, - opContext.getRetrieverContext().get(), + opContext.getRetrieverContext(), opContext.getValidationContext().isAlternateValidation()) .build(); diff --git a/metadata-io/src/main/java/com/linkedin/metadata/client/SystemJavaEntityClient.java b/metadata-io/src/main/java/com/linkedin/metadata/client/SystemJavaEntityClient.java index eda9b3a880228f..1d2fd422d7f460 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/client/SystemJavaEntityClient.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/client/SystemJavaEntityClient.java @@ -89,6 +89,6 @@ public Map batchGetV2NoCache( @Nonnull Set urns, @Nullable Set aspectNames) throws RemoteInvocationException, URISyntaxException { - return super.batchGetV2(opContext, entityName, urns, aspectNames); + return super.batchGetV2(opContext, entityName, urns, aspectNames, false); } } diff --git a/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityServiceAspectRetriever.java b/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityServiceAspectRetriever.java index 626a1f72f5fb73..50cf8af30d606a 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityServiceAspectRetriever.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityServiceAspectRetriever.java @@ -5,7 +5,7 @@ import com.linkedin.common.urn.Urn; import com.linkedin.entity.Aspect; -import com.linkedin.metadata.aspect.CachingAspectRetriever; +import com.linkedin.metadata.aspect.AspectRetriever; import com.linkedin.metadata.aspect.SystemAspect; import com.linkedin.metadata.models.registry.EntityRegistry; import io.datahubproject.metadata.context.OperationContext; @@ -22,7 +22,7 @@ @Getter @Builder -public class EntityServiceAspectRetriever implements CachingAspectRetriever { +public class EntityServiceAspectRetriever implements AspectRetriever { @Setter private OperationContext systemOperationContext; private final EntityRegistry entityRegistry; @@ -46,7 +46,8 @@ public Map> getLatestAspectObjects( String entityName = urns.stream().findFirst().map(Urn::getEntityType).get(); try { return entityResponseToAspectMap( - entityService.getEntitiesV2(systemOperationContext, entityName, urns, aspectNames)); + entityService.getEntitiesV2( + systemOperationContext, entityName, urns, aspectNames, false)); } catch (URISyntaxException e) { throw new RuntimeException(e); } @@ -71,7 +72,8 @@ public Map> getLatestSystemAspects( urnAspectNames.keySet(), urnAspectNames.values().stream() .flatMap(Collection::stream) - .collect(Collectors.toSet())), + .collect(Collectors.toSet()), + false), entityRegistry); } catch (URISyntaxException e) { throw new RuntimeException(e); diff --git a/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityServiceImpl.java b/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityServiceImpl.java index 6de7784bfbc0ec..8ae09111204cab 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityServiceImpl.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityServiceImpl.java @@ -261,8 +261,7 
@@ public Map> getLatestAspects( } List systemAspects = - EntityUtils.toSystemAspects( - opContext.getRetrieverContext().get(), batchGetResults.values()); + EntityUtils.toSystemAspects(opContext.getRetrieverContext(), batchGetResults.values()); systemAspects.stream() // for now, don't add the key aspect here we have already added it above @@ -290,8 +289,7 @@ public Map getLatestAspectsForUrn( Map batchGetResults = getLatestAspect(opContext, new HashSet<>(Arrays.asList(urn)), aspectNames, forUpdate); - return EntityUtils.toSystemAspects( - opContext.getRetrieverContext().get(), batchGetResults.values()) + return EntityUtils.toSystemAspects(opContext.getRetrieverContext(), batchGetResults.values()) .stream() .map( systemAspect -> Pair.of(systemAspect.getAspectName(), systemAspect.getRecordTemplate())) @@ -335,7 +333,7 @@ public Pair getAspectVersionPair( final Optional maybeAspect = Optional.ofNullable(aspectDao.getAspect(primaryKey)); return Pair.of( - EntityUtils.toSystemAspect(opContext.getRetrieverContext().get(), maybeAspect.orElse(null)) + EntityUtils.toSystemAspect(opContext.getRetrieverContext(), maybeAspect.orElse(null)) .map(SystemAspect::getRecordTemplate) .orElse(null), version); @@ -721,7 +719,7 @@ public ListResult listLatestAspects( } return new ListResult<>( - EntityUtils.toSystemAspects(opContext.getRetrieverContext().get(), entityAspects).stream() + EntityUtils.toSystemAspects(opContext.getRetrieverContext(), entityAspects).stream() .map(SystemAspect::getRecordTemplate) .collect(Collectors.toList()), aspectMetadataList.getMetadata(), @@ -758,12 +756,12 @@ public List ingestAspects( .recordTemplate(pair.getValue()) .systemMetadata(systemMetadata) .auditStamp(auditStamp) - .build(opContext.getAspectRetrieverOpt().get())) + .build(opContext.getAspectRetriever())) .collect(Collectors.toList()); return ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(items) .build(), true, @@ -815,13 +813,13 @@ private void processPostCommitMCLSideEffects( log.debug("Considering {} MCLs post commit side effects.", mcls.size()); List batch = mcls.stream() - .map(mcl -> MCLItemImpl.builder().build(mcl, opContext.getAspectRetrieverOpt().get())) + .map(mcl -> MCLItemImpl.builder().build(mcl, opContext.getAspectRetriever())) .collect(Collectors.toList()); Iterable> iterable = () -> Iterators.partition( - AspectsBatch.applyPostMCPSideEffects(batch, opContext.getRetrieverContext().get()) + AspectsBatch.applyPostMCPSideEffects(batch, opContext.getRetrieverContext()) .iterator(), MCP_SIDE_EFFECT_KAFKA_BATCH_SIZE); StreamSupport.stream(iterable.spliterator(), false) @@ -831,7 +829,7 @@ private void processPostCommitMCLSideEffects( ingestProposalAsync( AspectsBatchImpl.builder() .items(sideEffects) - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .build()) .count(); log.info("Generated {} MCP SideEffects for async processing", count); @@ -879,8 +877,7 @@ private List ingestAspectsToLocalDB( aspectDao.getLatestAspects(urnAspects, true); final Map> batchAspects = - EntityUtils.toSystemAspects( - opContext.getRetrieverContext().get(), databaseAspects); + EntityUtils.toSystemAspects(opContext.getRetrieverContext(), databaseAspects); // read #2 (potentially) final Map> nextVersions = @@ -903,7 +900,7 @@ private List ingestAspectsToLocalDB( Map> newLatestAspects = EntityUtils.toSystemAspects( - 
opContext.getRetrieverContext().get(), + opContext.getRetrieverContext(), aspectDao.getLatestAspects(updatedItems.getFirst(), true)); // merge updatedLatestAspects = AspectsBatch.merge(batchAspects, newLatestAspects); @@ -941,7 +938,7 @@ private List ingestAspectsToLocalDB( // do final pre-commit checks with previous aspect value ValidationExceptionCollection exceptions = - AspectsBatch.validatePreCommit(changeMCPs, opContext.getRetrieverContext().get()); + AspectsBatch.validatePreCommit(changeMCPs, opContext.getRetrieverContext()); if (exceptions.hasFatalExceptions()) { // IF this is a client request/API request we fail the `transaction batch` @@ -1143,8 +1140,8 @@ public RecordTemplate ingestAspectIfNotPresent( .recordTemplate(newValue) .systemMetadata(systemMetadata) .auditStamp(auditStamp) - .build(opContext.getAspectRetrieverOpt().get()), - opContext.getRetrieverContext().get()) + .build(opContext.getAspectRetriever()), + opContext.getRetrieverContext()) .build(); List ingested = ingestAspects(opContext, aspectsBatch, true, false); @@ -1169,7 +1166,7 @@ public IngestResult ingestProposal( return ingestProposal( opContext, AspectsBatchImpl.builder() - .mcps(List.of(proposal), auditStamp, opContext.getRetrieverContext().get()) + .mcps(List.of(proposal), auditStamp, opContext.getRetrieverContext()) .build(), async) .stream() @@ -1246,7 +1243,7 @@ private Stream ingestTimeseriesProposal( .recordTemplate( EntityApiUtils.buildKeyAspect( opContext.getEntityRegistry(), item.getUrn())) - .build(opContext.getAspectRetrieverOpt().get())) + .build(opContext.getAspectRetriever())) .collect(Collectors.toList()); ingestProposalSync( @@ -1469,7 +1466,7 @@ public List restoreIndices( List systemAspects = EntityUtils.toSystemAspectFromEbeanAspects( - opContext.getRetrieverContext().get(), batch.collect(Collectors.toList())); + opContext.getRetrieverContext(), batch.collect(Collectors.toList())); RestoreIndicesResult result = restoreIndices(opContext, systemAspects, logger); result.timeSqlQueryMs = timeSqlQueryMs; @@ -1513,7 +1510,7 @@ public List restoreIndices( long startTime = System.currentTimeMillis(); List systemAspects = EntityUtils.toSystemAspects( - opContext.getRetrieverContext().get(), + opContext.getRetrieverContext(), getLatestAspect(opContext, entityBatch.getValue(), aspectNames, false).values()); long timeSqlQueryMs = System.currentTimeMillis() - startTime; @@ -1649,12 +1646,12 @@ private RestoreIndicesResult restoreIndices( .auditStamp(auditStamp) .systemMetadata(latestSystemMetadata) .recordTemplate(EntityApiUtils.buildKeyAspect(opContext.getEntityRegistry(), urn)) - .build(opContext.getAspectRetrieverOpt().get())); + .build(opContext.getAspectRetriever())); Stream defaultAspectsResult = ingestProposalSync( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(keyAspect) .build()); defaultAspectsCreated += defaultAspectsResult.count(); @@ -1966,7 +1963,7 @@ private void ingestSnapshotUnion( AspectsBatchImpl aspectsBatch = AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items( aspectRecordsToIngest.stream() .map( @@ -1977,7 +1974,7 @@ private void ingestSnapshotUnion( .recordTemplate(pair.getValue()) .auditStamp(auditStamp) .systemMetadata(systemMetadata) - .build(opContext.getAspectRetrieverOpt().get())) + .build(opContext.getAspectRetriever())) .collect(Collectors.toList())) .build(); @@ 
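The bulk of the EntityServiceImpl changes here are mechanical: OperationContext#getRetrieverContext and #getAspectRetriever are now called directly instead of unwrapping Optionals via getRetrieverContext().get() and getAspectRetrieverOpt().get(). Condensed into one hedged example of the new call shape (the helper class is illustrative; the builder calls mirror the hunks above):

    import com.linkedin.common.AuditStamp;
    import com.linkedin.metadata.entity.ebean.batch.AspectsBatchImpl;
    import com.linkedin.mxe.MetadataChangeProposal;
    import io.datahubproject.metadata.context.OperationContext;
    import java.util.List;

    public final class BatchFromProposals {
      // before: .mcps(mcps, auditStamp, opContext.getRetrieverContext().get())
      // after:  .mcps(mcps, auditStamp, opContext.getRetrieverContext())
      static AspectsBatchImpl build(
          OperationContext opContext, List<MetadataChangeProposal> mcps, AuditStamp auditStamp) {
        return AspectsBatchImpl.builder()
            .mcps(mcps, auditStamp, opContext.getRetrieverContext())
            .build();
      }
    }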
-2128,7 +2125,7 @@ public RollbackRunResult deleteUrn(@Nonnull OperationContext opContext, Urn urn) } SystemMetadata latestKeySystemMetadata = - EntityUtils.toSystemAspect(opContext.getRetrieverContext().get(), latestKey) + EntityUtils.toSystemAspect(opContext.getRetrieverContext(), latestKey) .map(SystemAspect::getSystemMetadata) .get(); RollbackResult result = @@ -2253,11 +2250,11 @@ private RollbackResult deleteAspectWithoutMCL( .urn(entityUrn) .aspectName(aspectName) .auditStamp(auditStamp) - .build(opContext.getAspectRetrieverOpt().get()); + .build(opContext.getAspectRetriever()); // Delete validation hooks ValidationExceptionCollection exceptions = - AspectsBatch.validateProposed(List.of(deleteItem), opContext.getRetrieverContext().get()); + AspectsBatch.validateProposed(List.of(deleteItem), opContext.getRetrieverContext()); if (!exceptions.isEmpty()) { throw new ValidationException(collectMetrics(exceptions).toString()); } @@ -2271,7 +2268,7 @@ private RollbackResult deleteAspectWithoutMCL( final EntityAspect.EntitySystemAspect latest = (EntityAspect.EntitySystemAspect) EntityUtils.toSystemAspect( - opContext.getRetrieverContext().get(), + opContext.getRetrieverContext(), aspectDao.getLatestAspect(urn, aspectName, false)) .orElse(null); @@ -2299,7 +2296,7 @@ private RollbackResult deleteAspectWithoutMCL( EntityAspect.EntitySystemAspect candidateAspect = (EntityAspect.EntitySystemAspect) EntityUtils.toSystemAspect( - opContext.getRetrieverContext().get(), + opContext.getRetrieverContext(), aspectDao.getAspect(urn, aspectName, maxVersion)) .orElse(null); SystemMetadata previousSysMetadata = @@ -2325,13 +2322,9 @@ private RollbackResult deleteAspectWithoutMCL( .urn(UrnUtils.getUrn(toDelete.getUrn())) .aspectName(toDelete.getAspect()) .auditStamp(auditStamp) - .build( - opContext - .getRetrieverContext() - .get() - .getAspectRetriever())) + .build(opContext.getAspectRetriever())) .collect(Collectors.toList()), - opContext.getRetrieverContext().get()); + opContext.getRetrieverContext()); if (!preCommitExceptions.isEmpty()) { throw new ValidationException(collectMetrics(preCommitExceptions).toString()); } @@ -2509,7 +2502,7 @@ private Map getEnvelopedAspects( final Map dbEntries = aspectDao.batchGet(dbKeys, false); List envelopedAspects = - EntityUtils.toSystemAspects(opContext.getRetrieverContext().get(), dbEntries.values()); + EntityUtils.toSystemAspects(opContext.getRetrieverContext(), dbEntries.values()); return envelopedAspects.stream() .collect( diff --git a/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityUtils.java b/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityUtils.java index 3c4109970e9d0b..da48a2b76d6d56 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityUtils.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityUtils.java @@ -72,7 +72,7 @@ public static void ingestChangeProposals( entityService.ingestProposal( opContext, AspectsBatchImpl.builder() - .mcps(changes, getAuditStamp(actor), opContext.getRetrieverContext().get()) + .mcps(changes, getAuditStamp(actor), opContext.getRetrieverContext()) .build(), async); } diff --git a/metadata-io/src/main/java/com/linkedin/metadata/entity/cassandra/CassandraRetentionService.java b/metadata-io/src/main/java/com/linkedin/metadata/entity/cassandra/CassandraRetentionService.java index ccc1910ba5cdbd..c595e3e07b8342 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/entity/cassandra/CassandraRetentionService.java +++ 
b/metadata-io/src/main/java/com/linkedin/metadata/entity/cassandra/CassandraRetentionService.java @@ -64,7 +64,7 @@ protected AspectsBatch buildAspectsBatch( List mcps, @Nonnull AuditStamp auditStamp) { return AspectsBatchImpl.builder() - .mcps(mcps, auditStamp, opContext.getRetrieverContext().get()) + .mcps(mcps, auditStamp, opContext.getRetrieverContext()) .build(); } diff --git a/metadata-io/src/main/java/com/linkedin/metadata/entity/ebean/EbeanAspectDao.java b/metadata-io/src/main/java/com/linkedin/metadata/entity/ebean/EbeanAspectDao.java index bd6cc67561b883..ea580a97c51886 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/entity/ebean/EbeanAspectDao.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/entity/ebean/EbeanAspectDao.java @@ -93,8 +93,14 @@ public class EbeanAspectDao implements AspectDao, AspectMigrationsDao { */ private final LoadingCache locks; + private final String batchGetMethod; + public EbeanAspectDao(@Nonnull final Database server, EbeanConfiguration ebeanConfiguration) { _server = server; + this.batchGetMethod = + ebeanConfiguration.getBatchGetMethod() != null + ? ebeanConfiguration.getBatchGetMethod() + : "IN"; if (ebeanConfiguration.getLocking().isEnabled()) { this.locks = CacheBuilder.newBuilder() @@ -371,23 +377,37 @@ private List batchGet( final int totalPageCount = QueryUtils.getTotalPageCount(keys.size(), keysCount); final List finalResult = - batchGetUnion(new ArrayList<>(keys), keysCount, position, forUpdate); + batchGetSelectString(new ArrayList<>(keys), keysCount, position, forUpdate); while (QueryUtils.hasMore(position, keysCount, totalPageCount)) { position += keysCount; final List oneStatementResult = - batchGetUnion(new ArrayList<>(keys), keysCount, position, forUpdate); + batchGetSelectString(new ArrayList<>(keys), keysCount, position, forUpdate); finalResult.addAll(oneStatementResult); } return finalResult; } + @Nonnull + private List batchGetSelectString( + @Nonnull final List keys, + final int keysCount, + final int position, + boolean forUpdate) { + + if (batchGetMethod.equals("IN")) { + return batchGetIn(keys, keysCount, position, forUpdate); + } + + return batchGetUnion(keys, keysCount, position, forUpdate); + } + /** * Builds a single SELECT statement for batch get, which selects one entity, and then can be * UNION'd with other SELECT statements. 
*/ - private String batchGetSelect( + private String batchGetSelectString( final int selectId, @Nonnull final String urn, @Nonnull final String aspect, @@ -434,7 +454,7 @@ private List batchGetUnion( final Map params = new HashMap<>(); for (int index = position; index < end; index++) { sb.append( - batchGetSelect( + batchGetSelectString( index - position, keys.get(index).getUrn(), keys.get(index).getAspect(), @@ -467,6 +487,65 @@ private List batchGetUnion( return query.findList(); } + @Nonnull + private List batchGetIn( + @Nonnull final List keys, + final int keysCount, + final int position, + boolean forUpdate) { + validateConnection(); + + // Build a single SELECT with IN clause using composite key comparison + // Query will look like: + // SELECT * FROM metadata_aspect WHERE (urn, aspect, version) IN + // (('urn0', 'aspect0', 0), ('urn1', 'aspect1', 1)) + final StringBuilder sb = new StringBuilder(); + sb.append( + "SELECT urn, aspect, version, metadata, systemMetadata, createdOn, createdBy, createdFor "); + sb.append("FROM metadata_aspect_v2 WHERE (urn, aspect, version) IN ("); + + final int end = Math.min(keys.size(), position + keysCount); + final Map params = new HashMap<>(); + + for (int index = position; index < end; index++) { + int paramIndex = index - position; + String urnParam = "urn" + paramIndex; + String aspectParam = "aspect" + paramIndex; + String versionParam = "version" + paramIndex; + + params.put(urnParam, keys.get(index).getUrn()); + params.put(aspectParam, keys.get(index).getAspect()); + params.put(versionParam, keys.get(index).getVersion()); + + sb.append("(:" + urnParam + ", :" + aspectParam + ", :" + versionParam + ")"); + + if (index != end - 1) { + sb.append(","); + } + } + + sb.append(")"); + + if (forUpdate) { + sb.append(" FOR UPDATE"); + } + + final RawSql rawSql = + RawSqlBuilder.parse(sb.toString()) + .columnMapping(EbeanAspectV2.URN_COLUMN, "key.urn") + .columnMapping(EbeanAspectV2.ASPECT_COLUMN, "key.aspect") + .columnMapping(EbeanAspectV2.VERSION_COLUMN, "key.version") + .create(); + + final Query query = _server.find(EbeanAspectV2.class).setRawSql(rawSql); + + for (Map.Entry param : params.entrySet()) { + query.setParameter(param.getKey(), param.getValue()); + } + + return query.findList(); + } + @Override @Nonnull public ListResult listUrns( diff --git a/metadata-io/src/main/java/com/linkedin/metadata/entity/ebean/EbeanRetentionService.java b/metadata-io/src/main/java/com/linkedin/metadata/entity/ebean/EbeanRetentionService.java index 49fa555e006f61..74d0d8b0964de0 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/entity/ebean/EbeanRetentionService.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/entity/ebean/EbeanRetentionService.java @@ -59,7 +59,7 @@ protected AspectsBatch buildAspectsBatch( List mcps, @Nonnull AuditStamp auditStamp) { return AspectsBatchImpl.builder() - .mcps(mcps, auditStamp, opContext.getRetrieverContext().get()) + .mcps(mcps, auditStamp, opContext.getRetrieverContext()) .build(); } diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/filter/BaseQueryFilterRewriter.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/filter/BaseQueryFilterRewriter.java index 367705d369c7ce..6c5c6243d33620 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/filter/BaseQueryFilterRewriter.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/filter/BaseQueryFilterRewriter.java @@ -143,7 +143,7 
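batchGetIn above is the counterpart to the existing UNION strategy: batchGetSelectString dispatches to it when the configured batchGetMethod is "IN" (the constructor falls back to "IN" when the setting is absent), and it binds one (urn, aspect, version) tuple per key as named parameters. A standalone sketch of the SQL it assembles for a two-key page (the helper class is illustrative, but the column list and tuple syntax follow the code above):

    import java.util.List;
    import java.util.StringJoiner;

    public final class CompositeKeyInSketch {
      record Key(String urn, String aspect, long version) {}

      // Assembles the same SELECT ... WHERE (urn, aspect, version) IN (...) shape as batchGetIn,
      // with one named-parameter tuple (:urnN, :aspectN, :versionN) per key.
      static String buildSql(List<Key> keys) {
        StringJoiner tuples = new StringJoiner(", ");
        for (int i = 0; i < keys.size(); i++) {
          tuples.add("(:urn" + i + ", :aspect" + i + ", :version" + i + ")");
        }
        return "SELECT urn, aspect, version, metadata, systemMetadata, createdOn, createdBy, createdFor"
            + " FROM metadata_aspect_v2 WHERE (urn, aspect, version) IN (" + tuples + ")";
      }

      public static void main(String[] args) {
        System.out.println(
            buildSql(
                List.of(
                    new Key("urn:li:corpuser:datahub", "corpUserKey", 0L),
                    new Key("urn:li:corpuser:datahub", "corpUserInfo", 0L))));
      }
    }

For any other configured value, batchGetSelectString falls back to the existing UNION-per-key strategy.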
@@ private static QueryBuilder expandTerms( if (!queryUrns.isEmpty()) { scrollGraph( - opContext.getRetrieverContext().get().getGraphRetriever(), + opContext.getRetrieverContext().getGraphRetriever(), queryUrns, relationshipTypes, relationshipDirection, diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/transformer/SearchDocumentTransformer.java b/metadata-io/src/main/java/com/linkedin/metadata/search/transformer/SearchDocumentTransformer.java index 4bb8e0630de480..b4ad847cb7afc2 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/transformer/SearchDocumentTransformer.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/transformer/SearchDocumentTransformer.java @@ -437,8 +437,6 @@ private void setStructuredPropertiesSearchValue( Map> definitions = opContext - .getRetrieverContext() - .get() .getAspectRetriever() .getLatestAspectObjects( propertyMap.keySet(), Set.of(STRUCTURED_PROPERTY_DEFINITION_ASPECT_NAME)); diff --git a/metadata-io/src/main/java/com/linkedin/metadata/service/BusinessAttributeUpdateHookService.java b/metadata-io/src/main/java/com/linkedin/metadata/service/BusinessAttributeUpdateHookService.java index ad2825ead3d0da..4a692e95346222 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/service/BusinessAttributeUpdateHookService.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/service/BusinessAttributeUpdateHookService.java @@ -112,7 +112,7 @@ private void fetchRelatedEntities( @Nullable String scrollId, int consumedEntityCount, int batchNumber) { - GraphRetriever graph = opContext.getRetrieverContext().get().getGraphRetriever(); + GraphRetriever graph = opContext.getRetrieverContext().getGraphRetriever(); final ArrayList> futureList = new ArrayList<>(); RelatedEntitiesScrollResult result = graph.scrollRelatedEntities( @@ -165,7 +165,7 @@ private Callable processBatch( return () -> { StopWatch stopWatch = new StopWatch(); stopWatch.start(); - AspectRetriever aspectRetriever = opContext.getAspectRetrieverOpt().get(); + AspectRetriever aspectRetriever = opContext.getAspectRetriever(); log.info("Batch {} for BA:{} started", batchNumber, entityKey); ExecutionResult executionResult = new ExecutionResult(); executionResult.setBatchNumber(batchNumber); diff --git a/metadata-io/src/main/java/com/linkedin/metadata/service/UpdateGraphIndicesService.java b/metadata-io/src/main/java/com/linkedin/metadata/service/UpdateGraphIndicesService.java index efe073fc00dfdc..4b09bc00efb61a 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/service/UpdateGraphIndicesService.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/service/UpdateGraphIndicesService.java @@ -94,8 +94,7 @@ public UpdateGraphIndicesService( public void handleChangeEvent( @Nonnull OperationContext opContext, @Nonnull final MetadataChangeLog event) { try { - MCLItemImpl mclItem = - MCLItemImpl.builder().build(event, opContext.getAspectRetrieverOpt().get()); + MCLItemImpl mclItem = MCLItemImpl.builder().build(event, opContext.getAspectRetriever()); if (UPDATE_CHANGE_TYPES.contains(event.getChangeType())) { handleUpdateChangeEvent(opContext, mclItem); diff --git a/metadata-io/src/main/java/com/linkedin/metadata/service/UpdateIndicesService.java b/metadata-io/src/main/java/com/linkedin/metadata/service/UpdateIndicesService.java index 187ef3e8c62290..c5fc9ebdac9fa6 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/service/UpdateIndicesService.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/service/UpdateIndicesService.java @@ 
-121,11 +121,10 @@ public UpdateIndicesService( public void handleChangeEvent( @Nonnull OperationContext opContext, @Nonnull final MetadataChangeLog event) { try { - MCLItemImpl batch = - MCLItemImpl.builder().build(event, opContext.getAspectRetrieverOpt().get()); + MCLItemImpl batch = MCLItemImpl.builder().build(event, opContext.getAspectRetriever()); Stream sideEffects = - AspectsBatch.applyMCLSideEffects(List.of(batch), opContext.getRetrieverContext().get()); + AspectsBatch.applyMCLSideEffects(List.of(batch), opContext.getRetrieverContext()); for (MCLItem mclItem : Stream.concat(Stream.of(batch), sideEffects).collect(Collectors.toList())) { diff --git a/metadata-io/src/test/java/com/linkedin/metadata/AspectIngestionUtils.java b/metadata-io/src/test/java/com/linkedin/metadata/AspectIngestionUtils.java index 12b12cf105196e..fa6ab7932001b6 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/AspectIngestionUtils.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/AspectIngestionUtils.java @@ -46,12 +46,12 @@ public static Map ingestCorpUserKeyAspects( .recordTemplate(aspect) .auditStamp(AspectGenerationUtils.createAuditStamp()) .systemMetadata(AspectGenerationUtils.createSystemMetadata()) - .build(opContext.getAspectRetrieverOpt().get())); + .build(opContext.getAspectRetriever())); } entityService.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(items) .build(), true, @@ -83,12 +83,12 @@ public static Map ingestCorpUserInfoAspects( .recordTemplate(aspect) .auditStamp(AspectGenerationUtils.createAuditStamp()) .systemMetadata(AspectGenerationUtils.createSystemMetadata()) - .build(opContext.getAspectRetrieverOpt().get())); + .build(opContext.getAspectRetriever())); } entityService.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(items) .build(), true, @@ -121,12 +121,12 @@ public static Map ingestChartInfoAspects( .recordTemplate(aspect) .auditStamp(AspectGenerationUtils.createAuditStamp()) .systemMetadata(AspectGenerationUtils.createSystemMetadata()) - .build(opContext.getAspectRetrieverOpt().get())); + .build(opContext.getAspectRetriever())); } entityService.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(items) .build(), true, diff --git a/metadata-io/src/test/java/com/linkedin/metadata/aspect/hooks/IgnoreUnknownMutatorTest.java b/metadata-io/src/test/java/com/linkedin/metadata/aspect/hooks/IgnoreUnknownMutatorTest.java index 11a3153abcaeed..19be1eb14667d8 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/aspect/hooks/IgnoreUnknownMutatorTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/aspect/hooks/IgnoreUnknownMutatorTest.java @@ -16,7 +16,8 @@ import com.linkedin.data.template.StringMap; import com.linkedin.dataset.DatasetProperties; import com.linkedin.events.metadata.ChangeType; -import com.linkedin.metadata.aspect.AspectRetriever; +import com.linkedin.metadata.aspect.CachingAspectRetriever; +import com.linkedin.metadata.aspect.GraphRetriever; import com.linkedin.metadata.aspect.batch.MCPItem; import com.linkedin.metadata.aspect.plugins.config.AspectPluginConfig; import com.linkedin.metadata.entity.SearchRetriever; @@ -28,7 +29,6 @@ import com.linkedin.mxe.SystemMetadata; import 
com.linkedin.test.metadata.aspect.TestEntityRegistry; import io.datahubproject.metadata.context.RetrieverContext; -import io.datahubproject.test.metadata.context.TestOperationContexts; import java.net.URISyntaxException; import java.nio.charset.StandardCharsets; import java.util.List; @@ -53,17 +53,17 @@ public class IgnoreUnknownMutatorTest { private static final Urn TEST_DATASET_URN = UrnUtils.getUrn( "urn:li:dataset:(urn:li:dataPlatform:postgres,calm-pagoda-323403.jaffle_shop.customers,PROD)"); - private AspectRetriever mockAspectRetriever; + private CachingAspectRetriever mockAspectRetriever; private RetrieverContext retrieverContext; @BeforeMethod public void setup() { - mockAspectRetriever = mock(AspectRetriever.class); + mockAspectRetriever = mock(CachingAspectRetriever.class); retrieverContext = RetrieverContext.builder() .searchRetriever(mock(SearchRetriever.class)) - .aspectRetriever(mockAspectRetriever) - .graphRetriever(TestOperationContexts.emptyGraphRetriever) + .cachingAspectRetriever(mockAspectRetriever) + .graphRetriever(GraphRetriever.EMPTY) .build(); } diff --git a/metadata-io/src/test/java/com/linkedin/metadata/aspect/utils/DefaultAspectsUtilTest.java b/metadata-io/src/test/java/com/linkedin/metadata/aspect/utils/DefaultAspectsUtilTest.java index 04aff4edf456d9..e7ed2671131592 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/aspect/utils/DefaultAspectsUtilTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/aspect/utils/DefaultAspectsUtilTest.java @@ -56,8 +56,7 @@ public void testAdditionalChanges() { DefaultAspectsUtil.getAdditionalChanges( opContext, AspectsBatchImpl.builder() - .mcps( - List.of(proposal1), new AuditStamp(), opContext.getRetrieverContext().get()) + .mcps(List.of(proposal1), new AuditStamp(), opContext.getRetrieverContext()) .build() .getMCPItems(), entityServiceImpl, diff --git a/metadata-io/src/test/java/com/linkedin/metadata/dataproducts/sideeffects/DataProductUnsetSideEffectTest.java b/metadata-io/src/test/java/com/linkedin/metadata/dataproducts/sideeffects/DataProductUnsetSideEffectTest.java index 976b165fea53df..215e1e2431efa0 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/dataproducts/sideeffects/DataProductUnsetSideEffectTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/dataproducts/sideeffects/DataProductUnsetSideEffectTest.java @@ -15,7 +15,7 @@ import com.linkedin.dataproduct.DataProductAssociationArray; import com.linkedin.dataproduct.DataProductProperties; import com.linkedin.events.metadata.ChangeType; -import com.linkedin.metadata.aspect.AspectRetriever; +import com.linkedin.metadata.aspect.CachingAspectRetriever; import com.linkedin.metadata.aspect.GraphRetriever; import com.linkedin.metadata.aspect.SystemAspect; import com.linkedin.metadata.aspect.batch.MCPItem; @@ -75,12 +75,12 @@ public class DataProductUnsetSideEffectTest { .build())) .build(); - private AspectRetriever mockAspectRetriever; + private CachingAspectRetriever mockAspectRetriever; private RetrieverContext retrieverContext; @BeforeMethod public void setup() { - mockAspectRetriever = mock(AspectRetriever.class); + mockAspectRetriever = mock(CachingAspectRetriever.class); when(mockAspectRetriever.getEntityRegistry()).thenReturn(TEST_REGISTRY); GraphRetriever graphRetriever = mock(GraphRetriever.class); RelatedEntities relatedEntities = @@ -139,7 +139,7 @@ public void setup() { retrieverContext = RetrieverContext.builder() .searchRetriever(mock(SearchRetriever.class)) - .aspectRetriever(mockAspectRetriever) + 
.cachingAspectRetriever(mockAspectRetriever) .graphRetriever(graphRetriever) .build(); } diff --git a/metadata-io/src/test/java/com/linkedin/metadata/entity/EbeanEntityServiceTest.java b/metadata-io/src/test/java/com/linkedin/metadata/entity/EbeanEntityServiceTest.java index 0386031cbcad86..88f84ee94c8ee7 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/entity/EbeanEntityServiceTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/entity/EbeanEntityServiceTest.java @@ -19,6 +19,7 @@ import com.linkedin.metadata.AspectGenerationUtils; import com.linkedin.metadata.Constants; import com.linkedin.metadata.EbeanTestUtils; +import com.linkedin.metadata.aspect.GraphRetriever; import com.linkedin.metadata.config.EbeanConfiguration; import com.linkedin.metadata.config.PreProcessHooks; import com.linkedin.metadata.entity.ebean.EbeanAspectDao; @@ -98,12 +99,15 @@ public void setupTest() { .entityService(_entityServiceImpl) .entityRegistry(_testEntityRegistry) .build()) - .graphRetriever(TestOperationContexts.emptyGraphRetriever) - .searchRetriever(TestOperationContexts.emptySearchRetriever) + .cachingAspectRetriever( + TestOperationContexts.emptyActiveUsersAspectRetriever( + () -> _testEntityRegistry)) + .graphRetriever(GraphRetriever.EMPTY) + .searchRetriever(SearchRetriever.EMPTY) .build(), null, opContext -> - ((EntityServiceAspectRetriever) opContext.getAspectRetrieverOpt().get()) + ((EntityServiceAspectRetriever) opContext.getAspectRetriever()) .setSystemOperationContext(opContext), null); } @@ -152,25 +156,25 @@ public void testIngestListLatestAspects() throws AssertionError { .recordTemplate(writeAspect1) .systemMetadata(metadata1) .auditStamp(TEST_AUDIT_STAMP) - .build(TestOperationContexts.emptyAspectRetriever(null)), + .build(TestOperationContexts.emptyActiveUsersAspectRetriever(null)), ChangeItemImpl.builder() .urn(entityUrn2) .aspectName(aspectName) .recordTemplate(writeAspect2) .systemMetadata(metadata1) .auditStamp(TEST_AUDIT_STAMP) - .build(TestOperationContexts.emptyAspectRetriever(null)), + .build(TestOperationContexts.emptyActiveUsersAspectRetriever(null)), ChangeItemImpl.builder() .urn(entityUrn3) .aspectName(aspectName) .recordTemplate(writeAspect3) .systemMetadata(metadata1) .auditStamp(TEST_AUDIT_STAMP) - .build(TestOperationContexts.emptyAspectRetriever(null))); + .build(TestOperationContexts.emptyActiveUsersAspectRetriever(null))); _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(items) .build(), true, @@ -230,25 +234,25 @@ public void testIngestListUrns() throws AssertionError { .recordTemplate(writeAspect1) .systemMetadata(metadata1) .auditStamp(TEST_AUDIT_STAMP) - .build(TestOperationContexts.emptyAspectRetriever(null)), + .build(TestOperationContexts.emptyActiveUsersAspectRetriever(null)), ChangeItemImpl.builder() .urn(entityUrn2) .aspectName(aspectName) .recordTemplate(writeAspect2) .systemMetadata(metadata1) .auditStamp(TEST_AUDIT_STAMP) - .build(TestOperationContexts.emptyAspectRetriever(null)), + .build(TestOperationContexts.emptyActiveUsersAspectRetriever(null)), ChangeItemImpl.builder() .urn(entityUrn3) .aspectName(aspectName) .recordTemplate(writeAspect3) .systemMetadata(metadata1) .auditStamp(TEST_AUDIT_STAMP) - .build(TestOperationContexts.emptyAspectRetriever(null))); + .build(TestOperationContexts.emptyActiveUsersAspectRetriever(null))); _entityServiceImpl.ingestAspects( opContext, 
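Several test fixtures above are updated the same way: the AspectRetriever mock becomes a CachingAspectRetriever, and the RetrieverContext is assembled with cachingAspectRetriever(...) plus empty graph and search retrievers. A condensed, hedged version of that wiring (the factory class name is illustrative; the imports follow those visible in the hunks):

    import static org.mockito.Mockito.mock;
    import static org.mockito.Mockito.when;

    import com.linkedin.metadata.aspect.CachingAspectRetriever;
    import com.linkedin.metadata.aspect.GraphRetriever;
    import com.linkedin.metadata.entity.SearchRetriever;
    import com.linkedin.metadata.models.registry.EntityRegistry;
    import io.datahubproject.metadata.context.RetrieverContext;

    public final class TestRetrieverContextFactory {
      static RetrieverContext build(EntityRegistry registry) {
        CachingAspectRetriever cachingAspectRetriever = mock(CachingAspectRetriever.class);
        when(cachingAspectRetriever.getEntityRegistry()).thenReturn(registry);
        return RetrieverContext.builder()
            .cachingAspectRetriever(cachingAspectRetriever)
            .graphRetriever(GraphRetriever.EMPTY)
            .searchRetriever(mock(SearchRetriever.class))
            .build();
      }
    }

EbeanEntityServiceTest uses the same shape, but with TestOperationContexts.emptyActiveUsersAspectRetriever(...) and SearchRetriever.EMPTY in place of the mocks.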
AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(items) .build(), true, @@ -310,11 +314,11 @@ public void testSystemMetadataDuplicateKey() throws Exception { .recordTemplate(new Status().setRemoved(true)) .systemMetadata(systemMetadata) .auditStamp(TEST_AUDIT_STAMP) - .build(TestOperationContexts.emptyAspectRetriever(null)); + .build(TestOperationContexts.emptyActiveUsersAspectRetriever(null)); _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(List.of(item)) .build(), false, @@ -356,7 +360,7 @@ public void testSystemMetadataDuplicateKey() throws Exception { _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items( List.of( ChangeItemImpl.builder() @@ -365,7 +369,7 @@ public void testSystemMetadataDuplicateKey() throws Exception { .recordTemplate(new Status().setRemoved(false)) .systemMetadata(systemMetadata) .auditStamp(TEST_AUDIT_STAMP) - .build(TestOperationContexts.emptyAspectRetriever(null)))) + .build(TestOperationContexts.emptyActiveUsersAspectRetriever(null)))) .build(), false, true); @@ -600,7 +604,7 @@ public void run() { auditStamp.setTime(System.currentTimeMillis()); AspectsBatchImpl batch = AspectsBatchImpl.builder() - .mcps(mcps, auditStamp, operationContext.getRetrieverContext().get()) + .mcps(mcps, auditStamp, operationContext.getRetrieverContext()) .build(); entityService.ingestProposal(operationContext, batch, false); } diff --git a/metadata-io/src/test/java/com/linkedin/metadata/entity/EntityServiceTest.java b/metadata-io/src/test/java/com/linkedin/metadata/entity/EntityServiceTest.java index 2d59632e6f3c6d..c00632e5cf5424 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/entity/EntityServiceTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/entity/EntityServiceTest.java @@ -945,32 +945,32 @@ public void testRollbackAspect() throws AssertionError { .recordTemplate(writeAspect1) .systemMetadata(metadata1) .auditStamp(TEST_AUDIT_STAMP) - .build(opContext.getAspectRetrieverOpt().get()), + .build(opContext.getAspectRetriever()), ChangeItemImpl.builder() .urn(entityUrn2) .aspectName(aspectName) .recordTemplate(writeAspect2) .auditStamp(TEST_AUDIT_STAMP) .systemMetadata(metadata1) - .build(opContext.getAspectRetrieverOpt().get()), + .build(opContext.getAspectRetriever()), ChangeItemImpl.builder() .urn(entityUrn3) .aspectName(aspectName) .recordTemplate(writeAspect3) .auditStamp(TEST_AUDIT_STAMP) .systemMetadata(metadata1) - .build(opContext.getAspectRetrieverOpt().get()), + .build(opContext.getAspectRetriever()), ChangeItemImpl.builder() .urn(entityUrn1) .aspectName(aspectName) .recordTemplate(writeAspect1Overwrite) .systemMetadata(metadata2) .auditStamp(TEST_AUDIT_STAMP) - .build(opContext.getAspectRetrieverOpt().get())); + .build(opContext.getAspectRetriever())); _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(items) .build(), true, @@ -1037,25 +1037,25 @@ public void testRollbackKey() throws AssertionError { .recordTemplate(writeAspect1) .systemMetadata(metadata1) .auditStamp(TEST_AUDIT_STAMP) - .build(opContext.getAspectRetrieverOpt().get()), + 
.build(opContext.getAspectRetriever()), ChangeItemImpl.builder() .urn(entityUrn1) .aspectName(keyAspectName) .recordTemplate(writeKey1) .systemMetadata(metadata1) .auditStamp(TEST_AUDIT_STAMP) - .build(opContext.getAspectRetrieverOpt().get()), + .build(opContext.getAspectRetriever()), ChangeItemImpl.builder() .urn(entityUrn1) .aspectName(aspectName) .recordTemplate(writeAspect1Overwrite) .systemMetadata(metadata2) .auditStamp(TEST_AUDIT_STAMP) - .build(opContext.getAspectRetrieverOpt().get())); + .build(opContext.getAspectRetriever())); _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(items) .build(), true, @@ -1130,39 +1130,39 @@ public void testRollbackUrn() throws AssertionError { .recordTemplate(writeAspect1) .systemMetadata(metadata1) .auditStamp(TEST_AUDIT_STAMP) - .build(opContext.getAspectRetrieverOpt().get()), + .build(opContext.getAspectRetriever()), ChangeItemImpl.builder() .urn(entityUrn1) .aspectName(keyAspectName) .recordTemplate(writeKey1) .auditStamp(TEST_AUDIT_STAMP) .systemMetadata(metadata1) - .build(opContext.getAspectRetrieverOpt().get()), + .build(opContext.getAspectRetriever()), ChangeItemImpl.builder() .urn(entityUrn2) .aspectName(aspectName) .recordTemplate(writeAspect2) .auditStamp(TEST_AUDIT_STAMP) .systemMetadata(metadata1) - .build(opContext.getAspectRetrieverOpt().get()), + .build(opContext.getAspectRetriever()), ChangeItemImpl.builder() .urn(entityUrn3) .aspectName(aspectName) .recordTemplate(writeAspect3) .systemMetadata(metadata1) .auditStamp(TEST_AUDIT_STAMP) - .build(opContext.getAspectRetrieverOpt().get()), + .build(opContext.getAspectRetriever()), ChangeItemImpl.builder() .urn(entityUrn1) .aspectName(aspectName) .recordTemplate(writeAspect1Overwrite) .systemMetadata(metadata2) .auditStamp(TEST_AUDIT_STAMP) - .build(opContext.getAspectRetrieverOpt().get())); + .build(opContext.getAspectRetriever())); _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(items) .build(), true, @@ -1208,11 +1208,11 @@ public void testIngestGetLatestAspect() throws AssertionError { .recordTemplate(writeAspect1) .auditStamp(TEST_AUDIT_STAMP) .systemMetadata(metadata1) - .build(opContext.getAspectRetrieverOpt().get())); + .build(opContext.getAspectRetriever())); _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(items) .build(), true, @@ -1264,11 +1264,11 @@ public void testIngestGetLatestAspect() throws AssertionError { .recordTemplate(writeAspect2) .auditStamp(TEST_AUDIT_STAMP) .systemMetadata(metadata2) - .build(opContext.getAspectRetrieverOpt().get())); + .build(opContext.getAspectRetriever())); _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(items) .build(), true, @@ -1320,11 +1320,11 @@ public void testIngestGetLatestEnvelopedAspect() throws Exception { .recordTemplate(writeAspect1) .auditStamp(TEST_AUDIT_STAMP) .systemMetadata(metadata1) - .build(opContext.getAspectRetrieverOpt().get())); + .build(opContext.getAspectRetriever())); _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - 
.retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(items) .build(), true, @@ -1347,11 +1347,11 @@ public void testIngestGetLatestEnvelopedAspect() throws Exception { .recordTemplate(writeAspect2) .systemMetadata(metadata2) .auditStamp(TEST_AUDIT_STAMP) - .build(opContext.getAspectRetrieverOpt().get())); + .build(opContext.getAspectRetriever())); _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(items) .build(), true, @@ -1416,11 +1416,11 @@ public void testIngestSameAspect() throws AssertionError { .recordTemplate(writeAspect1) .systemMetadata(metadata1) .auditStamp(TEST_AUDIT_STAMP) - .build(opContext.getAspectRetrieverOpt().get())); + .build(opContext.getAspectRetriever())); _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(items) .build(), true, @@ -1472,11 +1472,11 @@ public void testIngestSameAspect() throws AssertionError { .recordTemplate(writeAspect2) .systemMetadata(metadata2) .auditStamp(TEST_AUDIT_STAMP) - .build(opContext.getAspectRetrieverOpt().get())); + .build(opContext.getAspectRetriever())); _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(items) .build(), true, @@ -1534,46 +1534,46 @@ public void testRetention() throws AssertionError { .recordTemplate(writeAspect1) .systemMetadata(AspectGenerationUtils.createSystemMetadata()) .auditStamp(TEST_AUDIT_STAMP) - .build(opContext.getAspectRetrieverOpt().get()), + .build(opContext.getAspectRetriever()), ChangeItemImpl.builder() .urn(entityUrn) .aspectName(aspectName) .recordTemplate(writeAspect1a) .systemMetadata(AspectGenerationUtils.createSystemMetadata()) .auditStamp(TEST_AUDIT_STAMP) - .build(opContext.getAspectRetrieverOpt().get()), + .build(opContext.getAspectRetriever()), ChangeItemImpl.builder() .urn(entityUrn) .aspectName(aspectName) .recordTemplate(writeAspect1b) .systemMetadata(AspectGenerationUtils.createSystemMetadata()) .auditStamp(TEST_AUDIT_STAMP) - .build(opContext.getAspectRetrieverOpt().get()), + .build(opContext.getAspectRetriever()), ChangeItemImpl.builder() .urn(entityUrn) .aspectName(aspectName2) .recordTemplate(writeAspect2) .systemMetadata(AspectGenerationUtils.createSystemMetadata()) .auditStamp(TEST_AUDIT_STAMP) - .build(opContext.getAspectRetrieverOpt().get()), + .build(opContext.getAspectRetriever()), ChangeItemImpl.builder() .urn(entityUrn) .aspectName(aspectName2) .recordTemplate(writeAspect2a) .systemMetadata(AspectGenerationUtils.createSystemMetadata()) .auditStamp(TEST_AUDIT_STAMP) - .build(opContext.getAspectRetrieverOpt().get()), + .build(opContext.getAspectRetriever()), ChangeItemImpl.builder() .urn(entityUrn) .aspectName(aspectName2) .recordTemplate(writeAspect2b) .systemMetadata(AspectGenerationUtils.createSystemMetadata()) .auditStamp(TEST_AUDIT_STAMP) - .build(opContext.getAspectRetrieverOpt().get())); + .build(opContext.getAspectRetriever())); _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(items) .build(), true, @@ -1610,18 +1610,18 @@ public void testRetention() 
throws AssertionError { .recordTemplate(writeAspect1c) .systemMetadata(AspectGenerationUtils.createSystemMetadata()) .auditStamp(TEST_AUDIT_STAMP) - .build(opContext.getAspectRetrieverOpt().get()), + .build(opContext.getAspectRetriever()), ChangeItemImpl.builder() .urn(entityUrn) .aspectName(aspectName2) .recordTemplate(writeAspect2c) .systemMetadata(AspectGenerationUtils.createSystemMetadata()) .auditStamp(TEST_AUDIT_STAMP) - .build(opContext.getAspectRetrieverOpt().get())); + .build(opContext.getAspectRetriever())); _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(items) .build(), true, @@ -1982,8 +1982,7 @@ public void testStructuredPropertyIngestProposal() throws Exception { stream .map( entityAspect -> - EntityUtils.toSystemAspect( - opContext.getRetrieverContext().get(), entityAspect) + EntityUtils.toSystemAspect(opContext.getRetrieverContext(), entityAspect) .get() .getAspect(StructuredPropertyDefinition.class)) .collect(Collectors.toSet()); @@ -1995,7 +1994,10 @@ public void testStructuredPropertyIngestProposal() throws Exception { SystemEntityClient mockSystemEntityClient = Mockito.mock(SystemEntityClient.class); Mockito.when( mockSystemEntityClient.getLatestAspectObject( - any(OperationContext.class), eq(firstPropertyUrn), eq("propertyDefinition"))) + any(OperationContext.class), + eq(firstPropertyUrn), + eq("propertyDefinition"), + anyBoolean())) .thenReturn(new com.linkedin.entity.Aspect(structuredPropertyDefinition.data())); // Add a value for that property @@ -2062,8 +2064,7 @@ public void testStructuredPropertyIngestProposal() throws Exception { stream .map( entityAspect -> - EntityUtils.toSystemAspect( - opContext.getRetrieverContext().get(), entityAspect) + EntityUtils.toSystemAspect(opContext.getRetrieverContext(), entityAspect) .get() .getAspect(StructuredPropertyDefinition.class)) .collect(Collectors.toSet()); @@ -2074,7 +2075,10 @@ public void testStructuredPropertyIngestProposal() throws Exception { Mockito.when( mockSystemEntityClient.getLatestAspectObject( - any(OperationContext.class), eq(secondPropertyUrn), eq("propertyDefinition"))) + any(OperationContext.class), + eq(secondPropertyUrn), + eq("propertyDefinition"), + anyBoolean())) .thenReturn(new com.linkedin.entity.Aspect(secondDefinition.data())); // Get existing value for first structured property @@ -2209,7 +2213,7 @@ public void testBatchDuplicate() throws Exception { .recordTemplate(new Status().setRemoved(true)) .systemMetadata(systemMetadata.copy()) .auditStamp(TEST_AUDIT_STAMP) - .build(TestOperationContexts.emptyAspectRetriever(null)); + .build(TestOperationContexts.emptyActiveUsersAspectRetriever(null)); ChangeItemImpl item2 = ChangeItemImpl.builder() .urn(entityUrn) @@ -2217,11 +2221,11 @@ public void testBatchDuplicate() throws Exception { .recordTemplate(new Status().setRemoved(false)) .systemMetadata(systemMetadata.copy()) .auditStamp(TEST_AUDIT_STAMP) - .build(TestOperationContexts.emptyAspectRetriever(null)); + .build(TestOperationContexts.emptyActiveUsersAspectRetriever(null)); _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(List.of(item1, item2)) .build(), false, @@ -2269,7 +2273,7 @@ public void testBatchPatchWithTrailingNoOp() throws Exception { .setTags(new TagAssociationArray(new 
TagAssociation().setTag(tag1)))) .systemMetadata(systemMetadata.copy()) .auditStamp(TEST_AUDIT_STAMP) - .build(TestOperationContexts.emptyAspectRetriever(null)); + .build(TestOperationContexts.emptyActiveUsersAspectRetriever(null)); PatchItemImpl patchAdd2 = PatchItemImpl.builder() @@ -2311,7 +2315,7 @@ public void testBatchPatchWithTrailingNoOp() throws Exception { _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(List.of(initialAspectTag1)) .build(), false, @@ -2320,7 +2324,7 @@ public void testBatchPatchWithTrailingNoOp() throws Exception { _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(List.of(patchAdd2, patchRemoveNonExistent)) .build(), false, @@ -2368,7 +2372,7 @@ public void testBatchPatchAdd() throws Exception { .setTags(new TagAssociationArray(new TagAssociation().setTag(tag1)))) .systemMetadata(systemMetadata.copy()) .auditStamp(TEST_AUDIT_STAMP) - .build(TestOperationContexts.emptyAspectRetriever(null)); + .build(TestOperationContexts.emptyActiveUsersAspectRetriever(null)); PatchItemImpl patchAdd3 = PatchItemImpl.builder() @@ -2428,7 +2432,7 @@ public void testBatchPatchAdd() throws Exception { _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(List.of(initialAspectTag1)) .build(), false, @@ -2437,7 +2441,7 @@ public void testBatchPatchAdd() throws Exception { _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(List.of(patchAdd3, patchAdd2, patchAdd1)) .build(), false, @@ -2491,7 +2495,7 @@ public void testBatchPatchAddDuplicate() throws Exception { .recordTemplate(new GlobalTags().setTags(new TagAssociationArray(initialTags))) .systemMetadata(systemMetadata.copy()) .auditStamp(TEST_AUDIT_STAMP) - .build(TestOperationContexts.emptyAspectRetriever(null)); + .build(TestOperationContexts.emptyActiveUsersAspectRetriever(null)); PatchItemImpl patchAdd2 = PatchItemImpl.builder() @@ -2516,7 +2520,7 @@ public void testBatchPatchAddDuplicate() throws Exception { _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(List.of(initialAspectTag1)) .build(), false, @@ -2525,7 +2529,7 @@ public void testBatchPatchAddDuplicate() throws Exception { _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(List.of(patchAdd2, patchAdd2)) // duplicate .build(), false, @@ -2581,7 +2585,7 @@ public void testPatchRemoveNonExistent() throws Exception { _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(List.of(patchRemove)) .build(), false, @@ -2638,7 +2642,7 @@ public void testPatchAddNonExistent() throws Exception { _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) 
+ .retrieverContext(opContext.getRetrieverContext()) .items(List.of(patchAdd)) .build(), false, diff --git a/metadata-io/src/test/java/com/linkedin/metadata/entity/cassandra/CassandraEntityServiceTest.java b/metadata-io/src/test/java/com/linkedin/metadata/entity/cassandra/CassandraEntityServiceTest.java index 550f55e6bfd0b9..b4fbfecc9d60d3 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/entity/cassandra/CassandraEntityServiceTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/entity/cassandra/CassandraEntityServiceTest.java @@ -10,11 +10,13 @@ import com.linkedin.metadata.AspectGenerationUtils; import com.linkedin.metadata.AspectIngestionUtils; import com.linkedin.metadata.CassandraTestUtils; +import com.linkedin.metadata.aspect.GraphRetriever; import com.linkedin.metadata.config.PreProcessHooks; import com.linkedin.metadata.entity.EntityServiceAspectRetriever; import com.linkedin.metadata.entity.EntityServiceImpl; import com.linkedin.metadata.entity.EntityServiceTest; import com.linkedin.metadata.entity.ListResult; +import com.linkedin.metadata.entity.SearchRetriever; import com.linkedin.metadata.event.EventProducer; import com.linkedin.metadata.key.CorpUserKey; import com.linkedin.metadata.models.registry.EntityRegistryException; @@ -93,12 +95,15 @@ private void configureComponents() { .entityService(_entityServiceImpl) .entityRegistry(_testEntityRegistry) .build()) - .graphRetriever(TestOperationContexts.emptyGraphRetriever) - .searchRetriever(TestOperationContexts.emptySearchRetriever) + .cachingAspectRetriever( + TestOperationContexts.emptyActiveUsersAspectRetriever( + () -> _testEntityRegistry)) + .graphRetriever(GraphRetriever.EMPTY) + .searchRetriever(SearchRetriever.EMPTY) .build(), null, opContext -> - ((EntityServiceAspectRetriever) opContext.getAspectRetrieverOpt().get()) + ((EntityServiceAspectRetriever) opContext.getAspectRetriever()) .setSystemOperationContext(opContext), null); } diff --git a/metadata-io/src/test/java/com/linkedin/metadata/entity/ebean/batch/ChangeItemImplTest.java b/metadata-io/src/test/java/com/linkedin/metadata/entity/ebean/batch/ChangeItemImplTest.java index 3f6b301e72aa5a..0a867ae3c8f2e0 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/entity/ebean/batch/ChangeItemImplTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/entity/ebean/batch/ChangeItemImplTest.java @@ -26,7 +26,7 @@ public void testBatchDuplicate() throws Exception { .recordTemplate(new Status().setRemoved(true)) .systemMetadata(systemMetadata.copy()) .auditStamp(TEST_AUDIT_STAMP) - .build(TestOperationContexts.emptyAspectRetriever(null)); + .build(TestOperationContexts.emptyActiveUsersAspectRetriever(null)); ChangeItemImpl item2 = ChangeItemImpl.builder() .urn(entityUrn) @@ -34,7 +34,7 @@ public void testBatchDuplicate() throws Exception { .recordTemplate(new Status().setRemoved(false)) .systemMetadata(systemMetadata.copy()) .auditStamp(TEST_AUDIT_STAMP) - .build(TestOperationContexts.emptyAspectRetriever(null)); + .build(TestOperationContexts.emptyActiveUsersAspectRetriever(null)); assertFalse(item1.isDatabaseDuplicateOf(item2)); } diff --git a/metadata-io/src/test/java/com/linkedin/metadata/recommendation/RecommendationsServiceTest.java b/metadata-io/src/test/java/com/linkedin/metadata/recommendation/RecommendationsServiceTest.java index ca42f0327c86db..8f68f119cb0b7d 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/recommendation/RecommendationsServiceTest.java +++ 
b/metadata-io/src/test/java/com/linkedin/metadata/recommendation/RecommendationsServiceTest.java @@ -11,6 +11,7 @@ import com.linkedin.metadata.recommendation.ranker.SimpleRecommendationRanker; import io.datahubproject.test.metadata.context.TestOperationContexts; import java.net.URISyntaxException; +import java.nio.file.AccessDeniedException; import java.util.List; import java.util.stream.Collectors; import org.testng.annotations.Test; @@ -74,7 +75,7 @@ private List getContentFromUrns(List urns) { } @Test - public void testService() throws URISyntaxException { + public void testService() throws URISyntaxException, AccessDeniedException { // Test non-eligible and empty RecommendationsService service = new RecommendationsService(ImmutableList.of(nonEligibleSource, emptySource), ranker); diff --git a/metadata-io/src/test/java/com/linkedin/metadata/schemafields/sideeffects/SchemaFieldSideEffectTest.java b/metadata-io/src/test/java/com/linkedin/metadata/schemafields/sideeffects/SchemaFieldSideEffectTest.java index 1661f5f02ee593..fa895cb4540117 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/schemafields/sideeffects/SchemaFieldSideEffectTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/schemafields/sideeffects/SchemaFieldSideEffectTest.java @@ -21,7 +21,8 @@ import com.linkedin.data.ByteString; import com.linkedin.entity.Aspect; import com.linkedin.events.metadata.ChangeType; -import com.linkedin.metadata.aspect.AspectRetriever; +import com.linkedin.metadata.aspect.CachingAspectRetriever; +import com.linkedin.metadata.aspect.GraphRetriever; import com.linkedin.metadata.aspect.batch.MCLItem; import com.linkedin.metadata.aspect.batch.MCPItem; import com.linkedin.metadata.aspect.plugins.config.AspectPluginConfig; @@ -46,7 +47,6 @@ import com.linkedin.test.metadata.aspect.TestEntityRegistry; import com.linkedin.test.metadata.aspect.batch.TestMCP; import io.datahubproject.metadata.context.RetrieverContext; -import io.datahubproject.test.metadata.context.TestOperationContexts; import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.List; @@ -87,18 +87,18 @@ public class SchemaFieldSideEffectTest { .build())) .build(); - private AspectRetriever mockAspectRetriever; + private CachingAspectRetriever mockAspectRetriever; private RetrieverContext retrieverContext; @BeforeMethod public void setup() { - mockAspectRetriever = mock(AspectRetriever.class); + mockAspectRetriever = mock(CachingAspectRetriever.class); when(mockAspectRetriever.getEntityRegistry()).thenReturn(TEST_REGISTRY); retrieverContext = RetrieverContext.builder() .searchRetriever(mock(SearchRetriever.class)) - .aspectRetriever(mockAspectRetriever) - .graphRetriever(TestOperationContexts.emptyGraphRetriever) + .cachingAspectRetriever(mockAspectRetriever) + .graphRetriever(GraphRetriever.EMPTY) .build(); } diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/query/filter/ContainerExpansionRewriterTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/query/filter/ContainerExpansionRewriterTest.java index fd768424e13c19..1825b65a18ab19 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/query/filter/ContainerExpansionRewriterTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/query/filter/ContainerExpansionRewriterTest.java @@ -20,6 +20,7 @@ import com.linkedin.metadata.aspect.models.graph.RelatedEntities; import com.linkedin.metadata.aspect.models.graph.RelatedEntitiesScrollResult; import 
com.linkedin.metadata.config.search.QueryFilterRewriterConfiguration; +import com.linkedin.metadata.entity.SearchRetriever; import com.linkedin.metadata.models.registry.EntityRegistry; import com.linkedin.metadata.query.SearchFlags; import com.linkedin.metadata.query.filter.Condition; @@ -71,8 +72,10 @@ public void init() { () -> io.datahubproject.metadata.context.RetrieverContext.builder() .aspectRetriever(mockAspectRetriever) + .cachingAspectRetriever( + TestOperationContexts.emptyActiveUsersAspectRetriever(() -> entityRegistry)) .graphRetriever(mockGraphRetriever) - .searchRetriever(TestOperationContexts.emptySearchRetriever) + .searchRetriever(SearchRetriever.EMPTY) .build(), null, null, diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/query/filter/DomainExpansionRewriterTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/query/filter/DomainExpansionRewriterTest.java index 8741e24b1bca50..de375271ed6602 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/query/filter/DomainExpansionRewriterTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/query/filter/DomainExpansionRewriterTest.java @@ -13,13 +13,14 @@ import static org.mockito.Mockito.when; import static org.testng.Assert.assertEquals; -import com.linkedin.metadata.aspect.AspectRetriever; +import com.linkedin.metadata.aspect.CachingAspectRetriever; import com.linkedin.metadata.aspect.GraphRetriever; import com.linkedin.metadata.aspect.RetrieverContext; import com.linkedin.metadata.aspect.models.graph.Edge; import com.linkedin.metadata.aspect.models.graph.RelatedEntities; import com.linkedin.metadata.aspect.models.graph.RelatedEntitiesScrollResult; import com.linkedin.metadata.config.search.QueryFilterRewriterConfiguration; +import com.linkedin.metadata.entity.SearchRetriever; import com.linkedin.metadata.models.registry.EntityRegistry; import com.linkedin.metadata.query.SearchFlags; import com.linkedin.metadata.query.filter.Condition; @@ -54,7 +55,7 @@ public class DomainExpansionRewriterTest @BeforeMethod public void init() { EntityRegistry entityRegistry = new TestEntityRegistry(); - AspectRetriever mockAspectRetriever = mock(AspectRetriever.class); + CachingAspectRetriever mockAspectRetriever = mock(CachingAspectRetriever.class); when(mockAspectRetriever.getEntityRegistry()).thenReturn(entityRegistry); mockGraphRetriever = spy(GraphRetriever.class); @@ -71,8 +72,10 @@ public void init() { () -> io.datahubproject.metadata.context.RetrieverContext.builder() .aspectRetriever(mockAspectRetriever) + .cachingAspectRetriever( + TestOperationContexts.emptyActiveUsersAspectRetriever(() -> entityRegistry)) .graphRetriever(mockGraphRetriever) - .searchRetriever(TestOperationContexts.emptySearchRetriever) + .searchRetriever(SearchRetriever.EMPTY) .build(), null, null, diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/query/request/AggregationQueryBuilderTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/query/request/AggregationQueryBuilderTest.java index c68997e25bcff7..d6f5f9c3eedbe7 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/query/request/AggregationQueryBuilderTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/query/request/AggregationQueryBuilderTest.java @@ -18,6 +18,7 @@ import com.linkedin.data.template.StringArray; import com.linkedin.entity.Aspect; import com.linkedin.metadata.aspect.AspectRetriever; +import com.linkedin.metadata.aspect.CachingAspectRetriever; import 
com.linkedin.metadata.config.search.SearchConfiguration; import com.linkedin.metadata.models.EntitySpec; import com.linkedin.metadata.models.annotation.SearchableAnnotation; @@ -49,8 +50,8 @@ public class AggregationQueryBuilderTest { - private static AspectRetriever aspectRetriever; - private static AspectRetriever aspectRetrieverV1; + private static CachingAspectRetriever aspectRetriever; + private static CachingAspectRetriever aspectRetrieverV1; private static String DEFAULT_FILTER = "_index"; @BeforeClass @@ -61,7 +62,7 @@ public void setup() throws RemoteInvocationException, URISyntaxException { Urn.createFromString("urn:li:structuredProperty:under.scores.and.dots_make_a_mess"); // legacy - aspectRetriever = mock(AspectRetriever.class); + aspectRetriever = mock(CachingAspectRetriever.class); when(aspectRetriever.getEntityRegistry()) .thenReturn(TestOperationContexts.defaultEntityRegistry()); @@ -106,7 +107,7 @@ public void setup() throws RemoteInvocationException, URISyntaxException { new Aspect(structPropUnderscoresAndDotsDefinition.data())))); // V1 - aspectRetrieverV1 = mock(AspectRetriever.class); + aspectRetrieverV1 = mock(CachingAspectRetriever.class); when(aspectRetrieverV1.getEntityRegistry()) .thenReturn(TestOperationContexts.defaultEntityRegistry()); diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/query/request/SearchRequestHandlerTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/query/request/SearchRequestHandlerTest.java index 393ca3ca5d4a64..e51511699e345a 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/query/request/SearchRequestHandlerTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/query/request/SearchRequestHandlerTest.java @@ -662,6 +662,7 @@ public void testInvalidStructuredProperty() { TestOperationContexts.systemContextNoSearchAuthorization( RetrieverContext.builder() .aspectRetriever(aspectRetriever) + .cachingAspectRetriever(TestOperationContexts.emptyActiveUsersAspectRetriever(null)) .graphRetriever(mock(GraphRetriever.class)) .searchRetriever(mock(SearchRetriever.class)) .build()); diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/transformer/SearchDocumentTransformerTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/transformer/SearchDocumentTransformerTest.java index 2c5bcd1294fa15..65b73b7425b743 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/transformer/SearchDocumentTransformerTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/transformer/SearchDocumentTransformerTest.java @@ -247,6 +247,9 @@ public void testSetSearchableRefValue() throws URISyntaxException, RemoteInvocat TestOperationContexts.systemContextNoSearchAuthorization( RetrieverContext.builder() .aspectRetriever(aspectRetriever) + .cachingAspectRetriever( + TestOperationContexts.emptyActiveUsersAspectRetriever( + () -> TEST_ENTITY_REGISTRY)) .graphRetriever(mock(GraphRetriever.class)) .searchRetriever(mock(SearchRetriever.class)) .build()); @@ -301,6 +304,9 @@ public void testSetSearchableRefValue_RuntimeException() TestOperationContexts.systemContextNoSearchAuthorization( RetrieverContext.builder() .aspectRetriever(aspectRetriever) + .cachingAspectRetriever( + TestOperationContexts.emptyActiveUsersAspectRetriever( + () -> TEST_ENTITY_REGISTRY)) .graphRetriever(mock(GraphRetriever.class)) .searchRetriever(mock(SearchRetriever.class)) .build()); @@ -337,6 +343,9 @@ public void testSetSearchableRefValue_RuntimeException_URNExist() 
TestOperationContexts.systemContextNoSearchAuthorization( RetrieverContext.builder() .aspectRetriever(aspectRetriever) + .cachingAspectRetriever( + TestOperationContexts.emptyActiveUsersAspectRetriever( + () -> TEST_ENTITY_REGISTRY)) .graphRetriever(mock(GraphRetriever.class)) .searchRetriever(mock(SearchRetriever.class)) .build()); @@ -369,6 +378,9 @@ void testSetSearchableRefValue_WithInvalidURN() TestOperationContexts.systemContextNoSearchAuthorization( RetrieverContext.builder() .aspectRetriever(aspectRetriever) + .cachingAspectRetriever( + TestOperationContexts.emptyActiveUsersAspectRetriever( + () -> TEST_ENTITY_REGISTRY)) .graphRetriever(mock(GraphRetriever.class)) .searchRetriever(mock(SearchRetriever.class)) .build()); diff --git a/metadata-io/src/test/java/com/linkedin/metadata/structuredproperties/hooks/PropertyDefinitionDeleteSideEffectTest.java b/metadata-io/src/test/java/com/linkedin/metadata/structuredproperties/hooks/PropertyDefinitionDeleteSideEffectTest.java index b1b716c5604816..9a0a82c7f9f49d 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/structuredproperties/hooks/PropertyDefinitionDeleteSideEffectTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/structuredproperties/hooks/PropertyDefinitionDeleteSideEffectTest.java @@ -18,7 +18,8 @@ import com.linkedin.common.urn.UrnUtils; import com.linkedin.entity.Aspect; import com.linkedin.events.metadata.ChangeType; -import com.linkedin.metadata.aspect.AspectRetriever; +import com.linkedin.metadata.aspect.CachingAspectRetriever; +import com.linkedin.metadata.aspect.GraphRetriever; import com.linkedin.metadata.aspect.batch.MCPItem; import com.linkedin.metadata.aspect.batch.PatchMCP; import com.linkedin.metadata.aspect.plugins.config.AspectPluginConfig; @@ -36,7 +37,6 @@ import com.linkedin.test.metadata.aspect.TestEntityRegistry; import com.linkedin.test.metadata.aspect.batch.TestMCL; import io.datahubproject.metadata.context.RetrieverContext; -import io.datahubproject.test.metadata.context.TestOperationContexts; import jakarta.json.Json; import jakarta.json.JsonPatch; import java.util.List; @@ -76,13 +76,13 @@ public class PropertyDefinitionDeleteSideEffectTest { private static final Urn TEST_DATASET_URN = UrnUtils.getUrn( "urn:li:dataset:(urn:li:dataPlatform:postgres,calm-pagoda-323403.jaffle_shop.customers,PROD)"); - private AspectRetriever mockAspectRetriever; + private CachingAspectRetriever mockAspectRetriever; private SearchRetriever mockSearchRetriever; private RetrieverContext retrieverContext; @BeforeMethod public void setup() { - mockAspectRetriever = mock(AspectRetriever.class); + mockAspectRetriever = mock(CachingAspectRetriever.class); when(mockAspectRetriever.getEntityRegistry()).thenReturn(TEST_REGISTRY); when(mockAspectRetriever.getLatestAspectObject( eq(TEST_PROPERTY_URN), eq(STRUCTURED_PROPERTY_DEFINITION_ASPECT_NAME))) @@ -101,8 +101,8 @@ public void setup() { retrieverContext = RetrieverContext.builder() .searchRetriever(mockSearchRetriever) - .aspectRetriever(mockAspectRetriever) - .graphRetriever(TestOperationContexts.emptyGraphRetriever) + .cachingAspectRetriever(mockAspectRetriever) + .graphRetriever(GraphRetriever.EMPTY) .build(); } diff --git a/metadata-io/src/test/java/com/linkedin/metadata/structuredproperties/validators/ShowPropertyAsBadgeValidatorTest.java b/metadata-io/src/test/java/com/linkedin/metadata/structuredproperties/validators/ShowPropertyAsBadgeValidatorTest.java index 2503faa00f6e71..6e8886f495c95a 100644 --- 
a/metadata-io/src/test/java/com/linkedin/metadata/structuredproperties/validators/ShowPropertyAsBadgeValidatorTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/structuredproperties/validators/ShowPropertyAsBadgeValidatorTest.java @@ -58,7 +58,7 @@ public void setup() { mockGraphRetriever = Mockito.mock(GraphRetriever.class); retrieverContext = io.datahubproject.metadata.context.RetrieverContext.builder() - .aspectRetriever(mockAspectRetriever) + .cachingAspectRetriever(mockAspectRetriever) .searchRetriever(mockSearchRetriever) .graphRetriever(mockGraphRetriever) .build(); diff --git a/metadata-io/src/test/java/io/datahubproject/test/DataGenerator.java b/metadata-io/src/test/java/io/datahubproject/test/DataGenerator.java index 3acd2bf3413578..02cd28eb202e94 100644 --- a/metadata-io/src/test/java/io/datahubproject/test/DataGenerator.java +++ b/metadata-io/src/test/java/io/datahubproject/test/DataGenerator.java @@ -171,10 +171,7 @@ public Stream> generateMCPs( DefaultAspectsUtil.getAdditionalChanges( opContext, AspectsBatchImpl.builder() - .mcps( - List.of(mcp), - auditStamp, - opContext.getRetrieverContext().get()) + .mcps(List.of(mcp), auditStamp, opContext.getRetrieverContext()) .build() .getMCPItems(), entityService, diff --git a/metadata-jobs/mae-consumer/src/test/java/com/linkedin/metadata/kafka/hook/spring/MCLSpringCommonTestConfiguration.java b/metadata-jobs/mae-consumer/src/test/java/com/linkedin/metadata/kafka/hook/spring/MCLSpringCommonTestConfiguration.java index cf9d73dfa729be..f16c9dbd82e749 100644 --- a/metadata-jobs/mae-consumer/src/test/java/com/linkedin/metadata/kafka/hook/spring/MCLSpringCommonTestConfiguration.java +++ b/metadata-jobs/mae-consumer/src/test/java/com/linkedin/metadata/kafka/hook/spring/MCLSpringCommonTestConfiguration.java @@ -20,7 +20,6 @@ import com.linkedin.metadata.utils.elasticsearch.IndexConvention; import io.datahubproject.metadata.context.OperationContext; import io.datahubproject.metadata.context.OperationContextConfig; -import io.datahubproject.metadata.context.RetrieverContext; import io.datahubproject.metadata.context.ServicesRegistryContext; import io.datahubproject.metadata.context.ValidationContext; import io.datahubproject.test.metadata.context.TestOperationContexts; @@ -95,7 +94,7 @@ public OperationContext operationContext( entityRegistry, mock(ServicesRegistryContext.class), indexConvention, - mock(RetrieverContext.class), + TestOperationContexts.emptyActiveUsersRetrieverContext(() -> entityRegistry), mock(ValidationContext.class)); } diff --git a/metadata-jobs/pe-consumer/src/test/java/com/datahub/event/hook/BusinessAttributeUpdateHookTest.java b/metadata-jobs/pe-consumer/src/test/java/com/datahub/event/hook/BusinessAttributeUpdateHookTest.java index 47740b02d6166c..65ee6b8591f489 100644 --- a/metadata-jobs/pe-consumer/src/test/java/com/datahub/event/hook/BusinessAttributeUpdateHookTest.java +++ b/metadata-jobs/pe-consumer/src/test/java/com/datahub/event/hook/BusinessAttributeUpdateHookTest.java @@ -93,8 +93,6 @@ public void testMCLOnBusinessAttributeUpdate() throws Exception { new RelatedEntity(BUSINESS_ATTRIBUTE_OF, SCHEMA_FIELD_URN.toString()))); when(opContext - .getRetrieverContext() - .get() .getAspectRetriever() .getLatestAspectObjects( eq(Set.of(SCHEMA_FIELD_URN)), eq(Set.of(BUSINESS_ATTRIBUTE_ASPECT)))) @@ -108,7 +106,7 @@ public void testMCLOnBusinessAttributeUpdate() throws Exception { // verify // page 1 - Mockito.verify(opContext.getRetrieverContext().get().getGraphRetriever(), Mockito.times(1)) + 
Mockito.verify(opContext.getRetrieverContext().getGraphRetriever(), Mockito.times(1)) .scrollRelatedEntities( isNull(), any(Filter.class), @@ -122,7 +120,7 @@ public void testMCLOnBusinessAttributeUpdate() throws Exception { isNull(), isNull()); // page 2 - Mockito.verify(opContext.getRetrieverContext().get().getGraphRetriever(), Mockito.times(1)) + Mockito.verify(opContext.getRetrieverContext().getGraphRetriever(), Mockito.times(1)) .scrollRelatedEntities( isNull(), any(Filter.class), @@ -136,7 +134,7 @@ public void testMCLOnBusinessAttributeUpdate() throws Exception { isNull(), isNull()); - Mockito.verifyNoMoreInteractions(opContext.getRetrieverContext().get().getGraphRetriever()); + Mockito.verifyNoMoreInteractions(opContext.getRetrieverContext().getGraphRetriever()); // 2 pages = 2 ingest proposals Mockito.verify(mockUpdateIndicesService, Mockito.times(2)) @@ -152,8 +150,8 @@ private void testMCLOnInvalidCategory() throws Exception { businessAttributeServiceHook.handleChangeEvent(opContext, platformEvent); // verify - Mockito.verifyNoInteractions(opContext.getRetrieverContext().get().getGraphRetriever()); - Mockito.verifyNoInteractions(opContext.getAspectRetrieverOpt().get()); + Mockito.verifyNoInteractions(opContext.getRetrieverContext().getGraphRetriever()); + Mockito.verifyNoInteractions(opContext.getAspectRetriever()); Mockito.verifyNoInteractions(mockUpdateIndicesService); } @@ -226,13 +224,15 @@ private OperationContext mockOperationContextWithGraph(List graph RetrieverContext mockRetrieverContext = mock(RetrieverContext.class); when(mockRetrieverContext.getAspectRetriever()).thenReturn(mock(AspectRetriever.class)); + when(mockRetrieverContext.getCachingAspectRetriever()) + .thenReturn(TestOperationContexts.emptyActiveUsersAspectRetriever(null)); when(mockRetrieverContext.getGraphRetriever()).thenReturn(graphRetriever); OperationContext opContext = TestOperationContexts.systemContextNoSearchAuthorization(mockRetrieverContext); // reset mock for test - reset(opContext.getAspectRetrieverOpt().get()); + reset(opContext.getAspectRetriever()); if (!graphEdges.isEmpty()) { diff --git a/metadata-models/src/main/pegasus/com/linkedin/platformresource/PlatformResourceType.pdl b/metadata-models/src/main/pegasus/com/linkedin/platformresource/PlatformResourceType.pdl index 2f36eda9141abb..1a1dbea4359fbd 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/platformresource/PlatformResourceType.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/platformresource/PlatformResourceType.pdl @@ -9,9 +9,13 @@ enum PlatformResourceType { /** * e.g. a Slack member resource, Looker user resource, etc. */ - USER_INFO, + USER_INFO, /** * e.g. a Slack channel */ CONVERSATION + /** + * e.g. 
Looker mapping of all user ids + */ + USER_ID_MAPPING } diff --git a/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/ActorContext.java b/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/ActorContext.java index e65bf22991736d..c08b7fad4dee32 100644 --- a/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/ActorContext.java +++ b/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/ActorContext.java @@ -1,12 +1,23 @@ package io.datahubproject.metadata.context; +import static com.linkedin.metadata.Constants.CORP_USER_KEY_ASPECT_NAME; +import static com.linkedin.metadata.Constants.CORP_USER_STATUS_ASPECT_NAME; +import static com.linkedin.metadata.Constants.CORP_USER_STATUS_SUSPENDED; +import static com.linkedin.metadata.Constants.STATUS_ASPECT_NAME; +import static com.linkedin.metadata.Constants.SYSTEM_ACTOR; + import com.datahub.authentication.Authentication; +import com.linkedin.common.Status; import com.linkedin.common.urn.Urn; import com.linkedin.common.urn.UrnUtils; +import com.linkedin.entity.Aspect; +import com.linkedin.identity.CorpUserStatus; +import com.linkedin.metadata.aspect.AspectRetriever; import com.linkedin.metadata.authorization.PoliciesConfig; import com.linkedin.policy.DataHubPolicyInfo; import java.util.Collection; import java.util.Collections; +import java.util.Map; import java.util.Optional; import java.util.Set; import lombok.Builder; @@ -48,6 +59,43 @@ public Urn getActorUrn() { return UrnUtils.getUrn(authentication.getActor().toUrnStr()); } + /** + * Actor is considered active if the user is not hard-deleted, soft-deleted, and is not suspended + * + * @param aspectRetriever aspect retriever - ideally the SystemEntityClient backed one for caching + * @return active status + */ + public boolean isActive(AspectRetriever aspectRetriever) { + // system cannot be disabled + if (SYSTEM_ACTOR.equals(authentication.getActor().toUrnStr())) { + return true; + } + + Urn selfUrn = UrnUtils.getUrn(authentication.getActor().toUrnStr()); + Map> urnAspectMap = + aspectRetriever.getLatestAspectObjects( + Set.of(selfUrn), + Set.of(STATUS_ASPECT_NAME, CORP_USER_STATUS_ASPECT_NAME, CORP_USER_KEY_ASPECT_NAME)); + + Map aspectMap = urnAspectMap.getOrDefault(selfUrn, Map.of()); + + if (!aspectMap.containsKey(CORP_USER_KEY_ASPECT_NAME)) { + // user is hard deleted + return false; + } + + Status status = + Optional.ofNullable(aspectMap.get(STATUS_ASPECT_NAME)) + .map(a -> new Status(a.data())) + .orElse(new Status().setRemoved(false)); + CorpUserStatus corpUserStatus = + Optional.ofNullable(aspectMap.get(CORP_USER_STATUS_ASPECT_NAME)) + .map(a -> new CorpUserStatus(a.data())) + .orElse(new CorpUserStatus().setStatus("")); + + return !status.isRemoved() && !CORP_USER_STATUS_SUSPENDED.equals(corpUserStatus.getStatus()); + } + /** * The current implementation creates a cache entry unique for the set of policies. 
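A minimal unit-test sketch of the isActive(...) contract introduced above — not part of this diff. It assumes Mockito and TestNG static imports, the same static Constants imports as ActorContext, that ActorContext's Lombok builder exposes an authentication(...) setter, and that CorpUserKey exposes setUsername(...); the actor urn:li:corpuser:jdoe is hypothetical. The idea: a suspended corp user still has a corpUserKey aspect (so is not hard-deleted), yet isActive(...) should return false.

@Test
public void testSuspendedActorIsNotActive() {
  AspectRetriever retriever = mock(AspectRetriever.class);
  Urn actorUrn = UrnUtils.getUrn("urn:li:corpuser:jdoe");

  // Key aspect present (not hard-deleted), but corpUserStatus is SUSPENDED.
  when(retriever.getLatestAspectObjects(anySet(), anySet()))
      .thenReturn(
          Map.of(
              actorUrn,
              Map.of(
                  CORP_USER_KEY_ASPECT_NAME,
                  new Aspect(new CorpUserKey().setUsername("jdoe").data()),
                  CORP_USER_STATUS_ASPECT_NAME,
                  new Aspect(
                      new CorpUserStatus().setStatus(CORP_USER_STATUS_SUSPENDED).data()))));

  // Assumes the Lombok builder exposes authentication(...).
  ActorContext actorContext =
      ActorContext.builder()
          .authentication(new Authentication(new Actor(ActorType.USER, "jdoe"), ""))
          .build();

  // No status aspect defaults to "not removed", so the suspension alone flips the result.
  assertFalse(actorContext.isActive(retriever));
}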
* diff --git a/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/OperationContext.java b/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/OperationContext.java index 9a058c526647c2..9158129235b39e 100644 --- a/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/OperationContext.java +++ b/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/OperationContext.java @@ -16,6 +16,8 @@ import com.linkedin.metadata.query.SearchFlags; import com.linkedin.metadata.utils.AuditStampUtils; import com.linkedin.metadata.utils.elasticsearch.IndexConvention; +import io.datahubproject.metadata.exception.ActorAccessException; +import io.datahubproject.metadata.exception.OperationContextException; import java.util.Collection; import java.util.Objects; import java.util.Optional; @@ -63,6 +65,24 @@ public static OperationContext asSession( @Nonnull Authorizer authorizer, @Nonnull Authentication sessionAuthentication, boolean allowSystemAuthentication) { + return OperationContext.asSession( + systemOperationContext, + requestContext, + authorizer, + sessionAuthentication, + allowSystemAuthentication, + false); + } + + @Nonnull + public static OperationContext asSession( + OperationContext systemOperationContext, + @Nonnull RequestContext requestContext, + @Nonnull Authorizer authorizer, + @Nonnull Authentication sessionAuthentication, + boolean allowSystemAuthentication, + boolean skipCache) + throws ActorAccessException { return systemOperationContext.toBuilder() .operationContextConfig( // update allowed system authentication @@ -72,7 +92,7 @@ public static OperationContext asSession( .authorizationContext(AuthorizationContext.builder().authorizer(authorizer).build()) .requestContext(requestContext) .validationContext(systemOperationContext.getValidationContext()) - .build(sessionAuthentication); + .build(sessionAuthentication, skipCache); } /** @@ -85,10 +105,14 @@ public static OperationContext asSession( public static OperationContext withSearchFlags( OperationContext opContext, Function flagDefaults) { - return opContext.toBuilder() - // update search flags for the request's session - .searchContext(opContext.getSearchContext().withFlagDefaults(flagDefaults)) - .build(opContext.getSessionActorContext()); + try { + return opContext.toBuilder() + // update search flags for the request's session + .searchContext(opContext.getSearchContext().withFlagDefaults(flagDefaults)) + .build(opContext.getSessionActorContext(), false); + } catch (OperationContextException e) { + throw new RuntimeException(e); + } } /** @@ -101,10 +125,14 @@ public static OperationContext withSearchFlags( public static OperationContext withLineageFlags( OperationContext opContext, Function flagDefaults) { - return opContext.toBuilder() - // update lineage flags for the request's session - .searchContext(opContext.getSearchContext().withLineageFlagDefaults(flagDefaults)) - .build(opContext.getSessionActorContext()); + try { + return opContext.toBuilder() + // update lineage flags for the request's session + .searchContext(opContext.getSearchContext().withLineageFlagDefaults(flagDefaults)) + .build(opContext.getSessionActorContext(), false); + } catch (OperationContextException e) { + throw new RuntimeException(e); + } } /** @@ -155,18 +183,22 @@ public static OperationContext asSystem( ? 
SearchContext.EMPTY : SearchContext.builder().indexConvention(indexConvention).build(); - return OperationContext.builder() - .operationContextConfig(systemConfig) - .systemActorContext(systemActorContext) - .searchContext(systemSearchContext) - .entityRegistryContext(EntityRegistryContext.builder().build(entityRegistry)) - .servicesRegistryContext(servicesRegistryContext) - // Authorizer.EMPTY doesn't actually apply to system auth - .authorizationContext(AuthorizationContext.builder().authorizer(Authorizer.EMPTY).build()) - .retrieverContext(retrieverContext) - .objectMapperContext(objectMapperContext) - .validationContext(validationContext) - .build(systemAuthentication); + try { + return OperationContext.builder() + .operationContextConfig(systemConfig) + .systemActorContext(systemActorContext) + .searchContext(systemSearchContext) + .entityRegistryContext(EntityRegistryContext.builder().build(entityRegistry)) + .servicesRegistryContext(servicesRegistryContext) + // Authorizer.EMPTY doesn't actually apply to system auth + .authorizationContext(AuthorizationContext.builder().authorizer(Authorizer.EMPTY).build()) + .retrieverContext(retrieverContext) + .objectMapperContext(objectMapperContext) + .validationContext(validationContext) + .build(systemAuthentication, false); + } catch (OperationContextException e) { + throw new RuntimeException(e); + } } @Nonnull private final OperationContextConfig operationContextConfig; @@ -177,7 +209,7 @@ public static OperationContext asSystem( @Nonnull private final EntityRegistryContext entityRegistryContext; @Nullable private final ServicesRegistryContext servicesRegistryContext; @Nullable private final RequestContext requestContext; - @Nullable private final RetrieverContext retrieverContext; + @Nonnull private final RetrieverContext retrieverContext; @Nonnull private final ObjectMapperContext objectMapperContext; @Nonnull private final ValidationContext validationContext; @@ -194,13 +226,15 @@ public OperationContext withLineageFlags( public OperationContext asSession( @Nonnull RequestContext requestContext, @Nonnull Authorizer authorizer, - @Nonnull Authentication sessionAuthentication) { + @Nonnull Authentication sessionAuthentication) + throws ActorAccessException { return OperationContext.asSession( this, requestContext, authorizer, sessionAuthentication, - getOperationContextConfig().isAllowSystemAuthentication()); + getOperationContextConfig().isAllowSystemAuthentication(), + false); } @Nonnull @@ -284,17 +318,9 @@ public AuditStamp getAuditStamp() { return getAuditStamp(null); } - public Optional getRetrieverContext() { - return Optional.ofNullable(retrieverContext); - } - - @Nullable + @Nonnull public AspectRetriever getAspectRetriever() { - return getAspectRetrieverOpt().orElse(null); - } - - public Optional getAspectRetrieverOpt() { - return getRetrieverContext().map(RetrieverContext::getAspectRetriever); + return retrieverContext.getAspectRetriever(); } /** @@ -336,10 +362,7 @@ public String getGlobalContextId() { ? EmptyContext.EMPTY : getServicesRegistryContext()) .add(getRequestContext() == null ? EmptyContext.EMPTY : getRequestContext()) - .add( - getRetrieverContext().isPresent() - ? getRetrieverContext().get() - : EmptyContext.EMPTY) + .add(getRetrieverContext()) .add(getObjectMapperContext()) .build() .stream() @@ -364,10 +387,7 @@ public String getSearchContextId() { getServicesRegistryContext() == null ? EmptyContext.EMPTY : getServicesRegistryContext()) - .add( - getRetrieverContext().isPresent() - ? 
getRetrieverContext().get() - : EmptyContext.EMPTY) + .add(getRetrieverContext()) .build() .stream() .map(ContextInterface::getCacheKeyComponent) @@ -438,6 +458,12 @@ public static class OperationContextBuilder { @Nonnull public OperationContext build(@Nonnull Authentication sessionAuthentication) { + return build(sessionAuthentication, false); + } + + @Nonnull + public OperationContext build( + @Nonnull Authentication sessionAuthentication, boolean skipCache) { final Urn actorUrn = UrnUtils.getUrn(sessionAuthentication.getActor().toUrnStr()); final ActorContext sessionActor = ActorContext.builder() @@ -451,11 +477,20 @@ public OperationContext build(@Nonnull Authentication sessionAuthentication) { .policyInfoSet(this.authorizationContext.getAuthorizer().getActorPolicies(actorUrn)) .groupMembership(this.authorizationContext.getAuthorizer().getActorGroups(actorUrn)) .build(); - return build(sessionActor); + return build(sessionActor, skipCache); } @Nonnull - public OperationContext build(@Nonnull ActorContext sessionActor) { + public OperationContext build(@Nonnull ActorContext sessionActor, boolean skipCache) { + AspectRetriever retriever = + skipCache + ? this.retrieverContext.getAspectRetriever() + : this.retrieverContext.getCachingAspectRetriever(); + + if (!sessionActor.isActive(retriever)) { + throw new ActorAccessException("Actor is not active"); + } + return new OperationContext( this.operationContextConfig, sessionActor, diff --git a/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/RetrieverContext.java b/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/RetrieverContext.java index 9337fbfe3bb003..9afc4138810bb2 100644 --- a/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/RetrieverContext.java +++ b/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/RetrieverContext.java @@ -1,8 +1,10 @@ package io.datahubproject.metadata.context; import com.linkedin.metadata.aspect.AspectRetriever; +import com.linkedin.metadata.aspect.CachingAspectRetriever; import com.linkedin.metadata.aspect.GraphRetriever; import com.linkedin.metadata.entity.SearchRetriever; +import java.util.Objects; import java.util.Optional; import javax.annotation.Nonnull; import lombok.Builder; @@ -15,10 +17,37 @@ public class RetrieverContext @Nonnull private final GraphRetriever graphRetriever; @Nonnull private final AspectRetriever aspectRetriever; + @Nonnull private final CachingAspectRetriever cachingAspectRetriever; @Nonnull private final SearchRetriever searchRetriever; @Override public Optional getCacheKeyComponent() { return Optional.empty(); } + + public static class RetrieverContextBuilder { + public RetrieverContext build() { + if (this.aspectRetriever == null && this.cachingAspectRetriever != null) { + this.aspectRetriever = this.cachingAspectRetriever; + } + + if (this.cachingAspectRetriever == null + && this.aspectRetriever instanceof CachingAspectRetriever) { + this.cachingAspectRetriever = (CachingAspectRetriever) this.aspectRetriever; + } + + return new RetrieverContext( + this.graphRetriever, + Objects.requireNonNull(this.aspectRetriever), + Objects.requireNonNull(this.cachingAspectRetriever), + this.searchRetriever); + } + } + + public static final RetrieverContext EMPTY = + RetrieverContext.builder() + .graphRetriever(GraphRetriever.EMPTY) + .searchRetriever(SearchRetriever.EMPTY) + .cachingAspectRetriever(CachingAspectRetriever.EMPTY) + .build(); } diff --git 
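Because asSession(...) and OperationContextBuilder.build(...) can now reject inactive actors, callers have to account for ActorAccessException. A hedged caller sketch, not taken from this diff: systemOperationContext, requestContext, authorizer, and authentication stand in for whatever the surrounding service already holds, and the log field assumes Lombok @Slf4j.

final OperationContext sessionContext;
try {
  // Default path (skipCache = false) resolves the actor through the CachingAspectRetriever;
  // passing true forces a fresh lookup via the plain AspectRetriever instead.
  sessionContext =
      OperationContext.asSession(
          systemOperationContext,
          requestContext,
          authorizer,
          authentication,
          /* allowSystemAuthentication */ true,
          /* skipCache */ false);
} catch (ActorAccessException e) {
  // The actor is hard-deleted, soft-deleted, or suspended; surface this as an auth
  // failure (e.g. 401/403) in the transport layer rather than executing the request.
  log.warn("Rejected session for inactive actor {}", authentication.getActor().toUrnStr());
  throw e;
}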
a/metadata-operation-context/src/main/java/io/datahubproject/metadata/exception/ActorAccessException.java b/metadata-operation-context/src/main/java/io/datahubproject/metadata/exception/ActorAccessException.java new file mode 100644 index 00000000000000..bca2594b96430e --- /dev/null +++ b/metadata-operation-context/src/main/java/io/datahubproject/metadata/exception/ActorAccessException.java @@ -0,0 +1,7 @@ +package io.datahubproject.metadata.exception; + +public class ActorAccessException extends OperationContextException { + public ActorAccessException(String string) { + super(string); + } +} diff --git a/metadata-operation-context/src/main/java/io/datahubproject/metadata/exception/OperationContextException.java b/metadata-operation-context/src/main/java/io/datahubproject/metadata/exception/OperationContextException.java new file mode 100644 index 00000000000000..1aac8dc3e60ec9 --- /dev/null +++ b/metadata-operation-context/src/main/java/io/datahubproject/metadata/exception/OperationContextException.java @@ -0,0 +1,9 @@ +package io.datahubproject.metadata.exception; + +public class OperationContextException extends RuntimeException { + public OperationContextException(String message) { + super(message); + } + + public OperationContextException() {} +} diff --git a/metadata-operation-context/src/main/java/io/datahubproject/test/metadata/context/TestOperationContexts.java b/metadata-operation-context/src/main/java/io/datahubproject/test/metadata/context/TestOperationContexts.java index 42de6b7398c616..4abfbb196f067c 100644 --- a/metadata-operation-context/src/main/java/io/datahubproject/test/metadata/context/TestOperationContexts.java +++ b/metadata-operation-context/src/main/java/io/datahubproject/test/metadata/context/TestOperationContexts.java @@ -8,21 +8,17 @@ import com.linkedin.common.urn.Urn; import com.linkedin.data.schema.annotation.PathSpecBasedSchemaAnnotationVisitor; import com.linkedin.entity.Aspect; +import com.linkedin.identity.CorpUserInfo; +import com.linkedin.metadata.Constants; import com.linkedin.metadata.aspect.AspectRetriever; +import com.linkedin.metadata.aspect.CachingAspectRetriever; import com.linkedin.metadata.aspect.GraphRetriever; -import com.linkedin.metadata.aspect.SystemAspect; -import com.linkedin.metadata.aspect.models.graph.RelatedEntitiesScrollResult; import com.linkedin.metadata.entity.SearchRetriever; import com.linkedin.metadata.models.registry.ConfigEntityRegistry; import com.linkedin.metadata.models.registry.EntityRegistry; import com.linkedin.metadata.models.registry.EntityRegistryException; import com.linkedin.metadata.models.registry.MergedEntityRegistry; import com.linkedin.metadata.models.registry.SnapshotEntityRegistry; -import com.linkedin.metadata.query.filter.Filter; -import com.linkedin.metadata.query.filter.RelationshipFilter; -import com.linkedin.metadata.query.filter.SortCriterion; -import com.linkedin.metadata.search.ScrollResult; -import com.linkedin.metadata.search.SearchEntityArray; import com.linkedin.metadata.snapshot.Snapshot; import com.linkedin.metadata.utils.elasticsearch.IndexConvention; import com.linkedin.metadata.utils.elasticsearch.IndexConventionImpl; @@ -32,15 +28,14 @@ import io.datahubproject.metadata.context.RetrieverContext; import io.datahubproject.metadata.context.ServicesRegistryContext; import io.datahubproject.metadata.context.ValidationContext; -import java.util.List; import java.util.Map; import java.util.Optional; import java.util.Set; import java.util.function.Consumer; import java.util.function.Supplier; 
+import java.util.stream.Collectors; import javax.annotation.Nonnull; import javax.annotation.Nullable; -import lombok.Builder; /** * Useful for testing. If the defaults are not sufficient, try using the .toBuilder() and replacing @@ -81,26 +76,53 @@ public static EntityRegistry defaultEntityRegistry() { return defaultEntityRegistryInstance; } - public static AspectRetriever emptyAspectRetriever( + public static RetrieverContext emptyActiveUsersRetrieverContext( @Nullable Supplier entityRegistrySupplier) { - return new EmptyAspectRetriever( - () -> - Optional.ofNullable(entityRegistrySupplier) - .map(Supplier::get) - .orElse(defaultEntityRegistry())); - } - public static GraphRetriever emptyGraphRetriever = new EmptyGraphRetriever(); - public static SearchRetriever emptySearchRetriever = new EmptySearchRetriever(); + return RetrieverContext.builder() + .cachingAspectRetriever(emptyActiveUsersAspectRetriever(entityRegistrySupplier)) + .graphRetriever(GraphRetriever.EMPTY) + .searchRetriever(SearchRetriever.EMPTY) + .build(); + } - public static RetrieverContext emptyRetrieverContext( + public static CachingAspectRetriever emptyActiveUsersAspectRetriever( @Nullable Supplier entityRegistrySupplier) { - return RetrieverContext.builder() - .aspectRetriever(emptyAspectRetriever(entityRegistrySupplier)) - .graphRetriever(emptyGraphRetriever) - .searchRetriever(emptySearchRetriever) - .build(); + return new CachingAspectRetriever.EmptyAspectRetriever() { + + @Nonnull + @Override + public Map> getLatestAspectObjects( + Set urns, Set aspectNames) { + if (urns.stream().allMatch(urn -> urn.toString().startsWith("urn:li:corpuser:")) + && aspectNames.contains(Constants.CORP_USER_KEY_ASPECT_NAME)) { + return urns.stream() + .map( + urn -> + Map.entry( + urn, + Map.of( + Constants.CORP_USER_KEY_ASPECT_NAME, + new Aspect( + new CorpUserInfo() + .setActive(true) + .setEmail(urn.getId()) + .setDisplayName(urn.getId()) + .data())))) + .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)); + } + return super.getLatestAspectObjects(urns, aspectNames); + } + + @Nonnull + @Override + public EntityRegistry getEntityRegistry() { + return Optional.ofNullable(entityRegistrySupplier) + .map(Supplier::get) + .orElse(defaultEntityRegistry()); + } + }; } public static OperationContext systemContextNoSearchAuthorization( @@ -140,8 +162,10 @@ public static OperationContext systemContextNoSearchAuthorization( RetrieverContext retrieverContext = RetrieverContext.builder() .aspectRetriever(aspectRetriever) - .graphRetriever(emptyGraphRetriever) - .searchRetriever(emptySearchRetriever) + .cachingAspectRetriever( + emptyActiveUsersAspectRetriever(() -> aspectRetriever.getEntityRegistry())) + .graphRetriever(GraphRetriever.EMPTY) + .searchRetriever(SearchRetriever.EMPTY) .build(); return systemContextNoSearchAuthorization( () -> retrieverContext.getAspectRetriever().getEntityRegistry(), @@ -208,7 +232,7 @@ public static OperationContext systemContext( RetrieverContext retrieverContext = Optional.ofNullable(retrieverContextSupplier) .map(Supplier::get) - .orElse(emptyRetrieverContext(entityRegistrySupplier)); + .orElse(emptyActiveUsersRetrieverContext(entityRegistrySupplier)); EntityRegistry entityRegistry = Optional.ofNullable(entityRegistrySupplier) @@ -298,66 +322,5 @@ public static OperationContext userContextNoSearchAuthorization( .asSession(requestContext, Authorizer.EMPTY, TEST_USER_AUTH); } - @Builder - public static class EmptyAspectRetriever implements AspectRetriever { - private final Supplier 
entityRegistrySupplier; - - @Nonnull - @Override - public Map> getLatestAspectObjects( - Set urns, Set aspectNames) { - return Map.of(); - } - - @Nonnull - @Override - public Map> getLatestSystemAspects( - Map> urnAspectNames) { - return Map.of(); - } - - @Nonnull - @Override - public EntityRegistry getEntityRegistry() { - return entityRegistrySupplier.get(); - } - } - - public static class EmptyGraphRetriever implements GraphRetriever { - - @Nonnull - @Override - public RelatedEntitiesScrollResult scrollRelatedEntities( - @Nullable List sourceTypes, - @Nonnull Filter sourceEntityFilter, - @Nullable List destinationTypes, - @Nonnull Filter destinationEntityFilter, - @Nonnull List relationshipTypes, - @Nonnull RelationshipFilter relationshipFilter, - @Nonnull List sortCriterion, - @Nullable String scrollId, - int count, - @Nullable Long startTimeMillis, - @Nullable Long endTimeMillis) { - return new RelatedEntitiesScrollResult(0, 0, null, List.of()); - } - } - - public static class EmptySearchRetriever implements SearchRetriever { - - @Override - public ScrollResult scroll( - @Nonnull List entities, - @Nullable Filter filters, - @Nullable String scrollId, - int count) { - ScrollResult empty = new ScrollResult(); - empty.setEntities(new SearchEntityArray()); - empty.setNumEntities(0); - empty.setPageSize(0); - return empty; - } - } - private TestOperationContexts() {} } diff --git a/metadata-operation-context/src/test/java/io/datahubproject/metadata/context/OperationContextTest.java b/metadata-operation-context/src/test/java/io/datahubproject/metadata/context/OperationContextTest.java index 3e092e20127ee5..f77b244d8f2d86 100644 --- a/metadata-operation-context/src/test/java/io/datahubproject/metadata/context/OperationContextTest.java +++ b/metadata-operation-context/src/test/java/io/datahubproject/metadata/context/OperationContextTest.java @@ -8,6 +8,7 @@ import com.datahub.authentication.Authentication; import com.datahub.plugins.auth.authorization.Authorizer; import com.linkedin.metadata.models.registry.EntityRegistry; +import io.datahubproject.test.metadata.context.TestOperationContexts; import org.testng.annotations.Test; public class OperationContextTest { @@ -25,7 +26,7 @@ public void testSystemPrivilegeEscalation() { mock(EntityRegistry.class), mock(ServicesRegistryContext.class), null, - mock(RetrieverContext.class), + TestOperationContexts.emptyActiveUsersRetrieverContext(null), mock(ValidationContext.class)); OperationContext opContext = diff --git a/metadata-service/auth-impl/src/main/java/com/datahub/authentication/token/StatefulTokenService.java b/metadata-service/auth-impl/src/main/java/com/datahub/authentication/token/StatefulTokenService.java index 6724f35d840adb..a9871f1ed99482 100644 --- a/metadata-service/auth-impl/src/main/java/com/datahub/authentication/token/StatefulTokenService.java +++ b/metadata-service/auth-impl/src/main/java/com/datahub/authentication/token/StatefulTokenService.java @@ -145,7 +145,7 @@ public String generateAccessToken( _entityService.ingestProposal( systemOperationContext, AspectsBatchImpl.builder() - .mcps(List.of(proposal), auditStamp, systemOperationContext.getRetrieverContext().get()) + .mcps(List.of(proposal), auditStamp, systemOperationContext.getRetrieverContext()) .build(), false); diff --git a/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/EbeanConfiguration.java b/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/EbeanConfiguration.java index 47b406e695a3fb..6eb31e14a2d3b0 100644 --- 
a/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/EbeanConfiguration.java +++ b/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/EbeanConfiguration.java @@ -23,6 +23,7 @@ public class EbeanConfiguration { private boolean autoCreateDdl; private boolean postgresUseIamAuth; private LockingConfiguration locking; + private String batchGetMethod; public static final EbeanConfiguration testDefault = EbeanConfiguration.builder().locking(LockingConfiguration.testDefault).build(); diff --git a/metadata-service/configuration/src/main/resources/application.yaml b/metadata-service/configuration/src/main/resources/application.yaml index 9348416606d0a9..b997bc108e4ba1 100644 --- a/metadata-service/configuration/src/main/resources/application.yaml +++ b/metadata-service/configuration/src/main/resources/application.yaml @@ -164,6 +164,7 @@ ebean: waitTimeoutMillis: ${EBEAN_WAIT_TIMEOUT_MILLIS:1000} autoCreateDdl: ${EBEAN_AUTOCREATE:false} postgresUseIamAuth: ${EBEAN_POSTGRES_USE_AWS_IAM_AUTH:false} + batchGetMethod: ${EBEAN_BATCH_GET_METHOD:IN} # Alternative UNION locking: enabled: ${EBEAN_LOCKING_ENABLED:false} durationSeconds: ${EBEAN_LOCKING_DURATION_SECONDS:60} @@ -522,12 +523,12 @@ cache: entityAspectTTLSeconds: # cache user aspects for 20s corpuser: - corpUserKey: 20 + corpUserKey: 300 # 5 min corpUserInfo: 20 corpUserEditableInfo: 20 - corpUserStatus: 20 + corpUserStatus: 300 # 5 min globalTags: 20 - status: 20 + status: 300 # 5 min corpUserCredentials: 20 corpUserSettings: 20 roleMembership: 20 @@ -561,7 +562,7 @@ springdoc.api-docs.groups.enabled: true forms: hook: - enabled: { $FORMS_HOOK_ENABLED:true } + enabled: ${FORMS_HOOK_ENABLED:true} consumerGroupSuffix: ${FORMS_HOOK_CONSUMER_GROUP_SUFFIX:} businessAttribute: diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/context/SystemOperationContextFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/context/SystemOperationContextFactory.java index f5235dc3682fce..3e2823591e168c 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/context/SystemOperationContextFactory.java +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/context/SystemOperationContextFactory.java @@ -45,7 +45,8 @@ protected OperationContext javaSystemOperationContext( @Nonnull final SearchService searchService, @Qualifier("baseElasticSearchComponents") BaseElasticSearchComponentsFactory.BaseElasticSearchComponents components, - @Nonnull final ConfigurationProvider configurationProvider) { + @Nonnull final ConfigurationProvider configurationProvider, + @Qualifier("systemEntityClient") @Nonnull final SystemEntityClient systemEntityClient) { EntityServiceAspectRetriever entityServiceAspectRetriever = EntityServiceAspectRetriever.builder() @@ -53,6 +54,9 @@ protected OperationContext javaSystemOperationContext( .entityService(entityService) .build(); + EntityClientAspectRetriever entityClientAspectRetriever = + EntityClientAspectRetriever.builder().entityClient(systemEntityClient).build(); + SystemGraphRetriever systemGraphRetriever = SystemGraphRetriever.builder().graphService(graphService).build(); @@ -68,6 +72,7 @@ protected OperationContext javaSystemOperationContext( components.getIndexConvention(), RetrieverContext.builder() .aspectRetriever(entityServiceAspectRetriever) + .cachingAspectRetriever(entityClientAspectRetriever) .graphRetriever(systemGraphRetriever) .searchRetriever(searchServiceSearchRetriever) .build(), @@ -76,6 
+81,7 @@ protected OperationContext javaSystemOperationContext( configurationProvider.getFeatureFlags().isAlternateMCPValidation()) .build()); + entityClientAspectRetriever.setSystemOperationContext(systemOperationContext); entityServiceAspectRetriever.setSystemOperationContext(systemOperationContext); systemGraphRetriever.setSystemOperationContext(systemOperationContext); searchServiceSearchRetriever.setSystemOperationContext(systemOperationContext); @@ -104,7 +110,7 @@ protected OperationContext restliSystemOperationContext( BaseElasticSearchComponentsFactory.BaseElasticSearchComponents components, @Nonnull final ConfigurationProvider configurationProvider) { - EntityClientAspectRetriever entityServiceAspectRetriever = + EntityClientAspectRetriever entityClientAspectRetriever = EntityClientAspectRetriever.builder().entityClient(systemEntityClient).build(); SystemGraphRetriever systemGraphRetriever = @@ -121,7 +127,7 @@ protected OperationContext restliSystemOperationContext( ServicesRegistryContext.builder().restrictedService(restrictedService).build(), components.getIndexConvention(), RetrieverContext.builder() - .aspectRetriever(entityServiceAspectRetriever) + .cachingAspectRetriever(entityClientAspectRetriever) .graphRetriever(systemGraphRetriever) .searchRetriever(searchServiceSearchRetriever) .build(), @@ -130,7 +136,7 @@ protected OperationContext restliSystemOperationContext( configurationProvider.getFeatureFlags().isAlternateMCPValidation()) .build()); - entityServiceAspectRetriever.setSystemOperationContext(systemOperationContext); + entityClientAspectRetriever.setSystemOperationContext(systemOperationContext); systemGraphRetriever.setSystemOperationContext(systemOperationContext); searchServiceSearchRetriever.setSystemOperationContext(systemOperationContext); diff --git a/metadata-service/factories/src/main/java/com/linkedin/metadata/boot/steps/IngestDataPlatformInstancesStep.java b/metadata-service/factories/src/main/java/com/linkedin/metadata/boot/steps/IngestDataPlatformInstancesStep.java index 22ce06a5984ea6..c04dd25ccd4ac9 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/metadata/boot/steps/IngestDataPlatformInstancesStep.java +++ b/metadata-service/factories/src/main/java/com/linkedin/metadata/boot/steps/IngestDataPlatformInstancesStep.java @@ -84,14 +84,14 @@ public void execute(@Nonnull OperationContext systemOperationContext) throws Exc .aspectName(DATA_PLATFORM_INSTANCE_ASPECT_NAME) .recordTemplate(dataPlatformInstance.get()) .auditStamp(aspectAuditStamp) - .build(systemOperationContext.getAspectRetrieverOpt().get())); + .build(systemOperationContext.getAspectRetriever())); } } _entityService.ingestAspects( systemOperationContext, AspectsBatchImpl.builder() - .retrieverContext(systemOperationContext.getRetrieverContext().get()) + .retrieverContext(systemOperationContext.getRetrieverContext()) .items(items) .build(), true, diff --git a/metadata-service/factories/src/main/java/com/linkedin/metadata/boot/steps/IngestPoliciesStep.java b/metadata-service/factories/src/main/java/com/linkedin/metadata/boot/steps/IngestPoliciesStep.java index eb6bfe17ac198e..dac2879487469c 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/metadata/boot/steps/IngestPoliciesStep.java +++ b/metadata-service/factories/src/main/java/com/linkedin/metadata/boot/steps/IngestPoliciesStep.java @@ -225,7 +225,7 @@ private void ingestPolicy( new AuditStamp() .setActor(Urn.createFromString(Constants.SYSTEM_ACTOR)) .setTime(System.currentTimeMillis()), - 
systemOperationContext.getRetrieverContext().get()) + systemOperationContext.getRetrieverContext()) .build(), false); } diff --git a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/GlobalControllerExceptionHandler.java b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/config/GlobalControllerExceptionHandler.java similarity index 81% rename from metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/GlobalControllerExceptionHandler.java rename to metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/config/GlobalControllerExceptionHandler.java index ba0a426fa20e89..c756827cad56ba 100644 --- a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/GlobalControllerExceptionHandler.java +++ b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/config/GlobalControllerExceptionHandler.java @@ -1,9 +1,11 @@ -package io.datahubproject.openapi; +package io.datahubproject.openapi.config; import com.linkedin.metadata.dao.throttle.APIThrottleException; +import io.datahubproject.metadata.exception.ActorAccessException; import io.datahubproject.openapi.exception.InvalidUrnException; import io.datahubproject.openapi.exception.UnauthorizedException; import java.util.Map; +import javax.annotation.PostConstruct; import lombok.extern.slf4j.Slf4j; import org.springframework.beans.ConversionNotSupportedException; import org.springframework.core.Ordered; @@ -19,6 +21,11 @@ @ControllerAdvice public class GlobalControllerExceptionHandler extends DefaultHandlerExceptionResolver { + @PostConstruct + public void init() { + log.info("GlobalControllerExceptionHandler initialized"); + } + public GlobalControllerExceptionHandler() { setOrder(Ordered.HIGHEST_PRECEDENCE); setWarnLogCategory(getClass().getName()); @@ -52,4 +59,9 @@ public static ResponseEntity> handleUnauthorizedException( UnauthorizedException e) { return new ResponseEntity<>(Map.of("error", e.getMessage()), HttpStatus.FORBIDDEN); } + + @ExceptionHandler(ActorAccessException.class) + public static ResponseEntity> actorAccessException(ActorAccessException e) { + return new ResponseEntity<>(Map.of("error", e.getMessage()), HttpStatus.FORBIDDEN); + } } diff --git a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/controller/GenericEntitiesController.java b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/controller/GenericEntitiesController.java index 579a62c084999a..592d7bba4211fe 100644 --- a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/controller/GenericEntitiesController.java +++ b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/controller/GenericEntitiesController.java @@ -637,7 +637,7 @@ public ResponseEntity createAspect( AspectSpec aspectSpec = lookupAspectSpec(entitySpec, aspectName).get(); ChangeMCP upsert = toUpsertItem( - opContext.getRetrieverContext().get().getAspectRetriever(), + opContext.getRetrieverContext().getAspectRetriever(), urn, aspectSpec, createIfEntityNotExists, @@ -649,7 +649,7 @@ public ResponseEntity createAspect( entityService.ingestProposal( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(List.of(upsert)) .build(), async); @@ -725,7 +725,7 @@ public ResponseEntity patchAspect( .build(); ChangeMCP upsert = toUpsertItem( - opContext.getRetrieverContext().get().getAspectRetriever(), + 
opContext.getRetrieverContext().getAspectRetriever(), validatedUrn(entityUrn), aspectSpec, currentValue, @@ -736,7 +736,7 @@ public ResponseEntity patchAspect( entityService.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(List.of(upsert)) .build(), true, diff --git a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/operations/test/IdController.java b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/operations/test/IdController.java new file mode 100644 index 00000000000000..99d3879ab9a320 --- /dev/null +++ b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/operations/test/IdController.java @@ -0,0 +1,54 @@ +package io.datahubproject.openapi.operations.test; + +import com.datahub.authentication.Authentication; +import com.datahub.authentication.AuthenticationContext; +import com.datahub.authorization.AuthorizerChain; +import io.datahubproject.metadata.context.OperationContext; +import io.datahubproject.metadata.context.RequestContext; +import io.swagger.v3.oas.annotations.Operation; +import io.swagger.v3.oas.annotations.tags.Tag; +import jakarta.servlet.http.HttpServletRequest; +import java.util.List; +import java.util.Map; +import lombok.extern.slf4j.Slf4j; +import org.springframework.http.MediaType; +import org.springframework.http.ResponseEntity; +import org.springframework.web.bind.annotation.GetMapping; +import org.springframework.web.bind.annotation.RequestMapping; +import org.springframework.web.bind.annotation.RequestParam; +import org.springframework.web.bind.annotation.RestController; + +@RestController +@RequestMapping("/operations/identity") +@Slf4j +@Tag(name = "Identity", description = "An API for checking identity") +public class IdController { + private final AuthorizerChain authorizerChain; + private final OperationContext systemOperationContext; + + public IdController(OperationContext systemOperationContext, AuthorizerChain authorizerChain) { + this.systemOperationContext = systemOperationContext; + this.authorizerChain = authorizerChain; + } + + @Tag(name = "User") + @GetMapping(path = "/user/urn", produces = MediaType.APPLICATION_JSON_VALUE) + @Operation(summary = "User id") + public ResponseEntity> getUserId( + HttpServletRequest request, + @RequestParam(value = "skipCache", required = false, defaultValue = "false") + Boolean skipCache) { + Authentication authentication = AuthenticationContext.getAuthentication(); + String actorUrnStr = authentication.getActor().toUrnStr(); + + OperationContext.asSession( + systemOperationContext, + RequestContext.builder().buildOpenapi(actorUrnStr, request, "getUserIdentity", List.of()), + authorizerChain, + authentication, + true, + skipCache); + + return ResponseEntity.ok(Map.of("urn", actorUrnStr)); + } +} diff --git a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/util/MappingUtil.java b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/util/MappingUtil.java index c38f2db0eefbb3..ca425810c87a09 100644 --- a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/util/MappingUtil.java +++ b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/util/MappingUtil.java @@ -491,7 +491,7 @@ public static List> ingestBatchProposal( try { AspectsBatch batch = AspectsBatchImpl.builder() - .mcps(serviceProposals, auditStamp, opContext.getRetrieverContext().get()) + 
.mcps(serviceProposals, auditStamp, opContext.getRetrieverContext()) .build(); Map> resultMap = diff --git a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/v2/controller/EntityController.java b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/v2/controller/EntityController.java index 56a7955b9fe871..b1c5709ef01470 100644 --- a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/v2/controller/EntityController.java +++ b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/v2/controller/EntityController.java @@ -203,7 +203,7 @@ protected AspectsBatch toMCPBatch( objectMapper.writeValueAsString(aspect.getValue().get("systemMetadata")))); } - items.add(builder.build(opContext.getAspectRetrieverOpt().get())); + items.add(builder.build(opContext.getAspectRetriever())); } } } @@ -211,7 +211,7 @@ protected AspectsBatch toMCPBatch( return AspectsBatchImpl.builder() .items(items) - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .build(); } diff --git a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/v3/controller/EntityController.java b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/v3/controller/EntityController.java index ce7fd73f99b9e5..af13cd3aab0510 100644 --- a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/v3/controller/EntityController.java +++ b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/v3/controller/EntityController.java @@ -554,14 +554,14 @@ protected AspectsBatch toMCPBatch( GenericRecordUtils.JSON, aspectSpec)); - items.add(builder.build(opContext.getRetrieverContext().get().getAspectRetriever())); + items.add(builder.build(opContext.getRetrieverContext().getAspectRetriever())); } } } } return AspectsBatchImpl.builder() .items(items) - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .build(); } diff --git a/metadata-service/restli-api/src/main/idl/com.linkedin.entity.entitiesV2.restspec.json b/metadata-service/restli-api/src/main/idl/com.linkedin.entity.entitiesV2.restspec.json index 33cfba0f27802c..27731af9ffaa71 100644 --- a/metadata-service/restli-api/src/main/idl/com.linkedin.entity.entitiesV2.restspec.json +++ b/metadata-service/restli-api/src/main/idl/com.linkedin.entity.entitiesV2.restspec.json @@ -19,6 +19,10 @@ "name" : "aspects", "type" : "{ \"type\" : \"array\", \"items\" : \"string\" }", "optional" : true + }, { + "name" : "alwaysIncludeKeyAspect", + "type" : "boolean", + "optional" : true } ] }, { "method" : "batch_get", @@ -27,6 +31,10 @@ "name" : "aspects", "type" : "{ \"type\" : \"array\", \"items\" : \"string\" }", "optional" : true + }, { + "name" : "alwaysIncludeKeyAspect", + "type" : "boolean", + "optional" : true } ] } ], "entity" : { diff --git a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entitiesV2.snapshot.json b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entitiesV2.snapshot.json index 9bf7f97b34be18..9c5f41281fcfbe 100644 --- a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entitiesV2.snapshot.json +++ b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entitiesV2.snapshot.json @@ -182,6 +182,10 @@ "name" : "aspects", "type" : "{ \"type\" : \"array\", \"items\" : \"string\" }", "optional" : true + }, { + "name" : "alwaysIncludeKeyAspect", + "type" : "boolean", + 
"optional" : true } ] }, { "method" : "batch_get", @@ -190,6 +194,10 @@ "name" : "aspects", "type" : "{ \"type\" : \"array\", \"items\" : \"string\" }", "optional" : true + }, { + "name" : "alwaysIncludeKeyAspect", + "type" : "boolean", + "optional" : true } ] } ], "entity" : { diff --git a/metadata-service/restli-client-api/src/main/java/com/linkedin/entity/client/EntityClient.java b/metadata-service/restli-client-api/src/main/java/com/linkedin/entity/client/EntityClient.java index cf6e571cb8cbeb..b85f22e781d0b0 100644 --- a/metadata-service/restli-client-api/src/main/java/com/linkedin/entity/client/EntityClient.java +++ b/metadata-service/restli-client-api/src/main/java/com/linkedin/entity/client/EntityClient.java @@ -45,12 +45,34 @@ // Consider renaming this to datahub client. public interface EntityClient { + /** + * This version follows the legacy behavior of returning key aspects regardless of whether they + * exist + * + * @param opContext operation context + * @param entityName entity type + * @param urn urn id for the entity + * @param aspectNames set of aspects + * @return requested entity/aspects + */ + @Deprecated @Nullable - EntityResponse getV2( + default EntityResponse getV2( @Nonnull OperationContext opContext, @Nonnull String entityName, @Nonnull final Urn urn, @Nullable final Set aspectNames) + throws RemoteInvocationException, URISyntaxException { + return getV2(opContext, entityName, urn, aspectNames, true); + } + + @Nullable + EntityResponse getV2( + @Nonnull OperationContext opContext, + @Nonnull String entityName, + @Nonnull final Urn urn, + @Nullable final Set aspectNames, + @Nullable Boolean alwaysIncludeKeyAspect) throws RemoteInvocationException, URISyntaxException; @Nonnull @@ -58,12 +80,34 @@ EntityResponse getV2( Entity get(@Nonnull OperationContext opContext, @Nonnull final Urn urn) throws RemoteInvocationException; + /** + * This version follows the legacy behavior of returning key aspects regardless of whether they + * exist + * + * @param opContext operation context + * @param entityName entity type + * @param urns urn ids for the entities + * @param aspectNames set of aspects + * @return requested entity/aspects + */ + @Deprecated @Nonnull - Map batchGetV2( + default Map batchGetV2( @Nonnull OperationContext opContext, @Nonnull String entityName, @Nonnull final Set urns, @Nullable final Set aspectNames) + throws RemoteInvocationException, URISyntaxException { + return batchGetV2(opContext, entityName, urns, aspectNames, true); + } + + @Nonnull + Map batchGetV2( + @Nonnull OperationContext opContext, + @Nonnull String entityName, + @Nonnull final Set urns, + @Nullable final Set aspectNames, + @Nullable Boolean alwaysIncludeKeyAspect) throws RemoteInvocationException, URISyntaxException; @Nonnull @@ -589,27 +633,38 @@ void rollbackIngestion( @Nullable default Aspect getLatestAspectObject( - @Nonnull OperationContext opContext, @Nonnull Urn urn, @Nonnull String aspectName) + @Nonnull OperationContext opContext, + @Nonnull Urn urn, + @Nonnull String aspectName, + @Nullable Boolean alwaysIncludeKeyAspect) throws RemoteInvocationException, URISyntaxException { - return getLatestAspects(opContext, Set.of(urn), Set.of(aspectName)) + return getLatestAspects(opContext, Set.of(urn), Set.of(aspectName), alwaysIncludeKeyAspect) .getOrDefault(urn, Map.of()) .get(aspectName); } @Nonnull default Map> getLatestAspects( - @Nonnull OperationContext opContext, @Nonnull Set urns, @Nonnull Set aspectNames) + @Nonnull OperationContext opContext, + @Nonnull Set urns, + @Nonnull 
Set aspectNames, + @Nullable Boolean alwaysIncludeKeyAspect) throws RemoteInvocationException, URISyntaxException { String entityName = urns.stream().findFirst().map(Urn::getEntityType).get(); - return entityResponseToAspectMap(batchGetV2(opContext, entityName, urns, aspectNames)); + return entityResponseToAspectMap( + batchGetV2(opContext, entityName, urns, aspectNames, alwaysIncludeKeyAspect)); } @Nonnull default Map> getLatestSystemAspect( - @Nonnull OperationContext opContext, @Nonnull Set urns, @Nonnull Set aspectNames) + @Nonnull OperationContext opContext, + @Nonnull Set urns, + @Nonnull Set aspectNames, + @Nullable Boolean alwaysIncludeKeyAspect) throws RemoteInvocationException, URISyntaxException { String entityName = urns.stream().findFirst().map(Urn::getEntityType).get(); return entityResponseToSystemAspectMap( - batchGetV2(opContext, entityName, urns, aspectNames), opContext.getEntityRegistry()); + batchGetV2(opContext, entityName, urns, aspectNames, alwaysIncludeKeyAspect), + opContext.getEntityRegistry()); } } diff --git a/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/RestliEntityClient.java b/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/RestliEntityClient.java index 516902601f08a1..8d4c5e9228a71c 100644 --- a/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/RestliEntityClient.java +++ b/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/RestliEntityClient.java @@ -156,10 +156,15 @@ public EntityResponse getV2( @Nonnull OperationContext opContext, @Nonnull String entityName, @Nonnull final Urn urn, - @Nullable final Set aspectNames) + @Nullable final Set aspectNames, + @Nullable Boolean alwaysIncludeKeyAspect) throws RemoteInvocationException, URISyntaxException { final EntitiesV2GetRequestBuilder requestBuilder = - ENTITIES_V2_REQUEST_BUILDERS.get().aspectsParam(aspectNames).id(urn.toString()); + ENTITIES_V2_REQUEST_BUILDERS + .get() + .aspectsParam(aspectNames) + .id(urn.toString()) + .alwaysIncludeKeyAspectParam(alwaysIncludeKeyAspect); return sendClientRequest(requestBuilder, opContext.getSessionAuthentication()).getEntity(); } @@ -241,7 +246,8 @@ public Map batchGetV2( @Nonnull OperationContext opContext, @Nonnull String entityName, @Nonnull final Set urns, - @Nullable final Set aspectNames) + @Nullable final Set aspectNames, + @Nullable Boolean alwaysIncludeKeyAspect) throws RemoteInvocationException, URISyntaxException { Map responseMap = new HashMap<>(); @@ -260,6 +266,7 @@ public Map batchGetV2( ENTITIES_V2_REQUEST_BUILDERS .batchGet() .aspectsParam(aspectNames) + .alwaysIncludeKeyAspectParam(alwaysIncludeKeyAspect) .ids( batch.stream() .map(Urn::toString) diff --git a/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/SystemRestliEntityClient.java b/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/SystemRestliEntityClient.java index 2637e2d067c6d5..aa17f1951bc912 100644 --- a/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/SystemRestliEntityClient.java +++ b/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/SystemRestliEntityClient.java @@ -59,6 +59,6 @@ public Map batchGetV2NoCache( @Nonnull Set urns, @Nullable Set aspectNames) throws RemoteInvocationException, URISyntaxException { - return super.batchGetV2(opContext, entityName, urns, aspectNames); + return super.batchGetV2(opContext, entityName, urns, aspectNames, false); } } diff --git 
a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/AspectResource.java b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/AspectResource.java index 6033ead36f10ec..30b187da00e91a 100644 --- a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/AspectResource.java +++ b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/AspectResource.java @@ -309,7 +309,7 @@ private Task ingestProposals( log.debug("Proposals: {}", metadataChangeProposals); try { final AspectsBatch batch = AspectsBatchImpl.builder() - .mcps(metadataChangeProposals, auditStamp, opContext.getRetrieverContext().get(), + .mcps(metadataChangeProposals, auditStamp, opContext.getRetrieverContext(), opContext.getValidationContext().isAlternateValidation()) .build(); diff --git a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/EntityV2Resource.java b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/EntityV2Resource.java index 20209ddf44d643..896d81d3cbecc3 100644 --- a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/EntityV2Resource.java +++ b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/EntityV2Resource.java @@ -64,7 +64,8 @@ public class EntityV2Resource extends CollectionResourceTaskTemplate get( - @Nonnull String urnStr, @QueryParam(PARAM_ASPECTS) @Optional @Nullable String[] aspectNames) + @Nonnull String urnStr, @QueryParam(PARAM_ASPECTS) @Optional @Nullable String[] aspectNames, + @QueryParam(PARAM_ALWAYS_INCLUDE_KEY_ASPECT) @Optional @Nullable Boolean alwaysIncludeKeyAspect) throws URISyntaxException { log.debug("GET V2 {}", urnStr); final Urn urn = Urn.createFromString(urnStr); @@ -90,7 +91,7 @@ public Task get( ? opContext.getEntityAspectNames(entityName) : new HashSet<>(Arrays.asList(aspectNames)); try { - return _entityService.getEntityV2(opContext, entityName, urn, projectedAspects); + return _entityService.getEntityV2(opContext, entityName, urn, projectedAspects, alwaysIncludeKeyAspect == null || alwaysIncludeKeyAspect); } catch (Exception e) { throw new RuntimeException( String.format( @@ -106,7 +107,8 @@ public Task get( @WithSpan public Task> batchGet( @Nonnull Set urnStrs, - @QueryParam(PARAM_ASPECTS) @Optional @Nullable String[] aspectNames) + @QueryParam(PARAM_ASPECTS) @Optional @Nullable String[] aspectNames, + @QueryParam(PARAM_ALWAYS_INCLUDE_KEY_ASPECT) @Optional @Nullable Boolean alwaysIncludeKeyAspect) throws URISyntaxException { log.debug("BATCH GET V2 {}", urnStrs.toString()); final Set urns = new HashSet<>(); @@ -138,7 +140,7 @@ public Task> batchGet( ? 
opContext.getEntityAspectNames(entityName) : new HashSet<>(Arrays.asList(aspectNames)); try { - return _entityService.getEntitiesV2(opContext, entityName, urns, projectedAspects); + return _entityService.getEntitiesV2(opContext, entityName, urns, projectedAspects, alwaysIncludeKeyAspect == null || alwaysIncludeKeyAspect); } catch (Exception e) { throw new RuntimeException( String.format( diff --git a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/restli/RestliConstants.java b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/restli/RestliConstants.java index ef79a404c2145e..11df52ad66709e 100644 --- a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/restli/RestliConstants.java +++ b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/restli/RestliConstants.java @@ -1,5 +1,7 @@ package com.linkedin.metadata.resources.restli; +import javax.annotation.Nullable; + public final class RestliConstants { private RestliConstants() {} @@ -21,6 +23,7 @@ private RestliConstants() {} public static final String PARAM_INPUT = "input"; public static final String PARAM_MAX_HOPS = "maxHops"; public static final String PARAM_ASPECTS = "aspects"; + public static final String PARAM_ALWAYS_INCLUDE_KEY_ASPECT = "alwaysIncludeKeyAspect"; public static final String PARAM_FILTER = "filter"; public static final String PARAM_GROUP = "group"; public static final String PARAM_SORT = "sort"; diff --git a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/restli/RestliUtils.java b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/restli/RestliUtils.java index 185874fac1382d..a2092405da3ff6 100644 --- a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/restli/RestliUtils.java +++ b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/restli/RestliUtils.java @@ -8,6 +8,7 @@ import com.linkedin.parseq.Task; import com.linkedin.restli.common.HttpStatus; import com.linkedin.restli.server.RestLiServiceException; +import io.datahubproject.metadata.exception.ActorAccessException; import java.util.Optional; import java.util.function.Supplier; import javax.annotation.Nonnull; @@ -38,6 +39,8 @@ public static Task toTask(@Nonnull Supplier supplier) { if (throwable instanceof IllegalArgumentException || throwable.getCause() instanceof IllegalArgumentException) { finalException = badRequestException(throwable.getMessage()); + } else if (throwable.getCause() instanceof ActorAccessException) { + finalException = forbidden(throwable.getCause().getMessage()); } else if (throwable instanceof APIThrottleException) { finalException = apiThrottled(throwable.getMessage()); } else if (throwable instanceof RestLiServiceException) { @@ -109,4 +112,9 @@ public static RestLiServiceException invalidArgumentsException(@Nullable String public static RestLiServiceException apiThrottled(@Nullable String message) { return new RestLiServiceException(HttpStatus.S_429_TOO_MANY_REQUESTS, message); } + + @Nonnull + public static RestLiServiceException forbidden(@Nullable String message) { + return new RestLiServiceException(HttpStatus.S_403_FORBIDDEN, message); + } } diff --git a/metadata-service/restli-servlet-impl/src/test/java/com/linkedin/metadata/resources/entity/AspectResourceTest.java 
b/metadata-service/restli-servlet-impl/src/test/java/com/linkedin/metadata/resources/entity/AspectResourceTest.java index a39401c170a114..037b5b81fd4df0 100644 --- a/metadata-service/restli-servlet-impl/src/test/java/com/linkedin/metadata/resources/entity/AspectResourceTest.java +++ b/metadata-service/restli-servlet-impl/src/test/java/com/linkedin/metadata/resources/entity/AspectResourceTest.java @@ -100,7 +100,7 @@ public void testAsyncDefaultAspects() throws URISyntaxException { .recordTemplate(mcp.getAspect()) .auditStamp(new AuditStamp()) .metadataChangeProposal(mcp) - .build(opContext.getAspectRetrieverOpt().get()); + .build(opContext.getAspectRetriever()); when(aspectDao.runInTransactionWithRetry(any(), any(), anyInt())) .thenReturn( List.of(List.of( diff --git a/metadata-utils/src/main/java/com/linkedin/metadata/authorization/PoliciesConfig.java b/metadata-utils/src/main/java/com/linkedin/metadata/authorization/PoliciesConfig.java index d701c8fc8be035..80a11ab98bbf4a 100644 --- a/metadata-utils/src/main/java/com/linkedin/metadata/authorization/PoliciesConfig.java +++ b/metadata-utils/src/main/java/com/linkedin/metadata/authorization/PoliciesConfig.java @@ -219,6 +219,7 @@ public class PoliciesConfig { MANAGE_BUSINESS_ATTRIBUTE_PRIVILEGE, MANAGE_CONNECTIONS_PRIVILEGE, MANAGE_STRUCTURED_PROPERTIES_PRIVILEGE, + VIEW_STRUCTURED_PROPERTIES_PAGE_PRIVILEGE, MANAGE_DOCUMENTATION_FORMS_PRIVILEGE, MANAGE_FEATURES_PRIVILEGE, MANAGE_SYSTEM_OPERATIONS_PRIVILEGE); diff --git a/smoke-test/tests/tokens/revokable_access_token_test.py b/smoke-test/tests/tokens/revokable_access_token_test.py index af29437c051e19..006daae39333ed 100644 --- a/smoke-test/tests/tokens/revokable_access_token_test.py +++ b/smoke-test/tests/tokens/revokable_access_token_test.py @@ -9,6 +9,8 @@ wait_for_writes_to_sync, ) +from .token_utils import listUsers, removeUser + pytestmark = pytest.mark.no_cypress_suite1 # Disable telemetry @@ -490,45 +492,3 @@ def getAccessTokenMetadata(session, token): response.raise_for_status() return response.json() - - -def removeUser(session, urn): - # Remove user - json = { - "query": """mutation removeUser($urn: String!) { - removeUser(urn: $urn) - }""", - "variables": {"urn": urn}, - } - - response = session.post(f"{get_frontend_url()}/api/v2/graphql", json=json) - - response.raise_for_status() - return response.json() - - -def listUsers(session): - input = { - "start": "0", - "count": "20", - } - - # list users - json = { - "query": """query listUsers($input: ListUsersInput!) 
{ - listUsers(input: $input) { - start - count - total - users { - username - } - } - }""", - "variables": {"input": input}, - } - - response = session.post(f"{get_frontend_url()}/api/v2/graphql", json=json) - - response.raise_for_status() - return response.json() diff --git a/smoke-test/tests/tokens/session_access_token_test.py b/smoke-test/tests/tokens/session_access_token_test.py new file mode 100644 index 00000000000000..a16abc44453036 --- /dev/null +++ b/smoke-test/tests/tokens/session_access_token_test.py @@ -0,0 +1,173 @@ +import os +import time + +import pytest +from datahub.emitter.mcp import MetadataChangeProposalWrapper +from datahub.metadata.schema_classes import AuditStampClass, CorpUserStatusClass +from requests.exceptions import HTTPError + +from tests.utils import ( + get_admin_credentials, + get_frontend_url, + login_as, + wait_for_writes_to_sync, +) + +from .token_utils import getUserId, listUsers, removeUser + +pytestmark = pytest.mark.no_cypress_suite1 + +# Disable telemetry +os.environ["DATAHUB_TELEMETRY_ENABLED"] = "false" + +(admin_user, admin_pass) = get_admin_credentials() +user_urn = "urn:li:corpuser:sessionUser" + + +@pytest.fixture(scope="class") +def custom_user_session(): + """Fixture to execute setup before and tear down after all tests are run""" + admin_session = login_as(admin_user, admin_pass) + + res_data = removeUser(admin_session, user_urn) + assert res_data + assert "error" not in res_data + + # Test getting the invite token + get_invite_token_json = { + "query": """query getInviteToken($input: GetInviteTokenInput!) { + getInviteToken(input: $input){ + inviteToken + } + }""", + "variables": {"input": {}}, + } + + get_invite_token_response = admin_session.post( + f"{get_frontend_url()}/api/v2/graphql", json=get_invite_token_json + ) + get_invite_token_response.raise_for_status() + get_invite_token_res_data = get_invite_token_response.json() + + assert get_invite_token_res_data + assert get_invite_token_res_data["data"] + invite_token = get_invite_token_res_data["data"]["getInviteToken"]["inviteToken"] + assert invite_token is not None + assert "error" not in invite_token + + # Pass the invite token when creating the user + sign_up_json = { + "fullName": "Test Session User", + "email": "sessionUser", + "password": "sessionUser", + "title": "Data Engineer", + "inviteToken": invite_token, + } + + sign_up_response = admin_session.post( + f"{get_frontend_url()}/signUp", json=sign_up_json + ) + sign_up_response.raise_for_status() + assert sign_up_response + assert "error" not in sign_up_response + # Sleep for eventual consistency + wait_for_writes_to_sync() + + # signUp will override the session cookie to the new user to be signed up. + admin_session.cookies.clear() + admin_session = login_as(admin_user, admin_pass) + + # Make sure the created user is there. + res_data = listUsers(admin_session) + assert res_data["data"] + assert res_data["data"]["listUsers"] + assert {"username": "sessionUser"} in res_data["data"]["listUsers"]["users"] + + yield login_as(sign_up_json["email"], sign_up_json["password"]) + + # Delete created user + res_data = removeUser(admin_session, user_urn) + assert res_data + assert res_data["data"] + assert res_data["data"]["removeUser"] is True + # Sleep for eventual consistency + wait_for_writes_to_sync() + + # Make sure the created user is not there. 
+ res_data = listUsers(admin_session) + assert res_data["data"] + assert res_data["data"]["listUsers"] + assert {"username": "sessionUser"} not in res_data["data"]["listUsers"]["users"] + + +@pytest.mark.dependency() +def test_soft_delete(graph_client, custom_user_session): + # assert initial access + assert getUserId(custom_user_session) == {"urn": user_urn} + + graph_client.soft_delete_entity(urn=user_urn) + wait_for_writes_to_sync() + + with pytest.raises(HTTPError) as req_info: + getUserId(custom_user_session) + assert "403 Client Error: Forbidden" in str(req_info.value) + + # undo soft delete + graph_client.set_soft_delete_status(urn=user_urn, delete=False) + wait_for_writes_to_sync() + + +@pytest.mark.dependency(depends=["test_soft_delete"]) +def test_suspend(graph_client, custom_user_session): + # assert initial access + assert getUserId(custom_user_session) == {"urn": user_urn} + + graph_client.emit( + MetadataChangeProposalWrapper( + entityType="corpuser", + entityUrn=user_urn, + changeType="UPSERT", + aspectName="corpUserStatus", + aspect=CorpUserStatusClass( + status="SUSPENDED", + lastModified=AuditStampClass( + time=int(time.time() * 1000.0), actor="urn:li:corpuser:unknown" + ), + ), + ) + ) + wait_for_writes_to_sync() + + with pytest.raises(HTTPError) as req_info: + getUserId(custom_user_session) + assert "403 Client Error: Forbidden" in str(req_info.value) + + # undo suspend + graph_client.emit( + MetadataChangeProposalWrapper( + entityType="corpuser", + entityUrn=user_urn, + changeType="UPSERT", + aspectName="corpUserStatus", + aspect=CorpUserStatusClass( + status="ACTIVE", + lastModified=AuditStampClass( + time=int(time.time() * 1000.0), actor="urn:li:corpuser:unknown" + ), + ), + ) + ) + wait_for_writes_to_sync() + + +@pytest.mark.dependency(depends=["test_suspend"]) +def test_hard_delete(graph_client, custom_user_session): + # assert initial access + assert getUserId(custom_user_session) == {"urn": user_urn} + + graph_client.hard_delete_entity(urn=user_urn) + wait_for_writes_to_sync() + + with pytest.raises(HTTPError) as req_info: + getUserId(custom_user_session) + assert "403 Client Error: Forbidden" in str(req_info.value) diff --git a/smoke-test/tests/tokens/token_utils.py b/smoke-test/tests/tokens/token_utils.py new file mode 100644 index 00000000000000..10558e7085de72 --- /dev/null +++ b/smoke-test/tests/tokens/token_utils.py @@ -0,0 +1,53 @@ +from tests.utils import get_frontend_url + + +def getUserId(session): + response = session.get( + f"{get_frontend_url()}/openapi/operations/identity/user/urn", + params={"skipCache": "true"}, + ) + + response.raise_for_status() + return response.json() + + +def removeUser(session, urn): + # Remove user + json = { + "query": """mutation removeUser($urn: String!) { + removeUser(urn: $urn) + }""", + "variables": {"urn": urn}, + } + + response = session.post(f"{get_frontend_url()}/api/v2/graphql", json=json) + + response.raise_for_status() + return response.json() + + +def listUsers(session): + input = { + "start": "0", + "count": "20", + } + + # list users + json = { + "query": """query listUsers($input: ListUsersInput!) { + listUsers(input: $input) { + start + count + total + users { + username + } + } + }""", + "variables": {"input": input}, + } + + response = session.post(f"{get_frontend_url()}/api/v2/graphql", json=json) + + response.raise_for_status() + return response.json()
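
The RetrieverContext changes above add a required CachingAspectRetriever alongside the plain AspectRetriever, and the customized builder mirrors whichever retriever is supplied into the other slot when possible. A minimal sketch of that behavior, assuming the metadata-operation-context module is on the classpath; the RetrieverWiringExample class name is illustrative and not part of the patch.

import com.linkedin.metadata.aspect.CachingAspectRetriever;
import com.linkedin.metadata.aspect.GraphRetriever;
import com.linkedin.metadata.entity.SearchRetriever;
import io.datahubproject.metadata.context.RetrieverContext;

public class RetrieverWiringExample {
  public static void main(String[] args) {
    // Only a CachingAspectRetriever is supplied; the builder also installs it
    // as the plain AspectRetriever, so both getters return the same instance.
    RetrieverContext ctx =
        RetrieverContext.builder()
            .graphRetriever(GraphRetriever.EMPTY)
            .searchRetriever(SearchRetriever.EMPTY)
            .cachingAspectRetriever(CachingAspectRetriever.EMPTY)
            .build();

    System.out.println(ctx.getAspectRetriever() == ctx.getCachingAspectRetriever()); // true

    // RetrieverContext.EMPTY is shorthand for exactly this wiring.
    System.out.println(RetrieverContext.EMPTY.getCachingAspectRetriever() == CachingAspectRetriever.EMPTY); // true
  }
}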
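
OperationContextBuilder.build now takes a skipCache flag and rejects inactive actors with ActorAccessException, which both the OpenAPI exception handler and RestliUtils map to HTTP 403; the new /openapi/operations/identity/user/urn endpoint exercises that path end to end. A hedged sketch of calling it with the plain JDK HTTP client; the DATAHUB_GMS_URL and DATAHUB_TOKEN environment variables are assumptions for the example, not something this patch defines.

import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;

public class IdentityProbe {
  public static void main(String[] args) throws Exception {
    String base = System.getenv().getOrDefault("DATAHUB_GMS_URL", "http://localhost:8080");
    String token = System.getenv("DATAHUB_TOKEN"); // assumed personal access token

    HttpRequest request =
        HttpRequest.newBuilder()
            // skipCache=true makes the session build use the non-caching
            // AspectRetriever for the actor-active check instead of the
            // CachingAspectRetriever (see OperationContextBuilder above).
            .uri(URI.create(base + "/openapi/operations/identity/user/urn?skipCache=true"))
            .header("Authorization", "Bearer " + token)
            .GET()
            .build();

    HttpResponse<String> response =
        HttpClient.newHttpClient().send(request, HttpResponse.BodyHandlers.ofString());

    // Expect 200 with {"urn": "..."} for an active user; 403 once the user is
    // soft deleted or suspended, matching the new session smoke tests.
    System.out.println(response.statusCode() + " " + response.body());
  }
}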
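
EntityClient.getV2 and batchGetV2 gain an alwaysIncludeKeyAspect parameter, while the old four-argument signatures remain as deprecated defaults that keep the legacy behavior of returning key aspects whether or not they exist. A small sketch of opting out of that behavior; the StatusLookup helper and the choice of the status aspect are illustrative assumptions.

import com.linkedin.common.urn.Urn;
import com.linkedin.common.urn.UrnUtils;
import com.linkedin.entity.EntityResponse;
import com.linkedin.entity.client.EntityClient;
import io.datahubproject.metadata.context.OperationContext;
import java.util.Set;

public class StatusLookup {
  /** Result may be null, per the @Nullable contract on getV2. */
  public static EntityResponse fetchStatusOnly(
      EntityClient entityClient, OperationContext opContext, String urnStr) throws Exception {
    Urn urn = UrnUtils.getUrn(urnStr);
    // alwaysIncludeKeyAspect=false: no key aspect is returned for entities with
    // no stored aspects, unlike the deprecated overload, which defaults to true.
    return entityClient.getV2(opContext, urn.getEntityType(), urn, Set.of("status"), false);
  }
}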