diff --git a/.github/workflows/pr-labeler.yml b/.github/workflows/pr-labeler.yml index 1ae3edae7aa90b..7da20ece44f6d6 100644 --- a/.github/workflows/pr-labeler.yml +++ b/.github/workflows/pr-labeler.yml @@ -45,7 +45,8 @@ jobs: "Salman-Apptware", "mayurinehate", "noggi", - "skrydal" + "skrydal", + "kevinkarchacryl" ]'), github.actor ) diff --git a/build.gradle b/build.gradle index 6e6dadb7ebfa34..9ee756d41e11ef 100644 --- a/build.gradle +++ b/build.gradle @@ -56,7 +56,7 @@ buildscript { ext.hazelcastVersion = '5.3.6' ext.ebeanVersion = '15.5.2' ext.googleJavaFormatVersion = '1.18.1' - ext.openLineageVersion = '1.19.0' + ext.openLineageVersion = '1.24.2' ext.logbackClassicJava8 = '1.2.12' ext.docker_registry = 'acryldata' diff --git a/datahub-frontend/app/auth/sso/oidc/OidcCallbackLogic.java b/datahub-frontend/app/auth/sso/oidc/OidcCallbackLogic.java index ef5833f607efdb..113aeeb36551f0 100644 --- a/datahub-frontend/app/auth/sso/oidc/OidcCallbackLogic.java +++ b/datahub-frontend/app/auth/sso/oidc/OidcCallbackLogic.java @@ -130,8 +130,6 @@ public Object perform( CallContext ctx = ctxResult.getFirst(); Result result = (Result) ctxResult.getSecond(); - setContextRedirectUrl(ctx); - // Handle OIDC authentication errors. if (OidcResponseErrorHandler.isError(ctx)) { return OidcResponseErrorHandler.handleError(ctx); @@ -192,6 +190,9 @@ private Pair superPerform( } } + // Set the redirect url from cookie before creating action + setContextRedirectUrl(ctx); + action = this.redirectToOriginallyRequestedUrl(ctx, defaultUrl); } } catch (RuntimeException var20) { diff --git a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/AbstractMCLStep.java b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/AbstractMCLStep.java index 6c70aee88675c5..cd7947ce3c11aa 100644 --- a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/AbstractMCLStep.java +++ b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/AbstractMCLStep.java @@ -1,13 +1,12 @@ package com.linkedin.datahub.upgrade.system; -import static com.linkedin.metadata.Constants.DATA_HUB_UPGRADE_RESULT_ASPECT_NAME; - import com.linkedin.common.urn.Urn; import com.linkedin.datahub.upgrade.UpgradeContext; import com.linkedin.datahub.upgrade.UpgradeStep; import com.linkedin.datahub.upgrade.UpgradeStepResult; import com.linkedin.datahub.upgrade.impl.DefaultUpgradeStepResult; import com.linkedin.events.metadata.ChangeType; +import com.linkedin.metadata.aspect.SystemAspect; import com.linkedin.metadata.boot.BootstrapStep; import com.linkedin.metadata.entity.AspectDao; import com.linkedin.metadata.entity.EntityService; @@ -16,10 +15,13 @@ import com.linkedin.metadata.entity.ebean.PartitionedStream; import com.linkedin.metadata.entity.restoreindices.RestoreIndicesArgs; import com.linkedin.metadata.utils.AuditStampUtils; +import com.linkedin.upgrade.DataHubUpgradeResult; import com.linkedin.upgrade.DataHubUpgradeState; import com.linkedin.util.Pair; import io.datahubproject.metadata.context.OperationContext; import java.util.List; +import java.util.Map; +import java.util.Optional; import java.util.concurrent.ExecutionException; import java.util.concurrent.Future; import java.util.function.Function; @@ -33,6 +35,8 @@ */ @Slf4j public abstract class AbstractMCLStep implements UpgradeStep { + public static final String LAST_URN_KEY = "lastUrn"; + private final OperationContext opContext; private final EntityService entityService; private final AspectDao aspectDao; @@ -70,10 +74,30 @@ protected Urn 
getUpgradeIdUrn() { @Override public Function<UpgradeContext, UpgradeStepResult> executable() { return (context) -> { + // Resume state + Optional<DataHubUpgradeResult> prevResult = + context.upgrade().getUpgradeResult(opContext, getUpgradeIdUrn(), entityService); + String resumeUrn = + prevResult + .filter( + result -> + DataHubUpgradeState.IN_PROGRESS.equals(result.getState()) + && result.getResult() != null + && result.getResult().containsKey(LAST_URN_KEY)) + .map(result -> result.getResult().get(LAST_URN_KEY)) + .orElse(null); + if (resumeUrn != null) { + log.info("{}: Resuming from URN: {}", getUpgradeIdUrn(), resumeUrn); + } // re-using for configuring the sql scan RestoreIndicesArgs args = - new RestoreIndicesArgs().aspectName(getAspectName()).batchSize(batchSize).limit(limit); + new RestoreIndicesArgs() + .aspectName(getAspectName()) + .batchSize(batchSize) + .lastUrn(resumeUrn) + .urnBasedPagination(resumeUrn != null) + .limit(limit); if (getUrnLike() != null) { args = args.urnLike(getUrnLike()); @@ -86,40 +110,62 @@ public Function<UpgradeContext, UpgradeStepResult> executable() { batch -> { log.info("Processing batch({}) of size {}.", getAspectName(), batchSize); - List<Pair<Future<?>, Boolean>> futures; - + List<Pair<Future<?>, SystemAspect>> futures; futures = EntityUtils.toSystemAspectFromEbeanAspects( opContext.getRetrieverContext().get(), batch.collect(Collectors.toList())) .stream() .map( - systemAspect -> - entityService.alwaysProduceMCLAsync( - opContext, - systemAspect.getUrn(), - systemAspect.getUrn().getEntityType(), - getAspectName(), - systemAspect.getAspectSpec(), - null, - systemAspect.getRecordTemplate(), - null, - systemAspect - .getSystemMetadata() - .setRunId(id()) - .setLastObserved(System.currentTimeMillis()), - AuditStampUtils.createDefaultAuditStamp(), - ChangeType.UPSERT)) - .collect(Collectors.toList()); - - futures.forEach( - f -> { - try { - f.getFirst().get(); - } catch (InterruptedException | ExecutionException e) { - throw new RuntimeException(e); - } - }); + systemAspect -> { + Pair<Future<?>, Boolean> future = + entityService.alwaysProduceMCLAsync( + opContext, + systemAspect.getUrn(), + systemAspect.getUrn().getEntityType(), + getAspectName(), + systemAspect.getAspectSpec(), + null, + systemAspect.getRecordTemplate(), + null, + systemAspect + .getSystemMetadata() + .setRunId(id()) + .setLastObserved(System.currentTimeMillis()), + AuditStampUtils.createDefaultAuditStamp(), + ChangeType.UPSERT); + return Pair.<Future<?>, SystemAspect>of( + future.getFirst(), systemAspect); + }) + .toList(); + + SystemAspect lastAspect = + futures.stream() + .map( + f -> { + try { + f.getFirst().get(); + return f.getSecond(); + } catch (InterruptedException | ExecutionException e) { + throw new RuntimeException(e); + } + }) + .reduce((a, b) -> b) + .orElse(null); + + // record progress + if (lastAspect != null) { + log.info( + "{}: Saving state. Last urn:{}", getUpgradeIdUrn(), lastAspect.getUrn()); + context + .upgrade() + .setUpgradeResult( + opContext, + getUpgradeIdUrn(), + entityService, + DataHubUpgradeState.IN_PROGRESS, + Map.of(LAST_URN_KEY, lastAspect.getUrn().toString())); + } if (batchDelayMs > 0) { log.info("Sleeping for {} ms", batchDelayMs); @@ -142,12 +188,23 @@ public Function<UpgradeContext, UpgradeStepResult> executable() { @Override /** Returns whether the upgrade should be skipped. */ public boolean skip(UpgradeContext context) { - boolean previouslyRun = - entityService.exists( - opContext, getUpgradeIdUrn(), DATA_HUB_UPGRADE_RESULT_ASPECT_NAME, true); - if (previouslyRun) { - log.info("{} was already run.
Skipping.", id()); + Optional prevResult = + context.upgrade().getUpgradeResult(opContext, getUpgradeIdUrn(), entityService); + + boolean previousRunFinal = + prevResult + .filter( + result -> + DataHubUpgradeState.SUCCEEDED.equals(result.getState()) + || DataHubUpgradeState.ABORTED.equals(result.getState())) + .isPresent(); + + if (previousRunFinal) { + log.info( + "{} was already run. State: {} Skipping.", + id(), + prevResult.map(DataHubUpgradeResult::getState)); } - return previouslyRun; + return previousRunFinal; } } diff --git a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/schemafield/GenerateSchemaFieldsFromSchemaMetadataStep.java b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/schemafield/GenerateSchemaFieldsFromSchemaMetadataStep.java index eece83f4ab713e..55bc8edbf6a768 100644 --- a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/schemafield/GenerateSchemaFieldsFromSchemaMetadataStep.java +++ b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/schemafield/GenerateSchemaFieldsFromSchemaMetadataStep.java @@ -1,5 +1,6 @@ package com.linkedin.datahub.upgrade.system.schemafield; +import static com.linkedin.datahub.upgrade.system.AbstractMCLStep.LAST_URN_KEY; import static com.linkedin.metadata.Constants.APP_SOURCE; import static com.linkedin.metadata.Constants.DATASET_ENTITY_NAME; import static com.linkedin.metadata.Constants.SCHEMA_METADATA_ASPECT_NAME; @@ -61,7 +62,6 @@ */ @Slf4j public class GenerateSchemaFieldsFromSchemaMetadataStep implements UpgradeStep { - private static final String LAST_URN_KEY = "lastUrn"; private static final List REQUIRED_ASPECTS = List.of(SCHEMA_METADATA_ASPECT_NAME, STATUS_ASPECT_NAME); diff --git a/datahub-upgrade/src/test/java/com/linkedin/datahub/upgrade/DatahubUpgradeNonBlockingTest.java b/datahub-upgrade/src/test/java/com/linkedin/datahub/upgrade/DatahubUpgradeNonBlockingTest.java index f340e688ad7f77..21bc6b725cba2b 100644 --- a/datahub-upgrade/src/test/java/com/linkedin/datahub/upgrade/DatahubUpgradeNonBlockingTest.java +++ b/datahub-upgrade/src/test/java/com/linkedin/datahub/upgrade/DatahubUpgradeNonBlockingTest.java @@ -1,14 +1,18 @@ package com.linkedin.datahub.upgrade; +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.ArgumentMatchers.anyInt; import static org.mockito.ArgumentMatchers.eq; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.times; import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.when; import static org.testng.Assert.assertEquals; import static org.testng.Assert.assertFalse; import static org.testng.Assert.assertTrue; import static org.testng.AssertJUnit.assertNotNull; +import com.linkedin.data.template.StringMap; import com.linkedin.datahub.upgrade.impl.DefaultUpgradeManager; import com.linkedin.datahub.upgrade.system.SystemUpdateNonBlocking; import com.linkedin.datahub.upgrade.system.bootstrapmcps.BootstrapMCPStep; @@ -20,17 +24,30 @@ import com.linkedin.metadata.entity.AspectDao; import com.linkedin.metadata.entity.EntityService; import com.linkedin.metadata.entity.EntityServiceImpl; +import com.linkedin.metadata.entity.ebean.EbeanAspectV2; +import com.linkedin.metadata.entity.ebean.PartitionedStream; import com.linkedin.metadata.entity.restoreindices.RestoreIndicesArgs; import com.linkedin.mxe.Topics; +import com.linkedin.upgrade.DataHubUpgradeResult; +import com.linkedin.upgrade.DataHubUpgradeState; +import com.linkedin.util.Pair; import 
io.datahubproject.metadata.context.OperationContext; import io.datahubproject.test.metadata.context.TestOperationContexts; +import java.sql.Timestamp; +import java.util.Arrays; import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.concurrent.CompletableFuture; import java.util.stream.Collectors; +import java.util.stream.Stream; import javax.inject.Named; +import org.mockito.ArgumentCaptor; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.boot.test.context.SpringBootTest; import org.springframework.test.context.ActiveProfiles; import org.springframework.test.context.testng.AbstractTestNGSpringContextTests; +import org.testng.annotations.BeforeClass; import org.testng.annotations.Test; @ActiveProfiles("test") @@ -63,7 +80,12 @@ public class DatahubUpgradeNonBlockingTest extends AbstractTestNGSpringContextTe @Autowired private EntityServiceImpl entityService; - @Autowired private OperationContext opContext; + private OperationContext opContext; + + @BeforeClass + public void init() { + opContext = TestOperationContexts.systemContextNoValidate(); + } @Test public void testSystemUpdateNonBlockingInit() { @@ -81,10 +103,13 @@ public void testSystemUpdateNonBlockingInit() { } @Test - public void testReindexDataJobViaNodesCLLPaging() { + public void testReindexDataJobViaNodesCLLPagingArgs() { EntityService mockService = mock(EntityService.class); AspectDao mockAspectDao = mock(AspectDao.class); + PartitionedStream mockStream = mock(PartitionedStream.class); + when(mockStream.partition(anyInt())).thenReturn(Stream.empty()); + when(mockAspectDao.streamAspectBatches(any(RestoreIndicesArgs.class))).thenReturn(mockStream); ReindexDataJobViaNodesCLL cllUpgrade = new ReindexDataJobViaNodesCLL(opContext, mockService, mockAspectDao, true, 10, 0, 0); @@ -102,9 +127,79 @@ public void testReindexDataJobViaNodesCLLPaging() { .batchSize(10) .limit(0) .aspectName("dataJobInputOutput") + .urnBasedPagination(false) + .lastUrn(null) .urnLike("urn:li:dataJob:%"))); } + @Test + public void testReindexDataJobViaNodesCLLResumePaging() throws Exception { + // Mock services + EntityService mockService = mock(EntityService.class); + AspectDao mockAspectDao = mock(AspectDao.class); + + // Create test data + EbeanAspectV2 aspect1 = createMockEbeanAspect("urn:li:dataJob:job1", "dataJobInputOutput"); + EbeanAspectV2 aspect2 = createMockEbeanAspect("urn:li:dataJob:job2", "dataJobInputOutput"); + EbeanAspectV2 aspect3 = createMockEbeanAspect("urn:li:dataJob:job3", "dataJobInputOutput"); + List initialBatch = Arrays.asList(aspect1, aspect2); + List resumeBatch = Arrays.asList(aspect3); + + // Mock the stream for first batch + PartitionedStream initialStream = mock(PartitionedStream.class); + when(initialStream.partition(anyInt())).thenReturn(Stream.of(initialBatch.stream())); + + // Mock the stream for second batch + PartitionedStream resumeStream = mock(PartitionedStream.class); + when(resumeStream.partition(anyInt())).thenReturn(Stream.of(resumeBatch.stream())); + + // Setup the AspectDao using Answer to handle null safely + when(mockAspectDao.streamAspectBatches(any(RestoreIndicesArgs.class))) + .thenAnswer( + invocation -> { + RestoreIndicesArgs args = invocation.getArgument(0); + if (args.lastUrn() == null) { + return initialStream; + } else if ("urn:li:dataJob:job2".equals(args.lastUrn())) { + return resumeStream; + } + return mock(PartitionedStream.class); + }); + + // Mock successful MCL production + when(mockService.alwaysProduceMCLAsync( + 
any(), any(), any(), any(), any(), any(), any(), any(), any(), any(), any())) + .thenReturn(Pair.of(CompletableFuture.completedFuture(null), true)); + + // Create the upgrade + ReindexDataJobViaNodesCLL cllUpgrade = + new ReindexDataJobViaNodesCLL(opContext, mockService, mockAspectDao, true, 2, 0, 0); + + // Initial Run + cllUpgrade.steps().get(0).executable().apply(createMockInitialUpgrade()); + + // Resumed + cllUpgrade.steps().get(0).executable().apply(createMockResumeUpgrade()); + + // Use ArgumentCaptor to verify the calls + ArgumentCaptor argsCaptor = + ArgumentCaptor.forClass(RestoreIndicesArgs.class); + verify(mockAspectDao, times(2)).streamAspectBatches(argsCaptor.capture()); + + List capturedArgs = argsCaptor.getAllValues(); + + // Verify both the initial and resume calls were made with correct arguments + assertEquals(capturedArgs.get(0).lastUrn(), null); + assertEquals(capturedArgs.get(0).urnBasedPagination(), false); + assertEquals(capturedArgs.get(1).lastUrn(), "urn:li:dataJob:job2"); + assertEquals(capturedArgs.get(1).urnBasedPagination(), true); + + // Verify MCL production was called for each aspect + verify(mockService, times(3)) + .alwaysProduceMCLAsync( + any(), any(), any(), any(), any(), any(), any(), any(), any(), any(), any()); + } + @Test public void testNonBlockingBootstrapMCP() { List mcpTemplate = @@ -123,4 +218,54 @@ public void testNonBlockingBootstrapMCP() { .map(update -> update.getMcpTemplate().getName()) .collect(Collectors.toSet()))); } + + private UpgradeContext createMockInitialUpgrade() { + // Mock the Upgrade instance + Upgrade mockUpgrade = mock(Upgrade.class); + + // Configure the mock upgrade to return no previous result + when(mockUpgrade.getUpgradeResult(any(), any(), any())).thenReturn(Optional.empty()); + + UpgradeContext mockInitialContext = mock(UpgradeContext.class); + when(mockInitialContext.opContext()).thenReturn(opContext); + when(mockInitialContext.upgrade()).thenReturn(mockUpgrade); + when(mockInitialContext.report()).thenReturn(mock(UpgradeReport.class)); + + return mockInitialContext; + } + + private UpgradeContext createMockResumeUpgrade() { + // Mock the Upgrade instance + Upgrade mockUpgrade = mock(Upgrade.class); + DataHubUpgradeResult mockPrevResult = mock(DataHubUpgradeResult.class); + + // Configure the mock previous result + when(mockPrevResult.getState()).thenReturn(DataHubUpgradeState.IN_PROGRESS); + when(mockPrevResult.getResult()) + .thenReturn(new StringMap(Map.of("lastUrn", "urn:li:dataJob:job2"))); + + // Configure the mock upgrade to return our previous result + when(mockUpgrade.getUpgradeResult(any(), any(), any())).thenReturn(Optional.of(mockPrevResult)); + + UpgradeContext mockResumeContext = mock(UpgradeContext.class); + when(mockResumeContext.opContext()).thenReturn(opContext); + when(mockResumeContext.upgrade()).thenReturn(mockUpgrade); + when(mockResumeContext.report()).thenReturn(mock(UpgradeReport.class)); + + return mockResumeContext; + } + + private static EbeanAspectV2 createMockEbeanAspect(String urn, String aspectName) { + Timestamp now = new Timestamp(System.currentTimeMillis()); + return new EbeanAspectV2( + urn, + aspectName, + 0L, + "{}", // metadata + now, // createdOn + "urn:li:corpuser:testUser", // createdBy + null, // createdFor + null // systemMetadata + ); + } } diff --git a/datahub-upgrade/src/test/java/com/linkedin/datahub/upgrade/UpgradeCliApplicationTestConfiguration.java b/datahub-upgrade/src/test/java/com/linkedin/datahub/upgrade/UpgradeCliApplicationTestConfiguration.java index 
81d883d8ce36b7..5b7b8756f11fb1 100644 --- a/datahub-upgrade/src/test/java/com/linkedin/datahub/upgrade/UpgradeCliApplicationTestConfiguration.java +++ b/datahub-upgrade/src/test/java/com/linkedin/datahub/upgrade/UpgradeCliApplicationTestConfiguration.java @@ -19,17 +19,17 @@ @Import(value = {SystemAuthenticationFactory.class}) public class UpgradeCliApplicationTestConfiguration { - @MockBean private UpgradeCli upgradeCli; + @MockBean public UpgradeCli upgradeCli; - @MockBean private Database ebeanServer; + @MockBean public Database ebeanServer; - @MockBean private SearchService searchService; + @MockBean public SearchService searchService; - @MockBean private GraphService graphService; + @MockBean public GraphService graphService; - @MockBean private EntityRegistry entityRegistry; + @MockBean public EntityRegistry entityRegistry; - @MockBean ConfigEntityRegistry configEntityRegistry; + @MockBean public ConfigEntityRegistry configEntityRegistry; @MockBean public EntityIndexBuilders entityIndexBuilders; diff --git a/datahub-web-react/src/app/entity/group/GroupMembers.tsx b/datahub-web-react/src/app/entity/group/GroupMembers.tsx index 147c3f8030d0e0..28e81b438d4cb4 100644 --- a/datahub-web-react/src/app/entity/group/GroupMembers.tsx +++ b/datahub-web-react/src/app/entity/group/GroupMembers.tsx @@ -137,12 +137,13 @@ export default function GroupMembers({ urn, pageSize, isExternalGroup, onChangeM }, 3000); }; - const onRemoveMember = (memberUrn: string) => { + const onRemoveMember = (memberEntity: CorpUser) => { + const memberName = entityRegistry.getDisplayName(EntityType.CorpUser, memberEntity); Modal.confirm({ title: `Confirm Group Member Removal`, - content: `Are you sure you want to remove this user from the group?`, + content: `Are you sure you want to remove ${memberName} user from the group?`, onOk() { - removeGroupMember(memberUrn); + removeGroupMember(memberEntity?.urn); }, onCancel() {}, okText: 'Yes', @@ -155,7 +156,7 @@ export default function GroupMembers({ urn, pageSize, isExternalGroup, onChangeM const total = relationships?.total || 0; const groupMembers = relationships?.relationships?.map((rel) => rel.entity as CorpUser) || []; - const getItems = (urnID: string): MenuProps['items'] => { + const getItems = (userEntity: CorpUser): MenuProps['items'] => { return [ { key: 'make', @@ -169,7 +170,7 @@ export default function GroupMembers({ urn, pageSize, isExternalGroup, onChangeM { key: 'remove', disabled: isExternalGroup, - onClick: () => onRemoveMember(urnID), + onClick: () => onRemoveMember(userEntity), label: ( Remove from Group @@ -210,7 +211,7 @@ export default function GroupMembers({ urn, pageSize, isExternalGroup, onChangeM - + diff --git a/datahub-web-react/src/app/entity/shared/siblingUtils.ts b/datahub-web-react/src/app/entity/shared/siblingUtils.ts index 2f50dc99df191b..aa9e4bcb5e46e1 100644 --- a/datahub-web-react/src/app/entity/shared/siblingUtils.ts +++ b/datahub-web-react/src/app/entity/shared/siblingUtils.ts @@ -5,6 +5,7 @@ import * as QueryString from 'query-string'; import { Dataset, Entity, Maybe, SiblingProperties } from '../../../types.generated'; import { GenericEntityProperties } from './types'; import { useIsShowSeparateSiblingsEnabled } from '../../useAppConfig'; +import { downgradeV2FieldPath } from '../dataset/profile/schema/utils/utils'; export function stripSiblingsFromEntity(entity: any) { return { @@ -55,16 +56,30 @@ const combineMerge = (target, source, options) => { return destination; }; -function convertObjectKeysToLowercase(object: Record) { - 
return Object.fromEntries(Object.entries(object).map(([key, value]) => [key.toLowerCase(), value])); +// this function is responsible for normalizing object keys to make sure merging on key matches keys appropriately +function normalizeObjectKeys(object: Record, isSchemaField = false) { + return Object.fromEntries( + Object.entries(object).map(([key, value]) => { + let normalizedKey = key.toLowerCase(); + if (isSchemaField) { + normalizedKey = downgradeV2FieldPath(normalizedKey) || normalizedKey; + } + return [normalizedKey, value]; + }), + ); } // use when you want to merge an array of objects by key in the object as opposed to by index of array -const mergeArrayOfObjectsByKey = (destinationArray: any[], sourceArray: any[], key: string) => { - const destination = convertObjectKeysToLowercase(keyBy(destinationArray, key)); - const source = convertObjectKeysToLowercase(keyBy(sourceArray, key)); +const mergeArrayOfObjectsByKey = (destinationArray: any[], sourceArray: any[], key: string, isSchemaField = false) => { + const destination = normalizeObjectKeys(keyBy(destinationArray, key), isSchemaField); + const source = normalizeObjectKeys(keyBy(sourceArray, key), isSchemaField); - return values(merge(destination, source)); + return values( + merge(destination, source, { + arrayMerge: combineMerge, + customMerge, + }), + ); }; const mergeTags = (destinationArray, sourceArray, _options) => { @@ -88,7 +103,7 @@ const mergeOwners = (destinationArray, sourceArray, _options) => { }; const mergeFields = (destinationArray, sourceArray, _options) => { - return mergeArrayOfObjectsByKey(destinationArray, sourceArray, 'fieldPath'); + return mergeArrayOfObjectsByKey(destinationArray, sourceArray, 'fieldPath', true); }; function getArrayMergeFunction(key) { @@ -112,7 +127,7 @@ function getArrayMergeFunction(key) { } } -const customMerge = (isPrimary, key) => { +function customMerge(isPrimary, key) { if (key === 'upstream' || key === 'downstream') { return (_secondary, primary) => primary; } @@ -145,7 +160,7 @@ const customMerge = (isPrimary, key) => { customMerge: customMerge.bind({}, isPrimary), }); }; -}; +} export const getEntitySiblingData = (baseEntity: T): Maybe => { if (!baseEntity) { diff --git a/datahub-web-react/src/app/entity/shared/tabs/Documentation/components/editor/Editor.tsx b/datahub-web-react/src/app/entity/shared/tabs/Documentation/components/editor/Editor.tsx index fe2a8c51f9377b..8ee0f637094d6d 100644 --- a/datahub-web-react/src/app/entity/shared/tabs/Documentation/components/editor/Editor.tsx +++ b/datahub-web-react/src/app/entity/shared/tabs/Documentation/components/editor/Editor.tsx @@ -50,26 +50,26 @@ export const Editor = forwardRef((props: EditorProps, ref) => { const { manager, state, getContext } = useRemirror({ extensions: () => [ new BlockquoteExtension(), - new BoldExtension(), - new BulletListExtension(), + new BoldExtension({}), + new BulletListExtension({}), new CodeBlockExtension({ syntaxTheme: 'base16_ateliersulphurpool_light' }), new CodeExtension(), - new DataHubMentionsExtension(), - new DropCursorExtension(), + new DataHubMentionsExtension({}), + new DropCursorExtension({}), new HardBreakExtension(), - new HeadingExtension(), - new HistoryExtension(), - new HorizontalRuleExtension(), + new HeadingExtension({}), + new HistoryExtension({}), + new HorizontalRuleExtension({}), new ImageExtension({ enableResizing: !readOnly }), new ItalicExtension(), new LinkExtension({ autoLink: true, defaultTarget: '_blank' }), - new ListItemExtension(), + new ListItemExtension({}), new 
MarkdownExtension({ htmlSanitizer: DOMPurify.sanitize, htmlToMarkdown, markdownToHtml }), new OrderedListExtension(), new UnderlineExtension(), new StrikeExtension(), new TableExtension({ resizable: false }), - ...(readOnly ? [] : [new HistoryExtension()]), + ...(readOnly ? [] : [new HistoryExtension({})]), ], content, stringHandler: 'markdown', diff --git a/docs-website/filterTagIndexes.json b/docs-website/filterTagIndexes.json index 2309593b2c3b9f..b269f23cccd667 100644 --- a/docs-website/filterTagIndexes.json +++ b/docs-website/filterTagIndexes.json @@ -231,6 +231,17 @@ "Features": "Stateful Ingestion, UI Ingestion, Status Aspect" } }, + { + "Path": "https://hudi.apache.org/docs/syncing_datahub/", + "imgPath": "img/logos/platforms/hudi.png", + "Title": "Apache Hudi", + "Description": "Apache Hudi is an open-source data lake framework that provides ACID transactions, efficient upserts, time travel queries, and incremental data processing for large-scale datasets.", + "tags": { + "Platform Type": "Datastore", + "Connection Type": "Pull", + "Features": "" + } + }, { "Path": "docs/generated/ingestion/sources/iceberg", "imgPath": "img/logos/platforms/iceberg.png", diff --git a/docs-website/static/img/logos/platforms/hudi.png b/docs-website/static/img/logos/platforms/hudi.png new file mode 100644 index 00000000000000..c5e79bcc86ce34 Binary files /dev/null and b/docs-website/static/img/logos/platforms/hudi.png differ diff --git a/docs-website/yarn.lock b/docs-website/yarn.lock index 4e457abd50af7a..9c82b27c3b61f3 100644 --- a/docs-website/yarn.lock +++ b/docs-website/yarn.lock @@ -1827,7 +1827,7 @@ "@docusaurus/theme-search-algolia" "2.4.3" "@docusaurus/types" "2.4.3" -"@docusaurus/react-loadable@5.5.2", "react-loadable@npm:@docusaurus/react-loadable@5.5.2": +"@docusaurus/react-loadable@5.5.2": version "5.5.2" resolved "https://registry.yarnpkg.com/@docusaurus/react-loadable/-/react-loadable-5.5.2.tgz#81aae0db81ecafbdaee3651f12804580868fa6ce" integrity sha512-A3dYjdBGuy0IGT+wyLIGIKLRE+sAk1iNk0f1HjNDysO7u8lhL4N3VEm+FAubmJbAztn94F7MxBTPmnixbiyFdQ== @@ -4757,9 +4757,9 @@ cross-fetch@^3.1.5: node-fetch "^2.6.12" cross-spawn@^7.0.3: - version "7.0.3" - resolved "https://registry.yarnpkg.com/cross-spawn/-/cross-spawn-7.0.3.tgz#f73a85b9d5d41d045551c177e2882d4ac85728a6" - integrity sha512-iRDPJKUPVEND7dHPO8rkbOnPpyDygcDFtWjpeWNCgy8WP2rXcxXL8TskReQl6OrB2G7+UJrags1q15Fudc7G6w== + version "7.0.6" + resolved "https://registry.yarnpkg.com/cross-spawn/-/cross-spawn-7.0.6.tgz#8a58fe78f00dcd70c370451759dfbfaf03e8ee9f" + integrity sha512-uV2QOWP2nWzsy2aMp8aRibhi9dlzF5Hgh5SHaB9OiTGEyDTiJJyx0uy51QXdyWbtAHNua4XJzUKca3OzKUd3vA== dependencies: path-key "^3.1.0" shebang-command "^2.0.0" @@ -9713,6 +9713,14 @@ react-loadable-ssr-addon-v5-slorber@^1.0.1: dependencies: "@babel/runtime" "^7.10.3" +"react-loadable@npm:@docusaurus/react-loadable@5.5.2": + version "5.5.2" + resolved "https://registry.yarnpkg.com/@docusaurus/react-loadable/-/react-loadable-5.5.2.tgz#81aae0db81ecafbdaee3651f12804580868fa6ce" + integrity sha512-A3dYjdBGuy0IGT+wyLIGIKLRE+sAk1iNk0f1HjNDysO7u8lhL4N3VEm+FAubmJbAztn94F7MxBTPmnixbiyFdQ== + dependencies: + "@types/react" "*" + prop-types "^15.6.2" + react-markdown@^8.0.6: version "8.0.7" resolved "https://registry.yarnpkg.com/react-markdown/-/react-markdown-8.0.7.tgz#c8dbd1b9ba5f1c5e7e5f2a44de465a3caafdf89b" diff --git a/docs/api/restli/restli-overview.md b/docs/api/restli/restli-overview.md index 22b913d9a25df4..3e9ab00b522670 100644 --- a/docs/api/restli/restli-overview.md +++ 
b/docs/api/restli/restli-overview.md @@ -1156,7 +1156,7 @@ curl -X POST 'http://localhost:8080/entities?action=search' \ "and": [ { "field": "title", - "value": "Baz Chart 1", + "values": ["Baz Chart 1"], "condition": "EQUAL" } ] @@ -1261,7 +1261,7 @@ curl -X POST 'http://localhost:8080/entities?action=autocomplete' \ "and": [ { "field": "tool", - "value": "looker", + "values": ["looker"], "condition": "EQUAL" } ] diff --git a/docs/deploy/aws.md b/docs/deploy/aws.md index 49b0ea1d69ae19..c625ba26a3865a 100644 --- a/docs/deploy/aws.md +++ b/docs/deploy/aws.md @@ -53,7 +53,18 @@ ip-192-168-64-56.us-west-2.compute.internal Ready 3h v1.18.9-ek ip-192-168-8-126.us-west-2.compute.internal Ready 3h v1.18.9-eks-d1db3c ``` -Once your cluster is running, make sure to install the EBS CSI driver, Core DNS, and VPC CNI plugin for Kubernetes. [add-ons](https://docs.aws.amazon.com/eks/latest/userguide/eks-add-ons.html) +### Install EBS CSI driver, Core DNS, and VPC CNI plugin for Kubernetes + +Once your cluster is running, make sure to install the EBS CSI driver, Core DNS, and VPC CNI [add-ons](https://docs.aws.amazon.com/eks/latest/userguide/eks-add-ons.html) for Kubernetes. By default, the Core DNS and VPC CNI plugins are installed; you need to install the EBS CSI driver manually. It should look like this in your console when you are done. + +![Screenshot 2024-11-15 at 4 42 09 PM](https://github.com/user-attachments/assets/5a9a2af0-e804-4896-85bb-dc5834208719) + +### Add the AmazonEBSCSIDriverPolicy policy to the EKS node group role + +Next, add the AmazonEBSCSIDriverPolicy policy to the IAM role of the EKS node group. Find the role by going to the Compute tab in your EKS cluster and clicking the IAM entry for the EKS node group, then attach the AmazonEBSCSIDriverPolicy policy.
+ +![Screenshot 2024-11-15 at 4 42 29 PM](https://github.com/user-attachments/assets/8971c8d6-8543-408b-9a07-814aacb2532d) +![Screenshot 2024-11-15 at 4 42 46 PM](https://github.com/user-attachments/assets/397f9131-5f13-4d9f-a664-9921d9bbf44e) ## Setup DataHub using Helm diff --git a/entity-registry/build.gradle b/entity-registry/build.gradle index 2dedea1f16d99c..22e5b601d39db2 100644 --- a/entity-registry/build.gradle +++ b/entity-registry/build.gradle @@ -25,6 +25,8 @@ dependencies { because("previous versions are vulnerable to CVE-2022-25857") } } + api project(path: ':li-utils') + api project(path: ':li-utils', configuration: "dataTemplate") dataModel project(':li-utils') annotationProcessor externalDependency.lombok diff --git a/li-utils/src/main/javaPegasus/com/linkedin/common/urn/UrnUtils.java b/li-utils/src/main/javaPegasus/com/linkedin/common/urn/UrnUtils.java index 89f0cd8fbc9791..0a2400badfc627 100644 --- a/li-utils/src/main/javaPegasus/com/linkedin/common/urn/UrnUtils.java +++ b/li-utils/src/main/javaPegasus/com/linkedin/common/urn/UrnUtils.java @@ -27,28 +27,6 @@ public static DatasetUrn toDatasetUrn( new DataPlatformUrn(platformName), datasetName, FabricType.valueOf(origin.toUpperCase())); } - /** - * Convert fabric String to FabricType - * - * @param fabric PROD, CORP, EI, DEV, LIT, PRIME - * @return FabricType - */ - @Nonnull - public static FabricType toFabricType(@Nonnull String fabric) { - switch (fabric.toUpperCase()) { - case "PROD": - return FabricType.PROD; - case "CORP": - return FabricType.CORP; - case "EI": - return FabricType.EI; - case "DEV": - return FabricType.DEV; - default: - throw new IllegalArgumentException("Unsupported Fabric Type: " + fabric); - } - } - public static Urn getUrn(String urnStr) { try { return Urn.createFromString(urnStr); diff --git a/metadata-ingestion/docs/sources/sac/sac_pre.md b/metadata-ingestion/docs/sources/sac/sac_pre.md index c62cd81fa27534..624eb61f716f92 100644 --- a/metadata-ingestion/docs/sources/sac/sac_pre.md +++ b/metadata-ingestion/docs/sources/sac/sac_pre.md @@ -4,6 +4,7 @@ - Purpose: API Access - Access: + - Story Listing - Data Import Service - Authorization Grant: Client Credentials diff --git a/metadata-ingestion/scripts/avro_codegen.py b/metadata-ingestion/scripts/avro_codegen.py index e2dd5151439923..e5792da32fb5d7 100644 --- a/metadata-ingestion/scripts/avro_codegen.py +++ b/metadata-ingestion/scripts/avro_codegen.py @@ -769,7 +769,7 @@ def generate( import importlib from typing import TYPE_CHECKING -from datahub._codegen.aspect import _Aspect +from datahub._codegen.aspect import _Aspect as _Aspect from datahub.utilities.docs_build import IS_SPHINX_BUILD from datahub.utilities._custom_package_loader import get_custom_models_package @@ -802,7 +802,7 @@ def generate( from datahub.utilities.docs_build import IS_SPHINX_BUILD from datahub.utilities._custom_package_loader import get_custom_urns_package -from datahub.utilities.urns._urn_base import Urn # noqa: F401 +from datahub.utilities.urns._urn_base import Urn as Urn # noqa: F401 _custom_package_path = get_custom_urns_package() diff --git a/metadata-ingestion/scripts/modeldocgen.py b/metadata-ingestion/scripts/modeldocgen.py index ee5f06cb801baa..998947e5caa954 100644 --- a/metadata-ingestion/scripts/modeldocgen.py +++ b/metadata-ingestion/scripts/modeldocgen.py @@ -14,7 +14,11 @@ import click from datahub.configuration.common import ConfigEnum, PermissiveConfigModel -from datahub.emitter.mce_builder import make_data_platform_urn, make_dataset_urn +from 
datahub.emitter.mce_builder import ( + make_data_platform_urn, + make_dataset_urn, + make_schema_field_urn, +) from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.emitter.rest_emitter import DatahubRestEmitter from datahub.ingestion.api.common import PipelineContext, RecordEnvelope @@ -442,10 +446,10 @@ def strip_types(field_path: str) -> str: name=relnship_name, foreignDataset=foreign_dataset_urn, foreignFields=[ - f"urn:li:schemaField:({foreign_dataset_urn}, urn)" + make_schema_field_urn(foreign_dataset_urn, "urn") ], sourceFields=[ - f"urn:li:schemaField:({source_dataset_urn},{f_field.fieldPath})" + make_schema_field_urn(source_dataset_urn, f_field.fieldPath) ], ) foreign_keys.append(fkey) diff --git a/metadata-ingestion/setup.cfg b/metadata-ingestion/setup.cfg index c095420e4e3f30..057779bc87c622 100644 --- a/metadata-ingestion/setup.cfg +++ b/metadata-ingestion/setup.cfg @@ -31,7 +31,7 @@ exclude = __pycache__ per-file-ignores = # imported but unused - __init__.py: F401 + __init__.py: F401, I250 ban-relative-imports = true [mypy] @@ -53,6 +53,14 @@ disallow_untyped_defs = no # try to be a bit more strict in certain areas of the codebase [mypy-datahub.*] ignore_missing_imports = no +implicit_reexport = no +[mypy-datahub.metadata.*] +# TODO: Remove this once all the code has been updated. +implicit_reexport = yes +[mypy-datahub.ingestion.*] +# TODO: Remove this once all the code has been updated. +implicit_reexport = yes + [mypy-datahub_provider.*] ignore_missing_imports = no [mypy-tests.*] diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index cb99d4955dd0bc..1b0f2fb7607e5c 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -592,22 +592,26 @@ "memray", } -base_dev_requirements = { - *base_requirements, - *framework_common, - *mypy_stubs, - *s3_base, +lint_requirements = { # This is pinned only to avoid spurious errors in CI. # We should make an effort to keep it up to date. 
- "black==22.12.0", - "coverage>=5.1", - "faker>=18.4.0", + "black==23.3.0", "flake8>=6.0.0", "flake8-tidy-imports>=4.3.0", "flake8-bugbear==23.3.12", "isort>=5.7.0", "mypy==1.10.1", +} + +base_dev_requirements = { + *base_requirements, + *framework_common, + *mypy_stubs, + *s3_base, + *lint_requirements, *test_api_requirements, + "coverage>=5.1", + "faker>=18.4.0", "pytest-asyncio>=0.16.0", "pytest-cov>=2.8.1", "pytest-random-order~=1.1.0", @@ -934,6 +938,7 @@ ), "cloud": ["acryl-datahub-cloud"], "dev": list(dev_requirements), + "lint": list(lint_requirements), "testing-utils": list(test_api_requirements), # To import `datahub.testing` "integration-tests": list(full_test_dev_requirements), "debug": list(debug_requirements), diff --git a/metadata-ingestion/src/datahub/api/circuit_breaker/__init__.py b/metadata-ingestion/src/datahub/api/circuit_breaker/__init__.py index 27317826264b85..0b04bfa4025a1b 100644 --- a/metadata-ingestion/src/datahub/api/circuit_breaker/__init__.py +++ b/metadata-ingestion/src/datahub/api/circuit_breaker/__init__.py @@ -12,3 +12,10 @@ ) requests_logger.setLevel(logging.WARNING) + +__all__ = [ + "AssertionCircuitBreaker", + "AssertionCircuitBreakerConfig", + "OperationCircuitBreaker", + "OperationCircuitBreakerConfig", +] diff --git a/metadata-ingestion/src/datahub/api/circuit_breaker/circuit_breaker.py b/metadata-ingestion/src/datahub/api/circuit_breaker/circuit_breaker.py index a3c54046faf681..7c1180536a90fb 100644 --- a/metadata-ingestion/src/datahub/api/circuit_breaker/circuit_breaker.py +++ b/metadata-ingestion/src/datahub/api/circuit_breaker/circuit_breaker.py @@ -6,7 +6,7 @@ from gql.transport.requests import RequestsHTTPTransport from pydantic import Field -from datahub.configuration import ConfigModel +from datahub.configuration.common import ConfigModel logger = logging.getLogger(__name__) diff --git a/metadata-ingestion/src/datahub/api/entities/datajob/__init__.py b/metadata-ingestion/src/datahub/api/entities/datajob/__init__.py index 6d85a1569cb63d..3a073005968222 100644 --- a/metadata-ingestion/src/datahub/api/entities/datajob/__init__.py +++ b/metadata-ingestion/src/datahub/api/entities/datajob/__init__.py @@ -1,2 +1,5 @@ from datahub.api.entities.datajob.dataflow import DataFlow from datahub.api.entities.datajob.datajob import DataJob + +# TODO: Remove this and start importing directly from the inner files. 
+__all__ = ["DataFlow", "DataJob"] diff --git a/metadata-ingestion/src/datahub/api/entities/datajob/dataflow.py b/metadata-ingestion/src/datahub/api/entities/datajob/dataflow.py index f2436d56d5aca1..e169c07445e969 100644 --- a/metadata-ingestion/src/datahub/api/entities/datajob/dataflow.py +++ b/metadata-ingestion/src/datahub/api/entities/datajob/dataflow.py @@ -3,7 +3,6 @@ from typing import Callable, Dict, Iterable, List, Optional, Set, cast import datahub.emitter.mce_builder as builder -from datahub.configuration.source_common import ALL_ENV_TYPES from datahub.emitter.generic_emitter import Emitter from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.metadata.schema_classes import ( @@ -114,7 +113,7 @@ def generate_tags_aspect(self) -> List[GlobalTagsClass]: def _get_env(self) -> Optional[str]: env: Optional[str] = None - if self.env and self.env.upper() in ALL_ENV_TYPES: + if self.env and self.env.upper() in builder.ALL_ENV_TYPES: env = self.env.upper() else: logger.debug( diff --git a/metadata-ingestion/src/datahub/api/entities/datajob/datajob.py b/metadata-ingestion/src/datahub/api/entities/datajob/datajob.py index 0f5d18c20e055b..4958a68caa95fe 100644 --- a/metadata-ingestion/src/datahub/api/entities/datajob/datajob.py +++ b/metadata-ingestion/src/datahub/api/entities/datajob/datajob.py @@ -3,7 +3,6 @@ from typing import Callable, Dict, Iterable, List, Optional, Set import datahub.emitter.mce_builder as builder -from datahub.configuration.source_common import ALL_ENV_TYPES from datahub.emitter.generic_emitter import Emitter from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.metadata.schema_classes import ( @@ -109,7 +108,7 @@ def generate_mcp( self, materialize_iolets: bool = True ) -> Iterable[MetadataChangeProposalWrapper]: env: Optional[str] = None - if self.flow_urn.cluster.upper() in ALL_ENV_TYPES: + if self.flow_urn.cluster.upper() in builder.ALL_ENV_TYPES: env = self.flow_urn.cluster.upper() else: logger.debug( diff --git a/metadata-ingestion/src/datahub/api/entities/structuredproperties/structuredproperties.py b/metadata-ingestion/src/datahub/api/entities/structuredproperties/structuredproperties.py index b48c655015d825..56e02e4329055a 100644 --- a/metadata-ingestion/src/datahub/api/entities/structuredproperties/structuredproperties.py +++ b/metadata-ingestion/src/datahub/api/entities/structuredproperties/structuredproperties.py @@ -190,7 +190,6 @@ def create(file: str, graph: Optional[DataHubGraph] = None) -> None: @classmethod def from_datahub(cls, graph: DataHubGraph, urn: str) -> "StructuredProperties": - with StructuredPropertiesConfig.use_graph(graph): structured_property: Optional[ StructuredPropertyDefinitionClass diff --git a/metadata-ingestion/src/datahub/api/graphql/__init__.py b/metadata-ingestion/src/datahub/api/graphql/__init__.py index e8c8d22bbb93df..d818b19092fcbe 100644 --- a/metadata-ingestion/src/datahub/api/graphql/__init__.py +++ b/metadata-ingestion/src/datahub/api/graphql/__init__.py @@ -1,2 +1,4 @@ from datahub.api.graphql.assertion import Assertion from datahub.api.graphql.operation import Operation + +__all__ = ["Assertion", "Operation"] diff --git a/metadata-ingestion/src/datahub/cli/check_cli.py b/metadata-ingestion/src/datahub/cli/check_cli.py index 39ed1b2bfea087..fbe07b64f0e154 100644 --- a/metadata-ingestion/src/datahub/cli/check_cli.py +++ b/metadata-ingestion/src/datahub/cli/check_cli.py @@ -268,7 +268,9 @@ def sql_lineage( ) logger.debug("Sql parsing debug info: %s", lineage.debug_info) - if 
lineage.debug_info.error: + if lineage.debug_info.table_error: + raise lineage.debug_info.table_error + elif lineage.debug_info.error: logger.debug("Sql parsing error details", exc_info=lineage.debug_info.error) click.echo(lineage.json(indent=4)) diff --git a/metadata-ingestion/src/datahub/cli/put_cli.py b/metadata-ingestion/src/datahub/cli/put_cli.py index 989b1a6d02fd01..0a40a9f4ccf92d 100644 --- a/metadata-ingestion/src/datahub/cli/put_cli.py +++ b/metadata-ingestion/src/datahub/cli/put_cli.py @@ -6,11 +6,12 @@ from datahub.cli.cli_utils import post_entity from datahub.configuration.config_loader import load_config_file -from datahub.emitter.mcp import MetadataChangeProposalWrapper, SystemMetadataClass +from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.ingestion.graph.client import get_default_graph from datahub.metadata.schema_classes import ( DataPlatformInfoClass as DataPlatformInfo, PlatformTypeClass, + SystemMetadataClass, ) from datahub.telemetry import telemetry from datahub.upgrade import upgrade diff --git a/metadata-ingestion/src/datahub/configuration/__init__.py b/metadata-ingestion/src/datahub/configuration/__init__.py index 008d788072d0a5..21979829a4453d 100644 --- a/metadata-ingestion/src/datahub/configuration/__init__.py +++ b/metadata-ingestion/src/datahub/configuration/__init__.py @@ -1,5 +1,4 @@ from datahub.configuration.common import ( - ConfigModel, - ConfigurationMechanism, - DynamicTypedConfig, + ConfigModel as ConfigModel, + DynamicTypedConfig as DynamicTypedConfig, ) diff --git a/metadata-ingestion/src/datahub/configuration/common.py b/metadata-ingestion/src/datahub/configuration/common.py index 0ce7127b440534..4fdf564162410c 100644 --- a/metadata-ingestion/src/datahub/configuration/common.py +++ b/metadata-ingestion/src/datahub/configuration/common.py @@ -21,7 +21,7 @@ from pydantic.fields import Field from typing_extensions import Protocol -from datahub.configuration._config_enum import ConfigEnum +from datahub.configuration._config_enum import ConfigEnum as ConfigEnum # noqa: I250 from datahub.configuration.pydantic_migration_helpers import PYDANTIC_VERSION_2 from datahub.utilities.dedup_list import deduplicate_list diff --git a/metadata-ingestion/src/datahub/configuration/json_loader.py b/metadata-ingestion/src/datahub/configuration/json_loader.py index 35667eb5951fc7..6ecb741be528d1 100644 --- a/metadata-ingestion/src/datahub/configuration/json_loader.py +++ b/metadata-ingestion/src/datahub/configuration/json_loader.py @@ -1,7 +1,7 @@ import json from typing import IO -from datahub.configuration import ConfigurationMechanism +from datahub.configuration.common import ConfigurationMechanism class JsonConfigurationMechanism(ConfigurationMechanism): diff --git a/metadata-ingestion/src/datahub/configuration/source_common.py b/metadata-ingestion/src/datahub/configuration/source_common.py index ad12447532335f..44c737f1bd13d4 100644 --- a/metadata-ingestion/src/datahub/configuration/source_common.py +++ b/metadata-ingestion/src/datahub/configuration/source_common.py @@ -1,14 +1,10 @@ -from typing import Dict, Optional, Set +from typing import Dict, Optional from pydantic import validator from pydantic.fields import Field from datahub.configuration.common import ConfigModel -from datahub.emitter.enum_helpers import get_enum_options -from datahub.metadata.schema_classes import FabricTypeClass - -DEFAULT_ENV = FabricTypeClass.PROD -ALL_ENV_TYPES: Set[str] = set(get_enum_options(FabricTypeClass)) +from datahub.emitter.mce_builder import 
ALL_ENV_TYPES, DEFAULT_ENV class PlatformInstanceConfigMixin(ConfigModel): diff --git a/metadata-ingestion/src/datahub/configuration/yaml.py b/metadata-ingestion/src/datahub/configuration/yaml.py index 1f1172836f7448..c069845e1de119 100644 --- a/metadata-ingestion/src/datahub/configuration/yaml.py +++ b/metadata-ingestion/src/datahub/configuration/yaml.py @@ -2,7 +2,7 @@ import yaml -from datahub.configuration import ConfigurationMechanism +from datahub.configuration.common import ConfigurationMechanism class YamlConfigurationMechanism(ConfigurationMechanism): diff --git a/metadata-ingestion/src/datahub/emitter/mce_builder.py b/metadata-ingestion/src/datahub/emitter/mce_builder.py index 63b03db7f5b604..69946c575908b5 100644 --- a/metadata-ingestion/src/datahub/emitter/mce_builder.py +++ b/metadata-ingestion/src/datahub/emitter/mce_builder.py @@ -13,6 +13,7 @@ Any, List, Optional, + Set, Tuple, Type, TypeVar, @@ -24,7 +25,6 @@ import typing_inspect from avrogen.dict_wrapper import DictWrapper -from datahub.configuration.source_common import DEFAULT_ENV from datahub.emitter.enum_helpers import get_enum_options from datahub.metadata.schema_classes import ( AssertionKeyClass, @@ -35,6 +35,7 @@ DatasetKeyClass, DatasetLineageTypeClass, DatasetSnapshotClass, + FabricTypeClass, GlobalTagsClass, GlossaryTermAssociationClass, GlossaryTermsClass as GlossaryTerms, @@ -56,6 +57,9 @@ logger = logging.getLogger(__name__) Aspect = TypeVar("Aspect", bound=AspectAbstract) +DEFAULT_ENV = FabricTypeClass.PROD +ALL_ENV_TYPES: Set[str] = set(get_enum_options(FabricTypeClass)) + DEFAULT_FLOW_CLUSTER = "prod" UNKNOWN_USER = "urn:li:corpuser:unknown" DATASET_URN_TO_LOWER: bool = ( diff --git a/metadata-ingestion/src/datahub/entrypoints.py b/metadata-ingestion/src/datahub/entrypoints.py index d088380d5d38c4..85968f050a3716 100644 --- a/metadata-ingestion/src/datahub/entrypoints.py +++ b/metadata-ingestion/src/datahub/entrypoints.py @@ -13,13 +13,10 @@ generate_access_token, make_shim_command, ) -from datahub.cli.config_utils import ( - DATAHUB_CONFIG_PATH, - get_boolean_env_variable, - write_gms_config, -) +from datahub.cli.config_utils import DATAHUB_CONFIG_PATH, write_gms_config from datahub.cli.delete_cli import delete from datahub.cli.docker_cli import docker +from datahub.cli.env_utils import get_boolean_env_variable from datahub.cli.exists_cli import exists from datahub.cli.get_cli import get from datahub.cli.ingest_cli import ingest diff --git a/metadata-ingestion/src/datahub/ingestion/api/decorators.py b/metadata-ingestion/src/datahub/ingestion/api/decorators.py index b390ffb9dd0362..d32c0b85ceef4c 100644 --- a/metadata-ingestion/src/datahub/ingestion/api/decorators.py +++ b/metadata-ingestion/src/datahub/ingestion/api/decorators.py @@ -3,7 +3,10 @@ from typing import Callable, Dict, Optional, Type from datahub.ingestion.api.common import PipelineContext -from datahub.ingestion.api.source import Source, SourceCapability +from datahub.ingestion.api.source import ( # noqa: I250 + Source, + SourceCapability as SourceCapability, +) def config_class(config_cls: Type) -> Callable[[Type], Type]: diff --git a/metadata-ingestion/src/datahub/ingestion/extractor/json_schema_util.py b/metadata-ingestion/src/datahub/ingestion/extractor/json_schema_util.py index bcf077154343c8..88d1fcc52e2196 100644 --- a/metadata-ingestion/src/datahub/ingestion/extractor/json_schema_util.py +++ b/metadata-ingestion/src/datahub/ingestion/extractor/json_schema_util.py @@ -23,7 +23,7 @@ RecordTypeClass, SchemaFieldClass as SchemaField, 
SchemaFieldDataTypeClass, - SchemaMetadataClass as SchemaMetadata, + SchemaMetadataClass, StringTypeClass, UnionTypeClass, ) @@ -665,13 +665,13 @@ def get_schema_metadata( name: str, json_schema: Dict[Any, Any], raw_schema_string: Optional[str] = None, -) -> SchemaMetadata: +) -> SchemaMetadataClass: json_schema_as_string = raw_schema_string or json.dumps(json_schema) md5_hash: str = md5(json_schema_as_string.encode()).hexdigest() schema_fields = list(JsonSchemaTranslator.get_fields_from_schema(json_schema)) - schema_metadata = SchemaMetadata( + schema_metadata = SchemaMetadataClass( schemaName=name, platform=f"urn:li:dataPlatform:{platform}", version=0, diff --git a/metadata-ingestion/src/datahub/ingestion/extractor/protobuf_util.py b/metadata-ingestion/src/datahub/ingestion/extractor/protobuf_util.py index f62bb184252d98..e947aff384871d 100644 --- a/metadata-ingestion/src/datahub/ingestion/extractor/protobuf_util.py +++ b/metadata-ingestion/src/datahub/ingestion/extractor/protobuf_util.py @@ -32,7 +32,7 @@ OneofDescriptor, ) -from datahub.metadata.com.linkedin.pegasus2avro.schema import ( +from datahub.metadata.schema_classes import ( ArrayTypeClass, BooleanTypeClass, BytesTypeClass, @@ -41,8 +41,8 @@ MapTypeClass, NumberTypeClass, RecordTypeClass, - SchemaField, - SchemaFieldDataType, + SchemaFieldClass as SchemaField, + SchemaFieldDataTypeClass as SchemaFieldDataType, StringTypeClass, UnionTypeClass, ) diff --git a/metadata-ingestion/src/datahub/ingestion/graph/client.py b/metadata-ingestion/src/datahub/ingestion/graph/client.py index c90ac93eee2cc2..759aebcfd46b0a 100644 --- a/metadata-ingestion/src/datahub/ingestion/graph/client.py +++ b/metadata-ingestion/src/datahub/ingestion/graph/client.py @@ -33,7 +33,9 @@ from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.emitter.rest_emitter import DatahubRestEmitter from datahub.emitter.serialization_helper import post_json_transform -from datahub.ingestion.graph.config import DatahubClientConfig +from datahub.ingestion.graph.config import ( # noqa: I250; TODO: Remove this alias + DatahubClientConfig as DatahubClientConfig, +) from datahub.ingestion.graph.connections import ( connections_gql, get_id_from_connection_urn, diff --git a/metadata-ingestion/src/datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py b/metadata-ingestion/src/datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py index 33bfb63feb3fd7..5961a553a14943 100644 --- a/metadata-ingestion/src/datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +++ b/metadata-ingestion/src/datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py @@ -11,9 +11,8 @@ redact_raw_config, ) from datahub.emitter.aspect import JSON_CONTENT_TYPE -from datahub.emitter.mce_builder import datahub_guid +from datahub.emitter.mce_builder import datahub_guid, make_data_platform_urn from datahub.emitter.mcp import MetadataChangeProposalWrapper -from datahub.emitter.mcp_builder import make_data_platform_urn from datahub.ingestion.api.common import PipelineContext, RecordEnvelope from datahub.ingestion.api.pipeline_run_listener import PipelineRunListener from datahub.ingestion.api.sink import NoopWriteCallback, Sink diff --git a/metadata-ingestion/src/datahub/ingestion/run/pipeline.py b/metadata-ingestion/src/datahub/ingestion/run/pipeline.py index 2e56d5866efa89..7c3a42c3e08931 100644 --- a/metadata-ingestion/src/datahub/ingestion/run/pipeline.py +++ b/metadata-ingestion/src/datahub/ingestion/run/pipeline.py @@ -428,6 +428,7 @@ 
def create( def _time_to_print(self) -> bool: self.num_intermediate_workunits += 1 current_time = int(time.time()) + # TODO: Replace with ProgressTimer. if current_time - self.last_time_printed > _REPORT_PRINT_INTERVAL_SECONDS: # we print self.num_intermediate_workunits = 0 diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py index 447811f873ea8f..6f3008ccfd6923 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py @@ -1139,7 +1139,6 @@ def gen_schema_metadata( columns: List[BigqueryColumn], dataset_name: BigqueryTableIdentifier, ) -> MetadataWorkUnit: - foreign_keys: List[ForeignKeyConstraint] = [] # Foreign keys only make sense for tables if isinstance(table, BigqueryTable): diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py index b542992a7924a0..321b1b6207fabf 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py @@ -934,7 +934,6 @@ def gen_lineage_workunits_for_external_table( ddl: Optional[str], graph: Optional[DataHubGraph] = None, ) -> Iterable[MetadataWorkUnit]: - if not ddl: return @@ -972,7 +971,6 @@ def get_lineage_for_external_table( source_uris: List[str], graph: Optional[DataHubGraph] = None, ) -> Optional[UpstreamLineageClass]: - upstreams_list: List[UpstreamClass] = [] fine_grained_lineages: List[FineGrainedLineageClass] = [] gcs_urns: Set[str] = set() diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/queries_extractor.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/queries_extractor.py index 91d55ad879e04a..08c9beaa73c53b 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/queries_extractor.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/queries_extractor.py @@ -2,7 +2,7 @@ import logging import pathlib import tempfile -from datetime import datetime, timezone +from datetime import datetime, timedelta, timezone from typing import Collection, Dict, Iterable, List, Optional, TypedDict from google.cloud.bigquery import Client @@ -49,6 +49,7 @@ FileBackedDict, FileBackedList, ) +from datahub.utilities.progress_timer import ProgressTimer from datahub.utilities.time import datetime_to_ts_millis logger = logging.getLogger(__name__) @@ -270,27 +271,25 @@ def get_workunits_internal( # Preprocessing stage that deduplicates the queries using query hash per usage bucket # Note: FileBackedDict is an ordered dictionary, so the order of execution of # queries is inherently maintained - queries_deduped: FileBackedDict[Dict[int, ObservedQuery]] - queries_deduped = self.deduplicate_queries(queries) + queries_deduped: FileBackedDict[ + Dict[int, ObservedQuery] + ] = self.deduplicate_queries(queries) self.report.num_unique_queries = len(queries_deduped) logger.info(f"Found {self.report.num_unique_queries} unique queries") with self.report.audit_log_load_timer, queries_deduped: - last_log_time = datetime.now() - last_report_time = datetime.now() + log_timer = ProgressTimer(timedelta(minutes=1)) + report_timer = ProgressTimer(timedelta(minutes=5)) + for i, (_, query_instances) in enumerate(queries_deduped.items()): for query in 
query_instances.values(): - now = datetime.now() - if (now - last_log_time).total_seconds() >= 60: + if log_timer.should_report(): logger.info( f"Added {i} deduplicated query log entries to SQL aggregator" ) - last_log_time = now - if (now - last_report_time).total_seconds() >= 300: - if self.report.sql_aggregator: - logger.info(self.report.sql_aggregator.as_string()) - last_report_time = now + if report_timer.should_report() and self.report.sql_aggregator: + logger.info(self.report.sql_aggregator.as_string()) self.aggregator.add(query) diff --git a/metadata-ingestion/src/datahub/ingestion/source/cassandra/cassandra.py b/metadata-ingestion/src/datahub/ingestion/source/cassandra/cassandra.py index 6a5236563f48db..dcdccc08ce0483 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/cassandra/cassandra.py +++ b/metadata-ingestion/src/datahub/ingestion/source/cassandra/cassandra.py @@ -332,7 +332,6 @@ def _extract_columns_from_table( def _extract_views_from_keyspace( self, keyspace_name: str ) -> Iterable[MetadataWorkUnit]: - views: List[CassandraView] = self.cassandra_api.get_views(keyspace_name) for view in views: view_name: str = view.view_name diff --git a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py index b5d0ed42e651ea..4598ae388b827d 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py @@ -117,9 +117,8 @@ ViewPropertiesClass, ) from datahub.metadata.urns import DatasetUrn -from datahub.sql_parsing.schema_resolver import SchemaResolver +from datahub.sql_parsing.schema_resolver import SchemaInfo, SchemaResolver from datahub.sql_parsing.sqlglot_lineage import ( - SchemaInfo, SqlParsingDebugInfo, SqlParsingResult, infer_output_schema, diff --git a/metadata-ingestion/src/datahub/ingestion/source/delta_lake/config.py b/metadata-ingestion/src/datahub/ingestion/source/delta_lake/config.py index 81a54d1327d05a..d2b4a576953daf 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/delta_lake/config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/delta_lake/config.py @@ -6,9 +6,8 @@ from pydantic import Field from typing_extensions import Literal -from datahub.configuration.common import AllowDenyPattern +from datahub.configuration.common import AllowDenyPattern, ConfigModel from datahub.configuration.source_common import ( - ConfigModel, EnvConfigMixin, PlatformInstanceConfigMixin, ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/dremio/dremio_api.py b/metadata-ingestion/src/datahub/ingestion/source/dremio/dremio_api.py index db83dde7cf6131..7b9ccb52acbef4 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/dremio/dremio_api.py +++ b/metadata-ingestion/src/datahub/ingestion/source/dremio/dremio_api.py @@ -566,7 +566,6 @@ def get_all_tables_and_columns(self, containers: Deque) -> List[Dict]: return tables def validate_schema_format(self, schema): - if "." 
in schema: schema_path = self.get( url=f"/catalog/{self.get_dataset_id(schema=schema, dataset='')}" @@ -687,7 +686,6 @@ def traverse_path(location_id: str, entity_path: List[str]) -> List: response.get("entityType") == DremioEntityContainerType.FOLDER.value.lower() ): - containers.append( { "id": location_id, diff --git a/metadata-ingestion/src/datahub/ingestion/source/dremio/dremio_config.py b/metadata-ingestion/src/datahub/ingestion/source/dremio/dremio_config.py index 9d6f65b95554e7..d966d575c03320 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/dremio/dremio_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/dremio/dremio_config.py @@ -121,7 +121,6 @@ class DremioSourceConfig( EnvConfigMixin, PlatformInstanceConfigMixin, ): - domain: Optional[str] = Field( default=None, description="Domain for all source objects.", diff --git a/metadata-ingestion/src/datahub/ingestion/source/dremio/dremio_source.py b/metadata-ingestion/src/datahub/ingestion/source/dremio/dremio_source.py index cd6ba441b5c93b..5b96845ec04961 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/dremio/dremio_source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/dremio/dremio_source.py @@ -198,7 +198,6 @@ def _build_source_map(self) -> Dict[str, Dict]: source_platform_name = source_name for mapping in self.config.source_mappings or []: - if re.search(mapping.source_name, source_type, re.IGNORECASE): source_platform_name = mapping.source_name.lower() diff --git a/metadata-ingestion/src/datahub/ingestion/source/dynamodb/dynamodb.py b/metadata-ingestion/src/datahub/ingestion/source/dynamodb/dynamodb.py index 9faa12d5d9bb61..cb3f0dd9cf29f4 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/dynamodb/dynamodb.py +++ b/metadata-ingestion/src/datahub/ingestion/source/dynamodb/dynamodb.py @@ -52,24 +52,22 @@ from datahub.ingestion.source.state.stateful_ingestion_base import ( StatefulIngestionSourceBase, ) -from datahub.metadata.com.linkedin.pegasus2avro.schema import ( +from datahub.metadata.schema_classes import ( ArrayTypeClass, BooleanTypeClass, BytesTypeClass, + DataPlatformInstanceClass, + DatasetPropertiesClass, NullTypeClass, NumberTypeClass, RecordTypeClass, - SchemaField, - SchemaFieldDataType, + SchemaFieldClass as SchemaField, + SchemaFieldDataTypeClass as SchemaFieldDataType, SchemalessClass, - SchemaMetadata, + SchemaMetadataClass, StringTypeClass, UnionTypeClass, ) -from datahub.metadata.schema_classes import ( - DataPlatformInstanceClass, - DatasetPropertiesClass, -) from datahub.utilities.registries.domain_registry import DomainRegistry MAX_ITEMS_TO_RETRIEVE = 100 @@ -447,7 +445,7 @@ def construct_schema_metadata( dataset_properties: DatasetPropertiesClass, schema: Dict[Tuple[str, ...], SchemaDescription], primary_key_dict: Dict[str, str], - ) -> SchemaMetadata: + ) -> SchemaMetadataClass: """ " To construct the schema metadata, it will first sort the schema by the occurrence of attribute names in descending order and truncate the schema by MAX_SCHEMA_SIZE, and then start to construct the @@ -501,7 +499,7 @@ def construct_schema_metadata( canonical_schema.append(field) # create schema metadata object for table - schema_metadata = SchemaMetadata( + schema_metadata = SchemaMetadataClass( schemaName=table_name, platform=f"urn:li:dataPlatform:{self.platform}", version=0, diff --git a/metadata-ingestion/src/datahub/ingestion/source/fivetran/config.py b/metadata-ingestion/src/datahub/ingestion/source/fivetran/config.py index e40e284d6e0a42..86826ae7bedc09 100644 --- 
a/metadata-ingestion/src/datahub/ingestion/source/fivetran/config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/fivetran/config.py @@ -12,8 +12,9 @@ ConfigModel, ConfigurationWarning, ) -from datahub.configuration.source_common import DEFAULT_ENV, DatasetSourceConfigMixin +from datahub.configuration.source_common import DatasetSourceConfigMixin from datahub.configuration.validate_field_rename import pydantic_renamed_field +from datahub.emitter.mce_builder import DEFAULT_ENV from datahub.ingestion.api.report import Report from datahub.ingestion.source.bigquery_v2.bigquery_config import ( BigQueryConnectionConfig, diff --git a/metadata-ingestion/src/datahub/ingestion/source/gc/dataprocess_cleanup.py b/metadata-ingestion/src/datahub/ingestion/source/gc/dataprocess_cleanup.py index 80f7b7a9f4480c..130f2c9c2e12fc 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/gc/dataprocess_cleanup.py +++ b/metadata-ingestion/src/datahub/ingestion/source/gc/dataprocess_cleanup.py @@ -401,7 +401,10 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: total_runs=job.get("entity").get("runs").get("total"), ) if datajob_entity.total_runs > 0: - self.delete_dpi_from_datajobs(datajob_entity) + try: + self.delete_dpi_from_datajobs(datajob_entity) + except Exception as e: + logger.error(f"While trying to delete {datajob_entity} got {e}") if ( datajob_entity.total_runs == 0 and self.config.delete_empty_data_jobs diff --git a/metadata-ingestion/src/datahub/ingestion/source/gc/execution_request_cleanup.py b/metadata-ingestion/src/datahub/ingestion/source/gc/execution_request_cleanup.py index 570df4e99ab13d..3baf858e44cdc8 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/gc/execution_request_cleanup.py +++ b/metadata-ingestion/src/datahub/ingestion/source/gc/execution_request_cleanup.py @@ -69,7 +69,6 @@ def __init__( report: DatahubExecutionRequestCleanupReport, config: Optional[DatahubExecutionRequestCleanupConfig] = None, ) -> None: - self.graph = graph self.report = report self.instance_id = int(time.time()) diff --git a/metadata-ingestion/src/datahub/ingestion/source/ge_profiling_config.py b/metadata-ingestion/src/datahub/ingestion/source/ge_profiling_config.py index 8b2443a589b8dc..c20506e36a844f 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/ge_profiling_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/ge_profiling_config.py @@ -95,7 +95,6 @@ class GEProfilingBaseConfig(ConfigModel): class GEProfilingConfig(GEProfilingBaseConfig): - report_dropped_profiles: bool = Field( default=False, description="Whether to report datasets or dataset columns which were not profiled. 
Set to `True` for debugging purposes.", diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py index 1cd3c88a527cbd..57a251ef2ed14f 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py @@ -48,7 +48,7 @@ from datahub.ingestion.source.looker.looker_file_loader import LookerViewFileLoader from datahub.ingestion.source.looker.looker_lib_wrapper import LookerAPI from datahub.ingestion.source.looker.lookml_config import ( - _BASE_PROJECT_NAME, + BASE_PROJECT_NAME, LookMLSourceReport, ) from datahub.ingestion.source.looker.str_functions import remove_suffix @@ -369,7 +369,7 @@ def _form_field_name( assert view_name # for lint false positive project_include: ProjectInclude = ProjectInclude( - project=view_project_map.get(view_name, _BASE_PROJECT_NAME), + project=view_project_map.get(view_name, BASE_PROJECT_NAME), include=view_name, ) @@ -384,7 +384,7 @@ def _form_field_name( view_urn = LookerViewId( project_name=( project_include.project - if project_include.project != _BASE_PROJECT_NAME + if project_include.project != BASE_PROJECT_NAME else explore_project_name ), model_name=model_name, @@ -928,7 +928,6 @@ def from_api( # noqa: C901 reporter: SourceReport, source_config: LookerDashboardSourceConfig, ) -> Optional["LookerExplore"]: # noqa: C901 - try: explore = client.lookml_model_explore(model, explore_name) views: Set[str] = set() @@ -1109,7 +1108,7 @@ def from_api( # noqa: C901 fields=view_fields, upstream_views=list( ProjectInclude( - project=view_project_map.get(view_name, _BASE_PROJECT_NAME), + project=view_project_map.get(view_name, BASE_PROJECT_NAME), include=view_name, ) for view_name in views @@ -1235,7 +1234,7 @@ def _to_metadata_events( # noqa: C901 view_urn = LookerViewId( project_name=( view_ref.project - if view_ref.project != _BASE_PROJECT_NAME + if view_ref.project != BASE_PROJECT_NAME else self.project_name ), model_name=self.model_name, diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_dataclasses.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_dataclasses.py index 7e23079156b625..327c9ebf99bd20 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_dataclasses.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_dataclasses.py @@ -9,8 +9,8 @@ load_and_preprocess_file, ) from datahub.ingestion.source.looker.lookml_config import ( - _BASE_PROJECT_NAME, - _EXPLORE_FILE_EXTENSION, + BASE_PROJECT_NAME, + EXPLORE_FILE_EXTENSION, LookMLSourceConfig, LookMLSourceReport, ) @@ -69,7 +69,7 @@ def from_looker_dict( explore_files = [ x.include for x in resolved_includes - if x.include.endswith(_EXPLORE_FILE_EXTENSION) + if x.include.endswith(EXPLORE_FILE_EXTENSION) ] for included_file in explore_files: try: @@ -152,9 +152,9 @@ def resolve_includes( # As such, we try to handle it but are as defensive as possible. non_base_project_name = project_name - if project_name == _BASE_PROJECT_NAME and root_project_name is not None: + if project_name == BASE_PROJECT_NAME and root_project_name is not None: non_base_project_name = root_project_name - if non_base_project_name != _BASE_PROJECT_NAME and inc.startswith( + if non_base_project_name != BASE_PROJECT_NAME and inc.startswith( f"/{non_base_project_name}/" ): # This might be a local include. 
Let's make sure that '/{project_name}' doesn't diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_file_loader.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_file_loader.py index f894c96debc54a..9fac0b52fde0dd 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_file_loader.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_file_loader.py @@ -9,8 +9,8 @@ load_and_preprocess_file, ) from datahub.ingestion.source.looker.lookml_config import ( - _EXPLORE_FILE_EXTENSION, - _VIEW_FILE_EXTENSION, + EXPLORE_FILE_EXTENSION, + VIEW_FILE_EXTENSION, LookMLSourceConfig, LookMLSourceReport, ) @@ -42,7 +42,7 @@ def _load_viewfile( ) -> Optional[LookerViewFile]: # always fully resolve paths to simplify de-dup path = str(pathlib.Path(path).resolve()) - allowed_extensions = [_VIEW_FILE_EXTENSION, _EXPLORE_FILE_EXTENSION] + allowed_extensions = [VIEW_FILE_EXTENSION, EXPLORE_FILE_EXTENSION] matched_any_extension = [ match for match in [path.endswith(x) for x in allowed_extensions] if match ] diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_usage.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_usage.py index 6a623e1e97b5dc..ef7d64e4f42d43 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_usage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_usage.py @@ -14,7 +14,7 @@ from looker_sdk.sdk.api40.models import Dashboard, LookWithQuery -from datahub.emitter.mce_builder import Aspect, AspectAbstract +from datahub.emitter.mce_builder import Aspect from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.ingestion.source.looker import looker_common from datahub.ingestion.source.looker.looker_common import ( @@ -40,6 +40,7 @@ DashboardUsageStatisticsClass, DashboardUserUsageCountsClass, TimeWindowSizeClass, + _Aspect as AspectAbstract, ) logger = logging.getLogger(__name__) diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_view_id_cache.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_view_id_cache.py index aa45bb72d1f462..562c7863b31343 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_view_id_cache.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_view_id_cache.py @@ -6,7 +6,7 @@ from datahub.ingestion.source.looker.looker_dataclasses import LookerModel from datahub.ingestion.source.looker.looker_file_loader import LookerViewFileLoader from datahub.ingestion.source.looker.lookml_config import ( - _BASE_PROJECT_NAME, + BASE_PROJECT_NAME, NAME, LookMLSourceReport, ) @@ -103,7 +103,7 @@ def get_looker_view_id( current_project_name: str = ( include.project - if include.project != _BASE_PROJECT_NAME + if include.project != BASE_PROJECT_NAME else self.project_name ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_config.py b/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_config.py index da837da1613864..7ffb895349ed29 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_config.py @@ -33,17 +33,11 @@ NAME: str = "name" -_BASE_PROJECT_NAME = "__BASE" +BASE_PROJECT_NAME = "__BASE" -_EXPLORE_FILE_EXTENSION = ".explore.lkml" - -_VIEW_FILE_EXTENSION = ".view.lkml" - -_MODEL_FILE_EXTENSION = ".model.lkml" - -VIEW_LANGUAGE_LOOKML: str = "lookml" - -VIEW_LANGUAGE_SQL: str = "sql" +EXPLORE_FILE_EXTENSION = 
".explore.lkml" +VIEW_FILE_EXTENSION = ".view.lkml" +MODEL_FILE_EXTENSION = ".model.lkml" DERIVED_VIEW_SUFFIX = r".sql_table_name" diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_refinement.py b/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_refinement.py index 892ed79754a1c2..6933d9d69394bc 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_refinement.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_refinement.py @@ -5,7 +5,7 @@ from datahub.ingestion.source.looker.looker_config import LookerConnectionDefinition from datahub.ingestion.source.looker.looker_dataclasses import LookerModel -from datahub.ingestion.source.looker.looker_view_id_cache import LookerViewFileLoader +from datahub.ingestion.source.looker.looker_file_loader import LookerViewFileLoader from datahub.ingestion.source.looker.lookml_config import ( NAME, LookMLSourceConfig, diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py b/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py index 95b04468231f13..c7d3724472d3c8 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py @@ -57,10 +57,8 @@ LookerViewContext, ) from datahub.ingestion.source.looker.lookml_config import ( - _BASE_PROJECT_NAME, - _MODEL_FILE_EXTENSION, - VIEW_LANGUAGE_LOOKML, - VIEW_LANGUAGE_SQL, + BASE_PROJECT_NAME, + MODEL_FILE_EXTENSION, LookerConnectionDefinition, LookMLSourceConfig, LookMLSourceReport, @@ -98,6 +96,9 @@ ) from datahub.sql_parsing.sqlglot_lineage import ColumnRef +VIEW_LANGUAGE_LOOKML: str = "lookml" +VIEW_LANGUAGE_SQL: str = "sql" + logger = logging.getLogger(__name__) @@ -318,7 +319,7 @@ def _load_model(self, path: str) -> LookerModel: looker_model = LookerModel.from_looker_dict( parsed, - _BASE_PROJECT_NAME, + BASE_PROJECT_NAME, self.source_config.project_name, self.base_projects_folder, path, @@ -541,7 +542,7 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: self.source_config.base_folder = checkout_dir.resolve() self.base_projects_folder[ - _BASE_PROJECT_NAME + BASE_PROJECT_NAME ] = self.source_config.base_folder visited_projects: Set[str] = set() @@ -573,7 +574,7 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: self.base_projects_folder[project] = p_ref self._recursively_check_manifests( - tmp_dir, _BASE_PROJECT_NAME, visited_projects + tmp_dir, BASE_PROJECT_NAME, visited_projects ) yield from self.get_internal_workunits() @@ -604,7 +605,7 @@ def _recursively_check_manifests( return # Special case handling if the root project has a name in the manifest file. - if project_name == _BASE_PROJECT_NAME and manifest.project_name: + if project_name == BASE_PROJECT_NAME and manifest.project_name: if ( self.source_config.project_name is not None and manifest.project_name != self.source_config.project_name @@ -693,7 +694,7 @@ def get_internal_workunits(self) -> Iterable[MetadataWorkUnit]: # noqa: C901 # The ** means "this directory and all subdirectories", and hence should # include all the files we want. 
model_files = sorted( - self.source_config.base_folder.glob(f"**/*{_MODEL_FILE_EXTENSION}") + self.source_config.base_folder.glob(f"**/*{MODEL_FILE_EXTENSION}") ) model_suffix_len = len(".model") @@ -829,7 +830,7 @@ def get_internal_workunits(self) -> Iterable[MetadataWorkUnit]: # noqa: C901 current_project_name: str = ( include.project - if include.project != _BASE_PROJECT_NAME + if include.project != BASE_PROJECT_NAME else project_name ) @@ -838,7 +839,7 @@ def get_internal_workunits(self) -> Iterable[MetadataWorkUnit]: # noqa: C901 base_folder_path: str = str( self.base_projects_folder.get( current_project_name, - self.base_projects_folder[_BASE_PROJECT_NAME], + self.base_projects_folder[BASE_PROJECT_NAME], ) ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/view_upstream.py b/metadata-ingestion/src/datahub/ingestion/source/looker/view_upstream.py index 7dd2f9cb203336..8cec6f2607774e 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/view_upstream.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/view_upstream.py @@ -12,6 +12,7 @@ ViewField, ViewFieldType, ) +from datahub.ingestion.source.looker.looker_config import LookerConnectionDefinition from datahub.ingestion.source.looker.looker_view_id_cache import LookerViewIdCache from datahub.ingestion.source.looker.lookml_concept_context import ( LookerFieldContext, @@ -20,7 +21,6 @@ from datahub.ingestion.source.looker.lookml_config import ( DERIVED_VIEW_SUFFIX, NAME, - LookerConnectionDefinition, LookMLSourceConfig, LookMLSourceReport, ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/mongodb.py b/metadata-ingestion/src/datahub/ingestion/source/mongodb.py index c87b025f13b55d..bbc4897d227bac 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/mongodb.py +++ b/metadata-ingestion/src/datahub/ingestion/source/mongodb.py @@ -50,25 +50,23 @@ from datahub.ingestion.source.state.stateful_ingestion_base import ( StatefulIngestionSourceBase, ) -from datahub.metadata.com.linkedin.pegasus2avro.schema import ( +from datahub.metadata.schema_classes import ( ArrayTypeClass, BooleanTypeClass, BytesTypeClass, + DataPlatformInstanceClass, + DatasetPropertiesClass, NullTypeClass, NumberTypeClass, RecordTypeClass, - SchemaField, - SchemaFieldDataType, + SchemaFieldClass as SchemaField, + SchemaFieldDataTypeClass as SchemaFieldDataType, SchemalessClass, - SchemaMetadata, + SchemaMetadataClass as SchemaMetadata, StringTypeClass, TimeTypeClass, UnionTypeClass, ) -from datahub.metadata.schema_classes import ( - DataPlatformInstanceClass, - DatasetPropertiesClass, -) from datahub.metadata.urns import DatasetUrn logger = logging.getLogger(__name__) diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py index 7c8487727c9eee..91fa2e96be2cce 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py @@ -9,7 +9,7 @@ import datahub.emitter.mce_builder as builder from datahub.configuration.common import AllowDenyPattern, ConfigModel -from datahub.configuration.source_common import DEFAULT_ENV, DatasetSourceConfigMixin +from datahub.configuration.source_common import DatasetSourceConfigMixin from datahub.configuration.validate_field_deprecation import pydantic_field_deprecated from datahub.ingestion.source.common.subtypes import BIAssetSubTypes from datahub.ingestion.source.state.stale_entity_removal_handler import ( 
@@ -240,7 +240,7 @@ class PlatformDetail(ConfigModel): "recipe of other datahub sources.", ) env: str = pydantic.Field( - default=DEFAULT_ENV, + default=builder.DEFAULT_ENV, description="The environment that all assets produced by DataHub platform ingestion source belong to", ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/native_sql_parser.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/native_sql_parser.py index 4bf32163538045..8ffd54613eb380 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/native_sql_parser.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/native_sql_parser.py @@ -72,10 +72,24 @@ def get_tables(native_query: str) -> List[str]: def remove_drop_statement(query: str) -> str: # Certain PowerBI M-Queries contain a combination of DROP and SELECT statements within SQL, causing SQLParser to fail on these queries. # Therefore, these occurrences are being removed. - # Regular expression to match patterns like "DROP TABLE IF EXISTS #;" - pattern = r"DROP TABLE IF EXISTS #\w+;?" - return re.sub(pattern, "", query) + patterns = [ + # Regular expression to match patterns like: + # "DROP TABLE IF EXISTS #;" + # "DROP TABLE IF EXISTS #, , ...;" + # "DROP TABLE IF EXISTS #, , ...\n" + r"DROP\s+TABLE\s+IF\s+EXISTS\s+(?:#?\w+(?:,\s*#?\w+)*)[;\n]", + ] + + new_query = query + + for pattern in patterns: + new_query = re.sub(pattern, "", new_query, flags=re.IGNORECASE) + + # Remove extra spaces caused by consecutive replacements + new_query = re.sub(r"\s+", " ", new_query).strip() + + return new_query def parse_custom_sql( diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py index d0914f14f2be48..a40e67d08da5b2 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py @@ -82,6 +82,16 @@ def urn_creator( ) +def get_next_item(items: List[str], item: str) -> Optional[str]: + if item in items: + try: + index = items.index(item) + return items[index + 1] + except IndexError: + logger.debug(f'item:"{item}", not found in item-list: {items}') + return None + + class AbstractDataPlatformTableCreator(ABC): """ Base class to share common functionalities among different dataplatform for M-Query parsing. @@ -364,7 +374,6 @@ def get_argument_list(invoke_expression: Tree) -> Optional[Tree]: return argument_list def take_first_argument(self, expression: Tree) -> Optional[Tree]: - # function is not data-access function, lets process function argument first_arg_tree: Optional[Tree] = tree_function.first_arg_list_func(expression) @@ -672,7 +681,7 @@ def two_level_access_pattern( data_access_func_detail.arg_list ) if server is None or db_name is None: - return Lineage.empty() # Return empty list + return Lineage.empty() # Return an empty list schema_name: str = cast( IdentifierAccessor, data_access_func_detail.identifier_accessor @@ -778,32 +787,38 @@ def create_lineage( ), ) - if len(arguments) == 2: - # It is a regular case of MS-SQL - logger.debug("Handling with regular case") - return self.two_level_access_pattern(data_access_func_detail) - - if len(arguments) >= 4 and arguments[2] != "Query": - logger.debug("Unsupported case is found. 
Second index is not the Query") - return Lineage.empty() + server, database = self.get_db_detail_from_argument( + data_access_func_detail.arg_list + ) + if server is None or database is None: + return Lineage.empty() # Return an empty list + + assert server + assert database # to silent the lint + + query: Optional[str] = get_next_item(arguments, "Query") + if query: + if self.config.enable_advance_lineage_sql_construct is False: + # Use previous parser to generate URN to keep backward compatibility + return Lineage( + upstreams=self.create_urn_using_old_parser( + query=query, + db_name=database, + server=server, + ), + column_lineage=[], + ) - if self.config.enable_advance_lineage_sql_construct is False: - # Use previous parser to generate URN to keep backward compatibility - return Lineage( - upstreams=self.create_urn_using_old_parser( - query=arguments[3], - db_name=arguments[1], - server=arguments[0], - ), - column_lineage=[], + return self.parse_custom_sql( + query=query, + database=database, + server=server, + schema=MSSqlDataPlatformTableCreator.DEFAULT_SCHEMA, ) - return self.parse_custom_sql( - query=arguments[3], - database=arguments[1], - server=arguments[0], - schema=MSSqlDataPlatformTableCreator.DEFAULT_SCHEMA, - ) + # It is a regular case of MS-SQL + logger.debug("Handling with regular case") + return self.two_level_access_pattern(data_access_func_detail) class OracleDataPlatformTableCreator(AbstractDataPlatformTableCreator): @@ -1149,27 +1164,19 @@ def get_db_name(self, data_access_tokens: List[str]) -> Optional[str]: != SupportedDataPlatform.DatabricksMultiCloud_SQL.value.powerbi_data_platform_name ): return None - try: - if "Database" in data_access_tokens: - index = data_access_tokens.index("Database") - if data_access_tokens[index + 1] != Constant.M_QUERY_NULL: - # Database name is explicitly set in argument - return data_access_tokens[index + 1] - - if "Name" in data_access_tokens: - index = data_access_tokens.index("Name") - # Next element is value of the Name. It is a database name - return data_access_tokens[index + 1] - - if "Catalog" in data_access_tokens: - index = data_access_tokens.index("Catalog") - # Next element is value of the Catalog. In Databricks Catalog can also be used in place of a database. 
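# A hedged, self-contained sketch of the behaviour of the get_next_item helper added
# earlier in this file: it yields the token immediately following `item`, or None when
# `item` is missing or is the last element. The token list is invented for illustration.
from typing import List, Optional

def get_next_item_sketch(items: List[str], item: str) -> Optional[str]:
    # mirrors the helper above: return the element after `item`, else None
    try:
        return items[items.index(item) + 1]
    except (ValueError, IndexError):
        return None

tokens = ["Name", "analytics_db", "Catalog", "hive_metastore"]  # made-up token list
assert get_next_item_sketch(tokens, "Name") == "analytics_db"   # value follows its key
assert get_next_item_sketch(tokens, "Database") is None         # absent key -> None
assert get_next_item_sketch(tokens, "hive_metastore") is None   # last element -> None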
- return data_access_tokens[index + 1] - except IndexError as e: - logger.debug("Database name is not available", exc_info=e) + database: Optional[str] = get_next_item(data_access_tokens, "Database") - return None + if ( + database and database != Constant.M_QUERY_NULL + ): # database name is explicitly set + return database + + return get_next_item( # database name is set in Name argument + data_access_tokens, "Name" + ) or get_next_item( # If both above arguments are not available, then try Catalog + data_access_tokens, "Catalog" + ) def create_lineage( self, data_access_func_detail: DataAccessFunctionDetail diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py index a59d58519d6bfe..e1301edef10b84 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py @@ -439,7 +439,6 @@ def get_app( self, app_id: str, ) -> Optional[App]: - raw_app: Optional[Dict] = self._get_app( app_id=app_id, ) @@ -1062,7 +1061,6 @@ def _get_app( self, app_id: str, ) -> Optional[Dict]: - app_endpoint = self.API_ENDPOINTS[Constant.GET_WORKSPACE_APP].format( POWERBI_ADMIN_BASE_URL=DataResolverBase.ADMIN_BASE_URL, APP_ID=app_id, diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py index 04088cb07c8416..5ae333430a78bc 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py @@ -40,7 +40,6 @@ def form_full_table_name( dataset_name: str, table_name: str, ) -> str: - full_table_name: str = "{}.{}".format( dataset_name.replace(" ", "_"), table_name.replace(" ", "_") ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/redash.py b/metadata-ingestion/src/datahub/ingestion/source/redash.py index 5fd63e7f93f92a..581e32d29dceaf 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/redash.py +++ b/metadata-ingestion/src/datahub/ingestion/source/redash.py @@ -41,7 +41,7 @@ ) from datahub.utilities.lossy_collections import LossyDict, LossyList from datahub.utilities.perf_timer import PerfTimer -from datahub.utilities.sql_parser import SQLParser +from datahub.utilities.sql_parser_base import SQLParser from datahub.utilities.threaded_iterator_executor import ThreadedIteratorExecutor logger = logging.getLogger(__name__) diff --git a/metadata-ingestion/src/datahub/ingestion/source/redshift/query.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/query.py index b18b526ef30fce..71a20890d35e88 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/redshift/query.py +++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/query.py @@ -9,6 +9,8 @@ # We use 290 instead instead of the standard 320, because escape characters can add to the length. 
_QUERY_SEQUENCE_LIMIT = 290 +_MAX_COPY_ENTRIES_PER_TABLE = 20 + class RedshiftCommonQuery: CREATE_TEMP_TABLE_CLAUSE = "create temp table" @@ -293,28 +295,37 @@ def alter_table_rename_query( def list_copy_commands_sql( db_name: str, start_time: datetime, end_time: datetime ) -> str: - return """ - select - distinct - "schema" as target_schema, - "table" as target_table, - c.file_name as filename - from - SYS_QUERY_DETAIL as si - join SYS_LOAD_DETAIL as c on - si.query_id = c.query_id - join SVV_TABLE_INFO sti on - sti.table_id = si.table_id - where - database = '{db_name}' - and si.start_time >= '{start_time}' - and si.start_time < '{end_time}' - order by target_schema, target_table, si.start_time asc - """.format( + return """\ +SELECT DISTINCT + target_schema, + target_table, + filename +FROM ( + SELECT + sti."schema" AS target_schema, + sti."table" AS target_table, + c.file_name AS filename, + ROW_NUMBER() OVER ( + PARTITION BY sti."schema", sti."table" + ORDER BY si.start_time DESC + ) AS rn + FROM + SYS_QUERY_DETAIL AS si + JOIN SYS_LOAD_DETAIL AS c ON si.query_id = c.query_id + JOIN SVV_TABLE_INFO sti ON sti.table_id = si.table_id + WHERE + sti.database = '{db_name}' + AND si.start_time >= '{start_time}' + AND si.start_time < '{end_time}' +) subquery +WHERE rn <= {_MAX_COPY_ENTRIES_PER_TABLE} +ORDER BY target_schema, target_table, filename +""".format( # We need the original database name for filtering db_name=db_name, start_time=start_time.strftime(redshift_datetime_format), end_time=end_time.strftime(redshift_datetime_format), + _MAX_COPY_ENTRIES_PER_TABLE=_MAX_COPY_ENTRIES_PER_TABLE, ) @staticmethod diff --git a/metadata-ingestion/src/datahub/ingestion/source/sac/sac.py b/metadata-ingestion/src/datahub/ingestion/source/sac/sac.py index cc5756397aaac3..b75f15c0ce770e 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sac/sac.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sac/sac.py @@ -13,12 +13,9 @@ from urllib3.util.retry import Retry from datahub.configuration.common import AllowDenyPattern -from datahub.configuration.source_common import ( - DEFAULT_ENV, - DatasetSourceConfigMixin, - EnvConfigMixin, -) +from datahub.configuration.source_common import DatasetSourceConfigMixin, EnvConfigMixin from datahub.emitter.mce_builder import ( + DEFAULT_ENV, dataset_urn_to_key, make_dashboard_urn, make_data_platform_urn, diff --git a/metadata-ingestion/src/datahub/ingestion/source/schema_inference/csv_tsv.py b/metadata-ingestion/src/datahub/ingestion/source/schema_inference/csv_tsv.py index 54f7dfb5b903c7..ab7b887cba1d80 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/schema_inference/csv_tsv.py +++ b/metadata-ingestion/src/datahub/ingestion/source/schema_inference/csv_tsv.py @@ -3,15 +3,15 @@ from tableschema import Table from datahub.ingestion.source.schema_inference.base import SchemaInferenceBase -from datahub.metadata.com.linkedin.pegasus2avro.schema import ( +from datahub.metadata.schema_classes import ( ArrayTypeClass, BooleanTypeClass, DateTypeClass, NullTypeClass, NumberTypeClass, RecordTypeClass, - SchemaField, - SchemaFieldDataType, + SchemaFieldClass as SchemaField, + SchemaFieldDataTypeClass as SchemaFieldDataType, StringTypeClass, TimeTypeClass, UnionTypeClass, diff --git a/metadata-ingestion/src/datahub/ingestion/source/schema_inference/json.py b/metadata-ingestion/src/datahub/ingestion/source/schema_inference/json.py index 1f2c73a2522d04..1659aaf6fa2020 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/schema_inference/json.py +++ 
b/metadata-ingestion/src/datahub/ingestion/source/schema_inference/json.py @@ -7,14 +7,14 @@ from datahub.ingestion.source.schema_inference.base import SchemaInferenceBase from datahub.ingestion.source.schema_inference.object import construct_schema -from datahub.metadata.com.linkedin.pegasus2avro.schema import ( +from datahub.metadata.schema_classes import ( ArrayTypeClass, BooleanTypeClass, NullTypeClass, NumberTypeClass, RecordTypeClass, - SchemaField, - SchemaFieldDataType, + SchemaFieldClass as SchemaField, + SchemaFieldDataTypeClass as SchemaFieldDataType, StringTypeClass, UnionTypeClass, ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/schema_inference/parquet.py b/metadata-ingestion/src/datahub/ingestion/source/schema_inference/parquet.py index 1f3f2e0a1e8a83..efc605e0df8cab 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/schema_inference/parquet.py +++ b/metadata-ingestion/src/datahub/ingestion/source/schema_inference/parquet.py @@ -4,7 +4,7 @@ import pyarrow.parquet from datahub.ingestion.source.schema_inference.base import SchemaInferenceBase -from datahub.metadata.com.linkedin.pegasus2avro.schema import ( +from datahub.metadata.schema_classes import ( ArrayTypeClass, BooleanTypeClass, BytesTypeClass, @@ -12,8 +12,8 @@ NullTypeClass, NumberTypeClass, RecordTypeClass, - SchemaField, - SchemaFieldDataType, + SchemaFieldClass as SchemaField, + SchemaFieldDataTypeClass as SchemaFieldDataType, StringTypeClass, TimeTypeClass, UnionTypeClass, diff --git a/metadata-ingestion/src/datahub/configuration/oauth.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/oauth_config.py similarity index 100% rename from metadata-ingestion/src/datahub/configuration/oauth.py rename to metadata-ingestion/src/datahub/ingestion/source/snowflake/oauth_config.py diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/oauth_generator.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/oauth_generator.py similarity index 97% rename from metadata-ingestion/src/datahub/ingestion/source/sql/oauth_generator.py rename to metadata-ingestion/src/datahub/ingestion/source/snowflake/oauth_generator.py index 7231c6ef6b1df5..a2dc0118b39782 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/oauth_generator.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/oauth_generator.py @@ -8,7 +8,7 @@ from OpenSSL.crypto import FILETYPE_PEM, load_certificate from pydantic.types import SecretStr -from datahub.configuration.oauth import OAuthIdentityProvider +from datahub.ingestion.source.snowflake.oauth_config import OAuthIdentityProvider logger = logging.getLogger(__name__) diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_connection.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_connection.py index a9f454cfd3cdb3..397606400d389c 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_connection.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_connection.py @@ -16,14 +16,17 @@ from datahub.configuration.common import ConfigModel, ConfigurationError, MetaError from datahub.configuration.connection_resolver import auto_connection_resolver -from datahub.configuration.oauth import OAuthConfiguration, OAuthIdentityProvider from datahub.configuration.validate_field_rename import pydantic_renamed_field from datahub.ingestion.api.closeable import Closeable from datahub.ingestion.source.snowflake.constants import ( CLIENT_PREFETCH_THREADS, 
CLIENT_SESSION_KEEP_ALIVE, ) -from datahub.ingestion.source.sql.oauth_generator import OAuthTokenGenerator +from datahub.ingestion.source.snowflake.oauth_config import ( + OAuthConfiguration, + OAuthIdentityProvider, +) +from datahub.ingestion.source.snowflake.oauth_generator import OAuthTokenGenerator from datahub.ingestion.source.sql.sql_config import make_sqlalchemy_uri from datahub.utilities.config_clean import ( remove_protocol, diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py index 4a03717754ec26..e065e2f34bc66d 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py @@ -31,14 +31,16 @@ ) from datahub.metadata.schema_classes import DatasetLineageTypeClass, UpstreamClass from datahub.sql_parsing.sql_parsing_aggregator import ( - ColumnLineageInfo, - ColumnRef, KnownLineageMapping, KnownQueryLineageInfo, SqlParsingAggregator, UrnStr, ) -from datahub.sql_parsing.sqlglot_lineage import DownstreamColumnRef +from datahub.sql_parsing.sqlglot_lineage import ( + ColumnLineageInfo, + ColumnRef, + DownstreamColumnRef, +) from datahub.utilities.perf_timer import PerfTimer from datahub.utilities.time import ts_millis_to_datetime diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py index dd7f73268fdc4f..538841018067e2 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py @@ -48,11 +48,9 @@ SnowflakeQueriesExtractor, SnowflakeQueriesExtractorConfig, ) +from datahub.ingestion.source.snowflake.snowflake_query import SnowflakeQuery from datahub.ingestion.source.snowflake.snowflake_report import SnowflakeV2Report -from datahub.ingestion.source.snowflake.snowflake_schema import ( - SnowflakeDataDictionary, - SnowflakeQuery, -) +from datahub.ingestion.source.snowflake.snowflake_schema import SnowflakeDataDictionary from datahub.ingestion.source.snowflake.snowflake_schema_gen import ( SnowflakeSchemaGenerator, ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py index 238fd88f1c9509..e5779791ed4120 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py @@ -32,6 +32,7 @@ make_data_platform_urn, make_dataplatform_instance_urn, make_dataset_urn_with_platform_instance, + make_schema_field_urn, make_tag_urn, ) from datahub.emitter.mcp import MetadataChangeProposalWrapper @@ -669,7 +670,7 @@ def get_foreign_key_metadata( ) source_fields = [ - f"urn:li:schemaField:({dataset_urn},{f})" + make_schema_field_urn(dataset_urn, f) for f in fk_dict["constrained_columns"] ] foreign_dataset = make_dataset_urn_with_platform_instance( @@ -679,7 +680,7 @@ def get_foreign_key_metadata( env=self.config.env, ) foreign_fields = [ - f"urn:li:schemaField:({foreign_dataset},{f})" + make_schema_field_urn(foreign_dataset, f) for f in fk_dict["referred_columns"] ] diff --git a/metadata-ingestion/src/datahub/specific/dataset.py b/metadata-ingestion/src/datahub/specific/dataset.py index 9dd2616078f08d..b171dc4cc2939f 100644 --- 
a/metadata-ingestion/src/datahub/specific/dataset.py +++ b/metadata-ingestion/src/datahub/specific/dataset.py @@ -13,7 +13,7 @@ KafkaAuditHeaderClass, OwnerClass as Owner, OwnershipTypeClass, - SchemaMetadataClass as SchemaMetadata, + SchemaMetadataClass, SystemMetadataClass, TagAssociationClass as Tag, UpstreamClass as Upstream, @@ -40,7 +40,7 @@ def __init__( self.aspect_name = ( EditableSchemaMetadata.ASPECT_NAME if editable - else SchemaMetadata.ASPECT_NAME + else SchemaMetadataClass.ASPECT_NAME ) self.aspect_field = "editableSchemaFieldInfo" if editable else "schemaFieldInfo" diff --git a/metadata-ingestion/src/datahub/sql_parsing/_models.py b/metadata-ingestion/src/datahub/sql_parsing/_models.py index d92d178b81cf4b..d586e7d6d9045b 100644 --- a/metadata-ingestion/src/datahub/sql_parsing/_models.py +++ b/metadata-ingestion/src/datahub/sql_parsing/_models.py @@ -42,6 +42,8 @@ def __lt__(self, other: "_FrozenModel") -> bool: class _TableName(_FrozenModel): + # TODO: Move this into the schema_resolver.py file. + database: Optional[str] = None db_schema: Optional[str] = None table: str diff --git a/metadata-ingestion/src/datahub/sql_parsing/schema_resolver.py b/metadata-ingestion/src/datahub/sql_parsing/schema_resolver.py index e7b0527d30d978..e3f2fbc786b437 100644 --- a/metadata-ingestion/src/datahub/sql_parsing/schema_resolver.py +++ b/metadata-ingestion/src/datahub/sql_parsing/schema_resolver.py @@ -13,7 +13,7 @@ from datahub.ingestion.source.bigquery_v2.bigquery_audit import BigqueryTableIdentifier from datahub.metadata.schema_classes import SchemaFieldClass, SchemaMetadataClass from datahub.metadata.urns import DataPlatformUrn -from datahub.sql_parsing._models import _TableName +from datahub.sql_parsing._models import _TableName as _TableName # noqa: I250 from datahub.sql_parsing.sql_parsing_common import PLATFORMS_WITH_CASE_SENSITIVE_TABLES from datahub.utilities.file_backed_collections import ConnectionWrapper, FileBackedDict from datahub.utilities.urns.field_paths import get_simple_field_path_from_v2_field_path diff --git a/metadata-ingestion/src/datahub/sql_parsing/sqlglot_lineage.py b/metadata-ingestion/src/datahub/sql_parsing/sqlglot_lineage.py index 265d3f027f0284..4d0f9f7d8cf59d 100644 --- a/metadata-ingestion/src/datahub/sql_parsing/sqlglot_lineage.py +++ b/metadata-ingestion/src/datahub/sql_parsing/sqlglot_lineage.py @@ -903,6 +903,15 @@ def _sqlglot_lineage_inner( logger.debug("Parsing lineage from sql statement: %s", sql) statement = parse_statement(sql, dialect=dialect) + if isinstance(statement, sqlglot.exp.Command): + # For unsupported syntax, sqlglot will usually fallback to parsing as a Command. + # This is effectively a parsing error, and we won't get any lineage from it. 
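# A hedged illustration of the fallback described above. Which statements degrade to
# exp.Command depends on the sqlglot version and dialect, so the GRANT example is an
# assumption rather than a guarantee; the isinstance check itself is the point.
import sqlglot

parsed = sqlglot.parse_one("GRANT SELECT ON sales TO analyst_role")
if isinstance(parsed, sqlglot.exp.Command):
    # sqlglot kept only the raw text; there is no structured AST to derive lineage from.
    print("unsupported syntax; skipping lineage extraction")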
+ # See https://github.com/tobymao/sqlglot/commit/3a13fdf4e597a2f0a3f9fc126a129183fe98262f + # and https://github.com/tobymao/sqlglot/pull/2874 + raise UnsupportedStatementTypeError( + f"Got unsupported syntax for statement: {sql}" + ) + original_statement, statement = statement, statement.copy() # logger.debug( # "Formatted sql statement: %s", diff --git a/metadata-ingestion/src/datahub/testing/check_sql_parser_result.py b/metadata-ingestion/src/datahub/testing/check_sql_parser_result.py index 72b5f6c5e26e4b..13be45ec1be28d 100644 --- a/metadata-ingestion/src/datahub/testing/check_sql_parser_result.py +++ b/metadata-ingestion/src/datahub/testing/check_sql_parser_result.py @@ -6,12 +6,8 @@ import deepdiff from datahub.ingestion.source.bigquery_v2.bigquery_audit import BigqueryTableIdentifier -from datahub.sql_parsing.schema_resolver import SchemaResolver -from datahub.sql_parsing.sqlglot_lineage import ( - SchemaInfo, - SqlParsingResult, - sqlglot_lineage, -) +from datahub.sql_parsing.schema_resolver import SchemaInfo, SchemaResolver +from datahub.sql_parsing.sqlglot_lineage import SqlParsingResult, sqlglot_lineage logger = logging.getLogger(__name__) diff --git a/metadata-ingestion/src/datahub/utilities/is_pytest.py b/metadata-ingestion/src/datahub/utilities/is_pytest.py index 68bb1b285a50e9..572b4bf5356220 100644 --- a/metadata-ingestion/src/datahub/utilities/is_pytest.py +++ b/metadata-ingestion/src/datahub/utilities/is_pytest.py @@ -1,5 +1,6 @@ +import os import sys def is_pytest_running() -> bool: - return "pytest" in sys.modules + return "pytest" in sys.modules and os.environ.get("DATAHUB_TEST_MODE") == "1" diff --git a/metadata-ingestion/src/datahub/utilities/progress_timer.py b/metadata-ingestion/src/datahub/utilities/progress_timer.py new file mode 100644 index 00000000000000..eac62cddb55f2c --- /dev/null +++ b/metadata-ingestion/src/datahub/utilities/progress_timer.py @@ -0,0 +1,34 @@ +from datetime import datetime, timedelta, timezone + + +class ProgressTimer: + def __init__(self, report_every: timedelta, report_0: bool = False): + """A helper for reporting progress at a given time interval. + + Should be used for long-running processes that iterate over a large number of items, + but each iteration is fast. + + Args: + report_every: The time interval between progress reports. + report_0: Whether to report progress on the first iteration. + """ + + self._report_every = report_every + + if report_0: + # Use the earliest possible time to force reporting on the first iteration. 
+ self._last_report_time = datetime.min.replace(tzinfo=timezone.utc) + else: + self._last_report_time = self._now() + + def _now(self) -> datetime: + return datetime.now(timezone.utc) + + def should_report(self) -> bool: + current_time = self._now() + + should_report = (self._last_report_time + self._report_every) <= current_time + if should_report: + self._last_report_time = current_time + + return should_report diff --git a/metadata-ingestion/src/datahub/utilities/urns/corp_group_urn.py b/metadata-ingestion/src/datahub/utilities/urns/corp_group_urn.py index 37c10769259459..577f90215a6353 100644 --- a/metadata-ingestion/src/datahub/utilities/urns/corp_group_urn.py +++ b/metadata-ingestion/src/datahub/utilities/urns/corp_group_urn.py @@ -1 +1,3 @@ -from datahub.metadata.urns import CorpGroupUrn # noqa: F401 +from datahub.metadata.urns import CorpGroupUrn + +__all__ = ["CorpGroupUrn"] diff --git a/metadata-ingestion/src/datahub/utilities/urns/corpuser_urn.py b/metadata-ingestion/src/datahub/utilities/urns/corpuser_urn.py index 5f9ecf65951b95..8acb86be00f6c8 100644 --- a/metadata-ingestion/src/datahub/utilities/urns/corpuser_urn.py +++ b/metadata-ingestion/src/datahub/utilities/urns/corpuser_urn.py @@ -1 +1,3 @@ -from datahub.metadata.urns import CorpUserUrn as CorpuserUrn # noqa: F401 +from datahub.metadata.urns import CorpUserUrn as CorpuserUrn + +__all__ = ["CorpuserUrn"] diff --git a/metadata-ingestion/src/datahub/utilities/urns/data_flow_urn.py b/metadata-ingestion/src/datahub/utilities/urns/data_flow_urn.py index 5b2b45927c339e..3508ae5c4a3490 100644 --- a/metadata-ingestion/src/datahub/utilities/urns/data_flow_urn.py +++ b/metadata-ingestion/src/datahub/utilities/urns/data_flow_urn.py @@ -1 +1,3 @@ -from datahub.metadata.urns import DataFlowUrn # noqa: F401 +from datahub.metadata.urns import DataFlowUrn + +__all__ = ["DataFlowUrn"] diff --git a/metadata-ingestion/src/datahub/utilities/urns/data_job_urn.py b/metadata-ingestion/src/datahub/utilities/urns/data_job_urn.py index 53e3419ee7ecb2..d003b6c6ad7a88 100644 --- a/metadata-ingestion/src/datahub/utilities/urns/data_job_urn.py +++ b/metadata-ingestion/src/datahub/utilities/urns/data_job_urn.py @@ -1 +1,3 @@ -from datahub.metadata.urns import DataJobUrn # noqa: F401 +from datahub.metadata.urns import DataJobUrn + +__all__ = ["DataJobUrn"] diff --git a/metadata-ingestion/src/datahub/utilities/urns/data_platform_urn.py b/metadata-ingestion/src/datahub/utilities/urns/data_platform_urn.py index 9d37e38f256e7f..51e013e715d4fd 100644 --- a/metadata-ingestion/src/datahub/utilities/urns/data_platform_urn.py +++ b/metadata-ingestion/src/datahub/utilities/urns/data_platform_urn.py @@ -1 +1,3 @@ -from datahub.metadata.urns import DataPlatformUrn # noqa: F401 +from datahub.metadata.urns import DataPlatformUrn + +__all__ = ["DataPlatformUrn"] diff --git a/metadata-ingestion/src/datahub/utilities/urns/data_process_instance_urn.py b/metadata-ingestion/src/datahub/utilities/urns/data_process_instance_urn.py index df6ba797d069c1..22e6b36c5f7aec 100644 --- a/metadata-ingestion/src/datahub/utilities/urns/data_process_instance_urn.py +++ b/metadata-ingestion/src/datahub/utilities/urns/data_process_instance_urn.py @@ -1 +1,3 @@ -from datahub.metadata.urns import DataProcessInstanceUrn # noqa: F401 +from datahub.metadata.urns import DataProcessInstanceUrn + +__all__ = ["DataProcessInstanceUrn"] diff --git a/metadata-ingestion/src/datahub/utilities/urns/dataset_urn.py b/metadata-ingestion/src/datahub/utilities/urns/dataset_urn.py index 
6078ffefc03d85..1652e170599958 100644 --- a/metadata-ingestion/src/datahub/utilities/urns/dataset_urn.py +++ b/metadata-ingestion/src/datahub/utilities/urns/dataset_urn.py @@ -1 +1,3 @@ -from datahub.metadata.urns import DatasetUrn # noqa: F401 +from datahub.metadata.urns import DatasetUrn + +__all__ = ["DatasetUrn"] diff --git a/metadata-ingestion/src/datahub/utilities/urns/domain_urn.py b/metadata-ingestion/src/datahub/utilities/urns/domain_urn.py index 442a6b27729bba..242a3d8228320d 100644 --- a/metadata-ingestion/src/datahub/utilities/urns/domain_urn.py +++ b/metadata-ingestion/src/datahub/utilities/urns/domain_urn.py @@ -1 +1,3 @@ -from datahub.metadata.urns import DomainUrn # noqa: F401 +from datahub.metadata.urns import DomainUrn + +__all__ = ["DomainUrn"] diff --git a/metadata-ingestion/src/datahub/utilities/urns/notebook_urn.py b/metadata-ingestion/src/datahub/utilities/urns/notebook_urn.py index 60a4f5396aa468..f9b861d7f08524 100644 --- a/metadata-ingestion/src/datahub/utilities/urns/notebook_urn.py +++ b/metadata-ingestion/src/datahub/utilities/urns/notebook_urn.py @@ -1 +1,3 @@ -from datahub.metadata.urns import NotebookUrn # noqa: F401 +from datahub.metadata.urns import NotebookUrn + +__all__ = ["NotebookUrn"] diff --git a/metadata-ingestion/src/datahub/utilities/urns/structured_properties_urn.py b/metadata-ingestion/src/datahub/utilities/urns/structured_properties_urn.py index 5bd36a0656d99e..6774978c7a76d9 100644 --- a/metadata-ingestion/src/datahub/utilities/urns/structured_properties_urn.py +++ b/metadata-ingestion/src/datahub/utilities/urns/structured_properties_urn.py @@ -1,4 +1,6 @@ -from datahub.metadata.urns import StructuredPropertyUrn # noqa: F401 +from datahub.metadata.urns import StructuredPropertyUrn + +__all__ = ["StructuredPropertyUrn", "make_structured_property_urn"] def make_structured_property_urn(structured_property_id: str) -> str: diff --git a/metadata-ingestion/src/datahub/utilities/urns/tag_urn.py b/metadata-ingestion/src/datahub/utilities/urns/tag_urn.py index 0ac632ee40a015..f66d56a745a961 100644 --- a/metadata-ingestion/src/datahub/utilities/urns/tag_urn.py +++ b/metadata-ingestion/src/datahub/utilities/urns/tag_urn.py @@ -1 +1,3 @@ -from datahub.metadata.urns import TagUrn # noqa: F401 +from datahub.metadata.urns import TagUrn + +__all__ = ["TagUrn"] diff --git a/metadata-ingestion/src/datahub/utilities/urns/urn.py b/metadata-ingestion/src/datahub/utilities/urns/urn.py index 2e5cebfd0e8f55..2ded2d4d9b32c0 100644 --- a/metadata-ingestion/src/datahub/utilities/urns/urn.py +++ b/metadata-ingestion/src/datahub/utilities/urns/urn.py @@ -1,4 +1,6 @@ -from datahub.metadata.urns import Urn # noqa: F401 +from datahub.metadata.urns import Urn + +__all__ = ["Urn", "guess_entity_type"] def guess_entity_type(urn: str) -> str: diff --git a/metadata-ingestion/tests/conftest.py b/metadata-ingestion/tests/conftest.py index db025e7f806c06..4685faabfcb285 100644 --- a/metadata-ingestion/tests/conftest.py +++ b/metadata-ingestion/tests/conftest.py @@ -7,6 +7,7 @@ import pytest os.environ["DATAHUB_SUPPRESS_LOGGING_MANAGER"] = "1" +os.environ["DATAHUB_TEST_MODE"] = "1" # Enable debug logging. 
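# A small, self-contained sketch of the gating that DATAHUB_TEST_MODE (set just above)
# now drives: the updated is_pytest_running() only returns True when pytest is actually
# imported AND this environment variable is present. The function below mirrors the
# updated helper for illustration only.
import os
import sys

def is_pytest_running_sketch() -> bool:
    return "pytest" in sys.modules and os.environ.get("DATAHUB_TEST_MODE") == "1"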
logging.getLogger().setLevel(logging.DEBUG) diff --git a/metadata-ingestion/tests/integration/dremio/test_dremio.py b/metadata-ingestion/tests/integration/dremio/test_dremio.py index eb999367962817..cc3a7e19bc93e2 100644 --- a/metadata-ingestion/tests/integration/dremio/test_dremio.py +++ b/metadata-ingestion/tests/integration/dremio/test_dremio.py @@ -192,7 +192,6 @@ def create_mysql_source(headers): def upload_dataset(headers): - url = f"{DREMIO_HOST}/apiv2/source/s3/file_format/warehouse/sample.parquet" payload = {"ignoreOtherFileFormats": False, "type": "Parquet"} diff --git a/metadata-ingestion/tests/integration/lookml/test_lookml.py b/metadata-ingestion/tests/integration/lookml/test_lookml.py index 3185d3cac53a56..ab55321a4d7342 100644 --- a/metadata-ingestion/tests/integration/lookml/test_lookml.py +++ b/metadata-ingestion/tests/integration/lookml/test_lookml.py @@ -12,16 +12,14 @@ from datahub.ingestion.run.pipeline import Pipeline from datahub.ingestion.source.file import read_metadata_file +from datahub.ingestion.source.looker.looker_dataclasses import LookerModel from datahub.ingestion.source.looker.looker_template_language import ( SpecialVariable, load_and_preprocess_file, resolve_liquid_variable, ) -from datahub.ingestion.source.looker.lookml_source import ( - LookerModel, - LookerRefinementResolver, - LookMLSourceConfig, -) +from datahub.ingestion.source.looker.lookml_config import LookMLSourceConfig +from datahub.ingestion.source.looker.lookml_refinement import LookerRefinementResolver from datahub.metadata.schema_classes import ( DatasetSnapshotClass, MetadataChangeEventClass, diff --git a/metadata-ingestion/tests/integration/powerbi/test_m_parser.py b/metadata-ingestion/tests/integration/powerbi/test_m_parser.py index f4613c524316e3..f22998b47b9008 100644 --- a/metadata-ingestion/tests/integration/powerbi/test_m_parser.py +++ b/metadata-ingestion/tests/integration/powerbi/test_m_parser.py @@ -712,7 +712,6 @@ def test_redshift_regular_case(): def test_redshift_native_query(): - table: powerbi_data_classes.Table = powerbi_data_classes.Table( expression=M_QUERIES[22], name="category", @@ -1101,7 +1100,6 @@ def test_double_quotes_in_alias(): @patch("datahub.ingestion.source.powerbi.m_query.parser.get_lark_parser") def test_m_query_timeout(mock_get_lark_parser): - q = 'let\n Source = Value.NativeQuery(Snowflake.Databases("0DD93C6BD5A6.snowflakecomputing.com","sales_analytics_warehouse_prod",[Role="sales_analytics_member_ad"]){[Name="SL_OPERATIONS"]}[Data], "select SALE_NO AS ""\x1b[4mSaleNo\x1b[0m""#(lf) ,CODE AS ""Code""#(lf) ,ENDDATE AS ""end_date""#(lf) from SL_OPERATIONS.SALE.REPORTS#(lf) where ENDDATE > \'2024-02-03\'", null, [EnableFolding=true]),\n #"selected Row" = Table.SelectRows(Source)\nin\n #"selected Row"' table: powerbi_data_classes.Table = powerbi_data_classes.Table( diff --git a/metadata-ingestion/tests/integration/powerbi/test_native_sql_parser.py b/metadata-ingestion/tests/integration/powerbi/test_native_sql_parser.py index 53e184515c1d8c..887f7fe4d6f44a 100644 --- a/metadata-ingestion/tests/integration/powerbi/test_native_sql_parser.py +++ b/metadata-ingestion/tests/integration/powerbi/test_native_sql_parser.py @@ -19,3 +19,13 @@ def test_simple_from(): assert len(tables) == 1 assert tables[0] == "OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_APS_SME_UNITS_V4" + + +def test_drop_statement(): + expected: str = "SELECT#(lf)concat((UPPER(REPLACE(SELLER,'-',''))), MONTHID) as AGENT_KEY,#(lf)concat((UPPER(REPLACE(CLIENT_DIRECTOR,'-',''))), MONTHID) as 
CD_AGENT_KEY,#(lf) *#(lf)FROM#(lf)OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_APS_SME_UNITS_V4" + + query: str = "DROP TABLE IF EXISTS #table1; DROP TABLE IF EXISTS #table1,#table2; DROP TABLE IF EXISTS table1; DROP TABLE IF EXISTS table1, #table2;SELECT#(lf)concat((UPPER(REPLACE(SELLER,'-',''))), MONTHID) as AGENT_KEY,#(lf)concat((UPPER(REPLACE(CLIENT_DIRECTOR,'-',''))), MONTHID) as CD_AGENT_KEY,#(lf) *#(lf)FROM#(lf)OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_APS_SME_UNITS_V4" + + actual: str = native_sql_parser.remove_drop_statement(query) + + assert actual == expected diff --git a/metadata-ingestion/tests/integration/powerbi/test_powerbi.py b/metadata-ingestion/tests/integration/powerbi/test_powerbi.py index 4d69ebeaf588ea..edde11ff87d293 100644 --- a/metadata-ingestion/tests/integration/powerbi/test_powerbi.py +++ b/metadata-ingestion/tests/integration/powerbi/test_powerbi.py @@ -96,7 +96,6 @@ def read_mock_data(path: Union[Path, str]) -> dict: def register_mock_api( pytestconfig: pytest.Config, request_mock: Any, override_data: Optional[dict] = None ) -> None: - default_mock_data_path = ( pytestconfig.rootpath / "tests/integration/powerbi/mock_data/default_mock_response.json" @@ -1553,7 +1552,6 @@ def test_powerbi_app_ingest( mock_time: datetime.datetime, requests_mock: Any, ) -> None: - common_app_ingest( pytestconfig=pytestconfig, requests_mock=requests_mock, @@ -1584,7 +1582,6 @@ def test_powerbi_app_ingest_info_message( mock_time: datetime.datetime, requests_mock: Any, ) -> None: - pipeline = common_app_ingest( pytestconfig=pytestconfig, requests_mock=requests_mock, diff --git a/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py b/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py index c3ea81905455a8..6c45b8a47de412 100644 --- a/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py +++ b/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py @@ -19,8 +19,7 @@ ) from tableauserverclient.models.reference_item import ResourceReference -from datahub.configuration.source_common import DEFAULT_ENV -from datahub.emitter.mce_builder import make_schema_field_urn +from datahub.emitter.mce_builder import DEFAULT_ENV, make_schema_field_urn from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.ingestion.run.pipeline import Pipeline, PipelineContext from datahub.ingestion.source.tableau.tableau import ( diff --git a/metadata-ingestion/tests/test_helpers/docker_helpers.py b/metadata-ingestion/tests/test_helpers/docker_helpers.py index 20aec975787e4e..d0e943bbe63daf 100644 --- a/metadata-ingestion/tests/test_helpers/docker_helpers.py +++ b/metadata-ingestion/tests/test_helpers/docker_helpers.py @@ -4,10 +4,10 @@ import pytest -from datahub.testing.docker_utils import ( # noqa: F401 - docker_compose_runner, - is_responsive, - wait_for_port, +from datahub.testing.docker_utils import ( # noqa: F401,I250 + docker_compose_runner as docker_compose_runner, + is_responsive as is_responsive, + wait_for_port as wait_for_port, ) logger = logging.getLogger(__name__) diff --git a/metadata-ingestion/tests/test_helpers/mce_helpers.py b/metadata-ingestion/tests/test_helpers/mce_helpers.py index 3b59481d8cb022..f4c629df7dba4e 100644 --- a/metadata-ingestion/tests/test_helpers/mce_helpers.py +++ b/metadata-ingestion/tests/test_helpers/mce_helpers.py @@ -17,15 +17,16 @@ Union, ) +import pytest + from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.ingestion.sink.file import write_metadata_file from 
datahub.metadata.schema_classes import MetadataChangeEventClass +from datahub.metadata.urns import Urn from datahub.testing.compare_metadata_json import ( assert_metadata_files_equal, load_json_file, ) -from datahub.utilities.urns.urn import Urn -from tests.test_helpers.type_helpers import PytestConfig logger = logging.getLogger(__name__) @@ -77,7 +78,7 @@ def clean_nones(value): def check_golden_file( - pytestconfig: PytestConfig, + pytestconfig: pytest.Config, output_path: Union[str, os.PathLike], golden_path: Union[str, os.PathLike], ignore_paths: Sequence[str] = (), @@ -98,7 +99,7 @@ def check_golden_file( def check_goldens_stream( - pytestconfig: PytestConfig, + pytestconfig: pytest.Config, outputs: List, golden_path: Union[str, os.PathLike], ignore_paths: Sequence[str] = (), diff --git a/metadata-ingestion/tests/test_helpers/type_helpers.py b/metadata-ingestion/tests/test_helpers/type_helpers.py index 154960bbf7fc42..3a2215ed81ca99 100644 --- a/metadata-ingestion/tests/test_helpers/type_helpers.py +++ b/metadata-ingestion/tests/test_helpers/type_helpers.py @@ -1,12 +1,5 @@ from typing import Optional, TypeVar -# The current PytestConfig solution is somewhat ugly and not ideal. -# However, it is currently the best solution available, as the type itself is not -# exported: https://docs.pytest.org/en/stable/reference.html#config. -# As pytest's type support improves, this will likely change. -# TODO: revisit pytestconfig as https://github.com/pytest-dev/pytest/issues/7469 progresses. -from _pytest.config import Config as PytestConfig # noqa: F401 - _T = TypeVar("_T") diff --git a/metadata-ingestion/tests/unit/api/entities/platformresource/test_platform_resource.py b/metadata-ingestion/tests/unit/api/entities/platformresource/test_platform_resource.py index a84e373dbe72c2..6a03f511fa51c5 100644 --- a/metadata-ingestion/tests/unit/api/entities/platformresource/test_platform_resource.py +++ b/metadata-ingestion/tests/unit/api/entities/platformresource/test_platform_resource.py @@ -185,7 +185,6 @@ class TestModel(BaseModel): def test_platform_resource_filters(): - query = ( ElasticPlatformResourceQuery.create_from() .group(LogicalOperator.AND) diff --git a/metadata-ingestion/tests/unit/bigquery/test_bigquery_lineage.py b/metadata-ingestion/tests/unit/bigquery/test_bigquery_lineage.py index 415977b0f8467b..a1981ccf767916 100644 --- a/metadata-ingestion/tests/unit/bigquery/test_bigquery_lineage.py +++ b/metadata-ingestion/tests/unit/bigquery/test_bigquery_lineage.py @@ -144,7 +144,6 @@ def test_column_level_lineage(lineage_entries: List[QueryEvent]) -> None: def test_lineage_for_external_bq_table(mock_datahub_graph_instance): - pipeline_context = PipelineContext(run_id="bq_gcs_lineage") pipeline_context.graph = mock_datahub_graph_instance @@ -239,7 +238,6 @@ def fake_schema_metadata(entity_urn: str) -> models.SchemaMetadataClass: def test_lineage_for_external_bq_table_no_column_lineage(mock_datahub_graph_instance): - pipeline_context = PipelineContext(run_id="bq_gcs_lineage") pipeline_context.graph = mock_datahub_graph_instance diff --git a/metadata-ingestion/tests/unit/glue/test_glue_source.py b/metadata-ingestion/tests/unit/glue/test_glue_source.py index 4df0c6d17b06cc..693fd6bc336fd3 100644 --- a/metadata-ingestion/tests/unit/glue/test_glue_source.py +++ b/metadata-ingestion/tests/unit/glue/test_glue_source.py @@ -34,7 +34,6 @@ run_and_get_pipeline, validate_all_providers_have_committed_successfully, ) -from tests.test_helpers.type_helpers import PytestConfig from 
tests.unit.glue.test_glue_source_stubs import ( databases_1, databases_2, @@ -174,7 +173,7 @@ def test_column_type(hive_column_type: str, expected_type: Type) -> None: @freeze_time(FROZEN_TIME) def test_glue_ingest( tmp_path: Path, - pytestconfig: PytestConfig, + pytestconfig: pytest.Config, platform_instance: str, mce_file: str, mce_golden_file: str, @@ -410,7 +409,7 @@ def test_glue_stateful(pytestconfig, tmp_path, mock_time, mock_datahub_graph): def test_glue_with_delta_schema_ingest( tmp_path: Path, - pytestconfig: PytestConfig, + pytestconfig: pytest.Config, ) -> None: glue_source_instance = glue_source( platform_instance="delta_platform_instance", @@ -446,7 +445,7 @@ def test_glue_with_delta_schema_ingest( def test_glue_with_malformed_delta_schema_ingest( tmp_path: Path, - pytestconfig: PytestConfig, + pytestconfig: pytest.Config, ) -> None: glue_source_instance = glue_source( platform_instance="delta_platform_instance", @@ -489,7 +488,7 @@ def test_glue_with_malformed_delta_schema_ingest( @freeze_time(FROZEN_TIME) def test_glue_ingest_include_table_lineage( tmp_path: Path, - pytestconfig: PytestConfig, + pytestconfig: pytest.Config, mock_datahub_graph_instance: DataHubGraph, platform_instance: str, mce_file: str, @@ -584,7 +583,7 @@ def test_glue_ingest_include_table_lineage( @freeze_time(FROZEN_TIME) def test_glue_ingest_include_column_lineage( tmp_path: Path, - pytestconfig: PytestConfig, + pytestconfig: pytest.Config, mock_datahub_graph_instance: DataHubGraph, platform_instance: str, mce_file: str, @@ -684,7 +683,7 @@ def fake_schema_metadata(entity_urn: str) -> models.SchemaMetadataClass: @freeze_time(FROZEN_TIME) def test_glue_ingest_with_profiling( tmp_path: Path, - pytestconfig: PytestConfig, + pytestconfig: pytest.Config, ) -> None: glue_source_instance = glue_source_with_profiling() mce_file = "glue_mces.json" diff --git a/metadata-ingestion/tests/unit/redshift/test_redshift_source.py b/metadata-ingestion/tests/unit/redshift/test_redshift_source.py index 8198caf50df7f4..f016312dfe47fb 100644 --- a/metadata-ingestion/tests/unit/redshift/test_redshift_source.py +++ b/metadata-ingestion/tests/unit/redshift/test_redshift_source.py @@ -1,15 +1,15 @@ from typing import Iterable -from datahub.emitter.mcp import ( - MetadataChangeProposalClass, - MetadataChangeProposalWrapper, -) +from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.source.redshift.config import RedshiftConfig from datahub.ingestion.source.redshift.redshift import RedshiftSource from datahub.ingestion.source.redshift.redshift_schema import RedshiftTable -from datahub.metadata.schema_classes import MetadataChangeEventClass +from datahub.metadata.schema_classes import ( + MetadataChangeEventClass, + MetadataChangeProposalClass, +) def redshift_source_setup(custom_props_flag: bool) -> Iterable[MetadataWorkUnit]: diff --git a/metadata-ingestion/tests/unit/serde/test_serde.py b/metadata-ingestion/tests/unit/serde/test_serde.py index 727f2b10511b5e..a131ac9ce2a1bc 100644 --- a/metadata-ingestion/tests/unit/serde/test_serde.py +++ b/metadata-ingestion/tests/unit/serde/test_serde.py @@ -19,7 +19,6 @@ from datahub.metadata.schemas import getMetadataChangeEventSchema from tests.test_helpers import mce_helpers from tests.test_helpers.click_helpers import run_datahub_cmd -from tests.test_helpers.type_helpers import PytestConfig FROZEN_TIME = "2021-07-22 18:54:06" @@ -41,7 +40,7 @@ 
], ) def test_serde_to_json( - pytestconfig: PytestConfig, tmp_path: pathlib.Path, json_filename: str + pytestconfig: pytest.Config, tmp_path: pathlib.Path, json_filename: str ) -> None: golden_file = pytestconfig.rootpath / json_filename output_file = tmp_path / "output.json" @@ -73,7 +72,7 @@ def test_serde_to_json( ) @freeze_time(FROZEN_TIME) def test_serde_to_avro( - pytestconfig: PytestConfig, + pytestconfig: pytest.Config, json_filename: str, ) -> None: # In this test, we want to read in from JSON -> MCE object. @@ -126,14 +125,14 @@ def test_serde_to_avro( ], ) @freeze_time(FROZEN_TIME) -def test_check_metadata_schema(pytestconfig: PytestConfig, json_filename: str) -> None: +def test_check_metadata_schema(pytestconfig: pytest.Config, json_filename: str) -> None: json_file_path = pytestconfig.rootpath / json_filename run_datahub_cmd(["check", "metadata-file", f"{json_file_path}"]) def test_check_metadata_rewrite( - pytestconfig: PytestConfig, tmp_path: pathlib.Path + pytestconfig: pytest.Config, tmp_path: pathlib.Path ) -> None: json_input = ( pytestconfig.rootpath / "tests/unit/serde/test_canonicalization_input.json" @@ -161,7 +160,7 @@ def test_check_metadata_rewrite( ], ) def test_check_mce_schema_failure( - pytestconfig: PytestConfig, json_filename: str + pytestconfig: pytest.Config, json_filename: str ) -> None: json_file_path = pytestconfig.rootpath / json_filename diff --git a/metadata-ingestion/tests/unit/test_snowflake_shares.py b/metadata-ingestion/tests/unit/snowflake/test_snowflake_shares.py similarity index 100% rename from metadata-ingestion/tests/unit/test_snowflake_shares.py rename to metadata-ingestion/tests/unit/snowflake/test_snowflake_shares.py diff --git a/metadata-ingestion/tests/unit/test_snowflake_source.py b/metadata-ingestion/tests/unit/snowflake/test_snowflake_source.py similarity index 99% rename from metadata-ingestion/tests/unit/test_snowflake_source.py rename to metadata-ingestion/tests/unit/snowflake/test_snowflake_source.py index 72b59a3a4e4938..161dfa2b4e78f3 100644 --- a/metadata-ingestion/tests/unit/test_snowflake_source.py +++ b/metadata-ingestion/tests/unit/snowflake/test_snowflake_source.py @@ -5,7 +5,6 @@ from pydantic import ValidationError from datahub.configuration.common import AllowDenyPattern -from datahub.configuration.oauth import OAuthConfiguration from datahub.configuration.pattern_utils import UUID_REGEX from datahub.ingestion.api.source import SourceCapability from datahub.ingestion.source.snowflake.constants import ( @@ -13,6 +12,7 @@ CLIENT_SESSION_KEEP_ALIVE, SnowflakeCloudProvider, ) +from datahub.ingestion.source.snowflake.oauth_config import OAuthConfiguration from datahub.ingestion.source.snowflake.snowflake_config import ( DEFAULT_TEMP_TABLES_PATTERNS, SnowflakeV2Config, diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_sqlite_attach_database.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_sqlite_attach_database.json new file mode 100644 index 00000000000000..bcf31f6be803a2 --- /dev/null +++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_sqlite_attach_database.json @@ -0,0 +1,12 @@ +{ + "query_type": "UNKNOWN", + "query_type_props": {}, + "query_fingerprint": null, + "in_tables": [], + "out_tables": [], + "column_lineage": null, + "debug_info": { + "confidence": 0.0, + "generalized_statement": null + } +} \ No newline at end of file diff --git a/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_lineage.py b/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_lineage.py index 
90cc863d6bd231..170341230205f3 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_lineage.py +++ b/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_lineage.py @@ -1268,3 +1268,14 @@ def test_bigquery_subquery_column_inference() -> None: dialect="bigquery", expected_file=RESOURCE_DIR / "test_bigquery_subquery_column_inference.json", ) + + +def test_sqlite_attach_database() -> None: + assert_sql_result( + """\ +ATTACH DATABASE ':memory:' AS aux1 +""", + dialect="sqlite", + expected_file=RESOURCE_DIR / "test_sqlite_attach_database.json", + allow_table_error=True, + ) diff --git a/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_utils.py b/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_utils.py index 744d43373a0a1f..4e8ba8aa6b7770 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_utils.py +++ b/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_utils.py @@ -4,11 +4,9 @@ import pytest import sqlglot +from datahub.sql_parsing.query_types import get_query_type_of_sql from datahub.sql_parsing.sql_parsing_common import QueryType -from datahub.sql_parsing.sqlglot_lineage import ( - _UPDATE_ARGS_NOT_SUPPORTED_BY_SELECT, - get_query_type_of_sql, -) +from datahub.sql_parsing.sqlglot_lineage import _UPDATE_ARGS_NOT_SUPPORTED_BY_SELECT from datahub.sql_parsing.sqlglot_utils import ( generalize_query, generalize_query_fast, diff --git a/metadata-ingestion/tests/unit/stateful_ingestion/state/test_stale_entity_removal_handler.py b/metadata-ingestion/tests/unit/stateful_ingestion/state/test_stale_entity_removal_handler.py index be2d8bac12e386..b04d4b86d2e4bb 100644 --- a/metadata-ingestion/tests/unit/stateful_ingestion/state/test_stale_entity_removal_handler.py +++ b/metadata-ingestion/tests/unit/stateful_ingestion/state/test_stale_entity_removal_handler.py @@ -50,7 +50,6 @@ def test_change_percent( def test_filter_ignored_entity_types(): - assert filter_ignored_entity_types( [ "urn:li:dataset:(urn:li:dataPlatform:postgres,dummy_dataset1,PROD)", diff --git a/metadata-ingestion/tests/unit/stateful_ingestion/state/test_stateful_ingestion.py b/metadata-ingestion/tests/unit/stateful_ingestion/state/test_stateful_ingestion.py index 66564dc856abae..96ab8f7a01a386 100644 --- a/metadata-ingestion/tests/unit/stateful_ingestion/state/test_stateful_ingestion.py +++ b/metadata-ingestion/tests/unit/stateful_ingestion/state/test_stateful_ingestion.py @@ -10,7 +10,8 @@ from datahub.api.entities.dataprocess.dataprocess_instance import DataProcessInstance from datahub.configuration.common import AllowDenyPattern -from datahub.configuration.source_common import DEFAULT_ENV, DatasetSourceConfigMixin +from datahub.configuration.source_common import DatasetSourceConfigMixin +from datahub.emitter.mce_builder import DEFAULT_ENV from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceReport diff --git a/metadata-ingestion/tests/unit/test_cassandra_source.py b/metadata-ingestion/tests/unit/test_cassandra_source.py index a4ca3a0a9ef3f6..75dedde76c7c89 100644 --- a/metadata-ingestion/tests/unit/test_cassandra_source.py +++ b/metadata-ingestion/tests/unit/test_cassandra_source.py @@ -56,7 +56,6 @@ def assert_field_paths_match( def test_cassandra_schema_conversion( schema: str, expected_field_paths: List[str] ) -> None: - schema_dict: Dict[str, List[Any]] = json.loads(schema) column_infos: List = schema_dict["column_infos"] diff --git 
a/metadata-ingestion/tests/unit/test_sql_common.py b/metadata-ingestion/tests/unit/test_sql_common.py index a98bf641711220..cfb8f55bd977f7 100644 --- a/metadata-ingestion/tests/unit/test_sql_common.py +++ b/metadata-ingestion/tests/unit/test_sql_common.py @@ -38,7 +38,7 @@ def test_generate_foreign_key(): "referred_columns": ["test_referred_column"], # type: ignore } foreign_key = source.get_foreign_key_metadata( - dataset_urn="test_urn", + dataset_urn="urn:li:dataset:(urn:li:dataPlatform:TEST,test_schema.base_urn,PROD)", schema="test_schema", fk_dict=fk_dict, inspector=mock.Mock(), @@ -48,7 +48,9 @@ def test_generate_foreign_key(): assert [ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:TEST,test_referred_schema.test_table,PROD),test_referred_column)" ] == foreign_key.foreignFields - assert ["urn:li:schemaField:(test_urn,test_column)"] == foreign_key.sourceFields + assert [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:TEST,test_schema.base_urn,PROD),test_column)" + ] == foreign_key.sourceFields def test_use_source_schema_for_foreign_key_if_not_specified(): @@ -60,7 +62,7 @@ def test_use_source_schema_for_foreign_key_if_not_specified(): "referred_columns": ["test_referred_column"], # type: ignore } foreign_key = source.get_foreign_key_metadata( - dataset_urn="test_urn", + dataset_urn="urn:li:dataset:(urn:li:dataPlatform:TEST,test_schema.base_urn,PROD)", schema="test_schema", fk_dict=fk_dict, inspector=mock.Mock(), @@ -70,7 +72,9 @@ def test_use_source_schema_for_foreign_key_if_not_specified(): assert [ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:TEST,test_schema.test_table,PROD),test_referred_column)" ] == foreign_key.foreignFields - assert ["urn:li:schemaField:(test_urn,test_column)"] == foreign_key.sourceFields + assert [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:TEST,test_schema.base_urn,PROD),test_column)" + ] == foreign_key.sourceFields PLATFORM_FROM_SQLALCHEMY_URI_TEST_CASES: Dict[str, str] = { diff --git a/metadata-ingestion/tests/unit/test_parsing_util.py b/metadata-ingestion/tests/unit/utilities/test_parsing_util.py similarity index 100% rename from metadata-ingestion/tests/unit/test_parsing_util.py rename to metadata-ingestion/tests/unit/utilities/test_parsing_util.py diff --git a/metadata-ingestion/tests/unit/utilities/test_progress_timer.py b/metadata-ingestion/tests/unit/utilities/test_progress_timer.py new file mode 100644 index 00000000000000..139bad371bb9f4 --- /dev/null +++ b/metadata-ingestion/tests/unit/utilities/test_progress_timer.py @@ -0,0 +1,53 @@ +from datetime import timedelta +from time import sleep + +from datahub.utilities.progress_timer import ProgressTimer + + +def test_progress_timer_basic(): + timer = ProgressTimer(report_every=timedelta(milliseconds=100)) + + # First call should not report since report_0=False by default + assert not timer.should_report() + + # Call before interval elapsed should not report + sleep(0.05) # 50ms + assert not timer.should_report() + + # Call after interval elapsed should report + sleep(0.1) # Additional 100ms + assert timer.should_report() + + # Next immediate call should not report + assert not timer.should_report() + + +def test_progress_timer_with_report_0(): + timer = ProgressTimer(report_every=timedelta(milliseconds=100), report_0=True) + + # First call should report since report_0=True + assert timer.should_report() + + # Next immediate call should not report + assert not timer.should_report() + + # Call after interval elapsed should report + sleep(0.1) # 100ms + 
assert timer.should_report() + + +def test_progress_timer_multiple_intervals(): + timer = ProgressTimer(report_every=timedelta(milliseconds=50)) + + # First call should not report + assert not timer.should_report() + + # Check multiple intervals + sleep(0.06) # 60ms - should report + assert timer.should_report() + + sleep(0.02) # 20ms - should not report + assert not timer.should_report() + + sleep(0.05) # 50ms - should report + assert timer.should_report() diff --git a/metadata-integration/java/acryl-spark-lineage/README.md index bd0a58b635b483..267e979b0fa073 100644 --- a/metadata-integration/java/acryl-spark-lineage/README.md +++ b/metadata-integration/java/acryl-spark-lineage/README.md @@ -165,6 +165,7 @@ information like tokens. | spark.datahub.rest.server | | http://localhost:8080 | Datahub server url eg: | | spark.datahub.rest.token | | | Authentication token. | | spark.datahub.rest.disable_ssl_verification | | false | Disable SSL certificate validation. Caution: Only use this if you know what you are doing! | +| spark.datahub.rest.disable_chunked_encoding | | false | Disable chunked transfer encoding. In some environments chunked encoding causes issues; this config option lets you disable it. | | spark.datahub.rest.max_retries | | 0 | Number of times a request retried if failed | | spark.datahub.rest.retry_interval | | 10 | Number of seconds to wait between retries | | spark.datahub.file.filename | | | The file where metadata will be written if file emitter is set | diff --git a/metadata-integration/java/acryl-spark-lineage/build.gradle index 6620c34021ac4a..3f83e5657bbf4d 100644 --- a/metadata-integration/java/acryl-spark-lineage/build.gradle +++ b/metadata-integration/java/acryl-spark-lineage/build.gradle @@ -1,7 +1,7 @@ plugins { id("com.palantir.git-version") apply false } -apply plugin: 'java' +apply plugin: 'java-library' apply plugin: 'com.github.johnrengelman.shadow' apply plugin: 'signing' apply plugin: 'io.codearte.nexus-staging' @@ -51,8 +51,8 @@ dependencies { implementation project(':metadata-integration:java:openlineage-converter') - implementation project(path: ':metadata-integration:java:datahub-client', configuration: 'shadow') - implementation project(path: ':metadata-integration:java:openlineage-converter', configuration: 'shadow') + implementation project(path: ':metadata-integration:java:datahub-client') + implementation project(path: ':metadata-integration:java:openlineage-converter') //implementation "io.acryl:datahub-client:0.10.2" implementation "io.openlineage:openlineage-spark_2.12:$openLineageVersion" @@ -91,6 +91,8 @@ shadowJar { zip64 = true archiveClassifier = '' mergeServiceFiles() + project.configurations.implementation.canBeResolved = true + configurations = [project.configurations.implementation] def exclude_modules = project .configurations @@ -106,6 +108,8 @@ shadowJar { exclude(dependency { exclude_modules.contains(it.name) }) + exclude(dependency("org.slf4j::")) + exclude("org/apache/commons/logging/**") } // preventing java multi-release JAR leakage @@ -123,39 +127,36 @@ shadowJar { relocate 'com.sun.activation', 'io.acryl.shaded.com.sun.activation' relocate 'com.sun.codemodel', 'io.acryl.shaded.com.sun.codemodel' relocate 'com.sun.mail', 'io.acryl.shaded.com.sun.mail' - relocate 'com.fasterxml.jackson', 'datahub.spark2.shaded.jackson' - relocate 'org.slf4j', 'datahub.spark2.shaded.org.slf4j' // relocate 'org.apache.hc', 
'io.acryl.shaded.http' - relocate 'org.apache.commons.codec', 'datahub.spark2.shaded.o.a.c.codec' - relocate 'org.apache.commons.compress', 'datahub.spark2.shaded.o.a.c.compress' - relocate 'org.apache.commons.lang3', 'datahub.spark2.shaded.o.a.c.lang3' + relocate 'org.apache.commons.codec', 'io.acryl.shaded.org.apache.commons.codec' + relocate 'org.apache.commons.compress', 'io.acryl.shaded.org.apache.commons.compress' + relocate 'org.apache.commons.lang3', 'io.acryl.shaded.org.apache.commons.lang3' relocate 'mozilla', 'datahub.spark2.shaded.mozilla' - relocate 'com.typesafe', 'datahub.spark2.shaded.typesafe' - relocate 'io.opentracing', 'datahub.spark2.shaded.io.opentracing' - relocate 'io.netty', 'datahub.spark2.shaded.io.netty' - relocate 'ch.randelshofer', 'datahub.spark2.shaded.ch.randelshofer' - relocate 'ch.qos', 'datahub.spark2.shaded.ch.qos' + relocate 'com.typesafe', 'io.acryl.shaded.com.typesafe' + relocate 'io.opentracing', 'io.acryl.shaded.io.opentracing' + relocate 'io.netty', 'io.acryl.shaded.io.netty' + relocate 'ch.randelshofer', 'io.acryl.shaded.ch.randelshofer' + relocate 'ch.qos', 'io.acryl.shaded.ch.qos' relocate 'org.springframework', 'io.acryl.shaded.org.springframework' relocate 'com.fasterxml.jackson', 'io.acryl.shaded.jackson' relocate 'org.yaml', 'io.acryl.shaded.org.yaml' // Required for shading snakeyaml relocate 'net.jcip.annotations', 'io.acryl.shaded.annotations' relocate 'javassist', 'io.acryl.shaded.javassist' relocate 'edu.umd.cs.findbugs', 'io.acryl.shaded.findbugs' - relocate 'org.antlr', 'io.acryl.shaded.org.antlr' - relocate 'antlr', 'io.acryl.shaded.antlr' + //relocate 'org.antlr', 'io.acryl.shaded.org.antlr' + //relocate 'antlr', 'io.acryl.shaded.antlr' relocate 'com.google.common', 'io.acryl.shaded.com.google.common' - relocate 'org.apache.commons', 'io.acryl.shaded.org.apache.commons' relocate 'org.reflections', 'io.acryl.shaded.org.reflections' relocate 'st4hidden', 'io.acryl.shaded.st4hidden' relocate 'org.stringtemplate', 'io.acryl.shaded.org.stringtemplate' relocate 'org.abego.treelayout', 'io.acryl.shaded.treelayout' - relocate 'org.slf4j', 'io.acryl.shaded.slf4j' relocate 'javax.annotation', 'io.acryl.shaded.javax.annotation' relocate 'com.github.benmanes.caffeine', 'io.acryl.shaded.com.github.benmanes.caffeine' relocate 'org.checkerframework', 'io.acryl.shaded.org.checkerframework' relocate 'com.google.errorprone', 'io.acryl.shaded.com.google.errorprone' relocate 'com.sun.jna', 'io.acryl.shaded.com.sun.jna' + } checkShadowJar { diff --git a/metadata-integration/java/acryl-spark-lineage/src/main/java/datahub/spark/DatahubSparkListener.java b/metadata-integration/java/acryl-spark-lineage/src/main/java/datahub/spark/DatahubSparkListener.java index ee0938edb50454..b594f6bae954fa 100644 --- a/metadata-integration/java/acryl-spark-lineage/src/main/java/datahub/spark/DatahubSparkListener.java +++ b/metadata-integration/java/acryl-spark-lineage/src/main/java/datahub/spark/DatahubSparkListener.java @@ -120,7 +120,9 @@ public Optional initializeEmitter(Config sparkConf) { boolean disableSslVerification = sparkConf.hasPath(SparkConfigParser.DISABLE_SSL_VERIFICATION_KEY) && sparkConf.getBoolean(SparkConfigParser.DISABLE_SSL_VERIFICATION_KEY); - + boolean disableChunkedEncoding = + sparkConf.hasPath(SparkConfigParser.REST_DISABLE_CHUNKED_ENCODING) + && sparkConf.getBoolean(SparkConfigParser.REST_DISABLE_CHUNKED_ENCODING); int retry_interval_in_sec = sparkConf.hasPath(SparkConfigParser.RETRY_INTERVAL_IN_SEC) ? 
sparkConf.getInt(SparkConfigParser.RETRY_INTERVAL_IN_SEC) @@ -150,6 +152,7 @@ public Optional initializeEmitter(Config sparkConf) { .disableSslVerification(disableSslVerification) .maxRetries(max_retries) .retryIntervalSec(retry_interval_in_sec) + .disableChunkedEncoding(disableChunkedEncoding) .build(); return Optional.of(new RestDatahubEmitterConfig(restEmitterConf)); case "kafka": @@ -374,7 +377,8 @@ private static void initializeMetrics(OpenLineageConfig openLineageConfig) { String disabledFacets; if (openLineageConfig.getFacetsConfig() != null && openLineageConfig.getFacetsConfig().getDisabledFacets() != null) { - disabledFacets = String.join(";", openLineageConfig.getFacetsConfig().getDisabledFacets()); + disabledFacets = + String.join(";", openLineageConfig.getFacetsConfig().getEffectiveDisabledFacets()); } else { disabledFacets = ""; } diff --git a/metadata-integration/java/acryl-spark-lineage/src/main/java/datahub/spark/conf/SparkConfigParser.java b/metadata-integration/java/acryl-spark-lineage/src/main/java/datahub/spark/conf/SparkConfigParser.java index 45ec5365d09b36..3860285083c4bb 100644 --- a/metadata-integration/java/acryl-spark-lineage/src/main/java/datahub/spark/conf/SparkConfigParser.java +++ b/metadata-integration/java/acryl-spark-lineage/src/main/java/datahub/spark/conf/SparkConfigParser.java @@ -30,6 +30,8 @@ public class SparkConfigParser { public static final String GMS_AUTH_TOKEN = "rest.token"; public static final String FILE_EMITTER_FILE_NAME = "file.filename"; public static final String DISABLE_SSL_VERIFICATION_KEY = "rest.disable_ssl_verification"; + public static final String REST_DISABLE_CHUNKED_ENCODING = "rest.disable_chunked_encoding"; + public static final String MAX_RETRIES = "rest.max_retries"; public static final String RETRY_INTERVAL_IN_SEC = "rest.retry_interval_in_sec"; public static final String KAFKA_MCP_TOPIC = "kafka.mcp_topic"; diff --git a/metadata-integration/java/acryl-spark-lineage/src/main/java/io/openlineage/spark/agent/util/PlanUtils.java b/metadata-integration/java/acryl-spark-lineage/src/main/java/io/openlineage/spark/agent/util/PlanUtils.java index d46d741d155b8b..5f87df2a65d6c2 100644 --- a/metadata-integration/java/acryl-spark-lineage/src/main/java/io/openlineage/spark/agent/util/PlanUtils.java +++ b/metadata-integration/java/acryl-spark-lineage/src/main/java/io/openlineage/spark/agent/util/PlanUtils.java @@ -5,14 +5,13 @@ package io.openlineage.spark.agent.util; -import static io.openlineage.spark.agent.lifecycle.ExecutionContext.CAMEL_TO_SNAKE_CASE; - import com.typesafe.config.Config; import com.typesafe.config.ConfigFactory; import datahub.spark.conf.SparkLineageConf; import io.datahubproject.openlineage.dataset.HdfsPathDataset; import io.openlineage.client.OpenLineage; import io.openlineage.spark.agent.Versions; +import io.openlineage.spark.api.naming.NameNormalizer; import java.io.IOException; import java.net.URI; import java.net.URISyntaxException; @@ -21,7 +20,6 @@ import java.util.Collection; import java.util.Collections; import java.util.List; -import java.util.Locale; import java.util.Objects; import java.util.Optional; import java.util.UUID; @@ -186,7 +184,7 @@ public static OpenLineage.ParentRunFacet parentRunFacet( .run(new OpenLineage.ParentRunFacetRunBuilder().runId(parentRunId).build()) .job( new OpenLineage.ParentRunFacetJobBuilder() - .name(parentJob.replaceAll(CAMEL_TO_SNAKE_CASE, "_$1").toLowerCase(Locale.ROOT)) + .name(NameNormalizer.normalize(parentJob)) .namespace(parentJobNamespace) .build()) .build(); @@ 
-287,8 +285,6 @@ public static boolean safeIsDefinedAt(PartialFunction pfn, Object x) { * @param pfn * @param x * @return - * @param - * @param */ public static List safeApply(PartialFunction> pfn, D x) { try { diff --git a/metadata-integration/java/acryl-spark-lineage/src/main/java/io/openlineage/spark/agent/util/RddPathUtils.java b/metadata-integration/java/acryl-spark-lineage/src/main/java/io/openlineage/spark/agent/util/RddPathUtils.java index 62005bf15f8505..6ef7403362a909 100644 --- a/metadata-integration/java/acryl-spark-lineage/src/main/java/io/openlineage/spark/agent/util/RddPathUtils.java +++ b/metadata-integration/java/acryl-spark-lineage/src/main/java/io/openlineage/spark/agent/util/RddPathUtils.java @@ -7,6 +7,7 @@ import java.util.Arrays; import java.util.Objects; +import java.util.concurrent.atomic.AtomicBoolean; import java.util.stream.Stream; import lombok.extern.slf4j.Slf4j; import org.apache.commons.lang3.reflect.FieldUtils; @@ -18,6 +19,7 @@ import org.apache.spark.rdd.MapPartitionsRDD; import org.apache.spark.rdd.ParallelCollectionRDD; import org.apache.spark.rdd.RDD; +import org.apache.spark.sql.execution.datasources.FilePartition; import org.apache.spark.sql.execution.datasources.FileScanRDD; import scala.Tuple2; import scala.collection.immutable.Seq; @@ -90,7 +92,7 @@ public boolean isDefinedAt(Object rdd) { @SuppressWarnings("PMD.AvoidLiteralsInIfCondition") public Stream extract(FileScanRDD rdd) { return ScalaConversionUtils.fromSeq(rdd.filePartitions()).stream() - .flatMap(fp -> Arrays.stream(fp.files())) + .flatMap((FilePartition fp) -> Arrays.stream(fp.files())) .map( f -> { if ("3.4".compareTo(package$.MODULE$.SPARK_VERSION()) <= 0) { @@ -115,11 +117,15 @@ public boolean isDefinedAt(Object rdd) { @Override public Stream extract(ParallelCollectionRDD rdd) { + int SEQ_LIMIT = 1000; + AtomicBoolean loggingDone = new AtomicBoolean(false); try { Object data = FieldUtils.readField(rdd, "data", true); log.debug("ParallelCollectionRDD data: {}", data); - if (data instanceof Seq) { - return ScalaConversionUtils.fromSeq((Seq) data).stream() + if ((data instanceof Seq) && ((Seq) data).head() instanceof Tuple2) { + // exit if the first element is invalid + Seq data_slice = (Seq) ((Seq) data).slice(0, SEQ_LIMIT); + return ScalaConversionUtils.fromSeq(data_slice).stream() .map( el -> { Path path = null; @@ -127,9 +133,9 @@ public Stream extract(ParallelCollectionRDD rdd) { // we're able to extract path path = parentOf(((Tuple2) el)._1.toString()); log.debug("Found input {}", path); - } else { - // Change to debug to silence error - log.debug("unable to extract Path from {}", el.getClass().getCanonicalName()); + } else if (!loggingDone.get()) { + log.warn("unable to extract Path from {}", el.getClass().getCanonicalName()); + loggingDone.set(true); } return path; }) diff --git a/metadata-integration/java/datahub-client/build.gradle b/metadata-integration/java/datahub-client/build.gradle index d9087347e1b5c6..1bdc848d0385b1 100644 --- a/metadata-integration/java/datahub-client/build.gradle +++ b/metadata-integration/java/datahub-client/build.gradle @@ -1,6 +1,6 @@ plugins { id("com.palantir.git-version") apply false - id 'java' + id 'java-library' id 'com.github.johnrengelman.shadow' id 'jacoco' id 'signing' @@ -12,11 +12,13 @@ apply from: "../versioning.gradle" import org.apache.tools.ant.filters.ReplaceTokens -jar.enabled = false // Since we only want to build shadow jars, disabling the regular jar creation +jar { + archiveClassifier = "lib" +} dependencies { - 
implementation project(':entity-registry') - implementation project(':metadata-integration:java:datahub-event') + api project(':entity-registry') + api project(':metadata-integration:java:datahub-event') implementation(externalDependency.kafkaAvroSerializer) { exclude group: "org.apache.avro" } @@ -33,7 +35,7 @@ dependencies { implementation externalDependency.jacksonDataBind runtimeOnly externalDependency.jna - implementation externalDependency.slf4jApi + api externalDependency.slf4jApi compileOnly externalDependency.lombok annotationProcessor externalDependency.lombok // VisibleForTesting @@ -78,6 +80,11 @@ shadowJar { // https://github.com/johnrengelman/shadow/issues/729 exclude('module-info.class', 'META-INF/versions/**', '**/LICENSE', '**/LICENSE*.txt', '**/NOTICE', '**/NOTICE.txt', 'licenses/**', 'log4j2.*', 'log4j.*') + dependencies { + exclude(dependency("org.slf4j::")) + exclude(dependency("antlr::")) + exclude("org/apache/commons/logging/**") + } mergeServiceFiles() // we relocate namespaces manually, because we want to know exactly which libs we are exposing and why // we can move to automatic relocation using ConfigureShadowRelocation after we get to a good place on these first @@ -88,15 +95,20 @@ shadowJar { relocate 'javassist', 'datahub.shaded.javassist' relocate 'edu.umd.cs.findbugs', 'datahub.shaded.findbugs' relocate 'org.antlr', 'datahub.shaded.org.antlr' - relocate 'antlr', 'datahub.shaded.antlr' + //relocate 'antlr', 'datahub.shaded.antlr' relocate 'com.google.common', 'datahub.shaded.com.google.common' - relocate 'org.apache.commons', 'datahub.shaded.org.apache.commons' + relocate 'org.apache.commons.codec', 'datahub.shaded.org.apache.commons.codec' + relocate 'org.apache.commons.compress', 'datahub.shaded.org.apache.commons.compress' + relocate 'org.apache.commons.lang3', 'datahub.shaded.org.apache.commons.lang3' + relocate 'org.apache.commons.lang', 'datahub.shaded.org.apache.commons.lang' + relocate 'org.apache.commons.cli', 'datahub.shaded.org.apache.commons.cli' + relocate 'org.apache.commons.text', 'datahub.shaded.org.apache.commons.text' + relocate 'org.apache.commons.io', 'datahub.shaded.org.apache.commons.io' relocate 'org.apache.maven', 'datahub.shaded.org.apache.maven' relocate 'org.reflections', 'datahub.shaded.org.reflections' relocate 'st4hidden', 'datahub.shaded.st4hidden' relocate 'org.stringtemplate', 'datahub.shaded.org.stringtemplate' relocate 'org.abego.treelayout', 'datahub.shaded.treelayout' - relocate 'org.slf4j', 'datahub.shaded.slf4j' relocate 'javax.annotation', 'datahub.shaded.javax.annotation' relocate 'com.github.benmanes.caffeine', 'datahub.shaded.com.github.benmanes.caffeine' relocate 'org.checkerframework', 'datahub.shaded.org.checkerframework' diff --git a/metadata-integration/java/datahub-client/src/main/java/datahub/client/rest/DatahubHttpRequestRetryStrategy.java b/metadata-integration/java/datahub-client/src/main/java/datahub/client/rest/DatahubHttpRequestRetryStrategy.java index 71a4b93baf48f4..50c0277c98b03b 100644 --- a/metadata-integration/java/datahub-client/src/main/java/datahub/client/rest/DatahubHttpRequestRetryStrategy.java +++ b/metadata-integration/java/datahub-client/src/main/java/datahub/client/rest/DatahubHttpRequestRetryStrategy.java @@ -48,7 +48,6 @@ public boolean retryRequest( @Override public boolean retryRequest(HttpResponse response, int execCount, HttpContext context) { - log.warn("Retrying request due to error: {}", response); return super.retryRequest(response, execCount, context); } } diff --git 
a/metadata-integration/java/datahub-client/src/main/java/datahub/client/rest/RestEmitter.java b/metadata-integration/java/datahub-client/src/main/java/datahub/client/rest/RestEmitter.java index e1017372be124b..d70c5baf10879d 100644 --- a/metadata-integration/java/datahub-client/src/main/java/datahub/client/rest/RestEmitter.java +++ b/metadata-integration/java/datahub-client/src/main/java/datahub/client/rest/RestEmitter.java @@ -1,6 +1,7 @@ package datahub.client.rest; import static com.linkedin.metadata.Constants.*; +import static org.apache.hc.core5.http.HttpHeaders.*; import com.fasterxml.jackson.annotation.JsonInclude; import com.fasterxml.jackson.core.StreamReadConstraints; @@ -18,6 +19,7 @@ import datahub.event.UpsertAspectRequest; import java.io.ByteArrayOutputStream; import java.io.IOException; +import java.nio.charset.StandardCharsets; import java.security.KeyManagementException; import java.security.KeyStoreException; import java.security.NoSuchAlgorithmException; @@ -26,6 +28,7 @@ import java.util.concurrent.CountDownLatch; import java.util.concurrent.ExecutionException; import java.util.concurrent.Future; +import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicReference; import java.util.function.Consumer; import javax.annotation.concurrent.ThreadSafe; @@ -97,17 +100,20 @@ public RestEmitter(RestEmitterConfig config) { this.config = config; HttpAsyncClientBuilder httpClientBuilder = this.config.getAsyncHttpClientBuilder(); httpClientBuilder.setRetryStrategy(new DatahubHttpRequestRetryStrategy()); - - // Override httpClient settings with RestEmitter configs if present - if (config.getTimeoutSec() != null) { - httpClientBuilder.setDefaultRequestConfig( - RequestConfig.custom() - .setConnectionRequestTimeout( - config.getTimeoutSec() * 1000, java.util.concurrent.TimeUnit.MILLISECONDS) - .setResponseTimeout( - config.getTimeoutSec() * 1000, java.util.concurrent.TimeUnit.MILLISECONDS) - .build()); + if ((config.getTimeoutSec() != null) || (config.isDisableChunkedEncoding())) { + RequestConfig.Builder requestConfigBuilder = RequestConfig.custom(); + // Override httpClient settings with RestEmitter configs if present + if (config.getTimeoutSec() != null) { + requestConfigBuilder + .setConnectionRequestTimeout(config.getTimeoutSec() * 1000, TimeUnit.MILLISECONDS) + .setResponseTimeout(config.getTimeoutSec() * 1000, TimeUnit.MILLISECONDS); + } + if (config.isDisableChunkedEncoding()) { + requestConfigBuilder.setContentCompressionEnabled(false); + } + httpClientBuilder.setDefaultRequestConfig(requestConfigBuilder.build()); } + PoolingAsyncClientConnectionManagerBuilder poolingAsyncClientConnectionManagerBuilder = PoolingAsyncClientConnectionManagerBuilder.create(); @@ -223,8 +229,13 @@ private Future postGeneric( if (this.config.getToken() != null) { simpleRequestBuilder.setHeader("Authorization", "Bearer " + this.config.getToken()); } + if (this.config.isDisableChunkedEncoding()) { + byte[] payloadBytes = payloadJson.getBytes(StandardCharsets.UTF_8); + simpleRequestBuilder.setBody(payloadBytes, ContentType.APPLICATION_JSON); + } else { + simpleRequestBuilder.setBody(payloadJson, ContentType.APPLICATION_JSON); + } - simpleRequestBuilder.setBody(payloadJson, ContentType.APPLICATION_JSON); AtomicReference responseAtomicReference = new AtomicReference<>(); CountDownLatch responseLatch = new CountDownLatch(1); FutureCallback httpCallback = diff --git a/metadata-integration/java/datahub-client/src/main/java/datahub/client/rest/RestEmitterConfig.java 
b/metadata-integration/java/datahub-client/src/main/java/datahub/client/rest/RestEmitterConfig.java index e28ad4ed660f0b..55c11aab0ebf3c 100644 --- a/metadata-integration/java/datahub-client/src/main/java/datahub/client/rest/RestEmitterConfig.java +++ b/metadata-integration/java/datahub-client/src/main/java/datahub/client/rest/RestEmitterConfig.java @@ -30,6 +30,8 @@ public class RestEmitterConfig { Integer timeoutSec; @Builder.Default boolean disableSslVerification = false; + @Builder.Default boolean disableChunkedEncoding = false; + @Builder.Default int maxRetries = 0; @Builder.Default int retryIntervalSec = 10; diff --git a/metadata-integration/java/openlineage-converter/build.gradle b/metadata-integration/java/openlineage-converter/build.gradle index 2e04881ab5ccda..d149104f089b36 100644 --- a/metadata-integration/java/openlineage-converter/build.gradle +++ b/metadata-integration/java/openlineage-converter/build.gradle @@ -1,4 +1,4 @@ -apply plugin: 'java' +apply plugin: 'java-library' apply plugin: 'com.github.johnrengelman.shadow' apply plugin: 'signing' apply plugin: 'maven-publish' diff --git a/smoke-test/tests/cypress/yarn.lock b/smoke-test/tests/cypress/yarn.lock index 2433e9f8fae08e..c6116609b11467 100644 --- a/smoke-test/tests/cypress/yarn.lock +++ b/smoke-test/tests/cypress/yarn.lock @@ -510,9 +510,9 @@ core-util-is@1.0.2: integrity sha1-tf1UIgqivFq1eqtxQMlAdUUDwac= cross-spawn@^7.0.0, cross-spawn@^7.0.2: - version "7.0.3" - resolved "https://registry.npmjs.org/cross-spawn/-/cross-spawn-7.0.3.tgz" - integrity sha512-iRDPJKUPVEND7dHPO8rkbOnPpyDygcDFtWjpeWNCgy8WP2rXcxXL8TskReQl6OrB2G7+UJrags1q15Fudc7G6w== + version "7.0.6" + resolved "https://registry.yarnpkg.com/cross-spawn/-/cross-spawn-7.0.6.tgz#8a58fe78f00dcd70c370451759dfbfaf03e8ee9f" + integrity sha512-uV2QOWP2nWzsy2aMp8aRibhi9dlzF5Hgh5SHaB9OiTGEyDTiJJyx0uy51QXdyWbtAHNua4XJzUKca3OzKUd3vA== dependencies: path-key "^3.1.0" shebang-command "^2.0.0"
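Note on the new rest.disable_chunked_encoding option introduced in this patch: the following is a minimal, illustrative Java sketch of turning it on through the RestEmitterConfig builder. The disableChunkedEncoding(...) builder method and the public RestEmitter(RestEmitterConfig) constructor are taken from the diffs above; the server(...) setter and the close() call are assumed to match the existing datahub-client API and are not part of this patch.

import datahub.client.rest.RestEmitter;
import datahub.client.rest.RestEmitterConfig;

public class DisableChunkedEncodingExample {
  public static void main(String[] args) throws Exception {
    // Chunked transfer encoding stays enabled by default; disabling it makes the
    // emitter send the serialized payload as a fixed-length byte[] body
    // (see the RestEmitter change above).
    RestEmitterConfig config =
        RestEmitterConfig.builder()
            .server("http://localhost:8080") // assumed: existing server field of RestEmitterConfig
            .disableChunkedEncoding(true)    // new flag added in this patch
            .build();

    RestEmitter emitter = new RestEmitter(config); // public constructor shown in this patch
    // ... emit MetadataChangeProposals as usual ...
    emitter.close(); // assumed: emitter is closeable like other datahub-client emitters
  }
}

In a Spark job the same behaviour would be toggled declaratively with spark.datahub.rest.disable_chunked_encoding=true, matching the README row and the SparkConfigParser key added in this patch.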