From 2d155ccaa9eb2966295d9c248fdb61a23354f305 Mon Sep 17 00:00:00 2001 From: david-leifker <114954101+david-leifker@users.noreply.github.com> Date: Tue, 19 Nov 2024 14:33:47 -0600 Subject: [PATCH 01/24] feat(mcl-upgrade): implement resume & urn pagination (#11889) --- .../upgrade/system/AbstractMCLStep.java | 131 ++++++++++----- ...ateSchemaFieldsFromSchemaMetadataStep.java | 2 +- .../DatahubUpgradeNonBlockingTest.java | 149 +++++++++++++++++- ...pgradeCliApplicationTestConfiguration.java | 12 +- 4 files changed, 248 insertions(+), 46 deletions(-) diff --git a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/AbstractMCLStep.java b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/AbstractMCLStep.java index 6c70aee88675c5..cd7947ce3c11aa 100644 --- a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/AbstractMCLStep.java +++ b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/AbstractMCLStep.java @@ -1,13 +1,12 @@ package com.linkedin.datahub.upgrade.system; -import static com.linkedin.metadata.Constants.DATA_HUB_UPGRADE_RESULT_ASPECT_NAME; - import com.linkedin.common.urn.Urn; import com.linkedin.datahub.upgrade.UpgradeContext; import com.linkedin.datahub.upgrade.UpgradeStep; import com.linkedin.datahub.upgrade.UpgradeStepResult; import com.linkedin.datahub.upgrade.impl.DefaultUpgradeStepResult; import com.linkedin.events.metadata.ChangeType; +import com.linkedin.metadata.aspect.SystemAspect; import com.linkedin.metadata.boot.BootstrapStep; import com.linkedin.metadata.entity.AspectDao; import com.linkedin.metadata.entity.EntityService; @@ -16,10 +15,13 @@ import com.linkedin.metadata.entity.ebean.PartitionedStream; import com.linkedin.metadata.entity.restoreindices.RestoreIndicesArgs; import com.linkedin.metadata.utils.AuditStampUtils; +import com.linkedin.upgrade.DataHubUpgradeResult; import com.linkedin.upgrade.DataHubUpgradeState; import com.linkedin.util.Pair; import io.datahubproject.metadata.context.OperationContext; import java.util.List; +import java.util.Map; +import java.util.Optional; import java.util.concurrent.ExecutionException; import java.util.concurrent.Future; import java.util.function.Function; @@ -33,6 +35,8 @@ */ @Slf4j public abstract class AbstractMCLStep implements UpgradeStep { + public static final String LAST_URN_KEY = "lastUrn"; + private final OperationContext opContext; private final EntityService entityService; private final AspectDao aspectDao; @@ -70,10 +74,30 @@ protected Urn getUpgradeIdUrn() { @Override public Function executable() { return (context) -> { + // Resume state + Optional prevResult = + context.upgrade().getUpgradeResult(opContext, getUpgradeIdUrn(), entityService); + String resumeUrn = + prevResult + .filter( + result -> + DataHubUpgradeState.IN_PROGRESS.equals(result.getState()) + && result.getResult() != null + && result.getResult().containsKey(LAST_URN_KEY)) + .map(result -> result.getResult().get(LAST_URN_KEY)) + .orElse(null); + if (resumeUrn != null) { + log.info("{}: Resuming from URN: {}", getUpgradeIdUrn(), resumeUrn); + } // re-using for configuring the sql scan RestoreIndicesArgs args = - new RestoreIndicesArgs().aspectName(getAspectName()).batchSize(batchSize).limit(limit); + new RestoreIndicesArgs() + .aspectName(getAspectName()) + .batchSize(batchSize) + .lastUrn(resumeUrn) + .urnBasedPagination(resumeUrn != null) + .limit(limit); if (getUrnLike() != null) { args = args.urnLike(getUrnLike()); @@ -86,40 +110,62 @@ public Function executable() { batch -> { 
log.info("Processing batch({}) of size {}.", getAspectName(), batchSize); - List, Boolean>> futures; - + List, SystemAspect>> futures; futures = EntityUtils.toSystemAspectFromEbeanAspects( opContext.getRetrieverContext().get(), batch.collect(Collectors.toList())) .stream() .map( - systemAspect -> - entityService.alwaysProduceMCLAsync( - opContext, - systemAspect.getUrn(), - systemAspect.getUrn().getEntityType(), - getAspectName(), - systemAspect.getAspectSpec(), - null, - systemAspect.getRecordTemplate(), - null, - systemAspect - .getSystemMetadata() - .setRunId(id()) - .setLastObserved(System.currentTimeMillis()), - AuditStampUtils.createDefaultAuditStamp(), - ChangeType.UPSERT)) - .collect(Collectors.toList()); - - futures.forEach( - f -> { - try { - f.getFirst().get(); - } catch (InterruptedException | ExecutionException e) { - throw new RuntimeException(e); - } - }); + systemAspect -> { + Pair, Boolean> future = + entityService.alwaysProduceMCLAsync( + opContext, + systemAspect.getUrn(), + systemAspect.getUrn().getEntityType(), + getAspectName(), + systemAspect.getAspectSpec(), + null, + systemAspect.getRecordTemplate(), + null, + systemAspect + .getSystemMetadata() + .setRunId(id()) + .setLastObserved(System.currentTimeMillis()), + AuditStampUtils.createDefaultAuditStamp(), + ChangeType.UPSERT); + return Pair., SystemAspect>of( + future.getFirst(), systemAspect); + }) + .toList(); + + SystemAspect lastAspect = + futures.stream() + .map( + f -> { + try { + f.getFirst().get(); + return f.getSecond(); + } catch (InterruptedException | ExecutionException e) { + throw new RuntimeException(e); + } + }) + .reduce((a, b) -> b) + .orElse(null); + + // record progress + if (lastAspect != null) { + log.info( + "{}: Saving state. Last urn:{}", getUpgradeIdUrn(), lastAspect.getUrn()); + context + .upgrade() + .setUpgradeResult( + opContext, + getUpgradeIdUrn(), + entityService, + DataHubUpgradeState.IN_PROGRESS, + Map.of(LAST_URN_KEY, lastAspect.getUrn().toString())); + } if (batchDelayMs > 0) { log.info("Sleeping for {} ms", batchDelayMs); @@ -142,12 +188,23 @@ public Function executable() { @Override /** Returns whether the upgrade should be skipped. */ public boolean skip(UpgradeContext context) { - boolean previouslyRun = - entityService.exists( - opContext, getUpgradeIdUrn(), DATA_HUB_UPGRADE_RESULT_ASPECT_NAME, true); - if (previouslyRun) { - log.info("{} was already run. Skipping.", id()); + Optional prevResult = + context.upgrade().getUpgradeResult(opContext, getUpgradeIdUrn(), entityService); + + boolean previousRunFinal = + prevResult + .filter( + result -> + DataHubUpgradeState.SUCCEEDED.equals(result.getState()) + || DataHubUpgradeState.ABORTED.equals(result.getState())) + .isPresent(); + + if (previousRunFinal) { + log.info( + "{} was already run. 
State: {} Skipping.", + id(), + prevResult.map(DataHubUpgradeResult::getState)); } - return previouslyRun; + return previousRunFinal; } } diff --git a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/schemafield/GenerateSchemaFieldsFromSchemaMetadataStep.java b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/schemafield/GenerateSchemaFieldsFromSchemaMetadataStep.java index eece83f4ab713e..55bc8edbf6a768 100644 --- a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/schemafield/GenerateSchemaFieldsFromSchemaMetadataStep.java +++ b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/schemafield/GenerateSchemaFieldsFromSchemaMetadataStep.java @@ -1,5 +1,6 @@ package com.linkedin.datahub.upgrade.system.schemafield; +import static com.linkedin.datahub.upgrade.system.AbstractMCLStep.LAST_URN_KEY; import static com.linkedin.metadata.Constants.APP_SOURCE; import static com.linkedin.metadata.Constants.DATASET_ENTITY_NAME; import static com.linkedin.metadata.Constants.SCHEMA_METADATA_ASPECT_NAME; @@ -61,7 +62,6 @@ */ @Slf4j public class GenerateSchemaFieldsFromSchemaMetadataStep implements UpgradeStep { - private static final String LAST_URN_KEY = "lastUrn"; private static final List REQUIRED_ASPECTS = List.of(SCHEMA_METADATA_ASPECT_NAME, STATUS_ASPECT_NAME); diff --git a/datahub-upgrade/src/test/java/com/linkedin/datahub/upgrade/DatahubUpgradeNonBlockingTest.java b/datahub-upgrade/src/test/java/com/linkedin/datahub/upgrade/DatahubUpgradeNonBlockingTest.java index f340e688ad7f77..21bc6b725cba2b 100644 --- a/datahub-upgrade/src/test/java/com/linkedin/datahub/upgrade/DatahubUpgradeNonBlockingTest.java +++ b/datahub-upgrade/src/test/java/com/linkedin/datahub/upgrade/DatahubUpgradeNonBlockingTest.java @@ -1,14 +1,18 @@ package com.linkedin.datahub.upgrade; +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.ArgumentMatchers.anyInt; import static org.mockito.ArgumentMatchers.eq; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.times; import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.when; import static org.testng.Assert.assertEquals; import static org.testng.Assert.assertFalse; import static org.testng.Assert.assertTrue; import static org.testng.AssertJUnit.assertNotNull; +import com.linkedin.data.template.StringMap; import com.linkedin.datahub.upgrade.impl.DefaultUpgradeManager; import com.linkedin.datahub.upgrade.system.SystemUpdateNonBlocking; import com.linkedin.datahub.upgrade.system.bootstrapmcps.BootstrapMCPStep; @@ -20,17 +24,30 @@ import com.linkedin.metadata.entity.AspectDao; import com.linkedin.metadata.entity.EntityService; import com.linkedin.metadata.entity.EntityServiceImpl; +import com.linkedin.metadata.entity.ebean.EbeanAspectV2; +import com.linkedin.metadata.entity.ebean.PartitionedStream; import com.linkedin.metadata.entity.restoreindices.RestoreIndicesArgs; import com.linkedin.mxe.Topics; +import com.linkedin.upgrade.DataHubUpgradeResult; +import com.linkedin.upgrade.DataHubUpgradeState; +import com.linkedin.util.Pair; import io.datahubproject.metadata.context.OperationContext; import io.datahubproject.test.metadata.context.TestOperationContexts; +import java.sql.Timestamp; +import java.util.Arrays; import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.concurrent.CompletableFuture; import java.util.stream.Collectors; +import java.util.stream.Stream; import javax.inject.Named; +import 
org.mockito.ArgumentCaptor; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.boot.test.context.SpringBootTest; import org.springframework.test.context.ActiveProfiles; import org.springframework.test.context.testng.AbstractTestNGSpringContextTests; +import org.testng.annotations.BeforeClass; import org.testng.annotations.Test; @ActiveProfiles("test") @@ -63,7 +80,12 @@ public class DatahubUpgradeNonBlockingTest extends AbstractTestNGSpringContextTe @Autowired private EntityServiceImpl entityService; - @Autowired private OperationContext opContext; + private OperationContext opContext; + + @BeforeClass + public void init() { + opContext = TestOperationContexts.systemContextNoValidate(); + } @Test public void testSystemUpdateNonBlockingInit() { @@ -81,10 +103,13 @@ public void testSystemUpdateNonBlockingInit() { } @Test - public void testReindexDataJobViaNodesCLLPaging() { + public void testReindexDataJobViaNodesCLLPagingArgs() { EntityService mockService = mock(EntityService.class); AspectDao mockAspectDao = mock(AspectDao.class); + PartitionedStream mockStream = mock(PartitionedStream.class); + when(mockStream.partition(anyInt())).thenReturn(Stream.empty()); + when(mockAspectDao.streamAspectBatches(any(RestoreIndicesArgs.class))).thenReturn(mockStream); ReindexDataJobViaNodesCLL cllUpgrade = new ReindexDataJobViaNodesCLL(opContext, mockService, mockAspectDao, true, 10, 0, 0); @@ -102,9 +127,79 @@ public void testReindexDataJobViaNodesCLLPaging() { .batchSize(10) .limit(0) .aspectName("dataJobInputOutput") + .urnBasedPagination(false) + .lastUrn(null) .urnLike("urn:li:dataJob:%"))); } + @Test + public void testReindexDataJobViaNodesCLLResumePaging() throws Exception { + // Mock services + EntityService mockService = mock(EntityService.class); + AspectDao mockAspectDao = mock(AspectDao.class); + + // Create test data + EbeanAspectV2 aspect1 = createMockEbeanAspect("urn:li:dataJob:job1", "dataJobInputOutput"); + EbeanAspectV2 aspect2 = createMockEbeanAspect("urn:li:dataJob:job2", "dataJobInputOutput"); + EbeanAspectV2 aspect3 = createMockEbeanAspect("urn:li:dataJob:job3", "dataJobInputOutput"); + List initialBatch = Arrays.asList(aspect1, aspect2); + List resumeBatch = Arrays.asList(aspect3); + + // Mock the stream for first batch + PartitionedStream initialStream = mock(PartitionedStream.class); + when(initialStream.partition(anyInt())).thenReturn(Stream.of(initialBatch.stream())); + + // Mock the stream for second batch + PartitionedStream resumeStream = mock(PartitionedStream.class); + when(resumeStream.partition(anyInt())).thenReturn(Stream.of(resumeBatch.stream())); + + // Setup the AspectDao using Answer to handle null safely + when(mockAspectDao.streamAspectBatches(any(RestoreIndicesArgs.class))) + .thenAnswer( + invocation -> { + RestoreIndicesArgs args = invocation.getArgument(0); + if (args.lastUrn() == null) { + return initialStream; + } else if ("urn:li:dataJob:job2".equals(args.lastUrn())) { + return resumeStream; + } + return mock(PartitionedStream.class); + }); + + // Mock successful MCL production + when(mockService.alwaysProduceMCLAsync( + any(), any(), any(), any(), any(), any(), any(), any(), any(), any(), any())) + .thenReturn(Pair.of(CompletableFuture.completedFuture(null), true)); + + // Create the upgrade + ReindexDataJobViaNodesCLL cllUpgrade = + new ReindexDataJobViaNodesCLL(opContext, mockService, mockAspectDao, true, 2, 0, 0); + + // Initial Run + cllUpgrade.steps().get(0).executable().apply(createMockInitialUpgrade()); + + // 
Resumed + cllUpgrade.steps().get(0).executable().apply(createMockResumeUpgrade()); + + // Use ArgumentCaptor to verify the calls + ArgumentCaptor argsCaptor = + ArgumentCaptor.forClass(RestoreIndicesArgs.class); + verify(mockAspectDao, times(2)).streamAspectBatches(argsCaptor.capture()); + + List capturedArgs = argsCaptor.getAllValues(); + + // Verify both the initial and resume calls were made with correct arguments + assertEquals(capturedArgs.get(0).lastUrn(), null); + assertEquals(capturedArgs.get(0).urnBasedPagination(), false); + assertEquals(capturedArgs.get(1).lastUrn(), "urn:li:dataJob:job2"); + assertEquals(capturedArgs.get(1).urnBasedPagination(), true); + + // Verify MCL production was called for each aspect + verify(mockService, times(3)) + .alwaysProduceMCLAsync( + any(), any(), any(), any(), any(), any(), any(), any(), any(), any(), any()); + } + @Test public void testNonBlockingBootstrapMCP() { List mcpTemplate = @@ -123,4 +218,54 @@ public void testNonBlockingBootstrapMCP() { .map(update -> update.getMcpTemplate().getName()) .collect(Collectors.toSet()))); } + + private UpgradeContext createMockInitialUpgrade() { + // Mock the Upgrade instance + Upgrade mockUpgrade = mock(Upgrade.class); + + // Configure the mock upgrade to return no previous result + when(mockUpgrade.getUpgradeResult(any(), any(), any())).thenReturn(Optional.empty()); + + UpgradeContext mockInitialContext = mock(UpgradeContext.class); + when(mockInitialContext.opContext()).thenReturn(opContext); + when(mockInitialContext.upgrade()).thenReturn(mockUpgrade); + when(mockInitialContext.report()).thenReturn(mock(UpgradeReport.class)); + + return mockInitialContext; + } + + private UpgradeContext createMockResumeUpgrade() { + // Mock the Upgrade instance + Upgrade mockUpgrade = mock(Upgrade.class); + DataHubUpgradeResult mockPrevResult = mock(DataHubUpgradeResult.class); + + // Configure the mock previous result + when(mockPrevResult.getState()).thenReturn(DataHubUpgradeState.IN_PROGRESS); + when(mockPrevResult.getResult()) + .thenReturn(new StringMap(Map.of("lastUrn", "urn:li:dataJob:job2"))); + + // Configure the mock upgrade to return our previous result + when(mockUpgrade.getUpgradeResult(any(), any(), any())).thenReturn(Optional.of(mockPrevResult)); + + UpgradeContext mockResumeContext = mock(UpgradeContext.class); + when(mockResumeContext.opContext()).thenReturn(opContext); + when(mockResumeContext.upgrade()).thenReturn(mockUpgrade); + when(mockResumeContext.report()).thenReturn(mock(UpgradeReport.class)); + + return mockResumeContext; + } + + private static EbeanAspectV2 createMockEbeanAspect(String urn, String aspectName) { + Timestamp now = new Timestamp(System.currentTimeMillis()); + return new EbeanAspectV2( + urn, + aspectName, + 0L, + "{}", // metadata + now, // createdOn + "urn:li:corpuser:testUser", // createdBy + null, // createdFor + null // systemMetadata + ); + } } diff --git a/datahub-upgrade/src/test/java/com/linkedin/datahub/upgrade/UpgradeCliApplicationTestConfiguration.java b/datahub-upgrade/src/test/java/com/linkedin/datahub/upgrade/UpgradeCliApplicationTestConfiguration.java index 81d883d8ce36b7..5b7b8756f11fb1 100644 --- a/datahub-upgrade/src/test/java/com/linkedin/datahub/upgrade/UpgradeCliApplicationTestConfiguration.java +++ b/datahub-upgrade/src/test/java/com/linkedin/datahub/upgrade/UpgradeCliApplicationTestConfiguration.java @@ -19,17 +19,17 @@ @Import(value = {SystemAuthenticationFactory.class}) public class UpgradeCliApplicationTestConfiguration { - @MockBean private UpgradeCli 
upgradeCli; + @MockBean public UpgradeCli upgradeCli; - @MockBean private Database ebeanServer; + @MockBean public Database ebeanServer; - @MockBean private SearchService searchService; + @MockBean public SearchService searchService; - @MockBean private GraphService graphService; + @MockBean public GraphService graphService; - @MockBean private EntityRegistry entityRegistry; + @MockBean public EntityRegistry entityRegistry; - @MockBean ConfigEntityRegistry configEntityRegistry; + @MockBean public ConfigEntityRegistry configEntityRegistry; @MockBean public EntityIndexBuilders entityIndexBuilders; From 44affd7f8211cb902112156660666b05b5f4dbe6 Mon Sep 17 00:00:00 2001 From: Chris Collins Date: Tue, 19 Nov 2024 17:24:17 -0500 Subject: [PATCH 02/24] fix(ui) Fix merging siblings schema with mix of v1 & v2 fields (#11837) --- .../src/app/entity/shared/siblingUtils.ts | 33 ++++++++++++++----- 1 file changed, 24 insertions(+), 9 deletions(-) diff --git a/datahub-web-react/src/app/entity/shared/siblingUtils.ts b/datahub-web-react/src/app/entity/shared/siblingUtils.ts index 2f50dc99df191b..aa9e4bcb5e46e1 100644 --- a/datahub-web-react/src/app/entity/shared/siblingUtils.ts +++ b/datahub-web-react/src/app/entity/shared/siblingUtils.ts @@ -5,6 +5,7 @@ import * as QueryString from 'query-string'; import { Dataset, Entity, Maybe, SiblingProperties } from '../../../types.generated'; import { GenericEntityProperties } from './types'; import { useIsShowSeparateSiblingsEnabled } from '../../useAppConfig'; +import { downgradeV2FieldPath } from '../dataset/profile/schema/utils/utils'; export function stripSiblingsFromEntity(entity: any) { return { @@ -55,16 +56,30 @@ const combineMerge = (target, source, options) => { return destination; }; -function convertObjectKeysToLowercase(object: Record) { - return Object.fromEntries(Object.entries(object).map(([key, value]) => [key.toLowerCase(), value])); +// this function is responsible for normalizing object keys to make sure merging on key matches keys appropriately +function normalizeObjectKeys(object: Record, isSchemaField = false) { + return Object.fromEntries( + Object.entries(object).map(([key, value]) => { + let normalizedKey = key.toLowerCase(); + if (isSchemaField) { + normalizedKey = downgradeV2FieldPath(normalizedKey) || normalizedKey; + } + return [normalizedKey, value]; + }), + ); } // use when you want to merge an array of objects by key in the object as opposed to by index of array -const mergeArrayOfObjectsByKey = (destinationArray: any[], sourceArray: any[], key: string) => { - const destination = convertObjectKeysToLowercase(keyBy(destinationArray, key)); - const source = convertObjectKeysToLowercase(keyBy(sourceArray, key)); +const mergeArrayOfObjectsByKey = (destinationArray: any[], sourceArray: any[], key: string, isSchemaField = false) => { + const destination = normalizeObjectKeys(keyBy(destinationArray, key), isSchemaField); + const source = normalizeObjectKeys(keyBy(sourceArray, key), isSchemaField); - return values(merge(destination, source)); + return values( + merge(destination, source, { + arrayMerge: combineMerge, + customMerge, + }), + ); }; const mergeTags = (destinationArray, sourceArray, _options) => { @@ -88,7 +103,7 @@ const mergeOwners = (destinationArray, sourceArray, _options) => { }; const mergeFields = (destinationArray, sourceArray, _options) => { - return mergeArrayOfObjectsByKey(destinationArray, sourceArray, 'fieldPath'); + return mergeArrayOfObjectsByKey(destinationArray, sourceArray, 'fieldPath', true); }; function 
getArrayMergeFunction(key) { @@ -112,7 +127,7 @@ function getArrayMergeFunction(key) { } } -const customMerge = (isPrimary, key) => { +function customMerge(isPrimary, key) { if (key === 'upstream' || key === 'downstream') { return (_secondary, primary) => primary; } @@ -145,7 +160,7 @@ const customMerge = (isPrimary, key) => { customMerge: customMerge.bind({}, isPrimary), }); }; -}; +} export const getEntitySiblingData = (baseEntity: T): Maybe => { if (!baseEntity) { From 85c8e605be045deb59f7548380b550d12e70c900 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Tue, 19 Nov 2024 15:06:16 -0800 Subject: [PATCH 03/24] fix(ingest): consider sql parsing fallback as failure (#11896) --- metadata-ingestion/src/datahub/cli/check_cli.py | 4 +++- .../src/datahub/sql_parsing/sqlglot_lineage.py | 9 +++++++++ .../goldens/test_sqlite_attach_database.json | 12 ++++++++++++ .../tests/unit/sql_parsing/test_sqlglot_lineage.py | 11 +++++++++++ 4 files changed, 35 insertions(+), 1 deletion(-) create mode 100644 metadata-ingestion/tests/unit/sql_parsing/goldens/test_sqlite_attach_database.json diff --git a/metadata-ingestion/src/datahub/cli/check_cli.py b/metadata-ingestion/src/datahub/cli/check_cli.py index 39ed1b2bfea087..fbe07b64f0e154 100644 --- a/metadata-ingestion/src/datahub/cli/check_cli.py +++ b/metadata-ingestion/src/datahub/cli/check_cli.py @@ -268,7 +268,9 @@ def sql_lineage( ) logger.debug("Sql parsing debug info: %s", lineage.debug_info) - if lineage.debug_info.error: + if lineage.debug_info.table_error: + raise lineage.debug_info.table_error + elif lineage.debug_info.error: logger.debug("Sql parsing error details", exc_info=lineage.debug_info.error) click.echo(lineage.json(indent=4)) diff --git a/metadata-ingestion/src/datahub/sql_parsing/sqlglot_lineage.py b/metadata-ingestion/src/datahub/sql_parsing/sqlglot_lineage.py index b635f8cb47b6d2..506bd1d8c6be40 100644 --- a/metadata-ingestion/src/datahub/sql_parsing/sqlglot_lineage.py +++ b/metadata-ingestion/src/datahub/sql_parsing/sqlglot_lineage.py @@ -904,6 +904,15 @@ def _sqlglot_lineage_inner( logger.debug("Parsing lineage from sql statement: %s", sql) statement = parse_statement(sql, dialect=dialect) + if isinstance(statement, sqlglot.exp.Command): + # For unsupported syntax, sqlglot will usually fallback to parsing as a Command. + # This is effectively a parsing error, and we won't get any lineage from it. 
+ # See https://github.com/tobymao/sqlglot/commit/3a13fdf4e597a2f0a3f9fc126a129183fe98262f + # and https://github.com/tobymao/sqlglot/pull/2874 + raise UnsupportedStatementTypeError( + f"Got unsupported syntax for statement: {sql}" + ) + original_statement, statement = statement, statement.copy() # logger.debug( # "Formatted sql statement: %s", diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_sqlite_attach_database.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_sqlite_attach_database.json new file mode 100644 index 00000000000000..bcf31f6be803a2 --- /dev/null +++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_sqlite_attach_database.json @@ -0,0 +1,12 @@ +{ + "query_type": "UNKNOWN", + "query_type_props": {}, + "query_fingerprint": null, + "in_tables": [], + "out_tables": [], + "column_lineage": null, + "debug_info": { + "confidence": 0.0, + "generalized_statement": null + } +} \ No newline at end of file diff --git a/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_lineage.py b/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_lineage.py index 90cc863d6bd231..170341230205f3 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_lineage.py +++ b/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_lineage.py @@ -1268,3 +1268,14 @@ def test_bigquery_subquery_column_inference() -> None: dialect="bigquery", expected_file=RESOURCE_DIR / "test_bigquery_subquery_column_inference.json", ) + + +def test_sqlite_attach_database() -> None: + assert_sql_result( + """\ +ATTACH DATABASE ':memory:' AS aux1 +""", + dialect="sqlite", + expected_file=RESOURCE_DIR / "test_sqlite_attach_database.json", + allow_table_error=True, + ) From 1f396e87c1c48ad7b8a9996dc94c227ffd53e876 Mon Sep 17 00:00:00 2001 From: Tamas Nemeth Date: Wed, 20 Nov 2024 00:13:24 +0100 Subject: [PATCH 04/24] feat(spark): OpenLineage 1.24.2 upgrade (#11830) --- build.gradle | 2 +- entity-registry/build.gradle | 2 ++ .../java/acryl-spark-lineage/README.md | 1 + .../java/acryl-spark-lineage/build.gradle | 35 ++++++++++--------- .../datahub/spark/DatahubSparkListener.java | 8 +++-- .../datahub/spark/conf/SparkConfigParser.java | 2 ++ .../spark/agent/util/PlanUtils.java | 8 ++--- .../spark/agent/util/RddPathUtils.java | 18 ++++++---- .../java/datahub-client/build.gradle | 28 ++++++++++----- .../rest/DatahubHttpRequestRetryStrategy.java | 1 - .../java/datahub/client/rest/RestEmitter.java | 33 +++++++++++------ .../client/rest/RestEmitterConfig.java | 2 ++ .../java/openlineage-converter/build.gradle | 2 +- 13 files changed, 89 insertions(+), 53 deletions(-) diff --git a/build.gradle b/build.gradle index 6e6dadb7ebfa34..9ee756d41e11ef 100644 --- a/build.gradle +++ b/build.gradle @@ -56,7 +56,7 @@ buildscript { ext.hazelcastVersion = '5.3.6' ext.ebeanVersion = '15.5.2' ext.googleJavaFormatVersion = '1.18.1' - ext.openLineageVersion = '1.19.0' + ext.openLineageVersion = '1.24.2' ext.logbackClassicJava8 = '1.2.12' ext.docker_registry = 'acryldata' diff --git a/entity-registry/build.gradle b/entity-registry/build.gradle index 2dedea1f16d99c..22e5b601d39db2 100644 --- a/entity-registry/build.gradle +++ b/entity-registry/build.gradle @@ -25,6 +25,8 @@ dependencies { because("previous versions are vulnerable to CVE-2022-25857") } } + api project(path: ':li-utils') + api project(path: ':li-utils', configuration: "dataTemplate") dataModel project(':li-utils') annotationProcessor externalDependency.lombok diff --git a/metadata-integration/java/acryl-spark-lineage/README.md 
b/metadata-integration/java/acryl-spark-lineage/README.md index bd0a58b635b483..267e979b0fa073 100644 --- a/metadata-integration/java/acryl-spark-lineage/README.md +++ b/metadata-integration/java/acryl-spark-lineage/README.md @@ -165,6 +165,7 @@ information like tokens. | spark.datahub.rest.server | | http://localhost:8080 | Datahub server url eg: | | spark.datahub.rest.token | | | Authentication token. | | spark.datahub.rest.disable_ssl_verification | | false | Disable SSL certificate validation. Caution: Only use this if you know what you are doing! | +| spark.datahub.rest.disable_chunked_encoding | | false | Disable Chunked Transfer Encoding. In some environment chunked encoding causes issues. With this config option it can be disabled. || | spark.datahub.rest.max_retries | | 0 | Number of times a request retried if failed | | spark.datahub.rest.retry_interval | | 10 | Number of seconds to wait between retries | | spark.datahub.file.filename | | | The file where metadata will be written if file emitter is set | diff --git a/metadata-integration/java/acryl-spark-lineage/build.gradle b/metadata-integration/java/acryl-spark-lineage/build.gradle index 6620c34021ac4a..3f83e5657bbf4d 100644 --- a/metadata-integration/java/acryl-spark-lineage/build.gradle +++ b/metadata-integration/java/acryl-spark-lineage/build.gradle @@ -1,7 +1,7 @@ plugins { id("com.palantir.git-version") apply false } -apply plugin: 'java' +apply plugin: 'java-library' apply plugin: 'com.github.johnrengelman.shadow' apply plugin: 'signing' apply plugin: 'io.codearte.nexus-staging' @@ -51,8 +51,8 @@ dependencies { implementation project(':metadata-integration:java:openlineage-converter') - implementation project(path: ':metadata-integration:java:datahub-client', configuration: 'shadow') - implementation project(path: ':metadata-integration:java:openlineage-converter', configuration: 'shadow') + implementation project(path: ':metadata-integration:java:datahub-client') + implementation project(path: ':metadata-integration:java:openlineage-converter') //implementation "io.acryl:datahub-client:0.10.2" implementation "io.openlineage:openlineage-spark_2.12:$openLineageVersion" @@ -91,6 +91,8 @@ shadowJar { zip64 = true archiveClassifier = '' mergeServiceFiles() + project.configurations.implementation.canBeResolved = true + configurations = [project.configurations.implementation] def exclude_modules = project .configurations @@ -106,6 +108,8 @@ shadowJar { exclude(dependency { exclude_modules.contains(it.name) }) + exclude(dependency("org.slf4j::")) + exclude("org/apache/commons/logging/**") } // preventing java multi-release JAR leakage @@ -123,39 +127,36 @@ shadowJar { relocate 'com.sun.activation', 'io.acryl.shaded.com.sun.activation' relocate 'com.sun.codemodel', 'io.acryl.shaded.com.sun.codemodel' relocate 'com.sun.mail', 'io.acryl.shaded.com.sun.mail' - relocate 'com.fasterxml.jackson', 'datahub.spark2.shaded.jackson' - relocate 'org.slf4j', 'datahub.spark2.shaded.org.slf4j' // relocate 'org.apache.hc', 'io.acryl.shaded.http' - relocate 'org.apache.commons.codec', 'datahub.spark2.shaded.o.a.c.codec' - relocate 'org.apache.commons.compress', 'datahub.spark2.shaded.o.a.c.compress' - relocate 'org.apache.commons.lang3', 'datahub.spark2.shaded.o.a.c.lang3' + relocate 'org.apache.commons.codec', 'io.acryl.shaded.org.apache.commons.codec' + relocate 'org.apache.commons.compress', 'io.acryl.shaded.org.apache.commons.compress' + relocate 'org.apache.commons.lang3', 'io.acryl.shaded.org.apache.commons.lang3' relocate 'mozilla', 
'datahub.spark2.shaded.mozilla' - relocate 'com.typesafe', 'datahub.spark2.shaded.typesafe' - relocate 'io.opentracing', 'datahub.spark2.shaded.io.opentracing' - relocate 'io.netty', 'datahub.spark2.shaded.io.netty' - relocate 'ch.randelshofer', 'datahub.spark2.shaded.ch.randelshofer' - relocate 'ch.qos', 'datahub.spark2.shaded.ch.qos' + relocate 'com.typesafe', 'io.acryl.shaded.com.typesafe' + relocate 'io.opentracing', 'io.acryl.shaded.io.opentracing' + relocate 'io.netty', 'io.acryl.shaded.io.netty' + relocate 'ch.randelshofer', 'io.acryl.shaded.ch.randelshofer' + relocate 'ch.qos', 'io.acryl.shaded.ch.qos' relocate 'org.springframework', 'io.acryl.shaded.org.springframework' relocate 'com.fasterxml.jackson', 'io.acryl.shaded.jackson' relocate 'org.yaml', 'io.acryl.shaded.org.yaml' // Required for shading snakeyaml relocate 'net.jcip.annotations', 'io.acryl.shaded.annotations' relocate 'javassist', 'io.acryl.shaded.javassist' relocate 'edu.umd.cs.findbugs', 'io.acryl.shaded.findbugs' - relocate 'org.antlr', 'io.acryl.shaded.org.antlr' - relocate 'antlr', 'io.acryl.shaded.antlr' + //relocate 'org.antlr', 'io.acryl.shaded.org.antlr' + //relocate 'antlr', 'io.acryl.shaded.antlr' relocate 'com.google.common', 'io.acryl.shaded.com.google.common' - relocate 'org.apache.commons', 'io.acryl.shaded.org.apache.commons' relocate 'org.reflections', 'io.acryl.shaded.org.reflections' relocate 'st4hidden', 'io.acryl.shaded.st4hidden' relocate 'org.stringtemplate', 'io.acryl.shaded.org.stringtemplate' relocate 'org.abego.treelayout', 'io.acryl.shaded.treelayout' - relocate 'org.slf4j', 'io.acryl.shaded.slf4j' relocate 'javax.annotation', 'io.acryl.shaded.javax.annotation' relocate 'com.github.benmanes.caffeine', 'io.acryl.shaded.com.github.benmanes.caffeine' relocate 'org.checkerframework', 'io.acryl.shaded.org.checkerframework' relocate 'com.google.errorprone', 'io.acryl.shaded.com.google.errorprone' relocate 'com.sun.jna', 'io.acryl.shaded.com.sun.jna' + } checkShadowJar { diff --git a/metadata-integration/java/acryl-spark-lineage/src/main/java/datahub/spark/DatahubSparkListener.java b/metadata-integration/java/acryl-spark-lineage/src/main/java/datahub/spark/DatahubSparkListener.java index ee0938edb50454..b594f6bae954fa 100644 --- a/metadata-integration/java/acryl-spark-lineage/src/main/java/datahub/spark/DatahubSparkListener.java +++ b/metadata-integration/java/acryl-spark-lineage/src/main/java/datahub/spark/DatahubSparkListener.java @@ -120,7 +120,9 @@ public Optional initializeEmitter(Config sparkConf) { boolean disableSslVerification = sparkConf.hasPath(SparkConfigParser.DISABLE_SSL_VERIFICATION_KEY) && sparkConf.getBoolean(SparkConfigParser.DISABLE_SSL_VERIFICATION_KEY); - + boolean disableChunkedEncoding = + sparkConf.hasPath(SparkConfigParser.REST_DISABLE_CHUNKED_ENCODING) + && sparkConf.getBoolean(SparkConfigParser.REST_DISABLE_CHUNKED_ENCODING); int retry_interval_in_sec = sparkConf.hasPath(SparkConfigParser.RETRY_INTERVAL_IN_SEC) ? 
sparkConf.getInt(SparkConfigParser.RETRY_INTERVAL_IN_SEC) @@ -150,6 +152,7 @@ public Optional initializeEmitter(Config sparkConf) { .disableSslVerification(disableSslVerification) .maxRetries(max_retries) .retryIntervalSec(retry_interval_in_sec) + .disableChunkedEncoding(disableChunkedEncoding) .build(); return Optional.of(new RestDatahubEmitterConfig(restEmitterConf)); case "kafka": @@ -374,7 +377,8 @@ private static void initializeMetrics(OpenLineageConfig openLineageConfig) { String disabledFacets; if (openLineageConfig.getFacetsConfig() != null && openLineageConfig.getFacetsConfig().getDisabledFacets() != null) { - disabledFacets = String.join(";", openLineageConfig.getFacetsConfig().getDisabledFacets()); + disabledFacets = + String.join(";", openLineageConfig.getFacetsConfig().getEffectiveDisabledFacets()); } else { disabledFacets = ""; } diff --git a/metadata-integration/java/acryl-spark-lineage/src/main/java/datahub/spark/conf/SparkConfigParser.java b/metadata-integration/java/acryl-spark-lineage/src/main/java/datahub/spark/conf/SparkConfigParser.java index 45ec5365d09b36..3860285083c4bb 100644 --- a/metadata-integration/java/acryl-spark-lineage/src/main/java/datahub/spark/conf/SparkConfigParser.java +++ b/metadata-integration/java/acryl-spark-lineage/src/main/java/datahub/spark/conf/SparkConfigParser.java @@ -30,6 +30,8 @@ public class SparkConfigParser { public static final String GMS_AUTH_TOKEN = "rest.token"; public static final String FILE_EMITTER_FILE_NAME = "file.filename"; public static final String DISABLE_SSL_VERIFICATION_KEY = "rest.disable_ssl_verification"; + public static final String REST_DISABLE_CHUNKED_ENCODING = "rest.disable_chunked_encoding"; + public static final String MAX_RETRIES = "rest.max_retries"; public static final String RETRY_INTERVAL_IN_SEC = "rest.retry_interval_in_sec"; public static final String KAFKA_MCP_TOPIC = "kafka.mcp_topic"; diff --git a/metadata-integration/java/acryl-spark-lineage/src/main/java/io/openlineage/spark/agent/util/PlanUtils.java b/metadata-integration/java/acryl-spark-lineage/src/main/java/io/openlineage/spark/agent/util/PlanUtils.java index d46d741d155b8b..5f87df2a65d6c2 100644 --- a/metadata-integration/java/acryl-spark-lineage/src/main/java/io/openlineage/spark/agent/util/PlanUtils.java +++ b/metadata-integration/java/acryl-spark-lineage/src/main/java/io/openlineage/spark/agent/util/PlanUtils.java @@ -5,14 +5,13 @@ package io.openlineage.spark.agent.util; -import static io.openlineage.spark.agent.lifecycle.ExecutionContext.CAMEL_TO_SNAKE_CASE; - import com.typesafe.config.Config; import com.typesafe.config.ConfigFactory; import datahub.spark.conf.SparkLineageConf; import io.datahubproject.openlineage.dataset.HdfsPathDataset; import io.openlineage.client.OpenLineage; import io.openlineage.spark.agent.Versions; +import io.openlineage.spark.api.naming.NameNormalizer; import java.io.IOException; import java.net.URI; import java.net.URISyntaxException; @@ -21,7 +20,6 @@ import java.util.Collection; import java.util.Collections; import java.util.List; -import java.util.Locale; import java.util.Objects; import java.util.Optional; import java.util.UUID; @@ -186,7 +184,7 @@ public static OpenLineage.ParentRunFacet parentRunFacet( .run(new OpenLineage.ParentRunFacetRunBuilder().runId(parentRunId).build()) .job( new OpenLineage.ParentRunFacetJobBuilder() - .name(parentJob.replaceAll(CAMEL_TO_SNAKE_CASE, "_$1").toLowerCase(Locale.ROOT)) + .name(NameNormalizer.normalize(parentJob)) .namespace(parentJobNamespace) .build()) .build(); @@ 
-287,8 +285,6 @@ public static boolean safeIsDefinedAt(PartialFunction pfn, Object x) { * @param pfn * @param x * @return - * @param - * @param */ public static List safeApply(PartialFunction> pfn, D x) { try { diff --git a/metadata-integration/java/acryl-spark-lineage/src/main/java/io/openlineage/spark/agent/util/RddPathUtils.java b/metadata-integration/java/acryl-spark-lineage/src/main/java/io/openlineage/spark/agent/util/RddPathUtils.java index 62005bf15f8505..6ef7403362a909 100644 --- a/metadata-integration/java/acryl-spark-lineage/src/main/java/io/openlineage/spark/agent/util/RddPathUtils.java +++ b/metadata-integration/java/acryl-spark-lineage/src/main/java/io/openlineage/spark/agent/util/RddPathUtils.java @@ -7,6 +7,7 @@ import java.util.Arrays; import java.util.Objects; +import java.util.concurrent.atomic.AtomicBoolean; import java.util.stream.Stream; import lombok.extern.slf4j.Slf4j; import org.apache.commons.lang3.reflect.FieldUtils; @@ -18,6 +19,7 @@ import org.apache.spark.rdd.MapPartitionsRDD; import org.apache.spark.rdd.ParallelCollectionRDD; import org.apache.spark.rdd.RDD; +import org.apache.spark.sql.execution.datasources.FilePartition; import org.apache.spark.sql.execution.datasources.FileScanRDD; import scala.Tuple2; import scala.collection.immutable.Seq; @@ -90,7 +92,7 @@ public boolean isDefinedAt(Object rdd) { @SuppressWarnings("PMD.AvoidLiteralsInIfCondition") public Stream extract(FileScanRDD rdd) { return ScalaConversionUtils.fromSeq(rdd.filePartitions()).stream() - .flatMap(fp -> Arrays.stream(fp.files())) + .flatMap((FilePartition fp) -> Arrays.stream(fp.files())) .map( f -> { if ("3.4".compareTo(package$.MODULE$.SPARK_VERSION()) <= 0) { @@ -115,11 +117,15 @@ public boolean isDefinedAt(Object rdd) { @Override public Stream extract(ParallelCollectionRDD rdd) { + int SEQ_LIMIT = 1000; + AtomicBoolean loggingDone = new AtomicBoolean(false); try { Object data = FieldUtils.readField(rdd, "data", true); log.debug("ParallelCollectionRDD data: {}", data); - if (data instanceof Seq) { - return ScalaConversionUtils.fromSeq((Seq) data).stream() + if ((data instanceof Seq) && ((Seq) data).head() instanceof Tuple2) { + // exit if the first element is invalid + Seq data_slice = (Seq) ((Seq) data).slice(0, SEQ_LIMIT); + return ScalaConversionUtils.fromSeq(data_slice).stream() .map( el -> { Path path = null; @@ -127,9 +133,9 @@ public Stream extract(ParallelCollectionRDD rdd) { // we're able to extract path path = parentOf(((Tuple2) el)._1.toString()); log.debug("Found input {}", path); - } else { - // Change to debug to silence error - log.debug("unable to extract Path from {}", el.getClass().getCanonicalName()); + } else if (!loggingDone.get()) { + log.warn("unable to extract Path from {}", el.getClass().getCanonicalName()); + loggingDone.set(true); } return path; }) diff --git a/metadata-integration/java/datahub-client/build.gradle b/metadata-integration/java/datahub-client/build.gradle index d9087347e1b5c6..1bdc848d0385b1 100644 --- a/metadata-integration/java/datahub-client/build.gradle +++ b/metadata-integration/java/datahub-client/build.gradle @@ -1,6 +1,6 @@ plugins { id("com.palantir.git-version") apply false - id 'java' + id 'java-library' id 'com.github.johnrengelman.shadow' id 'jacoco' id 'signing' @@ -12,11 +12,13 @@ apply from: "../versioning.gradle" import org.apache.tools.ant.filters.ReplaceTokens -jar.enabled = false // Since we only want to build shadow jars, disabling the regular jar creation +jar { + archiveClassifier = "lib" +} dependencies { - 
implementation project(':entity-registry') - implementation project(':metadata-integration:java:datahub-event') + api project(':entity-registry') + api project(':metadata-integration:java:datahub-event') implementation(externalDependency.kafkaAvroSerializer) { exclude group: "org.apache.avro" } @@ -33,7 +35,7 @@ dependencies { implementation externalDependency.jacksonDataBind runtimeOnly externalDependency.jna - implementation externalDependency.slf4jApi + api externalDependency.slf4jApi compileOnly externalDependency.lombok annotationProcessor externalDependency.lombok // VisibleForTesting @@ -78,6 +80,11 @@ shadowJar { // https://github.com/johnrengelman/shadow/issues/729 exclude('module-info.class', 'META-INF/versions/**', '**/LICENSE', '**/LICENSE*.txt', '**/NOTICE', '**/NOTICE.txt', 'licenses/**', 'log4j2.*', 'log4j.*') + dependencies { + exclude(dependency("org.slf4j::")) + exclude(dependency("antlr::")) + exclude("org/apache/commons/logging/**") + } mergeServiceFiles() // we relocate namespaces manually, because we want to know exactly which libs we are exposing and why // we can move to automatic relocation using ConfigureShadowRelocation after we get to a good place on these first @@ -88,15 +95,20 @@ shadowJar { relocate 'javassist', 'datahub.shaded.javassist' relocate 'edu.umd.cs.findbugs', 'datahub.shaded.findbugs' relocate 'org.antlr', 'datahub.shaded.org.antlr' - relocate 'antlr', 'datahub.shaded.antlr' + //relocate 'antlr', 'datahub.shaded.antlr' relocate 'com.google.common', 'datahub.shaded.com.google.common' - relocate 'org.apache.commons', 'datahub.shaded.org.apache.commons' + relocate 'org.apache.commons.codec', 'datahub.shaded.org.apache.commons.codec' + relocate 'org.apache.commons.compress', 'datahub.shaded.org.apache.commons.compress' + relocate 'org.apache.commons.lang3', 'datahub.shaded.org.apache.commons.lang3' + relocate 'org.apache.commons.lang', 'datahub.shaded.org.apache.commons.lang' + relocate 'org.apache.commons.cli', 'datahub.shaded.org.apache.commons.cli' + relocate 'org.apache.commons.text', 'datahub.shaded.org.apache.commons.text' + relocate 'org.apache.commons.io', 'datahub.shaded.org.apache.commons.io' relocate 'org.apache.maven', 'datahub.shaded.org.apache.maven' relocate 'org.reflections', 'datahub.shaded.org.reflections' relocate 'st4hidden', 'datahub.shaded.st4hidden' relocate 'org.stringtemplate', 'datahub.shaded.org.stringtemplate' relocate 'org.abego.treelayout', 'datahub.shaded.treelayout' - relocate 'org.slf4j', 'datahub.shaded.slf4j' relocate 'javax.annotation', 'datahub.shaded.javax.annotation' relocate 'com.github.benmanes.caffeine', 'datahub.shaded.com.github.benmanes.caffeine' relocate 'org.checkerframework', 'datahub.shaded.org.checkerframework' diff --git a/metadata-integration/java/datahub-client/src/main/java/datahub/client/rest/DatahubHttpRequestRetryStrategy.java b/metadata-integration/java/datahub-client/src/main/java/datahub/client/rest/DatahubHttpRequestRetryStrategy.java index 71a4b93baf48f4..50c0277c98b03b 100644 --- a/metadata-integration/java/datahub-client/src/main/java/datahub/client/rest/DatahubHttpRequestRetryStrategy.java +++ b/metadata-integration/java/datahub-client/src/main/java/datahub/client/rest/DatahubHttpRequestRetryStrategy.java @@ -48,7 +48,6 @@ public boolean retryRequest( @Override public boolean retryRequest(HttpResponse response, int execCount, HttpContext context) { - log.warn("Retrying request due to error: {}", response); return super.retryRequest(response, execCount, context); } } diff --git 
a/metadata-integration/java/datahub-client/src/main/java/datahub/client/rest/RestEmitter.java b/metadata-integration/java/datahub-client/src/main/java/datahub/client/rest/RestEmitter.java index e1017372be124b..d70c5baf10879d 100644 --- a/metadata-integration/java/datahub-client/src/main/java/datahub/client/rest/RestEmitter.java +++ b/metadata-integration/java/datahub-client/src/main/java/datahub/client/rest/RestEmitter.java @@ -1,6 +1,7 @@ package datahub.client.rest; import static com.linkedin.metadata.Constants.*; +import static org.apache.hc.core5.http.HttpHeaders.*; import com.fasterxml.jackson.annotation.JsonInclude; import com.fasterxml.jackson.core.StreamReadConstraints; @@ -18,6 +19,7 @@ import datahub.event.UpsertAspectRequest; import java.io.ByteArrayOutputStream; import java.io.IOException; +import java.nio.charset.StandardCharsets; import java.security.KeyManagementException; import java.security.KeyStoreException; import java.security.NoSuchAlgorithmException; @@ -26,6 +28,7 @@ import java.util.concurrent.CountDownLatch; import java.util.concurrent.ExecutionException; import java.util.concurrent.Future; +import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicReference; import java.util.function.Consumer; import javax.annotation.concurrent.ThreadSafe; @@ -97,17 +100,20 @@ public RestEmitter(RestEmitterConfig config) { this.config = config; HttpAsyncClientBuilder httpClientBuilder = this.config.getAsyncHttpClientBuilder(); httpClientBuilder.setRetryStrategy(new DatahubHttpRequestRetryStrategy()); - - // Override httpClient settings with RestEmitter configs if present - if (config.getTimeoutSec() != null) { - httpClientBuilder.setDefaultRequestConfig( - RequestConfig.custom() - .setConnectionRequestTimeout( - config.getTimeoutSec() * 1000, java.util.concurrent.TimeUnit.MILLISECONDS) - .setResponseTimeout( - config.getTimeoutSec() * 1000, java.util.concurrent.TimeUnit.MILLISECONDS) - .build()); + if ((config.getTimeoutSec() != null) || (config.isDisableChunkedEncoding())) { + RequestConfig.Builder requestConfigBuilder = RequestConfig.custom(); + // Override httpClient settings with RestEmitter configs if present + if (config.getTimeoutSec() != null) { + requestConfigBuilder + .setConnectionRequestTimeout(config.getTimeoutSec() * 1000, TimeUnit.MILLISECONDS) + .setResponseTimeout(config.getTimeoutSec() * 1000, TimeUnit.MILLISECONDS); + } + if (config.isDisableChunkedEncoding()) { + requestConfigBuilder.setContentCompressionEnabled(false); + } + httpClientBuilder.setDefaultRequestConfig(requestConfigBuilder.build()); } + PoolingAsyncClientConnectionManagerBuilder poolingAsyncClientConnectionManagerBuilder = PoolingAsyncClientConnectionManagerBuilder.create(); @@ -223,8 +229,13 @@ private Future postGeneric( if (this.config.getToken() != null) { simpleRequestBuilder.setHeader("Authorization", "Bearer " + this.config.getToken()); } + if (this.config.isDisableChunkedEncoding()) { + byte[] payloadBytes = payloadJson.getBytes(StandardCharsets.UTF_8); + simpleRequestBuilder.setBody(payloadBytes, ContentType.APPLICATION_JSON); + } else { + simpleRequestBuilder.setBody(payloadJson, ContentType.APPLICATION_JSON); + } - simpleRequestBuilder.setBody(payloadJson, ContentType.APPLICATION_JSON); AtomicReference responseAtomicReference = new AtomicReference<>(); CountDownLatch responseLatch = new CountDownLatch(1); FutureCallback httpCallback = diff --git a/metadata-integration/java/datahub-client/src/main/java/datahub/client/rest/RestEmitterConfig.java 
b/metadata-integration/java/datahub-client/src/main/java/datahub/client/rest/RestEmitterConfig.java index e28ad4ed660f0b..55c11aab0ebf3c 100644 --- a/metadata-integration/java/datahub-client/src/main/java/datahub/client/rest/RestEmitterConfig.java +++ b/metadata-integration/java/datahub-client/src/main/java/datahub/client/rest/RestEmitterConfig.java @@ -30,6 +30,8 @@ public class RestEmitterConfig { Integer timeoutSec; @Builder.Default boolean disableSslVerification = false; + @Builder.Default boolean disableChunkedEncoding = false; + @Builder.Default int maxRetries = 0; @Builder.Default int retryIntervalSec = 10; diff --git a/metadata-integration/java/openlineage-converter/build.gradle b/metadata-integration/java/openlineage-converter/build.gradle index 2e04881ab5ccda..d149104f089b36 100644 --- a/metadata-integration/java/openlineage-converter/build.gradle +++ b/metadata-integration/java/openlineage-converter/build.gradle @@ -1,4 +1,4 @@ -apply plugin: 'java' +apply plugin: 'java-library' apply plugin: 'com.github.johnrengelman.shadow' apply plugin: 'signing' apply plugin: 'maven-publish' From 8638bf974a00cb18c837616ed69b794b90de720f Mon Sep 17 00:00:00 2001 From: david-leifker <114954101+david-leifker@users.noreply.github.com> Date: Tue, 19 Nov 2024 17:51:58 -0600 Subject: [PATCH 05/24] chore(cleanup): remove unused UrnUtils function (#11897) --- .../com/linkedin/common/urn/UrnUtils.java | 22 ------------------- 1 file changed, 22 deletions(-) diff --git a/li-utils/src/main/javaPegasus/com/linkedin/common/urn/UrnUtils.java b/li-utils/src/main/javaPegasus/com/linkedin/common/urn/UrnUtils.java index 89f0cd8fbc9791..0a2400badfc627 100644 --- a/li-utils/src/main/javaPegasus/com/linkedin/common/urn/UrnUtils.java +++ b/li-utils/src/main/javaPegasus/com/linkedin/common/urn/UrnUtils.java @@ -27,28 +27,6 @@ public static DatasetUrn toDatasetUrn( new DataPlatformUrn(platformName), datasetName, FabricType.valueOf(origin.toUpperCase())); } - /** - * Convert fabric String to FabricType - * - * @param fabric PROD, CORP, EI, DEV, LIT, PRIME - * @return FabricType - */ - @Nonnull - public static FabricType toFabricType(@Nonnull String fabric) { - switch (fabric.toUpperCase()) { - case "PROD": - return FabricType.PROD; - case "CORP": - return FabricType.CORP; - case "EI": - return FabricType.EI; - case "DEV": - return FabricType.DEV; - default: - throw new IllegalArgumentException("Unsupported Fabric Type: " + fabric); - } - } - public static Urn getUrn(String urnStr) { try { return Urn.createFromString(urnStr); From 524ef8c6d0a07961576a2e69b8c3d7e4313550a7 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Wed, 20 Nov 2024 00:29:05 -0800 Subject: [PATCH 06/24] perf(ingest/redshift): limit copy lineage (#11662) Co-authored-by: Mayuri Nehate <33225191+mayurinehate@users.noreply.github.com> --- .../ingestion/source/redshift/query.py | 47 ++++++++++++------- 1 file changed, 29 insertions(+), 18 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/redshift/query.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/query.py index b18b526ef30fce..71a20890d35e88 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/redshift/query.py +++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/query.py @@ -9,6 +9,8 @@ # We use 290 instead instead of the standard 320, because escape characters can add to the length. 
_QUERY_SEQUENCE_LIMIT = 290 +_MAX_COPY_ENTRIES_PER_TABLE = 20 + class RedshiftCommonQuery: CREATE_TEMP_TABLE_CLAUSE = "create temp table" @@ -293,28 +295,37 @@ def alter_table_rename_query( def list_copy_commands_sql( db_name: str, start_time: datetime, end_time: datetime ) -> str: - return """ - select - distinct - "schema" as target_schema, - "table" as target_table, - c.file_name as filename - from - SYS_QUERY_DETAIL as si - join SYS_LOAD_DETAIL as c on - si.query_id = c.query_id - join SVV_TABLE_INFO sti on - sti.table_id = si.table_id - where - database = '{db_name}' - and si.start_time >= '{start_time}' - and si.start_time < '{end_time}' - order by target_schema, target_table, si.start_time asc - """.format( + return """\ +SELECT DISTINCT + target_schema, + target_table, + filename +FROM ( + SELECT + sti."schema" AS target_schema, + sti."table" AS target_table, + c.file_name AS filename, + ROW_NUMBER() OVER ( + PARTITION BY sti."schema", sti."table" + ORDER BY si.start_time DESC + ) AS rn + FROM + SYS_QUERY_DETAIL AS si + JOIN SYS_LOAD_DETAIL AS c ON si.query_id = c.query_id + JOIN SVV_TABLE_INFO sti ON sti.table_id = si.table_id + WHERE + sti.database = '{db_name}' + AND si.start_time >= '{start_time}' + AND si.start_time < '{end_time}' +) subquery +WHERE rn <= {_MAX_COPY_ENTRIES_PER_TABLE} +ORDER BY target_schema, target_table, filename +""".format( # We need the original database name for filtering db_name=db_name, start_time=start_time.strftime(redshift_datetime_format), end_time=end_time.strftime(redshift_datetime_format), + _MAX_COPY_ENTRIES_PER_TABLE=_MAX_COPY_ENTRIES_PER_TABLE, ) @staticmethod From 05d362a94e140d508019d686ccf29b010db5e40d Mon Sep 17 00:00:00 2001 From: Aseem Bansal Date: Wed, 20 Nov 2024 20:47:39 +0530 Subject: [PATCH 07/24] fix(ingest): add error handling (#11905) --- .../src/datahub/ingestion/source/gc/dataprocess_cleanup.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/gc/dataprocess_cleanup.py b/metadata-ingestion/src/datahub/ingestion/source/gc/dataprocess_cleanup.py index 80f7b7a9f4480c..130f2c9c2e12fc 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/gc/dataprocess_cleanup.py +++ b/metadata-ingestion/src/datahub/ingestion/source/gc/dataprocess_cleanup.py @@ -401,7 +401,10 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: total_runs=job.get("entity").get("runs").get("total"), ) if datajob_entity.total_runs > 0: - self.delete_dpi_from_datajobs(datajob_entity) + try: + self.delete_dpi_from_datajobs(datajob_entity) + except Exception as e: + logger.error(f"While trying to delete {datajob_entity} got {e}") if ( datajob_entity.total_runs == 0 and self.config.delete_empty_data_jobs From 310f559c6d3b66cbef3620742544624539fc2ebd Mon Sep 17 00:00:00 2001 From: david-leifker <114954101+david-leifker@users.noreply.github.com> Date: Wed, 20 Nov 2024 10:27:57 -0600 Subject: [PATCH 08/24] chore(docs): Update restli-overview.md (#11908) --- docs/api/restli/restli-overview.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/api/restli/restli-overview.md b/docs/api/restli/restli-overview.md index 22b913d9a25df4..3e9ab00b522670 100644 --- a/docs/api/restli/restli-overview.md +++ b/docs/api/restli/restli-overview.md @@ -1156,7 +1156,7 @@ curl -X POST 'http://localhost:8080/entities?action=search' \ "and": [ { "field": "title", - "value": "Baz Chart 1", + "values": ["Baz Chart 1"], "condition": "EQUAL" } ] @@ -1261,7 +1261,7 @@ curl -X POST 
'http://localhost:8080/entities?action=autocomplete' \ "and": [ { "field": "tool", - "value": "looker", + "values": ["looker"], "condition": "EQUAL" } ] From 07ccdc5cc8754f0f4bec7d5769c9ed5d7406673c Mon Sep 17 00:00:00 2001 From: Hyejin Yoon <0327jane@gmail.com> Date: Thu, 21 Nov 2024 02:53:22 +0900 Subject: [PATCH 09/24] docs: add hudi to integrations (#11901) --- docs-website/filterTagIndexes.json | 11 +++++++++++ docs-website/static/img/logos/platforms/hudi.png | Bin 0 -> 7238 bytes 2 files changed, 11 insertions(+) create mode 100644 docs-website/static/img/logos/platforms/hudi.png diff --git a/docs-website/filterTagIndexes.json b/docs-website/filterTagIndexes.json index 2309593b2c3b9f..b269f23cccd667 100644 --- a/docs-website/filterTagIndexes.json +++ b/docs-website/filterTagIndexes.json @@ -231,6 +231,17 @@ "Features": "Stateful Ingestion, UI Ingestion, Status Aspect" } }, + { + "Path": "https://hudi.apache.org/docs/syncing_datahub/", + "imgPath": "img/logos/platforms/hudi.png", + "Title": "Apache Hudi", + "Description": "Apache Hudi is an open-source data lake framework that provides ACID transactions, efficient upserts, time travel queries, and incremental data processing for large-scale datasets.", + "tags": { + "Platform Type": "Datastore", + "Connection Type": "Pull", + "Features": "" + } + }, { "Path": "docs/generated/ingestion/sources/iceberg", "imgPath": "img/logos/platforms/iceberg.png", diff --git a/docs-website/static/img/logos/platforms/hudi.png b/docs-website/static/img/logos/platforms/hudi.png new file mode 100644 index 0000000000000000000000000000000000000000..c5e79bcc86ce3463bda6aa4e73c641d5875e5fb9 GIT binary patch literal 7238 zcmV-M9J%9(P)00CAA0{{R3sNA1j00012P)t-s05)R) zG+_WAK>!^-06bv;N?ic3>ql@*05MHq=+05xU+gSh}kcK`zf0Ilc%v+e*+e*iaU09=sw+qj+p000?uQchFR@;)RF2KNBi z<-cfArjjVfis5Gb{`{GG5sB-n000|>NklKSpp8F?3&zioMmFNCZF+Hn#Wzuv1B>k#FAtqp zJ>JHCo#*kg&_&T3+1Ri1NL~}Vfq(b?x|fGuHodv|dD_>6&WnDH`MJthgf2SS_ZGPO z0RKYJ*ZFrpU-zQWD`|nc^z*c@1pUz2fIf2fecG3So;AHS^L^SEfnHXR4KNyBi^x~IAkbQ1qAXPx^{El64XQPd+Z z_06D{1pdA0_)@VToBEH3z9sYnLD5!}%Wg&He?0XqV|r;H`dmOi@M{7wNImpXeIw{u z(MbE%nwS;(^_wML>Kj2919JP$y_-YH@0CK6d#P^%T_NuY>a~Ic3I#bz-Ph`Fp>u** zehKIYz0gOg{~0C(0SQal8Sjz zf?I!H>h7QuB44P`pM-vo>Yh3^be>fW8S>ed`;^dqt?mfAC@Bhaean4H=)P8W1Dz|+ zPKSJU4gLJOO)Tb}qq?U~2wkW$jS_iZWeeYYYAsLJzE-D&F1n^CwS@|OZYzzC>ZH(> zdnIN*-*O*px);>^^HH4+I;WQTVue1tXHHgL>QvB0S@hIaEio5!P2ay`RQH3i_Mw+l zW6q#8^tsgCFSZdv7bUrCUu?M_7Pry(#WoV?Wb0|LHT3Cb+%Kv7sBRTH&ngGZtHr&l zi=*|vEoU7%?_4lnYPlbDll4BTTZUd1^x<4rp-*+)4WqjFG>5=;%e*g^&l%Fy4sfaS zH1g1O%qthjmn(Fw&=DgvB1}^5p=%T-T`+GnCr@ab1mIdds#zX+=u5K5+~~Yn4(NKJ zU!|^X%PFBV_E}O#Lg$&2aIR^&A7GneVNzE@XCEeY9CTHxxnZse=n$dbrLGtHhe@67 zk-0XY19!heMrcN%GnPai`nJq96*^Gppj!PRbR~7+p-;9wT`|`MbOEgz^&NPA7W!vJ zbvSh4D2LNl=!jG5^!wFAcC`ELHx7kcT6xvu4Yz?~+) z9Mwhr038$6g`!<2Nqh`~HfWWd2KzdFeGPvMAFKZt1i{xaW{UmJWY5T%4~IBQivfLp zGHxcSFHFll3LRdn^EzR{74zx*!L*%U28BOP2J(AZ&4WQVKWU(;x)PHPRQki7ao0vH zNL}#y2Xv^^IcpbD&%6LfmZtld>i|JFJ=0lm1nnE80A0&0Saf}B* zH!Rj3uTunm1zG7Ya1dCpb8gE`Kz|ZCgRZMb$I}|yoI3!z(E_^saRUc14+I2sfxZ)H zb};ry=uoK(o}AWEZ%04R_Y2)5iIXaQZqg%~miq719VPWJSP#8zzm(jgJwi7L=qhDD2z;Gi0hn`>W8^2J`jpVY zQjgflinVdF1$G^Dd{;&mQ`wmQ%QTSdn9Q2NQzJCdl+b~-y5PAT=0o+q!Bi⋙2cA zCw~X6LYKF#tr7atbgXJDy_(H~0EfYqqVe6Z*sm&8*OYQjc<*(A)NyhYqnbo6fyjsJXiJ0e!{t z?r*8j3LPkQUbiQ^7T50DHEhwjBVGH_hQm^>tk72o{b>e%LFh24i!`UZc%LWhme zM0snUx=QFGu`P65Tc*%~n=b^GUBpAzea%45qtq9L4vXsCX3_S5-HF(lrJlQ%`MiIJ z1@yHSH9wv$7qVr8NIi^CB-ELM$UYAp)pOS~pDhI>D4^rZ$N9lnjnF|+UDz$!9tUjb zC-B+4(CspxJt77c(AQt?cRw+9Iiimpc>RxLzi7Kh_E2mC9d$u&0pd0-)urQpZ3_J! 
z)wM#0Nj*KDs4O6E$Fue2ngnWf(swi+?5##l#x1R@hfEJ0@YKVE4Em|fNwavix(_0& z49acdYZky+#L~k4nzPg+z0e_27d&t2ptq!1JX_uHg!<>n8W0-Lg^i;M&mDMx&_Pls zD%$S-Iqn!bBBD1L*bzPZ_XhaF2s;~W2C0V-p@T+fQg)&=!rG0kjOa~Qg1^^Or43p` zw@{-$M)f}+<`AhzNlOX6jqAuIbhyx=MfPO@r}_2CP!DEl_oU4K{rSh@*oxFu6%de0 z{S;HBb8MZ^s}l3getlhNKwq=G_Iqa=lsQ1^lttUMDN93r3OWbR%QPtFzr1t>&IYV) zlWzVd$79T3Qr9}yke#ThC!sUw&}S?=v#@PKad;1l`x`#U&-scWNp#M+A<2MdOyC(a+X zb_;ev550p{YX&W&tEUF8q1zVvf8T@D6?IrHr<1zKkAX@ORQw-Kn91|H9{yDSNQUEQ zf}pk4oDdbU$qbQBO z-`JEx1aw|n7W!4{RuXvqORmUAO7ls)j=SBt3T8O0P-SL6#c-woro88cMWKI<(7(p2 z4JdUh&ss@dDgM~*rCw)G{c%Z_#a$f<;&X+4p<|?O@o;D!g>_D@R|)=G5(Nc4J1$l7;aPP!^c?rL=b-?M>)*Yjq5FmD`0i zYfV>QIF-_2xpGySe@9fM*YyNKza;RnTd7~UM=3LF-UKz3(n0Q2c4E!%jfm)t{fLf~ zI>D6>JT)rf+M}%(kePI5)(9P!^^pku)|Rt0LW8%+3_D9npo6^oiinP5PLaC1uhpk> z`I-eLg`2Ps9aa@L&V+8c8TTM{Oaf0bmde8%oyoU>oi~sfc=y5$bL(CAN9bRO1U{sB zrK1#Gx3@vftwRTmE6B_ZbL(AqL#dNZpsV@~EA}?F@xw55NSD>?3mw;%BilwQR36`M z(1m9|LptOUzJt)Q2|QKiD#AQ)-;ZVJY}r*BM=NwptsYbE7?0|s^RUi-u6qK+<*Hum zq$Cvj9uoLls|Fd?S37uy4(XYA3!(2Jfxo(V5jT_#1VZj3H+sa%TfKg zcCCYF$SU1_Oj;cXuK5 zi`3b+wfe1Mk5Q|v+!A!q24y$i-HFimB=vzhreI%}!82TIX4Oji9+uaPqk2Mox|^bY zbmidLV;&~~-rWUsPBIxcoxpE2LNl#(cwApKQ1J*|Kc*s`gmjBS?-zx>*|gkDa}ML- z*UUt-$9|mt8Fns0y69Xn-)1m&p(Pt2ayelGI^a<^odf1XBd{+Zb-@wKFDo{#vxkhC za@-yBtJ|E6(?Lfi@Z?e-dUFm2;TbA)Cu z_q48lUtbTXrOq&&cL#Z#YoQ~j{Z@ho4Glw#0gm z>D3(2)mDIaynV-ul+oE1~aTYGxauk5WJG zD5?*~iUS>dEY?5=^k&Ex(>D_OP3rMpYW3kRR~YryB$c}-a&mj6Zzc5ICGcZA7Pzj? z*C<`;lXw26Lf_T2-1TA9usNDzxCZ*>W%>rqxIIf9crTF2@k0pc)Jo@Wm)EAEdb}s8 z1Bapw7JP5ev#onGn+kn5n^=G3av-V4DJ68`St)Mw%s4#suU$>coy)cuR;mkQ==w)a zyraqGWZY{+b>MBTc>j)o-jHrmwmC5KZDc!?I)jbClR|%Iq3=-Y2bk3H{(Ub>7lqt1 zqVF?-KY*e-ev8vyV!GQA*jGv5Vb}ekPY?aSy=&1(jT4N!WViwv9o;uyUvj4dTu^$b!?6?0^| ze!WHapF%T9VI2}l`ZhDlu<36Sh%QhvGiRJ5(+LtEV`)Ka<}~*apgQuUcP{C5YWxiiGFtAGeHi#Zt0D>2h+Jk z6oQB@FjhAT*x$eX{uCb(1(QZj5Yex~SV2@r_IJ-n{U(71=1}Ox6P@~hK+G~ud zhcJ>!4J#!^ms)ItB=COn$Q#oE-8)BiM*oN+xMZIj&h%RYj>=|?< zKjZcGyEQv}yd*oLc}`QEjOr64;B^DFuNofE83J`9p^h$l;M8^t42|^@Qe-|AZTd+R z!d!Rnfm2kcpN{YEHSrN2xBhfwb|5+EuUE#+z=21Mx|jRzHFJi(qW&(K?+_DT-}C+p_gh^4x(A z1kALaoRMkK#R+2#;>r2wDEtNj)23%cy6zIwa)^9p7sPZVO+tJ8qP%3B%rTM%kw?}+ zOrQL)CL@>V!Xz|E&BZ5Bzdz2yjR9#%T{6*SspXKSRK|(Y^ zphM@fX*!~B1QXrQs&k*&rkw7({Z>JsBknt0CRV?HN+!B2s}96$xeD>#v9ne`euYHv^0A*~H3AbtF5}27->wgZN2> z=z^&31l`(D&@oFJ-{>+6i!SQO2lqs|V9>WeP{JhEl@nY?mnxxgZ|~utWBdC@TF}9x@nx=dkoV-Ou`$S=8}w)rIu?e%T!v0l@t*b)|q78g{kF$Dx394 z6Fh;gF(!TniYewtbP08J?H0c0DgyK^m=_NlKqFom7w^Dx1nWSHOI}25?`Xbk6V%r> zN22>U@IF@ED1`zAoW>X^f%c4}L&CP%l-H(k;7ya2t%9&}OQ1uW9jL z1Ah&~^15x*Fz$dv&h8>X|AjUjd2aACwdj1p*!vc>6YhE-On1xP+epw2N_Ee{a`PoR z6V)Gc4=iNqwjvu7svBAqY zdrj07nm${dm)Rrk2I*9e1|6rm$LJwFBV}ByRY&)~q8Cm3z;^>2-jfFn4-Cr19^y9i zKR7SJ%p?+;;S;z!2)@eeRgE2xHv`=G z%*;=~nKSbzx>u|YTXp1A9%O}`>=5B>1A>Z~NnoR6b-38pSWAI<-WUqdjYs~>>CDYd z0y|x-t}7Cv2cFgxl|XY*eb&jiP_-PyJzbN4o)LlWk)io;G5zgh2}Ea^7)xW+15aS= zOU8bpyL%c$XPGUhathGBocA%%QD4rjmXkgl@|CeVbW}EFdy4~fYzB=R^I1w@rz*B- ziv2zGDDD1Wox-3m#-g*-)sbq7bCrq^bmQKa-G}+iSTHkzS`Mk35+ADU0^J#31LBz_ z5uL%RBeEE`^G*o5lchlW>SA2?jss6OTMlSha@+LXpuR2u^zE;4$be$YvWQMkb*(6p za`{1TgYrF#!ff-V6C*l09f8RuV@7IB#Kh)MY@4u2JI(1r( zHfZb5)86T~P|!`Mll24YWm;Y`?t$tVQC*$Ws9&X%O#svH^)H`qQ2A(yF?yI<&bWg% z%L=)z8)E`Mzq#-m>}a+n6a6wdGH~x-Xr@sxG_P!ux9KSHO$T%^t+HUd9 z2lVqS3Gg(&MH8JkEjQe#PO-{I`{D!o0rkzF4c3ruel|-cx@QJnJB>P9&}K8GO$_J< z1v_}w>e7i$oPlpjHW%)BFrgRdN4u^;Uw`35Cyv#1#z_X<5S@15R6&}4w_|BQ4$#d~1v8uy8lx$VdOl>%`9Poja7Z2v zyyI2dQAX#Zoh}@pr$cm72fnd{8JXB-2k$lSanB)YITEX0UI%s#h&KAU0bqx*P{Ss^_7^ z=lo~XMBgMwbULf9n~3>=3Lhm$3wk1T^d1ME#Hxq)JlLJ0IKySN=)P2kI@?B@SWRfq 
zhMRMG(Di$gMEAAohPYAPW`2{uX+ckw=wuGO36V$UKIWVp^h7N>Wm-;UA<_Bvg`lTP zbSeo=K%=^+Ij06a8D$)!FxGdvm@U0d;Ufnb>WCXH)#oGn5gd6sE+6*m_biR0?VnE^Jru&n?TIzKtB_GbGvOiH+=p1@9!S; ziEo{UucSI$X$v#w#Gt>u{k*m9uD-0_+1*wD##dJ5a+W>akGJ!O)AenS*Wq?q6@>#2 z25o@+afhkIQ_r*h(>(&fsVl8Fu zoELOc9I4*B)a{eHeyRHJ2d2+;p+h9cGVsn!tUVN1j1bUuae#eb6~{{*zT@ys#p$c_ z6L)v@l^ytUEMM@f>U^up``huleR%xz@bLOX2XTDb)L32Jc{pFbQ;}E^X5{yM4;San?w$RDX*=WHeVE2-Z9LiA-~ZLPLT z1U)gL4`<+&?TgO0O9VYRq7PF2X}Xv#TWyvIy3wf-eUR#p)5UDoP+~3#^z?{6NcHBi zc20`9AkY&e`XJRi(`Tw3G3J6mS6PayUoCe(61r-MAr}OCl0+Y(x@oEO60$*0ljuWK zzXolkGTX(1o+!}=sBVh#JZqrfc(J8`o}%kM;=ms(ZI;2D3j*CFOY|YD-q^tA#2GPN z)n&4v_twr-Ngka`1HDL>=>5euQ|$K9IoHdbtrq>|RPWoxwJOYZ(V!!&n)w3B8y}n$7YuCB&hE~h7c4O=Mhh6@@zSWr2K-a3dE!Fh2M4i|E z!Cn8vLz~3$nd^gVQ$_c7ebjesmoy*9!W&R?t__*R_Jau0^i@0epvg Ubr=SfnE(I)07*qoM6N<$f;07L9smFU literal 0 HcmV?d00001 From 218c05915268015b825737fb103836a0ab679939 Mon Sep 17 00:00:00 2001 From: kanavnarula <33712591+kanavnarula@users.noreply.github.com> Date: Thu, 21 Nov 2024 01:00:21 +0530 Subject: [PATCH 10/24] feat(ui): Display username while removing the user from the group (#11706) --- .../src/app/entity/group/GroupMembers.tsx | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/datahub-web-react/src/app/entity/group/GroupMembers.tsx b/datahub-web-react/src/app/entity/group/GroupMembers.tsx index 147c3f8030d0e0..28e81b438d4cb4 100644 --- a/datahub-web-react/src/app/entity/group/GroupMembers.tsx +++ b/datahub-web-react/src/app/entity/group/GroupMembers.tsx @@ -137,12 +137,13 @@ export default function GroupMembers({ urn, pageSize, isExternalGroup, onChangeM }, 3000); }; - const onRemoveMember = (memberUrn: string) => { + const onRemoveMember = (memberEntity: CorpUser) => { + const memberName = entityRegistry.getDisplayName(EntityType.CorpUser, memberEntity); Modal.confirm({ title: `Confirm Group Member Removal`, - content: `Are you sure you want to remove this user from the group?`, + content: `Are you sure you want to remove ${memberName} user from the group?`, onOk() { - removeGroupMember(memberUrn); + removeGroupMember(memberEntity?.urn); }, onCancel() {}, okText: 'Yes', @@ -155,7 +156,7 @@ export default function GroupMembers({ urn, pageSize, isExternalGroup, onChangeM const total = relationships?.total || 0; const groupMembers = relationships?.relationships?.map((rel) => rel.entity as CorpUser) || []; - const getItems = (urnID: string): MenuProps['items'] => { + const getItems = (userEntity: CorpUser): MenuProps['items'] => { return [ { key: 'make', @@ -169,7 +170,7 @@ export default function GroupMembers({ urn, pageSize, isExternalGroup, onChangeM { key: 'remove', disabled: isExternalGroup, - onClick: () => onRemoveMember(urnID), + onClick: () => onRemoveMember(userEntity), label: ( Remove from Group @@ -210,7 +211,7 @@ export default function GroupMembers({ urn, pageSize, isExternalGroup, onChangeM - + From 3f267afc9743d341c35dc1032de0fc0f53655203 Mon Sep 17 00:00:00 2001 From: sid-acryl <155424659+sid-acryl@users.noreply.github.com> Date: Thu, 21 Nov 2024 01:01:54 +0530 Subject: [PATCH 11/24] fix(ingest/powerbi): m-query fixes (#11906) Co-authored-by: Harshal Sheth Co-authored-by: Aseem Bansal --- .../powerbi/m_query/native_sql_parser.py | 20 +++- .../source/powerbi/m_query/resolver.py | 94 ++++++++++--------- .../powerbi/test_native_sql_parser.py | 10 ++ 3 files changed, 78 insertions(+), 46 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/native_sql_parser.py 
b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/native_sql_parser.py index 61b1164825257e..63a6073c90a1a9 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/native_sql_parser.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/native_sql_parser.py @@ -72,10 +72,24 @@ def get_tables(native_query: str) -> List[str]: def remove_drop_statement(query: str) -> str: # Certain PowerBI M-Queries contain a combination of DROP and SELECT statements within SQL, causing SQLParser to fail on these queries. # Therefore, these occurrences are being removed. - # Regular expression to match patterns like "DROP TABLE IF EXISTS #;" - pattern = r"DROP TABLE IF EXISTS #\w+;?" - return re.sub(pattern, "", query) + patterns = [ + # Regular expression to match patterns like: + # "DROP TABLE IF EXISTS #;" + # "DROP TABLE IF EXISTS #, , ...;" + # "DROP TABLE IF EXISTS #, , ...\n" + r"DROP\s+TABLE\s+IF\s+EXISTS\s+(?:#?\w+(?:,\s*#?\w+)*)[;\n]", + ] + + new_query = query + + for pattern in patterns: + new_query = re.sub(pattern, "", new_query, flags=re.IGNORECASE) + + # Remove extra spaces caused by consecutive replacements + new_query = re.sub(r"\s+", " ", new_query).strip() + + return new_query def parse_custom_sql( diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py index 9eafde2f75ecdd..32de95d6bd015e 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py @@ -83,6 +83,16 @@ def urn_creator( ) +def get_next_item(items: List[str], item: str) -> Optional[str]: + if item in items: + try: + index = items.index(item) + return items[index + 1] + except IndexError: + logger.debug(f'item:"{item}", not found in item-list: {items}') + return None + + class AbstractDataPlatformTableCreator(ABC): """ Base class to share common functionalities among different dataplatform for M-Query parsing. @@ -675,7 +685,7 @@ def two_level_access_pattern( data_access_func_detail.arg_list ) if server is None or db_name is None: - return Lineage.empty() # Return empty list + return Lineage.empty() # Return an empty list schema_name: str = cast( IdentifierAccessor, data_access_func_detail.identifier_accessor @@ -782,32 +792,38 @@ def create_lineage( ), ) - if len(arguments) == 2: - # It is a regular case of MS-SQL - logger.debug("Handling with regular case") - return self.two_level_access_pattern(data_access_func_detail) - - if len(arguments) >= 4 and arguments[2] != "Query": - logger.debug("Unsupported case is found. 
Second index is not the Query") - return Lineage.empty() + server, database = self.get_db_detail_from_argument( + data_access_func_detail.arg_list + ) + if server is None or database is None: + return Lineage.empty() # Return an empty list + + assert server + assert database # to silent the lint + + query: Optional[str] = get_next_item(arguments, "Query") + if query: + if self.config.enable_advance_lineage_sql_construct is False: + # Use previous parser to generate URN to keep backward compatibility + return Lineage( + upstreams=self.create_urn_using_old_parser( + query=query, + db_name=database, + server=server, + ), + column_lineage=[], + ) - if self.config.enable_advance_lineage_sql_construct is False: - # Use previous parser to generate URN to keep backward compatibility - return Lineage( - upstreams=self.create_urn_using_old_parser( - query=arguments[3], - db_name=arguments[1], - server=arguments[0], - ), - column_lineage=[], + return self.parse_custom_sql( + query=query, + database=database, + server=server, + schema=MSSqlDataPlatformTableCreator.DEFAULT_SCHEMA, ) - return self.parse_custom_sql( - query=arguments[3], - database=arguments[1], - server=arguments[0], - schema=MSSqlDataPlatformTableCreator.DEFAULT_SCHEMA, - ) + # It is a regular case of MS-SQL + logger.debug("Handling with regular case") + return self.two_level_access_pattern(data_access_func_detail) class OracleDataPlatformTableCreator(AbstractDataPlatformTableCreator): @@ -1154,27 +1170,19 @@ def get_db_name(self, data_access_tokens: List[str]) -> Optional[str]: != SupportedDataPlatform.DatabricksMultiCloud_SQL.value.powerbi_data_platform_name ): return None - try: - if "Database" in data_access_tokens: - index = data_access_tokens.index("Database") - if data_access_tokens[index + 1] != Constant.M_QUERY_NULL: - # Database name is explicitly set in argument - return data_access_tokens[index + 1] - if "Name" in data_access_tokens: - index = data_access_tokens.index("Name") - # Next element is value of the Name. It is a database name - return data_access_tokens[index + 1] + database: Optional[str] = get_next_item(data_access_tokens, "Database") - if "Catalog" in data_access_tokens: - index = data_access_tokens.index("Catalog") - # Next element is value of the Catalog. In Databricks Catalog can also be used in place of a database. 
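The get_next_item helper added in resolver.py replaces the repeated index()/IndexError bookkeeping visible in the removed branches. A standalone sketch of its behaviour, assuming an already-tokenised M-Query argument list (the token values below are illustrative, not taken from a real workspace):

    from typing import List, Optional

    def get_next_item(items: List[str], item: str) -> Optional[str]:
        # Same lookup as the helper in resolver.py: return the token that
        # immediately follows `item`, or None when it is absent or last.
        if item in items:
            try:
                index = items.index(item)
                return items[index + 1]
            except IndexError:
                return None
        return None

    # Illustrative token stream for a Databricks.Catalogs data access call.
    tokens = ["Databricks.Catalogs", "Database", "null", "Name", "analytics"]
    assert get_next_item(tokens, "Database") == "null"   # explicit null, so fall back to Name
    assert get_next_item(tokens, "Name") == "analytics"
    assert get_next_item(tokens, "Catalog") is None      # Catalog argument not present

get_db_name then simply chains these lookups: Database (when not null), then Name, then Catalog.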
- return data_access_tokens[index + 1] - - except IndexError as e: - logger.debug("Database name is not available", exc_info=e) - - return None + if ( + database and database != Constant.M_QUERY_NULL + ): # database name is explicitly set + return database + + return get_next_item( # database name is set in Name argument + data_access_tokens, "Name" + ) or get_next_item( # If both above arguments are not available, then try Catalog + data_access_tokens, "Catalog" + ) def create_lineage( self, data_access_func_detail: DataAccessFunctionDetail diff --git a/metadata-ingestion/tests/integration/powerbi/test_native_sql_parser.py b/metadata-ingestion/tests/integration/powerbi/test_native_sql_parser.py index 53e184515c1d8c..887f7fe4d6f44a 100644 --- a/metadata-ingestion/tests/integration/powerbi/test_native_sql_parser.py +++ b/metadata-ingestion/tests/integration/powerbi/test_native_sql_parser.py @@ -19,3 +19,13 @@ def test_simple_from(): assert len(tables) == 1 assert tables[0] == "OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_APS_SME_UNITS_V4" + + +def test_drop_statement(): + expected: str = "SELECT#(lf)concat((UPPER(REPLACE(SELLER,'-',''))), MONTHID) as AGENT_KEY,#(lf)concat((UPPER(REPLACE(CLIENT_DIRECTOR,'-',''))), MONTHID) as CD_AGENT_KEY,#(lf) *#(lf)FROM#(lf)OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_APS_SME_UNITS_V4" + + query: str = "DROP TABLE IF EXISTS #table1; DROP TABLE IF EXISTS #table1,#table2; DROP TABLE IF EXISTS table1; DROP TABLE IF EXISTS table1, #table2;SELECT#(lf)concat((UPPER(REPLACE(SELLER,'-',''))), MONTHID) as AGENT_KEY,#(lf)concat((UPPER(REPLACE(CLIENT_DIRECTOR,'-',''))), MONTHID) as CD_AGENT_KEY,#(lf) *#(lf)FROM#(lf)OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_APS_SME_UNITS_V4" + + actual: str = native_sql_parser.remove_drop_statement(query) + + assert actual == expected From 2a37483b49fb6d2b76defd99e1678add6e9d4527 Mon Sep 17 00:00:00 2001 From: Meenakshi Kamalaseshan Radha <62914384+mkamalas@users.noreply.github.com> Date: Thu, 21 Nov 2024 01:37:07 +0530 Subject: [PATCH 12/24] fix(auth)- Fix Redirect url flow in OidcCallback (#11878) --- datahub-frontend/app/auth/sso/oidc/OidcCallbackLogic.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/datahub-frontend/app/auth/sso/oidc/OidcCallbackLogic.java b/datahub-frontend/app/auth/sso/oidc/OidcCallbackLogic.java index ef5833f607efdb..113aeeb36551f0 100644 --- a/datahub-frontend/app/auth/sso/oidc/OidcCallbackLogic.java +++ b/datahub-frontend/app/auth/sso/oidc/OidcCallbackLogic.java @@ -130,8 +130,6 @@ public Object perform( CallContext ctx = ctxResult.getFirst(); Result result = (Result) ctxResult.getSecond(); - setContextRedirectUrl(ctx); - // Handle OIDC authentication errors. 
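The reworked remove_drop_statement in native_sql_parser.py (PATCH 11 above) now strips multi-table DROP TABLE IF EXISTS prefixes case-insensitively before SQL parsing. A self-contained sketch of the same cleanup, with an illustrative query:

    import re

    DROP_PATTERN = r"DROP\s+TABLE\s+IF\s+EXISTS\s+(?:#?\w+(?:,\s*#?\w+)*)[;\n]"

    def strip_drop_statements(query: str) -> str:
        # Remove "DROP TABLE IF EXISTS a, #b, c;" (or newline-terminated) prefixes,
        # then collapse the whitespace left behind by the removals.
        cleaned = re.sub(DROP_PATTERN, "", query, flags=re.IGNORECASE)
        return re.sub(r"\s+", " ", cleaned).strip()

    sample = "drop table if exists #tmp1, tmp2;\nSELECT col FROM analytics.sales"
    assert strip_drop_statements(sample) == "SELECT col FROM analytics.sales"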
if (OidcResponseErrorHandler.isError(ctx)) { return OidcResponseErrorHandler.handleError(ctx); @@ -192,6 +190,9 @@ private Pair superPerform( } } + // Set the redirect url from cookie before creating action + setContextRedirectUrl(ctx); + action = this.redirectToOriginallyRequestedUrl(ctx, defaultUrl); } } catch (RuntimeException var20) { From 7dbb3e60cbefe8da96985a2de607eb2ef18514f4 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Wed, 20 Nov 2024 13:33:30 -0800 Subject: [PATCH 13/24] chore(ingest): start using explicit exports (#11899) --- metadata-ingestion/scripts/avro_codegen.py | 4 ++-- metadata-ingestion/setup.cfg | 10 +++++++- .../datahub/api/circuit_breaker/__init__.py | 7 ++++++ .../api/circuit_breaker/circuit_breaker.py | 2 +- .../datahub/api/entities/datajob/__init__.py | 3 +++ .../datahub/api/entities/datajob/dataflow.py | 3 +-- .../datahub/api/entities/datajob/datajob.py | 3 +-- .../src/datahub/api/graphql/__init__.py | 2 ++ metadata-ingestion/src/datahub/cli/put_cli.py | 3 ++- .../src/datahub/configuration/__init__.py | 5 ++-- .../src/datahub/configuration/common.py | 2 +- .../src/datahub/configuration/json_loader.py | 2 +- .../datahub/configuration/source_common.py | 8 ++----- .../src/datahub/configuration/yaml.py | 2 +- .../src/datahub/emitter/mce_builder.py | 6 ++++- metadata-ingestion/src/datahub/entrypoints.py | 7 ++---- .../src/datahub/ingestion/api/decorators.py | 5 +++- .../ingestion/extractor/json_schema_util.py | 6 ++--- .../ingestion/extractor/protobuf_util.py | 6 ++--- .../src/datahub/ingestion/graph/client.py | 4 +++- .../datahub_ingestion_run_summary_provider.py | 3 +-- .../ingestion/source/dbt/dbt_common.py | 3 +-- .../ingestion/source/delta_lake/config.py | 3 +-- .../ingestion/source/dynamodb/dynamodb.py | 18 +++++++-------- .../ingestion/source/fivetran/config.py | 3 ++- .../ingestion/source/looker/looker_common.py | 10 ++++---- .../source/looker/looker_dataclasses.py | 10 ++++---- .../source/looker/looker_file_loader.py | 6 ++--- .../ingestion/source/looker/looker_usage.py | 3 ++- .../source/looker/looker_view_id_cache.py | 4 ++-- .../ingestion/source/looker/lookml_config.py | 14 ++++------- .../source/looker/lookml_refinement.py | 2 +- .../ingestion/source/looker/lookml_source.py | 23 ++++++++++--------- .../ingestion/source/looker/view_upstream.py | 2 +- .../src/datahub/ingestion/source/mongodb.py | 14 +++++------ .../ingestion/source/powerbi/config.py | 4 ++-- .../src/datahub/ingestion/source/redash.py | 2 +- .../src/datahub/ingestion/source/sac/sac.py | 7 ++---- .../source/schema_inference/csv_tsv.py | 6 ++--- .../ingestion/source/schema_inference/json.py | 6 ++--- .../source/schema_inference/parquet.py | 6 ++--- .../source/snowflake/snowflake_lineage_v2.py | 8 ++++--- .../source/snowflake/snowflake_v2.py | 6 ++--- .../src/datahub/specific/dataset.py | 4 ++-- .../src/datahub/sql_parsing/_models.py | 2 ++ .../datahub/sql_parsing/schema_resolver.py | 2 +- .../testing/check_sql_parser_result.py | 8 ++----- .../datahub/utilities/urns/corp_group_urn.py | 4 +++- .../datahub/utilities/urns/corpuser_urn.py | 4 +++- .../datahub/utilities/urns/data_flow_urn.py | 4 +++- .../datahub/utilities/urns/data_job_urn.py | 4 +++- .../utilities/urns/data_platform_urn.py | 4 +++- .../urns/data_process_instance_urn.py | 4 +++- .../src/datahub/utilities/urns/dataset_urn.py | 4 +++- .../src/datahub/utilities/urns/domain_urn.py | 4 +++- .../datahub/utilities/urns/notebook_urn.py | 4 +++- .../urns/structured_properties_urn.py | 4 +++- .../src/datahub/utilities/urns/tag_urn.py | 
4 +++- .../src/datahub/utilities/urns/urn.py | 4 +++- .../tests/integration/lookml/test_lookml.py | 8 +++---- .../tableau/test_tableau_ingest.py | 3 +-- .../tests/test_helpers/docker_helpers.py | 8 +++---- .../tests/test_helpers/mce_helpers.py | 9 ++++---- .../tests/test_helpers/type_helpers.py | 7 ------ .../tests/unit/glue/test_glue_source.py | 13 +++++------ .../unit/redshift/test_redshift_source.py | 10 ++++---- .../tests/unit/serde/test_serde.py | 11 ++++----- .../unit/sql_parsing/test_sqlglot_utils.py | 6 ++--- .../state/test_stateful_ingestion.py | 3 ++- 69 files changed, 206 insertions(+), 189 deletions(-) diff --git a/metadata-ingestion/scripts/avro_codegen.py b/metadata-ingestion/scripts/avro_codegen.py index e2dd5151439923..e5792da32fb5d7 100644 --- a/metadata-ingestion/scripts/avro_codegen.py +++ b/metadata-ingestion/scripts/avro_codegen.py @@ -769,7 +769,7 @@ def generate( import importlib from typing import TYPE_CHECKING -from datahub._codegen.aspect import _Aspect +from datahub._codegen.aspect import _Aspect as _Aspect from datahub.utilities.docs_build import IS_SPHINX_BUILD from datahub.utilities._custom_package_loader import get_custom_models_package @@ -802,7 +802,7 @@ def generate( from datahub.utilities.docs_build import IS_SPHINX_BUILD from datahub.utilities._custom_package_loader import get_custom_urns_package -from datahub.utilities.urns._urn_base import Urn # noqa: F401 +from datahub.utilities.urns._urn_base import Urn as Urn # noqa: F401 _custom_package_path = get_custom_urns_package() diff --git a/metadata-ingestion/setup.cfg b/metadata-ingestion/setup.cfg index c095420e4e3f30..057779bc87c622 100644 --- a/metadata-ingestion/setup.cfg +++ b/metadata-ingestion/setup.cfg @@ -31,7 +31,7 @@ exclude = __pycache__ per-file-ignores = # imported but unused - __init__.py: F401 + __init__.py: F401, I250 ban-relative-imports = true [mypy] @@ -53,6 +53,14 @@ disallow_untyped_defs = no # try to be a bit more strict in certain areas of the codebase [mypy-datahub.*] ignore_missing_imports = no +implicit_reexport = no +[mypy-datahub.metadata.*] +# TODO: Remove this once all the code has been updated. +implicit_reexport = yes +[mypy-datahub.ingestion.*] +# TODO: Remove this once all the code has been updated. 
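The new implicit_reexport = no default under [mypy-datahub.*] is what drives the import churn in the rest of this patch: with that flag, mypy only treats an imported name as part of a module's public surface when the module says so explicitly. A minimal sketch of the two idioms the patch relies on (the module and class names here are illustrative):

    # mypackage/__init__.py  (illustrative module, not part of the patch)

    # Idiom 1: a redundant alias marks the name as an intentional re-export.
    from mypackage.client import Client as Client

    # Idiom 2: listing the name in __all__ has the same effect.
    from mypackage.settings import Settings

    __all__ = ["Settings"]

The generated datahub.metadata.* and datahub.ingestion.* packages keep a temporary opt-out (implicit_reexport = yes) until they are migrated.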
+implicit_reexport = yes + [mypy-datahub_provider.*] ignore_missing_imports = no [mypy-tests.*] diff --git a/metadata-ingestion/src/datahub/api/circuit_breaker/__init__.py b/metadata-ingestion/src/datahub/api/circuit_breaker/__init__.py index 27317826264b85..0b04bfa4025a1b 100644 --- a/metadata-ingestion/src/datahub/api/circuit_breaker/__init__.py +++ b/metadata-ingestion/src/datahub/api/circuit_breaker/__init__.py @@ -12,3 +12,10 @@ ) requests_logger.setLevel(logging.WARNING) + +__all__ = [ + "AssertionCircuitBreaker", + "AssertionCircuitBreakerConfig", + "OperationCircuitBreaker", + "OperationCircuitBreakerConfig", +] diff --git a/metadata-ingestion/src/datahub/api/circuit_breaker/circuit_breaker.py b/metadata-ingestion/src/datahub/api/circuit_breaker/circuit_breaker.py index a3c54046faf681..7c1180536a90fb 100644 --- a/metadata-ingestion/src/datahub/api/circuit_breaker/circuit_breaker.py +++ b/metadata-ingestion/src/datahub/api/circuit_breaker/circuit_breaker.py @@ -6,7 +6,7 @@ from gql.transport.requests import RequestsHTTPTransport from pydantic import Field -from datahub.configuration import ConfigModel +from datahub.configuration.common import ConfigModel logger = logging.getLogger(__name__) diff --git a/metadata-ingestion/src/datahub/api/entities/datajob/__init__.py b/metadata-ingestion/src/datahub/api/entities/datajob/__init__.py index 6d85a1569cb63d..3a073005968222 100644 --- a/metadata-ingestion/src/datahub/api/entities/datajob/__init__.py +++ b/metadata-ingestion/src/datahub/api/entities/datajob/__init__.py @@ -1,2 +1,5 @@ from datahub.api.entities.datajob.dataflow import DataFlow from datahub.api.entities.datajob.datajob import DataJob + +# TODO: Remove this and start importing directly from the inner files. +__all__ = ["DataFlow", "DataJob"] diff --git a/metadata-ingestion/src/datahub/api/entities/datajob/dataflow.py b/metadata-ingestion/src/datahub/api/entities/datajob/dataflow.py index f2436d56d5aca1..e169c07445e969 100644 --- a/metadata-ingestion/src/datahub/api/entities/datajob/dataflow.py +++ b/metadata-ingestion/src/datahub/api/entities/datajob/dataflow.py @@ -3,7 +3,6 @@ from typing import Callable, Dict, Iterable, List, Optional, Set, cast import datahub.emitter.mce_builder as builder -from datahub.configuration.source_common import ALL_ENV_TYPES from datahub.emitter.generic_emitter import Emitter from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.metadata.schema_classes import ( @@ -114,7 +113,7 @@ def generate_tags_aspect(self) -> List[GlobalTagsClass]: def _get_env(self) -> Optional[str]: env: Optional[str] = None - if self.env and self.env.upper() in ALL_ENV_TYPES: + if self.env and self.env.upper() in builder.ALL_ENV_TYPES: env = self.env.upper() else: logger.debug( diff --git a/metadata-ingestion/src/datahub/api/entities/datajob/datajob.py b/metadata-ingestion/src/datahub/api/entities/datajob/datajob.py index 0f5d18c20e055b..4958a68caa95fe 100644 --- a/metadata-ingestion/src/datahub/api/entities/datajob/datajob.py +++ b/metadata-ingestion/src/datahub/api/entities/datajob/datajob.py @@ -3,7 +3,6 @@ from typing import Callable, Dict, Iterable, List, Optional, Set import datahub.emitter.mce_builder as builder -from datahub.configuration.source_common import ALL_ENV_TYPES from datahub.emitter.generic_emitter import Emitter from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.metadata.schema_classes import ( @@ -109,7 +108,7 @@ def generate_mcp( self, materialize_iolets: bool = True ) -> 
Iterable[MetadataChangeProposalWrapper]: env: Optional[str] = None - if self.flow_urn.cluster.upper() in ALL_ENV_TYPES: + if self.flow_urn.cluster.upper() in builder.ALL_ENV_TYPES: env = self.flow_urn.cluster.upper() else: logger.debug( diff --git a/metadata-ingestion/src/datahub/api/graphql/__init__.py b/metadata-ingestion/src/datahub/api/graphql/__init__.py index e8c8d22bbb93df..d818b19092fcbe 100644 --- a/metadata-ingestion/src/datahub/api/graphql/__init__.py +++ b/metadata-ingestion/src/datahub/api/graphql/__init__.py @@ -1,2 +1,4 @@ from datahub.api.graphql.assertion import Assertion from datahub.api.graphql.operation import Operation + +__all__ = ["Assertion", "Operation"] diff --git a/metadata-ingestion/src/datahub/cli/put_cli.py b/metadata-ingestion/src/datahub/cli/put_cli.py index 989b1a6d02fd01..0a40a9f4ccf92d 100644 --- a/metadata-ingestion/src/datahub/cli/put_cli.py +++ b/metadata-ingestion/src/datahub/cli/put_cli.py @@ -6,11 +6,12 @@ from datahub.cli.cli_utils import post_entity from datahub.configuration.config_loader import load_config_file -from datahub.emitter.mcp import MetadataChangeProposalWrapper, SystemMetadataClass +from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.ingestion.graph.client import get_default_graph from datahub.metadata.schema_classes import ( DataPlatformInfoClass as DataPlatformInfo, PlatformTypeClass, + SystemMetadataClass, ) from datahub.telemetry import telemetry from datahub.upgrade import upgrade diff --git a/metadata-ingestion/src/datahub/configuration/__init__.py b/metadata-ingestion/src/datahub/configuration/__init__.py index 008d788072d0a5..21979829a4453d 100644 --- a/metadata-ingestion/src/datahub/configuration/__init__.py +++ b/metadata-ingestion/src/datahub/configuration/__init__.py @@ -1,5 +1,4 @@ from datahub.configuration.common import ( - ConfigModel, - ConfigurationMechanism, - DynamicTypedConfig, + ConfigModel as ConfigModel, + DynamicTypedConfig as DynamicTypedConfig, ) diff --git a/metadata-ingestion/src/datahub/configuration/common.py b/metadata-ingestion/src/datahub/configuration/common.py index 0ce7127b440534..4fdf564162410c 100644 --- a/metadata-ingestion/src/datahub/configuration/common.py +++ b/metadata-ingestion/src/datahub/configuration/common.py @@ -21,7 +21,7 @@ from pydantic.fields import Field from typing_extensions import Protocol -from datahub.configuration._config_enum import ConfigEnum +from datahub.configuration._config_enum import ConfigEnum as ConfigEnum # noqa: I250 from datahub.configuration.pydantic_migration_helpers import PYDANTIC_VERSION_2 from datahub.utilities.dedup_list import deduplicate_list diff --git a/metadata-ingestion/src/datahub/configuration/json_loader.py b/metadata-ingestion/src/datahub/configuration/json_loader.py index 35667eb5951fc7..6ecb741be528d1 100644 --- a/metadata-ingestion/src/datahub/configuration/json_loader.py +++ b/metadata-ingestion/src/datahub/configuration/json_loader.py @@ -1,7 +1,7 @@ import json from typing import IO -from datahub.configuration import ConfigurationMechanism +from datahub.configuration.common import ConfigurationMechanism class JsonConfigurationMechanism(ConfigurationMechanism): diff --git a/metadata-ingestion/src/datahub/configuration/source_common.py b/metadata-ingestion/src/datahub/configuration/source_common.py index ad12447532335f..44c737f1bd13d4 100644 --- a/metadata-ingestion/src/datahub/configuration/source_common.py +++ b/metadata-ingestion/src/datahub/configuration/source_common.py @@ -1,14 +1,10 @@ -from typing import 
Dict, Optional, Set +from typing import Dict, Optional from pydantic import validator from pydantic.fields import Field from datahub.configuration.common import ConfigModel -from datahub.emitter.enum_helpers import get_enum_options -from datahub.metadata.schema_classes import FabricTypeClass - -DEFAULT_ENV = FabricTypeClass.PROD -ALL_ENV_TYPES: Set[str] = set(get_enum_options(FabricTypeClass)) +from datahub.emitter.mce_builder import ALL_ENV_TYPES, DEFAULT_ENV class PlatformInstanceConfigMixin(ConfigModel): diff --git a/metadata-ingestion/src/datahub/configuration/yaml.py b/metadata-ingestion/src/datahub/configuration/yaml.py index 1f1172836f7448..c069845e1de119 100644 --- a/metadata-ingestion/src/datahub/configuration/yaml.py +++ b/metadata-ingestion/src/datahub/configuration/yaml.py @@ -2,7 +2,7 @@ import yaml -from datahub.configuration import ConfigurationMechanism +from datahub.configuration.common import ConfigurationMechanism class YamlConfigurationMechanism(ConfigurationMechanism): diff --git a/metadata-ingestion/src/datahub/emitter/mce_builder.py b/metadata-ingestion/src/datahub/emitter/mce_builder.py index 63b03db7f5b604..69946c575908b5 100644 --- a/metadata-ingestion/src/datahub/emitter/mce_builder.py +++ b/metadata-ingestion/src/datahub/emitter/mce_builder.py @@ -13,6 +13,7 @@ Any, List, Optional, + Set, Tuple, Type, TypeVar, @@ -24,7 +25,6 @@ import typing_inspect from avrogen.dict_wrapper import DictWrapper -from datahub.configuration.source_common import DEFAULT_ENV from datahub.emitter.enum_helpers import get_enum_options from datahub.metadata.schema_classes import ( AssertionKeyClass, @@ -35,6 +35,7 @@ DatasetKeyClass, DatasetLineageTypeClass, DatasetSnapshotClass, + FabricTypeClass, GlobalTagsClass, GlossaryTermAssociationClass, GlossaryTermsClass as GlossaryTerms, @@ -56,6 +57,9 @@ logger = logging.getLogger(__name__) Aspect = TypeVar("Aspect", bound=AspectAbstract) +DEFAULT_ENV = FabricTypeClass.PROD +ALL_ENV_TYPES: Set[str] = set(get_enum_options(FabricTypeClass)) + DEFAULT_FLOW_CLUSTER = "prod" UNKNOWN_USER = "urn:li:corpuser:unknown" DATASET_URN_TO_LOWER: bool = ( diff --git a/metadata-ingestion/src/datahub/entrypoints.py b/metadata-ingestion/src/datahub/entrypoints.py index d088380d5d38c4..85968f050a3716 100644 --- a/metadata-ingestion/src/datahub/entrypoints.py +++ b/metadata-ingestion/src/datahub/entrypoints.py @@ -13,13 +13,10 @@ generate_access_token, make_shim_command, ) -from datahub.cli.config_utils import ( - DATAHUB_CONFIG_PATH, - get_boolean_env_variable, - write_gms_config, -) +from datahub.cli.config_utils import DATAHUB_CONFIG_PATH, write_gms_config from datahub.cli.delete_cli import delete from datahub.cli.docker_cli import docker +from datahub.cli.env_utils import get_boolean_env_variable from datahub.cli.exists_cli import exists from datahub.cli.get_cli import get from datahub.cli.ingest_cli import ingest diff --git a/metadata-ingestion/src/datahub/ingestion/api/decorators.py b/metadata-ingestion/src/datahub/ingestion/api/decorators.py index b390ffb9dd0362..d32c0b85ceef4c 100644 --- a/metadata-ingestion/src/datahub/ingestion/api/decorators.py +++ b/metadata-ingestion/src/datahub/ingestion/api/decorators.py @@ -3,7 +3,10 @@ from typing import Callable, Dict, Optional, Type from datahub.ingestion.api.common import PipelineContext -from datahub.ingestion.api.source import Source, SourceCapability +from datahub.ingestion.api.source import ( # noqa: I250 + Source, + SourceCapability as SourceCapability, +) def config_class(config_cls: Type) -> 
Callable[[Type], Type]: diff --git a/metadata-ingestion/src/datahub/ingestion/extractor/json_schema_util.py b/metadata-ingestion/src/datahub/ingestion/extractor/json_schema_util.py index bcf077154343c8..88d1fcc52e2196 100644 --- a/metadata-ingestion/src/datahub/ingestion/extractor/json_schema_util.py +++ b/metadata-ingestion/src/datahub/ingestion/extractor/json_schema_util.py @@ -23,7 +23,7 @@ RecordTypeClass, SchemaFieldClass as SchemaField, SchemaFieldDataTypeClass, - SchemaMetadataClass as SchemaMetadata, + SchemaMetadataClass, StringTypeClass, UnionTypeClass, ) @@ -665,13 +665,13 @@ def get_schema_metadata( name: str, json_schema: Dict[Any, Any], raw_schema_string: Optional[str] = None, -) -> SchemaMetadata: +) -> SchemaMetadataClass: json_schema_as_string = raw_schema_string or json.dumps(json_schema) md5_hash: str = md5(json_schema_as_string.encode()).hexdigest() schema_fields = list(JsonSchemaTranslator.get_fields_from_schema(json_schema)) - schema_metadata = SchemaMetadata( + schema_metadata = SchemaMetadataClass( schemaName=name, platform=f"urn:li:dataPlatform:{platform}", version=0, diff --git a/metadata-ingestion/src/datahub/ingestion/extractor/protobuf_util.py b/metadata-ingestion/src/datahub/ingestion/extractor/protobuf_util.py index f62bb184252d98..e947aff384871d 100644 --- a/metadata-ingestion/src/datahub/ingestion/extractor/protobuf_util.py +++ b/metadata-ingestion/src/datahub/ingestion/extractor/protobuf_util.py @@ -32,7 +32,7 @@ OneofDescriptor, ) -from datahub.metadata.com.linkedin.pegasus2avro.schema import ( +from datahub.metadata.schema_classes import ( ArrayTypeClass, BooleanTypeClass, BytesTypeClass, @@ -41,8 +41,8 @@ MapTypeClass, NumberTypeClass, RecordTypeClass, - SchemaField, - SchemaFieldDataType, + SchemaFieldClass as SchemaField, + SchemaFieldDataTypeClass as SchemaFieldDataType, StringTypeClass, UnionTypeClass, ) diff --git a/metadata-ingestion/src/datahub/ingestion/graph/client.py b/metadata-ingestion/src/datahub/ingestion/graph/client.py index c90ac93eee2cc2..759aebcfd46b0a 100644 --- a/metadata-ingestion/src/datahub/ingestion/graph/client.py +++ b/metadata-ingestion/src/datahub/ingestion/graph/client.py @@ -33,7 +33,9 @@ from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.emitter.rest_emitter import DatahubRestEmitter from datahub.emitter.serialization_helper import post_json_transform -from datahub.ingestion.graph.config import DatahubClientConfig +from datahub.ingestion.graph.config import ( # noqa: I250; TODO: Remove this alias + DatahubClientConfig as DatahubClientConfig, +) from datahub.ingestion.graph.connections import ( connections_gql, get_id_from_connection_urn, diff --git a/metadata-ingestion/src/datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py b/metadata-ingestion/src/datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py index 33bfb63feb3fd7..5961a553a14943 100644 --- a/metadata-ingestion/src/datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +++ b/metadata-ingestion/src/datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py @@ -11,9 +11,8 @@ redact_raw_config, ) from datahub.emitter.aspect import JSON_CONTENT_TYPE -from datahub.emitter.mce_builder import datahub_guid +from datahub.emitter.mce_builder import datahub_guid, make_data_platform_urn from datahub.emitter.mcp import MetadataChangeProposalWrapper -from datahub.emitter.mcp_builder import make_data_platform_urn from datahub.ingestion.api.common import PipelineContext, RecordEnvelope from 
datahub.ingestion.api.pipeline_run_listener import PipelineRunListener from datahub.ingestion.api.sink import NoopWriteCallback, Sink diff --git a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py index b5d0ed42e651ea..4598ae388b827d 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py @@ -117,9 +117,8 @@ ViewPropertiesClass, ) from datahub.metadata.urns import DatasetUrn -from datahub.sql_parsing.schema_resolver import SchemaResolver +from datahub.sql_parsing.schema_resolver import SchemaInfo, SchemaResolver from datahub.sql_parsing.sqlglot_lineage import ( - SchemaInfo, SqlParsingDebugInfo, SqlParsingResult, infer_output_schema, diff --git a/metadata-ingestion/src/datahub/ingestion/source/delta_lake/config.py b/metadata-ingestion/src/datahub/ingestion/source/delta_lake/config.py index 81a54d1327d05a..d2b4a576953daf 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/delta_lake/config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/delta_lake/config.py @@ -6,9 +6,8 @@ from pydantic import Field from typing_extensions import Literal -from datahub.configuration.common import AllowDenyPattern +from datahub.configuration.common import AllowDenyPattern, ConfigModel from datahub.configuration.source_common import ( - ConfigModel, EnvConfigMixin, PlatformInstanceConfigMixin, ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/dynamodb/dynamodb.py b/metadata-ingestion/src/datahub/ingestion/source/dynamodb/dynamodb.py index acda656526ef53..4f1de6fb06c695 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/dynamodb/dynamodb.py +++ b/metadata-ingestion/src/datahub/ingestion/source/dynamodb/dynamodb.py @@ -52,24 +52,22 @@ from datahub.ingestion.source.state.stateful_ingestion_base import ( StatefulIngestionSourceBase, ) -from datahub.metadata.com.linkedin.pegasus2avro.schema import ( +from datahub.metadata.schema_classes import ( ArrayTypeClass, BooleanTypeClass, BytesTypeClass, + DataPlatformInstanceClass, + DatasetPropertiesClass, NullTypeClass, NumberTypeClass, RecordTypeClass, - SchemaField, - SchemaFieldDataType, + SchemaFieldClass as SchemaField, + SchemaFieldDataTypeClass as SchemaFieldDataType, SchemalessClass, - SchemaMetadata, + SchemaMetadataClass, StringTypeClass, UnionTypeClass, ) -from datahub.metadata.schema_classes import ( - DataPlatformInstanceClass, - DatasetPropertiesClass, -) from datahub.utilities.registries.domain_registry import DomainRegistry MAX_ITEMS_TO_RETRIEVE = 100 @@ -448,7 +446,7 @@ def construct_schema_metadata( dataset_properties: DatasetPropertiesClass, schema: Dict[Tuple[str, ...], SchemaDescription], primary_key_dict: Dict[str, str], - ) -> SchemaMetadata: + ) -> SchemaMetadataClass: """ " To construct the schema metadata, it will first sort the schema by the occurrence of attribute names in descending order and truncate the schema by MAX_SCHEMA_SIZE, and then start to construct the @@ -502,7 +500,7 @@ def construct_schema_metadata( canonical_schema.append(field) # create schema metadata object for table - schema_metadata = SchemaMetadata( + schema_metadata = SchemaMetadataClass( schemaName=table_name, platform=f"urn:li:dataPlatform:{self.platform}", version=0, diff --git a/metadata-ingestion/src/datahub/ingestion/source/fivetran/config.py b/metadata-ingestion/src/datahub/ingestion/source/fivetran/config.py index e40e284d6e0a42..86826ae7bedc09 
100644 --- a/metadata-ingestion/src/datahub/ingestion/source/fivetran/config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/fivetran/config.py @@ -12,8 +12,9 @@ ConfigModel, ConfigurationWarning, ) -from datahub.configuration.source_common import DEFAULT_ENV, DatasetSourceConfigMixin +from datahub.configuration.source_common import DatasetSourceConfigMixin from datahub.configuration.validate_field_rename import pydantic_renamed_field +from datahub.emitter.mce_builder import DEFAULT_ENV from datahub.ingestion.api.report import Report from datahub.ingestion.source.bigquery_v2.bigquery_config import ( BigQueryConnectionConfig, diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py index 3d1683100474e8..3e2872a4b5caa1 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py @@ -48,7 +48,7 @@ from datahub.ingestion.source.looker.looker_file_loader import LookerViewFileLoader from datahub.ingestion.source.looker.looker_lib_wrapper import LookerAPI from datahub.ingestion.source.looker.lookml_config import ( - _BASE_PROJECT_NAME, + BASE_PROJECT_NAME, LookMLSourceReport, ) from datahub.ingestion.source.looker.str_functions import remove_suffix @@ -370,7 +370,7 @@ def _form_field_name( assert view_name # for lint false positive project_include: ProjectInclude = ProjectInclude( - project=view_project_map.get(view_name, _BASE_PROJECT_NAME), + project=view_project_map.get(view_name, BASE_PROJECT_NAME), include=view_name, ) @@ -385,7 +385,7 @@ def _form_field_name( view_urn = LookerViewId( project_name=( project_include.project - if project_include.project != _BASE_PROJECT_NAME + if project_include.project != BASE_PROJECT_NAME else explore_project_name ), model_name=model_name, @@ -1113,7 +1113,7 @@ def from_api( # noqa: C901 fields=view_fields, upstream_views=list( ProjectInclude( - project=view_project_map.get(view_name, _BASE_PROJECT_NAME), + project=view_project_map.get(view_name, BASE_PROJECT_NAME), include=view_name, ) for view_name in views @@ -1239,7 +1239,7 @@ def _to_metadata_events( # noqa: C901 view_urn = LookerViewId( project_name=( view_ref.project - if view_ref.project != _BASE_PROJECT_NAME + if view_ref.project != BASE_PROJECT_NAME else self.project_name ), model_name=self.model_name, diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_dataclasses.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_dataclasses.py index 7e23079156b625..327c9ebf99bd20 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_dataclasses.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_dataclasses.py @@ -9,8 +9,8 @@ load_and_preprocess_file, ) from datahub.ingestion.source.looker.lookml_config import ( - _BASE_PROJECT_NAME, - _EXPLORE_FILE_EXTENSION, + BASE_PROJECT_NAME, + EXPLORE_FILE_EXTENSION, LookMLSourceConfig, LookMLSourceReport, ) @@ -69,7 +69,7 @@ def from_looker_dict( explore_files = [ x.include for x in resolved_includes - if x.include.endswith(_EXPLORE_FILE_EXTENSION) + if x.include.endswith(EXPLORE_FILE_EXTENSION) ] for included_file in explore_files: try: @@ -152,9 +152,9 @@ def resolve_includes( # As such, we try to handle it but are as defensive as possible. 
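A recurring move in this patch is relocating the environment constants: DEFAULT_ENV and ALL_ENV_TYPES are now defined in datahub.emitter.mce_builder, and configuration.source_common, the fivetran and powerbi configs, and the datajob entities import them from there. A sketch of the call-site pattern (normalize_env is an illustrative helper, not part of the patch):

    from typing import Optional

    from datahub.emitter.mce_builder import ALL_ENV_TYPES, DEFAULT_ENV

    def normalize_env(env: Optional[str]) -> str:
        # Accept the value only when it is a known FabricType option,
        # otherwise fall back to the default environment (PROD).
        if env and env.upper() in ALL_ENV_TYPES:
            return env.upper()
        return DEFAULT_ENV

This mirrors the checks in dataflow.py and datajob.py above, which now read builder.ALL_ENV_TYPES.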
non_base_project_name = project_name - if project_name == _BASE_PROJECT_NAME and root_project_name is not None: + if project_name == BASE_PROJECT_NAME and root_project_name is not None: non_base_project_name = root_project_name - if non_base_project_name != _BASE_PROJECT_NAME and inc.startswith( + if non_base_project_name != BASE_PROJECT_NAME and inc.startswith( f"/{non_base_project_name}/" ): # This might be a local include. Let's make sure that '/{project_name}' doesn't diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_file_loader.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_file_loader.py index f894c96debc54a..9fac0b52fde0dd 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_file_loader.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_file_loader.py @@ -9,8 +9,8 @@ load_and_preprocess_file, ) from datahub.ingestion.source.looker.lookml_config import ( - _EXPLORE_FILE_EXTENSION, - _VIEW_FILE_EXTENSION, + EXPLORE_FILE_EXTENSION, + VIEW_FILE_EXTENSION, LookMLSourceConfig, LookMLSourceReport, ) @@ -42,7 +42,7 @@ def _load_viewfile( ) -> Optional[LookerViewFile]: # always fully resolve paths to simplify de-dup path = str(pathlib.Path(path).resolve()) - allowed_extensions = [_VIEW_FILE_EXTENSION, _EXPLORE_FILE_EXTENSION] + allowed_extensions = [VIEW_FILE_EXTENSION, EXPLORE_FILE_EXTENSION] matched_any_extension = [ match for match in [path.endswith(x) for x in allowed_extensions] if match ] diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_usage.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_usage.py index 6a623e1e97b5dc..ef7d64e4f42d43 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_usage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_usage.py @@ -14,7 +14,7 @@ from looker_sdk.sdk.api40.models import Dashboard, LookWithQuery -from datahub.emitter.mce_builder import Aspect, AspectAbstract +from datahub.emitter.mce_builder import Aspect from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.ingestion.source.looker import looker_common from datahub.ingestion.source.looker.looker_common import ( @@ -40,6 +40,7 @@ DashboardUsageStatisticsClass, DashboardUserUsageCountsClass, TimeWindowSizeClass, + _Aspect as AspectAbstract, ) logger = logging.getLogger(__name__) diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_view_id_cache.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_view_id_cache.py index aa45bb72d1f462..562c7863b31343 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_view_id_cache.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_view_id_cache.py @@ -6,7 +6,7 @@ from datahub.ingestion.source.looker.looker_dataclasses import LookerModel from datahub.ingestion.source.looker.looker_file_loader import LookerViewFileLoader from datahub.ingestion.source.looker.lookml_config import ( - _BASE_PROJECT_NAME, + BASE_PROJECT_NAME, NAME, LookMLSourceReport, ) @@ -103,7 +103,7 @@ def get_looker_view_id( current_project_name: str = ( include.project - if include.project != _BASE_PROJECT_NAME + if include.project != BASE_PROJECT_NAME else self.project_name ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_config.py b/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_config.py index da837da1613864..7ffb895349ed29 100644 --- 
a/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_config.py @@ -33,17 +33,11 @@ NAME: str = "name" -_BASE_PROJECT_NAME = "__BASE" +BASE_PROJECT_NAME = "__BASE" -_EXPLORE_FILE_EXTENSION = ".explore.lkml" - -_VIEW_FILE_EXTENSION = ".view.lkml" - -_MODEL_FILE_EXTENSION = ".model.lkml" - -VIEW_LANGUAGE_LOOKML: str = "lookml" - -VIEW_LANGUAGE_SQL: str = "sql" +EXPLORE_FILE_EXTENSION = ".explore.lkml" +VIEW_FILE_EXTENSION = ".view.lkml" +MODEL_FILE_EXTENSION = ".model.lkml" DERIVED_VIEW_SUFFIX = r".sql_table_name" diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_refinement.py b/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_refinement.py index 892ed79754a1c2..6933d9d69394bc 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_refinement.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_refinement.py @@ -5,7 +5,7 @@ from datahub.ingestion.source.looker.looker_config import LookerConnectionDefinition from datahub.ingestion.source.looker.looker_dataclasses import LookerModel -from datahub.ingestion.source.looker.looker_view_id_cache import LookerViewFileLoader +from datahub.ingestion.source.looker.looker_file_loader import LookerViewFileLoader from datahub.ingestion.source.looker.lookml_config import ( NAME, LookMLSourceConfig, diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py b/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py index d258570ec384f7..3c83b8728aa6f7 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py @@ -57,10 +57,8 @@ LookerViewContext, ) from datahub.ingestion.source.looker.lookml_config import ( - _BASE_PROJECT_NAME, - _MODEL_FILE_EXTENSION, - VIEW_LANGUAGE_LOOKML, - VIEW_LANGUAGE_SQL, + BASE_PROJECT_NAME, + MODEL_FILE_EXTENSION, LookerConnectionDefinition, LookMLSourceConfig, LookMLSourceReport, @@ -98,6 +96,9 @@ ) from datahub.sql_parsing.sqlglot_lineage import ColumnRef +VIEW_LANGUAGE_LOOKML: str = "lookml" +VIEW_LANGUAGE_SQL: str = "sql" + logger = logging.getLogger(__name__) @@ -319,7 +320,7 @@ def _load_model(self, path: str) -> LookerModel: looker_model = LookerModel.from_looker_dict( parsed, - _BASE_PROJECT_NAME, + BASE_PROJECT_NAME, self.source_config.project_name, self.base_projects_folder, path, @@ -544,7 +545,7 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: self.source_config.base_folder = checkout_dir.resolve() self.base_projects_folder[ - _BASE_PROJECT_NAME + BASE_PROJECT_NAME ] = self.source_config.base_folder visited_projects: Set[str] = set() @@ -576,7 +577,7 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: self.base_projects_folder[project] = p_ref self._recursively_check_manifests( - tmp_dir, _BASE_PROJECT_NAME, visited_projects + tmp_dir, BASE_PROJECT_NAME, visited_projects ) yield from self.get_internal_workunits() @@ -607,7 +608,7 @@ def _recursively_check_manifests( return # Special case handling if the root project has a name in the manifest file. 
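With the renames above, the shared LookML constants lose their leading underscores and stay in lookml_config, while the view-language markers (VIEW_LANGUAGE_LOOKML, VIEW_LANGUAGE_SQL) move into lookml_source. A small sketch of the resulting import surface (the helper functions are illustrative, not part of the patch):

    from datahub.ingestion.source.looker.lookml_config import (
        BASE_PROJECT_NAME,
        EXPLORE_FILE_EXTENSION,
        MODEL_FILE_EXTENSION,
        VIEW_FILE_EXTENSION,
    )

    def is_lookml_asset(path: str) -> bool:
        # A file is a LookML asset when it carries one of the known extensions.
        return path.endswith(
            (VIEW_FILE_EXTENSION, EXPLORE_FILE_EXTENSION, MODEL_FILE_EXTENSION)
        )

    def effective_project(project: str, root_project: str) -> str:
        # "__BASE" is a placeholder that resolves to the root project name.
        return root_project if project == BASE_PROJECT_NAME else project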
- if project_name == _BASE_PROJECT_NAME and manifest.project_name: + if project_name == BASE_PROJECT_NAME and manifest.project_name: if ( self.source_config.project_name is not None and manifest.project_name != self.source_config.project_name @@ -696,7 +697,7 @@ def get_internal_workunits(self) -> Iterable[MetadataWorkUnit]: # noqa: C901 # The ** means "this directory and all subdirectories", and hence should # include all the files we want. model_files = sorted( - self.source_config.base_folder.glob(f"**/*{_MODEL_FILE_EXTENSION}") + self.source_config.base_folder.glob(f"**/*{MODEL_FILE_EXTENSION}") ) model_suffix_len = len(".model") @@ -832,7 +833,7 @@ def get_internal_workunits(self) -> Iterable[MetadataWorkUnit]: # noqa: C901 current_project_name: str = ( include.project - if include.project != _BASE_PROJECT_NAME + if include.project != BASE_PROJECT_NAME else project_name ) @@ -841,7 +842,7 @@ def get_internal_workunits(self) -> Iterable[MetadataWorkUnit]: # noqa: C901 base_folder_path: str = str( self.base_projects_folder.get( current_project_name, - self.base_projects_folder[_BASE_PROJECT_NAME], + self.base_projects_folder[BASE_PROJECT_NAME], ) ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/view_upstream.py b/metadata-ingestion/src/datahub/ingestion/source/looker/view_upstream.py index 057dbca4281849..632d0caf712323 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/view_upstream.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/view_upstream.py @@ -12,6 +12,7 @@ ViewField, ViewFieldType, ) +from datahub.ingestion.source.looker.looker_config import LookerConnectionDefinition from datahub.ingestion.source.looker.looker_view_id_cache import LookerViewIdCache from datahub.ingestion.source.looker.lookml_concept_context import ( LookerFieldContext, @@ -20,7 +21,6 @@ from datahub.ingestion.source.looker.lookml_config import ( DERIVED_VIEW_SUFFIX, NAME, - LookerConnectionDefinition, LookMLSourceConfig, LookMLSourceReport, ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/mongodb.py b/metadata-ingestion/src/datahub/ingestion/source/mongodb.py index c87b025f13b55d..bbc4897d227bac 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/mongodb.py +++ b/metadata-ingestion/src/datahub/ingestion/source/mongodb.py @@ -50,25 +50,23 @@ from datahub.ingestion.source.state.stateful_ingestion_base import ( StatefulIngestionSourceBase, ) -from datahub.metadata.com.linkedin.pegasus2avro.schema import ( +from datahub.metadata.schema_classes import ( ArrayTypeClass, BooleanTypeClass, BytesTypeClass, + DataPlatformInstanceClass, + DatasetPropertiesClass, NullTypeClass, NumberTypeClass, RecordTypeClass, - SchemaField, - SchemaFieldDataType, + SchemaFieldClass as SchemaField, + SchemaFieldDataTypeClass as SchemaFieldDataType, SchemalessClass, - SchemaMetadata, + SchemaMetadataClass as SchemaMetadata, StringTypeClass, TimeTypeClass, UnionTypeClass, ) -from datahub.metadata.schema_classes import ( - DataPlatformInstanceClass, - DatasetPropertiesClass, -) from datahub.metadata.urns import DatasetUrn logger = logging.getLogger(__name__) diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py index 7c8487727c9eee..91fa2e96be2cce 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py @@ -9,7 +9,7 @@ import datahub.emitter.mce_builder as builder from 
datahub.configuration.common import AllowDenyPattern, ConfigModel -from datahub.configuration.source_common import DEFAULT_ENV, DatasetSourceConfigMixin +from datahub.configuration.source_common import DatasetSourceConfigMixin from datahub.configuration.validate_field_deprecation import pydantic_field_deprecated from datahub.ingestion.source.common.subtypes import BIAssetSubTypes from datahub.ingestion.source.state.stale_entity_removal_handler import ( @@ -240,7 +240,7 @@ class PlatformDetail(ConfigModel): "recipe of other datahub sources.", ) env: str = pydantic.Field( - default=DEFAULT_ENV, + default=builder.DEFAULT_ENV, description="The environment that all assets produced by DataHub platform ingestion source belong to", ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/redash.py b/metadata-ingestion/src/datahub/ingestion/source/redash.py index 5fd63e7f93f92a..581e32d29dceaf 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/redash.py +++ b/metadata-ingestion/src/datahub/ingestion/source/redash.py @@ -41,7 +41,7 @@ ) from datahub.utilities.lossy_collections import LossyDict, LossyList from datahub.utilities.perf_timer import PerfTimer -from datahub.utilities.sql_parser import SQLParser +from datahub.utilities.sql_parser_base import SQLParser from datahub.utilities.threaded_iterator_executor import ThreadedIteratorExecutor logger = logging.getLogger(__name__) diff --git a/metadata-ingestion/src/datahub/ingestion/source/sac/sac.py b/metadata-ingestion/src/datahub/ingestion/source/sac/sac.py index de0904107b9bbe..66962b5d96d389 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sac/sac.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sac/sac.py @@ -13,12 +13,9 @@ from urllib3.util.retry import Retry from datahub.configuration.common import AllowDenyPattern -from datahub.configuration.source_common import ( - DEFAULT_ENV, - DatasetSourceConfigMixin, - EnvConfigMixin, -) +from datahub.configuration.source_common import DatasetSourceConfigMixin, EnvConfigMixin from datahub.emitter.mce_builder import ( + DEFAULT_ENV, dataset_urn_to_key, make_dashboard_urn, make_data_platform_urn, diff --git a/metadata-ingestion/src/datahub/ingestion/source/schema_inference/csv_tsv.py b/metadata-ingestion/src/datahub/ingestion/source/schema_inference/csv_tsv.py index 54f7dfb5b903c7..ab7b887cba1d80 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/schema_inference/csv_tsv.py +++ b/metadata-ingestion/src/datahub/ingestion/source/schema_inference/csv_tsv.py @@ -3,15 +3,15 @@ from tableschema import Table from datahub.ingestion.source.schema_inference.base import SchemaInferenceBase -from datahub.metadata.com.linkedin.pegasus2avro.schema import ( +from datahub.metadata.schema_classes import ( ArrayTypeClass, BooleanTypeClass, DateTypeClass, NullTypeClass, NumberTypeClass, RecordTypeClass, - SchemaField, - SchemaFieldDataType, + SchemaFieldClass as SchemaField, + SchemaFieldDataTypeClass as SchemaFieldDataType, StringTypeClass, TimeTypeClass, UnionTypeClass, diff --git a/metadata-ingestion/src/datahub/ingestion/source/schema_inference/json.py b/metadata-ingestion/src/datahub/ingestion/source/schema_inference/json.py index 1f2c73a2522d04..1659aaf6fa2020 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/schema_inference/json.py +++ b/metadata-ingestion/src/datahub/ingestion/source/schema_inference/json.py @@ -7,14 +7,14 @@ from datahub.ingestion.source.schema_inference.base import SchemaInferenceBase from datahub.ingestion.source.schema_inference.object 
import construct_schema -from datahub.metadata.com.linkedin.pegasus2avro.schema import ( +from datahub.metadata.schema_classes import ( ArrayTypeClass, BooleanTypeClass, NullTypeClass, NumberTypeClass, RecordTypeClass, - SchemaField, - SchemaFieldDataType, + SchemaFieldClass as SchemaField, + SchemaFieldDataTypeClass as SchemaFieldDataType, StringTypeClass, UnionTypeClass, ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/schema_inference/parquet.py b/metadata-ingestion/src/datahub/ingestion/source/schema_inference/parquet.py index 1f3f2e0a1e8a83..efc605e0df8cab 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/schema_inference/parquet.py +++ b/metadata-ingestion/src/datahub/ingestion/source/schema_inference/parquet.py @@ -4,7 +4,7 @@ import pyarrow.parquet from datahub.ingestion.source.schema_inference.base import SchemaInferenceBase -from datahub.metadata.com.linkedin.pegasus2avro.schema import ( +from datahub.metadata.schema_classes import ( ArrayTypeClass, BooleanTypeClass, BytesTypeClass, @@ -12,8 +12,8 @@ NullTypeClass, NumberTypeClass, RecordTypeClass, - SchemaField, - SchemaFieldDataType, + SchemaFieldClass as SchemaField, + SchemaFieldDataTypeClass as SchemaFieldDataType, StringTypeClass, TimeTypeClass, UnionTypeClass, diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py index 6f9c9259b27844..ac47abf4874499 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py @@ -31,14 +31,16 @@ ) from datahub.metadata.schema_classes import DatasetLineageTypeClass, UpstreamClass from datahub.sql_parsing.sql_parsing_aggregator import ( - ColumnLineageInfo, - ColumnRef, KnownLineageMapping, KnownQueryLineageInfo, SqlParsingAggregator, UrnStr, ) -from datahub.sql_parsing.sqlglot_lineage import DownstreamColumnRef +from datahub.sql_parsing.sqlglot_lineage import ( + ColumnLineageInfo, + ColumnRef, + DownstreamColumnRef, +) from datahub.utilities.perf_timer import PerfTimer from datahub.utilities.time import ts_millis_to_datetime diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py index dd7f73268fdc4f..538841018067e2 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py @@ -48,11 +48,9 @@ SnowflakeQueriesExtractor, SnowflakeQueriesExtractorConfig, ) +from datahub.ingestion.source.snowflake.snowflake_query import SnowflakeQuery from datahub.ingestion.source.snowflake.snowflake_report import SnowflakeV2Report -from datahub.ingestion.source.snowflake.snowflake_schema import ( - SnowflakeDataDictionary, - SnowflakeQuery, -) +from datahub.ingestion.source.snowflake.snowflake_schema import SnowflakeDataDictionary from datahub.ingestion.source.snowflake.snowflake_schema_gen import ( SnowflakeSchemaGenerator, ) diff --git a/metadata-ingestion/src/datahub/specific/dataset.py b/metadata-ingestion/src/datahub/specific/dataset.py index 9dd2616078f08d..b171dc4cc2939f 100644 --- a/metadata-ingestion/src/datahub/specific/dataset.py +++ b/metadata-ingestion/src/datahub/specific/dataset.py @@ -13,7 +13,7 @@ KafkaAuditHeaderClass, OwnerClass as Owner, OwnershipTypeClass, - SchemaMetadataClass as SchemaMetadata, + 
SchemaMetadataClass, SystemMetadataClass, TagAssociationClass as Tag, UpstreamClass as Upstream, @@ -40,7 +40,7 @@ def __init__( self.aspect_name = ( EditableSchemaMetadata.ASPECT_NAME if editable - else SchemaMetadata.ASPECT_NAME + else SchemaMetadataClass.ASPECT_NAME ) self.aspect_field = "editableSchemaFieldInfo" if editable else "schemaFieldInfo" diff --git a/metadata-ingestion/src/datahub/sql_parsing/_models.py b/metadata-ingestion/src/datahub/sql_parsing/_models.py index d92d178b81cf4b..d586e7d6d9045b 100644 --- a/metadata-ingestion/src/datahub/sql_parsing/_models.py +++ b/metadata-ingestion/src/datahub/sql_parsing/_models.py @@ -42,6 +42,8 @@ def __lt__(self, other: "_FrozenModel") -> bool: class _TableName(_FrozenModel): + # TODO: Move this into the schema_resolver.py file. + database: Optional[str] = None db_schema: Optional[str] = None table: str diff --git a/metadata-ingestion/src/datahub/sql_parsing/schema_resolver.py b/metadata-ingestion/src/datahub/sql_parsing/schema_resolver.py index e7b0527d30d978..e3f2fbc786b437 100644 --- a/metadata-ingestion/src/datahub/sql_parsing/schema_resolver.py +++ b/metadata-ingestion/src/datahub/sql_parsing/schema_resolver.py @@ -13,7 +13,7 @@ from datahub.ingestion.source.bigquery_v2.bigquery_audit import BigqueryTableIdentifier from datahub.metadata.schema_classes import SchemaFieldClass, SchemaMetadataClass from datahub.metadata.urns import DataPlatformUrn -from datahub.sql_parsing._models import _TableName +from datahub.sql_parsing._models import _TableName as _TableName # noqa: I250 from datahub.sql_parsing.sql_parsing_common import PLATFORMS_WITH_CASE_SENSITIVE_TABLES from datahub.utilities.file_backed_collections import ConnectionWrapper, FileBackedDict from datahub.utilities.urns.field_paths import get_simple_field_path_from_v2_field_path diff --git a/metadata-ingestion/src/datahub/testing/check_sql_parser_result.py b/metadata-ingestion/src/datahub/testing/check_sql_parser_result.py index 72b5f6c5e26e4b..13be45ec1be28d 100644 --- a/metadata-ingestion/src/datahub/testing/check_sql_parser_result.py +++ b/metadata-ingestion/src/datahub/testing/check_sql_parser_result.py @@ -6,12 +6,8 @@ import deepdiff from datahub.ingestion.source.bigquery_v2.bigquery_audit import BigqueryTableIdentifier -from datahub.sql_parsing.schema_resolver import SchemaResolver -from datahub.sql_parsing.sqlglot_lineage import ( - SchemaInfo, - SqlParsingResult, - sqlglot_lineage, -) +from datahub.sql_parsing.schema_resolver import SchemaInfo, SchemaResolver +from datahub.sql_parsing.sqlglot_lineage import SqlParsingResult, sqlglot_lineage logger = logging.getLogger(__name__) diff --git a/metadata-ingestion/src/datahub/utilities/urns/corp_group_urn.py b/metadata-ingestion/src/datahub/utilities/urns/corp_group_urn.py index 37c10769259459..577f90215a6353 100644 --- a/metadata-ingestion/src/datahub/utilities/urns/corp_group_urn.py +++ b/metadata-ingestion/src/datahub/utilities/urns/corp_group_urn.py @@ -1 +1,3 @@ -from datahub.metadata.urns import CorpGroupUrn # noqa: F401 +from datahub.metadata.urns import CorpGroupUrn + +__all__ = ["CorpGroupUrn"] diff --git a/metadata-ingestion/src/datahub/utilities/urns/corpuser_urn.py b/metadata-ingestion/src/datahub/utilities/urns/corpuser_urn.py index 5f9ecf65951b95..8acb86be00f6c8 100644 --- a/metadata-ingestion/src/datahub/utilities/urns/corpuser_urn.py +++ b/metadata-ingestion/src/datahub/utilities/urns/corpuser_urn.py @@ -1 +1,3 @@ -from datahub.metadata.urns import CorpUserUrn as CorpuserUrn # noqa: F401 +from 
datahub.metadata.urns import CorpUserUrn as CorpuserUrn + +__all__ = ["CorpuserUrn"] diff --git a/metadata-ingestion/src/datahub/utilities/urns/data_flow_urn.py b/metadata-ingestion/src/datahub/utilities/urns/data_flow_urn.py index 5b2b45927c339e..3508ae5c4a3490 100644 --- a/metadata-ingestion/src/datahub/utilities/urns/data_flow_urn.py +++ b/metadata-ingestion/src/datahub/utilities/urns/data_flow_urn.py @@ -1 +1,3 @@ -from datahub.metadata.urns import DataFlowUrn # noqa: F401 +from datahub.metadata.urns import DataFlowUrn + +__all__ = ["DataFlowUrn"] diff --git a/metadata-ingestion/src/datahub/utilities/urns/data_job_urn.py b/metadata-ingestion/src/datahub/utilities/urns/data_job_urn.py index 53e3419ee7ecb2..d003b6c6ad7a88 100644 --- a/metadata-ingestion/src/datahub/utilities/urns/data_job_urn.py +++ b/metadata-ingestion/src/datahub/utilities/urns/data_job_urn.py @@ -1 +1,3 @@ -from datahub.metadata.urns import DataJobUrn # noqa: F401 +from datahub.metadata.urns import DataJobUrn + +__all__ = ["DataJobUrn"] diff --git a/metadata-ingestion/src/datahub/utilities/urns/data_platform_urn.py b/metadata-ingestion/src/datahub/utilities/urns/data_platform_urn.py index 9d37e38f256e7f..51e013e715d4fd 100644 --- a/metadata-ingestion/src/datahub/utilities/urns/data_platform_urn.py +++ b/metadata-ingestion/src/datahub/utilities/urns/data_platform_urn.py @@ -1 +1,3 @@ -from datahub.metadata.urns import DataPlatformUrn # noqa: F401 +from datahub.metadata.urns import DataPlatformUrn + +__all__ = ["DataPlatformUrn"] diff --git a/metadata-ingestion/src/datahub/utilities/urns/data_process_instance_urn.py b/metadata-ingestion/src/datahub/utilities/urns/data_process_instance_urn.py index df6ba797d069c1..22e6b36c5f7aec 100644 --- a/metadata-ingestion/src/datahub/utilities/urns/data_process_instance_urn.py +++ b/metadata-ingestion/src/datahub/utilities/urns/data_process_instance_urn.py @@ -1 +1,3 @@ -from datahub.metadata.urns import DataProcessInstanceUrn # noqa: F401 +from datahub.metadata.urns import DataProcessInstanceUrn + +__all__ = ["DataProcessInstanceUrn"] diff --git a/metadata-ingestion/src/datahub/utilities/urns/dataset_urn.py b/metadata-ingestion/src/datahub/utilities/urns/dataset_urn.py index 6078ffefc03d85..1652e170599958 100644 --- a/metadata-ingestion/src/datahub/utilities/urns/dataset_urn.py +++ b/metadata-ingestion/src/datahub/utilities/urns/dataset_urn.py @@ -1 +1,3 @@ -from datahub.metadata.urns import DatasetUrn # noqa: F401 +from datahub.metadata.urns import DatasetUrn + +__all__ = ["DatasetUrn"] diff --git a/metadata-ingestion/src/datahub/utilities/urns/domain_urn.py b/metadata-ingestion/src/datahub/utilities/urns/domain_urn.py index 442a6b27729bba..242a3d8228320d 100644 --- a/metadata-ingestion/src/datahub/utilities/urns/domain_urn.py +++ b/metadata-ingestion/src/datahub/utilities/urns/domain_urn.py @@ -1 +1,3 @@ -from datahub.metadata.urns import DomainUrn # noqa: F401 +from datahub.metadata.urns import DomainUrn + +__all__ = ["DomainUrn"] diff --git a/metadata-ingestion/src/datahub/utilities/urns/notebook_urn.py b/metadata-ingestion/src/datahub/utilities/urns/notebook_urn.py index 60a4f5396aa468..f9b861d7f08524 100644 --- a/metadata-ingestion/src/datahub/utilities/urns/notebook_urn.py +++ b/metadata-ingestion/src/datahub/utilities/urns/notebook_urn.py @@ -1 +1,3 @@ -from datahub.metadata.urns import NotebookUrn # noqa: F401 +from datahub.metadata.urns import NotebookUrn + +__all__ = ["NotebookUrn"] diff --git a/metadata-ingestion/src/datahub/utilities/urns/structured_properties_urn.py 
b/metadata-ingestion/src/datahub/utilities/urns/structured_properties_urn.py index 5bd36a0656d99e..6774978c7a76d9 100644 --- a/metadata-ingestion/src/datahub/utilities/urns/structured_properties_urn.py +++ b/metadata-ingestion/src/datahub/utilities/urns/structured_properties_urn.py @@ -1,4 +1,6 @@ -from datahub.metadata.urns import StructuredPropertyUrn # noqa: F401 +from datahub.metadata.urns import StructuredPropertyUrn + +__all__ = ["StructuredPropertyUrn", "make_structured_property_urn"] def make_structured_property_urn(structured_property_id: str) -> str: diff --git a/metadata-ingestion/src/datahub/utilities/urns/tag_urn.py b/metadata-ingestion/src/datahub/utilities/urns/tag_urn.py index 0ac632ee40a015..f66d56a745a961 100644 --- a/metadata-ingestion/src/datahub/utilities/urns/tag_urn.py +++ b/metadata-ingestion/src/datahub/utilities/urns/tag_urn.py @@ -1 +1,3 @@ -from datahub.metadata.urns import TagUrn # noqa: F401 +from datahub.metadata.urns import TagUrn + +__all__ = ["TagUrn"] diff --git a/metadata-ingestion/src/datahub/utilities/urns/urn.py b/metadata-ingestion/src/datahub/utilities/urns/urn.py index 2e5cebfd0e8f55..2ded2d4d9b32c0 100644 --- a/metadata-ingestion/src/datahub/utilities/urns/urn.py +++ b/metadata-ingestion/src/datahub/utilities/urns/urn.py @@ -1,4 +1,6 @@ -from datahub.metadata.urns import Urn # noqa: F401 +from datahub.metadata.urns import Urn + +__all__ = ["Urn", "guess_entity_type"] def guess_entity_type(urn: str) -> str: diff --git a/metadata-ingestion/tests/integration/lookml/test_lookml.py b/metadata-ingestion/tests/integration/lookml/test_lookml.py index 94b3b103d0548c..a4cfbd5eadb7f3 100644 --- a/metadata-ingestion/tests/integration/lookml/test_lookml.py +++ b/metadata-ingestion/tests/integration/lookml/test_lookml.py @@ -12,16 +12,14 @@ from datahub.ingestion.run.pipeline import Pipeline from datahub.ingestion.source.file import read_metadata_file +from datahub.ingestion.source.looker.looker_dataclasses import LookerModel from datahub.ingestion.source.looker.looker_template_language import ( SpecialVariable, load_and_preprocess_file, resolve_liquid_variable, ) -from datahub.ingestion.source.looker.lookml_source import ( - LookerModel, - LookerRefinementResolver, - LookMLSourceConfig, -) +from datahub.ingestion.source.looker.lookml_config import LookMLSourceConfig +from datahub.ingestion.source.looker.lookml_refinement import LookerRefinementResolver from datahub.metadata.schema_classes import ( DatasetSnapshotClass, MetadataChangeEventClass, diff --git a/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py b/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py index edfc41616e44be..62f8f6a654b588 100644 --- a/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py +++ b/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py @@ -19,8 +19,7 @@ ) from tableauserverclient.models.reference_item import ResourceReference -from datahub.configuration.source_common import DEFAULT_ENV -from datahub.emitter.mce_builder import make_schema_field_urn +from datahub.emitter.mce_builder import DEFAULT_ENV, make_schema_field_urn from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.ingestion.run.pipeline import Pipeline, PipelineContext from datahub.ingestion.source.tableau.tableau import ( diff --git a/metadata-ingestion/tests/test_helpers/docker_helpers.py b/metadata-ingestion/tests/test_helpers/docker_helpers.py index 20aec975787e4e..d0e943bbe63daf 100644 --- 
a/metadata-ingestion/tests/test_helpers/docker_helpers.py +++ b/metadata-ingestion/tests/test_helpers/docker_helpers.py @@ -4,10 +4,10 @@ import pytest -from datahub.testing.docker_utils import ( # noqa: F401 - docker_compose_runner, - is_responsive, - wait_for_port, +from datahub.testing.docker_utils import ( # noqa: F401,I250 + docker_compose_runner as docker_compose_runner, + is_responsive as is_responsive, + wait_for_port as wait_for_port, ) logger = logging.getLogger(__name__) diff --git a/metadata-ingestion/tests/test_helpers/mce_helpers.py b/metadata-ingestion/tests/test_helpers/mce_helpers.py index 3b59481d8cb022..f4c629df7dba4e 100644 --- a/metadata-ingestion/tests/test_helpers/mce_helpers.py +++ b/metadata-ingestion/tests/test_helpers/mce_helpers.py @@ -17,15 +17,16 @@ Union, ) +import pytest + from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.ingestion.sink.file import write_metadata_file from datahub.metadata.schema_classes import MetadataChangeEventClass +from datahub.metadata.urns import Urn from datahub.testing.compare_metadata_json import ( assert_metadata_files_equal, load_json_file, ) -from datahub.utilities.urns.urn import Urn -from tests.test_helpers.type_helpers import PytestConfig logger = logging.getLogger(__name__) @@ -77,7 +78,7 @@ def clean_nones(value): def check_golden_file( - pytestconfig: PytestConfig, + pytestconfig: pytest.Config, output_path: Union[str, os.PathLike], golden_path: Union[str, os.PathLike], ignore_paths: Sequence[str] = (), @@ -98,7 +99,7 @@ def check_golden_file( def check_goldens_stream( - pytestconfig: PytestConfig, + pytestconfig: pytest.Config, outputs: List, golden_path: Union[str, os.PathLike], ignore_paths: Sequence[str] = (), diff --git a/metadata-ingestion/tests/test_helpers/type_helpers.py b/metadata-ingestion/tests/test_helpers/type_helpers.py index 154960bbf7fc42..3a2215ed81ca99 100644 --- a/metadata-ingestion/tests/test_helpers/type_helpers.py +++ b/metadata-ingestion/tests/test_helpers/type_helpers.py @@ -1,12 +1,5 @@ from typing import Optional, TypeVar -# The current PytestConfig solution is somewhat ugly and not ideal. -# However, it is currently the best solution available, as the type itself is not -# exported: https://docs.pytest.org/en/stable/reference.html#config. -# As pytest's type support improves, this will likely change. -# TODO: revisit pytestconfig as https://github.com/pytest-dev/pytest/issues/7469 progresses. 
-from _pytest.config import Config as PytestConfig # noqa: F401 - _T = TypeVar("_T") diff --git a/metadata-ingestion/tests/unit/glue/test_glue_source.py b/metadata-ingestion/tests/unit/glue/test_glue_source.py index 4df0c6d17b06cc..693fd6bc336fd3 100644 --- a/metadata-ingestion/tests/unit/glue/test_glue_source.py +++ b/metadata-ingestion/tests/unit/glue/test_glue_source.py @@ -34,7 +34,6 @@ run_and_get_pipeline, validate_all_providers_have_committed_successfully, ) -from tests.test_helpers.type_helpers import PytestConfig from tests.unit.glue.test_glue_source_stubs import ( databases_1, databases_2, @@ -174,7 +173,7 @@ def test_column_type(hive_column_type: str, expected_type: Type) -> None: @freeze_time(FROZEN_TIME) def test_glue_ingest( tmp_path: Path, - pytestconfig: PytestConfig, + pytestconfig: pytest.Config, platform_instance: str, mce_file: str, mce_golden_file: str, @@ -410,7 +409,7 @@ def test_glue_stateful(pytestconfig, tmp_path, mock_time, mock_datahub_graph): def test_glue_with_delta_schema_ingest( tmp_path: Path, - pytestconfig: PytestConfig, + pytestconfig: pytest.Config, ) -> None: glue_source_instance = glue_source( platform_instance="delta_platform_instance", @@ -446,7 +445,7 @@ def test_glue_with_delta_schema_ingest( def test_glue_with_malformed_delta_schema_ingest( tmp_path: Path, - pytestconfig: PytestConfig, + pytestconfig: pytest.Config, ) -> None: glue_source_instance = glue_source( platform_instance="delta_platform_instance", @@ -489,7 +488,7 @@ def test_glue_with_malformed_delta_schema_ingest( @freeze_time(FROZEN_TIME) def test_glue_ingest_include_table_lineage( tmp_path: Path, - pytestconfig: PytestConfig, + pytestconfig: pytest.Config, mock_datahub_graph_instance: DataHubGraph, platform_instance: str, mce_file: str, @@ -584,7 +583,7 @@ def test_glue_ingest_include_table_lineage( @freeze_time(FROZEN_TIME) def test_glue_ingest_include_column_lineage( tmp_path: Path, - pytestconfig: PytestConfig, + pytestconfig: pytest.Config, mock_datahub_graph_instance: DataHubGraph, platform_instance: str, mce_file: str, @@ -684,7 +683,7 @@ def fake_schema_metadata(entity_urn: str) -> models.SchemaMetadataClass: @freeze_time(FROZEN_TIME) def test_glue_ingest_with_profiling( tmp_path: Path, - pytestconfig: PytestConfig, + pytestconfig: pytest.Config, ) -> None: glue_source_instance = glue_source_with_profiling() mce_file = "glue_mces.json" diff --git a/metadata-ingestion/tests/unit/redshift/test_redshift_source.py b/metadata-ingestion/tests/unit/redshift/test_redshift_source.py index 8198caf50df7f4..f016312dfe47fb 100644 --- a/metadata-ingestion/tests/unit/redshift/test_redshift_source.py +++ b/metadata-ingestion/tests/unit/redshift/test_redshift_source.py @@ -1,15 +1,15 @@ from typing import Iterable -from datahub.emitter.mcp import ( - MetadataChangeProposalClass, - MetadataChangeProposalWrapper, -) +from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.source.redshift.config import RedshiftConfig from datahub.ingestion.source.redshift.redshift import RedshiftSource from datahub.ingestion.source.redshift.redshift_schema import RedshiftTable -from datahub.metadata.schema_classes import MetadataChangeEventClass +from datahub.metadata.schema_classes import ( + MetadataChangeEventClass, + MetadataChangeProposalClass, +) def redshift_source_setup(custom_props_flag: bool) -> Iterable[MetadataWorkUnit]: diff --git 
a/metadata-ingestion/tests/unit/serde/test_serde.py b/metadata-ingestion/tests/unit/serde/test_serde.py index 727f2b10511b5e..a131ac9ce2a1bc 100644 --- a/metadata-ingestion/tests/unit/serde/test_serde.py +++ b/metadata-ingestion/tests/unit/serde/test_serde.py @@ -19,7 +19,6 @@ from datahub.metadata.schemas import getMetadataChangeEventSchema from tests.test_helpers import mce_helpers from tests.test_helpers.click_helpers import run_datahub_cmd -from tests.test_helpers.type_helpers import PytestConfig FROZEN_TIME = "2021-07-22 18:54:06" @@ -41,7 +40,7 @@ ], ) def test_serde_to_json( - pytestconfig: PytestConfig, tmp_path: pathlib.Path, json_filename: str + pytestconfig: pytest.Config, tmp_path: pathlib.Path, json_filename: str ) -> None: golden_file = pytestconfig.rootpath / json_filename output_file = tmp_path / "output.json" @@ -73,7 +72,7 @@ def test_serde_to_json( ) @freeze_time(FROZEN_TIME) def test_serde_to_avro( - pytestconfig: PytestConfig, + pytestconfig: pytest.Config, json_filename: str, ) -> None: # In this test, we want to read in from JSON -> MCE object. @@ -126,14 +125,14 @@ def test_serde_to_avro( ], ) @freeze_time(FROZEN_TIME) -def test_check_metadata_schema(pytestconfig: PytestConfig, json_filename: str) -> None: +def test_check_metadata_schema(pytestconfig: pytest.Config, json_filename: str) -> None: json_file_path = pytestconfig.rootpath / json_filename run_datahub_cmd(["check", "metadata-file", f"{json_file_path}"]) def test_check_metadata_rewrite( - pytestconfig: PytestConfig, tmp_path: pathlib.Path + pytestconfig: pytest.Config, tmp_path: pathlib.Path ) -> None: json_input = ( pytestconfig.rootpath / "tests/unit/serde/test_canonicalization_input.json" @@ -161,7 +160,7 @@ def test_check_metadata_rewrite( ], ) def test_check_mce_schema_failure( - pytestconfig: PytestConfig, json_filename: str + pytestconfig: pytest.Config, json_filename: str ) -> None: json_file_path = pytestconfig.rootpath / json_filename diff --git a/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_utils.py b/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_utils.py index 744d43373a0a1f..4e8ba8aa6b7770 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_utils.py +++ b/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_utils.py @@ -4,11 +4,9 @@ import pytest import sqlglot +from datahub.sql_parsing.query_types import get_query_type_of_sql from datahub.sql_parsing.sql_parsing_common import QueryType -from datahub.sql_parsing.sqlglot_lineage import ( - _UPDATE_ARGS_NOT_SUPPORTED_BY_SELECT, - get_query_type_of_sql, -) +from datahub.sql_parsing.sqlglot_lineage import _UPDATE_ARGS_NOT_SUPPORTED_BY_SELECT from datahub.sql_parsing.sqlglot_utils import ( generalize_query, generalize_query_fast, diff --git a/metadata-ingestion/tests/unit/stateful_ingestion/state/test_stateful_ingestion.py b/metadata-ingestion/tests/unit/stateful_ingestion/state/test_stateful_ingestion.py index 66564dc856abae..96ab8f7a01a386 100644 --- a/metadata-ingestion/tests/unit/stateful_ingestion/state/test_stateful_ingestion.py +++ b/metadata-ingestion/tests/unit/stateful_ingestion/state/test_stateful_ingestion.py @@ -10,7 +10,8 @@ from datahub.api.entities.dataprocess.dataprocess_instance import DataProcessInstance from datahub.configuration.common import AllowDenyPattern -from datahub.configuration.source_common import DEFAULT_ENV, DatasetSourceConfigMixin +from datahub.configuration.source_common import DatasetSourceConfigMixin +from datahub.emitter.mce_builder import DEFAULT_ENV from datahub.emitter.mcp 
import MetadataChangeProposalWrapper from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceReport From 5519a330e2d0f2cb3a3c9a75cdcdefaff2673078 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Wed, 20 Nov 2024 13:33:54 -0800 Subject: [PATCH 14/24] chore(ingest): bump black (#11898) --- .../examples/cli_usage/gen_schemas.py | 1 - metadata-ingestion/setup.py | 21 ++++++++++++------- .../structuredproperties.py | 1 - .../src/datahub/ingestion/fs/s3_fs.py | 1 - .../glossary/classification_mixin.py | 2 -- .../source/bigquery_v2/bigquery_config.py | 1 - .../source/bigquery_v2/bigquery_schema_gen.py | 4 ---- .../ingestion/source/bigquery_v2/lineage.py | 2 -- .../source/bigquery_v2/queries_extractor.py | 4 ---- .../ingestion/source/cassandra/cassandra.py | 1 - .../source/confluent_schema_registry.py | 1 - .../source/data_lake_common/config.py | 1 - .../source/datahub/datahub_source.py | 1 - .../ingestion/source/dremio/dremio_api.py | 2 -- .../ingestion/source/dremio/dremio_config.py | 1 - .../ingestion/source/dremio/dremio_source.py | 1 - .../ingestion/source/dynamodb/dynamodb.py | 1 - .../source/gc/execution_request_cleanup.py | 1 - .../ingestion/source/ge_profiling_config.py | 1 - .../ingestion/source/looker/looker_common.py | 5 ----- .../ingestion/source/looker/looker_source.py | 2 -- .../source/looker/looker_template_language.py | 3 --- .../source/looker/lookml_concept_context.py | 2 -- .../ingestion/source/looker/lookml_source.py | 3 --- .../ingestion/source/looker/view_upstream.py | 3 --- .../ingestion/source/metadata/lineage.py | 1 - .../src/datahub/ingestion/source/nifi.py | 1 - .../powerbi/m_query/native_sql_parser.py | 1 - .../source/powerbi/m_query/parser.py | 1 - .../source/powerbi/m_query/resolver.py | 6 ------ .../powerbi/rest_api_wrapper/data_resolver.py | 2 -- .../powerbi/rest_api_wrapper/powerbi_api.py | 2 -- .../powerbi_report_server/report_server.py | 1 - .../ingestion/source/redshift/redshift.py | 1 - .../src/datahub/ingestion/source/s3/source.py | 1 - .../src/datahub/ingestion/source/sac/sac.py | 1 - .../source/snowflake/snowflake_lineage_v2.py | 1 - .../ingestion/source/sql/cockroachdb.py | 1 - .../datahub/ingestion/source/sql/oracle.py | 6 ------ .../source/state/entity_removal_state.py | 6 +++++- .../ingestion/source/tableau/tableau.py | 1 - .../transformer/add_dataset_dataproduct.py | 1 - .../ingestion/transformer/add_dataset_tags.py | 1 - .../extract_ownership_from_tags.py | 1 - .../transformer/replace_external_url.py | 1 - .../ingestion/transformer/tags_to_terms.py | 1 - .../snowflake/metric_sql_generator.py | 1 - .../src/datahub/specific/dashboard.py | 1 - .../sql_parsing/sql_parsing_aggregator.py | 1 - .../datahub/sql_parsing/sqlglot_lineage.py | 1 - .../sql_parsing/tool_meta_extractor.py | 1 - .../src/datahub/testing/mcp_diff.py | 2 +- .../src/datahub/utilities/mapping.py | 2 -- .../utilities/threaded_iterator_executor.py | 1 - .../integration/azure_ad/test_azure_ad.py | 1 - .../bigquery_v2/test_bigquery_queries.py | 1 - .../tests/integration/dremio/test_dremio.py | 1 - .../tests/integration/looker/test_looker.py | 1 - .../tests/integration/lookml/test_lookml.py | 1 - .../tests/integration/okta/test_okta.py | 3 --- .../tests/integration/oracle/common.py | 1 - .../integration/powerbi/test_m_parser.py | 2 -- .../tests/integration/powerbi/test_powerbi.py | 9 -------- .../integration/qlik_sense/test_qlik_sense.py | 2 -- .../tests/integration/sigma/test_sigma.py | 3 --- 
.../tests/integration/snowflake/common.py | 1 - .../tableau/test_tableau_ingest.py | 2 -- .../unity/test_unity_catalog_ingest.py | 1 - .../performance/snowflake/test_snowflake.py | 1 - .../entities/common/test_serialized_value.py | 3 --- .../test_platform_resource.py | 1 - .../test_incremental_lineage_helper.py | 1 - .../unit/bigquery/test_bigquery_lineage.py | 2 -- .../bigquery/test_bigqueryv2_usage_source.py | 1 - .../unit/redshift/test_redshift_lineage.py | 1 - .../unit/sql_parsing/test_sql_aggregator.py | 1 - .../test_stale_entity_removal_handler.py | 1 - .../tests/unit/test_cassandra_source.py | 1 - 78 files changed, 19 insertions(+), 137 deletions(-) diff --git a/metadata-ingestion/examples/cli_usage/gen_schemas.py b/metadata-ingestion/examples/cli_usage/gen_schemas.py index 2fd4683347a3ba..80b2c6712977ad 100644 --- a/metadata-ingestion/examples/cli_usage/gen_schemas.py +++ b/metadata-ingestion/examples/cli_usage/gen_schemas.py @@ -28,7 +28,6 @@ class CorpGroupFile(BaseModel): with open("user/user.dhub.yaml_schema.json", "w") as fp: - fp.write(json.dumps(CorpUserFile.schema(), indent=4)) with open("group/group.dhub.yaml_schema.json", "w") as fp: diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index 2469af74b03343..c6530c51c949d0 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -591,22 +591,26 @@ "memray", } -base_dev_requirements = { - *base_requirements, - *framework_common, - *mypy_stubs, - *s3_base, +lint_requirements = { # This is pinned only to avoid spurious errors in CI. # We should make an effort to keep it up to date. - "black==22.12.0", - "coverage>=5.1", - "faker>=18.4.0", + "black==23.3.0", "flake8>=6.0.0", "flake8-tidy-imports>=4.3.0", "flake8-bugbear==23.3.12", "isort>=5.7.0", "mypy==1.10.1", +} + +base_dev_requirements = { + *base_requirements, + *framework_common, + *mypy_stubs, + *s3_base, + *lint_requirements, *test_api_requirements, + "coverage>=5.1", + "faker>=18.4.0", "pytest-asyncio>=0.16.0", "pytest-cov>=2.8.1", "pytest-random-order~=1.1.0", @@ -931,6 +935,7 @@ ), "cloud": ["acryl-datahub-cloud"], "dev": list(dev_requirements), + "lint": list(lint_requirements), "testing-utils": list(test_api_requirements), # To import `datahub.testing` "integration-tests": list(full_test_dev_requirements), "debug": list(debug_requirements), diff --git a/metadata-ingestion/src/datahub/api/entities/structuredproperties/structuredproperties.py b/metadata-ingestion/src/datahub/api/entities/structuredproperties/structuredproperties.py index b48c655015d825..56e02e4329055a 100644 --- a/metadata-ingestion/src/datahub/api/entities/structuredproperties/structuredproperties.py +++ b/metadata-ingestion/src/datahub/api/entities/structuredproperties/structuredproperties.py @@ -190,7 +190,6 @@ def create(file: str, graph: Optional[DataHubGraph] = None) -> None: @classmethod def from_datahub(cls, graph: DataHubGraph, urn: str) -> "StructuredProperties": - with StructuredPropertiesConfig.use_graph(graph): structured_property: Optional[ StructuredPropertyDefinitionClass diff --git a/metadata-ingestion/src/datahub/ingestion/fs/s3_fs.py b/metadata-ingestion/src/datahub/ingestion/fs/s3_fs.py index a135b7b6ce8375..9c34c4f83b0a93 100644 --- a/metadata-ingestion/src/datahub/ingestion/fs/s3_fs.py +++ b/metadata-ingestion/src/datahub/ingestion/fs/s3_fs.py @@ -32,7 +32,6 @@ def __str__(self): class S3ListIterator(Iterator): - MAX_KEYS = 1000 def __init__( diff --git a/metadata-ingestion/src/datahub/ingestion/glossary/classification_mixin.py 
b/metadata-ingestion/src/datahub/ingestion/glossary/classification_mixin.py index 1d381acbf3dbe9..98c43079a3bc15 100644 --- a/metadata-ingestion/src/datahub/ingestion/glossary/classification_mixin.py +++ b/metadata-ingestion/src/datahub/ingestion/glossary/classification_mixin.py @@ -33,7 +33,6 @@ @dataclass class ClassificationReportMixin: - num_tables_fetch_sample_values_failed: int = 0 num_tables_classification_attempted: int = 0 @@ -112,7 +111,6 @@ def classify_schema_fields( schema_metadata: SchemaMetadata, sample_data: Union[Dict[str, list], Callable[[], Dict[str, list]]], ) -> None: - if not isinstance(sample_data, Dict): try: # TODO: In future, sample_data fetcher can be lazily called if classification diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py index ad293c702a5205..4af41921c9fa3c 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py @@ -374,7 +374,6 @@ class BigQueryV2Config( StatefulProfilingConfigMixin, ClassificationSourceConfigMixin, ): - include_schema_metadata: bool = Field( default=True, description="Whether to ingest the BigQuery schema, i.e. projects, schemas, tables, and views.", diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py index 345467ab76c866..6f3008ccfd6923 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py @@ -356,7 +356,6 @@ def _process_project( project_id ) except Exception as e: - if self.config.project_ids and "not enabled BigQuery." in str(e): action_mesage = ( "The project has not enabled BigQuery API. " @@ -417,7 +416,6 @@ def _process_project_datasets( bigquery_project: BigqueryProject, db_tables: Dict[str, List[BigqueryTable]], ) -> Iterable[MetadataWorkUnit]: - db_views: Dict[str, List[BigqueryView]] = {} db_snapshots: Dict[str, List[BigqueryTableSnapshot]] = {} project_id = bigquery_project.id @@ -1141,7 +1139,6 @@ def gen_schema_metadata( columns: List[BigqueryColumn], dataset_name: BigqueryTableIdentifier, ) -> MetadataWorkUnit: - foreign_keys: List[ForeignKeyConstraint] = [] # Foreign keys only make sense for tables if isinstance(table, BigqueryTable): @@ -1183,7 +1180,6 @@ def get_tables_for_dataset( ) -> Iterable[BigqueryTable]: # In bigquery there is no way to query all tables in a Project id with PerfTimer() as timer: - # PARTITIONS INFORMATION_SCHEMA view is not available for BigLake tables # based on Amazon S3 and Blob Storage data. 
# https://cloud.google.com/bigquery/docs/omni-introduction#limitations diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py index b542992a7924a0..321b1b6207fabf 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py @@ -934,7 +934,6 @@ def gen_lineage_workunits_for_external_table( ddl: Optional[str], graph: Optional[DataHubGraph] = None, ) -> Iterable[MetadataWorkUnit]: - if not ddl: return @@ -972,7 +971,6 @@ def get_lineage_for_external_table( source_uris: List[str], graph: Optional[DataHubGraph] = None, ) -> Optional[UpstreamLineageClass]: - upstreams_list: List[UpstreamClass] = [] fine_grained_lineages: List[FineGrainedLineageClass] = [] gcs_urns: Set[str] = set() diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/queries_extractor.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/queries_extractor.py index 497947abe4ef9a..91d55ad879e04a 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/queries_extractor.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/queries_extractor.py @@ -304,7 +304,6 @@ def get_workunits_internal( def deduplicate_queries( self, queries: FileBackedList[ObservedQuery] ) -> FileBackedDict[Dict[int, ObservedQuery]]: - # This fingerprint based deduplication is done here to reduce performance hit due to # repetitive sql parsing while adding observed query to aggregator that would otherwise # parse same query multiple times. In future, aggregator may absorb this deduplication. @@ -342,7 +341,6 @@ def deduplicate_queries( return queries_deduped def fetch_query_log(self, project: BigqueryProject) -> Iterable[ObservedQuery]: - # Multi-regions from https://cloud.google.com/bigquery/docs/locations#supported_locations regions = self.config.region_qualifiers @@ -355,7 +353,6 @@ def fetch_query_log(self, project: BigqueryProject) -> Iterable[ObservedQuery]: def fetch_region_query_log( self, project: BigqueryProject, region: str ) -> Iterable[ObservedQuery]: - # Each region needs to be a different query query_log_query = _build_enriched_query_log_query( project_id=project.id, @@ -452,7 +449,6 @@ def _build_enriched_query_log_query( start_time: datetime, end_time: datetime, ) -> str: - audit_start_time = start_time.strftime(BQ_DATETIME_FORMAT) audit_end_time = end_time.strftime(BQ_DATETIME_FORMAT) diff --git a/metadata-ingestion/src/datahub/ingestion/source/cassandra/cassandra.py b/metadata-ingestion/src/datahub/ingestion/source/cassandra/cassandra.py index 6a5236563f48db..dcdccc08ce0483 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/cassandra/cassandra.py +++ b/metadata-ingestion/src/datahub/ingestion/source/cassandra/cassandra.py @@ -332,7 +332,6 @@ def _extract_columns_from_table( def _extract_views_from_keyspace( self, keyspace_name: str ) -> Iterable[MetadataWorkUnit]: - views: List[CassandraView] = self.cassandra_api.get_views(keyspace_name) for view in views: view_name: str = view.view_name diff --git a/metadata-ingestion/src/datahub/ingestion/source/confluent_schema_registry.py b/metadata-ingestion/src/datahub/ingestion/source/confluent_schema_registry.py index 09ce8b5b05203c..ed51487ea6dab2 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/confluent_schema_registry.py +++ b/metadata-ingestion/src/datahub/ingestion/source/confluent_schema_registry.py @@ -371,7 
+371,6 @@ def _get_schema_fields( def _get_schema_metadata( self, topic: str, platform_urn: str, is_subject: bool ) -> Optional[SchemaMetadata]: - # Process the value schema schema, fields = self._get_schema_and_fields( topic=topic, diff --git a/metadata-ingestion/src/datahub/ingestion/source/data_lake_common/config.py b/metadata-ingestion/src/datahub/ingestion/source/data_lake_common/config.py index 5f88cf0234947a..ede7d3c3c56959 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/data_lake_common/config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/data_lake_common/config.py @@ -7,7 +7,6 @@ class PathSpecsConfigMixin(ConfigModel): - path_specs: List[PathSpec] = Field( description="List of PathSpec. See [below](#path-spec) the details about PathSpec" ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_source.py b/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_source.py index de212ca9a67716..63cea45f75864b 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_source.py @@ -107,7 +107,6 @@ def _get_database_workunits( logger.info(f"Fetching database aspects starting from {from_createdon}") mcps = reader.get_aspects(from_createdon, self.report.stop_time) for i, (mcp, createdon) in enumerate(mcps): - if not self.urn_pattern.allowed(str(mcp.entityUrn)): continue diff --git a/metadata-ingestion/src/datahub/ingestion/source/dremio/dremio_api.py b/metadata-ingestion/src/datahub/ingestion/source/dremio/dremio_api.py index db83dde7cf6131..7b9ccb52acbef4 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/dremio/dremio_api.py +++ b/metadata-ingestion/src/datahub/ingestion/source/dremio/dremio_api.py @@ -566,7 +566,6 @@ def get_all_tables_and_columns(self, containers: Deque) -> List[Dict]: return tables def validate_schema_format(self, schema): - if "." 
in schema: schema_path = self.get( url=f"/catalog/{self.get_dataset_id(schema=schema, dataset='')}" @@ -687,7 +686,6 @@ def traverse_path(location_id: str, entity_path: List[str]) -> List: response.get("entityType") == DremioEntityContainerType.FOLDER.value.lower() ): - containers.append( { "id": location_id, diff --git a/metadata-ingestion/src/datahub/ingestion/source/dremio/dremio_config.py b/metadata-ingestion/src/datahub/ingestion/source/dremio/dremio_config.py index 9d6f65b95554e7..d966d575c03320 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/dremio/dremio_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/dremio/dremio_config.py @@ -121,7 +121,6 @@ class DremioSourceConfig( EnvConfigMixin, PlatformInstanceConfigMixin, ): - domain: Optional[str] = Field( default=None, description="Domain for all source objects.", diff --git a/metadata-ingestion/src/datahub/ingestion/source/dremio/dremio_source.py b/metadata-ingestion/src/datahub/ingestion/source/dremio/dremio_source.py index cd6ba441b5c93b..5b96845ec04961 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/dremio/dremio_source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/dremio/dremio_source.py @@ -198,7 +198,6 @@ def _build_source_map(self) -> Dict[str, Dict]: source_platform_name = source_name for mapping in self.config.source_mappings or []: - if re.search(mapping.source_name, source_type, re.IGNORECASE): source_platform_name = mapping.source_name.lower() diff --git a/metadata-ingestion/src/datahub/ingestion/source/dynamodb/dynamodb.py b/metadata-ingestion/src/datahub/ingestion/source/dynamodb/dynamodb.py index 4f1de6fb06c695..cb3f0dd9cf29f4 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/dynamodb/dynamodb.py +++ b/metadata-ingestion/src/datahub/ingestion/source/dynamodb/dynamodb.py @@ -233,7 +233,6 @@ def _process_table( table_name: str, dataset_name: str, ) -> Iterable[MetadataWorkUnit]: - logger.debug(f"Processing table: {dataset_name}") table_info = dynamodb_client.describe_table(TableName=table_name)["Table"] account_id = table_info["TableArn"].split(":")[4] diff --git a/metadata-ingestion/src/datahub/ingestion/source/gc/execution_request_cleanup.py b/metadata-ingestion/src/datahub/ingestion/source/gc/execution_request_cleanup.py index 570df4e99ab13d..3baf858e44cdc8 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/gc/execution_request_cleanup.py +++ b/metadata-ingestion/src/datahub/ingestion/source/gc/execution_request_cleanup.py @@ -69,7 +69,6 @@ def __init__( report: DatahubExecutionRequestCleanupReport, config: Optional[DatahubExecutionRequestCleanupConfig] = None, ) -> None: - self.graph = graph self.report = report self.instance_id = int(time.time()) diff --git a/metadata-ingestion/src/datahub/ingestion/source/ge_profiling_config.py b/metadata-ingestion/src/datahub/ingestion/source/ge_profiling_config.py index 8b2443a589b8dc..c20506e36a844f 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/ge_profiling_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/ge_profiling_config.py @@ -95,7 +95,6 @@ class GEProfilingBaseConfig(ConfigModel): class GEProfilingConfig(GEProfilingBaseConfig): - report_dropped_profiles: bool = Field( default=False, description="Whether to report datasets or dataset columns which were not profiled. 
Set to `True` for debugging purposes.", diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py index 3e2872a4b5caa1..57a251ef2ed14f 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py @@ -307,7 +307,6 @@ def view_fields_from_dict( type_cls: ViewFieldType, populate_sql_logic_in_descriptions: bool, ) -> "ViewField": - is_primary_key = field_dict.get("primary_key", "no") == "yes" name = field_dict["name"] @@ -929,7 +928,6 @@ def from_api( # noqa: C901 reporter: SourceReport, source_config: LookerDashboardSourceConfig, ) -> Optional["LookerExplore"]: # noqa: C901 - try: explore = client.lookml_model_explore(model, explore_name) views: Set[str] = set() @@ -987,13 +985,11 @@ def from_api( # noqa: C901 field_name_vs_raw_explore_field: Dict = {} if explore.fields is not None: - if explore.fields.dimensions is not None: for dim_field in explore.fields.dimensions: if dim_field.name is None: continue else: - field_name_vs_raw_explore_field[dim_field.name] = dim_field view_fields.append( @@ -1034,7 +1030,6 @@ def from_api( # noqa: C901 if measure_field.name is None: continue else: - field_name_vs_raw_explore_field[ measure_field.name ] = measure_field diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py index e42ac7b61c1777..cd8ccb8217257c 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py @@ -604,7 +604,6 @@ def _get_folder_browse_path_v2_entries( def _create_platform_instance_aspect( self, ) -> DataPlatformInstance: - assert ( self.source_config.platform_name ), "Platform name is not set in the configuration." @@ -999,7 +998,6 @@ def _gen_folder_key(self, folder_id: str) -> LookerFolderKey: def _make_dashboard_and_chart_mces( self, looker_dashboard: LookerDashboard ) -> Iterable[Union[MetadataChangeEvent, MetadataChangeProposalWrapper]]: - # Step 1: Emit metadata for each Chart inside the Dashboard. 
chart_events = [] for element in looker_dashboard.dashboard_elements: diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_template_language.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_template_language.py index 1e60c08fe00c2b..6d49d57e077435 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_template_language.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_template_language.py @@ -55,7 +55,6 @@ def _create_new_liquid_variables_with_default( current_dict: dict = new_dict for key in keys[:-1]: - if key not in current_dict: current_dict[key] = {} @@ -392,7 +391,6 @@ def process_lookml_template_language( source_config: LookMLSourceConfig, view_lkml_file_dict: dict, ) -> None: - if "views" not in view_lkml_file_dict: return @@ -425,7 +423,6 @@ def load_and_preprocess_file( path: Union[str, pathlib.Path], source_config: LookMLSourceConfig, ) -> dict: - parsed = load_lkml(path) process_lookml_template_language( diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_concept_context.py b/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_concept_context.py index ce4a242027e11a..80be566cdcd468 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_concept_context.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_concept_context.py @@ -320,7 +320,6 @@ def get_including_extends( self, field: str, ) -> Optional[Any]: - # According to Looker's inheritance rules, we need to merge the fields(i.e. dimensions, measures and # dimension_groups) from both the child and parent. if field in [DIMENSIONS, DIMENSION_GROUPS, MEASURES]: @@ -345,7 +344,6 @@ def _get_sql_table_name_field(self) -> Optional[str]: return self.get_including_extends(field="sql_table_name") def _is_dot_sql_table_name_present(self) -> bool: - sql_table_name: Optional[str] = self._get_sql_table_name_field() if sql_table_name is None: diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py b/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py index 3c83b8728aa6f7..c7d3724472d3c8 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py @@ -144,7 +144,6 @@ def from_looker_dict( extract_col_level_lineage: bool = False, populate_sql_logic_in_descriptions: bool = False, ) -> Optional["LookerView"]: - view_name = view_context.name() logger.debug(f"Handling view {view_name} in model {model_name}") @@ -418,7 +417,6 @@ def _get_custom_properties(self, looker_view: LookerView) -> DatasetPropertiesCl def _build_dataset_mcps( self, looker_view: LookerView ) -> List[MetadataChangeProposalWrapper]: - view_urn = looker_view.id.get_urn(self.source_config) subTypeEvent = MetadataChangeProposalWrapper( @@ -502,7 +500,6 @@ def get_project_name(self, model_name: str) -> str: def get_manifest_if_present(self, folder: pathlib.Path) -> Optional[LookerManifest]: manifest_file = folder / "manifest.lkml" if manifest_file.exists(): - manifest_dict = load_and_preprocess_file( path=manifest_file, source_config=self.source_config ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/view_upstream.py b/metadata-ingestion/src/datahub/ingestion/source/looker/view_upstream.py index 632d0caf712323..8cec6f2607774e 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/view_upstream.py +++ 
b/metadata-ingestion/src/datahub/ingestion/source/looker/view_upstream.py @@ -72,7 +72,6 @@ def resolve_derived_view_urn_of_col_ref( base_folder_path: str, config: LookMLSourceConfig, ) -> List[ColumnRef]: - new_column_refs: List[ColumnRef] = [] for col_ref in column_refs: if is_derived_view(col_ref.table.lower()): @@ -641,7 +640,6 @@ def create_view_upstream( ctx: PipelineContext, reporter: LookMLSourceReport, ) -> AbstractViewUpstream: - if view_context.is_regular_case(): return RegularViewUpstream( view_context=view_context, @@ -666,7 +664,6 @@ def create_view_upstream( view_context.is_sql_based_derived_view_without_fields_case(), ] ): - return DerivedQueryUpstreamSource( view_context=view_context, config=config, diff --git a/metadata-ingestion/src/datahub/ingestion/source/metadata/lineage.py b/metadata-ingestion/src/datahub/ingestion/source/metadata/lineage.py index 08ed7677c7ab4c..9f96f837eb9b3a 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/metadata/lineage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/metadata/lineage.py @@ -210,7 +210,6 @@ def _get_lineage_mcp( # extract the old lineage and save it for the new mcp if preserve_upstream: - client = get_default_graph() old_upstream_lineage = get_aspects_for_entity( diff --git a/metadata-ingestion/src/datahub/ingestion/source/nifi.py b/metadata-ingestion/src/datahub/ingestion/source/nifi.py index 7072ebf6473df1..f55d7a883edefe 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/nifi.py +++ b/metadata-ingestion/src/datahub/ingestion/source/nifi.py @@ -464,7 +464,6 @@ def report_dropped(self, ent_name: str) -> None: @support_status(SupportStatus.CERTIFIED) @capability(SourceCapability.LINEAGE_COARSE, "Supported. See docs for limitations") class NifiSource(Source): - config: NifiSourceConfig report: NifiSourceReport diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/native_sql_parser.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/native_sql_parser.py index 63a6073c90a1a9..8ffd54613eb380 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/native_sql_parser.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/native_sql_parser.py @@ -101,7 +101,6 @@ def parse_custom_sql( env: str, platform_instance: Optional[str], ) -> Optional["SqlParsingResult"]: - logger.debug("Using sqlglot_lineage to parse custom sql") logger.debug(f"Processing native query using DataHub Sql Parser = {query}") diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/parser.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/parser.py index 15524137c0a85e..97698a3d0d56c1 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/parser.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/parser.py @@ -66,7 +66,6 @@ def get_upstream_tables( config: PowerBiDashboardSourceConfig, parameters: Dict[str, str] = {}, ) -> List[resolver.Lineage]: - if table.expression is None: logger.debug(f"There is no M-Query expression in table {table.full_name}") return [] diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py index 32de95d6bd015e..a40e67d08da5b2 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py @@ -65,7 +65,6 @@ def urn_creator( server: str, 
qualified_table_name: str, ) -> str: - platform_detail: PlatformDetail = platform_instance_resolver.get_platform_instance( PowerBIPlatformDetail( data_platform_pair=data_platform_pair, @@ -179,7 +178,6 @@ def create_reference_table( arg_list: Tree, table_detail: Dict[str, str], ) -> Optional[ReferencedTable]: - arguments: List[str] = tree_function.strip_char_from_list( values=tree_function.remove_whitespaces_from_list( tree_function.token_values(arg_list) @@ -219,7 +217,6 @@ def create_reference_table( def parse_custom_sql( self, query: str, server: str, database: Optional[str], schema: Optional[str] ) -> Lineage: - dataplatform_tables: List[DataPlatformTable] = [] platform_detail: PlatformDetail = ( @@ -377,7 +374,6 @@ def get_argument_list(invoke_expression: Tree) -> Optional[Tree]: return argument_list def take_first_argument(self, expression: Tree) -> Optional[Tree]: - # function is not data-access function, lets process function argument first_arg_tree: Optional[Tree] = tree_function.first_arg_list_func(expression) @@ -785,7 +781,6 @@ def create_urn_using_old_parser( def create_lineage( self, data_access_func_detail: DataAccessFunctionDetail ) -> Lineage: - arguments: List[str] = tree_function.strip_char_from_list( values=tree_function.remove_whitespaces_from_list( tree_function.token_values(data_access_func_detail.arg_list) @@ -897,7 +892,6 @@ def form_qualified_table_name( table_reference: ReferencedTable, data_platform_pair: DataPlatformPair, ) -> str: - platform_detail: PlatformDetail = ( self.platform_instance_resolver.get_platform_instance( PowerBIPlatformDetail( diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py index a59d58519d6bfe..e1301edef10b84 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py @@ -439,7 +439,6 @@ def get_app( self, app_id: str, ) -> Optional[App]: - raw_app: Optional[Dict] = self._get_app( app_id=app_id, ) @@ -1062,7 +1061,6 @@ def _get_app( self, app_id: str, ) -> Optional[Dict]: - app_endpoint = self.API_ENDPOINTS[Constant.GET_WORKSPACE_APP].format( POWERBI_ADMIN_BASE_URL=DataResolverBase.ADMIN_BASE_URL, APP_ID=app_id, diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py index b49f1f09fa966e..5ae333430a78bc 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py @@ -40,7 +40,6 @@ def form_full_table_name( dataset_name: str, table_name: str, ) -> str: - full_table_name: str = "{}.{}".format( dataset_name.replace(" ", "_"), table_name.replace(" ", "_") ) @@ -596,7 +595,6 @@ def _fill_metadata_from_scan_result( return workspaces def _fill_independent_datasets(self, workspace: Workspace) -> None: - reachable_datasets: List[str] = [] # Find out reachable datasets for dashboard in workspace.dashboards: diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi_report_server/report_server.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi_report_server/report_server.py index 8854f9ff48348d..2a247d0c63957a 100644 --- 
a/metadata-ingestion/src/datahub/ingestion/source/powerbi_report_server/report_server.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi_report_server/report_server.py @@ -126,7 +126,6 @@ def log_http_error(e: BaseException, message: str) -> Any: def get_response_dict(response: requests.Response, error_message: str) -> dict: - result_dict: dict = {} try: response.raise_for_status() diff --git a/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift.py index 76030cea984946..4bc4c1451c262f 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift.py +++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift.py @@ -436,7 +436,6 @@ def get_workunits_internal(self) -> Iterable[Union[MetadataWorkUnit, SqlWorkUnit def _extract_metadata( self, connection: redshift_connector.Connection, database: str ) -> Iterable[Union[MetadataWorkUnit, SqlWorkUnit]]: - yield from self.gen_database_container( database=database, ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/s3/source.py b/metadata-ingestion/src/datahub/ingestion/source/s3/source.py index e8c70260ebc7ce..1863663f98bb24 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/s3/source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/s3/source.py @@ -804,7 +804,6 @@ def get_dir_to_process( protocol: str, min: bool = False, ) -> List[str]: - # if len(path_spec.include.split("/")) == len(f"{protocol}{bucket_name}/{folder}".split("/")): # return [f"{protocol}{bucket_name}/{folder}"] diff --git a/metadata-ingestion/src/datahub/ingestion/source/sac/sac.py b/metadata-ingestion/src/datahub/ingestion/source/sac/sac.py index 66962b5d96d389..b75f15c0ce770e 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sac/sac.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sac/sac.py @@ -401,7 +401,6 @@ def get_model_workunits( columns = self.get_import_data_model_columns(model_id=model.model_id) for column in columns: - schema_field = SchemaFieldClass( fieldPath=column.name, type=self.get_schema_field_data_type(column), diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py index ac47abf4874499..e065e2f34bc66d 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py @@ -236,7 +236,6 @@ def populate_known_query_lineage( def get_known_query_lineage( self, query: Query, dataset_name: str, db_row: UpstreamLineageEdge ) -> Optional[KnownQueryLineageInfo]: - if not db_row.UPSTREAM_TABLES: return None diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/cockroachdb.py b/metadata-ingestion/src/datahub/ingestion/source/sql/cockroachdb.py index 5356cee7f6ea30..76b72d8e37f74b 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/cockroachdb.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/cockroachdb.py @@ -28,7 +28,6 @@ class CockroachDBConfig(PostgresConfig): @capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration") @capability(SourceCapability.DELETION_DETECTION, "Enabled via stateful ingestion") class CockroachDBSource(PostgresSource): - config: CockroachDBConfig def __init__(self, config: CockroachDBConfig, ctx: PipelineContext): diff --git 
a/metadata-ingestion/src/datahub/ingestion/source/sql/oracle.py b/metadata-ingestion/src/datahub/ingestion/source/sql/oracle.py index bce3b29130ec9e..766b704d6ffafe 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/oracle.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/oracle.py @@ -178,7 +178,6 @@ def get_table_names(self, schema: Optional[str] = None) -> List[str]: ] def get_view_names(self, schema: Optional[str] = None) -> List[str]: - schema = self._inspector_instance.dialect.denormalize_name( schema or self.default_schema_name ) @@ -200,7 +199,6 @@ def get_view_names(self, schema: Optional[str] = None) -> List[str]: def get_columns( self, table_name: str, schema: Optional[str] = None, dblink: str = "" ) -> List[dict]: - denormalized_table_name = self._inspector_instance.dialect.denormalize_name( table_name ) @@ -344,7 +342,6 @@ def get_columns( return columns def get_table_comment(self, table_name: str, schema: Optional[str] = None) -> Dict: - denormalized_table_name = self._inspector_instance.dialect.denormalize_name( table_name ) @@ -416,7 +413,6 @@ def _get_constraint_data( def get_pk_constraint( self, table_name: str, schema: Optional[str] = None, dblink: str = "" ) -> Dict: - denormalized_table_name = self._inspector_instance.dialect.denormalize_name( table_name ) @@ -458,7 +454,6 @@ def get_pk_constraint( def get_foreign_keys( self, table_name: str, schema: Optional[str] = None, dblink: str = "" ) -> List: - denormalized_table_name = self._inspector_instance.dialect.denormalize_name( table_name ) @@ -540,7 +535,6 @@ def fkey_rec(): def get_view_definition( self, view_name: str, schema: Optional[str] = None ) -> Union[str, None]: - denormalized_view_name = self._inspector_instance.dialect.denormalize_name( view_name ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/state/entity_removal_state.py b/metadata-ingestion/src/datahub/ingestion/source/state/entity_removal_state.py index 318395d4e66b2a..2b10ca1fa57ed8 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/state/entity_removal_state.py +++ b/metadata-ingestion/src/datahub/ingestion/source/state/entity_removal_state.py @@ -146,7 +146,11 @@ def urn_count(self) -> int: def compute_percent_entities_changed( new_entities: List[str], old_entities: List[str] ) -> float: - (overlap_count, old_count, _,) = _get_entity_overlap_and_cardinalities( + ( + overlap_count, + old_count, + _, + ) = _get_entity_overlap_and_cardinalities( new_entities=new_entities, old_entities=old_entities ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py index c1d899f11f2e1d..0eafdb4ad23ba0 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py +++ b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py @@ -2117,7 +2117,6 @@ def parse_custom_sql( def _enrich_database_tables_with_parsed_schemas( self, parsing_result: SqlParsingResult ) -> None: - in_tables_schemas: Dict[ str, Set[str] ] = transform_parsing_result_to_in_tables_schemas(parsing_result) diff --git a/metadata-ingestion/src/datahub/ingestion/transformer/add_dataset_dataproduct.py b/metadata-ingestion/src/datahub/ingestion/transformer/add_dataset_dataproduct.py index ce224bde003fd3..bb1c297513de10 100644 --- a/metadata-ingestion/src/datahub/ingestion/transformer/add_dataset_dataproduct.py +++ b/metadata-ingestion/src/datahub/ingestion/transformer/add_dataset_dataproduct.py @@ -105,7 +105,6 @@ class 
SimpleAddDatasetDataProduct(AddDatasetDataProduct): """Transformer that adds a specified dataproduct entity for provided dataset as its asset.""" def __init__(self, config: SimpleDatasetDataProductConfig, ctx: PipelineContext): - generic_config = AddDatasetDataProductConfig( get_data_product_to_add=lambda dataset_urn: config.dataset_to_data_product_urns.get( dataset_urn diff --git a/metadata-ingestion/src/datahub/ingestion/transformer/add_dataset_tags.py b/metadata-ingestion/src/datahub/ingestion/transformer/add_dataset_tags.py index ef6ef43fa2d7f3..c60f4dca28882d 100644 --- a/metadata-ingestion/src/datahub/ingestion/transformer/add_dataset_tags.py +++ b/metadata-ingestion/src/datahub/ingestion/transformer/add_dataset_tags.py @@ -67,7 +67,6 @@ def transform_aspect( def handle_end_of_stream( self, ) -> List[Union[MetadataChangeProposalWrapper, MetadataChangeProposalClass]]: - mcps: List[ Union[MetadataChangeProposalWrapper, MetadataChangeProposalClass] ] = [] diff --git a/metadata-ingestion/src/datahub/ingestion/transformer/extract_ownership_from_tags.py b/metadata-ingestion/src/datahub/ingestion/transformer/extract_ownership_from_tags.py index 245a3aa3d9db15..212e018dd64fb7 100644 --- a/metadata-ingestion/src/datahub/ingestion/transformer/extract_ownership_from_tags.py +++ b/metadata-ingestion/src/datahub/ingestion/transformer/extract_ownership_from_tags.py @@ -105,7 +105,6 @@ def convert_tag_as_per_mapping(self, tag: str) -> str: def handle_end_of_stream( self, ) -> Sequence[Union[MetadataChangeProposalWrapper, MetadataChangeProposalClass]]: - return self.owner_mcps def transform_aspect( diff --git a/metadata-ingestion/src/datahub/ingestion/transformer/replace_external_url.py b/metadata-ingestion/src/datahub/ingestion/transformer/replace_external_url.py index 57af10d1040c8a..f6847f234aefe6 100644 --- a/metadata-ingestion/src/datahub/ingestion/transformer/replace_external_url.py +++ b/metadata-ingestion/src/datahub/ingestion/transformer/replace_external_url.py @@ -103,7 +103,6 @@ def create( def transform_aspect( self, entity_urn: str, aspect_name: str, aspect: Optional[Aspect] ) -> Optional[Aspect]: - in_container_properties_aspect: ContainerPropertiesClass = cast( ContainerPropertiesClass, aspect ) diff --git a/metadata-ingestion/src/datahub/ingestion/transformer/tags_to_terms.py b/metadata-ingestion/src/datahub/ingestion/transformer/tags_to_terms.py index 338f191c0829df..7e6125079f16e3 100644 --- a/metadata-ingestion/src/datahub/ingestion/transformer/tags_to_terms.py +++ b/metadata-ingestion/src/datahub/ingestion/transformer/tags_to_terms.py @@ -84,7 +84,6 @@ def get_tags_from_schema_metadata( def transform_aspect( self, entity_urn: str, aspect_name: str, aspect: Optional[Aspect] ) -> Optional[Aspect]: - in_glossary_terms: Optional[GlossaryTermsClass] = cast( Optional[GlossaryTermsClass], aspect ) diff --git a/metadata-ingestion/src/datahub/integrations/assertion/snowflake/metric_sql_generator.py b/metadata-ingestion/src/datahub/integrations/assertion/snowflake/metric_sql_generator.py index 5b079129e0a9c5..facc7d107d1ba7 100644 --- a/metadata-ingestion/src/datahub/integrations/assertion/snowflake/metric_sql_generator.py +++ b/metadata-ingestion/src/datahub/integrations/assertion/snowflake/metric_sql_generator.py @@ -72,7 +72,6 @@ def _(self, assertion: FixedIntervalFreshnessAssertion) -> str: @metric_sql.register def _(self, assertion: RowCountTotalVolumeAssertion) -> str: - # Can not use information schema here due to error - # Data metric function body cannot refer to the 
non-deterministic function 'CURRENT_DATABASE_MAIN_METASTORE_ID'. diff --git a/metadata-ingestion/src/datahub/specific/dashboard.py b/metadata-ingestion/src/datahub/specific/dashboard.py index 8228dbc011db2f..f57df15914369c 100644 --- a/metadata-ingestion/src/datahub/specific/dashboard.py +++ b/metadata-ingestion/src/datahub/specific/dashboard.py @@ -433,7 +433,6 @@ def set_description(self, description: str) -> "DashboardPatchBuilder": def add_custom_properties( self, custom_properties: Optional[Dict[str, str]] = None ) -> "DashboardPatchBuilder": - if custom_properties: for key, value in custom_properties.items(): self.custom_properties_patch_helper.add_property(key, value) diff --git a/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py b/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py index e8a0369597d53a..360ccd7bf35073 100644 --- a/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py +++ b/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py @@ -831,7 +831,6 @@ def add_preparsed_query( session_has_temp_tables: bool = True, _is_internal: bool = False, ) -> None: - # Adding tool specific metadata extraction here allows it # to work for both ObservedQuery and PreparsedQuery as # add_preparsed_query it used within add_observed_query. diff --git a/metadata-ingestion/src/datahub/sql_parsing/sqlglot_lineage.py b/metadata-ingestion/src/datahub/sql_parsing/sqlglot_lineage.py index 506bd1d8c6be40..4d0f9f7d8cf59d 100644 --- a/metadata-ingestion/src/datahub/sql_parsing/sqlglot_lineage.py +++ b/metadata-ingestion/src/datahub/sql_parsing/sqlglot_lineage.py @@ -880,7 +880,6 @@ def _sqlglot_lineage_inner( default_schema: Optional[str] = None, default_dialect: Optional[str] = None, ) -> SqlParsingResult: - if not default_dialect: dialect = get_dialect(schema_resolver.platform) else: diff --git a/metadata-ingestion/src/datahub/sql_parsing/tool_meta_extractor.py b/metadata-ingestion/src/datahub/sql_parsing/tool_meta_extractor.py index cdd35c23e30885..0d85002776e5e2 100644 --- a/metadata-ingestion/src/datahub/sql_parsing/tool_meta_extractor.py +++ b/metadata-ingestion/src/datahub/sql_parsing/tool_meta_extractor.py @@ -79,7 +79,6 @@ def _extract_mode_query(self, entry: QueryLog) -> bool: return True def extract_bi_metadata(self, entry: QueryLog) -> bool: - for tool, meta_extractor in self.known_tool_extractors: try: if meta_extractor(entry): diff --git a/metadata-ingestion/src/datahub/testing/mcp_diff.py b/metadata-ingestion/src/datahub/testing/mcp_diff.py index 95b8e83c7a64a5..5e669a718e9ad3 100644 --- a/metadata-ingestion/src/datahub/testing/mcp_diff.py +++ b/metadata-ingestion/src/datahub/testing/mcp_diff.py @@ -206,7 +206,7 @@ def apply_delta(self, golden: List[Dict[str, Any]]) -> None: """ aspect_diffs = [v for d in self.aspect_changes.values() for v in d.values()] for aspect_diff in aspect_diffs: - for (_, old, new) in aspect_diff.aspects_changed.keys(): + for _, old, new in aspect_diff.aspects_changed.keys(): golden[old.delta_info.idx] = new.delta_info.original indices_to_remove = set() diff --git a/metadata-ingestion/src/datahub/utilities/mapping.py b/metadata-ingestion/src/datahub/utilities/mapping.py index 4ea42d568da635..17023c7b388e76 100644 --- a/metadata-ingestion/src/datahub/utilities/mapping.py +++ b/metadata-ingestion/src/datahub/utilities/mapping.py @@ -43,7 +43,6 @@ def _make_owner_category_list( owner_category_urn: Optional[str], owner_ids: List[str], ) -> List[Dict]: - return [ { "urn": 
mce_builder.make_owner_urn(owner_id, owner_type), @@ -285,7 +284,6 @@ def convert_to_aspects(self, operation_map: Dict[str, list]) -> Dict[str, Any]: aspect_map[Constants.ADD_TAG_OPERATION] = tag_aspect if Constants.ADD_OWNER_OPERATION in operation_map: - owner_aspect = OwnershipClass( owners=[ OwnerClass( diff --git a/metadata-ingestion/src/datahub/utilities/threaded_iterator_executor.py b/metadata-ingestion/src/datahub/utilities/threaded_iterator_executor.py index 4d328ad31c6c4a..ab8987a7d2e8b2 100644 --- a/metadata-ingestion/src/datahub/utilities/threaded_iterator_executor.py +++ b/metadata-ingestion/src/datahub/utilities/threaded_iterator_executor.py @@ -19,7 +19,6 @@ def process( args_list: Iterable[Tuple[Any, ...]], max_workers: int, ) -> Generator[T, None, None]: - out_q: queue.Queue[T] = queue.Queue() def _worker_wrapper( diff --git a/metadata-ingestion/tests/integration/azure_ad/test_azure_ad.py b/metadata-ingestion/tests/integration/azure_ad/test_azure_ad.py index 7005bc2e4411bf..024bb62bbe9ce9 100644 --- a/metadata-ingestion/tests/integration/azure_ad/test_azure_ad.py +++ b/metadata-ingestion/tests/integration/azure_ad/test_azure_ad.py @@ -68,7 +68,6 @@ def run_ingest( "datahub.ingestion.source.state_provider.datahub_ingestion_checkpointing_provider.DataHubGraph", mock_datahub_graph, ) as mock_checkpoint: - mock_checkpoint.return_value = mock_datahub_graph mocked_functions_reference( diff --git a/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery_queries.py b/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery_queries.py index ef846f698f156e..806779475dea9d 100644 --- a/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery_queries.py +++ b/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery_queries.py @@ -47,7 +47,6 @@ def _generate_queries_cached_file(tmp_path: Path, queries_json_path: Path) -> No @patch("google.cloud.bigquery.Client") @patch("google.cloud.resourcemanager_v3.ProjectsClient") def test_queries_ingestion(project_client, client, pytestconfig, monkeypatch, tmp_path): - test_resources_dir = pytestconfig.rootpath / "tests/integration/bigquery_v2" mcp_golden_path = f"{test_resources_dir}/bigquery_queries_mcps_golden.json" mcp_output_path = tmp_path / "bigquery_queries_mcps.json" diff --git a/metadata-ingestion/tests/integration/dremio/test_dremio.py b/metadata-ingestion/tests/integration/dremio/test_dremio.py index eb999367962817..cc3a7e19bc93e2 100644 --- a/metadata-ingestion/tests/integration/dremio/test_dremio.py +++ b/metadata-ingestion/tests/integration/dremio/test_dremio.py @@ -192,7 +192,6 @@ def create_mysql_source(headers): def upload_dataset(headers): - url = f"{DREMIO_HOST}/apiv2/source/s3/file_format/warehouse/sample.parquet" payload = {"ignoreOtherFileFormats": False, "type": "Parquet"} diff --git a/metadata-ingestion/tests/integration/looker/test_looker.py b/metadata-ingestion/tests/integration/looker/test_looker.py index 7238a49cb37d2f..8bbf14709ff9fb 100644 --- a/metadata-ingestion/tests/integration/looker/test_looker.py +++ b/metadata-ingestion/tests/integration/looker/test_looker.py @@ -1047,7 +1047,6 @@ def test_independent_soft_deleted_looks( mocked_client = mock.MagicMock() with mock.patch("looker_sdk.init40") as mock_sdk: - mock_sdk.return_value = mocked_client setup_mock_look(mocked_client) setup_mock_soft_deleted_look(mocked_client) diff --git a/metadata-ingestion/tests/integration/lookml/test_lookml.py b/metadata-ingestion/tests/integration/lookml/test_lookml.py index a4cfbd5eadb7f3..ab55321a4d7342 
100644 --- a/metadata-ingestion/tests/integration/lookml/test_lookml.py +++ b/metadata-ingestion/tests/integration/lookml/test_lookml.py @@ -831,7 +831,6 @@ def test_manifest_parser(pytestconfig: pytest.Config) -> None: @freeze_time(FROZEN_TIME) def test_duplicate_field_ingest(pytestconfig, tmp_path, mock_time): - test_resources_dir = pytestconfig.rootpath / "tests/integration/lookml" mce_out_file = "duplicate_ingest_mces_output.json" diff --git a/metadata-ingestion/tests/integration/okta/test_okta.py b/metadata-ingestion/tests/integration/okta/test_okta.py index 63ef8793cadddc..10148273c93666 100644 --- a/metadata-ingestion/tests/integration/okta/test_okta.py +++ b/metadata-ingestion/tests/integration/okta/test_okta.py @@ -58,14 +58,12 @@ def run_ingest( mocked_functions_reference, recipe, ): - with patch( "datahub.ingestion.source.identity.okta.OktaClient" ) as MockClient, patch( "datahub.ingestion.source.state_provider.datahub_ingestion_checkpointing_provider.DataHubGraph", mock_datahub_graph, ) as mock_checkpoint: - mock_checkpoint.return_value = mock_datahub_graph mocked_functions_reference(MockClient=MockClient) @@ -277,7 +275,6 @@ def overwrite_group_in_mocked_data(test_resources_dir, MockClient): def _init_mock_okta_client( test_resources_dir, MockClient, mock_users_json=None, mock_groups_json=None ): - okta_users_json_file = ( test_resources_dir / "okta_users.json" if mock_users_json is None diff --git a/metadata-ingestion/tests/integration/oracle/common.py b/metadata-ingestion/tests/integration/oracle/common.py index 79dbda8c30f896..9e2cc42ef10256 100644 --- a/metadata-ingestion/tests/integration/oracle/common.py +++ b/metadata-ingestion/tests/integration/oracle/common.py @@ -33,7 +33,6 @@ def scalar(self): @dataclass class MockConstraints: - constraint_name: str = "mock constraint name" constraint_type: str = "P" local_column: str = "mock column name" diff --git a/metadata-ingestion/tests/integration/powerbi/test_m_parser.py b/metadata-ingestion/tests/integration/powerbi/test_m_parser.py index f4613c524316e3..f22998b47b9008 100644 --- a/metadata-ingestion/tests/integration/powerbi/test_m_parser.py +++ b/metadata-ingestion/tests/integration/powerbi/test_m_parser.py @@ -712,7 +712,6 @@ def test_redshift_regular_case(): def test_redshift_native_query(): - table: powerbi_data_classes.Table = powerbi_data_classes.Table( expression=M_QUERIES[22], name="category", @@ -1101,7 +1100,6 @@ def test_double_quotes_in_alias(): @patch("datahub.ingestion.source.powerbi.m_query.parser.get_lark_parser") def test_m_query_timeout(mock_get_lark_parser): - q = 'let\n Source = Value.NativeQuery(Snowflake.Databases("0DD93C6BD5A6.snowflakecomputing.com","sales_analytics_warehouse_prod",[Role="sales_analytics_member_ad"]){[Name="SL_OPERATIONS"]}[Data], "select SALE_NO AS ""\x1b[4mSaleNo\x1b[0m""#(lf) ,CODE AS ""Code""#(lf) ,ENDDATE AS ""end_date""#(lf) from SL_OPERATIONS.SALE.REPORTS#(lf) where ENDDATE > \'2024-02-03\'", null, [EnableFolding=true]),\n #"selected Row" = Table.SelectRows(Source)\nin\n #"selected Row"' table: powerbi_data_classes.Table = powerbi_data_classes.Table( diff --git a/metadata-ingestion/tests/integration/powerbi/test_powerbi.py b/metadata-ingestion/tests/integration/powerbi/test_powerbi.py index 0f360d44c38cbe..edde11ff87d293 100644 --- a/metadata-ingestion/tests/integration/powerbi/test_powerbi.py +++ b/metadata-ingestion/tests/integration/powerbi/test_powerbi.py @@ -96,7 +96,6 @@ def read_mock_data(path: Union[Path, str]) -> dict: def register_mock_api( pytestconfig: 
pytest.Config, request_mock: Any, override_data: Optional[dict] = None ) -> None: - default_mock_data_path = ( pytestconfig.rootpath / "tests/integration/powerbi/mock_data/default_mock_response.json" @@ -467,7 +466,6 @@ def test_scan_all_workspaces( mock_time: datetime.datetime, requests_mock: Any, ) -> None: - test_resources_dir = pytestconfig.rootpath / "tests/integration/powerbi" register_mock_api(pytestconfig=pytestconfig, request_mock=requests_mock) @@ -517,7 +515,6 @@ def test_extract_reports( mock_time: datetime.datetime, requests_mock: Any, ) -> None: - enable_logging() test_resources_dir = pytestconfig.rootpath / "tests/integration/powerbi" @@ -1219,7 +1216,6 @@ def test_independent_datasets_extraction( mock_time: datetime.datetime, requests_mock: Any, ) -> None: - test_resources_dir = pytestconfig.rootpath / "tests/integration/powerbi" register_mock_api( @@ -1323,7 +1319,6 @@ def test_cll_extraction( mock_time: datetime.datetime, requests_mock: Any, ) -> None: - test_resources_dir = pytestconfig.rootpath / "tests/integration/powerbi" register_mock_api( @@ -1380,7 +1375,6 @@ def test_cll_extraction_flags( mock_time: datetime.datetime, requests_mock: Any, ) -> None: - register_mock_api( pytestconfig=pytestconfig, request_mock=requests_mock, @@ -1392,7 +1386,6 @@ def test_cll_extraction_flags( ) with pytest.raises(Exception, match=pattern): - Pipeline.create( { "run_id": "powerbi-test", @@ -1559,7 +1552,6 @@ def test_powerbi_app_ingest( mock_time: datetime.datetime, requests_mock: Any, ) -> None: - common_app_ingest( pytestconfig=pytestconfig, requests_mock=requests_mock, @@ -1590,7 +1582,6 @@ def test_powerbi_app_ingest_info_message( mock_time: datetime.datetime, requests_mock: Any, ) -> None: - pipeline = common_app_ingest( pytestconfig=pytestconfig, requests_mock=requests_mock, diff --git a/metadata-ingestion/tests/integration/qlik_sense/test_qlik_sense.py b/metadata-ingestion/tests/integration/qlik_sense/test_qlik_sense.py index ee1aafb6cf32dc..95f096cc3def35 100644 --- a/metadata-ingestion/tests/integration/qlik_sense/test_qlik_sense.py +++ b/metadata-ingestion/tests/integration/qlik_sense/test_qlik_sense.py @@ -1011,7 +1011,6 @@ def default_config(): def test_qlik_sense_ingest( pytestconfig, tmp_path, requests_mock, mock_websocket_send_request ): - test_resources_dir = pytestconfig.rootpath / "tests/integration/qlik_sense" register_mock_api(request_mock=requests_mock) @@ -1051,7 +1050,6 @@ def test_qlik_sense_ingest( def test_platform_instance_ingest( pytestconfig, tmp_path, requests_mock, mock_websocket_send_request ): - test_resources_dir = pytestconfig.rootpath / "tests/integration/qlik_sense" register_mock_api(request_mock=requests_mock) diff --git a/metadata-ingestion/tests/integration/sigma/test_sigma.py b/metadata-ingestion/tests/integration/sigma/test_sigma.py index 6c01bf6dc80fe7..19fa1448fee598 100644 --- a/metadata-ingestion/tests/integration/sigma/test_sigma.py +++ b/metadata-ingestion/tests/integration/sigma/test_sigma.py @@ -420,7 +420,6 @@ def register_mock_api(request_mock: Any, override_data: dict = {}) -> None: @pytest.mark.integration def test_sigma_ingest(pytestconfig, tmp_path, requests_mock): - test_resources_dir = pytestconfig.rootpath / "tests/integration/sigma" register_mock_api(request_mock=requests_mock) @@ -464,7 +463,6 @@ def test_sigma_ingest(pytestconfig, tmp_path, requests_mock): @pytest.mark.integration def test_platform_instance_ingest(pytestconfig, tmp_path, requests_mock): - test_resources_dir = pytestconfig.rootpath / 
"tests/integration/sigma" register_mock_api(request_mock=requests_mock) @@ -510,7 +508,6 @@ def test_platform_instance_ingest(pytestconfig, tmp_path, requests_mock): @pytest.mark.integration def test_sigma_ingest_shared_entities(pytestconfig, tmp_path, requests_mock): - test_resources_dir = pytestconfig.rootpath / "tests/integration/sigma" override_data = { diff --git a/metadata-ingestion/tests/integration/snowflake/common.py b/metadata-ingestion/tests/integration/snowflake/common.py index 8f45be96625a45..9e4bb2f0eb634f 100644 --- a/metadata-ingestion/tests/integration/snowflake/common.py +++ b/metadata-ingestion/tests/integration/snowflake/common.py @@ -441,7 +441,6 @@ def default_query_results( # noqa: C901 include_column_lineage=True, ), ): - return [ { "DOWNSTREAM_TABLE_NAME": f"TEST_DB.TEST_SCHEMA.TABLE_{op_idx}", diff --git a/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py b/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py index 62f8f6a654b588..6c45b8a47de412 100644 --- a/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py +++ b/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py @@ -276,7 +276,6 @@ def mock_sdk_client( datasources_side_effect: List[dict], sign_out_side_effect: List[dict], ) -> mock.MagicMock: - mock_client = mock.Mock() mocked_metadata = mock.Mock() mocked_metadata.query.side_effect = side_effect_query_metadata_response @@ -1228,7 +1227,6 @@ def test_permission_ingestion(pytestconfig, tmp_path, mock_datahub_graph): @freeze_time(FROZEN_TIME) @pytest.mark.integration def test_permission_mode_switched_error(pytestconfig, tmp_path, mock_datahub_graph): - with mock.patch( "datahub.ingestion.source.state_provider.datahub_ingestion_checkpointing_provider.DataHubGraph", mock_datahub_graph, diff --git a/metadata-ingestion/tests/integration/unity/test_unity_catalog_ingest.py b/metadata-ingestion/tests/integration/unity/test_unity_catalog_ingest.py index c078f1b77fd1be..b8b0563a1d24e5 100644 --- a/metadata-ingestion/tests/integration/unity/test_unity_catalog_ingest.py +++ b/metadata-ingestion/tests/integration/unity/test_unity_catalog_ingest.py @@ -282,7 +282,6 @@ def register_mock_data(workspace_client): def mock_hive_sql(query): - if query == "DESCRIBE EXTENDED `bronze_kambi`.`bet` betStatusId": return [ ("col_name", "betStatusId"), diff --git a/metadata-ingestion/tests/performance/snowflake/test_snowflake.py b/metadata-ingestion/tests/performance/snowflake/test_snowflake.py index 73b7790b62e9e7..5042c78c2e7b91 100644 --- a/metadata-ingestion/tests/performance/snowflake/test_snowflake.py +++ b/metadata-ingestion/tests/performance/snowflake/test_snowflake.py @@ -16,7 +16,6 @@ def run_test(): - with mock.patch("snowflake.connector.connect") as mock_connect: sf_connection = mock.MagicMock() sf_cursor = mock.MagicMock() diff --git a/metadata-ingestion/tests/unit/api/entities/common/test_serialized_value.py b/metadata-ingestion/tests/unit/api/entities/common/test_serialized_value.py index c9f16bbcef6fc4..a72087376b78a3 100644 --- a/metadata-ingestion/tests/unit/api/entities/common/test_serialized_value.py +++ b/metadata-ingestion/tests/unit/api/entities/common/test_serialized_value.py @@ -10,7 +10,6 @@ class MyTestModel(BaseModel): def test_base_model(): - test_base_model = MyTestModel( test_string_field="test_string_field", test_int_field=42, @@ -31,7 +30,6 @@ def test_base_model(): def test_dictwrapper(): - from datahub.metadata.schema_classes import DatasetPropertiesClass dataset_properties = 
DatasetPropertiesClass( @@ -58,7 +56,6 @@ def test_dictwrapper(): def test_raw_dictionary(): - test_object = { "test_string_field": "test_string_field", "test_int_field": 42, diff --git a/metadata-ingestion/tests/unit/api/entities/platformresource/test_platform_resource.py b/metadata-ingestion/tests/unit/api/entities/platformresource/test_platform_resource.py index a84e373dbe72c2..6a03f511fa51c5 100644 --- a/metadata-ingestion/tests/unit/api/entities/platformresource/test_platform_resource.py +++ b/metadata-ingestion/tests/unit/api/entities/platformresource/test_platform_resource.py @@ -185,7 +185,6 @@ class TestModel(BaseModel): def test_platform_resource_filters(): - query = ( ElasticPlatformResourceQuery.create_from() .group(LogicalOperator.AND) diff --git a/metadata-ingestion/tests/unit/api/source_helpers/test_incremental_lineage_helper.py b/metadata-ingestion/tests/unit/api/source_helpers/test_incremental_lineage_helper.py index cafca521ae0148..c5c4a378894c32 100644 --- a/metadata-ingestion/tests/unit/api/source_helpers/test_incremental_lineage_helper.py +++ b/metadata-ingestion/tests/unit/api/source_helpers/test_incremental_lineage_helper.py @@ -104,7 +104,6 @@ def test_incremental_table_lineage(tmp_path, pytestconfig): def test_incremental_table_lineage_empty_upstreams(tmp_path, pytestconfig): - urn = make_dataset_urn(platform, "dataset1") aspect = make_lineage_aspect( "dataset1", diff --git a/metadata-ingestion/tests/unit/bigquery/test_bigquery_lineage.py b/metadata-ingestion/tests/unit/bigquery/test_bigquery_lineage.py index 415977b0f8467b..a1981ccf767916 100644 --- a/metadata-ingestion/tests/unit/bigquery/test_bigquery_lineage.py +++ b/metadata-ingestion/tests/unit/bigquery/test_bigquery_lineage.py @@ -144,7 +144,6 @@ def test_column_level_lineage(lineage_entries: List[QueryEvent]) -> None: def test_lineage_for_external_bq_table(mock_datahub_graph_instance): - pipeline_context = PipelineContext(run_id="bq_gcs_lineage") pipeline_context.graph = mock_datahub_graph_instance @@ -239,7 +238,6 @@ def fake_schema_metadata(entity_urn: str) -> models.SchemaMetadataClass: def test_lineage_for_external_bq_table_no_column_lineage(mock_datahub_graph_instance): - pipeline_context = PipelineContext(run_id="bq_gcs_lineage") pipeline_context.graph = mock_datahub_graph_instance diff --git a/metadata-ingestion/tests/unit/bigquery/test_bigqueryv2_usage_source.py b/metadata-ingestion/tests/unit/bigquery/test_bigqueryv2_usage_source.py index 63de742b201a97..3247a64631da76 100644 --- a/metadata-ingestion/tests/unit/bigquery/test_bigqueryv2_usage_source.py +++ b/metadata-ingestion/tests/unit/bigquery/test_bigqueryv2_usage_source.py @@ -184,7 +184,6 @@ def test_bigquery_table_sanitasitation(): def test_unquote_and_decode_unicode_escape_seq(): - # Test with a string that starts and ends with quotes and has Unicode escape sequences input_string = '"Hello \\u003cWorld\\u003e"' expected_output = "Hello " diff --git a/metadata-ingestion/tests/unit/redshift/test_redshift_lineage.py b/metadata-ingestion/tests/unit/redshift/test_redshift_lineage.py index 2e3eb8fde1292b..941d13be0a6139 100644 --- a/metadata-ingestion/tests/unit/redshift/test_redshift_lineage.py +++ b/metadata-ingestion/tests/unit/redshift/test_redshift_lineage.py @@ -221,7 +221,6 @@ def mock_redshift_connection() -> MagicMock: def mock_graph() -> DataHubGraph: - graph = MagicMock() graph._make_schema_resolver.return_value = SchemaResolver( diff --git a/metadata-ingestion/tests/unit/sql_parsing/test_sql_aggregator.py 
b/metadata-ingestion/tests/unit/sql_parsing/test_sql_aggregator.py index b1ad9eb5c15d76..2a771a9847abd8 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/test_sql_aggregator.py +++ b/metadata-ingestion/tests/unit/sql_parsing/test_sql_aggregator.py @@ -958,7 +958,6 @@ def test_table_lineage_via_temp_table_disordered_add( @freeze_time(FROZEN_TIME) def test_basic_usage(pytestconfig: pytest.Config) -> None: - frozen_timestamp = parse_user_datetime(FROZEN_TIME) aggregator = SqlParsingAggregator( platform="redshift", diff --git a/metadata-ingestion/tests/unit/stateful_ingestion/state/test_stale_entity_removal_handler.py b/metadata-ingestion/tests/unit/stateful_ingestion/state/test_stale_entity_removal_handler.py index be2d8bac12e386..b04d4b86d2e4bb 100644 --- a/metadata-ingestion/tests/unit/stateful_ingestion/state/test_stale_entity_removal_handler.py +++ b/metadata-ingestion/tests/unit/stateful_ingestion/state/test_stale_entity_removal_handler.py @@ -50,7 +50,6 @@ def test_change_percent( def test_filter_ignored_entity_types(): - assert filter_ignored_entity_types( [ "urn:li:dataset:(urn:li:dataPlatform:postgres,dummy_dataset1,PROD)", diff --git a/metadata-ingestion/tests/unit/test_cassandra_source.py b/metadata-ingestion/tests/unit/test_cassandra_source.py index a4ca3a0a9ef3f6..75dedde76c7c89 100644 --- a/metadata-ingestion/tests/unit/test_cassandra_source.py +++ b/metadata-ingestion/tests/unit/test_cassandra_source.py @@ -56,7 +56,6 @@ def assert_field_paths_match( def test_cassandra_schema_conversion( schema: str, expected_field_paths: List[str] ) -> None: - schema_dict: Dict[str, List[Any]] = json.loads(schema) column_infos: List = schema_dict["column_infos"] From 3b415cde6996645e00ed42df74808f08a92cc209 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Wed, 20 Nov 2024 13:34:47 -0800 Subject: [PATCH 15/24] refactor(ingest/snowflake): move oauth config into snowflake dir (#11888) --- .../source/snowflake/oauth_config.py} | 0 .../ingestion/source/{sql => snowflake}/oauth_generator.py | 2 +- .../ingestion/source/snowflake/snowflake_connection.py | 7 +++++-- .../tests/unit/{ => snowflake}/test_snowflake_shares.py | 0 .../tests/unit/{ => snowflake}/test_snowflake_source.py | 2 +- .../tests/unit/{ => utilities}/test_parsing_util.py | 0 6 files changed, 7 insertions(+), 4 deletions(-) rename metadata-ingestion/src/datahub/{configuration/oauth.py => ingestion/source/snowflake/oauth_config.py} (100%) rename metadata-ingestion/src/datahub/ingestion/source/{sql => snowflake}/oauth_generator.py (97%) rename metadata-ingestion/tests/unit/{ => snowflake}/test_snowflake_shares.py (100%) rename metadata-ingestion/tests/unit/{ => snowflake}/test_snowflake_source.py (99%) rename metadata-ingestion/tests/unit/{ => utilities}/test_parsing_util.py (100%) diff --git a/metadata-ingestion/src/datahub/configuration/oauth.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/oauth_config.py similarity index 100% rename from metadata-ingestion/src/datahub/configuration/oauth.py rename to metadata-ingestion/src/datahub/ingestion/source/snowflake/oauth_config.py diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/oauth_generator.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/oauth_generator.py similarity index 97% rename from metadata-ingestion/src/datahub/ingestion/source/sql/oauth_generator.py rename to metadata-ingestion/src/datahub/ingestion/source/snowflake/oauth_generator.py index 7231c6ef6b1df5..a2dc0118b39782 100644 --- 
a/metadata-ingestion/src/datahub/ingestion/source/sql/oauth_generator.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/oauth_generator.py @@ -8,7 +8,7 @@ from OpenSSL.crypto import FILETYPE_PEM, load_certificate from pydantic.types import SecretStr -from datahub.configuration.oauth import OAuthIdentityProvider +from datahub.ingestion.source.snowflake.oauth_config import OAuthIdentityProvider logger = logging.getLogger(__name__) diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_connection.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_connection.py index a9f454cfd3cdb3..397606400d389c 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_connection.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_connection.py @@ -16,14 +16,17 @@ from datahub.configuration.common import ConfigModel, ConfigurationError, MetaError from datahub.configuration.connection_resolver import auto_connection_resolver -from datahub.configuration.oauth import OAuthConfiguration, OAuthIdentityProvider from datahub.configuration.validate_field_rename import pydantic_renamed_field from datahub.ingestion.api.closeable import Closeable from datahub.ingestion.source.snowflake.constants import ( CLIENT_PREFETCH_THREADS, CLIENT_SESSION_KEEP_ALIVE, ) -from datahub.ingestion.source.sql.oauth_generator import OAuthTokenGenerator +from datahub.ingestion.source.snowflake.oauth_config import ( + OAuthConfiguration, + OAuthIdentityProvider, +) +from datahub.ingestion.source.snowflake.oauth_generator import OAuthTokenGenerator from datahub.ingestion.source.sql.sql_config import make_sqlalchemy_uri from datahub.utilities.config_clean import ( remove_protocol, diff --git a/metadata-ingestion/tests/unit/test_snowflake_shares.py b/metadata-ingestion/tests/unit/snowflake/test_snowflake_shares.py similarity index 100% rename from metadata-ingestion/tests/unit/test_snowflake_shares.py rename to metadata-ingestion/tests/unit/snowflake/test_snowflake_shares.py diff --git a/metadata-ingestion/tests/unit/test_snowflake_source.py b/metadata-ingestion/tests/unit/snowflake/test_snowflake_source.py similarity index 99% rename from metadata-ingestion/tests/unit/test_snowflake_source.py rename to metadata-ingestion/tests/unit/snowflake/test_snowflake_source.py index 72b59a3a4e4938..161dfa2b4e78f3 100644 --- a/metadata-ingestion/tests/unit/test_snowflake_source.py +++ b/metadata-ingestion/tests/unit/snowflake/test_snowflake_source.py @@ -5,7 +5,6 @@ from pydantic import ValidationError from datahub.configuration.common import AllowDenyPattern -from datahub.configuration.oauth import OAuthConfiguration from datahub.configuration.pattern_utils import UUID_REGEX from datahub.ingestion.api.source import SourceCapability from datahub.ingestion.source.snowflake.constants import ( @@ -13,6 +12,7 @@ CLIENT_SESSION_KEEP_ALIVE, SnowflakeCloudProvider, ) +from datahub.ingestion.source.snowflake.oauth_config import OAuthConfiguration from datahub.ingestion.source.snowflake.snowflake_config import ( DEFAULT_TEMP_TABLES_PATTERNS, SnowflakeV2Config, diff --git a/metadata-ingestion/tests/unit/test_parsing_util.py b/metadata-ingestion/tests/unit/utilities/test_parsing_util.py similarity index 100% rename from metadata-ingestion/tests/unit/test_parsing_util.py rename to metadata-ingestion/tests/unit/utilities/test_parsing_util.py From 42bb07a35e255f5a6eda88fa3449964e591a9f1c Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Wed, 20 Nov 2024 
13:35:01 -0800 Subject: [PATCH 16/24] fix(ingest/bigquery): increase logging in bigquery-queries extractor (#11774) --- .../src/datahub/ingestion/run/pipeline.py | 1 + .../source/bigquery_v2/queries_extractor.py | 23 ++++---- .../src/datahub/utilities/progress_timer.py | 34 ++++++++++++ .../unit/utilities/test_progress_timer.py | 53 +++++++++++++++++++ 4 files changed, 99 insertions(+), 12 deletions(-) create mode 100644 metadata-ingestion/src/datahub/utilities/progress_timer.py create mode 100644 metadata-ingestion/tests/unit/utilities/test_progress_timer.py diff --git a/metadata-ingestion/src/datahub/ingestion/run/pipeline.py b/metadata-ingestion/src/datahub/ingestion/run/pipeline.py index 2e56d5866efa89..7c3a42c3e08931 100644 --- a/metadata-ingestion/src/datahub/ingestion/run/pipeline.py +++ b/metadata-ingestion/src/datahub/ingestion/run/pipeline.py @@ -428,6 +428,7 @@ def create( def _time_to_print(self) -> bool: self.num_intermediate_workunits += 1 current_time = int(time.time()) + # TODO: Replace with ProgressTimer. if current_time - self.last_time_printed > _REPORT_PRINT_INTERVAL_SECONDS: # we print self.num_intermediate_workunits = 0 diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/queries_extractor.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/queries_extractor.py index 91d55ad879e04a..08c9beaa73c53b 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/queries_extractor.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/queries_extractor.py @@ -2,7 +2,7 @@ import logging import pathlib import tempfile -from datetime import datetime, timezone +from datetime import datetime, timedelta, timezone from typing import Collection, Dict, Iterable, List, Optional, TypedDict from google.cloud.bigquery import Client @@ -49,6 +49,7 @@ FileBackedDict, FileBackedList, ) +from datahub.utilities.progress_timer import ProgressTimer from datahub.utilities.time import datetime_to_ts_millis logger = logging.getLogger(__name__) @@ -270,27 +271,25 @@ def get_workunits_internal( # Preprocessing stage that deduplicates the queries using query hash per usage bucket # Note: FileBackedDict is an ordered dictionary, so the order of execution of # queries is inherently maintained - queries_deduped: FileBackedDict[Dict[int, ObservedQuery]] - queries_deduped = self.deduplicate_queries(queries) + queries_deduped: FileBackedDict[ + Dict[int, ObservedQuery] + ] = self.deduplicate_queries(queries) self.report.num_unique_queries = len(queries_deduped) logger.info(f"Found {self.report.num_unique_queries} unique queries") with self.report.audit_log_load_timer, queries_deduped: - last_log_time = datetime.now() - last_report_time = datetime.now() + log_timer = ProgressTimer(timedelta(minutes=1)) + report_timer = ProgressTimer(timedelta(minutes=5)) + for i, (_, query_instances) in enumerate(queries_deduped.items()): for query in query_instances.values(): - now = datetime.now() - if (now - last_log_time).total_seconds() >= 60: + if log_timer.should_report(): logger.info( f"Added {i} deduplicated query log entries to SQL aggregator" ) - last_log_time = now - if (now - last_report_time).total_seconds() >= 300: - if self.report.sql_aggregator: - logger.info(self.report.sql_aggregator.as_string()) - last_report_time = now + if report_timer.should_report() and self.report.sql_aggregator: + logger.info(self.report.sql_aggregator.as_string()) self.aggregator.add(query) diff --git a/metadata-ingestion/src/datahub/utilities/progress_timer.py 
b/metadata-ingestion/src/datahub/utilities/progress_timer.py new file mode 100644 index 00000000000000..eac62cddb55f2c --- /dev/null +++ b/metadata-ingestion/src/datahub/utilities/progress_timer.py @@ -0,0 +1,34 @@ +from datetime import datetime, timedelta, timezone + + +class ProgressTimer: + def __init__(self, report_every: timedelta, report_0: bool = False): + """A helper for reporting progress at a given time interval. + + Should be used for long-running processes that iterate over a large number of items, + but each iteration is fast. + + Args: + report_every: The time interval between progress reports. + report_0: Whether to report progress on the first iteration. + """ + + self._report_every = report_every + + if report_0: + # Use the earliest possible time to force reporting on the first iteration. + self._last_report_time = datetime.min.replace(tzinfo=timezone.utc) + else: + self._last_report_time = self._now() + + def _now(self) -> datetime: + return datetime.now(timezone.utc) + + def should_report(self) -> bool: + current_time = self._now() + + should_report = (self._last_report_time + self._report_every) <= current_time + if should_report: + self._last_report_time = current_time + + return should_report diff --git a/metadata-ingestion/tests/unit/utilities/test_progress_timer.py b/metadata-ingestion/tests/unit/utilities/test_progress_timer.py new file mode 100644 index 00000000000000..139bad371bb9f4 --- /dev/null +++ b/metadata-ingestion/tests/unit/utilities/test_progress_timer.py @@ -0,0 +1,53 @@ +from datetime import timedelta +from time import sleep + +from datahub.utilities.progress_timer import ProgressTimer + + +def test_progress_timer_basic(): + timer = ProgressTimer(report_every=timedelta(milliseconds=100)) + + # First call should not report since report_0=False by default + assert not timer.should_report() + + # Call before interval elapsed should not report + sleep(0.05) # 50ms + assert not timer.should_report() + + # Call after interval elapsed should report + sleep(0.1) # Additional 100ms + assert timer.should_report() + + # Next immediate call should not report + assert not timer.should_report() + + +def test_progress_timer_with_report_0(): + timer = ProgressTimer(report_every=timedelta(milliseconds=100), report_0=True) + + # First call should report since report_0=True + assert timer.should_report() + + # Next immediate call should not report + assert not timer.should_report() + + # Call after interval elapsed should report + sleep(0.1) # 100ms + assert timer.should_report() + + +def test_progress_timer_multiple_intervals(): + timer = ProgressTimer(report_every=timedelta(milliseconds=50)) + + # First call should not report + assert not timer.should_report() + + # Check multiple intervals + sleep(0.06) # 60ms - should report + assert timer.should_report() + + sleep(0.02) # 20ms - should not report + assert not timer.should_report() + + sleep(0.05) # 50ms - should report + assert timer.should_report() From 7e9fa815807e13d96ac6fb7bd79b9851ea174d1b Mon Sep 17 00:00:00 2001 From: "Albert T. 
Wong" Date: Wed, 20 Nov 2024 13:35:50 -0800 Subject: [PATCH 17/24] Update the AWS instructions with EBS CSI and IAM policy instructions (#11872) Co-authored-by: david-leifker <114954101+david-leifker@users.noreply.github.com> --- docs/deploy/aws.md | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/docs/deploy/aws.md b/docs/deploy/aws.md index 49b0ea1d69ae19..c625ba26a3865a 100644 --- a/docs/deploy/aws.md +++ b/docs/deploy/aws.md @@ -53,7 +53,18 @@ ip-192-168-64-56.us-west-2.compute.internal Ready 3h v1.18.9-ek ip-192-168-8-126.us-west-2.compute.internal Ready 3h v1.18.9-eks-d1db3c ``` -Once your cluster is running, make sure to install the EBS CSI driver, Core DNS, and VPC CNI plugin for Kubernetes. [add-ons](https://docs.aws.amazon.com/eks/latest/userguide/eks-add-ons.html) +### Install EBS CSI driver, Core DNS, and VPC CNI plugin for Kubernetes + +Once your cluster is running, make sure to install the EBS CSI driver, Core DNS, and VPC CNI plugin for Kubernetes. [add-ons](https://docs.aws.amazon.com/eks/latest/userguide/eks-add-ons.html). By default Core DNS and VPC CNI plugins are installed. You need to manually install the EBS CSI driver. It show look this in your console when you are done. + +![Screenshot 2024-11-15 at 4 42 09 PM](https://github.com/user-attachments/assets/5a9a2af0-e804-4896-85bb-dc5834208719) + +### Add the AmazonEBSCSIDriverPolicy role to the EKS node group + +Next is to add the AmazonEBSCSIDriverPolicy role to the EKS node group. You will from the EKS Node group by going to the Compute tab in your EKS cluster and clicking on the IAM entry for the EKS node group. Add the AmazonEBSCSIDriverPolicy policy. + +![Screenshot 2024-11-15 at 4 42 29 PM](https://github.com/user-attachments/assets/8971c8d6-8543-408b-9a07-814aacb2532d) +![Screenshot 2024-11-15 at 4 42 46 PM](https://github.com/user-attachments/assets/397f9131-5f13-4d9f-a664-9921d9bbf44e) ## Setup DataHub using Helm From 37bb0c9eae507b6c552c073430cb07a35568f81f Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Wed, 20 Nov 2024 13:48:56 -0800 Subject: [PATCH 18/24] fix(ingest/sql): disable patch checker (#11910) --- metadata-ingestion/src/datahub/utilities/is_pytest.py | 3 ++- metadata-ingestion/tests/conftest.py | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/metadata-ingestion/src/datahub/utilities/is_pytest.py b/metadata-ingestion/src/datahub/utilities/is_pytest.py index 68bb1b285a50e9..572b4bf5356220 100644 --- a/metadata-ingestion/src/datahub/utilities/is_pytest.py +++ b/metadata-ingestion/src/datahub/utilities/is_pytest.py @@ -1,5 +1,6 @@ +import os import sys def is_pytest_running() -> bool: - return "pytest" in sys.modules + return "pytest" in sys.modules and os.environ.get("DATAHUB_TEST_MODE") == "1" diff --git a/metadata-ingestion/tests/conftest.py b/metadata-ingestion/tests/conftest.py index db025e7f806c06..4685faabfcb285 100644 --- a/metadata-ingestion/tests/conftest.py +++ b/metadata-ingestion/tests/conftest.py @@ -7,6 +7,7 @@ import pytest os.environ["DATAHUB_SUPPRESS_LOGGING_MANAGER"] = "1" +os.environ["DATAHUB_TEST_MODE"] = "1" # Enable debug logging. 
logging.getLogger().setLevel(logging.DEBUG) From e4d010d7571f70c752635ac5692dff2d1a9b133c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20L=C3=BCdin?= <13187726+Masterchen09@users.noreply.github.com> Date: Thu, 21 Nov 2024 01:05:50 +0100 Subject: [PATCH 19/24] docs(ingest/sac): add additional permission for SAP Analytics Cloud source to docs (#11903) --- metadata-ingestion/docs/sources/sac/sac_pre.md | 1 + 1 file changed, 1 insertion(+) diff --git a/metadata-ingestion/docs/sources/sac/sac_pre.md b/metadata-ingestion/docs/sources/sac/sac_pre.md index c62cd81fa27534..624eb61f716f92 100644 --- a/metadata-ingestion/docs/sources/sac/sac_pre.md +++ b/metadata-ingestion/docs/sources/sac/sac_pre.md @@ -4,6 +4,7 @@ - Purpose: API Access - Access: + - Story Listing - Data Import Service - Authorization Grant: Client Credentials From 7bdd0a801602fab3702b16afa766683d406ecdf5 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Thu, 21 Nov 2024 00:19:41 -0800 Subject: [PATCH 20/24] chore(ingest): always use urn creation helpers (#11911) --- metadata-ingestion/scripts/modeldocgen.py | 10 +++++++--- .../src/datahub/ingestion/source/sql/sql_common.py | 5 +++-- metadata-ingestion/tests/unit/test_sql_common.py | 12 ++++++++---- 3 files changed, 18 insertions(+), 9 deletions(-) diff --git a/metadata-ingestion/scripts/modeldocgen.py b/metadata-ingestion/scripts/modeldocgen.py index ee5f06cb801baa..998947e5caa954 100644 --- a/metadata-ingestion/scripts/modeldocgen.py +++ b/metadata-ingestion/scripts/modeldocgen.py @@ -14,7 +14,11 @@ import click from datahub.configuration.common import ConfigEnum, PermissiveConfigModel -from datahub.emitter.mce_builder import make_data_platform_urn, make_dataset_urn +from datahub.emitter.mce_builder import ( + make_data_platform_urn, + make_dataset_urn, + make_schema_field_urn, +) from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.emitter.rest_emitter import DatahubRestEmitter from datahub.ingestion.api.common import PipelineContext, RecordEnvelope @@ -442,10 +446,10 @@ def strip_types(field_path: str) -> str: name=relnship_name, foreignDataset=foreign_dataset_urn, foreignFields=[ - f"urn:li:schemaField:({foreign_dataset_urn}, urn)" + make_schema_field_urn(foreign_dataset_urn, "urn") ], sourceFields=[ - f"urn:li:schemaField:({source_dataset_urn},{f_field.fieldPath})" + make_schema_field_urn(source_dataset_urn, f_field.fieldPath) ], ) foreign_keys.append(fkey) diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py index 238fd88f1c9509..e5779791ed4120 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py @@ -32,6 +32,7 @@ make_data_platform_urn, make_dataplatform_instance_urn, make_dataset_urn_with_platform_instance, + make_schema_field_urn, make_tag_urn, ) from datahub.emitter.mcp import MetadataChangeProposalWrapper @@ -669,7 +670,7 @@ def get_foreign_key_metadata( ) source_fields = [ - f"urn:li:schemaField:({dataset_urn},{f})" + make_schema_field_urn(dataset_urn, f) for f in fk_dict["constrained_columns"] ] foreign_dataset = make_dataset_urn_with_platform_instance( @@ -679,7 +680,7 @@ def get_foreign_key_metadata( env=self.config.env, ) foreign_fields = [ - f"urn:li:schemaField:({foreign_dataset},{f})" + make_schema_field_urn(foreign_dataset, f) for f in fk_dict["referred_columns"] ] diff --git a/metadata-ingestion/tests/unit/test_sql_common.py 
b/metadata-ingestion/tests/unit/test_sql_common.py index a98bf641711220..cfb8f55bd977f7 100644 --- a/metadata-ingestion/tests/unit/test_sql_common.py +++ b/metadata-ingestion/tests/unit/test_sql_common.py @@ -38,7 +38,7 @@ def test_generate_foreign_key(): "referred_columns": ["test_referred_column"], # type: ignore } foreign_key = source.get_foreign_key_metadata( - dataset_urn="test_urn", + dataset_urn="urn:li:dataset:(urn:li:dataPlatform:TEST,test_schema.base_urn,PROD)", schema="test_schema", fk_dict=fk_dict, inspector=mock.Mock(), @@ -48,7 +48,9 @@ def test_generate_foreign_key(): assert [ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:TEST,test_referred_schema.test_table,PROD),test_referred_column)" ] == foreign_key.foreignFields - assert ["urn:li:schemaField:(test_urn,test_column)"] == foreign_key.sourceFields + assert [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:TEST,test_schema.base_urn,PROD),test_column)" + ] == foreign_key.sourceFields def test_use_source_schema_for_foreign_key_if_not_specified(): @@ -60,7 +62,7 @@ def test_use_source_schema_for_foreign_key_if_not_specified(): "referred_columns": ["test_referred_column"], # type: ignore } foreign_key = source.get_foreign_key_metadata( - dataset_urn="test_urn", + dataset_urn="urn:li:dataset:(urn:li:dataPlatform:TEST,test_schema.base_urn,PROD)", schema="test_schema", fk_dict=fk_dict, inspector=mock.Mock(), @@ -70,7 +72,9 @@ def test_use_source_schema_for_foreign_key_if_not_specified(): assert [ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:TEST,test_schema.test_table,PROD),test_referred_column)" ] == foreign_key.foreignFields - assert ["urn:li:schemaField:(test_urn,test_column)"] == foreign_key.sourceFields + assert [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:TEST,test_schema.base_urn,PROD),test_column)" + ] == foreign_key.sourceFields PLATFORM_FROM_SQLALCHEMY_URI_TEST_CASES: Dict[str, str] = { From 05c99abb4a075e0568999bf7c3f4f9b4ee4a6497 Mon Sep 17 00:00:00 2001 From: kevinkarchacryl Date: Thu, 21 Nov 2024 09:20:04 -0500 Subject: [PATCH 21/24] chore: update contributors list (#11915) --- .github/workflows/pr-labeler.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/pr-labeler.yml b/.github/workflows/pr-labeler.yml index 1ae3edae7aa90b..7da20ece44f6d6 100644 --- a/.github/workflows/pr-labeler.yml +++ b/.github/workflows/pr-labeler.yml @@ -45,7 +45,8 @@ jobs: "Salman-Apptware", "mayurinehate", "noggi", - "skrydal" + "skrydal", + "kevinkarchacryl" ]'), github.actor ) From 98544aaa6a1dc85dab857a247412114c58d8ba2c Mon Sep 17 00:00:00 2001 From: Pinaki Bhattacharjee Date: Thu, 21 Nov 2024 23:01:04 +0530 Subject: [PATCH 22/24] fix(ts): Suppress ts errors on Editor.tsx (#11275) --- .../Documentation/components/editor/Editor.tsx | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/datahub-web-react/src/app/entity/shared/tabs/Documentation/components/editor/Editor.tsx b/datahub-web-react/src/app/entity/shared/tabs/Documentation/components/editor/Editor.tsx index fe2a8c51f9377b..8ee0f637094d6d 100644 --- a/datahub-web-react/src/app/entity/shared/tabs/Documentation/components/editor/Editor.tsx +++ b/datahub-web-react/src/app/entity/shared/tabs/Documentation/components/editor/Editor.tsx @@ -50,26 +50,26 @@ export const Editor = forwardRef((props: EditorProps, ref) => { const { manager, state, getContext } = useRemirror({ extensions: () => [ new BlockquoteExtension(), - new BoldExtension(), - new BulletListExtension(), + 
new BoldExtension({}), + new BulletListExtension({}), new CodeBlockExtension({ syntaxTheme: 'base16_ateliersulphurpool_light' }), new CodeExtension(), - new DataHubMentionsExtension(), - new DropCursorExtension(), + new DataHubMentionsExtension({}), + new DropCursorExtension({}), new HardBreakExtension(), - new HeadingExtension(), - new HistoryExtension(), - new HorizontalRuleExtension(), + new HeadingExtension({}), + new HistoryExtension({}), + new HorizontalRuleExtension({}), new ImageExtension({ enableResizing: !readOnly }), new ItalicExtension(), new LinkExtension({ autoLink: true, defaultTarget: '_blank' }), - new ListItemExtension(), + new ListItemExtension({}), new MarkdownExtension({ htmlSanitizer: DOMPurify.sanitize, htmlToMarkdown, markdownToHtml }), new OrderedListExtension(), new UnderlineExtension(), new StrikeExtension(), new TableExtension({ resizable: false }), - ...(readOnly ? [] : [new HistoryExtension()]), + ...(readOnly ? [] : [new HistoryExtension({})]), ], content, stringHandler: 'markdown', From 87ee6747d42a8ddf4914d11ee58f7442610b0ea1 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 21 Nov 2024 11:37:53 -0600 Subject: [PATCH 23/24] chore(deps): bump cross-spawn from 7.0.3 to 7.0.6 in /smoke-test/tests/cypress (#11890) Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- smoke-test/tests/cypress/yarn.lock | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/smoke-test/tests/cypress/yarn.lock b/smoke-test/tests/cypress/yarn.lock index 2433e9f8fae08e..c6116609b11467 100644 --- a/smoke-test/tests/cypress/yarn.lock +++ b/smoke-test/tests/cypress/yarn.lock @@ -510,9 +510,9 @@ core-util-is@1.0.2: integrity sha1-tf1UIgqivFq1eqtxQMlAdUUDwac= cross-spawn@^7.0.0, cross-spawn@^7.0.2: - version "7.0.3" - resolved "https://registry.npmjs.org/cross-spawn/-/cross-spawn-7.0.3.tgz" - integrity sha512-iRDPJKUPVEND7dHPO8rkbOnPpyDygcDFtWjpeWNCgy8WP2rXcxXL8TskReQl6OrB2G7+UJrags1q15Fudc7G6w== + version "7.0.6" + resolved "https://registry.yarnpkg.com/cross-spawn/-/cross-spawn-7.0.6.tgz#8a58fe78f00dcd70c370451759dfbfaf03e8ee9f" + integrity sha512-uV2QOWP2nWzsy2aMp8aRibhi9dlzF5Hgh5SHaB9OiTGEyDTiJJyx0uy51QXdyWbtAHNua4XJzUKca3OzKUd3vA== dependencies: path-key "^3.1.0" shebang-command "^2.0.0" From b5d5db3fbb7fd9ed4f6b49458c628b2dd513195e Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 21 Nov 2024 13:33:10 -0600 Subject: [PATCH 24/24] chore(deps): bump cross-spawn from 7.0.3 to 7.0.6 in /docs-website (#11919) Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- docs-website/yarn.lock | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/docs-website/yarn.lock b/docs-website/yarn.lock index 4e457abd50af7a..9c82b27c3b61f3 100644 --- a/docs-website/yarn.lock +++ b/docs-website/yarn.lock @@ -1827,7 +1827,7 @@ "@docusaurus/theme-search-algolia" "2.4.3" "@docusaurus/types" "2.4.3" -"@docusaurus/react-loadable@5.5.2", "react-loadable@npm:@docusaurus/react-loadable@5.5.2": +"@docusaurus/react-loadable@5.5.2": version "5.5.2" resolved "https://registry.yarnpkg.com/@docusaurus/react-loadable/-/react-loadable-5.5.2.tgz#81aae0db81ecafbdaee3651f12804580868fa6ce" integrity sha512-A3dYjdBGuy0IGT+wyLIGIKLRE+sAk1iNk0f1HjNDysO7u8lhL4N3VEm+FAubmJbAztn94F7MxBTPmnixbiyFdQ== @@ -4757,9 +4757,9 @@ 
cross-fetch@^3.1.5: node-fetch "^2.6.12" cross-spawn@^7.0.3: - version "7.0.3" - resolved "https://registry.yarnpkg.com/cross-spawn/-/cross-spawn-7.0.3.tgz#f73a85b9d5d41d045551c177e2882d4ac85728a6" - integrity sha512-iRDPJKUPVEND7dHPO8rkbOnPpyDygcDFtWjpeWNCgy8WP2rXcxXL8TskReQl6OrB2G7+UJrags1q15Fudc7G6w== + version "7.0.6" + resolved "https://registry.yarnpkg.com/cross-spawn/-/cross-spawn-7.0.6.tgz#8a58fe78f00dcd70c370451759dfbfaf03e8ee9f" + integrity sha512-uV2QOWP2nWzsy2aMp8aRibhi9dlzF5Hgh5SHaB9OiTGEyDTiJJyx0uy51QXdyWbtAHNua4XJzUKca3OzKUd3vA== dependencies: path-key "^3.1.0" shebang-command "^2.0.0" @@ -9713,6 +9713,14 @@ react-loadable-ssr-addon-v5-slorber@^1.0.1: dependencies: "@babel/runtime" "^7.10.3" +"react-loadable@npm:@docusaurus/react-loadable@5.5.2": + version "5.5.2" + resolved "https://registry.yarnpkg.com/@docusaurus/react-loadable/-/react-loadable-5.5.2.tgz#81aae0db81ecafbdaee3651f12804580868fa6ce" + integrity sha512-A3dYjdBGuy0IGT+wyLIGIKLRE+sAk1iNk0f1HjNDysO7u8lhL4N3VEm+FAubmJbAztn94F7MxBTPmnixbiyFdQ== + dependencies: + "@types/react" "*" + prop-types "^15.6.2" + react-markdown@^8.0.6: version "8.0.7" resolved "https://registry.yarnpkg.com/react-markdown/-/react-markdown-8.0.7.tgz#c8dbd1b9ba5f1c5e7e5f2a44de465a3caafdf89b"
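
PATCH 16/24 above introduces the `ProgressTimer` helper and wires it into the BigQuery queries extractor. For readers skimming the diff, the sketch below shows the intended usage pattern in isolation: a tight loop that logs progress at most once per interval. Only `ProgressTimer(report_every=...)` and `should_report()` come from the patch; the item source, `do_work` callable, and logger name are hypothetical stand-ins.

```python
import logging
from datetime import timedelta

from datahub.utilities.progress_timer import ProgressTimer

logger = logging.getLogger(__name__)


def process_items(items, do_work):
    # Log progress at most once every 30 seconds, no matter how fast the loop iterates.
    timer = ProgressTimer(report_every=timedelta(seconds=30))

    for i, item in enumerate(items):
        do_work(item)  # hypothetical per-item processing

        if timer.should_report():
            logger.info(f"Processed {i + 1} items so far")
```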
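
PATCH 17/24 above documents the EBS CSI driver add-on and the AmazonEBSCSIDriverPolicy attachment as AWS console steps. For teams that script their cluster setup, the following is a rough boto3 sketch of the same two steps; it is not part of the DataHub docs, the cluster and role names are placeholders, and you should verify the add-on name and policy ARN against the AWS documentation for your EKS version.

```python
import boto3

CLUSTER_NAME = "datahub-eks"              # placeholder: your EKS cluster name
NODE_ROLE_NAME = "datahub-eks-node-role"  # placeholder: IAM role used by your EKS node group

# Step 1: install the EBS CSI driver as an EKS add-on
# (Core DNS and the VPC CNI plugin are installed by default, per the updated guide).
eks = boto3.client("eks")
eks.create_addon(
    clusterName=CLUSTER_NAME,
    addonName="aws-ebs-csi-driver",
)

# Step 2: attach the AmazonEBSCSIDriverPolicy managed policy to the node group's IAM role.
iam = boto3.client("iam")
iam.attach_role_policy(
    RoleName=NODE_ROLE_NAME,
    PolicyArn="arn:aws:iam::aws:policy/service-role/AmazonEBSCSIDriverPolicy",
)
```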