diff --git a/metadata-ingestion/setup.cfg b/metadata-ingestion/setup.cfg index 057779bc87c62..147ac6cf50551 100644 --- a/metadata-ingestion/setup.cfg +++ b/metadata-ingestion/setup.cfg @@ -86,7 +86,7 @@ disallow_untyped_defs = yes [tool:pytest] asyncio_mode = auto -addopts = --cov=src --cov-report= --cov-config setup.cfg --strict-markers -p no:faker +; addopts = --cov=src --cov-report= --cov-config setup.cfg --strict-markers -p no:faker markers = slow: marks tests that are slow to run, including all docker-based tests (deselect with '-m not slow') integration: marks all integration tests, across all batches (deselect with '-m "not integration"') diff --git a/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py b/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py index e14e875ab6a28..2e795ebfe2cd2 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py +++ b/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py @@ -5,7 +5,6 @@ from dataclasses import dataclass, field as dataclass_field from functools import lru_cache from typing import ( - TYPE_CHECKING, Any, DefaultDict, Dict, @@ -22,6 +21,7 @@ import botocore.exceptions import yaml +from mypy_boto3_glue.type_defs import DatabasePaginatorTypeDef, TablePaginatorTypeDef from pydantic import validator from pydantic.fields import Field @@ -115,12 +115,6 @@ from datahub.utilities.delta import delta_type_to_hive_type from datahub.utilities.hive_schema_to_avro import get_schema_fields_for_hive_column -if TYPE_CHECKING: - from mypy_boto3_glue.type_defs import ( - DatabasePaginatorTypeDef, - TablePaginatorTypeDef, - ) - logger = logging.getLogger(__name__) DEFAULT_PLATFORM = "glue" @@ -234,9 +228,10 @@ def platform_validator(cls, v: str) -> str: f"'platform' can only take following values: {VALID_PLATFORMS}" ) - def __post_init__(self) -> None: - current_account_id = self.sts_client.get_caller_identity().get("Account") + def __init__(self, **data: Any): + super().__init__(**data) if self.catalog_id: + current_account_id = self.sts_client.get_caller_identity().get("Account") if self.catalog_id == current_account_id: self.catalog_name = DEFAULT_CATALOG_NAME else: @@ -1142,7 +1137,7 @@ def _gen_table_wu(self, table: TablePaginatorTypeDef) -> Iterable[MetadataWorkUn platform_instance=self.source_config.platform_instance, ) - mce = self._extract_record(dataset_urn, table, full_table_name) + mce = self._extract_record(dataset_urn, dict(table), full_table_name) yield MetadataWorkUnit(full_table_name, mce=mce) # We also want to assign "table" subType to the dataset representing glue table - unfortunately it is not diff --git a/metadata-ingestion/tests/unit/glue/glue_mces_golden.json b/metadata-ingestion/tests/unit/glue/glue_mces_golden.json index 71d7c31b222bd..2372d1e77975d 100644 --- a/metadata-ingestion/tests/unit/glue/glue_mces_golden.json +++ b/metadata-ingestion/tests/unit/glue/glue_mces_golden.json @@ -167,7 +167,7 @@ { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:glue,flights-database.avro,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:glue,awsdatacatalog.flights-database.avro,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.common.Status": { @@ -205,7 +205,7 @@ }, { "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "flights-database.avro", + "schemaName": "awsdatacatalog.flights-database.avro", "platform": "urn:li:dataPlatform:glue", "version": 0, "created": { @@ -370,7 +370,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,flights-database.avro,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,awsdatacatalog.flights-database.avro,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -383,7 +383,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,flights-database.avro,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,awsdatacatalog.flights-database.avro,PROD)", "changeType": "UPSERT", "aspectName": "container", "aspect": { @@ -395,7 +395,7 @@ { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:glue,test-database.test_jsons_markers,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:glue,awsdatacatalog.test-database.test_jsons_markers,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.common.Status": { @@ -432,7 +432,7 @@ }, { "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "test-database.test_jsons_markers", + "schemaName": "awsdatacatalog.test-database.test_jsons_markers", "platform": "urn:li:dataPlatform:glue", "version": 0, "created": { @@ -555,7 +555,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,test-database.test_jsons_markers,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,awsdatacatalog.test-database.test_jsons_markers,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -568,7 +568,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,test-database.test_jsons_markers,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,awsdatacatalog.test-database.test_jsons_markers,PROD)", "changeType": "UPSERT", "aspectName": "container", "aspect": { @@ -580,7 +580,7 @@ { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:glue,test-database.test_parquet,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:glue,awsdatacatalog.test-database.test_parquet,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.common.Status": { @@ -617,7 +617,7 @@ }, { "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "test-database.test_parquet", + "schemaName": "awsdatacatalog.test-database.test_parquet", "platform": "urn:li:dataPlatform:glue", "version": 0, "created": { @@ -741,7 +741,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,test-database.test_parquet,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,awsdatacatalog.test-database.test_parquet,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -754,7 +754,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,test-database.test_parquet,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,awsdatacatalog.test-database.test_parquet,PROD)", "changeType": "UPSERT", "aspectName": "container", "aspect": { @@ -896,7 +896,7 @@ { "com.linkedin.pegasus2avro.datajob.DataJobInputOutput": { "inputDatasets": [ - "urn:li:dataset:(urn:li:dataPlatform:glue,flights-database.avro,PROD)" + "urn:li:dataset:(urn:li:dataPlatform:glue,awsdatacatalog.flights-database.avro,PROD)" ], "outputDatasets": [], "inputDatajobs": [] @@ -1085,7 +1085,7 @@ { "com.linkedin.pegasus2avro.datajob.DataJobInputOutput": { "inputDatasets": [ - "urn:li:dataset:(urn:li:dataPlatform:glue,test-database.test_parquet,PROD)" + "urn:li:dataset:(urn:li:dataPlatform:glue,awsdatacatalog.test-database.test_parquet,PROD)" ], "outputDatasets": [], "inputDatajobs": [] diff --git a/metadata-ingestion/tests/unit/glue/glue_mces_platform_instance_golden.json b/metadata-ingestion/tests/unit/glue/glue_mces_platform_instance_golden.json index b700335c26e5a..d041fbc9eedbe 100644 --- a/metadata-ingestion/tests/unit/glue/glue_mces_platform_instance_golden.json +++ b/metadata-ingestion/tests/unit/glue/glue_mces_platform_instance_golden.json @@ -173,7 +173,7 @@ { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:glue,some_instance_name.flights-database.avro,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:glue,some_instance_name.awsdatacatalog.flights-database.avro,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.common.Status": { @@ -211,7 +211,7 @@ }, { "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "flights-database.avro", + "schemaName": "awsdatacatalog.flights-database.avro", "platform": "urn:li:dataPlatform:glue", "version": 0, "created": { @@ -377,7 +377,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,some_instance_name.flights-database.avro,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,some_instance_name.awsdatacatalog.flights-database.avro,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -390,7 +390,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,some_instance_name.flights-database.avro,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,some_instance_name.awsdatacatalog.flights-database.avro,PROD)", "changeType": "UPSERT", "aspectName": "container", "aspect": { @@ -402,7 +402,7 @@ { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:glue,some_instance_name.test-database.test_jsons_markers,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:glue,some_instance_name.awsdatacatalog.test-database.test_jsons_markers,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.common.Status": { @@ -439,7 +439,7 @@ }, { "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "test-database.test_jsons_markers", + "schemaName": "awsdatacatalog.test-database.test_jsons_markers", "platform": "urn:li:dataPlatform:glue", "version": 0, "created": { @@ -563,7 +563,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,some_instance_name.test-database.test_jsons_markers,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,some_instance_name.awsdatacatalog.test-database.test_jsons_markers,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -576,7 +576,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,some_instance_name.test-database.test_jsons_markers,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,some_instance_name.awsdatacatalog.test-database.test_jsons_markers,PROD)", "changeType": "UPSERT", "aspectName": "container", "aspect": { @@ -588,7 +588,7 @@ { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:glue,some_instance_name.test-database.test_parquet,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:glue,some_instance_name.awsdatacatalog.test-database.test_parquet,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.common.Status": { @@ -625,7 +625,7 @@ }, { "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "test-database.test_parquet", + "schemaName": "awsdatacatalog.test-database.test_parquet", "platform": "urn:li:dataPlatform:glue", "version": 0, "created": { @@ -750,7 +750,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,some_instance_name.test-database.test_parquet,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,some_instance_name.awsdatacatalog.test-database.test_parquet,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -763,7 +763,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,some_instance_name.test-database.test_parquet,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,some_instance_name.awsdatacatalog.test-database.test_parquet,PROD)", "changeType": "UPSERT", "aspectName": "container", "aspect": { @@ -905,7 +905,7 @@ { "com.linkedin.pegasus2avro.datajob.DataJobInputOutput": { "inputDatasets": [ - "urn:li:dataset:(urn:li:dataPlatform:glue,some_instance_name.flights-database.avro,PROD)" + "urn:li:dataset:(urn:li:dataPlatform:glue,some_instance_name.awsdatacatalog.flights-database.avro,PROD)" ], "outputDatasets": [], "inputDatajobs": [] @@ -1094,7 +1094,7 @@ { "com.linkedin.pegasus2avro.datajob.DataJobInputOutput": { "inputDatasets": [ - "urn:li:dataset:(urn:li:dataPlatform:glue,some_instance_name.test-database.test_parquet,PROD)" + "urn:li:dataset:(urn:li:dataPlatform:glue,some_instance_name.awsdatacatalog.test-database.test_parquet,PROD)" ], "outputDatasets": [], "inputDatajobs": [] diff --git a/metadata-ingestion/tests/unit/glue/test_glue_source.py b/metadata-ingestion/tests/unit/glue/test_glue_source.py index 9e3f260a23f1c..16170b7ab3ecf 100644 --- a/metadata-ingestion/tests/unit/glue/test_glue_source.py +++ b/metadata-ingestion/tests/unit/glue/test_glue_source.py @@ -7,6 +7,7 @@ import pytest from botocore.stub import Stubber from freezegun import freeze_time +from moto import mock_athena, mock_sts import datahub.metadata.schema_classes as models from datahub.ingestion.api.common import PipelineContext @@ -256,6 +257,8 @@ def test_glue_ingest( ) +@mock_athena +@mock_sts def test_platform_config(): source = GlueSource( ctx=PipelineContext(run_id="glue-source-test"),