From becf51575e3be279951a9a97801ac8998218c688 Mon Sep 17 00:00:00 2001
From: Mayuri Nehate <33225191+mayurinehate@users.noreply.github.com>
Date: Wed, 11 Dec 2024 18:21:58 +0530
Subject: [PATCH 1/3] docs(ingest/athena): update recipe with aws key pair example (#12076)

---
 datahub-web-react/src/app/ingest/source/builder/sources.json | 2 +-
 metadata-ingestion/docs/sources/athena/athena_recipe.yml     | 5 +++++
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/datahub-web-react/src/app/ingest/source/builder/sources.json b/datahub-web-react/src/app/ingest/source/builder/sources.json
index 776b6703895c3..102cce0f491e3 100644
--- a/datahub-web-react/src/app/ingest/source/builder/sources.json
+++ b/datahub-web-react/src/app/ingest/source/builder/sources.json
@@ -181,7 +181,7 @@
     "displayName": "Athena",
     "description": "Import Schemas, Tables, Views, and lineage to S3 from Athena.",
     "docsUrl": "https://datahubproject.io/docs/generated/ingestion/sources/athena/",
-    "recipe": "source:\n    type: athena\n    config:\n        # Coordinates\n        aws_region: my_aws_region\n        work_group: primary\n\n        # Options\n        s3_staging_dir: \"s3://my_staging_athena_results_bucket/results/\""
+    "recipe": "source:\n    type: athena\n    config:\n        # AWS Keys (Optional - Required only if local aws credentials are not set)\n        username: aws_access_key_id\n        password: aws_secret_access_key\n        # Coordinates\n        aws_region: my_aws_region\n        work_group: primary\n\n        # Options\n        s3_staging_dir: \"s3://my_staging_athena_results_bucket/results/\""
   },
   {
     "urn": "urn:li:dataPlatform:clickhouse",

diff --git a/metadata-ingestion/docs/sources/athena/athena_recipe.yml b/metadata-ingestion/docs/sources/athena/athena_recipe.yml
index 540d8101737a3..c93047ffed9ff 100644
--- a/metadata-ingestion/docs/sources/athena/athena_recipe.yml
+++ b/metadata-ingestion/docs/sources/athena/athena_recipe.yml
@@ -1,6 +1,11 @@
 source:
   type: athena
   config:
+
+    # AWS Keys (Optional - Required only if local aws credentials are not set)
+    username: my_aws_access_key_id
+    password: my_aws_secret_access_key
+
     # Coordinates
     aws_region: my_aws_region
     work_group: primary

From 54d53cfc625a1583fbd5e93b4b6bfff55b9897c6 Mon Sep 17 00:00:00 2001
From: Aseem Bansal
Date: Wed, 11 Dec 2024 18:24:18 +0530
Subject: [PATCH 2/3] fix(ingest/gc): minor tweak gc source (#12093)

---
 .../src/datahub/ingestion/source/gc/datahub_gc.py      | 10 +++++-----
 .../ingestion/source/gc/soft_deleted_entity_cleanup.py |  2 +-
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/metadata-ingestion/src/datahub/ingestion/source/gc/datahub_gc.py b/metadata-ingestion/src/datahub/ingestion/source/gc/datahub_gc.py
index 52807ca2a3f02..814f65ecb45cf 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/gc/datahub_gc.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/gc/datahub_gc.py
@@ -153,11 +153,6 @@ def get_workunits_internal(
                 self.truncate_indices()
             except Exception as e:
                 self.report.failure("While trying to truncate indices ", exc=e)
-        if self.dataprocess_cleanup:
-            try:
-                yield from self.dataprocess_cleanup.get_workunits_internal()
-            except Exception as e:
-                self.report.failure("While trying to cleanup data process ", exc=e)
         if self.soft_deleted_entities_cleanup:
             try:
                 self.soft_deleted_entities_cleanup.cleanup_soft_deleted_entities()
@@ -170,6 +165,11 @@ def get_workunits_internal(
                 self.execution_request_cleanup.run()
             except Exception as e:
                 self.report.failure("While trying to cleanup execution request ", exc=e)
+        if self.dataprocess_cleanup:
+            try:
+                yield from self.dataprocess_cleanup.get_workunits_internal()
+            except Exception as e:
+                self.report.failure("While trying to cleanup data process ", exc=e)
         yield from []

     def truncate_indices(self) -> None:

diff --git a/metadata-ingestion/src/datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py b/metadata-ingestion/src/datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py
index 3b367cdea5813..bb4ab753543b7 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py
@@ -60,7 +60,7 @@ class SoftDeletedEntitiesCleanupConfig(ConfigModel):
         description="Query to filter entities",
     )
     limit_entities_delete: Optional[int] = Field(
-        10000, description="Max number of entities to delete."
+        25000, description="Max number of entities to delete."
    )

     runtime_limit_seconds: Optional[int] = Field(

From e6cc676b23e06da184c3324b8331acd994b112b1 Mon Sep 17 00:00:00 2001
From: Jonny Dixon <45681293+acrylJonny@users.noreply.github.com>
Date: Wed, 11 Dec 2024 12:54:47 +0000
Subject: [PATCH 3/3] fix(ingest/abs): detect jsonl schema (#11775)

---
 metadata-ingestion/src/datahub/ingestion/source/abs/source.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/metadata-ingestion/src/datahub/ingestion/source/abs/source.py b/metadata-ingestion/src/datahub/ingestion/source/abs/source.py
index 66f268799b2f1..ad2bc36cf558b 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/abs/source.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/abs/source.py
@@ -201,6 +201,10 @@ def get_fields(self, table_data: TableData, path_spec: PathSpec) -> List:
             ).infer_schema(file)
         elif extension == ".json":
             fields = json.JsonInferrer().infer_schema(file)
+        elif extension == ".jsonl":
+            fields = json.JsonInferrer(
+                max_rows=self.source_config.max_rows, format="jsonl"
+            ).infer_schema(file)
         elif extension == ".avro":
             fields = avro.AvroInferrer().infer_schema(file)
         else:
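
Note on PATCH 3/3: get_fields dispatches each file to a schema inferrer by extension, and ".jsonl" previously fell through to the else branch because JSON Lines cannot be parsed as a single JSON document. Below is a minimal, self-contained sketch of per-line schema inference in the same spirit; it is illustrative only, and infer_jsonl_fields, its max_rows cap, and the sample records are assumptions for this sketch, not DataHub's actual JsonInferrer API.

import io
import json
from typing import IO, List


def infer_jsonl_fields(file: IO[str], max_rows: int = 100) -> List[str]:
    # JSON Lines holds one JSON object per line, so parse line by line
    # instead of loading the whole file as one JSON document.
    seen = {}
    for i, line in enumerate(file):
        if i >= max_rows:  # sample only the first max_rows records
            break
        line = line.strip()
        if line:
            seen.update(json.loads(line))
    # Union of keys observed across the sampled records.
    return sorted(seen)


sample = io.StringIO('{"id": 1, "name": "a"}\n{"id": 2, "ts": "2024-12-11"}\n')
print(infer_jsonl_fields(sample))  # ['id', 'name', 'ts']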