From 124e2006e11e3f94f131187c848877d65bb810b2 Mon Sep 17 00:00:00 2001 From: Aseem Bansal Date: Tue, 7 Jan 2025 16:42:08 +0530 Subject: [PATCH 01/14] ci: fix shellcheck warnings, update actions (#12281) --- .github/workflows/build-and-test.yml | 4 ++-- .github/workflows/close-stale-issues.yml | 2 +- .github/workflows/contributor-open-pr-comment.yml | 6 +++--- .github/workflows/docker-unified.yml | 6 +++--- .github/workflows/metadata-io.yml | 4 ++-- .github/workflows/spark-smoke-test.yml | 4 ++-- 6 files changed, 13 insertions(+), 13 deletions(-) diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml index 784dce0f11b2b5..0cca80c8fdf982 100644 --- a/.github/workflows/build-and-test.yml +++ b/.github/workflows/build-and-test.yml @@ -113,7 +113,7 @@ jobs: if: ${{ matrix.command == 'except_metadata_ingestion' && needs.setup.outputs.backend_change == 'true' }} run: | ./gradlew -PjavaClassVersionDefault=8 :metadata-integration:java:spark-lineage:compileJava - - uses: actions/upload-artifact@v3 + - uses: actions/upload-artifact@v4 if: always() with: name: Test Results (build) @@ -152,7 +152,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Upload - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: Event File path: ${{ github.event_path }} diff --git a/.github/workflows/close-stale-issues.yml b/.github/workflows/close-stale-issues.yml index 98e3041f288040..005f41b767ea6d 100644 --- a/.github/workflows/close-stale-issues.yml +++ b/.github/workflows/close-stale-issues.yml @@ -10,7 +10,7 @@ jobs: issues: write pull-requests: write steps: - - uses: actions/stale@v6 + - uses: actions/stale@v9 with: ascending: true operations-per-run: 100 diff --git a/.github/workflows/contributor-open-pr-comment.yml b/.github/workflows/contributor-open-pr-comment.yml index decc7ab27a411d..fe60601b0159bd 100644 --- a/.github/workflows/contributor-open-pr-comment.yml +++ b/.github/workflows/contributor-open-pr-comment.yml @@ -17,12 +17,12 @@ jobs: - name: Get and Format Username (PR only) if: github.event_name == 'pull_request' run: | - formatted_username=$(echo "${{ github.event.pull_request.user.login }}" | tr '[:upper:]' '[:lower:]' | sed 's/ /-/g') - echo "FORMATTED_USERNAME=$formatted_username" >> $GITHUB_ENV + formatted_username="$(echo "${{ github.event.pull_request.user.login }}" | tr '[:upper:]' '[:lower:]' | sed 's/ /-/g')" + echo "FORMATTED_USERNAME=${formatted_username}" >> "$GITHUB_ENV" - name: Create Comment (PR only) if: github.event_name == 'pull_request' - uses: actions/github-script@v6 + uses: actions/github-script@v7 with: script: | if (context.payload.pull_request) { diff --git a/.github/workflows/docker-unified.yml b/.github/workflows/docker-unified.yml index a5200c7e917d81..e44e6b11c6d057 100644 --- a/.github/workflows/docker-unified.yml +++ b/.github/workflows/docker-unified.yml @@ -1253,19 +1253,19 @@ jobs: TEST_STRATEGY="-${{ matrix.test_strategy }}-${{ matrix.batch }}" source .github/scripts/docker_logs.sh - name: Upload logs - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 if: failure() with: name: docker-logs-${{ matrix.test_strategy }}-${{ matrix.batch }} path: "docker_logs/*.log" retention-days: 5 - name: Upload screenshots - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 if: failure() with: name: cypress-snapshots-${{ matrix.test_strategy }}-${{ matrix.batch }} path: smoke-test/tests/cypress/cypress/screenshots/ - - uses: actions/upload-artifact@v3 + - uses: actions/upload-artifact@v4 
+      - uses: actions/upload-artifact@v4
if: always() with: name: Test Results (smoke tests) ${{ matrix.test_strategy }} ${{ matrix.batch }} diff --git a/.github/workflows/metadata-io.yml b/.github/workflows/metadata-io.yml index 2225baecde64c6..aedcd9257d83ba 100644 --- a/.github/workflows/metadata-io.yml +++ b/.github/workflows/metadata-io.yml @@ -70,7 +70,7 @@ jobs: - name: Gradle build (and test) run: | ./gradlew :metadata-io:test - - uses: actions/upload-artifact@v3 + - uses: actions/upload-artifact@v4 if: always() with: name: Test Results (metadata-io) @@ -95,7 +95,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Upload - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: Event File path: ${{ github.event_path }} diff --git a/.github/workflows/spark-smoke-test.yml b/.github/workflows/spark-smoke-test.yml index 23413336404f2b..e6a6705a72879c 100644 --- a/.github/workflows/spark-smoke-test.yml +++ b/.github/workflows/spark-smoke-test.yml @@ -72,14 +72,14 @@ jobs: docker logs elasticsearch >& elasticsearch-${{ matrix.test_strategy }}.log || true docker logs datahub-frontend-react >& frontend-${{ matrix.test_strategy }}.log || true - name: Upload logs - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 if: failure() with: name: docker logs path: | "**/build/container-logs/*.log" "*.log" - - uses: actions/upload-artifact@v3 + - uses: actions/upload-artifact@v4 if: always() with: name: Test Results (smoke tests) From f940c70c73acedc625257eb2a1a4aa5164738c02 Mon Sep 17 00:00:00 2001 From: skrydal Date: Tue, 7 Jan 2025 14:34:09 +0100 Subject: [PATCH 02/14] docs(business attribute): clarify support (#12260) --- docs/businessattributes.md | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/docs/businessattributes.md b/docs/businessattributes.md index 3e912e7e609805..2359c2ac85b585 100644 --- a/docs/businessattributes.md +++ b/docs/businessattributes.md @@ -1,5 +1,10 @@ +import FeatureAvailability from '@site/src/components/FeatureAvailability'; + # Business Attributes + + +>**Note:** This is BETA feature ## What are Business Attributes A Business Attribute, as its name implies, is an attribute with a business focus. It embodies the traits or properties of an entity within a business framework. This attribute is a crucial piece of data for a business, utilised to define or control the entity throughout the organisation. If a business process or concept is depicted as a comprehensive logical model, then each Business Attribute can be considered as an individual component within that model. While business names and descriptions are generally managed through glossary terms, Business Attributes encompass additional characteristics such as data quality rules/assertions, data privacy markers, data usage protocols, standard tags, and supplementary documentation, alongside Names and Descriptions. @@ -70,9 +75,11 @@ Description inherited from business attribute is greyed out to differentiate bet
ween original and inherited description

### Enable Business Attributes Feature -By default, business attribute is disabled. To enable Business Attributes feature, set the following configuration in [application.yaml](../metadata-service/configuration/src/main/resources/application.yaml) - -businessAttributeEntityEnabled : true +By default, business attribute is disabled. To enable Business Attributes feature, export environmental variable +(may be done via `extraEnvs` for GMS deployment): +```shell +BUSINESS_ATTRIBUTE_ENTITY_ENABLED=true +``` ### What updates are planned for the Business Attributes feature? From 03e3f46175df71b83f2c3adcffd97a9962747698 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Tue, 7 Jan 2025 10:46:35 -0500 Subject: [PATCH 03/14] fix(airflow): fix tests with Airflow 2.4 (#12279) --- metadata-ingestion-modules/airflow-plugin/setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/metadata-ingestion-modules/airflow-plugin/setup.py b/metadata-ingestion-modules/airflow-plugin/setup.py index 2693aab0700da3..d07063dbffc5c4 100644 --- a/metadata-ingestion-modules/airflow-plugin/setup.py +++ b/metadata-ingestion-modules/airflow-plugin/setup.py @@ -119,6 +119,7 @@ def get_long_description(): "pendulum<3.0", "Flask-Session<0.6.0", "connexion<3.0", + "marshmallow<3.24.0", }, } From afa94a588754c28c8e11f5aa0963808ba5ee6599 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Tue, 7 Jan 2025 17:00:13 -0500 Subject: [PATCH 04/14] fix(ingest): better correctness on the emitter -> graph conversion (#12272) --- .../src/datahub/cli/cli_utils.py | 11 +- .../src/datahub/emitter/rest_emitter.py | 209 +++++++++++------- .../src/datahub/ingestion/graph/client.py | 25 ++- .../src/datahub/ingestion/graph/config.py | 2 +- .../tests/unit/sdk/test_rest_emitter.py | 32 +-- 5 files changed, 167 insertions(+), 112 deletions(-) diff --git a/metadata-ingestion/src/datahub/cli/cli_utils.py b/metadata-ingestion/src/datahub/cli/cli_utils.py index f80181192ba583..ca4a11b41925e5 100644 --- a/metadata-ingestion/src/datahub/cli/cli_utils.py +++ b/metadata-ingestion/src/datahub/cli/cli_utils.py @@ -3,7 +3,7 @@ import time import typing from datetime import datetime -from typing import Any, Dict, List, Optional, Tuple, Type, Union +from typing import Any, Dict, List, Optional, Tuple, Type, TypeVar, Union import click import requests @@ -33,6 +33,15 @@ def first_non_null(ls: List[Optional[str]]) -> Optional[str]: return next((el for el in ls if el is not None and el.strip() != ""), None) +_T = TypeVar("_T") + + +def get_or_else(value: Optional[_T], default: _T) -> _T: + # Normally we'd use `value or default`. However, that runs into issues + # when value is falsey but not None. 
+ return value if value is not None else default + + def parse_run_restli_response(response: requests.Response) -> dict: response_json = response.json() if response.status_code != 200: diff --git a/metadata-ingestion/src/datahub/emitter/rest_emitter.py b/metadata-ingestion/src/datahub/emitter/rest_emitter.py index 7c67349c74db10..74b8ade7da445b 100644 --- a/metadata-ingestion/src/datahub/emitter/rest_emitter.py +++ b/metadata-ingestion/src/datahub/emitter/rest_emitter.py @@ -1,9 +1,21 @@ +from __future__ import annotations + import functools import json import logging import os from json.decoder import JSONDecodeError -from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Sequence, Union +from typing import ( + TYPE_CHECKING, + Any, + Callable, + Dict, + List, + Optional, + Sequence, + Tuple, + Union, +) import requests from deprecated import deprecated @@ -12,9 +24,13 @@ from datahub import nice_version_name from datahub.cli import config_utils -from datahub.cli.cli_utils import ensure_has_system_metadata, fixup_gms_url +from datahub.cli.cli_utils import ensure_has_system_metadata, fixup_gms_url, get_or_else from datahub.cli.env_utils import get_boolean_env_variable -from datahub.configuration.common import ConfigurationError, OperationalError +from datahub.configuration.common import ( + ConfigModel, + ConfigurationError, + OperationalError, +) from datahub.emitter.generic_emitter import Emitter from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.emitter.request_helper import make_curl_command @@ -31,10 +47,8 @@ logger = logging.getLogger(__name__) -_DEFAULT_CONNECT_TIMEOUT_SEC = 30 # 30 seconds should be plenty to connect -_DEFAULT_READ_TIMEOUT_SEC = ( - 30 # Any ingest call taking longer than 30 seconds should be abandoned -) +_DEFAULT_TIMEOUT_SEC = 30 # 30 seconds should be plenty to connect +_TIMEOUT_LOWER_BOUND_SEC = 1 # if below this, we log a warning _DEFAULT_RETRY_STATUS_CODES = [ # Additional status codes to retry on 429, 500, @@ -63,15 +77,76 @@ ) +class RequestsSessionConfig(ConfigModel): + timeout: Union[float, Tuple[float, float], None] = _DEFAULT_TIMEOUT_SEC + + retry_status_codes: List[int] = _DEFAULT_RETRY_STATUS_CODES + retry_methods: List[str] = _DEFAULT_RETRY_METHODS + retry_max_times: int = _DEFAULT_RETRY_MAX_TIMES + + extra_headers: Dict[str, str] = {} + + ca_certificate_path: Optional[str] = None + client_certificate_path: Optional[str] = None + disable_ssl_verification: bool = False + + def build_session(self) -> requests.Session: + session = requests.Session() + + if self.extra_headers: + session.headers.update(self.extra_headers) + + if self.client_certificate_path: + session.cert = self.client_certificate_path + + if self.ca_certificate_path: + session.verify = self.ca_certificate_path + + if self.disable_ssl_verification: + session.verify = False + + try: + # Set raise_on_status to False to propagate errors: + # https://stackoverflow.com/questions/70189330/determine-status-code-from-python-retry-exception + # Must call `raise_for_status` after making a request, which we do + retry_strategy = Retry( + total=self.retry_max_times, + status_forcelist=self.retry_status_codes, + backoff_factor=2, + allowed_methods=self.retry_methods, + raise_on_status=False, + ) + except TypeError: + # Prior to urllib3 1.26, the Retry class used `method_whitelist` instead of `allowed_methods`. 
+ retry_strategy = Retry( + total=self.retry_max_times, + status_forcelist=self.retry_status_codes, + backoff_factor=2, + method_whitelist=self.retry_methods, + raise_on_status=False, + ) + + adapter = HTTPAdapter( + pool_connections=100, pool_maxsize=100, max_retries=retry_strategy + ) + session.mount("http://", adapter) + session.mount("https://", adapter) + + if self.timeout is not None: + # Shim session.request to apply default timeout values. + # Via https://stackoverflow.com/a/59317604. + session.request = functools.partial( # type: ignore + session.request, + timeout=self.timeout, + ) + + return session + + class DataHubRestEmitter(Closeable, Emitter): _gms_server: str _token: Optional[str] _session: requests.Session - _connect_timeout_sec: float = _DEFAULT_CONNECT_TIMEOUT_SEC - _read_timeout_sec: float = _DEFAULT_READ_TIMEOUT_SEC - _retry_status_codes: List[int] = _DEFAULT_RETRY_STATUS_CODES - _retry_methods: List[str] = _DEFAULT_RETRY_METHODS - _retry_max_times: int = _DEFAULT_RETRY_MAX_TIMES def __init__( self, @@ -102,15 +177,13 @@ def __init__( self._session = requests.Session() - self._session.headers.update( - { - "X-RestLi-Protocol-Version": "2.0.0", - "X-DataHub-Py-Cli-Version": nice_version_name(), - "Content-Type": "application/json", - } - ) + headers = { + "X-RestLi-Protocol-Version": "2.0.0", + "X-DataHub-Py-Cli-Version": nice_version_name(), + "Content-Type": "application/json", + } if token: - self._session.headers.update({"Authorization": f"Bearer {token}"}) + headers["Authorization"] = f"Bearer {token}" else: # HACK: When no token is provided but system auth env variables are set, we use them. # Ideally this should simply get passed in as config, instead of being sneakily injected @@ -119,75 +192,43 @@ def __init__( # rest emitter, and the rest sink uses the rest emitter under the hood. system_auth = config_utils.get_system_auth() if system_auth is not None: - self._session.headers.update({"Authorization": system_auth}) - - if extra_headers: - self._session.headers.update(extra_headers) - - if client_certificate_path: - self._session.cert = client_certificate_path - - if ca_certificate_path: - self._session.verify = ca_certificate_path - - if disable_ssl_verification: - self._session.verify = False - - self._connect_timeout_sec = ( - connect_timeout_sec or timeout_sec or _DEFAULT_CONNECT_TIMEOUT_SEC - ) - self._read_timeout_sec = ( - read_timeout_sec or timeout_sec or _DEFAULT_READ_TIMEOUT_SEC - ) - - if self._connect_timeout_sec < 1 or self._read_timeout_sec < 1: - logger.warning( - f"Setting timeout values lower than 1 second is not recommended. Your configuration is connect_timeout:{self._connect_timeout_sec}s, read_timeout:{self._read_timeout_sec}s" - ) - - if retry_status_codes is not None: # Only if missing. 
Empty list is allowed - self._retry_status_codes = retry_status_codes - - if retry_methods is not None: - self._retry_methods = retry_methods - - if retry_max_times: - self._retry_max_times = retry_max_times + headers["Authorization"] = system_auth - try: - # Set raise_on_status to False to propagate errors: - # https://stackoverflow.com/questions/70189330/determine-status-code-from-python-retry-exception - # Must call `raise_for_status` after making a request, which we do - retry_strategy = Retry( - total=self._retry_max_times, - status_forcelist=self._retry_status_codes, - backoff_factor=2, - allowed_methods=self._retry_methods, - raise_on_status=False, - ) - except TypeError: - # Prior to urllib3 1.26, the Retry class used `method_whitelist` instead of `allowed_methods`. - retry_strategy = Retry( - total=self._retry_max_times, - status_forcelist=self._retry_status_codes, - backoff_factor=2, - method_whitelist=self._retry_methods, - raise_on_status=False, + timeout: float | tuple[float, float] + if connect_timeout_sec is not None or read_timeout_sec is not None: + timeout = ( + connect_timeout_sec or timeout_sec or _DEFAULT_TIMEOUT_SEC, + read_timeout_sec or timeout_sec or _DEFAULT_TIMEOUT_SEC, ) + if ( + timeout[0] < _TIMEOUT_LOWER_BOUND_SEC + or timeout[1] < _TIMEOUT_LOWER_BOUND_SEC + ): + logger.warning( + f"Setting timeout values lower than {_TIMEOUT_LOWER_BOUND_SEC} second is not recommended. Your configuration is (connect_timeout, read_timeout) = {timeout} seconds" + ) + else: + timeout = get_or_else(timeout_sec, _DEFAULT_TIMEOUT_SEC) + if timeout < _TIMEOUT_LOWER_BOUND_SEC: + logger.warning( + f"Setting timeout values lower than {_TIMEOUT_LOWER_BOUND_SEC} second is not recommended. Your configuration is timeout = {timeout} seconds" + ) - adapter = HTTPAdapter( - pool_connections=100, pool_maxsize=100, max_retries=retry_strategy - ) - self._session.mount("http://", adapter) - self._session.mount("https://", adapter) - - # Shim session.request to apply default timeout values. - # Via https://stackoverflow.com/a/59317604. - self._session.request = functools.partial( # type: ignore - self._session.request, - timeout=(self._connect_timeout_sec, self._read_timeout_sec), + self._session_config = RequestsSessionConfig( + timeout=timeout, + retry_status_codes=get_or_else( + retry_status_codes, _DEFAULT_RETRY_STATUS_CODES + ), + retry_methods=get_or_else(retry_methods, _DEFAULT_RETRY_METHODS), + retry_max_times=get_or_else(retry_max_times, _DEFAULT_RETRY_MAX_TIMES), + extra_headers={**headers, **(extra_headers or {})}, + ca_certificate_path=ca_certificate_path, + client_certificate_path=client_certificate_path, + disable_ssl_verification=disable_ssl_verification, ) + self._session = self._session_config.build_session() + def test_connection(self) -> None: url = f"{self._gms_server}/config" response = self._session.get(url) diff --git a/metadata-ingestion/src/datahub/ingestion/graph/client.py b/metadata-ingestion/src/datahub/ingestion/graph/client.py index ca9a41172e5b6e..7de6e8130a7ab6 100644 --- a/metadata-ingestion/src/datahub/ingestion/graph/client.py +++ b/metadata-ingestion/src/datahub/ingestion/graph/client.py @@ -179,21 +179,24 @@ def frontend_base_url(self) -> str: @classmethod def from_emitter(cls, emitter: DatahubRestEmitter) -> "DataHubGraph": + session_config = emitter._session_config + if isinstance(session_config.timeout, tuple): + # TODO: This is slightly lossy. 
Eventually, we want to modify the emitter + # to accept a tuple for timeout_sec, and then we'll be able to remove this. + timeout_sec: Optional[float] = session_config.timeout[0] + else: + timeout_sec = session_config.timeout return cls( DatahubClientConfig( server=emitter._gms_server, token=emitter._token, - timeout_sec=emitter._read_timeout_sec, - retry_status_codes=emitter._retry_status_codes, - retry_max_times=emitter._retry_max_times, - extra_headers=emitter._session.headers, - disable_ssl_verification=emitter._session.verify is False, - ca_certificate_path=( - emitter._session.verify - if isinstance(emitter._session.verify, str) - else None - ), - client_certificate_path=emitter._session.cert, + timeout_sec=timeout_sec, + retry_status_codes=session_config.retry_status_codes, + retry_max_times=session_config.retry_max_times, + extra_headers=session_config.extra_headers, + disable_ssl_verification=session_config.disable_ssl_verification, + ca_certificate_path=session_config.ca_certificate_path, + client_certificate_path=session_config.client_certificate_path, ) ) diff --git a/metadata-ingestion/src/datahub/ingestion/graph/config.py b/metadata-ingestion/src/datahub/ingestion/graph/config.py index 5f269e14e1a4af..8f0a5844c97c4b 100644 --- a/metadata-ingestion/src/datahub/ingestion/graph/config.py +++ b/metadata-ingestion/src/datahub/ingestion/graph/config.py @@ -10,7 +10,7 @@ class DatahubClientConfig(ConfigModel): # by callers / the CLI, but the actual client should not have any magic. server: str token: Optional[str] = None - timeout_sec: Optional[int] = None + timeout_sec: Optional[float] = None retry_status_codes: Optional[List[int]] = None retry_max_times: Optional[int] = None extra_headers: Optional[Dict[str, str]] = None diff --git a/metadata-ingestion/tests/unit/sdk/test_rest_emitter.py b/metadata-ingestion/tests/unit/sdk/test_rest_emitter.py index b4d7cb17b66f5c..81120dfc87aba3 100644 --- a/metadata-ingestion/tests/unit/sdk/test_rest_emitter.py +++ b/metadata-ingestion/tests/unit/sdk/test_rest_emitter.py @@ -4,39 +4,41 @@ MOCK_GMS_ENDPOINT = "http://fakegmshost:8080" -def test_datahub_rest_emitter_construction(): +def test_datahub_rest_emitter_construction() -> None: emitter = DatahubRestEmitter(MOCK_GMS_ENDPOINT) - assert emitter._connect_timeout_sec == rest_emitter._DEFAULT_CONNECT_TIMEOUT_SEC - assert emitter._read_timeout_sec == rest_emitter._DEFAULT_READ_TIMEOUT_SEC - assert emitter._retry_status_codes == rest_emitter._DEFAULT_RETRY_STATUS_CODES - assert emitter._retry_max_times == rest_emitter._DEFAULT_RETRY_MAX_TIMES + assert emitter._session_config.timeout == rest_emitter._DEFAULT_TIMEOUT_SEC + assert ( + emitter._session_config.retry_status_codes + == rest_emitter._DEFAULT_RETRY_STATUS_CODES + ) + assert ( + emitter._session_config.retry_max_times == rest_emitter._DEFAULT_RETRY_MAX_TIMES + ) -def test_datahub_rest_emitter_timeout_construction(): +def test_datahub_rest_emitter_timeout_construction() -> None: emitter = DatahubRestEmitter( MOCK_GMS_ENDPOINT, connect_timeout_sec=2, read_timeout_sec=4 ) - assert emitter._connect_timeout_sec == 2 - assert emitter._read_timeout_sec == 4 + assert emitter._session_config.timeout == (2, 4) -def test_datahub_rest_emitter_general_timeout_construction(): +def test_datahub_rest_emitter_general_timeout_construction() -> None: emitter = DatahubRestEmitter(MOCK_GMS_ENDPOINT, timeout_sec=2, read_timeout_sec=4) - assert emitter._connect_timeout_sec == 2 - assert emitter._read_timeout_sec == 4 + assert emitter._session_config.timeout == 
(2, 4) -def test_datahub_rest_emitter_retry_construction(): +def test_datahub_rest_emitter_retry_construction() -> None: emitter = DatahubRestEmitter( MOCK_GMS_ENDPOINT, retry_status_codes=[418], retry_max_times=42, ) - assert emitter._retry_status_codes == [418] - assert emitter._retry_max_times == 42 + assert emitter._session_config.retry_status_codes == [418] + assert emitter._session_config.retry_max_times == 42 -def test_datahub_rest_emitter_extra_params(): +def test_datahub_rest_emitter_extra_params() -> None: emitter = DatahubRestEmitter( MOCK_GMS_ENDPOINT, extra_headers={"key1": "value1", "key2": "value2"} ) From cbb36bbe590812b525e6f92608279c624123333c Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Tue, 7 Jan 2025 19:23:58 -0500 Subject: [PATCH 05/14] feat(ingest): configurable query generation in combined sources (#12284) --- .../src/datahub/ingestion/source/bigquery_v2/bigquery.py | 2 ++ .../ingestion/source/bigquery_v2/bigquery_config.py | 8 ++++++++ .../ingestion/source/snowflake/snowflake_config.py | 8 ++++++++ .../datahub/ingestion/source/snowflake/snowflake_v2.py | 2 ++ 4 files changed, 20 insertions(+) diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py index 38eab3606b7e95..db7b0540e49e71 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py @@ -281,6 +281,8 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: include_lineage=self.config.include_table_lineage, include_usage_statistics=self.config.include_usage_statistics, include_operations=self.config.usage.include_operational_stats, + include_queries=self.config.include_queries, + include_query_usage_statistics=self.config.include_query_usage_statistics, top_n_queries=self.config.usage.top_n_queries, region_qualifiers=self.config.region_qualifiers, ), diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py index ef323260b014e6..afbe919df4dcae 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py @@ -447,6 +447,14 @@ class BigQueryV2Config( default=False, description="If enabled, uses the new queries extractor to extract queries from bigquery.", ) + include_queries: bool = Field( + default=True, + description="If enabled, generate query entities associated with lineage edges. Only applicable if `use_queries_v2` is enabled.", + ) + include_query_usage_statistics: bool = Field( + default=True, + description="If enabled, generate query popularity statistics. 
Only applicable if `use_queries_v2` is enabled.", + ) @property def have_table_data_read_permission(self) -> bool: diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py index 12e5fb72b00de8..2d61ce59857778 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py @@ -221,6 +221,14 @@ class SnowflakeV2Config( default=False, description="If enabled, uses the new queries extractor to extract queries from snowflake.", ) + include_queries: bool = Field( + default=True, + description="If enabled, generate query entities associated with lineage edges. Only applicable if `use_queries_v2` is enabled.", + ) + include_query_usage_statistics: bool = Field( + default=True, + description="If enabled, generate query popularity statistics. Only applicable if `use_queries_v2` is enabled.", + ) lazy_schema_resolver: bool = Field( default=True, diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py index 954e8a29c1a1bd..aede3d056709a2 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py @@ -528,6 +528,8 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: include_lineage=self.config.include_table_lineage, include_usage_statistics=self.config.include_usage_stats, include_operations=self.config.include_operational_stats, + include_queries=self.config.include_queries, + include_query_usage_statistics=self.config.include_query_usage_statistics, user_email_pattern=self.config.user_email_pattern, ), structured_report=self.report, From 98a5a2c086df1667a1b669410efaeafbeb5e3d8b Mon Sep 17 00:00:00 2001 From: david-leifker <114954101+david-leifker@users.noreply.github.com> Date: Wed, 8 Jan 2025 06:34:10 -0600 Subject: [PATCH 06/14] fix(javaEntityClient): correct config parameter (#12287) --- .../java/com/linkedin/metadata/client/JavaEntityClient.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/metadata-io/src/main/java/com/linkedin/metadata/client/JavaEntityClient.java b/metadata-io/src/main/java/com/linkedin/metadata/client/JavaEntityClient.java index 3d35f5956b0f4f..35d133c74c0692 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/client/JavaEntityClient.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/client/JavaEntityClient.java @@ -775,7 +775,8 @@ public List batchIngestProposals( List updatedUrns = new ArrayList<>(); Iterators.partition( - metadataChangeProposals.iterator(), Math.max(1, entityClientConfig.getBatchGetV2Size())) + metadataChangeProposals.iterator(), + Math.max(1, entityClientConfig.getBatchIngestSize())) .forEachRemaining( batch -> { AspectsBatch aspectsBatch = From c0b13f087aaff9898ab8377259fb0b691b128ca0 Mon Sep 17 00:00:00 2001 From: Aseem Bansal Date: Wed, 8 Jan 2025 18:40:19 +0530 Subject: [PATCH 07/14] ci: upload test coverage to codecov (#12291) --- .github/workflows/airflow-plugin.yml | 5 +++++ .github/workflows/build-and-test.yml | 5 +++++ .github/workflows/dagster-plugin.yml | 5 +++++ .github/workflows/gx-plugin.yml | 5 +++++ .github/workflows/metadata-ingestion.yml | 5 +++++ .github/workflows/metadata-io.yml | 5 +++++ .github/workflows/prefect-plugin.yml | 5 +++++ 7 files changed, 
35 insertions(+) diff --git a/.github/workflows/airflow-plugin.yml b/.github/workflows/airflow-plugin.yml index b824a21be63f8f..89e0c9e2513d8b 100644 --- a/.github/workflows/airflow-plugin.yml +++ b/.github/workflows/airflow-plugin.yml @@ -87,6 +87,11 @@ jobs: flags: airflow-${{ matrix.python-version }}-${{ matrix.extra_pip_extras }} name: pytest-airflow verbose: true + - name: Upload test results to Codecov + if: ${{ !cancelled() }} + uses: codecov/test-results-action@v1 + with: + token: ${{ secrets.CODECOV_TOKEN }} event-file: runs-on: ubuntu-latest diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml index 0cca80c8fdf982..058ac4a5c9b1e5 100644 --- a/.github/workflows/build-and-test.yml +++ b/.github/workflows/build-and-test.yml @@ -134,6 +134,11 @@ jobs: flags: ${{ matrix.timezone }} name: ${{ matrix.command }} verbose: true + - name: Upload test results to Codecov + if: ${{ !cancelled() }} + uses: codecov/test-results-action@v1 + with: + token: ${{ secrets.CODECOV_TOKEN }} quickstart-compose-validation: runs-on: ubuntu-latest diff --git a/.github/workflows/dagster-plugin.yml b/.github/workflows/dagster-plugin.yml index ae9a0b1605cdf3..c29e72367c53c5 100644 --- a/.github/workflows/dagster-plugin.yml +++ b/.github/workflows/dagster-plugin.yml @@ -74,6 +74,11 @@ jobs: flags: dagster-${{ matrix.python-version }}-${{ matrix.extraPythonRequirement }} name: pytest-dagster verbose: true + - name: Upload test results to Codecov + if: ${{ !cancelled() }} + uses: codecov/test-results-action@v1 + with: + token: ${{ secrets.CODECOV_TOKEN }} event-file: runs-on: ubuntu-latest diff --git a/.github/workflows/gx-plugin.yml b/.github/workflows/gx-plugin.yml index 2fd814a0764858..825f8beda2f561 100644 --- a/.github/workflows/gx-plugin.yml +++ b/.github/workflows/gx-plugin.yml @@ -78,6 +78,11 @@ jobs: flags: gx-${{ matrix.python-version }}-${{ matrix.extraPythonRequirement }} name: pytest-gx verbose: true + - name: Upload test results to Codecov + if: ${{ !cancelled() }} + uses: codecov/test-results-action@v1 + with: + token: ${{ secrets.CODECOV_TOKEN }} event-file: runs-on: ubuntu-latest diff --git a/.github/workflows/metadata-ingestion.yml b/.github/workflows/metadata-ingestion.yml index f4d87b361b5edc..aa404c4c35c505 100644 --- a/.github/workflows/metadata-ingestion.yml +++ b/.github/workflows/metadata-ingestion.yml @@ -98,6 +98,11 @@ jobs: flags: ingestion-${{ matrix.python-version }}-${{ matrix.command }} name: pytest-ingestion verbose: true + - name: Upload test results to Codecov + if: ${{ !cancelled() }} + uses: codecov/test-results-action@v1 + with: + token: ${{ secrets.CODECOV_TOKEN }} event-file: runs-on: ubuntu-latest diff --git a/.github/workflows/metadata-io.yml b/.github/workflows/metadata-io.yml index aedcd9257d83ba..bcadc641ee2f7c 100644 --- a/.github/workflows/metadata-io.yml +++ b/.github/workflows/metadata-io.yml @@ -90,6 +90,11 @@ jobs: fail_ci_if_error: false name: metadata-io-test verbose: true + - name: Upload test results to Codecov + if: ${{ !cancelled() }} + uses: codecov/test-results-action@v1 + with: + token: ${{ secrets.CODECOV_TOKEN }} event-file: runs-on: ubuntu-latest diff --git a/.github/workflows/prefect-plugin.yml b/.github/workflows/prefect-plugin.yml index 879df032409f28..0bce4d5ef19f31 100644 --- a/.github/workflows/prefect-plugin.yml +++ b/.github/workflows/prefect-plugin.yml @@ -70,6 +70,11 @@ jobs: flags: prefect-${{ matrix.python-version }} name: pytest-prefect verbose: true + - name: Upload test results to Codecov + if: ${{ 
!cancelled() }} + uses: codecov/test-results-action@v1 + with: + token: ${{ secrets.CODECOV_TOKEN }} event-file: runs-on: ubuntu-latest From 333445326a627a28353f1def04955f5812dc17bb Mon Sep 17 00:00:00 2001 From: Aseem Bansal Date: Wed, 8 Jan 2025 18:50:07 +0530 Subject: [PATCH 08/14] log(elastic/index builder): add est time remaining (#12280) --- .../indexbuilder/ESIndexBuilder.java | 25 ++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/ESIndexBuilder.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/ESIndexBuilder.java index 6de79b6c4b181e..792e67e69f2da6 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/ESIndexBuilder.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/ESIndexBuilder.java @@ -411,6 +411,8 @@ private void reindex(ReindexConfig indexState) throws Throwable { boolean reindexTaskCompleted = false; Pair documentCounts = getDocumentCounts(indexState.name(), tempIndexName); long documentCountsLastUpdated = System.currentTimeMillis(); + long previousDocCount = documentCounts.getSecond(); + long estimatedMinutesRemaining = 0; while (System.currentTimeMillis() < timeoutAt) { log.info( @@ -421,8 +423,22 @@ private void reindex(ReindexConfig indexState) throws Throwable { Pair tempDocumentsCount = getDocumentCounts(indexState.name(), tempIndexName); if (!tempDocumentsCount.equals(documentCounts)) { - documentCountsLastUpdated = System.currentTimeMillis(); + long currentTime = System.currentTimeMillis(); + long timeElapsed = currentTime - documentCountsLastUpdated; + long docsIndexed = tempDocumentsCount.getSecond() - previousDocCount; + + // Calculate indexing rate (docs per millisecond) + double indexingRate = timeElapsed > 0 ? (double) docsIndexed / timeElapsed : 0; + + // Calculate remaining docs and estimated time + long remainingDocs = tempDocumentsCount.getFirst() - tempDocumentsCount.getSecond(); + long estimatedMillisRemaining = + indexingRate > 0 ? (long) (remainingDocs / indexingRate) : 0; + estimatedMinutesRemaining = estimatedMillisRemaining / (1000 * 60); + + documentCountsLastUpdated = currentTime; documentCounts = tempDocumentsCount; + previousDocCount = documentCounts.getSecond(); } if (documentCounts.getFirst().equals(documentCounts.getSecond())) { @@ -435,12 +451,15 @@ private void reindex(ReindexConfig indexState) throws Throwable { break; } else { + float progressPercentage = + 100 * (1.0f * documentCounts.getSecond()) / documentCounts.getFirst(); log.warn( - "Task: {} - Document counts do not match {} != {}. Complete: {}%", + "Task: {} - Document counts do not match {} != {}. Complete: {}%. 
Estimated time remaining: {} minutes", parentTaskId, documentCounts.getFirst(), documentCounts.getSecond(), - 100 * (1.0f * documentCounts.getSecond()) / documentCounts.getFirst()); + progressPercentage, + estimatedMinutesRemaining); long lastUpdateDelta = System.currentTimeMillis() - documentCountsLastUpdated; if (lastUpdateDelta > (300 * 1000)) { From 99c30f2b3c80ed55a7c39448ffc8fad3bfc010f3 Mon Sep 17 00:00:00 2001 From: Aseem Bansal Date: Wed, 8 Jan 2025 19:04:19 +0530 Subject: [PATCH 09/14] fix(ingest/glue): don't fail on profile (#12288) --- .../src/datahub/ingestion/source/aws/glue.py | 87 +++++++++++-------- 1 file changed, 52 insertions(+), 35 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py b/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py index 7a5ed154d40bc7..a0bed4ae9a7581 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py +++ b/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py @@ -1054,49 +1054,66 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: yield from self.gen_database_containers(database) for table in tables: - database_name = table["DatabaseName"] table_name = table["Name"] - full_table_name = f"{database_name}.{table_name}" - self.report.report_table_scanned() - if not self.source_config.database_pattern.allowed( - database_name - ) or not self.source_config.table_pattern.allowed(full_table_name): - self.report.report_table_dropped(full_table_name) - continue + try: + yield from self._gen_table_wu(table=table) + except KeyError as e: + self.report.report_failure( + message="Failed to extract workunit for table", + context=f"Table: {table_name}", + exc=e, + ) + if self.extract_transforms: + yield from self._transform_extraction() - dataset_urn = make_dataset_urn_with_platform_instance( - platform=self.platform, - name=full_table_name, - env=self.env, - platform_instance=self.source_config.platform_instance, - ) + def _gen_table_wu(self, table: Dict) -> Iterable[MetadataWorkUnit]: + database_name = table["DatabaseName"] + table_name = table["Name"] + full_table_name = f"{database_name}.{table_name}" + self.report.report_table_scanned() + if not self.source_config.database_pattern.allowed( + database_name + ) or not self.source_config.table_pattern.allowed(full_table_name): + self.report.report_table_dropped(full_table_name) + return + + dataset_urn = make_dataset_urn_with_platform_instance( + platform=self.platform, + name=full_table_name, + env=self.env, + platform_instance=self.source_config.platform_instance, + ) - mce = self._extract_record(dataset_urn, table, full_table_name) - yield MetadataWorkUnit(full_table_name, mce=mce) + mce = self._extract_record(dataset_urn, table, full_table_name) + yield MetadataWorkUnit(full_table_name, mce=mce) - # We also want to assign "table" subType to the dataset representing glue table - unfortunately it is not - # possible via Dataset snapshot embedded in a mce, so we have to generate a mcp. - yield MetadataChangeProposalWrapper( - entityUrn=dataset_urn, - aspect=SubTypes(typeNames=[DatasetSubTypes.TABLE]), - ).as_workunit() + # We also want to assign "table" subType to the dataset representing glue table - unfortunately it is not + # possible via Dataset snapshot embedded in a mce, so we have to generate a mcp. 
+ yield MetadataChangeProposalWrapper( + entityUrn=dataset_urn, + aspect=SubTypes(typeNames=[DatasetSubTypes.TABLE]), + ).as_workunit() - yield from self._get_domain_wu( - dataset_name=full_table_name, - entity_urn=dataset_urn, - ) - yield from self.add_table_to_database_container( - dataset_urn=dataset_urn, db_name=database_name - ) + yield from self._get_domain_wu( + dataset_name=full_table_name, + entity_urn=dataset_urn, + ) + yield from self.add_table_to_database_container( + dataset_urn=dataset_urn, db_name=database_name + ) - wu = self.get_lineage_if_enabled(mce) - if wu: - yield wu + wu = self.get_lineage_if_enabled(mce) + if wu: + yield wu + try: yield from self.get_profile_if_enabled(mce, database_name, table_name) - - if self.extract_transforms: - yield from self._transform_extraction() + except KeyError as e: + self.report.report_failure( + message="Failed to extract profile for table", + context=f"Table: {dataset_urn}", + exc=e, + ) def _transform_extraction(self) -> Iterable[MetadataWorkUnit]: dags: Dict[str, Optional[Dict[str, Any]]] = {} From 0fe4163332eec5cf527ee0a0110507eb8934c4c9 Mon Sep 17 00:00:00 2001 From: Aseem Bansal Date: Wed, 8 Jan 2025 19:13:31 +0530 Subject: [PATCH 10/14] fix(ingest/gc): also query data process instance (#12292) --- .../source/gc/soft_deleted_entity_cleanup.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py b/metadata-ingestion/src/datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py index 32243106bb53f6..0a52b7e17bf714 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +++ b/metadata-ingestion/src/datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py @@ -19,8 +19,8 @@ logger = logging.getLogger(__name__) -QUERY_QUERY_ENTITY = """ -query listQueries($input: ScrollAcrossEntitiesInput!) { +QUERY_ENTITIES = """ +query listEntities($input: ScrollAcrossEntitiesInput!) { scrollAcrossEntities(input: $input) { nextScrollId count @@ -29,6 +29,9 @@ ... on QueryEntity { urn } + ... 
on DataProcessInstance { + urn + } } } } @@ -225,16 +228,16 @@ def _process_futures(self, futures: Dict[Future, str]) -> Dict[Future, str]: time.sleep(self.config.delay) return futures - def _get_soft_deleted_queries(self) -> Iterable[str]: + def _get_soft_deleted(self, graphql_query: str, entity_type: str) -> Iterable[str]: assert self.ctx.graph scroll_id: Optional[str] = None while True: try: result = self.ctx.graph.execute_graphql( - QUERY_QUERY_ENTITY, + graphql_query, { "input": { - "types": ["QUERY"], + "types": [entity_type], "query": "*", "scrollId": scroll_id if scroll_id else None, "count": self.config.batch_size, @@ -254,7 +257,7 @@ def _get_soft_deleted_queries(self) -> Iterable[str]: ) except Exception as e: self.report.failure( - f"While trying to get queries with {scroll_id}", exc=e + f"While trying to get {entity_type} with {scroll_id}", exc=e ) break scroll_across_entities = result.get("scrollAcrossEntities") @@ -275,7 +278,8 @@ def _get_urns(self) -> Iterable[str]: status=RemovedStatusFilter.ONLY_SOFT_DELETED, batch_size=self.config.batch_size, ) - yield from self._get_soft_deleted_queries() + yield from self._get_soft_deleted(QUERY_ENTITIES, "QUERY") + yield from self._get_soft_deleted(QUERY_ENTITIES, "DATA_PROCESS_INSTANCE") def _times_up(self) -> bool: if ( From a4c47fa343cec4e6bc7addc11c553bace0a852a9 Mon Sep 17 00:00:00 2001 From: Aseem Bansal Date: Wed, 8 Jan 2025 19:46:57 +0530 Subject: [PATCH 11/14] fix(cli): correct url ending with acryl.io:8080 (#12289) --- metadata-ingestion/src/datahub/cli/cli_utils.py | 2 ++ metadata-ingestion/tests/unit/cli/test_cli_utils.py | 4 ++++ 2 files changed, 6 insertions(+) diff --git a/metadata-ingestion/src/datahub/cli/cli_utils.py b/metadata-ingestion/src/datahub/cli/cli_utils.py index ca4a11b41925e5..f6b5ba6176c59d 100644 --- a/metadata-ingestion/src/datahub/cli/cli_utils.py +++ b/metadata-ingestion/src/datahub/cli/cli_utils.py @@ -330,6 +330,8 @@ def get_frontend_session_login_as( def _ensure_valid_gms_url_acryl_cloud(url: str) -> str: if "acryl.io" not in url: return url + if url.endswith(":8080"): + url = url.replace(":8080", "") if url.startswith("http://"): url = url.replace("http://", "https://") if url.endswith("acryl.io"): diff --git a/metadata-ingestion/tests/unit/cli/test_cli_utils.py b/metadata-ingestion/tests/unit/cli/test_cli_utils.py index c9693c75d96fe9..c430f585200e5a 100644 --- a/metadata-ingestion/tests/unit/cli/test_cli_utils.py +++ b/metadata-ingestion/tests/unit/cli/test_cli_utils.py @@ -70,6 +70,10 @@ def test_fixup_gms_url(): cli_utils.fixup_gms_url("http://abc.acryl.io/api/gms") == "https://abc.acryl.io/gms" ) + assert ( + cli_utils.fixup_gms_url("http://abcd.acryl.io:8080") + == "https://abcd.acryl.io/gms" + ) def test_guess_frontend_url_from_gms_url(): From 58b6a5bee54f23a6ea1801d9f1f92d3f3600b763 Mon Sep 17 00:00:00 2001 From: Aseem Bansal Date: Wed, 8 Jan 2025 21:57:55 +0530 Subject: [PATCH 12/14] dev: add pre-commit hooks installed by default (#12293) --- .github/scripts/generate_pre_commit.py | 265 +++++++++++++++ .github/scripts/pre-commit-override.yaml | 8 + .pre-commit-config.yaml | 402 +++++++++++++++++++++-- build.gradle | 4 - settings.gradle | 49 +++ 5 files changed, 700 insertions(+), 28 deletions(-) create mode 100755 .github/scripts/generate_pre_commit.py create mode 100644 .github/scripts/pre-commit-override.yaml diff --git a/.github/scripts/generate_pre_commit.py b/.github/scripts/generate_pre_commit.py new file mode 100755 index 00000000000000..740d3c20d263b0 --- /dev/null +++ 
b/.github/scripts/generate_pre_commit.py @@ -0,0 +1,265 @@ +"""Generate pre-commit hooks for Java and Python projects. + +This script scans a repository for Java and Python projects and generates appropriate +pre-commit hooks for linting and formatting. It also merges in additional hooks from +an override file. +""" + +import os +from dataclasses import dataclass +from enum import Enum, auto +from pathlib import Path + +import yaml + + +class ProjectType(Enum): + """Types of projects supported for hook generation.""" + + JAVA = auto() + PYTHON = auto() + + +@dataclass +class Project: + """Represents a project found in the repository.""" + + path: str + type: ProjectType + + @property + def gradle_path(self) -> str: + """Convert path to Gradle task format.""" + return ":" + self.path.replace("/", ":") + + @property + def project_id(self) -> str: + """Generate a unique identifier for the project.""" + return self.path.replace("/", "-").replace(".", "-") + + +class ProjectFinder: + """Find Java and Python projects in a repository.""" + + JAVA_PATTERNS = [ + "plugins.hasPlugin('java')", + "apply plugin: 'java'", + "id 'java'", + "id 'java-library'", + "plugins.hasPlugin('java-library')", + "apply plugin: 'java-library'", + "plugins.hasPlugin('pegasus')", + "org.springframework.boot", + ] + + EXCLUDED_DIRS = {".git", "build", "node_modules", ".tox", "venv"} + SOURCE_EXTENSIONS = {".java", ".kt", ".groovy"} + + def __init__(self, root_dir: str): + self.root_path = Path(root_dir) + + def find_all_projects(self) -> list[Project]: + """Find all Java and Python projects in the repository.""" + java_projects = self._find_java_projects() + python_projects = self._find_python_projects() + + all_projects = [] + all_projects.extend( + Project(path=p, type=ProjectType.JAVA) for p in java_projects + ) + all_projects.extend( + Project(path=p, type=ProjectType.PYTHON) for p in python_projects + ) + + return sorted(all_projects, key=lambda p: p.path) + + def _find_java_projects(self) -> set[str]: + """Find all Java projects by checking build.gradle files.""" + java_projects = set() + + # Search both build.gradle and build.gradle.kts + for pattern in ["build.gradle", "build.gradle.kts"]: + for gradle_file in self.root_path.rglob(pattern): + if self._should_skip_directory(gradle_file.parent): + continue + + if self._is_java_project(gradle_file): + java_projects.add(self._get_relative_path(gradle_file.parent)) + + return { + p + for p in java_projects + if "buildSrc" not in p and "spark-smoke-test" not in p and p != "." 
+ } + + def _find_python_projects(self) -> set[str]: + """Find all Python projects by checking for setup.py or pyproject.toml.""" + python_projects = set() + + for file_name in ["setup.py", "pyproject.toml"]: + for path in self.root_path.rglob(file_name): + if self._should_skip_directory(path.parent): + continue + + rel_path = self._get_relative_path(path.parent) + if "examples" not in rel_path: + python_projects.add(rel_path) + + return python_projects + + def _should_skip_directory(self, path: Path) -> bool: + """Check if directory should be skipped.""" + return any( + part in self.EXCLUDED_DIRS or part.startswith(".") for part in path.parts + ) + + def _is_java_project(self, gradle_file: Path) -> bool: + """Check if a Gradle file represents a Java project.""" + try: + content = gradle_file.read_text() + has_java_plugin = any(pattern in content for pattern in self.JAVA_PATTERNS) + + if has_java_plugin: + # Verify presence of source files + return any( + list(gradle_file.parent.rglob(f"*{ext}")) + for ext in self.SOURCE_EXTENSIONS + ) + return False + + except Exception as e: + print(f"Warning: Error reading {gradle_file}: {e}") + return False + + def _get_relative_path(self, path: Path) -> str: + """Get relative path from root, normalized with forward slashes.""" + return str(path.relative_to(self.root_path)).replace("\\", "/") + + +class HookGenerator: + """Generate pre-commit hooks for projects.""" + + def __init__(self, projects: list[Project], override_file: str = None): + self.projects = projects + self.override_file = override_file + + def generate_config(self) -> dict: + """Generate the complete pre-commit config.""" + hooks = [] + + for project in self.projects: + if project.type == ProjectType.PYTHON: + hooks.append(self._generate_lint_fix_hook(project)) + else: # ProjectType.JAVA + hooks.append(self._generate_spotless_hook(project)) + + config = {"repos": [{"repo": "local", "hooks": hooks}]} + + # Merge override hooks if they exist + if self.override_file and os.path.exists(self.override_file): + try: + with open(self.override_file, 'r') as f: + override_config = yaml.safe_load(f) + + if override_config and 'repos' in override_config: + for override_repo in override_config['repos']: + matching_repo = next( + (repo for repo in config['repos'] + if repo['repo'] == override_repo['repo']), + None + ) + + if matching_repo: + matching_repo['hooks'].extend(override_repo.get('hooks', [])) + else: + config['repos'].append(override_repo) + + print(f"Merged additional hooks from {self.override_file}") + except Exception as e: + print(f"Warning: Error reading override file {self.override_file}: {e}") + + return config + + def _generate_lint_fix_hook(self, project: Project) -> dict: + """Generate a lint-fix hook for Python projects.""" + return { + "id": f"{project.project_id}-lint-fix", + "name": f"{project.path} Lint Fix", + "entry": f"./gradlew {project.gradle_path}:lintFix", + "language": "system", + "files": f"^{project.path}/.*\\.py$", + } + + def _generate_spotless_hook(self, project: Project) -> dict: + """Generate a spotless hook for Java projects.""" + return { + "id": f"{project.project_id}-spotless", + "name": f"{project.path} Spotless Apply", + "entry": f"./gradlew {project.gradle_path}:spotlessApply", + "language": "system", + "files": f"^{project.path}/.*\\.java$", + } + + +class PrecommitDumper(yaml.Dumper): + """Custom YAML dumper that maintains proper indentation.""" + + def increase_indent(self, flow=False, *args, **kwargs): + return super().increase_indent(flow=flow, 
indentless=False) + + +def write_yaml_with_spaces(file_path: str, data: dict): + """Write YAML file with extra spacing between hooks.""" + with open(file_path, "w") as f: + yaml_str = yaml.dump( + data, Dumper=PrecommitDumper, sort_keys=False, default_flow_style=False + ) + + # Add extra newline between hooks + lines = yaml_str.split("\n") + result = [] + in_hook = False + + for line in lines: + if line.strip().startswith("- id:"): + if in_hook: # If we were already in a hook, add extra newline + result.append("") + in_hook = True + elif not line.strip() and in_hook: + in_hook = False + + result.append(line) + + f.write("\n".join(result)) + + +def main(): + root_dir = os.path.abspath(os.curdir) + override_file = ".github/scripts/pre-commit-override.yaml" + + # Find projects + finder = ProjectFinder(root_dir) + projects = finder.find_all_projects() + + # Print summary + print("Found projects:") + print("\nJava projects:") + for project in projects: + if project.type == ProjectType.JAVA: + print(f" - {project.path}") + + print("\nPython projects:") + for project in projects: + if project.type == ProjectType.PYTHON: + print(f" - {project.path}") + + # Generate and write config + generator = HookGenerator(projects, override_file) + config = generator.generate_config() + write_yaml_with_spaces(".pre-commit-config.yaml", config) + + print("\nGenerated .pre-commit-config.yaml") + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/.github/scripts/pre-commit-override.yaml b/.github/scripts/pre-commit-override.yaml new file mode 100644 index 00000000000000..a085d9ea3ee93b --- /dev/null +++ b/.github/scripts/pre-commit-override.yaml @@ -0,0 +1,8 @@ +repos: + - repo: local + hooks: + - id: smoke-test-cypress-lint-fix + name: smoke-test cypress Lint Fix + entry: ./gradlew :smoke-test:cypressLintFix + language: system + files: ^smoke-test/tests/cypress/.*$ \ No newline at end of file diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 898e3d262b3941..c4edc2cc176355 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,26 +1,380 @@ -exclude: ^$ -files: ^(docs/|docs-website/|metadata-ingestion/) repos: - - repo: https://github.com/pre-commit/mirrors-isort - rev: v5.10.1 + - repo: local hooks: - - id: isort - - repo: https://github.com/ambv/black - rev: 23.1.0 - hooks: - - id: black - - repo: https://github.com/myint/autoflake - rev: v1.4 - hooks: - - id: autoflake - args: - - --in-place - - --remove-unused-variables - - --remove-all-unused-imports - - --expand-star-imports - - repo: https://github.com/pre-commit/mirrors-prettier - rev: "v3.0.0-alpha.6" # Use the sha or tag you want to point at - hooks: - - id: prettier - args: - - --write \ No newline at end of file + - id: datahub-graphql-core-spotless + name: datahub-graphql-core Spotless Apply + entry: ./gradlew :datahub-graphql-core:spotlessApply + language: system + files: ^datahub-graphql-core/.*\.java$ + + - id: datahub-upgrade-spotless + name: datahub-upgrade Spotless Apply + entry: ./gradlew :datahub-upgrade:spotlessApply + language: system + files: ^datahub-upgrade/.*\.java$ + + - id: entity-registry-spotless + name: entity-registry Spotless Apply + entry: ./gradlew :entity-registry:spotlessApply + language: system + files: ^entity-registry/.*\.java$ + + - id: ingestion-scheduler-spotless + name: ingestion-scheduler Spotless Apply + entry: ./gradlew :ingestion-scheduler:spotlessApply + language: system + files: ^ingestion-scheduler/.*\.java$ + + - id: li-utils-spotless + name: 
li-utils Spotless Apply + entry: ./gradlew :li-utils:spotlessApply + language: system + files: ^li-utils/.*\.java$ + + - id: metadata-auth-auth-api-spotless + name: metadata-auth/auth-api Spotless Apply + entry: ./gradlew :metadata-auth:auth-api:spotlessApply + language: system + files: ^metadata-auth/auth-api/.*\.java$ + + - id: metadata-dao-impl-kafka-producer-spotless + name: metadata-dao-impl/kafka-producer Spotless Apply + entry: ./gradlew :metadata-dao-impl:kafka-producer:spotlessApply + language: system + files: ^metadata-dao-impl/kafka-producer/.*\.java$ + + - id: metadata-events-mxe-avro-spotless + name: metadata-events/mxe-avro Spotless Apply + entry: ./gradlew :metadata-events:mxe-avro:spotlessApply + language: system + files: ^metadata-events/mxe-avro/.*\.java$ + + - id: metadata-events-mxe-registration-spotless + name: metadata-events/mxe-registration Spotless Apply + entry: ./gradlew :metadata-events:mxe-registration:spotlessApply + language: system + files: ^metadata-events/mxe-registration/.*\.java$ + + - id: metadata-events-mxe-schemas-spotless + name: metadata-events/mxe-schemas Spotless Apply + entry: ./gradlew :metadata-events:mxe-schemas:spotlessApply + language: system + files: ^metadata-events/mxe-schemas/.*\.java$ + + - id: metadata-events-mxe-utils-avro-spotless + name: metadata-events/mxe-utils-avro Spotless Apply + entry: ./gradlew :metadata-events:mxe-utils-avro:spotlessApply + language: system + files: ^metadata-events/mxe-utils-avro/.*\.java$ + + - id: metadata-ingestion-lint-fix + name: metadata-ingestion Lint Fix + entry: ./gradlew :metadata-ingestion:lintFix + language: system + files: ^metadata-ingestion/.*\.py$ + + - id: metadata-ingestion-modules-airflow-plugin-lint-fix + name: metadata-ingestion-modules/airflow-plugin Lint Fix + entry: ./gradlew :metadata-ingestion-modules:airflow-plugin:lintFix + language: system + files: ^metadata-ingestion-modules/airflow-plugin/.*\.py$ + + - id: metadata-ingestion-modules-dagster-plugin-lint-fix + name: metadata-ingestion-modules/dagster-plugin Lint Fix + entry: ./gradlew :metadata-ingestion-modules:dagster-plugin:lintFix + language: system + files: ^metadata-ingestion-modules/dagster-plugin/.*\.py$ + + - id: metadata-ingestion-modules-gx-plugin-lint-fix + name: metadata-ingestion-modules/gx-plugin Lint Fix + entry: ./gradlew :metadata-ingestion-modules:gx-plugin:lintFix + language: system + files: ^metadata-ingestion-modules/gx-plugin/.*\.py$ + + - id: metadata-ingestion-modules-prefect-plugin-lint-fix + name: metadata-ingestion-modules/prefect-plugin Lint Fix + entry: ./gradlew :metadata-ingestion-modules:prefect-plugin:lintFix + language: system + files: ^metadata-ingestion-modules/prefect-plugin/.*\.py$ + + - id: metadata-integration-java-acryl-spark-lineage-spotless + name: metadata-integration/java/acryl-spark-lineage Spotless Apply + entry: ./gradlew :metadata-integration:java:acryl-spark-lineage:spotlessApply + language: system + files: ^metadata-integration/java/acryl-spark-lineage/.*\.java$ + + - id: metadata-integration-java-datahub-client-spotless + name: metadata-integration/java/datahub-client Spotless Apply + entry: ./gradlew :metadata-integration:java:datahub-client:spotlessApply + language: system + files: ^metadata-integration/java/datahub-client/.*\.java$ + + - id: metadata-integration-java-datahub-event-spotless + name: metadata-integration/java/datahub-event Spotless Apply + entry: ./gradlew :metadata-integration:java:datahub-event:spotlessApply + language: system + files: 
^metadata-integration/java/datahub-event/.*\.java$ + + - id: metadata-integration-java-datahub-protobuf-spotless + name: metadata-integration/java/datahub-protobuf Spotless Apply + entry: ./gradlew :metadata-integration:java:datahub-protobuf:spotlessApply + language: system + files: ^metadata-integration/java/datahub-protobuf/.*\.java$ + + - id: metadata-integration-java-datahub-schematron-cli-spotless + name: metadata-integration/java/datahub-schematron/cli Spotless Apply + entry: ./gradlew :metadata-integration:java:datahub-schematron:cli:spotlessApply + language: system + files: ^metadata-integration/java/datahub-schematron/cli/.*\.java$ + + - id: metadata-integration-java-datahub-schematron-lib-spotless + name: metadata-integration/java/datahub-schematron/lib Spotless Apply + entry: ./gradlew :metadata-integration:java:datahub-schematron:lib:spotlessApply + language: system + files: ^metadata-integration/java/datahub-schematron/lib/.*\.java$ + + - id: metadata-integration-java-examples-spotless + name: metadata-integration/java/examples Spotless Apply + entry: ./gradlew :metadata-integration:java:examples:spotlessApply + language: system + files: ^metadata-integration/java/examples/.*\.java$ + + - id: metadata-integration-java-openlineage-converter-spotless + name: metadata-integration/java/openlineage-converter Spotless Apply + entry: ./gradlew :metadata-integration:java:openlineage-converter:spotlessApply + language: system + files: ^metadata-integration/java/openlineage-converter/.*\.java$ + + - id: metadata-integration-java-spark-lineage-legacy-spotless + name: metadata-integration/java/spark-lineage-legacy Spotless Apply + entry: ./gradlew :metadata-integration:java:spark-lineage-legacy:spotlessApply + language: system + files: ^metadata-integration/java/spark-lineage-legacy/.*\.java$ + + - id: metadata-io-spotless + name: metadata-io Spotless Apply + entry: ./gradlew :metadata-io:spotlessApply + language: system + files: ^metadata-io/.*\.java$ + + - id: metadata-io-metadata-io-api-spotless + name: metadata-io/metadata-io-api Spotless Apply + entry: ./gradlew :metadata-io:metadata-io-api:spotlessApply + language: system + files: ^metadata-io/metadata-io-api/.*\.java$ + + - id: metadata-jobs-common-spotless + name: metadata-jobs/common Spotless Apply + entry: ./gradlew :metadata-jobs:common:spotlessApply + language: system + files: ^metadata-jobs/common/.*\.java$ + + - id: metadata-jobs-mae-consumer-spotless + name: metadata-jobs/mae-consumer Spotless Apply + entry: ./gradlew :metadata-jobs:mae-consumer:spotlessApply + language: system + files: ^metadata-jobs/mae-consumer/.*\.java$ + + - id: metadata-jobs-mae-consumer-job-spotless + name: metadata-jobs/mae-consumer-job Spotless Apply + entry: ./gradlew :metadata-jobs:mae-consumer-job:spotlessApply + language: system + files: ^metadata-jobs/mae-consumer-job/.*\.java$ + + - id: metadata-jobs-mce-consumer-spotless + name: metadata-jobs/mce-consumer Spotless Apply + entry: ./gradlew :metadata-jobs:mce-consumer:spotlessApply + language: system + files: ^metadata-jobs/mce-consumer/.*\.java$ + + - id: metadata-jobs-mce-consumer-job-spotless + name: metadata-jobs/mce-consumer-job Spotless Apply + entry: ./gradlew :metadata-jobs:mce-consumer-job:spotlessApply + language: system + files: ^metadata-jobs/mce-consumer-job/.*\.java$ + + - id: metadata-jobs-pe-consumer-spotless + name: metadata-jobs/pe-consumer Spotless Apply + entry: ./gradlew :metadata-jobs:pe-consumer:spotlessApply + language: system + files: 
^metadata-jobs/pe-consumer/.*\.java$ + + - id: metadata-models-spotless + name: metadata-models Spotless Apply + entry: ./gradlew :metadata-models:spotlessApply + language: system + files: ^metadata-models/.*\.java$ + + - id: metadata-models-custom-spotless + name: metadata-models-custom Spotless Apply + entry: ./gradlew :metadata-models-custom:spotlessApply + language: system + files: ^metadata-models-custom/.*\.java$ + + - id: metadata-models-validator-spotless + name: metadata-models-validator Spotless Apply + entry: ./gradlew :metadata-models-validator:spotlessApply + language: system + files: ^metadata-models-validator/.*\.java$ + + - id: metadata-operation-context-spotless + name: metadata-operation-context Spotless Apply + entry: ./gradlew :metadata-operation-context:spotlessApply + language: system + files: ^metadata-operation-context/.*\.java$ + + - id: metadata-service-auth-config-spotless + name: metadata-service/auth-config Spotless Apply + entry: ./gradlew :metadata-service:auth-config:spotlessApply + language: system + files: ^metadata-service/auth-config/.*\.java$ + + - id: metadata-service-auth-filter-spotless + name: metadata-service/auth-filter Spotless Apply + entry: ./gradlew :metadata-service:auth-filter:spotlessApply + language: system + files: ^metadata-service/auth-filter/.*\.java$ + + - id: metadata-service-auth-impl-spotless + name: metadata-service/auth-impl Spotless Apply + entry: ./gradlew :metadata-service:auth-impl:spotlessApply + language: system + files: ^metadata-service/auth-impl/.*\.java$ + + - id: metadata-service-auth-servlet-impl-spotless + name: metadata-service/auth-servlet-impl Spotless Apply + entry: ./gradlew :metadata-service:auth-servlet-impl:spotlessApply + language: system + files: ^metadata-service/auth-servlet-impl/.*\.java$ + + - id: metadata-service-configuration-spotless + name: metadata-service/configuration Spotless Apply + entry: ./gradlew :metadata-service:configuration:spotlessApply + language: system + files: ^metadata-service/configuration/.*\.java$ + + - id: metadata-service-factories-spotless + name: metadata-service/factories Spotless Apply + entry: ./gradlew :metadata-service:factories:spotlessApply + language: system + files: ^metadata-service/factories/.*\.java$ + + - id: metadata-service-graphql-servlet-impl-spotless + name: metadata-service/graphql-servlet-impl Spotless Apply + entry: ./gradlew :metadata-service:graphql-servlet-impl:spotlessApply + language: system + files: ^metadata-service/graphql-servlet-impl/.*\.java$ + + - id: metadata-service-openapi-analytics-servlet-spotless + name: metadata-service/openapi-analytics-servlet Spotless Apply + entry: ./gradlew :metadata-service:openapi-analytics-servlet:spotlessApply + language: system + files: ^metadata-service/openapi-analytics-servlet/.*\.java$ + + - id: metadata-service-openapi-entity-servlet-spotless + name: metadata-service/openapi-entity-servlet Spotless Apply + entry: ./gradlew :metadata-service:openapi-entity-servlet:spotlessApply + language: system + files: ^metadata-service/openapi-entity-servlet/.*\.java$ + + - id: metadata-service-openapi-entity-servlet-generators-spotless + name: metadata-service/openapi-entity-servlet/generators Spotless Apply + entry: ./gradlew :metadata-service:openapi-entity-servlet:generators:spotlessApply + language: system + files: ^metadata-service/openapi-entity-servlet/generators/.*\.java$ + + - id: metadata-service-openapi-servlet-spotless + name: metadata-service/openapi-servlet Spotless Apply + entry: ./gradlew 
:metadata-service:openapi-servlet:spotlessApply + language: system + files: ^metadata-service/openapi-servlet/.*\.java$ + + - id: metadata-service-openapi-servlet-models-spotless + name: metadata-service/openapi-servlet/models Spotless Apply + entry: ./gradlew :metadata-service:openapi-servlet:models:spotlessApply + language: system + files: ^metadata-service/openapi-servlet/models/.*\.java$ + + - id: metadata-service-plugin-spotless + name: metadata-service/plugin Spotless Apply + entry: ./gradlew :metadata-service:plugin:spotlessApply + language: system + files: ^metadata-service/plugin/.*\.java$ + + - id: metadata-service-plugin-src-test-sample-test-plugins-spotless + name: metadata-service/plugin/src/test/sample-test-plugins Spotless Apply + entry: ./gradlew :metadata-service:plugin:src:test:sample-test-plugins:spotlessApply + language: system + files: ^metadata-service/plugin/src/test/sample-test-plugins/.*\.java$ + + - id: metadata-service-restli-client-spotless + name: metadata-service/restli-client Spotless Apply + entry: ./gradlew :metadata-service:restli-client:spotlessApply + language: system + files: ^metadata-service/restli-client/.*\.java$ + + - id: metadata-service-restli-client-api-spotless + name: metadata-service/restli-client-api Spotless Apply + entry: ./gradlew :metadata-service:restli-client-api:spotlessApply + language: system + files: ^metadata-service/restli-client-api/.*\.java$ + + - id: metadata-service-restli-servlet-impl-spotless + name: metadata-service/restli-servlet-impl Spotless Apply + entry: ./gradlew :metadata-service:restli-servlet-impl:spotlessApply + language: system + files: ^metadata-service/restli-servlet-impl/.*\.java$ + + - id: metadata-service-schema-registry-api-spotless + name: metadata-service/schema-registry-api Spotless Apply + entry: ./gradlew :metadata-service:schema-registry-api:spotlessApply + language: system + files: ^metadata-service/schema-registry-api/.*\.java$ + + - id: metadata-service-schema-registry-servlet-spotless + name: metadata-service/schema-registry-servlet Spotless Apply + entry: ./gradlew :metadata-service:schema-registry-servlet:spotlessApply + language: system + files: ^metadata-service/schema-registry-servlet/.*\.java$ + + - id: metadata-service-services-spotless + name: metadata-service/services Spotless Apply + entry: ./gradlew :metadata-service:services:spotlessApply + language: system + files: ^metadata-service/services/.*\.java$ + + - id: metadata-service-servlet-spotless + name: metadata-service/servlet Spotless Apply + entry: ./gradlew :metadata-service:servlet:spotlessApply + language: system + files: ^metadata-service/servlet/.*\.java$ + + - id: metadata-utils-spotless + name: metadata-utils Spotless Apply + entry: ./gradlew :metadata-utils:spotlessApply + language: system + files: ^metadata-utils/.*\.java$ + + - id: mock-entity-registry-spotless + name: mock-entity-registry Spotless Apply + entry: ./gradlew :mock-entity-registry:spotlessApply + language: system + files: ^mock-entity-registry/.*\.java$ + + - id: smoke-test-lint-fix + name: smoke-test Lint Fix + entry: ./gradlew :smoke-test:lintFix + language: system + files: ^smoke-test/.*\.py$ + + - id: test-models-spotless + name: test-models Spotless Apply + entry: ./gradlew :test-models:spotlessApply + language: system + files: ^test-models/.*\.java$ + + - id: smoke-test-cypress-lint-fix + name: smoke-test cypress Lint Fix + entry: ./gradlew :smoke-test:cypressLintFix + language: system + files: ^smoke-test/tests/cypress/.*$ diff --git a/build.gradle 
b/build.gradle index 8929b4e644972c..3c36feadc5f4bb 100644 --- a/build.gradle +++ b/build.gradle @@ -474,10 +474,6 @@ subprojects { if (compileJavaTask != null) { spotlessJavaTask.dependsOn compileJavaTask } - // TODO - Do not run this in CI. How? - // tasks.withType(JavaCompile) { - // finalizedBy(tasks.findByName('spotlessApply')) - // } } } diff --git a/settings.gradle b/settings.gradle index b0c2c707d566c0..77d0706549a439 100644 --- a/settings.gradle +++ b/settings.gradle @@ -78,3 +78,52 @@ include ':metadata-operation-context' include ':metadata-service:openapi-servlet:models' include ':metadata-integration:java:datahub-schematron:lib' include ':metadata-integration:java:datahub-schematron:cli' + +def installPreCommitHooks() { + def preCommitInstalled = false + try { + def process = ["which", "pre-commit"].execute() + def stdout = new StringBuilder() + def stderr = new StringBuilder() + process.waitForProcessOutput(stdout, stderr) + preCommitInstalled = (process.exitValue() == 0) + println "Pre-commit check: ${stdout}" + } catch (Exception e) { + println "Error checking pre-commit: ${e.message}" + return + } + + if (!preCommitInstalled) { + try { + def installProcess = ["python", "-m", "pip", "install", "pre-commit"].execute() + def stdout = new StringBuilder() + def stderr = new StringBuilder() + installProcess.waitForProcessOutput(stdout, stderr) + if (installProcess.exitValue() != 0) { + println "Failed to install pre-commit: ${stderr}" + return + } + println "Install output: ${stdout}" + } catch (Exception e) { + println "Error installing pre-commit: ${e.message}" + return + } + } + + try { + def installHooksProcess = ["python", "-m", "pre_commit", "install"].execute() + def stdout = new StringBuilder() + def stderr = new StringBuilder() + installHooksProcess.waitForProcessOutput(stdout, stderr) + if (installHooksProcess.exitValue() != 0) { + println "Failed to install hooks: ${stderr}" + return + } + println "Hooks output: ${stdout}" + } catch (Exception e) { + println "Error installing hooks: ${e.message}" + return + } +} + +installPreCommitHooks() \ No newline at end of file From 92f013e6e179a63b8877ad7344f428a9869ae1d7 Mon Sep 17 00:00:00 2001 From: Andrew Sikowitz Date: Wed, 8 Jan 2025 11:40:02 -0800 Subject: [PATCH 13/14] fix(ingest/file-backed-collections): Properly set _use_sqlite_on_conflict (#12297) --- .../utilities/file_backed_collections.py | 2 +- .../utilities/test_file_backed_collections.py | 31 +++++++++++++++++++ 2 files changed, 32 insertions(+), 1 deletion(-) diff --git a/metadata-ingestion/src/datahub/utilities/file_backed_collections.py b/metadata-ingestion/src/datahub/utilities/file_backed_collections.py index b8c27666d7f538..fb028605c35b77 100644 --- a/metadata-ingestion/src/datahub/utilities/file_backed_collections.py +++ b/metadata-ingestion/src/datahub/utilities/file_backed_collections.py @@ -243,7 +243,7 @@ def __post_init__(self) -> None: # This was added in 3.24.0 from 2018-06-04. 
# See https://www.sqlite.org/lang_conflict.html if OVERRIDE_SQLITE_VERSION_REQUIREMENT: - self.use_sqlite_on_conflict = False + self._use_sqlite_on_conflict = False else: raise RuntimeError("SQLite version 3.24.0 or later is required") diff --git a/metadata-ingestion/tests/unit/utilities/test_file_backed_collections.py b/metadata-ingestion/tests/unit/utilities/test_file_backed_collections.py index 6230c2e37edc6a..7e1627151c6ebf 100644 --- a/metadata-ingestion/tests/unit/utilities/test_file_backed_collections.py +++ b/metadata-ingestion/tests/unit/utilities/test_file_backed_collections.py @@ -5,6 +5,7 @@ import sqlite3 from dataclasses import dataclass from typing import Counter, Dict +from unittest.mock import patch import pytest @@ -15,6 +16,36 @@ ) +def test_set_use_sqlite_on_conflict(): + with patch("sqlite3.sqlite_version_info", (3, 24, 0)): + cache = FileBackedDict[int]( + tablename="cache", + cache_max_size=10, + cache_eviction_batch_size=10, + ) + assert cache._use_sqlite_on_conflict is True + + with pytest.raises(RuntimeError): + with patch("sqlite3.sqlite_version_info", (3, 23, 1)): + cache = FileBackedDict[int]( + tablename="cache", + cache_max_size=10, + cache_eviction_batch_size=10, + ) + assert cache._use_sqlite_on_conflict is False + + with patch("sqlite3.sqlite_version_info", (3, 23, 1)), patch( + "datahub.utilities.file_backed_collections.OVERRIDE_SQLITE_VERSION_REQUIREMENT", + True, + ): + cache = FileBackedDict[int]( + tablename="cache", + cache_max_size=10, + cache_eviction_batch_size=10, + ) + assert cache._use_sqlite_on_conflict is False + + @pytest.mark.parametrize("use_sqlite_on_conflict", [True, False]) def test_file_dict(use_sqlite_on_conflict: bool) -> None: cache = FileBackedDict[int]( From ea4d40e5353f4061fa02c4af229fdfc5a58af9d3 Mon Sep 17 00:00:00 2001 From: kevinkarchacryl Date: Wed, 8 Jan 2025 17:51:51 -0500 Subject: [PATCH 14/14] fix(doc): make folder_path_pattern usage more clear (#12298) --- .../datahub/ingestion/source/looker/looker_config.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_config.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_config.py index bfae3060013d59..4e9d0f68928a45 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_config.py @@ -300,11 +300,16 @@ class LookerDashboardSourceConfig( folder_path_pattern: AllowDenyPattern = Field( default=AllowDenyPattern.allow_all(), - description="Allow or deny dashboards from specific folders. " + description="Allow or deny dashboards from specific folders using their fully qualified paths. " "For example: \n" "deny: \n" - " - sales/deprecated \n" - "This pattern will deny the ingestion of all dashboards and looks within the sales/deprecated folder. \n" + " - Shared/deprecated \n" + "This pattern will deny the ingestion of all dashboards and looks within the Shared/deprecated folder. \n" + "allow: \n" + " - Shared/sales \n" + "This pattern will allow only the ingestion of dashboards within the Shared/sales folder. \n" + "To get the correct path from Looker, take the folder hierarchy shown in the UI and join it with slashes. " + "For example, Shared -> Customer Reports -> Sales becomes Shared/Customer Reports/Sales. " "Dashboards will only be ingested if they're allowed by both this config and dashboard_pattern.", )
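
[Illustrative note, not part of the patch above] The clarified folder_path_pattern description can be exercised in a Looker ingestion recipe roughly as sketched below. This is a minimal sketch under assumptions: the surrounding fields (type, base_url, client_id, client_secret) follow the standard Looker source config and are shown only for context, and the folder names are the examples used in the description itself, not real folders.

    source:
      type: looker
      config:
        base_url: "https://company.looker.com"        # assumed placeholder instance
        client_id: "${LOOKER_CLIENT_ID}"
        client_secret: "${LOOKER_CLIENT_SECRET}"
        # Fully qualified folder paths: join the folder hierarchy shown in the
        # Looker UI with slashes, e.g. Shared -> Customer Reports -> Sales
        # becomes "Shared/Customer Reports/Sales".
        folder_path_pattern:
          allow:
            - "Shared/Customer Reports/Sales"
          deny:
            - "Shared/deprecated"

Under this sketch, only dashboards and looks whose fully qualified folder path is allowed (and not denied) would be ingested, with dashboard_pattern still applied on top of this filter, as the updated description states.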