From 576d86759dfcb4966577889558c5e29ccc56ded8 Mon Sep 17 00:00:00 2001 From: Tsung-Ju Lii Date: Tue, 1 Oct 2024 18:28:05 +0800 Subject: [PATCH] [sc-27944] Add config to disable tag collection in Snowflake (#992) --- metaphor/snowflake/README.md | 8 ++++++++ metaphor/snowflake/config.py | 3 +++ metaphor/snowflake/extractor.py | 9 +++++++-- pyproject.toml | 2 +- 4 files changed, 19 insertions(+), 3 deletions(-) diff --git a/metaphor/snowflake/README.md b/metaphor/snowflake/README.md index 6225deb9..2a7a6ba9 100644 --- a/metaphor/snowflake/README.md +++ b/metaphor/snowflake/README.md @@ -122,6 +122,14 @@ account_usage_schema: . See [Tag Matcher Config](../common/docs/tag_matcher.md) for more information on the optional `tag_matcher` config. +#### Disable Platform Tags Collection + +To stop the crawler from collecting platform tags from Snowflake, set `collect_tags` to `false`: + +```yaml +collect_tags: false # Default is true. +``` + #### Query Logs By default, the snowflake connector will fetch a full day's query logs from yesterday, to be analyzed for additional metadata, such as dataset usage and lineage information. To backfill log data, one can set `lookback_days` to the desired value. To turn off query log fetching, set `lookback_days` to 0. diff --git a/metaphor/snowflake/config.py b/metaphor/snowflake/config.py index 3400a6f6..c4ec17ae 100644 --- a/metaphor/snowflake/config.py +++ b/metaphor/snowflake/config.py @@ -58,6 +58,9 @@ class SnowflakeBaseConfig(SnowflakeAuthConfig): # The fully qualified schema that contains all the account_usage views account_usage_schema: str = "SNOWFLAKE.ACCOUNT_USAGE" + # Whether to collect platform tags. 
+ collect_tags: bool = True + @dataclass(config=ConnectorConfig) class SnowflakeConfig(SnowflakeBaseConfig): diff --git a/metaphor/snowflake/extractor.py b/metaphor/snowflake/extractor.py index d4b19b0b..6c81a6b8 100644 --- a/metaphor/snowflake/extractor.py +++ b/metaphor/snowflake/extractor.py @@ -171,7 +171,10 @@ async def extract(self) -> Collection[ENTITY_TYPES]: self._fetch_primary_keys(cursor) self._fetch_unique_keys(cursor) - self._fetch_tags(cursor) + + # Only fetch the tags when collect_tags is True + if self._config.collect_tags: + self._fetch_tags(cursor) datasets = list(self._datasets.values()) tag_datasets(datasets, self._tag_matchers) @@ -902,7 +905,9 @@ def _init_dataset( database=database, schema=schema, table=table ) - dataset.system_tags = SystemTags(tags=[]) + # Only initialize this when collect_tags is True + if self._config.collect_tags: + dataset.system_tags = SystemTags(tags=[]) return dataset diff --git a/pyproject.toml b/pyproject.toml index 3b9f9c55..058fc5a1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "metaphor-connectors" -version = "0.14.110" +version = "0.14.111" license = "Apache-2.0" description = "A collection of Python-based 'connectors' that extract metadata from various sources to ingest into the Metaphor app." authors = ["Metaphor "]