From 9e94d824054c50d894a654b3766e69776d54cc16 Mon Sep 17 00:00:00 2001 From: alyiwang Date: Tue, 5 Nov 2024 23:21:03 -0800 Subject: [PATCH] dataset crawler filtering not working with [sc-29743] --- metaphor/bigquery/README.md | 1 + metaphor/snowflake/extractor.py | 10 +++++----- metaphor/snowflake/profile/extractor.py | 10 +++++----- pyproject.toml | 2 +- 4 files changed, 12 insertions(+), 11 deletions(-) diff --git a/metaphor/bigquery/README.md b/metaphor/bigquery/README.md index b22a0033..af564afc 100644 --- a/metaphor/bigquery/README.md +++ b/metaphor/bigquery/README.md @@ -112,6 +112,7 @@ See [Output Config](../common/docs/output.md) for more information. #### Filtering See [Filter Config](../common/docs/filter.md) for more information on the optional `filter` config. +> NOTE: While the filter config uses a `database -> schema -> table/view` hierarchy, BigQuery uses a `project -> dataset -> table/view` hierarchy. Please use the project ID at the top level in the include/exclude of the filter config. 
#### Tag Assignment diff --git a/metaphor/snowflake/extractor.py b/metaphor/snowflake/extractor.py index 1bc8fffa..534e5056 100644 --- a/metaphor/snowflake/extractor.py +++ b/metaphor/snowflake/extractor.py @@ -141,11 +141,11 @@ async def extract(self) -> Collection[ENTITY_TYPES]: with self._conn: cursor = self._conn.cursor() - databases = ( - self.fetch_databases(cursor) - if self._filter.includes is None - else list(self._filter.includes.keys()) - ) + databases = [ + db + for db in self.fetch_databases(cursor) + if self._filter.include_database(db) + ] logger.info(f"Databases to include: {databases}") shared_databases = self._fetch_shared_databases(cursor) diff --git a/metaphor/snowflake/profile/extractor.py b/metaphor/snowflake/profile/extractor.py index 680988d6..b6b86fca 100644 --- a/metaphor/snowflake/profile/extractor.py +++ b/metaphor/snowflake/profile/extractor.py @@ -86,11 +86,11 @@ async def extract(self) -> Collection[ENTITY_TYPES]: with self._conn: cursor = self._conn.cursor() - databases = ( - SnowflakeExtractor.fetch_databases(cursor) - if self._filter.includes is None - else list(self._filter.includes.keys()) - ) + databases = [ + db + for db in SnowflakeExtractor.fetch_databases(cursor) + if self._filter.include_database(db) + ] for database in databases: tables = self._fetch_tables(cursor, database) diff --git a/pyproject.toml b/pyproject.toml index 0f6225ce..76b80906 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "metaphor-connectors" -version = "0.14.150" +version = "0.14.151" license = "Apache-2.0" description = "A collection of Python-based 'connectors' that extract metadata from various sources to ingest into the Metaphor app." authors = ["Metaphor "]