From 93c04a0d88006794fd428480efb2cb498d821e75 Mon Sep 17 00:00:00 2001 From: Tsung-Ju Lii Date: Thu, 17 Oct 2024 12:24:39 +0800 Subject: [PATCH 1/2] [sc-29393] MongoDB crawler ignore system collections --- metaphor/mongodb/README.md | 2 +- metaphor/mongodb/config.py | 6 +++++- pyproject.toml | 2 +- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/metaphor/mongodb/README.md b/metaphor/mongodb/README.md index fa6df335..93243812 100644 --- a/metaphor/mongodb/README.md +++ b/metaphor/mongodb/README.md @@ -15,7 +15,7 @@ infer_schema_sample_size: # Number of documents to sample in a collection excluded_databases: # Databases to ignore. By default the databases "admin", "config", "local", "system" are excluded. - db1 - db2 -excluded_collections: # Collections to ignore. +excluded_collections: # Collections to ignore. By default the system specific collections are ignored, see https://www.mongodb.com/docs/manual/reference/system-collections/#database-specific-collections. - coll1 - coll2 ``` diff --git a/metaphor/mongodb/config.py b/metaphor/mongodb/config.py index c81deb68..8530102e 100644 --- a/metaphor/mongodb/config.py +++ b/metaphor/mongodb/config.py @@ -19,7 +19,11 @@ class MongoDBConfig(BaseConfig): excluded_databases: Set[str] = Field( default_factory=lambda: set(["admin", "config", "local", "system"]) ) - excluded_collections: Set[str] = Field(default_factory=set) + excluded_collections: Set[str] = Field( + default_factory=lambda: set( + ["system.buckets", "system.profile", "system.js", "system.views"] + ) + ) @field_validator("auth_mechanism", mode="before") def _validate_auth_mechanism(cls, auth_mechanism: str): diff --git a/pyproject.toml b/pyproject.toml index 55d1806e..df4b2bd7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "metaphor-connectors" -version = "0.14.126" +version = "0.14.127" license = "Apache-2.0" description = "A collection of Python-based 'connectors' that extract metadata from various 
sources to ingest into the Metaphor app." authors = ["Metaphor "] From b0b3e69220b41d376bf2963226098f2b124cdd17 Mon Sep 17 00:00:00 2001 From: Tsung-Ju Lii Date: Thu, 17 Oct 2024 12:42:51 +0800 Subject: [PATCH 2/2] address comments --- metaphor/mongodb/README.md | 2 +- metaphor/mongodb/config.py | 6 +----- metaphor/mongodb/extractor.py | 6 ++++++ 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/metaphor/mongodb/README.md b/metaphor/mongodb/README.md index 93243812..4fcb089c 100644 --- a/metaphor/mongodb/README.md +++ b/metaphor/mongodb/README.md @@ -15,7 +15,7 @@ infer_schema_sample_size: # Number of documents to sample in a collection excluded_databases: # Databases to ignore. By default the databases "admin", "config", "local", "system" are excluded. - db1 - db2 -excluded_collections: # Collections to ignore. By default the system specific collections are ignored, see https://www.mongodb.com/docs/manual/reference/system-collections/#database-specific-collections. +excluded_collections: # Extra collections to ignore. Note that the database-specific system collections (`system.views`, `system.profile`, etc.) are always ignored; see https://www.mongodb.com/docs/manual/reference/system-collections/#database-specific-collections for more details. 
- coll1 - coll2 ``` diff --git a/metaphor/mongodb/config.py b/metaphor/mongodb/config.py index 8530102e..c81deb68 100644 --- a/metaphor/mongodb/config.py +++ b/metaphor/mongodb/config.py @@ -19,11 +19,7 @@ class MongoDBConfig(BaseConfig): excluded_databases: Set[str] = Field( default_factory=lambda: set(["admin", "config", "local", "system"]) ) - excluded_collections: Set[str] = Field( - default_factory=lambda: set( - ["system.buckets", "system.profile", "system.js", "system.views"] - ) - ) + excluded_collections: Set[str] = Field(default_factory=set) @field_validator("auth_mechanism", mode="before") def _validate_auth_mechanism(cls, auth_mechanism: str): diff --git a/metaphor/mongodb/extractor.py b/metaphor/mongodb/extractor.py index 907860fc..4cb16cb0 100644 --- a/metaphor/mongodb/extractor.py +++ b/metaphor/mongodb/extractor.py @@ -50,7 +50,13 @@ class MongoDBExtractor(BaseExtractor): def __init__(self, config: MongoDBConfig) -> None: super().__init__(config) self._sample_size = config.infer_schema_sample_size + self._excluded_collections = config.excluded_collections + # Always ignore these system collections + self._excluded_collections.update( + ["system.buckets", "system.profile", "system.js", "system.views"] + ) + self._excluded_databases = config.excluded_databases self.client = config.get_client() self._datasets: Dict[str, Dataset] = {}