From 64b95e35e5089a920c5c27d617cc1f2668c49e72 Mon Sep 17 00:00:00 2001 From: Tsung-Ju Lii Date: Thu, 17 Oct 2024 15:16:25 +0800 Subject: [PATCH] [sc-29393] MongoDB crawler ignore system collections (#1010) --- metaphor/mongodb/README.md | 2 +- metaphor/mongodb/extractor.py | 6 ++++++ pyproject.toml | 2 +- 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/metaphor/mongodb/README.md b/metaphor/mongodb/README.md index fa6df335..4fcb089c 100644 --- a/metaphor/mongodb/README.md +++ b/metaphor/mongodb/README.md @@ -15,7 +15,7 @@ infer_schema_sample_size: # Number of documents to sample in a collection excluded_databases: # Databases to ignore. By default the databases "admin", "config", "local", "system" are excluded. - db1 - db2 -excluded_collections: # Collections to ignore. +excluded_collections: # Extra collections to ignore. Note that the system specific collections (`system.views`, `system.profile`, etc.) are always ignored, see https://www.mongodb.com/docs/manual/reference/system-collections/#database-specific-collections for more details. - coll1 - coll2 ``` diff --git a/metaphor/mongodb/extractor.py b/metaphor/mongodb/extractor.py index 907860fc..4cb16cb0 100644 --- a/metaphor/mongodb/extractor.py +++ b/metaphor/mongodb/extractor.py @@ -50,7 +50,13 @@ class MongoDBExtractor(BaseExtractor): def __init__(self, config: MongoDBConfig) -> None: super().__init__(config) self._sample_size = config.infer_schema_sample_size + self._excluded_collections = config.excluded_collections + # Always ignore these system collections + self._excluded_collections.update( + ["system.buckets", "system.profile", "system.js", "system.views"] + ) + self._excluded_databases = config.excluded_databases self.client = config.get_client() self._datasets: Dict[str, Dataset] = {} diff --git a/pyproject.toml b/pyproject.toml index 55d1806e..df4b2bd7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "metaphor-connectors" -version = "0.14.126" +version = "0.14.127" license = "Apache-2.0" description = "A collection of Python-based 'connectors' that extract metadata from various sources to ingest into the Metaphor app." authors = ["Metaphor "]