Skip to content

Commit

Permalink
Merge branch 'main' into scotthuang/sc-29299/improve-athena-crawler
Browse files Browse the repository at this point in the history
  • Loading branch information
elic-eon committed Oct 17, 2024
2 parents bf847bf + 64b95e3 commit af1fffb
Show file tree
Hide file tree
Showing 2 changed files with 7 additions and 1 deletion.
2 changes: 1 addition & 1 deletion metaphor/mongodb/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ infer_schema_sample_size: <int> # Number of documents to sample in a collection
excluded_databases: # Databases to ignore. By default the databases "admin", "config", "local", "system" are excluded.
- db1
- db2
excluded_collections: # Collections to ignore.
excluded_collections: # Extra collections to ignore. MongoDB's database-specific system collections (`system.views`, `system.profile`, etc.) are always ignored; see https://www.mongodb.com/docs/manual/reference/system-collections/#database-specific-collections for more details.
- coll1
- coll2
```
Expand Down
6 changes: 6 additions & 0 deletions metaphor/mongodb/extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,13 @@ class MongoDBExtractor(BaseExtractor):
def __init__(self, config: MongoDBConfig) -> None:
super().__init__(config)
self._sample_size = config.infer_schema_sample_size

self._excluded_collections = config.excluded_collections
# Always ignore these system collections
self._excluded_collections.update(
["system.buckets", "system.profile", "system.js", "system.views"]
)

self._excluded_databases = config.excluded_databases
self.client = config.get_client()
self._datasets: Dict[str, Dataset] = {}
Expand Down

0 comments on commit af1fffb

Please sign in to comment.