Skip to content

Commit

Permalink
address comments
Browse files Browse the repository at this point in the history
  • Loading branch information
usefulalgorithm committed Oct 1, 2024
1 parent 748410e commit 3acee90
Show file tree
Hide file tree
Showing 4 changed files with 32 additions and 19 deletions.
3 changes: 2 additions & 1 deletion metaphor/common/infer_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,8 +83,9 @@ def _get_field_native_type(
elif set(field_types.keys()) == {datetime, str}:
field_type = datetime
else:
field_type = field_types.most_common(1)[0][0]
logger.warning(
f"Multiple types found in field: field = {field_path}, types = {list(field_types.keys())}"
f"Multiple types found in field: field = {field_path}, types = {list(field_types.keys())}. Using the most common type {str(field_type)}"
)

if isinstance(field_type, str):
Expand Down
2 changes: 1 addition & 1 deletion metaphor/mongodb/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ uri: <uri> # The connection URI.
auth_mechanism: <auth_mechanism> # The authentication mechanism. Allowed values are "GSSAPI", "MONGODB-CR", "MONGODB-OIDC", "MONGODB-X509", "MONGODB-AWS", "PLAIN", "SCRAM-SHA-1", "SCRAM-SHA-256", "DEFAULT". Default is "DEFAULT".
tls: <boolean> # Whether to set TLS when connecting to MongoDB. Default is False.

infer_schema_sample_size: <int> # Number of documents to sample in a collection in order to infer the schema. Set this to `null` to disable sampling and use all documents in the collections. Default is 1000.
infer_schema_sample_size: <int> # Number of documents to sample in a collection in order to infer the schema. Set this to `null` to disable sampling and use all documents in the collections. To disable schema inference altogether, set this to 0. Default is 1000.
excluded_databases: # Databases to ignore. By default the databases "admin", "config", "local", "system" are excluded.
- db1
- db2
Expand Down
12 changes: 12 additions & 0 deletions metaphor/mongodb/extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from metaphor.common.entity_id import dataset_normalized_name
from metaphor.common.event_util import ENTITY_TYPES
from metaphor.common.infer_schema import SchemaTypeNameMapping, infer_schema
from metaphor.common.logger import get_logger
from metaphor.common.utils import safe_float
from metaphor.models.crawler_run_metadata import Platform
from metaphor.models.metadata_change_event import (
Expand All @@ -22,6 +23,9 @@
from metaphor.mongodb.config import MongoDBConfig


logger = get_logger()


class MongoDBExtractor(BaseExtractor):
"""MongoDB metadata extractor"""

Expand Down Expand Up @@ -51,12 +55,20 @@ def __init__(self, config: MongoDBConfig) -> None:
self._excluded_databases = config.excluded_databases
self.client = config.get_client()
self._datasets: Dict[str, Dataset] = {}
if self._sample_size is None:
logger.info(
"Not sampling, all objects in a collection will be used to infer the collection's schema"
)
if self._sample_size == 0:
logger.info("Infer sample size set to 0, not inferring collection schema")

@staticmethod
def from_config_file(config_file: str) -> "MongoDBExtractor":
return MongoDBExtractor(MongoDBConfig.from_yaml_file(config_file))

def _get_collection_schema(self, collection: MongoCollection):
if self._sample_size == 0:
return []
pipeline = []
if self._sample_size:
pipeline.append(
Expand Down
34 changes: 17 additions & 17 deletions tests/mongodb/expected_datasets.json
Original file line number Diff line number Diff line change
Expand Up @@ -3541,7 +3541,7 @@
{
"fieldName": "depth",
"fieldPath": "depth",
"nativeType": "mixed",
"nativeType": "String",
"nullable": false
},
{
Expand Down Expand Up @@ -3755,13 +3755,13 @@
{
"fieldName": "rating",
"fieldPath": "imdb.rating",
"nativeType": "mixed",
"nativeType": "Float",
"nullable": false
},
{
"fieldName": "votes",
"fieldPath": "imdb.votes",
"nativeType": "mixed",
"nativeType": "Int",
"nullable": false
}
]
Expand Down Expand Up @@ -3949,7 +3949,7 @@
{
"fieldName": "year",
"fieldPath": "year",
"nativeType": "mixed",
"nativeType": "Int",
"nullable": false
}
],
Expand Down Expand Up @@ -4320,7 +4320,7 @@
{
"fieldName": "price_amount",
"fieldPath": "acquisition.price_amount",
"nativeType": "mixed",
"nativeType": "Int",
"nullable": true
},
{
Expand Down Expand Up @@ -4396,7 +4396,7 @@
{
"fieldName": "price_amount",
"fieldPath": "acquisitions.price_amount",
"nativeType": "mixed",
"nativeType": "Int",
"nullable": true
},
{
Expand Down Expand Up @@ -4772,7 +4772,7 @@
{
"fieldName": "raised_amount",
"fieldPath": "investments.funding_round.raised_amount",
"nativeType": "mixed",
"nativeType": "Int",
"nullable": true
},
{
Expand Down Expand Up @@ -4836,7 +4836,7 @@
{
"fieldName": "valuation_amount",
"fieldPath": "ipo.valuation_amount",
"nativeType": "mixed",
"nativeType": "Int",
"nullable": true
},
{
Expand Down Expand Up @@ -5338,19 +5338,19 @@
{
"fieldName": "number",
"fieldPath": "address.number",
"nativeType": "mixed",
"nativeType": "Int",
"nullable": false
},
{
"fieldName": "street",
"fieldPath": "address.street",
"nativeType": "mixed",
"nativeType": "String",
"nullable": false
},
{
"fieldName": "zip",
"fieldPath": "address.zip",
"nativeType": "mixed",
"nativeType": "Int",
"nullable": false
}
]
Expand All @@ -5364,7 +5364,7 @@
{
"fieldName": "certificate_number",
"fieldPath": "certificate_number",
"nativeType": "mixed",
"nativeType": "Int",
"nullable": false
},
{
Expand Down Expand Up @@ -5538,7 +5538,7 @@
{
"fieldName": "airplane",
"fieldPath": "airplane",
"nativeType": "mixed",
"nativeType": "String",
"nullable": false
},
{
Expand All @@ -5550,13 +5550,13 @@
{
"fieldName": "dst_airport",
"fieldPath": "dst_airport",
"nativeType": "mixed",
"nativeType": "String",
"nullable": false
},
{
"fieldName": "src_airport",
"fieldPath": "src_airport",
"nativeType": "mixed",
"nativeType": "String",
"nullable": false
},
{
Expand Down Expand Up @@ -5637,7 +5637,7 @@
{
"fieldName": "id",
"fieldPath": "id",
"nativeType": "mixed",
"nativeType": "String",
"nullable": false
},
{
Expand Down Expand Up @@ -5850,7 +5850,7 @@
{
"fieldName": "birth year",
"fieldPath": "birth year",
"nativeType": "mixed",
"nativeType": "Int",
"nullable": false
},
{
Expand Down

0 comments on commit 3acee90

Please sign in to comment.