diff --git a/metaphor/common/infer_schema.py b/metaphor/common/infer_schema.py index 8f5f6c4a..a0f5bf41 100644 --- a/metaphor/common/infer_schema.py +++ b/metaphor/common/infer_schema.py @@ -83,8 +83,9 @@ def _get_field_native_type( elif set(field_types.keys()) == {datetime, str}: field_type = datetime else: + field_type = field_types.most_common(1)[0][0] logger.warning( - f"Multiple types found in field: field = {field_path}, types = {list(field_types.keys())}" + f"Multiple types found in field: field = {field_path}, types = {list(field_types.keys())}. Using the most common type {str(field_type)}" ) if isinstance(field_type, str): diff --git a/metaphor/mongodb/README.md b/metaphor/mongodb/README.md index 9fa209a1..fa6df335 100644 --- a/metaphor/mongodb/README.md +++ b/metaphor/mongodb/README.md @@ -11,7 +11,7 @@ uri: # The connection URI. auth_mechanism: # The authentication mechanism. Allowed values are "GSSAPI", "MONGODB-CR", "MONGODB-OIDC", "MONGODB-X509", "MONGODB-AWS", "PLAIN", "SCRAM-SHA-1", "SCRAM-SHA-256", "DEFAULT". Default is "DEFAULT". tls: # Whether to set TLS when connecting to MongoDB. Default is False. -infer_schema_sample_size: # Number of documents to sample in a collection in order to infer the schema. Set this to `null` to disable sampling and use all documents in the collections. Default is 1000. +infer_schema_sample_size: # Number of documents to sample in a collection in order to infer the schema. Set this to `null` to disable sampling and use all documents in the collections. To disable schema inference altogether, set this to 0. Default is 1000. excluded_databases: # Databases to ignore. By default the databases "admin", "config", "local", "system" are excluded. - db1 - db2 diff --git a/metaphor/mongodb/extractor.py b/metaphor/mongodb/extractor.py index d501e660..2b754002 100644 --- a/metaphor/mongodb/extractor.py +++ b/metaphor/mongodb/extractor.py @@ -8,6 +8,7 @@ from metaphor.common.entity_id import dataset_normalized_name from metaphor.common.event_util import ENTITY_TYPES from metaphor.common.infer_schema import SchemaTypeNameMapping, infer_schema +from metaphor.common.logger import get_logger from metaphor.common.utils import safe_float from metaphor.models.crawler_run_metadata import Platform from metaphor.models.metadata_change_event import ( @@ -22,6 +23,9 @@ from metaphor.mongodb.config import MongoDBConfig +logger = get_logger() + + class MongoDBExtractor(BaseExtractor): """MongoDB metadata extractor""" @@ -51,12 +55,20 @@ def __init__(self, config: MongoDBConfig) -> None: self._excluded_databases = config.excluded_databases self.client = config.get_client() self._datasets: Dict[str, Dataset] = {} + if self._sample_size is None: + logger.info( + "Not sampling, all objects in a collection will be used to infer the collection's schema" + ) + if self._sample_size == 0: + logger.info("Infer sample size set to 0, not inferring collection schema") @staticmethod def from_config_file(config_file: str) -> "MongoDBExtractor": return MongoDBExtractor(MongoDBConfig.from_yaml_file(config_file)) def _get_collection_schema(self, collection: MongoCollection): + if self._sample_size == 0: + return [] pipeline = [] if self._sample_size: pipeline.append( diff --git a/tests/mongodb/expected_datasets.json b/tests/mongodb/expected_datasets.json index 04e9a16a..ad41a4fc 100644 --- a/tests/mongodb/expected_datasets.json +++ b/tests/mongodb/expected_datasets.json @@ -3541,7 +3541,7 @@ { "fieldName": "depth", "fieldPath": "depth", - "nativeType": "mixed", + "nativeType": "String", "nullable": false }, { @@ -3755,13 +3755,13 @@ { "fieldName": "rating", "fieldPath": "imdb.rating", - "nativeType": "mixed", + "nativeType": "Float", "nullable": false }, { "fieldName": "votes", "fieldPath": "imdb.votes", - "nativeType": "mixed", + "nativeType": "Int", "nullable": false } ] @@ -3949,7 +3949,7 @@ { "fieldName": "year", "fieldPath": "year", - "nativeType": "mixed", + "nativeType": "Int", "nullable": false } ], @@ -4320,7 +4320,7 @@ { "fieldName": "price_amount", "fieldPath": "acquisition.price_amount", - "nativeType": "mixed", + "nativeType": "Int", "nullable": true }, { @@ -4396,7 +4396,7 @@ { "fieldName": "price_amount", "fieldPath": "acquisitions.price_amount", - "nativeType": "mixed", + "nativeType": "Int", "nullable": true }, { @@ -4772,7 +4772,7 @@ { "fieldName": "raised_amount", "fieldPath": "investments.funding_round.raised_amount", - "nativeType": "mixed", + "nativeType": "Int", "nullable": true }, { @@ -4836,7 +4836,7 @@ { "fieldName": "valuation_amount", "fieldPath": "ipo.valuation_amount", - "nativeType": "mixed", + "nativeType": "Int", "nullable": true }, { @@ -5338,19 +5338,19 @@ { "fieldName": "number", "fieldPath": "address.number", - "nativeType": "mixed", + "nativeType": "Int", "nullable": false }, { "fieldName": "street", "fieldPath": "address.street", - "nativeType": "mixed", + "nativeType": "String", "nullable": false }, { "fieldName": "zip", "fieldPath": "address.zip", - "nativeType": "mixed", + "nativeType": "Int", "nullable": false } ] @@ -5364,7 +5364,7 @@ { "fieldName": "certificate_number", "fieldPath": "certificate_number", - "nativeType": "mixed", + "nativeType": "Int", "nullable": false }, { @@ -5538,7 +5538,7 @@ { "fieldName": "airplane", "fieldPath": "airplane", - "nativeType": "mixed", + "nativeType": "String", "nullable": false }, { @@ -5550,13 +5550,13 @@ { "fieldName": "dst_airport", "fieldPath": "dst_airport", - "nativeType": "mixed", + "nativeType": "String", "nullable": false }, { "fieldName": "src_airport", "fieldPath": "src_airport", - "nativeType": "mixed", + "nativeType": "String", "nullable": false }, { @@ -5637,7 +5637,7 @@ { "fieldName": "id", "fieldPath": "id", - "nativeType": "mixed", + "nativeType": "String", "nullable": false }, { @@ -5850,7 +5850,7 @@ { "fieldName": "birth year", "fieldPath": "birth year", - "nativeType": "mixed", + "nativeType": "Int", "nullable": false }, {