From c3c9dac4acf0cc42d4dceb32df806c7f06592bae Mon Sep 17 00:00:00 2001 From: Mayuri Nehate <33225191+mayurinehate@users.noreply.github.com> Date: Tue, 6 Jun 2023 00:14:52 +0530 Subject: [PATCH] add support for regex-based custom infotype (#17) --- datahub-classify/README.md | 3 + .../src/datahub_classify/helper_classes.py | 18 +-- .../src/datahub_classify/infotype_helper.py | 29 ++++ .../datahub_classify/infotype_predictor.py | 8 +- .../tests/test_custom_infotype_predictor.py | 133 ++++++++++++++++++ 5 files changed, 180 insertions(+), 11 deletions(-) create mode 100644 datahub-classify/tests/test_custom_infotype_predictor.py diff --git a/datahub-classify/README.md b/datahub-classify/README.md index 5990cdd..aca7b10 100644 --- a/datahub-classify/README.md +++ b/datahub-classify/README.md @@ -95,6 +95,7 @@ A debug information is associated with each infotype proposal, it provides detai ## Supported Infotypes +The infotypes below are supported out of the box. 1. Age 2. Gender 3. Person Name / Full Name @@ -110,6 +111,8 @@ A debug information is associated with each infotype proposal, it provides detai 13. Swift Code 14. US Driving License Number +Regex-based custom infotypes are also supported. Specify the custom infotype configuration in the format described [here](#infotype-configuration). + ## Assumptions - If value prediction factor weight is non-zero (indicating values should be used for infotype inspection) then a minimum 50 non-null column values should be present. 
diff --git a/datahub-classify/src/datahub_classify/helper_classes.py b/datahub-classify/src/datahub_classify/helper_classes.py index d61d3c6..2a17518 100644 --- a/datahub-classify/src/datahub_classify/helper_classes.py +++ b/datahub-classify/src/datahub_classify/helper_classes.py @@ -2,11 +2,19 @@ from typing import Any, Dict, List, Optional +@dataclass +class DebugInfo: + name: Optional[float] = None + description: Optional[float] = None + datatype: Optional[float] = None + values: Optional[float] = None + + @dataclass class InfotypeProposal: infotype: str confidence_level: float - debug_info: Dict[str, Any] + debug_info: DebugInfo @dataclass @@ -29,11 +37,3 @@ class ColumnInfo: metadata: Metadata values: List[Any] infotype_proposals: Optional[List[InfotypeProposal]] = None - - -@dataclass -class DebugInfo: - name: Optional[float] = None - description: Optional[float] = None - datatype: Optional[float] = None - values: Optional[float] = None diff --git a/datahub-classify/src/datahub_classify/infotype_helper.py b/datahub-classify/src/datahub_classify/infotype_helper.py index 23593d4..0ce9271 100644 --- a/datahub-classify/src/datahub_classify/infotype_helper.py +++ b/datahub-classify/src/datahub_classify/infotype_helper.py @@ -121,6 +121,35 @@ def inspect_for_email_address( return confidence_level, debug_info +def inspect_for_custom_infotype( + metadata: Metadata, values: List[Any], config: Dict[str, Dict] +) -> Tuple[float, DebugInfo]: + prediction_factors_weights = config[PREDICTION_FACTORS_AND_WEIGHTS] + debug_info = DebugInfo() + # Value Logic + if prediction_factors_weights.get(VALUES, 0) > 0: + values_score = 0.0 + try: + if config[VALUES][PREDICTION_TYPE] == "regex": + values_score = match_regex_for_values(values, config[VALUES][REGEX]) + elif config[VALUES][PREDICTION_TYPE] == "library": + raise Exception( + "Currently prediction type 'library' is not supported for custom infotype" + ) + else: + raise Exception( + f"Inappropriate Prediction type 
{config[VALUES][PREDICTION_TYPE]}" + ) + except Exception as e: + logger.error(f"Column {metadata.name} failed due to {e}") + values_score = np.round(values_score, 2) + debug_info.values = values_score + + debug_info = compute_name_description_dtype_score(metadata, config, debug_info) + confidence_level = compute_overall_confidence(debug_info, config) + return confidence_level, debug_info + + def inspect_for_street_address( metadata: Metadata, values: List[Any], config: Dict[str, Dict] ) -> Tuple[float, DebugInfo]: # noqa: C901 diff --git a/datahub-classify/src/datahub_classify/infotype_predictor.py b/datahub-classify/src/datahub_classify/infotype_predictor.py index 3c56f5a..31db532 100644 --- a/datahub-classify/src/datahub_classify/infotype_predictor.py +++ b/datahub-classify/src/datahub_classify/infotype_predictor.py @@ -24,7 +24,11 @@ def get_infotype_function_mapping( logger.warning(f"Configuration is not available for infotype - {infotype}") else: fn_name = f"inspect_for_{infotype.lower()}" - infotype_function_map[infotype] = module_fn_dict[fn_name] + if fn_name in module_fn_dict: + infotype_function_map[infotype] = module_fn_dict[fn_name] + else: + fn_name = "inspect_for_custom_infotype" + infotype_function_map[infotype] = module_fn_dict[fn_name] return infotype_function_map @@ -75,7 +79,7 @@ def predict_infotypes( except Exception as e: # traceback.print_exc() - logger.warning(f"Failed to extract info type due to {e}") + logger.warning(f"Failed to extract info type {infotype} due to {e}") if len(proposal_list) > 0: num_cols_with_infotype_assigned += 1 column_info.infotype_proposals = proposal_list diff --git a/datahub-classify/tests/test_custom_infotype_predictor.py b/datahub-classify/tests/test_custom_infotype_predictor.py new file mode 100644 index 0000000..a3c338f --- /dev/null +++ b/datahub-classify/tests/test_custom_infotype_predictor.py @@ -0,0 +1,133 @@ +import random +import string +import uuid +from datetime import datetime, timedelta + +import 
pytest + +from datahub_classify.helper_classes import ColumnInfo, Metadata +from datahub_classify.infotype_predictor import predict_infotypes +from datahub_classify.reference_input import input1 as default_config + + +def random_vehicle_number(): + state_codes = ["MH", "TN", "BH", "DL"] + separators = ["-", " ", "", "_"] + return "".join( + [ + random.choice(state_codes), + random.choice(separators), + str(random.randint(1, 20)), + random.choice(separators), + "".join(random.choices(string.ascii_letters, k=random.randint(1, 3))), + random.choice(separators), + "".join(random.choices(string.digits, k=4)), + ] + ) + + +@pytest.fixture(scope="module") +def column_infos(): + return [ + ColumnInfo( + metadata=Metadata( + meta_info={ + "Name": "id", + "Description": "Primary", + "Datatype": "str", + "Dataset_Name": "entry_register", + } + ), + values=[uuid.uuid4() for i in range(1, 100)], + ), + ColumnInfo( + metadata=Metadata( + meta_info={ + "Name": "vehicle_number", + "Description": "Vehicle registration number ", + "Datatype": "str", + "Dataset_Name": "entry_register", + } + ), + values=[random_vehicle_number() for i in range(1, 100)], + ), + ColumnInfo( + metadata=Metadata( + meta_info={ + "Name": "entry_time", + "Description": "Time of vehicle's entry", + "Datatype": "datetime", + "Dataset_Name": "entry_register", + } + ), + values=[ + datetime.now() - timedelta(hours=random.randint(0, 24)) + for i in range(1, 100) + ], + ), + ] + + +@pytest.fixture +def custom_config_patch(): + return { + "IN_Vehicle_Registration_Number": { + "Prediction_Factors_and_Weights": { + "Name": 0.2, + "Description": 0.1, + "Datatype": 0.1, + "Values": 0.6, + }, + "Name": { + "regex": [ + "^.*vehicle.*num.*$", + "^.*license.*plat.*num.*$", + "^.*license.*plat.*num.*$", + "^.*vehicle.*plat.*num.*$", + "^.*vehicle.*num.*plat.*$", + ] + }, + "Description": { + "regex": [ + "^.*vehicle.*num.*$", + "^.*license.*plat.*num.*$", + "^.*license.*plat.*num.*$", + "^.*vehicle.*plat.*num.*$", + 
"^.*vehicle.*num.*plat.*$", + ] + }, + "Datatype": {"type": ["str", "varchar", "text"]}, + "Values": { + "prediction_type": "regex", + "regex": [r"[a-z]{2}[-_\s]?[0-9]{1,2}[-_\s]?[a-z]{2,3}[-_\s]?[0-9]{4}"], + "library": [], + }, + } + } + + +def test_custom_infotype_prediction(column_infos, custom_config_patch): + # Default config + out_column_infos = predict_infotypes( + column_infos, confidence_level_threshold=0.7, global_config=default_config + ) + assert not out_column_infos[0].infotype_proposals + assert not out_column_infos[1].infotype_proposals + assert not out_column_infos[2].infotype_proposals + + # Config with new custom infotype, all factors + config_new = default_config.copy() + config_new.update(custom_config_patch) + out_column_infos = predict_infotypes( + column_infos, confidence_level_threshold=0.7, global_config=config_new + ) + assert not out_column_infos[0].infotype_proposals + assert not out_column_infos[2].infotype_proposals + + predicted_infotypes = out_column_infos[1].infotype_proposals + assert predicted_infotypes + assert len(predicted_infotypes) == 1 + assert predicted_infotypes[0].infotype == "IN_Vehicle_Registration_Number" + assert predicted_infotypes[0].debug_info.name == 1 + assert predicted_infotypes[0].debug_info.description == 1 + assert predicted_infotypes[0].debug_info.datatype == 1