add support for regex-based custom infotype (#17)

acryldata · Jun 5, 2023 · c3c9dac · c3c9dac
1 parent 5d4ebb1
commit c3c9dac
Show file tree

Hide file tree

Showing 5 changed files with 180 additions and 11 deletions.
diff --git a/datahub-classify/README.md b/datahub-classify/README.md
@@ -95,6 +95,7 @@ A debug information is associated with each infotype proposal, it provides detai
 
 ## Supported Infotypes
 
+The following infotypes are supported out of the box:
 1. Age
 2. Gender
 3. Person Name / Full Name
@@ -110,6 +111,8 @@ A debug information is associated with each infotype proposal, it provides detai
 13. Swift Code
 14. US Driving License Number
 
+Regex-based custom infotypes are also supported. Specify the custom infotype configuration in the format described [here](#infotype-configuration).
+
 ## Assumptions
 
 - If value prediction factor weight is non-zero (indicating values should be used for infotype inspection) then a minimum 50 non-null column values should be present.

diff --git a/datahub-classify/src/datahub_classify/helper_classes.py b/datahub-classify/src/datahub_classify/helper_classes.py
@@ -2,11 +2,19 @@
 from typing import Any, Dict, List, Optional
 
 
+@dataclass
+class DebugInfo:  # per-factor results carried by InfotypeProposal.debug_info; every field defaults to None
+    name: Optional[float] = None  # presumably the column-name factor score; populated outside this view
+    description: Optional[float] = None  # presumably the description factor score; populated outside this view
+    datatype: Optional[float] = None  # presumably the datatype factor score; populated outside this view
+    values: Optional[float] = None  # set to the rounded values score by inspect_for_custom_infotype
+
+
 @dataclass
 class InfotypeProposal:
     infotype: str
     confidence_level: float
-    debug_info: Dict[str, Any]
+    debug_info: DebugInfo
 
 
 @dataclass
@@ -29,11 +37,3 @@ class ColumnInfo:
     metadata: Metadata
     values: List[Any]
     infotype_proposals: Optional[List[InfotypeProposal]] = None
-
-
-@dataclass
-class DebugInfo:
-    name: Optional[float] = None
-    description: Optional[float] = None
-    datatype: Optional[float] = None
-    values: Optional[float] = None
diff --git a/datahub-classify/src/datahub_classify/infotype_helper.py b/datahub-classify/src/datahub_classify/infotype_helper.py
@@ -121,6 +121,35 @@ def inspect_for_email_address(
     return confidence_level, debug_info
 
 
+def inspect_for_custom_infotype(  # fallback inspector: mapped to any infotype with no inspect_for_<infotype> function
+    metadata: Metadata, values: List[Any], config: Dict[str, Dict]
+) -> Tuple[float, DebugInfo]:
+    prediction_factors_weights = config[PREDICTION_FACTORS_AND_WEIGHTS]
+    debug_info = DebugInfo()
+    # Score the column values only when the VALUES factor carries a positive weight
+    if prediction_factors_weights.get(VALUES, 0) > 0:
+        values_score = 0.0  # remains the score if the lookup or matching below fails
+        try:
+            if config[VALUES][PREDICTION_TYPE] == "regex":
+                values_score = match_regex_for_values(values, config[VALUES][REGEX])
+            elif config[VALUES][PREDICTION_TYPE] == "library":
+                raise Exception(
+                    "Currently prediction type 'library' is not supported for custom infotype"
+                )
+            else:
+                raise Exception(
+                    f"Inappropriate Prediction type {config[VALUES][PREDICTION_TYPE]}"
+                )
+        except Exception as e:  # also catches the two raises above: failures are logged, not propagated
+            logger.error(f"Column {metadata.name} failed due to {e}")
+        values_score = np.round(values_score, 2)  # rounded to 2 decimals before being recorded
+        debug_info.values = values_score
+
+    debug_info = compute_name_description_dtype_score(metadata, config, debug_info)  # helper defined outside this view
+    confidence_level = compute_overall_confidence(debug_info, config)  # helper defined outside this view
+    return confidence_level, debug_info
+
+
 def inspect_for_street_address(
     metadata: Metadata, values: List[Any], config: Dict[str, Dict]
 ) -> Tuple[float, DebugInfo]:  # noqa: C901

diff --git a/datahub-classify/src/datahub_classify/infotype_predictor.py b/datahub-classify/src/datahub_classify/infotype_predictor.py
@@ -24,7 +24,11 @@ def get_infotype_function_mapping(
             logger.warning(f"Configuration is not available for infotype - {infotype}")
         else:
             fn_name = f"inspect_for_{infotype.lower()}"
-            infotype_function_map[infotype] = module_fn_dict[fn_name]
+            if fn_name in module_fn_dict:
+                infotype_function_map[infotype] = module_fn_dict[fn_name]
+            else:
+                fn_name = "inspect_for_custom_infotype"
+                infotype_function_map[infotype] = module_fn_dict[fn_name]
     return infotype_function_map
 
 
@@ -75,7 +79,7 @@ def predict_infotypes(
 
             except Exception as e:
                 # traceback.print_exc()
-                logger.warning(f"Failed to extract info type due to {e}")
+                logger.warning(f"Failed to extract info type {infotype} due to {e}")
         if len(proposal_list) > 0:
             num_cols_with_infotype_assigned += 1
         column_info.infotype_proposals = proposal_list

diff --git a/datahub-classify/tests/test_custom_infotype_predictor.py b/datahub-classify/tests/test_custom_infotype_predictor.py
@@ -0,0 +1,133 @@
+import random
+import string
+import uuid
+from datetime import datetime, timedelta
+
+import pytest
+
+from datahub_classify.helper_classes import ColumnInfo, Metadata
+from datahub_classify.infotype_predictor import predict_infotypes
+from datahub_classify.reference_input import input1 as default_config
+
+
+def random_vehicle_number():  # random plate-like string: state code, number, letters, digits, joined by random separators
+    state_codes = ["MH", "TN", "BH", "DL"]
+    separators = ["-", " ", "", "_"]  # the empty string means "no separator"
+    return "".join(
+        [
+            random.choice(state_codes),
+            random.choice(separators),
+            str(random.randint(1, 20)),  # randint is inclusive on both ends: 1..20
+            random.choice(separators),
+            "".join(random.choices(string.ascii_letters, k=random.randint(1, 3))),  # 1-3 letters, either case
+            random.choice(separators),
+            "".join(random.choices(string.digits, k=4)),  # exactly 4 digits
+        ]
+    )
+
+
+@pytest.fixture(scope="module")
+def column_infos():  # three sample columns (UUID ids, vehicle numbers, datetimes) with 99 values each
+    return [
+        ColumnInfo(
+            metadata=Metadata(
+                meta_info={
+                    "Name": "id",
+                    "Description": "Primary",
+                    "Datatype": "str",
+                    "Dataset_Name": "entry_register",
+                }
+            ),
+            values=[uuid.uuid4() for i in range(1, 100)],  # range(1, 100) yields 99 items; values are UUID objects, not str
+        ),
+        ColumnInfo(
+            metadata=Metadata(
+                meta_info={
+                    "Name": "vehicle_number",
+                    "Description": "Vehicle registration number ",
+                    "Datatype": "str",
+                    "Dataset_Name": "entry_register",
+                }
+            ),
+            values=[random_vehicle_number() for i in range(1, 100)],  # the column the custom infotype should match
+        ),
+        ColumnInfo(
+            metadata=Metadata(
+                meta_info={
+                    "Name": "entry_time",
+                    "Description": "Time of vehicle's entry",
+                    "Datatype": "datetime",
+                    "Dataset_Name": "entry_register",
+                }
+            ),
+            values=[
+                datetime.now() - timedelta(hours=random.randint(0, 24))  # naive datetimes within the last 24 hours
+                for i in range(1, 100)
+            ],
+        ),
+    ]
+
+
+@pytest.fixture
+def custom_config_patch():  # config entry adding the custom infotype "IN_Vehicle_Registration_Number"
+    return {
+        "IN_Vehicle_Registration_Number": {
+            "Prediction_Factors_and_Weights": {  # the four weights sum to 1.0
+                "Name": 0.2,
+                "Description": 0.1,
+                "Datatype": 0.1,
+                "Values": 0.6,
+            },
+            "Name": {
+                "regex": [
+                    "^.*vehicle.*num.*$",
+                    "^.*license.*plat.*num.*$",
+                    "^.*license.*plat.*num.*$",  # NOTE(review): exact duplicate of the previous pattern
+                    "^.*vehicle.*plat.*num.*$",
+                    "^.*vehicle.*num.*plat.*$",
+                ]
+            },
+            "Description": {
+                "regex": [
+                    "^.*vehicle.*num.*$",
+                    "^.*license.*plat.*num.*$",
+                    "^.*license.*plat.*num.*$",  # NOTE(review): exact duplicate of the previous pattern
+                    "^.*vehicle.*plat.*num.*$",
+                    "^.*vehicle.*num.*plat.*$",
+                ]
+            },
+            "Datatype": {"type": ["str", "varchar", "text"]},
+            "Values": {
+                "prediction_type": "regex",
+                "regex": [r"[a-z]{2}[-_\s]?[0-9]{1,2}[-_\s]?[a-z]{2,3}[-_\s]?[0-9]{4}"],  # NOTE(review): lowercase classes vs. uppercase generated codes; presumably matched case-insensitively, confirm
+                "library": [],
+            },
+        }
+    }
+
+
+def test_custom_infotype_prediction(column_infos, custom_config_patch):
+    # Default config: none of the three columns should receive a proposal at threshold 0.7
+    out_column_infos = predict_infotypes(
+        column_infos, confidence_level_threshold=0.7, global_config=default_config
+    )
+    assert not out_column_infos[0].infotype_proposals
+    assert not out_column_infos[1].infotype_proposals
+    assert not out_column_infos[2].infotype_proposals
+
+    # Default config plus the custom infotype: only the vehicle_number column should match
+    config_new = default_config.copy()  # copy so update() does not add the key to the shared default_config
+    config_new.update(custom_config_patch)
+    out_column_infos = predict_infotypes(
+        column_infos, confidence_level_threshold=0.7, global_config=config_new
+    )
+    assert not out_column_infos[0].infotype_proposals
+    assert not out_column_infos[2].infotype_proposals
+
+    predicted_infotypes = out_column_infos[1].infotype_proposals
+    assert predicted_infotypes
+    assert len(predicted_infotypes) == 1
+    assert predicted_infotypes[0].infotype == "IN_Vehicle_Registration_Number"
+    assert predicted_infotypes[0].debug_info.name == 1
+    assert predicted_infotypes[0].debug_info.description == 1
+    assert predicted_infotypes[0].debug_info.datatype == 1  # debug_info.values is not asserted; the generated values are random