feat: Add support for excluding list of exact column names (#20)

Co-authored-by: Ethan Cartwright <[email protected]>
acryldata · Jan 2, 2024 · 63b8397 · 63b8397
1 parent fc290dc
commit 63b8397
Show file tree

Hide file tree

Showing 9 changed files with 236 additions and 7 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,4 +1,5 @@
 .gradle/
+venv/
 
 #VS Code
 .vscode

diff --git a/datahub-classify/README.md b/datahub-classify/README.md
@@ -40,6 +40,7 @@ Infotype configuration is a dictionary with all infotypes at root level key. Eac
   2. Description
   3. Datatype
   4. Values
+- `ExcludeName` - optional list of exact column names (compared literally, not as regexes) to exclude from classification for this info_type
 - `Name` - regex list which is to be matched against column name
 - `Description` - regex list which is to be matched against column description
 - `Datatype` - list of datatypes to be matched against column datatype

diff --git a/datahub-classify/src/datahub_classify/constants.py b/datahub-classify/src/datahub_classify/constants.py
@@ -1,5 +1,6 @@
 # Input config dictionary keys
 PREDICTION_FACTORS_AND_WEIGHTS = "Prediction_Factors_and_Weights"
+EXCLUDE_NAME = "ExcludeName"
 NAME = "Name"
 DESCRIPTION = "Description"
 DATATYPE = "Datatype"

diff --git a/datahub-classify/src/datahub_classify/infotype_helper.py b/datahub-classify/src/datahub_classify/infotype_helper.py
@@ -86,7 +86,7 @@ def compute_overall_confidence(debug_info: DebugInfo, config: Dict[str, Dict]) -
     }
     confidence_level = 0
     for key, value in vars(debug_info).items():
-        if value and type(value) != str:
+        if value and not isinstance(value, str):
             confidence_level += prediction_factors_weights[key] * value
     confidence_level = np.round(confidence_level, 2)
     return confidence_level

diff --git a/datahub-classify/src/datahub_classify/infotype_predictor.py b/datahub-classify/src/datahub_classify/infotype_predictor.py
@@ -2,8 +2,9 @@
 import logging
 from typing import Any, Dict, List, Optional
 
+from datahub_classify.constants import EXCLUDE_NAME
 from datahub_classify.helper_classes import ColumnInfo, InfotypeProposal
-from datahub_classify.infotype_utils import perform_basic_checks
+from datahub_classify.infotype_utils import perform_basic_checks, strip_formatting
 
 logger = logging.getLogger(__name__)
 
@@ -45,6 +46,8 @@ def predict_infotypes(
     logger.debug("===========================================================")
     basic_checks_failed_columns = []
     num_cols_with_infotype_assigned = 0
+    strip_exclusion_formatting = global_config.get("strip_exclusion_formatting")
+
     for column_info in column_infos:
         logger.debug(
             f"processing column: {column_info.metadata.name} -- dataset: {column_info.metadata.dataset_name}"
@@ -55,6 +58,16 @@ def predict_infotypes(
             # get the configuration
             config_dict = global_config[infotype]
 
+            # convert the exclude_name list into a set for O(1) membership checks
+            if EXCLUDE_NAME in config_dict and config_dict[EXCLUDE_NAME] is not None:
+                config_dict[EXCLUDE_NAME] = (
+                    set(config_dict[EXCLUDE_NAME])
+                    if not strip_exclusion_formatting
+                    else set([strip_formatting(s) for s in config_dict[EXCLUDE_NAME]])
+                )
+            else:
+                config_dict[EXCLUDE_NAME] = set()
+
             # call the infotype prediction function
             column_info.values = [
                 val

diff --git a/datahub-classify/src/datahub_classify/infotype_utils.py b/datahub-classify/src/datahub_classify/infotype_utils.py
@@ -1,13 +1,23 @@
 import logging
 import re
-from typing import Any, Dict, List
+from typing import Any, Dict, List, Union
 
-from datahub_classify.constants import PREDICTION_FACTORS_AND_WEIGHTS, VALUES
+from datahub_classify.constants import (
+    EXCLUDE_NAME,
+    PREDICTION_FACTORS_AND_WEIGHTS,
+    VALUES,
+)
 from datahub_classify.helper_classes import Metadata
 
 logger = logging.getLogger(__name__)
 
 
+def strip_formatting(s):
+    s = s.lower()
+    s = re.sub(r"[^a-z0-9\s]", "", s)
+    return s
+
+
 # TODO: Exception handling
 # Match regex for Name and Description
 def match_regex(text_to_match: str, regex_list: List[str]) -> float:
@@ -82,15 +92,30 @@ def detect_named_entity_spacy(
 def perform_basic_checks(
     metadata: Metadata,
     values: List[Any],
-    config_dict: Dict[str, Dict],
+    config_dict: Dict[str, Union[Dict, List[str], None]],
     infotype: str,
     minimum_values_threshold: int,
 ) -> bool:
     basic_checks_status = True
+    metadata.name = (
+        metadata.name
+        if not config_dict.get("strip_formatting")
+        else strip_formatting(metadata.name)
+    )
+    prediction_factors = config_dict.get(PREDICTION_FACTORS_AND_WEIGHTS)
+    exclude_name = config_dict.get(EXCLUDE_NAME, [])
     if (
-        config_dict[PREDICTION_FACTORS_AND_WEIGHTS].get(VALUES, None)
+        isinstance(prediction_factors, dict)
+        and prediction_factors.get(VALUES, None)
         and len(values) < minimum_values_threshold
     ):
+        logger.warning(
+            f"The number of values for column {metadata.name}"
+            f"does not meet minimum threshold for {infotype}"
+        )
+        basic_checks_status = False
+    elif exclude_name is not None and metadata.name in exclude_name:
+        logger.warning(f"Excluding match for {infotype} on column {metadata.name}")
         basic_checks_status = False
     # TODO: Add more basic checks
     return basic_checks_status
diff --git a/datahub-classify/src/datahub_classify/reference_input.py b/datahub-classify/src/datahub_classify/reference_input.py
@@ -1,13 +1,16 @@
+from typing import Any, Dict, List, Union
+
 # Input Dictionary Format
 
-input1 = {
+input1: Dict[str, Dict[str, Union[Dict[str, Any], List[str], None]]] = {
     "Email_Address": {
         "Prediction_Factors_and_Weights": {
             "Name": 0.4,
             "Description": 0,
             "Datatype": 0,
             "Values": 0.6,
         },
+        "ExcludeName": [],
         "Name": {
             "regex": [
                 "^.*mail.*id.*$",

diff --git a/datahub-classify/tests/exclude_name_test_config.py b/datahub-classify/tests/exclude_name_test_config.py
@@ -0,0 +1,63 @@
+from typing import Any, Dict, List, Union
+
+# Input Dictionary Format
+
+exclude_name_test_config: Dict[
+    str, Dict[str, Union[Dict[str, Any], List[str], None]]
+] = {
+    "Email_Address": {
+        "Prediction_Factors_and_Weights": {
+            "Name": 1,
+            "Description": 0,
+            "Datatype": 0,
+            "Values": 0,
+        },
+        "ExcludeName": ["email_sent", "email_received"],
+        "Name": {
+            "regex": [
+                "^.*mail.*id.*$",
+                "^.*id.*mail.*$",
+                "^.*mail.*add.*$",
+                "^.*add.*mail.*$",
+                "email",
+                "mail",
+            ]
+        },
+        "Description": {"regex": []},
+        "Datatype": {"type": ["str"]},
+        "Values": {
+            "prediction_type": "regex",
+            "regex": [],
+            "library": [],
+        },
+    },
+}
+
+none_exclude_name_test_config: Dict[str, Dict[str, Union[Dict[str, Any], List[str], None]]] = {  # type: ignore
+    "Email_Address": {
+        "Prediction_Factors_and_Weights": {
+            "Name": 1,
+            "Description": 0,
+            "Datatype": 0,
+            "Values": 0,
+        },
+        "ExcludeName": None,
+        "Name": {
+            "regex": [
+                "^.*mail.*id.*$",
+                "^.*id.*mail.*$",
+                "^.*mail.*add.*$",
+                "^.*add.*mail.*$",
+                "email",
+                "mail",
+            ]
+        },
+        "Description": {"regex": []},
+        "Datatype": {"type": ["str"]},
+        "Values": {
+            "prediction_type": "regex",
+            "regex": [],
+            "library": [],
+        },
+    },
+}
diff --git a/datahub-classify/tests/test_info_type_utils.py b/datahub-classify/tests/test_info_type_utils.py
@@ -0,0 +1,122 @@
+from exclude_name_test_config import (
+    exclude_name_test_config,
+    none_exclude_name_test_config,
+)
+
+from datahub_classify.helper_classes import ColumnInfo, Metadata
+from datahub_classify.infotype_utils import perform_basic_checks, strip_formatting
+
+
+def column_infos():
+    return [
+        ColumnInfo(
+            metadata=Metadata(
+                meta_info={
+                    "Name": "id",
+                    "Description": "Unique identifier",
+                    "Datatype": "int",
+                    "Dataset_Name": "email_data",
+                }
+            ),
+            values=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
+        ),
+        ColumnInfo(
+            metadata=Metadata(
+                meta_info={
+                    "Name": "email_from",
+                    "Description": "Sender's email address",
+                    "Datatype": "str",
+                    "Dataset_Name": "email_data",
+                }
+            ),
+            values=[
+                "[email protected]",
+                "[email protected]",
+                "[email protected]",
+                "[email protected]",
+                "[email protected]",
+                "[email protected]",
+                "[email protected]",
+                "[email protected]",
+                "[email protected]",
+                "[email protected]",
+            ],
+        ),
+        ColumnInfo(
+            metadata=Metadata(
+                meta_info={
+                    "Name": "email_to",
+                    "Description": "Recipient's email address",
+                    "Datatype": "str",
+                    "Dataset_Name": "email_data",
+                }
+            ),
+            values=[
+                "[email protected]",
+                "[email protected]",
+                "[email protected]",
+                "[email protected]",
+                "[email protected]",
+                "[email protected]",
+                "[email protected]",
+                "[email protected]",
+                "[email protected]",
+                "[email protected]",
+            ],
+        ),
+        ColumnInfo(
+            metadata=Metadata(
+                meta_info={
+                    "Name": "email_sent",
+                    "Description": "Indicates if email was sent",
+                    "Datatype": "bool",
+                    "Dataset_Name": "email_data",
+                }
+            ),
+            values=[False, True, True, False, True, False, True, True, False, True],
+        ),
+        ColumnInfo(
+            metadata=Metadata(
+                meta_info={
+                    "Name": "email_received",
+                    "Description": "Indicates if email was received",
+                    "Datatype": "bool",
+                    "Dataset_Name": "email_data",
+                }
+            ),
+            values=[False, True, False, False, True, False, False, True, False, False],
+        ),
+    ]
+
+
+def test_perform_basic_checks_with_exclude_name():
+    for col_data in column_infos():
+        result = perform_basic_checks(
+            Metadata(col_data.metadata.meta_info),
+            col_data.values,
+            exclude_name_test_config["Email_Address"],
+            "Email_Address",
+            1,
+        )
+        if col_data.metadata.meta_info["Name"] in ["email_sent", "email_received"]:
+            assert not result
+        else:
+            assert result
+
+
+def test_perform_basic_checks_with_none_exclude_name():
+    for col_data in column_infos():
+        result = perform_basic_checks(
+            Metadata(col_data.metadata.meta_info),
+            col_data.values,
+            none_exclude_name_test_config["Email_Address"],
+            "Email_Address",
+            1,
+        )
+        assert result
+
+
+def test_strip_formatting():
+    assert strip_formatting("Name") == "name"
+    assert strip_formatting("my_column_name") == "mycolumnname"
+    assert strip_formatting("Col.Name") == "colname"