Skip to content

Commit

Permalink
add support for regex-based custom infotype (#17)
Browse files Browse the repository at this point in the history
  • Loading branch information
mayurinehate authored Jun 5, 2023
1 parent 5d4ebb1 commit c3c9dac
Show file tree
Hide file tree
Showing 5 changed files with 180 additions and 11 deletions.
3 changes: 3 additions & 0 deletions datahub-classify/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@ A debug information is associated with each infotype proposal, it provides detai

## Supported Infotypes

The following infotypes are supported out of the box.
1. Age
2. Gender
3. Person Name / Full Name
Expand All @@ -110,6 +111,8 @@ A debug information is associated with each infotype proposal, it provides detai
13. Swift Code
14. US Driving License Number

Regex-based custom infotypes are also supported. Specify the custom infotype configuration in the format described [here](#infotype-configuration).

## Assumptions

- If the value prediction factor weight is non-zero (indicating that values should be used for infotype inspection), then a minimum of 50 non-null column values should be present.
Expand Down
18 changes: 9 additions & 9 deletions datahub-classify/src/datahub_classify/helper_classes.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,19 @@
from typing import Any, Dict, List, Optional


@dataclass
class DebugInfo:
    """Per-factor scores recorded while inspecting a column for an infotype.

    Every field defaults to ``None`` and is filled in only when the
    corresponding factor has been scored.
    """

    # Score for the column-name factor.
    name: Optional[float] = None
    # Score for the column-description factor.
    description: Optional[float] = None
    # Score for the column-datatype factor.
    datatype: Optional[float] = None
    # Score for the column-values factor.
    values: Optional[float] = None


@dataclass
class InfotypeProposal:
    """A candidate infotype for a column, with its confidence and debug details."""

    # Name of the proposed infotype.
    infotype: str
    # Confidence assigned to this proposal.
    confidence_level: float
    # NOTE(review): `debug_info` is annotated twice below. This looks like the
    # removed/added pair of a diff hunk (Dict[str, Any] -> DebugInfo); presumably
    # only the DebugInfo annotation is meant to remain - confirm against the file.
    debug_info: Dict[str, Any]
    debug_info: DebugInfo


@dataclass
Expand All @@ -29,11 +37,3 @@ class ColumnInfo:
metadata: Metadata
values: List[Any]
infotype_proposals: Optional[List[InfotypeProposal]] = None


@dataclass
class DebugInfo:
    """Container for the individual factor scores behind an infotype proposal.

    All four scores are optional and start out as ``None``.
    """

    # Column-name factor score.
    name: Optional[float] = None
    # Column-description factor score.
    description: Optional[float] = None
    # Column-datatype factor score.
    datatype: Optional[float] = None
    # Column-values factor score.
    values: Optional[float] = None
29 changes: 29 additions & 0 deletions datahub-classify/src/datahub_classify/infotype_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,35 @@ def inspect_for_email_address(
return confidence_level, debug_info


def inspect_for_custom_infotype(
    metadata: Metadata, values: List[Any], config: Dict[str, Dict]
) -> Tuple[float, DebugInfo]:
    """Score a column against a user-defined (custom) infotype configuration.

    The values factor is evaluated only when its configured weight is
    positive, and only the ``"regex"`` prediction type is supported. Any
    failure while scoring the values (including an unsupported or unknown
    prediction type) is logged and leaves the values score at ``0.0``.

    Args:
        metadata: Column metadata; ``metadata.name`` is used in error logs and
            the object is forwarded to the name/description/datatype scoring.
        values: Column values matched against the configured regex list.
        config: Configuration of the infotype being inspected.

    Returns:
        A ``(confidence_level, debug_info)`` tuple, where ``debug_info``
        carries the individual factor scores.
    """
    factor_weights = config[PREDICTION_FACTORS_AND_WEIGHTS]
    debug_info = DebugInfo()

    # Value Logic
    if factor_weights.get(VALUES, 0) > 0:
        score = 0.0
        try:
            prediction_type = config[VALUES][PREDICTION_TYPE]
            if prediction_type == "regex":
                score = match_regex_for_values(values, config[VALUES][REGEX])
            elif prediction_type == "library":
                raise Exception(
                    "Currently prediction type 'library' is not supported for custom infotype"
                )
            else:
                raise Exception(f"Inappropriate Prediction type {prediction_type}")
        except Exception as e:
            # Best effort: a broken values configuration must not abort the
            # inspection; the values score simply stays at its current value.
            logger.error(f"Column {metadata.name} failed due to {e}")
        debug_info.values = np.round(score, 2)

    debug_info = compute_name_description_dtype_score(metadata, config, debug_info)
    return compute_overall_confidence(debug_info, config), debug_info


def inspect_for_street_address(
metadata: Metadata, values: List[Any], config: Dict[str, Dict]
) -> Tuple[float, DebugInfo]: # noqa: C901
Expand Down
8 changes: 6 additions & 2 deletions datahub-classify/src/datahub_classify/infotype_predictor.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,11 @@ def get_infotype_function_mapping(
logger.warning(f"Configuration is not available for infotype - {infotype}")
else:
fn_name = f"inspect_for_{infotype.lower()}"
infotype_function_map[infotype] = module_fn_dict[fn_name]
if fn_name in module_fn_dict:
infotype_function_map[infotype] = module_fn_dict[fn_name]
else:
fn_name = "inspect_for_custom_infotype"
infotype_function_map[infotype] = module_fn_dict[fn_name]
return infotype_function_map


Expand Down Expand Up @@ -75,7 +79,7 @@ def predict_infotypes(

except Exception as e:
# traceback.print_exc()
logger.warning(f"Failed to extract info type due to {e}")
logger.warning(f"Failed to extract info type {infotype} due to {e}")
if len(proposal_list) > 0:
num_cols_with_infotype_assigned += 1
column_info.infotype_proposals = proposal_list
Expand Down
133 changes: 133 additions & 0 deletions datahub-classify/tests/test_custom_infotype_predictor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
import random
import string
import uuid
from datetime import datetime, timedelta

import pytest

from datahub_classify.helper_classes import ColumnInfo, Metadata
from datahub_classify.infotype_predictor import predict_infotypes
from datahub_classify.reference_input import input1 as default_config


def random_vehicle_number():
    """Return a random vehicle-registration-like string for test data.

    The result is a state code, a number from 1 to 20, one to three ASCII
    letters and four digits, with an independently chosen separator
    (``-``, space, nothing or ``_``) between consecutive parts.
    """
    state_codes = ["MH", "TN", "BH", "DL"]
    separators = ["-", " ", "", "_"]

    # The draws are made in output order so a seeded generator yields the
    # same string as a single left-to-right list construction would.
    state = random.choice(state_codes)
    first_sep = random.choice(separators)
    number_part = str(random.randint(1, 20))
    second_sep = random.choice(separators)
    letter_count = random.randint(1, 3)
    letters_part = "".join(random.choices(string.ascii_letters, k=letter_count))
    third_sep = random.choice(separators)
    digits_part = "".join(random.choices(string.digits, k=4))

    return (
        f"{state}{first_sep}{number_part}{second_sep}"
        f"{letters_part}{third_sep}{digits_part}"
    )


@pytest.fixture(scope="module")
def column_infos():
    """Module-scoped sample columns of an ``entry_register`` dataset.

    Returns three columns, in order: ``id`` (UUIDs), ``vehicle_number``
    (random registration-like strings) and ``entry_time`` (datetimes up to
    24 hours in the past). Each column holds 99 values.
    """
    sample_indices = range(1, 100)  # 99 values per column

    def build_metadata(name, description, datatype):
        # Every column belongs to the same dataset; only these three vary.
        return Metadata(
            meta_info={
                "Name": name,
                "Description": description,
                "Datatype": datatype,
                "Dataset_Name": "entry_register",
            }
        )

    id_column = ColumnInfo(
        metadata=build_metadata("id", "Primary", "str"),
        values=[uuid.uuid4() for _ in sample_indices],
    )
    vehicle_column = ColumnInfo(
        metadata=build_metadata(
            "vehicle_number", "Vehicle registration number ", "str"
        ),
        values=[random_vehicle_number() for _ in sample_indices],
    )
    entry_time_column = ColumnInfo(
        metadata=build_metadata(
            "entry_time", "Time of vehicle's entry", "datetime"
        ),
        values=[
            datetime.now() - timedelta(hours=random.randint(0, 24))
            for _ in sample_indices
        ],
    )
    return [id_column, vehicle_column, entry_time_column]


@pytest.fixture
def custom_config_patch():
    """Config entry defining the custom ``IN_Vehicle_Registration_Number`` infotype.

    The returned mapping has a single key (the infotype name), so it can be
    merged into a global infotype configuration with ``dict.update``.
    """
    # The same pattern list is used for both the Name and Description
    # factors; each factor receives its own copy of the list.
    name_like_patterns = [
        "^.*vehicle.*num.*$",
        "^.*license.*plat.*num.*$",
        "^.*license.*plat.*num.*$",
        "^.*vehicle.*plat.*num.*$",
        "^.*vehicle.*num.*plat.*$",
    ]
    factor_weights = {
        "Name": 0.2,
        "Description": 0.1,
        "Datatype": 0.1,
        "Values": 0.6,
    }
    values_rule = {
        "prediction_type": "regex",
        "regex": [r"[a-z]{2}[-_\s]?[0-9]{1,2}[-_\s]?[a-z]{2,3}[-_\s]?[0-9]{4}"],
        "library": [],
    }
    infotype_config = {
        "Prediction_Factors_and_Weights": factor_weights,
        "Name": {"regex": list(name_like_patterns)},
        "Description": {"regex": list(name_like_patterns)},
        "Datatype": {"type": ["str", "varchar", "text"]},
        "Values": values_rule,
    }
    return {"IN_Vehicle_Registration_Number": infotype_config}


def test_custom_infotype_prediction(column_infos, custom_config_patch):
    """Check that a regex-based custom infotype is proposed only once configured.

    With the default config none of the sample columns gets a proposal.
    After adding the custom infotype, only the vehicle-number column gets
    exactly one proposal, with full name, description and datatype scores.
    """
    threshold = 0.7

    # Default config: no column should receive any proposal.
    baseline = predict_infotypes(
        column_infos, confidence_level_threshold=threshold, global_config=default_config
    )
    for index in range(3):
        assert not baseline[index].infotype_proposals

    # Config with new custom infotype, all factors
    extended_config = default_config.copy()
    extended_config.update(custom_config_patch)
    predicted = predict_infotypes(
        column_infos, confidence_level_threshold=threshold, global_config=extended_config
    )
    assert not predicted[0].infotype_proposals
    assert not predicted[2].infotype_proposals

    proposals = predicted[1].infotype_proposals
    assert proposals
    assert len(proposals) == 1

    proposal = proposals[0]
    assert proposal.infotype == "IN_Vehicle_Registration_Number"
    assert proposal.debug_info.name == 1
    assert proposal.debug_info.description == 1
    assert proposal.debug_info.datatype == 1

0 comments on commit c3c9dac

Please sign in to comment.