Skip to content

Commit

Permalink
feat: Add support for excluding list of exact column names (#20)
Browse files Browse the repository at this point in the history
Co-authored-by: Ethan Cartwright <[email protected]>
  • Loading branch information
ethan-cartwright and ethan-cartwright authored Jan 2, 2024
1 parent fc290dc commit 63b8397
Show file tree
Hide file tree
Showing 9 changed files with 236 additions and 7 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
.gradle/
venv/

#VS Code
.vscode
Expand Down
1 change: 1 addition & 0 deletions datahub-classify/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ Infotype configuration is a dictionary with all infotypes at root level key. Eac
2. Description
3. Datatype
4. Values
- `ExcludeName` - optional list of exact column names (matched literally, not as regex) to exclude from classification for this info_type
- `Name` - regex list which is to be matched against column name
- `Description` - regex list which is to be matched against column description
- `Datatype` - list of datatypes to be matched against column datatype
Expand Down
1 change: 1 addition & 0 deletions datahub-classify/src/datahub_classify/constants.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# Input config dictionary keys
PREDICTION_FACTORS_AND_WEIGHTS = "Prediction_Factors_and_Weights"
EXCLUDE_NAME = "ExcludeName"
NAME = "Name"
DESCRIPTION = "Description"
DATATYPE = "Datatype"
Expand Down
2 changes: 1 addition & 1 deletion datahub-classify/src/datahub_classify/infotype_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ def compute_overall_confidence(debug_info: DebugInfo, config: Dict[str, Dict]) -
}
confidence_level = 0
for key, value in vars(debug_info).items():
if value and type(value) != str:
if value and not isinstance(value, str):
confidence_level += prediction_factors_weights[key] * value
confidence_level = np.round(confidence_level, 2)
return confidence_level
Expand Down
15 changes: 14 additions & 1 deletion datahub-classify/src/datahub_classify/infotype_predictor.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,9 @@
import logging
from typing import Any, Dict, List, Optional

from datahub_classify.constants import EXCLUDE_NAME
from datahub_classify.helper_classes import ColumnInfo, InfotypeProposal
from datahub_classify.infotype_utils import perform_basic_checks
from datahub_classify.infotype_utils import perform_basic_checks, strip_formatting

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -45,6 +46,8 @@ def predict_infotypes(
logger.debug("===========================================================")
basic_checks_failed_columns = []
num_cols_with_infotype_assigned = 0
strip_exclusion_formatting = global_config.get("strip_exclusion_formatting")

for column_info in column_infos:
logger.debug(
f"processing column: {column_info.metadata.name} -- dataset: {column_info.metadata.dataset_name}"
Expand All @@ -55,6 +58,16 @@ def predict_infotypes(
# get the configuration
config_dict = global_config[infotype]

# convert the exclude_name list into a set for O(1) membership checks
if EXCLUDE_NAME in config_dict and config_dict[EXCLUDE_NAME] is not None:
config_dict[EXCLUDE_NAME] = (
set(config_dict[EXCLUDE_NAME])
if not strip_exclusion_formatting
else set([strip_formatting(s) for s in config_dict[EXCLUDE_NAME]])
)
else:
config_dict[EXCLUDE_NAME] = set()

# call the infotype prediction function
column_info.values = [
val
Expand Down
33 changes: 29 additions & 4 deletions datahub-classify/src/datahub_classify/infotype_utils.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,23 @@
import logging
import re
from typing import Any, Dict, List
from typing import Any, Dict, List, Union

from datahub_classify.constants import PREDICTION_FACTORS_AND_WEIGHTS, VALUES
from datahub_classify.constants import (
EXCLUDE_NAME,
PREDICTION_FACTORS_AND_WEIGHTS,
VALUES,
)
from datahub_classify.helper_classes import Metadata

logger = logging.getLogger(__name__)


def strip_formatting(s):
    """Return ``s`` lower-cased with all characters except a-z, 0-9 and whitespace removed.

    Punctuation and underscores are dropped, so ``"Col.Name"`` becomes
    ``"colname"`` and ``"my_column_name"`` becomes ``"mycolumnname"``.
    """
    lowered = s.lower()
    # Lower-casing happens first, so the character class only needs a-z.
    return re.sub(r"[^a-z0-9\s]", "", lowered)


# TODO: Exception handling
# Match regex for Name and Description
def match_regex(text_to_match: str, regex_list: List[str]) -> float:
Expand Down Expand Up @@ -82,15 +92,30 @@ def detect_named_entity_spacy(
def perform_basic_checks(
    metadata: Metadata,
    values: List[Any],
    config_dict: Dict[str, Union[Dict, List[str], None]],
    infotype: str,
    minimum_values_threshold: int,
) -> bool:
    """Run the cheap pre-checks that decide whether a column is evaluated for ``infotype``.

    Args:
        metadata: Column metadata; its ``name`` is compared against the
            exclusion list and is rewritten in place when the
            ``"strip_formatting"`` key of ``config_dict`` is truthy.
        values: Sample values of the column.
        config_dict: Configuration of a single infotype.
        infotype: Name of the infotype, used only in log messages.
        minimum_values_threshold: Minimum number of values required when the
            ``Values`` prediction factor is enabled.

    Returns:
        ``False`` when the ``Values`` factor is enabled but fewer than
        ``minimum_values_threshold`` values are available, or when the column
        name is listed under ``ExcludeName``; ``True`` otherwise.
    """
    # NOTE(review): the predictor reads "strip_exclusion_formatting" from the
    # global config, while this reads "strip_formatting" from the per-infotype
    # config -- confirm the two keys are meant to differ.
    if config_dict.get("strip_formatting"):
        # This mutates the caller's Metadata object, so the normalised name
        # stays in effect after this function returns.
        metadata.name = strip_formatting(metadata.name)

    prediction_factors = config_dict.get(PREDICTION_FACTORS_AND_WEIGHTS)
    exclude_name = config_dict.get(EXCLUDE_NAME, [])

    values_factor_enabled = isinstance(
        prediction_factors, dict
    ) and prediction_factors.get(VALUES, None)
    if values_factor_enabled and len(values) < minimum_values_threshold:
        # The trailing space keeps the column name from running into
        # "does not meet" once the two literals are concatenated.
        logger.warning(
            f"The number of values for column {metadata.name} "
            f"does not meet minimum threshold for {infotype}"
        )
        return False

    # ExcludeName may be explicitly None in the config; treat that as "no exclusions".
    if exclude_name is not None and metadata.name in exclude_name:
        logger.warning(f"Excluding match for {infotype} on column {metadata.name}")
        return False

    # TODO: Add more basic checks
    return True
5 changes: 4 additions & 1 deletion datahub-classify/src/datahub_classify/reference_input.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,16 @@
from typing import Any, Dict, List, Union

# Input Dictionary Format

input1 = {
input1: Dict[str, Dict[str, Union[Dict[str, Any], List[str], None]]] = {
"Email_Address": {
"Prediction_Factors_and_Weights": {
"Name": 0.4,
"Description": 0,
"Datatype": 0,
"Values": 0.6,
},
"ExcludeName": [],
"Name": {
"regex": [
"^.*mail.*id.*$",
Expand Down
63 changes: 63 additions & 0 deletions datahub-classify/tests/exclude_name_test_config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
from typing import Any, Dict, List, Union

# Input Dictionary Format

# Test configuration for the Email_Address infotype that lists two column
# names under "ExcludeName".
exclude_name_test_config: Dict[
    str, Dict[str, Union[Dict[str, Any], List[str], None]]
] = {
    "Email_Address": {
        # Only the Name factor carries a non-zero weight.
        "Prediction_Factors_and_Weights": {
            "Name": 1,
            "Description": 0,
            "Datatype": 0,
            "Values": 0,
        },
        # Exact column names to exclude; both would otherwise match the
        # "email" / "mail" name regexes below.
        "ExcludeName": ["email_sent", "email_received"],
        "Name": {
            "regex": [
                "^.*mail.*id.*$",
                "^.*id.*mail.*$",
                "^.*mail.*add.*$",
                "^.*add.*mail.*$",
                "email",
                "mail",
            ]
        },
        "Description": {"regex": []},
        "Datatype": {"type": ["str"]},
        "Values": {
            "prediction_type": "regex",
            "regex": [],
            "library": [],
        },
    },
}

# Test configuration for the Email_Address infotype with "ExcludeName"
# explicitly set to None.
none_exclude_name_test_config: Dict[str, Dict[str, Union[Dict[str, Any], List[str], None]]] = {  # type: ignore
    "Email_Address": {
        # Only the Name factor carries a non-zero weight.
        "Prediction_Factors_and_Weights": {
            "Name": 1,
            "Description": 0,
            "Datatype": 0,
            "Values": 0,
        },
        # None (rather than a list) exercises the "no exclusion list" path.
        "ExcludeName": None,
        "Name": {
            "regex": [
                "^.*mail.*id.*$",
                "^.*id.*mail.*$",
                "^.*mail.*add.*$",
                "^.*add.*mail.*$",
                "email",
                "mail",
            ]
        },
        "Description": {"regex": []},
        "Datatype": {"type": ["str"]},
        "Values": {
            "prediction_type": "regex",
            "regex": [],
            "library": [],
        },
    },
}
122 changes: 122 additions & 0 deletions datahub-classify/tests/test_info_type_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
from exclude_name_test_config import (
exclude_name_test_config,
none_exclude_name_test_config,
)

from datahub_classify.helper_classes import ColumnInfo, Metadata
from datahub_classify.infotype_utils import perform_basic_checks, strip_formatting


def column_infos():
    """Build the five sample ``email_data`` columns used as test input.

    Each entry is a ``ColumnInfo`` with ten values; all columns share the
    dataset name ``"email_data"``.
    """
    # NOTE(review): the address literals are reproduced exactly as they appear
    # in this view, where they look like a redaction placeholder -- confirm
    # against the repository.
    address_values = ["[email protected]"] * 10

    # (name, description, datatype, values) for every sample column.
    column_specs = [
        ("id", "Unique identifier", "int", list(range(1, 11))),
        ("email_from", "Sender's email address", "str", list(address_values)),
        ("email_to", "Recipient's email address", "str", list(address_values)),
        (
            "email_sent",
            "Indicates if email was sent",
            "bool",
            [False, True, True, False, True, False, True, True, False, True],
        ),
        (
            "email_received",
            "Indicates if email was received",
            "bool",
            [False, True, False, False, True, False, False, True, False, False],
        ),
    ]

    return [
        ColumnInfo(
            metadata=Metadata(
                meta_info={
                    "Name": name,
                    "Description": description,
                    "Datatype": datatype,
                    "Dataset_Name": "email_data",
                }
            ),
            values=sample_values,
        )
        for name, description, datatype, sample_values in column_specs
    ]


def test_perform_basic_checks_with_exclude_name():
    """Columns listed under ``ExcludeName`` fail the basic checks; all others pass."""
    excluded_names = ["email_sent", "email_received"]
    for column in column_infos():
        meta_info = column.metadata.meta_info
        # A fresh Metadata per call keeps the check from touching the fixture's own object.
        passed = perform_basic_checks(
            Metadata(meta_info),
            column.values,
            exclude_name_test_config["Email_Address"],
            "Email_Address",
            1,
        )
        if meta_info["Name"] not in excluded_names:
            assert passed
        else:
            assert not passed


def test_perform_basic_checks_with_none_exclude_name():
    """With ``ExcludeName`` set to ``None`` every sample column passes the basic checks."""
    for column in column_infos():
        fresh_metadata = Metadata(column.metadata.meta_info)
        outcome = perform_basic_checks(
            fresh_metadata,
            column.values,
            none_exclude_name_test_config["Email_Address"],
            "Email_Address",
            1,
        )
        assert outcome


def test_strip_formatting():
    """``strip_formatting`` lower-cases its input and drops punctuation and underscores."""
    expectations = [
        ("Name", "name"),
        ("my_column_name", "mycolumnname"),
        ("Col.Name", "colname"),
    ]
    for raw, expected in expectations:
        assert strip_formatting(raw) == expected

0 comments on commit 63b8397

Please sign in to comment.