Skip to content

Commit

Permalink
add code to handle unspecified excludeName better
Browse files Browse the repository at this point in the history
  • Loading branch information
ethan-cartwright committed Dec 16, 2023
1 parent 6762f0f commit 9fd8fb6
Show file tree
Hide file tree
Showing 4 changed files with 52 additions and 4 deletions.
4 changes: 3 additions & 1 deletion datahub-classify/src/datahub_classify/infotype_predictor.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,8 +58,10 @@ def predict_infotypes(
config_dict = global_config[infotype]

# convert exclude_name list into a set for O(1) membership checking
if EXCLUDE_NAME in config_dict:
if EXCLUDE_NAME in config_dict and config_dict[EXCLUDE_NAME] is not None:
config_dict[EXCLUDE_NAME] = set(config_dict[EXCLUDE_NAME])
else:
config_dict[EXCLUDE_NAME] = set()

# call the infotype prediction function
column_info.values = [
Expand Down
4 changes: 3 additions & 1 deletion datahub-classify/src/datahub_classify/infotype_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,9 @@ def perform_basic_checks(
f"does not meet minimum threshold for {infotype}"
)
basic_checks_status = False
elif metadata.name in config_dict.get(EXCLUDE_NAME, set()):
elif config_dict[EXCLUDE_NAME] is not None and metadata.name in config_dict.get(
EXCLUDE_NAME, set()
):
logger.warning(f"Excluding match for {infotype} on column {metadata.name}")
basic_checks_status = False
# TODO: Add more basic checks
Expand Down
29 changes: 29 additions & 0 deletions datahub-classify/tests/exclude_name_test_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,3 +28,32 @@
},
},
}

# Test configuration for the "Email_Address" infotype in which "ExcludeName"
# is present but explicitly set to None (rather than a list of column names).
none_exclude_name_test_config = {
    "Email_Address": {
        # Only the column name contributes to the prediction; every other
        # factor is weighted zero.
        "Prediction_Factors_and_Weights": {
            "Name": 1,
            "Description": 0,
            "Datatype": 0,
            "Values": 0,
        },
        # The case under test: the key exists, but its value is None.
        "ExcludeName": None,
        # Column-name patterns for this infotype.
        "Name": {
            "regex": [
                "^.*mail.*id.*$",
                "^.*id.*mail.*$",
                "^.*mail.*add.*$",
                "^.*add.*mail.*$",
                "email",
                "mail",
            ]
        },
        # No description patterns are configured.
        "Description": {"regex": []},
        "Datatype": {"type": ["str"]},
        # Value-based prediction is declared as regex-driven, with empty
        # pattern and library lists.
        "Values": {
            "prediction_type": "regex",
            "regex": [],
            "library": [],
        },
    },
}
19 changes: 17 additions & 2 deletions datahub-classify/tests/test_info_type_utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
from exclude_name_test_config import exclude_name_test_config
from exclude_name_test_config import (
exclude_name_test_config,
none_exclude_name_test_config,
)
from datahub_classify.helper_classes import ColumnInfo, Metadata
from datahub_classify.infotype_utils import perform_basic_checks

Expand Down Expand Up @@ -85,7 +88,7 @@ def column_infos():
]


def test_perform_basic_checks():
def test_perform_basic_checks_with_exclude_name():
for col_data in column_infos():
result = perform_basic_checks(
Metadata(col_data.metadata.meta_info),
Expand All @@ -98,3 +101,15 @@ def test_perform_basic_checks():
assert not result
else:
assert result


def test_perform_basic_checks_with_none_exclude_name():
    """Check that every sample column passes basic checks when ExcludeName is None.

    Runs ``perform_basic_checks`` against the "Email_Address" entry of
    ``none_exclude_name_test_config`` for each column returned by
    ``column_infos()`` and asserts that each call returns a truthy result.
    """
    for column in column_infos():
        column_metadata = Metadata(column.metadata.meta_info)
        email_config = none_exclude_name_test_config["Email_Address"]
        passed = perform_basic_checks(
            column_metadata,
            column.values,
            email_config,
            "Email_Address",
            1,
        )
        # With ExcludeName set to None, no column is expected to be excluded.
        assert passed

0 comments on commit 9fd8fb6

Please sign in to comment.