-
Notifications
You must be signed in to change notification settings - Fork 6
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: Add support for excluding list of exact column names (#20)
Co-authored-by: Ethan Cartwright <[email protected]>
- Loading branch information
1 parent
fc290dc
commit 63b8397
Showing
9 changed files
with
236 additions
and
7 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,5 @@ | ||
.gradle/ | ||
venv/ | ||
|
||
#VS Code | ||
.vscode | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,63 @@ | ||
from typing import Any, Dict, List, Union | ||
|
||
# Input Dictionary Format | ||
|
||
# Infotype configuration for the exclude-name tests: the "Email_Address"
# infotype is configured with an explicit list of exact column names to exclude.
exclude_name_test_config: Dict[str, Dict[str, Union[Dict[str, Any], List[str], None]]] = {
    "Email_Address": {
        # Only the column name carries weight; the other factors are set to 0.
        "Prediction_Factors_and_Weights": {
            "Name": 1,
            "Description": 0,
            "Datatype": 0,
            "Values": 0,
        },
        # Exact column names to exclude, even though they match the "Name"
        # patterns below.
        "ExcludeName": ["email_sent", "email_received"],
        # Patterns applied to the column name.
        "Name": {
            "regex": [
                "^.*mail.*id.*$",
                "^.*id.*mail.*$",
                "^.*mail.*add.*$",
                "^.*add.*mail.*$",
                "email",
                "mail",
            ]
        },
        "Description": {"regex": []},
        "Datatype": {"type": ["str"]},
        "Values": {
            "prediction_type": "regex",
            "regex": [],
            "library": [],
        },
    },
}
|
||
# Same "Email_Address" infotype configuration, but with "ExcludeName" set to
# None so the tests can cover the case where no exclusion list is supplied.
none_exclude_name_test_config: Dict[str, Dict[str, Union[Dict[str, Any], List[str], None]]] = {  # type: ignore
    "Email_Address": {
        # Only the column name carries weight; the other factors are set to 0.
        "Prediction_Factors_and_Weights": {
            "Name": 1,
            "Description": 0,
            "Datatype": 0,
            "Values": 0,
        },
        # No exclusion list configured.
        "ExcludeName": None,
        # Patterns applied to the column name.
        "Name": {
            "regex": [
                "^.*mail.*id.*$",
                "^.*id.*mail.*$",
                "^.*mail.*add.*$",
                "^.*add.*mail.*$",
                "email",
                "mail",
            ]
        },
        "Description": {"regex": []},
        "Datatype": {"type": ["str"]},
        "Values": {
            "prediction_type": "regex",
            "regex": [],
            "library": [],
        },
    },
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,122 @@ | ||
from exclude_name_test_config import ( | ||
exclude_name_test_config, | ||
none_exclude_name_test_config, | ||
) | ||
|
||
from datahub_classify.helper_classes import ColumnInfo, Metadata | ||
from datahub_classify.infotype_utils import perform_basic_checks, strip_formatting | ||
|
||
|
||
def column_infos():
    """Return the sample columns of the ``email_data`` dataset used by the tests.

    Five ``ColumnInfo`` objects are built, in this order: ``id``,
    ``email_from``, ``email_to``, ``email_sent`` and ``email_received``.
    Each carries ten sample values.
    """
    # One row per column: (name, description, datatype, sample values).
    column_rows = [
        ("id", "Unique identifier", "int", list(range(1, 11))),
        (
            "email_from",
            "Sender's email address",
            "str",
            [
                "[email protected]",
                "[email protected]",
                "[email protected]",
                "[email protected]",
                "[email protected]",
                "[email protected]",
                "[email protected]",
                "[email protected]",
                "[email protected]",
                "[email protected]",
            ],
        ),
        (
            "email_to",
            "Recipient's email address",
            "str",
            [
                "[email protected]",
                "[email protected]",
                "[email protected]",
                "[email protected]",
                "[email protected]",
                "[email protected]",
                "[email protected]",
                "[email protected]",
                "[email protected]",
                "[email protected]",
            ],
        ),
        (
            "email_sent",
            "Indicates if email was sent",
            "bool",
            [False, True, True, False, True, False, True, True, False, True],
        ),
        (
            "email_received",
            "Indicates if email was received",
            "bool",
            [False, True, False, False, True, False, False, True, False, False],
        ),
    ]
    return [
        ColumnInfo(
            metadata=Metadata(
                meta_info={
                    "Name": column_name,
                    "Description": description,
                    "Datatype": datatype,
                    "Dataset_Name": "email_data",
                }
            ),
            values=sample_values,
        )
        for column_name, description, datatype, sample_values in column_rows
    ]
|
||
|
||
def test_perform_basic_checks_with_exclude_name():
    """Basic checks must fail only for columns named in ``ExcludeName``."""
    excluded_names = ("email_sent", "email_received")
    infotype_config = exclude_name_test_config["Email_Address"]
    for column in column_infos():
        passed = perform_basic_checks(
            Metadata(column.metadata.meta_info),
            column.values,
            infotype_config,
            "Email_Address",
            1,
        )
        # The name is read after the call, as in the original test.
        should_pass = column.metadata.meta_info["Name"] not in excluded_names
        assert bool(passed) == should_pass
|
||
|
||
def test_perform_basic_checks_with_none_exclude_name():
    """With ``ExcludeName`` set to None, every sample column must pass."""
    infotype_config = none_exclude_name_test_config["Email_Address"]
    for column in column_infos():
        passed = perform_basic_checks(
            Metadata(column.metadata.meta_info),
            column.values,
            infotype_config,
            "Email_Address",
            1,
        )
        assert passed
|
||
|
||
def test_strip_formatting():
    """Check ``strip_formatting`` against known input/output pairs."""
    expected_by_input = (
        ("Name", "name"),
        ("my_column_name", "mycolumnname"),
        ("Col.Name", "colname"),
    )
    for raw_name, expected in expected_by_input:
        assert strip_formatting(raw_name) == expected