Skip to content

Commit

Permalink
Optional argument to specify minimum number of column values required…
Browse files Browse the repository at this point in the history
… to process (#18)
  • Loading branch information
mardikark-gslab authored Jun 7, 2023
1 parent c3c9dac commit a84830f
Show file tree
Hide file tree
Showing 3 changed files with 10 additions and 4 deletions.
1 change: 1 addition & 0 deletions datahub-classify/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ API expects following parameters in the output
- `confidence_level_threshold` - If the infotype prediction confidence is greater than the confidence threshold then the prediction is considered as a proposal. This is the common threshold for all infotypes.
- `global_config` - This dictionary contains configuration details about all supported infotypes. Refer to the section [Infotype Configuration](#infotype-configuration) for more information.
- `infotypes` - This is a list of infotypes that are to be processed. This is an optional argument; if specified, it overrides the default list of all supported infotypes. If the user is interested in only a few infotypes, this list can be specified with the correct infotype names. Infotype names are case sensitive.
- `minimum_values_threshold` - Minimum number of column values required for processing. This is an optional argument; the default is 50.

### API Output

Expand Down
7 changes: 6 additions & 1 deletion datahub-classify/src/datahub_classify/infotype_predictor.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ def predict_infotypes(
confidence_level_threshold: float,
global_config: Dict[str, Dict],
infotypes: Optional[List[str]] = None,
minimum_values_threshold: int = 50,
) -> List[ColumnInfo]:
infotype_function_map = get_infotype_function_mapping(infotypes, global_config)
logger.debug(f"Total columns to be processed --> {len(column_infos)}")
Expand All @@ -62,7 +63,11 @@ def predict_infotypes(
]
try:
if perform_basic_checks(
column_info.metadata, column_info.values, config_dict, infotype
column_info.metadata,
column_info.values,
config_dict,
infotype,
minimum_values_threshold,
):
confidence_level, debug_info = infotype_fn(
column_info.metadata, column_info.values, config_dict
Expand Down
6 changes: 3 additions & 3 deletions datahub-classify/src/datahub_classify/infotype_utils.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import logging
import re
from typing import Any, Dict, List, Optional
from typing import Any, Dict, List

from datahub_classify.constants import PREDICTION_FACTORS_AND_WEIGHTS, VALUES
from datahub_classify.helper_classes import Metadata
Expand Down Expand Up @@ -83,10 +83,10 @@ def perform_basic_checks(
metadata: Metadata,
values: List[Any],
config_dict: Dict[str, Dict],
infotype: Optional[str] = None,
infotype: str,
minimum_values_threshold: int,
) -> bool:
basic_checks_status = True
minimum_values_threshold = 50
if (
config_dict[PREDICTION_FACTORS_AND_WEIGHTS].get(VALUES, None)
and len(values) < minimum_values_threshold
Expand Down

0 comments on commit a84830f

Please sign in to comment.