diff --git a/datahub-classify/README.md b/datahub-classify/README.md index aca7b10..26ce002 100644 --- a/datahub-classify/README.md +++ b/datahub-classify/README.md @@ -18,6 +18,7 @@ API expects following parameters in the output - `confidence_level_threshold` - If the infotype prediction confidence is greater than the confidence threshold then the prediction is considered as a proposal. This is the common threshold for all infotypes. - `global_config` - This dictionary contains configuration details about all supported infotypes. Refer section [Infotype Configuration](#infotype-configuration) for more information. - `infotypes` - This is a list of infotypes that is to be processed. This is an optional argument, if specified then it will override the default list of all supported infotypes. If user is interested in only few infotypes then this list can be specified with correct infotype names. Infotype names are case sensitive. +- `minimum_values_threshold` - Minimum number of values a column must have in order to be processed. This is an optional argument; the default is 50. 
### API Output diff --git a/datahub-classify/src/datahub_classify/infotype_predictor.py b/datahub-classify/src/datahub_classify/infotype_predictor.py index 31db532..98db50c 100644 --- a/datahub-classify/src/datahub_classify/infotype_predictor.py +++ b/datahub-classify/src/datahub_classify/infotype_predictor.py @@ -37,6 +37,7 @@ def predict_infotypes( confidence_level_threshold: float, global_config: Dict[str, Dict], infotypes: Optional[List[str]] = None, + minimum_values_threshold: int = 50, ) -> List[ColumnInfo]: infotype_function_map = get_infotype_function_mapping(infotypes, global_config) logger.debug(f"Total columns to be processed --> {len(column_infos)}") @@ -62,7 +63,11 @@ def predict_infotypes( ] try: if perform_basic_checks( - column_info.metadata, column_info.values, config_dict, infotype + column_info.metadata, + column_info.values, + config_dict, + infotype, + minimum_values_threshold, ): confidence_level, debug_info = infotype_fn( column_info.metadata, column_info.values, config_dict diff --git a/datahub-classify/src/datahub_classify/infotype_utils.py b/datahub-classify/src/datahub_classify/infotype_utils.py index 6888d0b..32ca53f 100644 --- a/datahub-classify/src/datahub_classify/infotype_utils.py +++ b/datahub-classify/src/datahub_classify/infotype_utils.py @@ -1,6 +1,6 @@ import logging import re -from typing import Any, Dict, List, Optional +from typing import Any, Dict, List from datahub_classify.constants import PREDICTION_FACTORS_AND_WEIGHTS, VALUES from datahub_classify.helper_classes import Metadata @@ -83,10 +83,10 @@ def perform_basic_checks( metadata: Metadata, values: List[Any], config_dict: Dict[str, Dict], - infotype: Optional[str] = None, + infotype: str, + minimum_values_threshold: int, ) -> bool: basic_checks_status = True - minimum_values_threshold = 50 if ( config_dict[PREDICTION_FACTORS_AND_WEIGHTS].get(VALUES, None) and len(values) < minimum_values_threshold