From a84830ff8aff99fc4cf8874f113e5d163d56ddd5 Mon Sep 17 00:00:00 2001 From: mardikark-gslab Date: Thu, 8 Jun 2023 04:01:05 +0530 Subject: [PATCH] Optional argument to specify minimum number of column values required to process (#18) --- datahub-classify/README.md | 1 + .../src/datahub_classify/infotype_predictor.py | 7 ++++++- datahub-classify/src/datahub_classify/infotype_utils.py | 6 +++--- 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/datahub-classify/README.md b/datahub-classify/README.md index aca7b10..26ce002 100644 --- a/datahub-classify/README.md +++ b/datahub-classify/README.md @@ -18,6 +18,7 @@ API expects following parameters in the output - `confidence_level_threshold` - If the infotype prediction confidence is greater than the confidence threshold then the prediction is considered as a proposal. This is the common threshold for all infotypes. - `global_config` - This dictionary contains configuration details about all supported infotypes. Refer section [Infotype Configuration](#infotype-configuration) for more information. - `infotypes` - This is a list of infotypes that is to be processed. This is an optional argument, if specified then it will override the default list of all supported infotypes. If user is interested in only few infotypes then this list can be specified with correct infotype names. Infotype names are case sensitive. +- `minimum_values_threshold` - Minimum number of column values required for processing. This is an optional argument, default is 50. ### API Output diff --git a/datahub-classify/src/datahub_classify/infotype_predictor.py b/datahub-classify/src/datahub_classify/infotype_predictor.py index 31db532..98db50c 100644 --- a/datahub-classify/src/datahub_classify/infotype_predictor.py +++ b/datahub-classify/src/datahub_classify/infotype_predictor.py @@ -37,6 +37,7 @@ def predict_infotypes( confidence_level_threshold: float, global_config: Dict[str, Dict], infotypes: Optional[List[str]] = None, + minimum_values_threshold: int = 50, ) -> List[ColumnInfo]: infotype_function_map = get_infotype_function_mapping(infotypes, global_config) logger.debug(f"Total columns to be processed --> {len(column_infos)}") @@ -62,7 +63,11 @@ def predict_infotypes( ] try: if perform_basic_checks( - column_info.metadata, column_info.values, config_dict, infotype + column_info.metadata, + column_info.values, + config_dict, + infotype, + minimum_values_threshold, ): confidence_level, debug_info = infotype_fn( column_info.metadata, column_info.values, config_dict diff --git a/datahub-classify/src/datahub_classify/infotype_utils.py b/datahub-classify/src/datahub_classify/infotype_utils.py index 6888d0b..32ca53f 100644 --- a/datahub-classify/src/datahub_classify/infotype_utils.py +++ b/datahub-classify/src/datahub_classify/infotype_utils.py @@ -1,6 +1,6 @@ import logging import re -from typing import Any, Dict, List, Optional +from typing import Any, Dict, List from datahub_classify.constants import PREDICTION_FACTORS_AND_WEIGHTS, VALUES from datahub_classify.helper_classes import Metadata @@ -83,10 +83,10 @@ def perform_basic_checks( metadata: Metadata, values: List[Any], config_dict: Dict[str, Dict], - infotype: Optional[str] = None, + infotype: str, + minimum_values_threshold: int, ) -> bool: basic_checks_status = True - minimum_values_threshold = 50 if ( config_dict[PREDICTION_FACTORS_AND_WEIGHTS].get(VALUES, None) and len(values) < minimum_values_threshold