feat(config): code changes to support/process user specified infotype…

…s only (#3)
acryldata · Nov 4, 2022 · 815aa26 · 815aa26
1 parent 852ccce
commit 815aa26
Show file tree

Hide file tree

Showing 3 changed files with 22 additions and 10 deletions.
diff --git a/README.md b/README.md
@@ -5,8 +5,9 @@ This API populates infotype proposal(s) for each input column by using metadata,
 ### Input Contract:
 API expects the following parameters in the input
 - _column_infos_ - This is a list of ColumnInfo objects. Each ColumnInfo object contains metadata (col_name, description, datatype, etc) and values of a column. 
-- _confidence_level_threshold_ - If the infotype prediction confidence is greater than the confidence threshold then the prediction is considered as a proposal.
-- _global_config_ - This dictionary contains configuration details about all supported infotypes. Refer section X for more information.
+- _confidence_level_threshold_ - If the infotype prediction confidence is greater than the confidence threshold then the prediction is considered as a proposal. This is the common threshold for all infotypes.
+- _global_config_ - This dictionary contains configuration details about all supported infotypes. Refer section _"Infotype Configuration"_ for more information.
+- _infotypes_ - This is a list of the infotypes to be processed. This argument is optional; if it is omitted or empty, the default list of all supported infotypes is used, and if it is specified, it overrides that default list. If the user is interested in only a few infotypes, this list can be specified with the correct infotype names. Infotype names are case sensitive.
 ### Output Contract:
 API returns a list of ColumnInfo objects of the same length as the input list of ColumnInfo objects. A populated list of infotype proposal(s), if any, is added to the ColumnInfo object itself under the variable name infotype_proposals. The infotype_proposals list contains InfotypeProposal objects, which have the following information
 - _infotype_ - A proposed infotype name.
@@ -59,7 +60,14 @@ A debug information is associated with each infotype proposal, it provides detai
 - Name 
 - Description 
 - Datatype 
-- Values
+- Values 
+
+  {
+      'Name': 0.4, 
+      'Description': 0.2, 
+      'Values': 0.6, 
+      'Datatype': 0.3
+  }
 # Supported Infotypes
 1. Age 
 2. Gender 
@@ -75,7 +83,7 @@ Following libraries are required
 #### Required Spacy model
 $ python3 -m spacy download en_core_web_sm
 # Assumptions
-- If value prediction factor weight is non zero (indicating values should be used for infotype inspection) then a minimum 50 non-null column values should be present.
+- If the value prediction factor weight is non-zero (indicating that values should be used for infotype inspection), then a minimum of 50 non-null column values should be present.
 # How to Run
     $ cd <datahub-classify repo root folder>
     $ python3 -m venv venv

diff --git a/src/datahub_classify/infotype_predictor.py b/src/datahub_classify/infotype_predictor.py
@@ -9,21 +9,24 @@
 logger = logging.getLogger(__name__)
 
 
-def get_infotype_function_mapping():
+def get_infotype_function_mapping(infotypes):
     from inspect import getmembers, isfunction
     module_name = "datahub_classify.infotype_helper"
     module = importlib.import_module(module_name)
     module_fn_dict = dict(getmembers(module, isfunction))
     infotype_function_map = {}
-    for infotype in infotypes_to_use:
+    if not infotypes:
+        infotypes = infotypes_to_use
+    for infotype in infotypes:
         fn_name = 'inspect_for_%s' % infotype.lower()
         infotype_function_map[infotype] = module_fn_dict[fn_name]
     return infotype_function_map
 
 
-def predict_infotypes(column_infos: list[ColumnInfo], confidence_level_threshold: float, global_config: dict):
+def predict_infotypes(column_infos: list[ColumnInfo], confidence_level_threshold: float, global_config: dict,
+                      infotypes: list[str] = None):
     # assert type(column_infos) == list, "type of column_infos should be list"
-    infotype_function_map = get_infotype_function_mapping()
+    infotype_function_map = get_infotype_function_mapping(infotypes)
     logger.info(f"Total columns to be processed --> {len(column_infos)}")
     logger.info(f"Confidence Level Threshold set to --> {confidence_level_threshold}")
     logger.info("===========================================================")
@@ -44,7 +47,7 @@ def predict_infotypes(column_infos: list[ColumnInfo], confidence_level_threshold
                         infotype_proposal = InfotypeProposal(infotype, confidence_level, debug_info)
                         proposal_list.append(infotype_proposal)
                 else:
-                    raise Exception("Failed basic checks for infotype - %s and column - %s" % \
+                    raise Exception("Failed basic checks for infotype - %s and column - %s" %
                                     (infotype, column_info.metadata.name))
             except Exception as e:
                 # traceback.print_exc()

diff --git a/test/sample_testing.py b/test/sample_testing.py
@@ -20,6 +20,7 @@
 from datahub_classify.sample_input import input1 as input_dict
 from datahub_classify.supported_infotypes import infotypes_to_use
 
+
 current_wdr = os.getcwd()
 input_data_dir = current_wdr + "\\datasets\\"
 input_jsons_dir = current_wdr + "\\expected_output\\"
@@ -102,7 +103,7 @@ def get_public_data_expected_output(public_data_list, infotypes_to_use):
 
 def get_best_infotype_pred(public_data_list, confidence_threshold, expected_output_unit_testing):
     column_info_list = populate_column_info_list(public_data_list)
-    column_info_pred_list = predict_infotypes(column_info_list, confidence_threshold, input_dict)
+    column_info_pred_list = predict_infotypes(column_info_list, confidence_threshold, input_dict, infotypes_to_use)
     public_data_predicted_infotype = dict()
     # get_thresholds_for_unit_test = dict()
     public_data_predicted_infotype_confidence = dict()