From 5163693666e1e46e34c3a4343ec342aecd97e832 Mon Sep 17 00:00:00 2001 From: kasun Date: Fri, 30 Jun 2023 15:26:06 +0000 Subject: [PATCH 1/3] handle the case of not having an explicit cohort config - no retrain --- src/triage/predictlist/__init__.py | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/src/triage/predictlist/__init__.py b/src/triage/predictlist/__init__.py index 1fc83f903..bf56d79db 100644 --- a/src/triage/predictlist/__init__.py +++ b/src/triage/predictlist/__init__.py @@ -72,15 +72,41 @@ def predict_forward_with_existed_model(db_engine, project_path, model_id, as_of_ upgrade_db(db_engine=db_engine) project_storage = ProjectStorage(project_path) matrix_storage_engine = project_storage.matrix_storage_engine() + # 1. Get feature and cohort config from database (train_matrix_uuid, matrix_metadata) = train_matrix_info_from_model_id(db_engine, model_id) experiment_config = experiment_config_from_model_id(db_engine, model_id) # 2. Generate cohort - cohort_table_name = f"triage_production.cohort_{experiment_config['cohort_config']['name']}" + cohort_config = experiment_config.get('cohort_config') + + + if cohort_config is None: + logger.info('Experiment config does not contain a cohort config. 
Using the label query') + label_query = experiment_config['label_config']['query'] + + # We need to remove the '{label_timespan} param' with some default + # It doesn't matter since we are not using the labels + # label_query = label_query.format(label_timespan='1week') + label_query = label_query.replace('{label_timespan}', '1week') + + # Just selecting the entity id from the labels query + cohort_query = f""" + select + entity_id + from ({label_query}) as lq + """ + print(cohort_query) + + cohort_config = dict() + cohort_config['query'] = cohort_query + cohort_config['name'] = 'default' + + + cohort_table_name = f"triage_production.cohort_{cohort_config['name']}" cohort_table_generator = EntityDateTableGenerator( db_engine=db_engine, - query=experiment_config['cohort_config']['query'], + query=cohort_config['query'], entity_date_table_name=cohort_table_name ) cohort_table_generator.generate_entity_date_table(as_of_dates=[dt_from_str(as_of_date)]) From 775f71cfb6acaccce800f91a417faddfcc63d077 Mon Sep 17 00:00:00 2001 From: kasun Date: Fri, 7 Jul 2023 15:43:40 +0000 Subject: [PATCH 2/3] get cohort query from the label query --- src/triage/predictlist/__init__.py | 41 ++++++++++-------------------- src/triage/predictlist/utils.py | 21 ++++++++++++++- 2 files changed, 34 insertions(+), 28 deletions(-) diff --git a/src/triage/predictlist/__init__.py b/src/triage/predictlist/__init__.py index bf56d79db..66bd4b64e 100644 --- a/src/triage/predictlist/__init__.py +++ b/src/triage/predictlist/__init__.py @@ -43,6 +43,7 @@ save_retrain_and_get_hash, get_retrain_config_from_model_id, temporal_params_from_matrix_metadata, + cohort_config_from_label_config ) @@ -78,35 +79,17 @@ def predict_forward_with_existed_model(db_engine, project_path, model_id, as_of_ experiment_config = experiment_config_from_model_id(db_engine, model_id) # 2. 
Generate cohort - cohort_config = experiment_config.get('cohort_config') + # cohort_config = experiment_config.get('cohort_config') - if cohort_config is None: + if experiment_config.get('cohort_config') is None: logger.info('Experiment config does not contain a cohort config. Using the label query') - label_query = experiment_config['label_config']['query'] - - # We need to remove the '{label_timespan} param' with some default - # It doesn't matter since we are not using the labels - # label_query = label_query.format(label_timespan='1week') - label_query = label_query.replace('{label_timespan}', '1week') - - # Just selecting the entity id from the labels query - cohort_query = f""" - select - entity_id - from ({label_query}) as lq - """ - print(cohort_query) - - cohort_config = dict() - cohort_config['query'] = cohort_query - cohort_config['name'] = 'default' + experiment_config['cohort_config'] = cohort_config_from_label_config(experiment_config['label_config']) - - cohort_table_name = f"triage_production.cohort_{cohort_config['name']}" + cohort_table_name = f"triage_production.cohort_{experiment_config['cohort_config']['name']}" cohort_table_generator = EntityDateTableGenerator( db_engine=db_engine, - query=cohort_config['query'], + query=experiment_config['cohort_config']['query'], entity_date_table_name=cohort_table_name ) cohort_table_generator.generate_entity_date_table(as_of_dates=[dt_from_str(as_of_date)]) @@ -176,7 +159,7 @@ def predict_forward_with_existed_model(db_engine, project_path, model_id, as_of_ label_name = experiment_config['label_config']['name'] label_type = 'binary' cohort_name = experiment_config['cohort_config']['name'] - user_metadata = experiment_config['user_metadata'] + user_metadata = experiment_config.get('user_metadata', {}) # Use timechop to get the time definition for production temporal_config = experiment_config["temporal_config"] @@ -260,6 +243,10 @@ def __init__(self, db_engine, project_path, model_group_id): 
self.test_duration = self.experiment_config['temporal_config']['test_durations'][0] self.feature_start_time=self.experiment_config['temporal_config']['feature_start_time'] + # Handling the case where a separate cohort_config is not defined + if self.experiment_config.get('cohort_config') is None: + self.experiment_config['cohort_config'] = cohort_config_from_label_config(self.experiment_config['label_config']) + self.label_name = self.experiment_config['label_config']['name'] self.cohort_name = self.experiment_config['cohort_config']['name'] self.user_metadata = self.experiment_config['user_metadata'] @@ -387,9 +374,9 @@ def retrain(self, prediction_date): chops_train_matrix = chops[0]['train_matrix'] as_of_date = datetime.strftime(chops_train_matrix['last_as_of_time'], "%Y-%m-%d") retrain_definition = { - 'first_as_of_time': chops_train_matrix['first_as_of_time'], - 'last_as_of_time': chops_train_matrix['last_as_of_time'], - 'matrix_info_end_time': chops_train_matrix['matrix_info_end_time'], + 'first_as_of_time': str(chops_train_matrix['first_as_of_time']), + 'last_as_of_time': str(chops_train_matrix['last_as_of_time']), + 'matrix_info_end_time': str(chops_train_matrix['matrix_info_end_time']), 'as_of_times': [as_of_date], 'training_label_timespan': chops_train_matrix['training_label_timespan'], 'max_training_history': chops_train_matrix['max_training_history'], diff --git a/src/triage/predictlist/utils.py b/src/triage/predictlist/utils.py index 9b5eeaf51..7b6e5709f 100644 --- a/src/triage/predictlist/utils.py +++ b/src/triage/predictlist/utils.py @@ -137,7 +137,7 @@ def get_feature_names(aggregation, matrix_metadata): logger.spam("Feature prefix = %s", feature_prefix) feature_group = aggregation.get_table_name(imputed=True).split('.')[1].replace('"', '') logger.spam("Feature group = %s", feature_group) - feature_names_in_group = [f for f in matrix_metadata['feature_names'] if re.match(f'\\A{feature_prefix}_', f)] + feature_names_in_group = [f for f in 
matrix_metadata['feature_names'] if re.match(f'\\A{feature_prefix}_entity_id', f)] logger.spam("Feature names in group = %s", feature_names_in_group) return feature_group, feature_names_in_group @@ -206,3 +206,22 @@ def save_retrain_and_get_hash(config, db_engine): return retrain_hash +def cohort_config_from_label_config(label_config): + """Handle the cases where the cohort query is not specified""" + + label_query = label_config['query'] + + cohort_config = dict() + cohort_config['name'] = 'default' + + # We can't have the label_timespan in the cohort query + label_query = label_query.replace('{label_timespan}', '1week') + + # We use the label query as a subquery and extract the entity ids + cohort_config['query'] = f""" + select + entity_id + from ({label_query}) as lq + """ + + return cohort_config From fa71501b7b7e27c5d3b73e5946a8f658e86043b1 Mon Sep 17 00:00:00 2001 From: kasun Date: Fri, 7 Jul 2023 17:48:03 +0000 Subject: [PATCH 3/3] fix datetime type serializability error in matrix metadata --- src/triage/predictlist/__init__.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/src/triage/predictlist/__init__.py b/src/triage/predictlist/__init__.py index 66bd4b64e..59e8b997a 100644 --- a/src/triage/predictlist/__init__.py +++ b/src/triage/predictlist/__init__.py @@ -79,10 +79,8 @@ def predict_forward_with_existed_model(db_engine, project_path, model_id, as_of_ experiment_config = experiment_config_from_model_id(db_engine, model_id) # 2. Generate cohort - # cohort_config = experiment_config.get('cohort_config') - - - if experiment_config.get('cohort_config') is None: + # If a separate cohort_config is not defined in the config logger.info('Experiment config does not contain a cohort config. 
Using the label query') experiment_config['cohort_config'] = cohort_config_from_label_config(experiment_config['label_config']) @@ -171,8 +169,16 @@ def predict_forward_with_existed_model(db_engine, project_path, model_id, as_of_ test_label_timespan=temporal_config['test_label_timespans'][0] ) + last_split_definition = prod_definitions[-1] + + # formatting the datetimes as strings to be saved as JSON + last_split_definition['first_as_of_time'] = str(last_split_definition['first_as_of_time']) + last_split_definition['last_as_of_time'] = str(last_split_definition['last_as_of_time']) + last_split_definition['matrix_info_end_time'] = str(last_split_definition['matrix_info_end_time']) + last_split_definition['as_of_times'] = [str(last_split_definition['as_of_times'][0])] + matrix_metadata = Planner.make_metadata( - prod_definitions[-1], + last_split_definition, reconstructed_feature_dict, label_name, label_type,