From 51ed8c3162020650b704e5944a33fbd4487c655f Mon Sep 17 00:00:00 2001
From: Chooi Je Qin <42904912+jeqinchooi@users.noreply.github.com>
Date: Sat, 9 Dec 2023 18:37:14 -0500
Subject: [PATCH 1/8] Categorise errors from log file and write them to the
 persistent file

---
 ersilia/core/tracking.py | 58 ++++++++++++++++++++++++++++++++++++----
 1 file changed, 53 insertions(+), 5 deletions(-)

diff --git a/ersilia/core/tracking.py b/ersilia/core/tracking.py
index 0e4600748..352081bcd 100644
--- a/ersilia/core/tracking.py
+++ b/ersilia/core/tracking.py
@@ -7,6 +7,7 @@
 import boto3
 from botocore.exceptions import ClientError
 import os
+import re
 
 PERSISTENT_FILE_PATH = os.path.abspath("current_session.txt")
 # Temporary path to log files
@@ -17,14 +18,61 @@ def log_files_metrics(file):
     error_count = 0
     warning_count = 0
 
+    ersilia_error_flag = False
+    misc_error_flag = False
+    error_name = ""
+    errors = {}
+
     with open(file, "r") as file:
+        line = None
         for line in file:
-            if "| ERROR" in line:
-                error_count += 1
-            elif "| WARNING" in line:
-                warning_count += 1
+            if not re.match(r"^\d{2}.\d{2}.\d{2} \| ", line):
+                # continuation of log
+                if ersilia_error_flag:
+                    # catch the error name if hinted by previous line
+                    error_name = line.rstrip()
+                    errors[error_name] += 1
+                    ersilia_error_flag = False
+                    continue
+                elif misc_error_flag:
+                    error_name += line.rstrip()
+                    if len(error_name) > 100:
+                        error_name = error_name[:97] + "..."
+                        misc_error_flag = False
+            else:
+                # encountering new logs
+                # make sure error flags are closed
+                if ersilia_error_flag:
+                    errors["Unknown Ersilia exception class"] += 1
+                    ersilia_error_flag = False
+                if misc_error_flag:
+                    errors[error_name] += 1
+                    misc_error_flag = False
+                if "| ERROR" in line:
+                    error_count += 1
+                    # checking which type of errors
+                    if "Ersilia exception class:" in line:
+                        # combine this with the next line, usually EmptyOutputError or SourceCodeBaseInformationError
+                        # the detailed message is long
+                        ersilia_error_flag = True
+                    else:
+                        # other errors are pretty self-descriptive and short. Will cap by character
+                        misc_error_flag = True
+                        error_name = line.split('| ERROR | ')[1].rstrip()
+                elif "| WARNING" in line:
+                    warning_count += 1
+        if line is not None:
+            # in case last log is error
+            # make sure error flags are closed
+            if ersilia_error_flag:
+                errors["Unknown Ersilia exception class"] += 1
+            if misc_error_flag:
+                errors[error_name] += 1
 
     write_persistent_file(f"Error count: {error_count}")
+    if len(errors) > 0:
+        write_persistent_file(f"Breakdown by error types:")
+        for error in errors:
+            write_persistent_file(f"{error}: {errors[error]}")
     write_persistent_file(f"Warning count: {warning_count}")
 
@@ -74,7 +122,7 @@ def upload_to_s3(json_dict, bucket="t4sg-ersilia", object_name=None):
     # If S3 object_name was not specified, use file_name
     if object_name is None:
         object_name = (
-            datetime.now().strftime("%Y-%m-%d-%H-%M-%S") + "-" + json_dict["model_id"]
+            datetime.now().strftime("%Y-%m-%d-%H-%M-%S") + "-" + json_dict["model_id"]
         )
 
     # Dump JSON into a temporary file to upload
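Note on the parsing strategy above: it keys on the timestamp prefix that Ersilia's logger puts on each entry (the ^\d{2}.\d{2}.\d{2} | pattern), and any line lacking that prefix is treated as a continuation of the previous entry — that is how multi-line "Ersilia exception class" errors get folded into a single error name. A minimal, self-contained sketch of the same strategy follows, for illustration only: the sample log lines are invented, the timestamp dots are escaped in the regex, and the tally uses collections.defaultdict so a name seen for the first time starts at zero (the patch indexes errors[error_name] directly, which assumes the key already exists).

import re
from collections import defaultdict

SAMPLE_LOG = """\
23.12.09 | ERROR | Ersilia exception class:
EmptyOutputError
23.12.09 | WARNING | model output looks empty
23.12.09 | ERROR | could not reach the model hub
"""

def categorise(text):
    errors, expecting_class = defaultdict(int), False
    for line in text.splitlines():
        if not re.match(r"^\d{2}\.\d{2}\.\d{2} \| ", line):
            if expecting_class:  # continuation line names the exception class
                errors[line.strip()] += 1
                expecting_class = False
        elif "| ERROR" in line:
            if "Ersilia exception class:" in line:
                expecting_class = True  # the class name arrives on the next line
            else:
                errors[line.split("| ERROR | ")[1].strip()] += 1
    return dict(errors)

print(categorise(SAMPLE_LOG))
# {'EmptyOutputError': 1, 'could not reach the model hub': 1}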
From 70da68a9c5322963e838fb24704710ff727621e8 Mon Sep 17 00:00:00 2001
From: Chooi Je Qin <42904912+jeqinchooi@users.noreply.github.com>
Date: Sat, 9 Dec 2023 18:49:34 -0500
Subject: [PATCH 2/8] Add comments for get_file_sizes

---
 ersilia/core/tracking.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/ersilia/core/tracking.py b/ersilia/core/tracking.py
index 352081bcd..01ad3fe80 100644
--- a/ersilia/core/tracking.py
+++ b/ersilia/core/tracking.py
@@ -191,6 +191,13 @@ def stats(self, result):
         return stats
 
     def get_file_sizes(self, input_df, output_df):
+        """
+        Calculates the size of the input and output dataframes, as well as the average size of each row.
+
+        :param input_df: Pandas dataframe containing the input data
+        :param output_df: Pandas dataframe containing the output data
+        :return: dictionary containing the input size, output size, average input size, and average output size
+        """
         input_size = input_df.memory_usage(deep=True).sum() / 1024
         output_size = output_df.memory_usage(deep=True).sum() / 1024
 

From ca21906bd85fa46e2ebb16ab908277d5cb56467f Mon Sep 17 00:00:00 2001
From: Eric
Date: Sat, 9 Dec 2023 23:45:49 -0500
Subject: [PATCH 3/8] Add documentation for check_types and log_files_metrics

---
 ersilia/core/tracking.py | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/ersilia/core/tracking.py b/ersilia/core/tracking.py
index 0e4600748..640b0376d 100644
--- a/ersilia/core/tracking.py
+++ b/ersilia/core/tracking.py
@@ -14,6 +14,14 @@
 
 
 def log_files_metrics(file):
+    """
+    This function will log the number of errors and warnings in the log files.
+
+    :param file: The log file to be read
+    :return: None (writes to file)
+    """
+
+
     error_count = 0
     warning_count = 0
 
@@ -157,6 +165,15 @@ def get_file_sizes(self, input_df, output_df):
         }
 
     def check_types(self, resultDf, metadata):
+        """
+        This method checks the types of the output dataframe against the expected types.
+        This includes checking the shape of the output dataframe (list vs single) and the types of each column.
+
+        :param resultDf: The output dataframe
+        :param metadata: The metadata dictionary
+        :return: A dictionary containing the number of mismatched types and a boolean for whether the shape is correct
+        """
+
         typeDict = {"float64": "Float", "int64": "Int"}
         count = 0
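The documented check_types boils down to mapping pandas dtypes onto the type labels in Ersilia's model metadata. A toy illustration of that mapping — the column names and metadata values here are invented, not taken from a real model:

import pandas as pd

# Invented output frame and metadata, shaped like the ones check_types expects
result_df = pd.DataFrame(
    {"key": ["k1"], "input": ["CCO"], "pred_1": [0.42], "pred_2": [3]}
)
metadata = {"Output Type": ["Float"], "Output Shape": "List"}

type_dict = {"float64": "Float", "int64": "Int"}
dtypes = result_df.loc[:, ~result_df.columns.isin(["key", "input"])].dtypes

mismatches = sum(type_dict[str(t)] != metadata["Output Type"][0] for t in dtypes)
correct_shape = (len(dtypes) > 1) == (metadata["Output Shape"] == "List")

print(mismatches, correct_shape)  # 1 True: pred_2 is int64 where Float was declared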
From 731dc200a3a0a46c4e5db9e8843254433ba41d32 Mon Sep 17 00:00:00 2001
From: Eric
Date: Mon, 18 Dec 2023 15:32:39 -0500
Subject: [PATCH 4/8] Add create_csv function

---
 ersilia/core/tracking.py | 26 +++++++++++++++++++++++++-
 1 file changed, 25 insertions(+), 1 deletion(-)

diff --git a/ersilia/core/tracking.py b/ersilia/core/tracking.py
index 640b0376d..f6e6cb002 100644
--- a/ersilia/core/tracking.py
+++ b/ersilia/core/tracking.py
@@ -13,6 +13,27 @@
 TEMP_FILE_LOGS = os.path.abspath("")
 
 
+def create_csv(output_df):
+    """
+    This function takes in the output dataframe from the model run and returns
+    a new temporary csv file that will later be passed to CDD vault. The CSV
+    file has two columns: the first column is the input molecules and the
+    second column is the ISO-formatted time of the run.
+
+    :param file: The output dataframe from the model run
+    :return: A new temporary csv file
+    """
+
+    new_df = output_df[['input']].copy()
+    current_time = datetime.now().isoformat()
+
+    new_df['time'] = current_time
+    csv_file = tempfile.NamedTemporaryFile(mode="w", suffix=".csv")
+    new_df.to_csv(csv_file.name, index=False)
+
+    return csv_file
+
+
 def log_files_metrics(file):
     """
     This function will log the number of errors and warnings in the log files.
@@ -193,7 +214,8 @@ def check_types(self, resultDf, metadata):
         else:
             print("Output is correct shape.")
             correct_shape = True
-
+
+        print(resultDf)
         print("Output has", count, "mismatched types.\n")
 
         return {"mismatched_types": count, "correct_shape": correct_shape}
@@ -240,5 +262,7 @@ def track(self, input, result, meta):
         json_object = json.dumps(json_dict, indent=4)
         write_persistent_file(json_object)
 
+        create_csv(result_dataframe)
+
         # Upload run stats to s3
         upload_to_s3(json_dict)

From ea26754c843922131b44ec692eda43c9f6db063a Mon Sep 17 00:00:00 2001
From: Anthony Cui
Date: Wed, 20 Dec 2023 22:56:11 -0500
Subject: [PATCH 5/8] Update exception handling to catch more common errors

---
 ersilia/core/tracking.py | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

diff --git a/ersilia/core/tracking.py b/ersilia/core/tracking.py
index 0e4600748..15f27e353 100644
--- a/ersilia/core/tracking.py
+++ b/ersilia/core/tracking.py
@@ -5,7 +5,7 @@
 import tempfile
 import logging
 import boto3
-from botocore.exceptions import ClientError
+from botocore.exceptions import ClientError, NoCredentialsError
 import os
 
 PERSISTENT_FILE_PATH = os.path.abspath("current_session.txt")
@@ -17,15 +17,18 @@ def log_files_metrics(file):
     error_count = 0
     warning_count = 0
 
-    with open(file, "r") as file:
-        for line in file:
-            if "| ERROR" in line:
-                error_count += 1
-            elif "| WARNING" in line:
-                warning_count += 1
+    try:
+        with open(file, "r") as file:
+            for line in file:
+                if "| ERROR" in line:
+                    error_count += 1
+                elif "| WARNING" in line:
+                    warning_count += 1
 
-    write_persistent_file(f"Error count: {error_count}")
-    write_persistent_file(f"Warning count: {warning_count}")
+        write_persistent_file(f"Error count: {error_count}")
+        write_persistent_file(f"Warning count: {warning_count}")
+    except FileNotFoundError:
+        logging.warning("Log file not found")
 
 
 def read_csv(file):
@@ -89,6 +92,8 @@ def upload_to_s3(json_dict, bucket="t4sg-ersilia", object_name=None):
     s3_client = boto3.client("s3")
     try:
         s3_client.upload_file(tmp.name, bucket, f"{object_name}.json")
+    except NoCredentialsError:
+        logging.error("Unable to upload tracking data to AWS: Credentials not found")
     except ClientError as e:
         logging.error(e)
         return False
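One thing worth knowing when calling create_csv: tempfile.NamedTemporaryFile deletes the file as soon as the handle is closed or garbage-collected, so the caller must keep the returned object alive for as long as the CSV is needed. Below is a small usage sketch with an invented dataframe; it passes delete=False as an alternative, which keeps the file on disk until it is removed explicitly (writing via f.name mirrors the patch, though reopening by name can fail on Windows while the handle is open):

import tempfile
from datetime import datetime

import pandas as pd

output_df = pd.DataFrame({"input": ["CCO", "CCN"], "pred": [0.1, 0.9]})  # toy data

new_df = output_df[["input"]].copy()
new_df["time"] = datetime.now().isoformat()

# delete=False keeps the CSV on disk after the handle closes,
# so a later upload step can reopen it by name
with tempfile.NamedTemporaryFile(mode="w", suffix=".csv", delete=False) as f:
    new_df.to_csv(f.name, index=False)
    path = f.name

print(open(path).read())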
From 26d9bd48b1db81d5f66ee214982cca4e2d6626a0 Mon Sep 17 00:00:00 2001
From: Anthony Cui
Date: Wed, 20 Dec 2023 23:19:59 -0500
Subject: [PATCH 6/8] Update documentation

---
 ersilia/core/tracking.py | 90 ++++++++++++++++++++++++----------------
 1 file changed, 54 insertions(+), 36 deletions(-)

diff --git a/ersilia/core/tracking.py b/ersilia/core/tracking.py
index 88af73455..f4096f405 100644
--- a/ersilia/core/tracking.py
+++ b/ersilia/core/tracking.py
@@ -10,7 +10,7 @@
 import re
 
 PERSISTENT_FILE_PATH = os.path.abspath("current_session.txt")
-# Temporary path to log files
+# Temporary path to log files until log files are fixed
 TEMP_FILE_LOGS = os.path.abspath("")
 
@@ -21,17 +21,17 @@ def create_csv(output_df):
     a new temporary csv file that will later be passed to CDD vault. The CSV
     file has two columns: the first column is the input molecules and the
     second column is the ISO-formatted time of the run.
 
-    :param file: The output dataframe from the model run
+    :param output_df: The output dataframe from the model run
     :return: A new temporary csv file
     """
 
-    new_df = output_df[['input']].copy()
+    new_df = output_df[["input"]].copy()
     current_time = datetime.now().isoformat()
 
-    new_df['time'] = current_time
+    new_df["time"] = current_time
     csv_file = tempfile.NamedTemporaryFile(mode="w", suffix=".csv")
     new_df.to_csv(csv_file.name, index=False)
-    
+
     return csv_file
@@ -43,7 +43,6 @@ def log_files_metrics(file):
     This function will log the number of errors and warnings in the log files.
 
     :param file: The log file to be read
     :return: None (writes to file)
     """
-
     error_count = 0
     warning_count = 0
@@ -88,7 +87,7 @@ def log_files_metrics(file):
                 else:
                     # other errors are pretty self-descriptive and short. Will cap by character
                     misc_error_flag = True
-                    error_name = line.split('| ERROR | ')[1].rstrip()
+                    error_name = line.split("| ERROR | ")[1].rstrip()
             elif "| WARNING" in line:
                 warning_count += 1
         if line is not None:
@@ -109,29 +108,34 @@ def log_files_metrics(file):
         logging.warning("Log file not found")
 
 
-def read_csv(file):
-    # reads csv file and returns Pandas dataframe
-    return pd.read_csv(file)
-
-
-def read_json(result):
-    data = json.load(result)
-    return data
-
-
 def open_persistent_file(model_id):
+    """
+    Opens a new persistent file, specifically for a run of model_id
+
+    :param model_id: The currently running model
+    """
     with open(PERSISTENT_FILE_PATH, "w") as f:
         f.write("Session started for model: {0}\n".format(model_id))
 
 
 def write_persistent_file(contents):
+    """
+    Writes contents to the current persistent file. Only writes if the file actually exists.
+
+    :param contents: The contents to write to the file.
+    """
+
     # Only write to file if it already exists (we're meant to be tracking this run)
     if os.path.isfile(PERSISTENT_FILE_PATH):
         with open(PERSISTENT_FILE_PATH, "a") as f:
             f.write(contents + "\n")
 
 
 def close_persistent_file():
+    """
+    Closes the persistent file, renaming it to a unique name.
+    """
+
     # Make sure the file actually exists before we try renaming
     if os.path.isfile(PERSISTENT_FILE_PATH):
         log_files_metrics(TEMP_FILE_LOGS)
@@ -171,7 +175,9 @@ def upload_to_s3(json_dict, bucket="t4sg-ersilia", object_name=None):
     try:
         s3_client.upload_file(tmp.name, bucket, f"{object_name}.json")
     except NoCredentialsError:
-        logging.error("Unable to upload tracking data to AWS: Credentials not found")
+        logging.error(
+            "Unable to upload tracking data to AWS: Credentials not found"
+        )
     except ClientError as e:
         logging.error(e)
         return False
@@ -181,9 +187,8 @@ class RunTracker:
     """
     This class will be responsible for tracking model runs. It calculates the desired metadata based on a model's
-    inputs, outputs, and other run-specific features, before uploading them to Ersilia's Splunk dashboard.
-
-    NOTE: Currently, the Splunk connection is not set up. For now, we will print tracking results to the console.
+    inputs, outputs, and other run-specific features, before uploading them to AWS to be ingested
+    into Ersilia's Splunk dashboard.
     """
 
     def __init__(self):
@@ -193,6 +198,10 @@ def __init__(self):
 
     # function to be called before model is run
     def start_tracking(self):
+        """
+        Runs any code necessary for the beginning of the run.
+        Currently necessary for tracking the runtime and memory usage of a run.
+        """
         self.time_start = datetime.now()
         tracemalloc.start()
         self.memory_usage_start = tracemalloc.get_traced_memory()[0]
@@ -202,10 +211,16 @@ def sample_df(self, df, num_rows, num_cols):
         """
         return df.sample(num_rows, axis=0).sample(num_cols, axis=1)
 
-# Stats function: calculates the basic statistics of the output file from a model. This includes the
-# mode (if applicable), minimum, maximum, and standard deviation.
     def stats(self, result):
-        dat = read_csv(result)
+        """
+        Stats function: calculates the basic statistics of the output file from a model. This includes the
+        mode (if applicable), minimum, maximum, and standard deviation.
+
+        :param result: The path to the model's output file.
+        :return: A dictionary containing the stats for each column of the result.
+        """
+
+        dat = pd.read_csv(result)
 
         # drop first two columns (key, input)
         dat = dat.drop(["key", "input"], axis=1)
@@ -248,42 +263,47 @@ def get_file_sizes(self, input_df, output_df):
         "avg_output_size": output_avg_row_size,
     }
 
-    def check_types(self, resultDf, metadata):
+    def check_types(self, result_df, metadata):
         """
         This method checks the types of the output dataframe against the expected types.
         This includes checking the shape of the output dataframe (list vs single) and the types of each column.
 
-        :param resultDf: The output dataframe
+        :param result_df: The output dataframe
         :param metadata: The metadata dictionary
         :return: A dictionary containing the number of mismatched types and a boolean for whether the shape is correct
         """
 
-        typeDict = {"float64": "Float", "int64": "Int"}
+        type_dict = {"float64": "Float", "int64": "Int"}
         count = 0
 
         # ignore key and input columns
-        dtypesLst = resultDf.loc[:, ~resultDf.columns.isin(["key", "input"])].dtypes
+        dtypes_list = result_df.loc[:, ~result_df.columns.isin(["key", "input"])].dtypes
 
-        for i in dtypesLst:
-            if typeDict[str(i)] != metadata["Output Type"][0]:
+        for i in dtypes_list:
+            if type_dict[str(i)] != metadata["Output Type"][0]:
                 count += 1
 
-        if len(dtypesLst) > 1 and metadata["Output Shape"] != "List":
+        if len(dtypes_list) > 1 and metadata["Output Shape"] != "List":
             print("Not right shape. Expected List but got Single")
             correct_shape = False
-        elif len(dtypesLst) == 1 and metadata["Output Shape"] != "Single":
+        elif len(dtypes_list) == 1 and metadata["Output Shape"] != "Single":
             print("Not right shape. Expected Single but got List")
             correct_shape = False
         else:
             print("Output is correct shape.")
             correct_shape = True
 
-        print(resultDf)
+        print(result_df)
         print("Output has", count, "mismatched types.\n")
 
         return {"mismatched_types": count, "correct_shape": correct_shape}
 
     def get_peak_memory(self):
+        """
+        Calculates the peak memory usage of ersilia's Python instance during the run.
+
+        :return: The peak memory usage in bytes.
+        """
+
         # Compare memory between peak and amount when we started
         peak_memory = tracemalloc.get_traced_memory()[1] - self.memory_usage_start
         tracemalloc.stop()
@@ -295,8 +315,8 @@ def track(self, input, result, meta):
         Tracks the results after a model run.
         """
         json_dict = {}
-        input_dataframe = read_csv(input)
-        result_dataframe = read_csv(result)
+        input_dataframe = pd.read_csv(input)
+        result_dataframe = pd.read_csv(result)
 
         json_dict["input_dataframe"] = input_dataframe.to_dict()
         json_dict["result_dataframe"] = result_dataframe.to_dict()
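The reworked stats method now reads the output CSV with pandas directly and collects per-column summary statistics. A simplified sketch of that calculation on an invented output file follows; the per-column structure in the actual method is richer (e.g. mode is only reported where applicable), so treat this as an approximation:

import io

import pandas as pd

# Invented model output in the key/input/value layout stats expects
csv_text = "key,input,score\nk1,CCO,0.5\nk2,CCN,0.5\nk3,CCC,0.9\n"
dat = pd.read_csv(io.StringIO(csv_text)).drop(["key", "input"], axis=1)

stats = {}
for column in dat:
    stats[column] = {
        "mode": dat[column].mode().iloc[0],  # most frequent value
        "min": dat[column].min(),
        "max": dat[column].max(),
        "std": dat[column].std(),
    }

print(stats)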
+ """ + # Make sure the file actually exists before we try renaming if os.path.isfile(PERSISTENT_FILE_PATH): log_files_metrics(TEMP_FILE_LOGS) @@ -171,7 +173,9 @@ def upload_to_s3(json_dict, bucket="t4sg-ersilia", object_name=None): try: s3_client.upload_file(tmp.name, bucket, f"{object_name}.json") except NoCredentialsError: - logging.error("Unable to upload tracking data to AWS: Credentials not found") + logging.error( + "Unable to upload tracking data to AWS: Credentials not found" + ) except ClientError as e: logging.error(e) return False @@ -181,9 +185,8 @@ def upload_to_s3(json_dict, bucket="t4sg-ersilia", object_name=None): class RunTracker: """ This class will be responsible for tracking model runs. It calculates the desired metadata based on a model's - inputs, outputs, and other run-specific features, before uploading them to Ersilia's Splunk dashboard. - - NOTE: Currently, the Splunk connection is not set up. For now, we will print tracking results to the console. + inputs, outputs, and other run-specific features, before uploading them to AWS to be ingested + to Ersilia's Splunk dashboard. """ def __init__(self): @@ -192,6 +195,10 @@ def __init__(self): # function to be called before model is run def start_tracking(self): + """ + Runs any code necessary for the beginning of the run. + Currently necessary for tracking the runtime and memory usage of a run. + """ self.time_start = datetime.now() tracemalloc.start() self.memory_usage_start = tracemalloc.get_traced_memory()[0] @@ -202,10 +209,16 @@ def sample_df(self, df, num_rows, num_cols): """ return df.sample(num_rows, axis=0).sample(num_cols, axis=1) -# Stats function: calculates the basic statistics of the output file from a model. This includes the -# mode (if applicable), minimum, maximum, and standard deviation. def stats(self, result): - dat = read_csv(result) + """ + Stats function: calculates the basic statistics of the output file from a model. This includes the + mode (if applicable), minimum, maximum, and standard deviation. + + :param result: The path to the model's output file. + :return: A dictionary containing the stats for each column of the result. + """ + + dat = pd.read_csv(result) # drop first two columns (key, input) dat = dat.drop(["key", "input"], axis=1) @@ -248,42 +261,47 @@ def get_file_sizes(self, input_df, output_df): "avg_output_size": output_avg_row_size, } - def check_types(self, resultDf, metadata): + def check_types(self, result_df, metadata): """ This class is responsible for checking the types of the output dataframe against the expected types. This includes checking the shape of the output dataframe (list vs single) and the types of each column. - :param resultDf: The output dataframe + :param result_df: The output dataframe :param metadata: The metadata dictionary :return: A dictionary containing the number of mismatched types and a boolean for whether the shape is correct """ - typeDict = {"float64": "Float", "int64": "Int"} + type_dict = {"float64": "Float", "int64": "Int"} count = 0 # ignore key and input columns - dtypesLst = resultDf.loc[:, ~resultDf.columns.isin(["key", "input"])].dtypes + dtypes_list = result_df.loc[:, ~result_df.columns.isin(["key", "input"])].dtypes - for i in dtypesLst: - if typeDict[str(i)] != metadata["Output Type"][0]: + for i in dtypes_list: + if type_dict[str(i)] != metadata["Output Type"][0]: count += 1 - if len(dtypesLst) > 1 and metadata["Output Shape"] != "List": + if len(dtypes_list) > 1 and metadata["Output Shape"] != "List": print("Not right shape. 
From 7eafc437c78eac16e6279898256c6ce608640dc9 Mon Sep 17 00:00:00 2001
From: Anthony Cui
Date: Wed, 20 Dec 2023 23:44:30 -0500
Subject: [PATCH 8/8] Add skeleton code for CDD Vault tracking

---
 ersilia/core/tracking.py | 86 +++++++++++++++++++++++++++++-----------
 pyproject.toml           |  1 +
 2 files changed, 64 insertions(+), 23 deletions(-)

diff --git a/ersilia/core/tracking.py b/ersilia/core/tracking.py
index ab046bfac..2d20a5bce 100644
--- a/ersilia/core/tracking.py
+++ b/ersilia/core/tracking.py
@@ -8,33 +8,13 @@
 from botocore.exceptions import ClientError, NoCredentialsError
 import os
 import re
+import requests
 
 PERSISTENT_FILE_PATH = os.path.abspath("current_session.txt")
 # Temporary path to log files until log files are fixed
 TEMP_FILE_LOGS = os.path.abspath("")
 
 
-def create_csv(output_df):
-    """
-    This function takes in the output dataframe from the model run and returns
-    a new temporary csv file that will later be passed to CDD vault. The CSV
-    file has two columns: the first column is the input molecules and the
-    second column is the ISO-formatted time of the run.
-
-    :param output_df: The output dataframe from the model run
-    :return: A new temporary csv file
-    """
-
-    new_df = output_df[["input"]].copy()
-    current_time = datetime.now().isoformat()
-
-    new_df["time"] = current_time
-    csv_file = tempfile.NamedTemporaryFile(mode="w", suffix=".csv")
-    new_df.to_csv(csv_file.name, index=False)
-
-    return csv_file
-
-
 def log_files_metrics(file):
     """
     This function will log the number of errors and warnings in the log files.
@@ -182,6 +162,68 @@ def upload_to_s3(json_dict, bucket="t4sg-ersilia", object_name=None):
     return True
 
 
+def upload_to_cddvault(output_df, api_key):
+    """
+    This function takes in the output dataframe from the model run and uploads the data to CDD Vault.
+
+    NOTE: Currently, this is simply a skeleton of the final code. The TODO
+    below details the remaining changes.
+
+    :param output_df: The output dataframe from the model run
+    :param api_key: The API key for CDD Vault's API
+    :return: The response from the API call
+    """
+
+    # We use the slurps API path to be able to bulk upload data
+    url = "https://app.collaborativedrug.com/api/v1/vaults//slurps"
+    headers = {"CDD-Token": api_key}
+    # TODO: Update the project and header_mappings ids, and add mappings for any
+    # other output columns that should be tracked as well.
+    data = {
+        "project": "",
+        "autoreject": "true",
+        "mapping_template": {
+            "registration_type": "CHEMICAL_STRUCTURE",
+            "header_mappings": [
+                {
+                    "header": {"name": "input", "position": 0},
+                    "definition": {
+                        "id": -1,
+                        "type": "InternalFieldDefinition::MoleculeStructure",
+                    },
+                },
+                {
+                    "header": {"name": "time", "position": 1},
+                    "definition": {
+                        "id": -1,
+                        "type": "InternalFieldDefinition::BatchFieldDefinition",
+                    },
+                },
+            ],
+        },
+    }
+
+    # Save output_df to a CSV of the correct format
+    new_df = output_df[["input"]].copy()
+    current_time = datetime.now().isoformat()
+
+    new_df["time"] = current_time
+    csv_file = tempfile.NamedTemporaryFile(mode="w", suffix=".csv")
+    new_df.to_csv(csv_file.name, index=False)
+
+    files = {"file": open(csv_file.name, "rb")}
+
+    # Create and make API call
+    response = requests.post(
+        url, headers=headers, data={"json": json.dumps(data)}, files=files
+    )
+    if response.status_code == 200:
+        return response.json()
+    else:
+        logging.warning("API call to CDD Vault was unsuccessful")
+        return response.text
+
+
 class RunTracker:
     """
     This class will be responsible for tracking model runs. It calculates the desired metadata based on a model's
@@ -341,7 +383,5 @@ def track(self, input, result, meta):
         json_object = json.dumps(json_dict, indent=4)
         write_persistent_file(json_object)
 
-        create_csv(result_dataframe)
-
         # Upload run stats to s3
         upload_to_s3(json_dict)
diff --git a/pyproject.toml b/pyproject.toml
index f64a7a5de..c3f0ef442 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -53,6 +53,7 @@ sphinx = {version = ">=5.3.0", optional = true} # For compatibility with python
 jinja2 = {version = "^3.1.2", optional = true}
 levenshtein = {version = ">=0.21.1,<0.23.0", optional = true} # For faster fuzzy search
 boto3 = "^1.28.40"
+requests = "^2.31.0"
 
 [tool.poetry.extras]
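For context, a hedged sketch of how the skeleton might eventually be exercised. The dataframe is a toy, the CDD_API_KEY environment variable is an invented convention, and the call will not succeed until the vault ID in the URL and the project/definition IDs in the payload are filled in:

import os

import pandas as pd

from ersilia.core.tracking import upload_to_cddvault

# Toy output dataframe; a real run's output also carries key and model columns
output_df = pd.DataFrame({"input": ["CCO", "CCN"], "pred": [0.1, 0.9]})

# Assumed convention: the CDD Vault API key is supplied via the environment
api_key = os.environ.get("CDD_API_KEY", "")

response = upload_to_cddvault(output_df, api_key)
print(response)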