From d6ad7607024ac08a16742aa60a5df5ddb062d4f5 Mon Sep 17 00:00:00 2001 From: Theresa Date: Sun, 22 Oct 2023 16:59:31 -0400 Subject: [PATCH 1/5] Write outputs to JSON and upload to S3 Bucket --- ersilia/core/tracking.py | 46 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 45 insertions(+), 1 deletion(-) diff --git a/ersilia/core/tracking.py b/ersilia/core/tracking.py index d3e7b272c..5c3ae196f 100644 --- a/ersilia/core/tracking.py +++ b/ersilia/core/tracking.py @@ -1,7 +1,12 @@ from datetime import datetime import json import pandas as pd - +import tracemalloc +# from ersilia import cli +import logging +import boto3 +from botocore.exceptions import ClientError +import os class RunTracker: """ @@ -48,3 +53,42 @@ def read_json(self, result): data = json.load(result) self.log_to_console(result) return data + + def start(self): + tracemalloc.start() + self.time_start = tracemalloc.get_traced_memory()[0] + + def track_memory(self): + peak_memory = tracemalloc.get_traced_memory()[1] - self.time_start + print(f"Peak memory: {peak_memory}") + tracemalloc.stop() + + +def write_file(dict): + str = json.dump(dict) + tmp = tempfile.NamedTemporaryFile() + + with open(tmp.name, 'w') as f: + f.write(str) + +def upload_file(file_name, bucket, object_name=None): + """Upload a file to an S3 bucket + + :param file_name: File to upload + :param bucket: Bucket to upload to + :param object_name: S3 object name. If not specified then file_name is used + :return: True if file was uploaded, else False + """ + + # If S3 object_name was not specified, use file_name + if object_name is None: + object_name = datetime.now().strftime("%Y-%m-%d-%H-%M-%S") + '-' + os.path.basename(file_name) + + # Upload the file + s3_client = boto3.client('s3') + try: + response = s3_client.upload_file(file_name, bucket, object_name) + except ClientError as e: + logging.error(e) + return False + return True \ No newline at end of file From d13c4e566d7deb43256b244064ad303704d610c2 Mon Sep 17 00:00:00 2001 From: Anthony Cui Date: Tue, 14 Nov 2023 15:56:08 -0500 Subject: [PATCH 2/5] Update imports --- ersilia/core/tracking.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ersilia/core/tracking.py b/ersilia/core/tracking.py index 5c3ae196f..d5e75ab08 100644 --- a/ersilia/core/tracking.py +++ b/ersilia/core/tracking.py @@ -2,7 +2,7 @@ import json import pandas as pd import tracemalloc -# from ersilia import cli +import tempfile import logging import boto3 from botocore.exceptions import ClientError From aa5bbf26d61798ffb4bcf419a511a0640ec6aba0 Mon Sep 17 00:00:00 2001 From: Anthony Cui Date: Tue, 14 Nov 2023 16:04:43 -0500 Subject: [PATCH 3/5] Clean up memory usage code --- ersilia/core/tracking.py | 71 +++++++++++++++++++++------------------- 1 file changed, 37 insertions(+), 34 deletions(-) diff --git a/ersilia/core/tracking.py b/ersilia/core/tracking.py index 1d9b04cd1..d1a25b124 100644 --- a/ersilia/core/tracking.py +++ b/ersilia/core/tracking.py @@ -53,10 +53,13 @@ class RunTracker: def __init__(self): self.time_start = None + self.memory_usage_start = 0 # function to be called before model is run def start_tracking(self): self.time_start = datetime.now() + tracemalloc.start() + self.memory_usage_start = tracemalloc.get_traced_memory()[0] def sample_df(self, df, num_rows, num_cols): """ @@ -101,6 +104,38 @@ def get_file_sizes(self, input_df, output_df): "avg_output_size": output_avg_row_size, } + def check_types(self, resultDf, metadata): + typeDict = {"float64": "Float", "int64": "Int"} + count = 0 + + # ignore key and input columns + dtypesLst = resultDf.loc[:, ~resultDf.columns.isin(["key", "input"])].dtypes + + for i in dtypesLst: + if typeDict[str(i)] != metadata["Output Type"][0]: + count += 1 + + if len(dtypesLst) > 1 and metadata["Output Shape"] != "List": + print("Not right shape. Expected List but got Single") + correct_shape = False + elif len(dtypesLst) == 1 and metadata["Output Shape"] != "Single": + print("Not right shape. Expected Single but got List") + correct_shape = False + else: + print("Output is correct shape.") + correct_shape = True + + print("Output has", count, "mismatched types.\n") + + return {"mismatched_types": count, "correct_shape": correct_shape} + + def get_peak_memory(self): + # Compare memory between peak and amount when we started + peak_memory = tracemalloc.get_traced_memory()[1] - self.memory_usage_start + tracemalloc.stop() + + return peak_memory + def track(self, input, result, meta): """ Tracks the results after a model run. @@ -130,45 +165,13 @@ def track(self, input, result, meta): json_dict["file_sizes"] = self.get_file_sizes(input_dataframe, result_dataframe) + json_dict["peak_memory_use"] = self.get_peak_memory() + json_object = json.dumps(json_dict, indent=4) - print("\nJSON Dictionary:\n", json_object) # log results to persistent tracking file write_persistent_file(json_object) - def check_types(self, resultDf, metadata): - typeDict = {"float64": "Float", "int64": "Int"} - count = 0 - - # ignore key and input columns - dtypesLst = resultDf.loc[:, ~resultDf.columns.isin(["key", "input"])].dtypes - - for i in dtypesLst: - if typeDict[str(i)] != metadata["Output Type"][0]: - count += 1 - - if len(dtypesLst) > 1 and metadata["Output Shape"] != "List": - print("Not right shape. Expected List but got Single") - correct_shape = False - elif len(dtypesLst) == 1 and metadata["Output Shape"] != "Single": - print("Not right shape. Expected Single but got List") - correct_shape = False - else: - print("Output is correct shape.") - correct_shape = True - - print("Output has", count, "mismatched types.\n") - - return {"mismatched_types": count, "correct_shape": correct_shape} - - def start(self): - tracemalloc.start() - self.time_start = tracemalloc.get_traced_memory()[0] - - def track_memory(self): - peak_memory = tracemalloc.get_traced_memory()[1] - self.time_start - print(f"Peak memory: {peak_memory}") - tracemalloc.stop() def write_file(dict): str = json.dump(dict) From 78e9f926fc83645c2adaeabed778faa9674e8320 Mon Sep 17 00:00:00 2001 From: Anthony Cui Date: Tue, 14 Nov 2023 16:37:09 -0500 Subject: [PATCH 4/5] Clean up s3 code --- ersilia/core/tracking.py | 63 ++++++++++++++++++++-------------------- 1 file changed, 32 insertions(+), 31 deletions(-) diff --git a/ersilia/core/tracking.py b/ersilia/core/tracking.py index d1a25b124..9888f8848 100644 --- a/ersilia/core/tracking.py +++ b/ersilia/core/tracking.py @@ -42,6 +42,35 @@ def close_persistent_file(): ) os.rename(PERSISTENT_FILE_PATH, new_file_path) +def upload_to_s3(json_dict, bucket="t4sg-ersilia", object_name=None): + """Upload a file to an S3 bucket + + :param json_dict: JSON object to upload + :param bucket: Bucket to upload to + :param object_name: S3 object name. If not specified then we generate a name based on the timestamp and model id. + :return: True if file was uploaded, else False + """ + + # If S3 object_name was not specified, use file_name + if object_name is None: + object_name = datetime.now().strftime("%Y-%m-%d-%H-%M-%S") + '-' + json_dict["model_id"] + + # Dump JSON into a temporary file to upload + json_str = json.dumps(json_dict, indent=4) + tmp = tempfile.NamedTemporaryFile() + + with open(tmp.name, 'w') as f: + f.write(json_str) + f.flush() + + # Upload the file + s3_client = boto3.client('s3') + try: + s3_client.upload_file(tmp.name, bucket, f"{object_name}.json") + except ClientError as e: + logging.error(e) + return False + return True class RunTracker: """ @@ -167,37 +196,9 @@ def track(self, input, result, meta): json_dict["peak_memory_use"] = self.get_peak_memory() - json_object = json.dumps(json_dict, indent=4) - # log results to persistent tracking file + json_object = json.dumps(json_dict, indent=4) write_persistent_file(json_object) - -def write_file(dict): - str = json.dump(dict) - tmp = tempfile.NamedTemporaryFile() - - with open(tmp.name, 'w') as f: - f.write(str) - -def upload_file(file_name, bucket, object_name=None): - """Upload a file to an S3 bucket - - :param file_name: File to upload - :param bucket: Bucket to upload to - :param object_name: S3 object name. If not specified then file_name is used - :return: True if file was uploaded, else False - """ - - # If S3 object_name was not specified, use file_name - if object_name is None: - object_name = datetime.now().strftime("%Y-%m-%d-%H-%M-%S") + '-' + os.path.basename(file_name) - - # Upload the file - s3_client = boto3.client('s3') - try: - response = s3_client.upload_file(file_name, bucket, object_name) - except ClientError as e: - logging.error(e) - return False - return True \ No newline at end of file + # Upload run stats to s3 + upload_to_s3(json_dict) From d1c2adbbee1192156f35e1ec3fe6e9e06a4ffd7e Mon Sep 17 00:00:00 2001 From: Anthony Cui Date: Tue, 14 Nov 2023 16:37:49 -0500 Subject: [PATCH 5/5] Refactor code --- ersilia/core/tracking.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/ersilia/core/tracking.py b/ersilia/core/tracking.py index 9888f8848..5cb9d1e27 100644 --- a/ersilia/core/tracking.py +++ b/ersilia/core/tracking.py @@ -42,6 +42,7 @@ def close_persistent_file(): ) os.rename(PERSISTENT_FILE_PATH, new_file_path) + def upload_to_s3(json_dict, bucket="t4sg-ersilia", object_name=None): """Upload a file to an S3 bucket @@ -53,18 +54,20 @@ def upload_to_s3(json_dict, bucket="t4sg-ersilia", object_name=None): # If S3 object_name was not specified, use file_name if object_name is None: - object_name = datetime.now().strftime("%Y-%m-%d-%H-%M-%S") + '-' + json_dict["model_id"] + object_name = ( + datetime.now().strftime("%Y-%m-%d-%H-%M-%S") + "-" + json_dict["model_id"] + ) # Dump JSON into a temporary file to upload json_str = json.dumps(json_dict, indent=4) tmp = tempfile.NamedTemporaryFile() - with open(tmp.name, 'w') as f: + with open(tmp.name, "w") as f: f.write(json_str) f.flush() # Upload the file - s3_client = boto3.client('s3') + s3_client = boto3.client("s3") try: s3_client.upload_file(tmp.name, bucket, f"{object_name}.json") except ClientError as e: @@ -72,6 +75,7 @@ def upload_to_s3(json_dict, bucket="t4sg-ersilia", object_name=None): return False return True + class RunTracker: """ This class will be responsible for tracking model runs. It calculates the desired metadata based on a model's