run.py
from datetime import datetime, timezone
import subprocess
import argparse
import time
import os
import signal
import uuid
import sys
import json
sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), "src"))
import logger
import util
import hf
import hw
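# run.py drives one benchmark run end to end: start the metrics collector (metrics.py), pause to
# capture baseline hardware metrics, launch the model runner (model.py), wait for it to finish,
# stop the collector, then merge both outputs into a single report JSON under reports/.
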
# config arguments
parser = argparse.ArgumentParser(description='Run performance benchmark for an LLM model')
parser.add_argument('--name', type=str, default=None, help='name of this performance benchmark run')
parser.add_argument('--config', type=str, default=None, help='path to config file that will be used for the performance benchmark')
parser.add_argument('--loops', type=int, default=1, help='number of times the performance benchmark will be run (default=1)')
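# Example invocation (the config path and name below are illustrative, not taken from the repo):
#   python3 run.py --config configs/example.json --name my_benchmark --loops 3
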
def main(name=None, config_path=None):
    ID = str(uuid.uuid4())
    logger.info(f"This performance run's ID is {ID} with name={name}", True)

    # Get the current script path
    current_script_path = os.path.dirname(os.path.abspath(__file__))

    if config_path is None:
        raise Exception("please provide a path to a test config file (json)")
    elif not os.path.isfile(str(config_path)):
        logger.error(f"[{ID}] Config path {config_path} does not exist! Exiting...", True)
        sys.exit(1)

    config = util.read_json(config_path)
    logger.info(f"[{ID}] Loaded config file {config_path} for this benchmark run, with the following configuration: {config}", True)
    # NOTE: make sure a python environment named "env" is created in the same repo as this script
    env_path = os.path.join(current_script_path, "env/bin/python3")
    if not os.path.isfile(env_path):
        logger.critical(f"[{ID}] python environment {env_path} does not exist, please create it!", True)
        sys.exit(1)

    # TODO: (11-24-2023) this is commented out because not every LLM framework uses HuggingFace or the same model name(s)
    # TODO: (11-24-2023) a solution for this needs to be found or this needs to get ripped out entirely
    # logger.info(f"[{ID}] checking if model exists and is downloaded locally...", True)
    # local_hf_exists = hf.get_hf_model(str(config["model"]))
    # if local_hf_exists == False:
    #     logger.critical(f"[{ID}] failed to download model {config['model']}, please look into this, exiting...", True)
    #     sys.exit(1)
    ################################################################################################
    logger.info(f"[{ID}] Starting metrics collector...", True)
    try:
        collecting_process = subprocess.Popen([env_path, os.path.join(current_script_path, "metrics.py"),
                                               '--time-delay', str(config["time_delay"]),
                                               "--uuid", str(ID)
                                               ])
        logger.info(f"[{ID}] the metrics collector is running with a PID of {collecting_process.pid}", True)
    except Exception as err:
        logger.error(f"[{ID}] failed to run the metrics collector due to error: {err}, so exiting...", True)
        sys.exit(1)
    ################################################################################################
    logger.info(f"[{ID}] Initiated {config['model_start_pause']} second pre-model-start pause to gather hardware metrics BEFORE the model is activated", True)
    time.sleep(config["model_start_pause"])
logger.info(f"[{ID}] Activating model {config['model']} with following parameters: {str(config)}", True)
try:
model_running_process = subprocess.Popen([env_path, os.path.join(current_script_path, "model.py"),
"--framework", str(config.get("framework")),
"--max_length", str(config.get("max_length")),
"--temperature", str(config.get("temperature")),
"--top_k", str(config.get("top_k")),
"--top_p", str(config.get("top_p")),
"--num_return_sequences", str(config.get("num_return_sequences")),
"--uuid", str(ID),
"--prompt", str(config.get("prompt")),
"--model", str(config.get("model")),
"--device", str(config.get("device")),
"--dtype", str(config.get("dtype"))
])
logger.info(f"[{ID}] model {config['model']} is running with a PID of {model_running_process.pid}", True)
except Exception as err:
logger.error(f"[{ID}] failed to run model {config['model']} due to error: {err}", True)
logger.error(f"[{ID}] attempting to kill metrics collector due to model failing to run", True)
collecting_process.send_signal(signal.SIGTERM)
collecting_process.wait()
sys.exit(1)
    ################################################################################################
    logger.info(f"[{ID}] waiting for model {config['model']} to finish running...", True)
    model_running_process.wait()
    logger.info(f"[{ID}] model {config['model']} finished running! no longer waiting!", True)

    logger.info(f"[{ID}] Initiated {config['model_end_pause']} second post-model-end pause to gather hardware metrics AFTER the model has completed its run", True)
    time.sleep(config["model_end_pause"])

    logger.info(f"[{ID}] Kill signal has been sent to the metrics collector, it should finish running soon...", True)
    collecting_process.send_signal(signal.SIGTERM)
    collecting_process.wait()
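    # NOTE: at this point metrics.py and model.py are expected to have written per-run output files
    # tagged with this run's ID (the *_metrics.json and *_model.json suffixes checked below);
    # util.get_id_files is assumed to return the full paths of exactly those files.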
    exported_files_paths = util.get_id_files(ID, current_script_path)
    if len(exported_files_paths) != 2:
        logger.critical(f"[{ID}] The metrics-collector and model have completed their runs BUT there are {len(exported_files_paths)} exported data files instead of 2, look into this, exiting...", True)
        sys.exit(1)

    # get full file paths for metrics data file & model data file
    metrics_data = None
    model_data = None
    for file in exported_files_paths:
        if "_metrics.json" in file:
            metrics_data = file
        elif "_model.json" in file:
            model_data = file
        else:
            logger.critical(f"[{ID}] Of the expected data output files, this file has an unexpected file 'extension': {file}", True)
            sys.exit(1)
    # create reports/ directory if it does not exist
    reports_path = os.path.join(current_script_path, "reports")
    if not os.path.exists(reports_path):
        os.makedirs(reports_path)

    # build filepath for final report file
    final_data_path = f'report_{datetime.now(timezone.utc).strftime("%Y-%m-%d_%H-%M-%S.%f_utc")}_{ID}.json'
    if name is not None:
        final_data_path = f"{name}_{final_data_path}"
    final_data_path = os.path.join(reports_path, final_data_path)

    final_dataset = {
        "model": util.read_json(model_data),
        "test_env": {
            "params": config,
            "commit": util.get_current_commit(),
            "hardware": hw.get_all(static_only=True)
        },
        "metric": util.read_json(metrics_data)
    }

    # export file data/results
    util.write_json(final_data_path, final_dataset)

    # delete exported data files from metrics-collector and model-runner
    # NOTE: we have to be careful here
    util.delete_file(model_data)
    util.delete_file(metrics_data)
    logger.warning(f"[{ID}] Deleted exported sub-data files: {model_data} & {metrics_data}", True)

    logger.info(f"[{ID}] ==> Created final report from this performance benchmark to file: {final_data_path}", True)

    # TODO: returning the final output data's filepath for now
    return final_data_path
if __name__ == "__main__":
    args = parser.parse_args()
    loops = int(args.loops)
    if loops < 1:
        raise Exception("loops MUST be greater than or equal to 1!")

    # single benchmark run
    if loops <= 1:
        start_time = time.time()
        main(name=args.name, config_path=args.config)
        runtime = time.time() - start_time
        logger.info(f"(single) Total Runtime: {runtime} seconds", True)
        sys.exit(0)

    # multiple benchmark runs
    start_time = time.time()
    all_filepaths = []
    for i in range(int(args.loops)):
        i_name = f"run_{i}"
        if args.name is not None:
            i_name = f"{args.name}_{i_name}"
        logger.info(f"Run {i+1}/{args.loops} for performance benchmark", True)
        filepath = main(name=i_name, config_path=args.config)
        all_filepaths.append(filepath)
    logger.info(f"==> Multi-run completed for performance benchmark. A total of {args.loops} runs were done and the following data was exported: {all_filepaths}", True)
    runtime = time.time() - start_time
    logger.info(f"(multiple) Total Runtime: {runtime} seconds", True)
    sys.exit(0)