#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (c) 2018-2021 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
#
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import glob
import os
import signal
import subprocess
import sys
import platform as system_platform
from argparse import ArgumentParser
from common import base_benchmark_util
from common import platform_util
from common.utils.validators import check_no_spaces, check_volume_mount, check_shm_size
from common.base_model_init import BaseModelInitializer


class LaunchBenchmark(base_benchmark_util.BaseBenchmarkUtil):
    """Launches a benchmarking job based on the specified args."""

    def __init__(self, *args, **kwargs):
        super(LaunchBenchmark, self).__init__(*args, **kwargs)

        self.args, self.unknown_args = self.parse_args()
        try:
            self.validate_args()
        except (IOError, ValueError) as e:
            sys.exit("\nError: {}".format(e))

    def main(self):
        benchmark_scripts = os.path.dirname(os.path.realpath(__file__))
        os_type = system_platform.system()
        use_case = self.get_model_use_case(benchmark_scripts, os_type)
        intelai_models = self.get_model_dir(benchmark_scripts, use_case, os_type)
        intelai_models_common = self.get_model_dir(benchmark_scripts, "common", os_type)
        env_var_dict = self.get_env_vars(benchmark_scripts, use_case, intelai_models,
                                         intelai_models_common, os_type)

        if "Windows" == os_type:
            if os.getenv("PYTHONPATH") is None:
                os.environ["PYTHONPATH"] = os.path.dirname(sys.executable)
            os.environ["PYTHONPATH"] = "{};{};{}".format(
                benchmark_scripts, intelai_models, os.environ["PYTHONPATH"])

        if self.args.docker_image:
            if self.args.framework == 'tensorflow_serving':
                self.run_bare_metal(benchmark_scripts, intelai_models,
                                    intelai_models_common, env_var_dict, os_type)
            elif self.args.framework == 'tensorflow':
                self.run_docker_container(benchmark_scripts, intelai_models,
                                          intelai_models_common, env_var_dict)
        else:
            self.run_bare_metal(benchmark_scripts, intelai_models,
                                intelai_models_common, env_var_dict, os_type)

    def parse_args(self):
        # Additional args that are only used with the launch script
        arg_parser = ArgumentParser(
            parents=[self._common_arg_parser],
            description="Parse args for benchmark interface")

        arg_parser.add_argument(
            "--docker-image",
            help="Specify the docker image/tag to use when running benchmarking within a container. "
                 "If no docker image is specified, then no docker container will be used.",
            dest="docker_image", default=None, type=check_no_spaces)

        arg_parser.add_argument(
            "--volume",
            help="Specify a custom volume to mount in the container, which follows the same format as the "
                 "docker --volume flag (https://docs.docker.com/storage/volumes/). "
                 "This argument can only be used in conjunction with a --docker-image.",
            action="append", dest="custom_volumes", type=check_volume_mount)

        arg_parser.add_argument(
            "--shm-size",
            help="Specify the size of docker /dev/shm. The format is <number><unit>. "
                 "The number must be greater than 0. The unit is optional and can be b (bytes), "
                 "k (kilobytes), m (megabytes), or g (gigabytes).",
            dest="shm_size", default="64m", type=check_shm_size)

        arg_parser.add_argument(
            "--debug",
            help="Launches debug mode, which does not execute start.sh when "
                 "running in a docker container.",
            action="store_true")

        arg_parser.add_argument(
            "--noinstall",
            help="Whether to install packages for a given model when running in docker "
                 "(default --noinstall='False') or on bare metal (default --noinstall='True')",
            dest="noinstall", action="store_true", default=None)

        arg_parser.add_argument(
            "--dry-run",
            help="Shows the call to the model without actually running it",
            dest="dry_run", action="store_true", default=None)

        return arg_parser.parse_known_args()

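    # Usage sketch: the launch-only args above combine with the common benchmark
    # args from BaseBenchmarkUtil. The model name, paths, and image tag below are
    # illustrative placeholders, not values guaranteed to exist in this repo:
    #
    #   python launch_benchmark.py \
    #       --framework tensorflow --model-name resnet50 \
    #       --mode inference --precision int8 \
    #       --in-graph /home/user/resnet50_int8.pb \
    #       --docker-image my-registry/tf-benchmark:latest \
    #       --volume /home/user/data:/data \
    #       --shm-size 8g \
    #       --dry-run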
    def validate_args(self):
        """Validate the args"""
        # validate that we support this framework by checking folder names
        benchmark_dir = os.path.dirname(os.path.realpath(__file__))
        if glob.glob("{}/*/{}".format(benchmark_dir, self.args.framework)) == []:
            raise ValueError("The specified framework is not supported: {}".
                             format(self.args.framework))

        # if neither benchmark_only nor accuracy_only is specified, then enable
        # benchmark_only as the default
        if not self.args.benchmark_only and not self.args.accuracy_only:
            self.args.benchmark_only = True

        # default disable_tcmalloc=False for int8 and disable_tcmalloc=True for other precisions
        if not self.args.disable_tcmalloc:
            self.args.disable_tcmalloc = str(self.args.precision != "int8")

        if self.args.custom_volumes and not self.args.docker_image:
            raise ValueError("Volume mounts can only be used when running in a docker container "
                             "(a --docker-image must be specified when using --volume).")

        if self.args.mode == "inference" and self.args.checkpoint:
            print("Warning: The --checkpoint argument is being deprecated in favor of using frozen graphs.")

    def get_model_use_case(self, benchmark_scripts, os_type):
        """
        Infers the use case based on the directory structure for the specified model.
        """
        args = self.args

        # find the path to the model's benchmarks folder
        search_path = os.path.join(
            benchmark_scripts, "*", args.framework, args.model_name,
            args.mode, args.precision)
        matches = glob.glob(search_path)
        error_str = ""
        if len(matches) > 1:
            error_str = "Found multiple model locations for {} {} {}"
        elif len(matches) == 0:
            error_str = "No model was found for {} {} {}"
        if error_str:
            raise ValueError(error_str.format(args.framework, args.model_name, args.precision))

        # use the benchmarks directory path to find the use case
        if "Windows" == os_type:
            dir_list = matches[0].split("\\")
        else:
            dir_list = matches[0].split("/")

        # find the last occurrence of the framework in the list, then return
        # the element before it in the path, which is the use case
        return next(dir_list[elem - 1] for elem in range(len(dir_list) - 1, -1, -1)
                    if dir_list[elem] == args.framework)

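    # For example (an illustrative path, not a guaranteed layout), a match of
    #   .../benchmarks/image_recognition/tensorflow/resnet50/inference/int8
    # is split into path components, the last "tensorflow" entry is located, and
    # the element before it ("image_recognition") is returned as the use case.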
    def get_model_dir(self, benchmark_scripts, use_case, os_type):
        """
        Finds the path to the optimized model directory in this repo, if it exists.
        """
        # use the models directory as a default
        intelai_models = os.path.join(benchmark_scripts, os.pardir, "models")

        if use_case == "common":
            return os.path.join(intelai_models, "common", self.args.framework)

        # find the intelai_optimized model directory
        args = self.args
        optimized_model_dir = os.path.join(intelai_models, use_case,
                                           args.framework, args.model_name)
        if "Windows" == os_type:
            optimized_model_dir = optimized_model_dir.replace("\\", "/")

        # if we find an optimized model, then we will use that path
        if os.path.isdir(optimized_model_dir):
            intelai_models = optimized_model_dir
        return intelai_models

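    # For instance (hypothetical use case and model name), with
    # use_case="language_modeling" and model_name="my_model", this returns
    # <repo>/models/language_modeling/tensorflow/my_model when that directory
    # exists, and falls back to <repo>/models otherwise.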
    def get_env_vars(self, benchmark_scripts, use_case, intelai_models,
                     intelai_models_common, os_type):
        """
        Sets up dictionary of standard env vars that are used by start.sh
        """
        # Standard env vars
        args = self.args
        python_exe = str(sys.executable if not args.docker_image else "python")
        if "Windows" == os_type:
            python_exe = r'"{}"'.format(python_exe)

        env_var_dict = {
            "ACCURACY_ONLY": args.accuracy_only,
            "BACKBONE_MODEL_DIRECTORY_VOL": args.backbone_model,
            "BATCH_SIZE": args.batch_size,
            "BENCHMARK_ONLY": args.benchmark_only,
            "BENCHMARK_SCRIPTS": benchmark_scripts,
            "CHECKPOINT_DIRECTORY_VOL": args.checkpoint,
            "DATASET_LOCATION_VOL": args.data_location,
            "DATA_NUM_INTER_THREADS": args.data_num_inter_threads,
            "DATA_NUM_INTRA_THREADS": args.data_num_intra_threads,
            "DISABLE_TCMALLOC": args.disable_tcmalloc,
            "DOCKER": str(args.docker_image) if args.docker_image is not None else "",
            "DRY_RUN": str(args.dry_run) if args.dry_run is not None else "",
            "EXTERNAL_MODELS_SOURCE_DIRECTORY": args.model_source_dir,
            "FRAMEWORK": args.framework,
            "INTELAI_MODELS": intelai_models,
            "INTELAI_MODELS_COMMON": intelai_models_common,
            "MODE": args.mode,
            "MODEL_NAME": args.model_name,
            "MPI_HOSTNAMES": args.mpi_hostnames,
            "MPI_NUM_PROCESSES": args.mpi,
            "MPI_NUM_PROCESSES_PER_SOCKET": args.num_mpi,
            "NUMA_CORES_PER_INSTANCE": args.numa_cores_per_instance,
            "NOINSTALL": str(args.noinstall) if args.noinstall is not None else "True" if not args.docker_image else "False",  # noqa: E501
            "NUM_CORES": args.num_cores,
            "NUM_INTER_THREADS": args.num_inter_threads,
            "NUM_INTRA_THREADS": args.num_intra_threads,
            "NUM_TRAIN_STEPS": args.num_train_steps,
            "OUTPUT_RESULTS": args.output_results,
            "PRECISION": args.precision,
            "PYTHON_EXE": python_exe,
            "SOCKET_ID": args.socket_id,
            "TCMALLOC_LARGE_ALLOC_REPORT_THRESHOLD": args.tcmalloc_large_alloc_report_threshold,
            "TF_SERVING_VERSION": args.tf_serving_version,
            "USE_CASE": str(use_case),
            "VERBOSE": args.verbose,
            "WEIGHT_SHARING": args.weight_sharing,
            "SYNTHETIC_DATA": args.synthetic_data,
            "GPU": str(args.gpu)
        }
        # Add custom model args as env vars
        for custom_arg in args.model_args + self.unknown_args:
            if "=" not in custom_arg:
                raise ValueError("Expected model args in the format "
                                 "`name=value` but received: {}".
                                 format(custom_arg))

            split_arg = custom_arg.split("=")
            split_arg[0] = split_arg[0].replace("-", "_").lstrip('_')
            # Convert the arg names to upper case to work on both Windows and Linux systems
            split_arg[0] = split_arg[0].upper()
            env_var_dict[split_arg[0]] = split_arg[1]

        return env_var_dict

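    # As an illustration (hypothetical arg name), a custom model arg passed as
    # `--kmp-blocktime=1` (or `kmp_blocktime=1`) ends up in the dictionary as
    # KMP_BLOCKTIME=1: dashes become underscores, leading underscores are
    # stripped, and the name is upper-cased.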
    def run_bare_metal(self, benchmark_scripts, intelai_models,
                       intelai_models_common, env_var_dict, os_type):
        """
        Runs the model without a container
        """
        # setup volume directories to be the local system directories, since we aren't
        # mounting volumes when running bare metal, but start.sh expects these args
        args = self.args
        workspace = os.path.join(benchmark_scripts, "common", args.framework)
        mount_benchmark = benchmark_scripts
        in_graph_path = args.input_graph
        checkpoint_path = args.checkpoint
        backbone_model_path = args.backbone_model
        dataset_path = args.data_location
        batch_size = args.batch_size
        mount_external_models_source = args.model_source_dir
        mount_intelai_models = intelai_models

        # To launch a TensorFlow Serving benchmark, only the --in-graph arg is needed.
        # It does not support checkpoint files.
        if args.framework == "tensorflow_serving":
            if checkpoint_path:
                raise ValueError("--checkpoint-path arg is not supported with tensorflow serving benchmarking")

            if args.mode != "inference":
                raise ValueError("--mode arg should be set to inference")

            if in_graph_path:
                env_var_dict["IN_GRAPH"] = in_graph_path
            else:
                raise ValueError("--in-graph arg is required to run tensorflow serving benchmarking")

            for env_var_name in env_var_dict:
                os.environ[env_var_name] = str(env_var_dict[env_var_name])

            # We need this env to be set for the platform util
            os.environ["PYTHON_EXE"] = str(sys.executable if not args.docker_image else "python")

            # Get the platform util
            platform_util_obj = platform_util.PlatformUtil(self.args)

            # Configure num_inter_threads and num_intra_threads
            base_obj = BaseModelInitializer(args=self.args, custom_args=[], platform_util=platform_util_obj)
            base_obj.set_num_inter_intra_threads()

            # Update num_inter_threads and num_intra_threads in the env dictionary
            env_var_dict["NUM_INTER_THREADS"] = self.args.num_inter_threads
            env_var_dict["NUM_INTRA_THREADS"] = self.args.num_intra_threads

            # Set OMP_NUM_THREADS
            env_var_dict["OMP_NUM_THREADS"] = self.args.num_intra_threads
        else:
            mount_external_models_source = args.model_source_dir
            mount_intelai_models = intelai_models
            mount_intelai_models_common = intelai_models_common

            # Add env vars with bare metal settings
            env_var_dict["MOUNT_EXTERNAL_MODELS_SOURCE"] = mount_external_models_source
            env_var_dict["MOUNT_INTELAI_MODELS_SOURCE"] = mount_intelai_models
            env_var_dict["MOUNT_INTELAI_MODELS_COMMON_SOURCE"] = mount_intelai_models_common

            if in_graph_path:
                env_var_dict["IN_GRAPH"] = in_graph_path

            if checkpoint_path:
                env_var_dict["CHECKPOINT_DIRECTORY"] = checkpoint_path

            if backbone_model_path:
                env_var_dict["BACKBONE_MODEL_DIRECTORY"] = backbone_model_path

            if dataset_path:
                env_var_dict["DATASET_LOCATION"] = dataset_path

            if batch_size:
                env_var_dict["BATCH_SIZE"] = batch_size

        # if using the default output directory, get the full path
        if args.output_dir == "/models/benchmarks/common/tensorflow/logs":
            args.output_dir = os.path.join(workspace, "logs")

        # Add env vars with bare metal settings
        env_var_dict["WORKSPACE"] = workspace
        env_var_dict["MOUNT_BENCHMARK"] = mount_benchmark
        env_var_dict["OUTPUT_DIR"] = args.output_dir

        # Set env vars for bare metal
        for env_var_name in env_var_dict:
            os.environ[env_var_name] = str(env_var_dict[env_var_name])

        # Run the start script
        start_script = os.path.join(workspace, "start.sh")
        if "Windows" == os_type:
            self._launch_command([os.environ["MSYS64_BASH"], start_script])
        else:
            self._launch_command(["bash", start_script])

    def run_docker_container(self, benchmark_scripts, intelai_models,
                             intelai_models_common, env_var_dict):
        """
        Runs a docker container with the specified image and environment
        variables to start running the benchmarking job.
        """
        args = self.args
        mount_benchmark = "/workspace/benchmarks"
        mount_external_models_source = "/workspace/models"
        mount_intelai_models = "/workspace/intelai_models"
        mount_intelai_models_common = "/workspace/intelai_models_common"
        workspace = os.path.join(mount_benchmark, "common", args.framework)

        mount_output_dir = False
        output_dir = os.path.join(workspace, 'logs')
        if args.output_dir != "/models/benchmarks/common/tensorflow/logs":
            # we only need to mount the log dir when it is not the default workspace folder
            mount_output_dir = True
            output_dir = args.output_dir

        in_graph_dir = os.path.dirname(args.input_graph) if args.input_graph \
            else ""
        in_graph_filename = os.path.basename(args.input_graph) if \
            args.input_graph else ""

        # env vars with docker settings
        env_vars = ["--env", "WORKSPACE={}".format(workspace),
                    "--env", "MOUNT_BENCHMARK={}".format(mount_benchmark),
                    "--env", "MOUNT_EXTERNAL_MODELS_SOURCE={}".format(mount_external_models_source),
                    "--env", "MOUNT_INTELAI_MODELS_SOURCE={}".format(mount_intelai_models),
                    "--env", "MOUNT_INTELAI_MODELS_COMMON_SOURCE={}".format(mount_intelai_models_common),
                    "--env", "OUTPUT_DIR={}".format(output_dir)]

        if args.input_graph:
            env_vars += ["--env", "IN_GRAPH=/in_graph/{}".format(in_graph_filename)]

        if args.data_location:
            env_vars += ["--env", "DATASET_LOCATION=/dataset"]

        if args.batch_size:
            env_vars += ["--env", "BATCH_SIZE={}".format(args.batch_size)]

        if args.checkpoint:
            env_vars += ["--env", "CHECKPOINT_DIRECTORY=/checkpoints"]

        if args.backbone_model:
            env_vars += ["--env", "BACKBONE_MODEL_DIRECTORY=/backbone_model"]

        # Add env vars with common settings
        for env_var_name in env_var_dict:
            env_vars += ["--env", "{}={}".format(env_var_name, env_var_dict[env_var_name])]
        # Add proxy to env variables if any set on host
        for environment_proxy_setting in [
            "http_proxy",
            "ftp_proxy",
            "https_proxy",
            "no_proxy",
        ]:
            if not os.environ.get(environment_proxy_setting):
                continue
            env_vars.append("--env")
            env_vars.append("{}={}".format(
                environment_proxy_setting,
                os.environ.get(environment_proxy_setting)
            ))

        volume_mounts = ["--volume", "{}:{}".format(benchmark_scripts, mount_benchmark),
                         "--volume", "{}:{}".format(args.model_source_dir, mount_external_models_source),
                         "--volume", "{}:{}".format(intelai_models, mount_intelai_models),
                         "--volume", "{}:{}".format(intelai_models_common, mount_intelai_models_common)]

        if mount_output_dir:
            volume_mounts.extend([
                "--volume", "{}:{}".format(output_dir, output_dir)])

        if args.data_location:
            volume_mounts.extend([
                "--volume", "{}:{}".format(args.data_location, "/dataset")])

        if args.checkpoint:
            volume_mounts.extend([
                "--volume", "{}:{}".format(args.checkpoint, "/checkpoints")])

        if args.backbone_model:
            volume_mounts.extend([
                "--volume", "{}:{}".format(args.backbone_model, "/backbone_model")])

        if in_graph_dir:
            volume_mounts.extend([
                "--volume", "{}:{}".format(in_graph_dir, "/in_graph")])

        if args.custom_volumes:
            for custom_volume in args.custom_volumes:
                volume_mounts.extend(["--volume", custom_volume])

        docker_run_cmd = ["docker", "run"]

        # only use -it when debugging, otherwise we might get TTY error
        if args.debug:
            docker_run_cmd.append("-it")

        if args.numa_cores_per_instance is not None or args.socket_id != -1 or \
                args.num_cores != -1 or args.mpi is not None or args.num_mpi > 1:
            docker_run_cmd.append("--privileged")

        docker_shm_size = "--shm-size={}".format(args.shm_size)
        docker_run_cmd = docker_run_cmd + env_vars + volume_mounts + [
            docker_shm_size, "-u", "root:root", "-w",
            workspace, args.docker_image, "/bin/bash"]

        if not args.debug:
            docker_run_cmd.append("start.sh")

        if args.verbose:
            print("Docker run command:\n{}".format(docker_run_cmd))

        self._launch_command(docker_run_cmd)

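    # Roughly (values are illustrative), the assembled command has the shape:
    #
    #   docker run --env WORKSPACE=/workspace/benchmarks/common/tensorflow \
    #       --env MOUNT_BENCHMARK=/workspace/benchmarks ... \
    #       --volume <benchmarks_dir>:/workspace/benchmarks ... \
    #       --shm-size=64m -u root:root -w /workspace/benchmarks/common/tensorflow \
    #       <docker_image> /bin/bash start.sh
    #
    # with -it added for --debug runs (which also drop the trailing start.sh)
    # and --privileged added when NUMA/MPI/core-pinning options are in use.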
    def _launch_command(self, run_cmd):
        """Runs the command that launches the start script in a container or on bare metal, and exits on Ctrl-C."""
        os_type = system_platform.system()
        if "Windows" == os_type:
            p = subprocess.Popen(run_cmd, start_new_session=True)
        else:
            p = subprocess.Popen(run_cmd, preexec_fn=os.setsid)
        try:
            p.communicate()
        except KeyboardInterrupt:
            os.killpg(os.getpgid(p.pid), signal.SIGKILL)


if __name__ == "__main__":
    util = LaunchBenchmark()
    util.main()