
Add a dry command generator #193

Closed
wants to merge 20 commits into from

Conversation

@Delaunay (Collaborator) commented Feb 5, 2024

No description provided.

@Delaunay mentioned this pull request Feb 5, 2024
@Delaunay (Collaborator, Author) commented Feb 5, 2024

Generated bash script

example.bash.txt

@Delaunay (Collaborator, Author) commented Feb 8, 2024

# ---
# Virtual Env
# ===========
export VIRTUAL_ENV="/Tmp/slurm.4123709.0/base/venv/torch"


# ---
# Milabench
# =========
export MILABENCH_DIR_BASE="/Tmp/slurm.4123709.0/base"
export MILABENCH_DIR_VENV="/Tmp/slurm.4123709.0/base/venv/torch"
export MILABENCH_DIR_DATA="/Tmp/slurm.4123709.0/base/data"
export MILABENCH_DIR_RUNS="/Tmp/slurm.4123709.0/base/runs"
export MILABENCH_DIR_EXTRA="/Tmp/slurm.4123709.0/base/extra/torchvision"
export MILABENCH_DIR_CACHE="/Tmp/slurm.4123709.0/base/cache"
export MILABENCH_CONFIG='{"system": {"arch": "cuda", "sshkey": null, "nodes": [{"ip": "127.0.0.1", "main": true, "name": "0", "port": 22, "user": "username", "hostname": "localhost", "aliaslist": [], "ipaddrlist": ["70:b5:e8:f0:5a:08", "fe80::1270:fd03:cd:a394%ibp161s0", "::1", "172.16.9.28", "fe80::72b5:e8ff:fef0:5a08%eno8303", "00:00:00:00:00:00", "00:00:02:5d:fe:80:00:00:00:00:00:00:10:70:fd:03:00:cd:a3:94", "10.20.9.28", "00:00:00:bf:fe:80:00:00:00:00:00:00:10:70:fd:03:00:e6:1b:38", "fe80::1270:fd03:e6:1b38%ibp37s0", "127.0.0.1", "10.20.137.28"], "local": true}], "gpu": {"capacity": "0 MiB"}, "self": {"ip": "127.0.0.1", "main": true, "name": "0", "port": 22, "user": "username", "hostname": "localhost", "aliaslist": [], "ipaddrlist": ["70:b5:e8:f0:5a:08", "fe80::1270:fd03:cd:a394%ibp161s0", "::1", "172.16.9.28", "fe80::72b5:e8ff:fef0:5a08%eno8303", "00:00:00:00:00:00", "00:00:02:5d:fe:80:00:00:00:00:00:00:10:70:fd:03:00:cd:a3:94", "10.20.9.28", "00:00:00:bf:fe:80:00:00:00:00:00:00:10:70:fd:03:00:e6:1b:38", "fe80::1270:fd03:e6:1b38%ibp37s0", "127.0.0.1", "10.20.137.28"], "local": true}}, "dirs": {"base": "/Tmp/slurm.4123709.0/base", "venv": "/Tmp/slurm.4123709.0/base/venv/torch", "data": "/Tmp/slurm.4123709.0/base/data", "runs": "/Tmp/slurm.4123709.0/base/runs", "extra": "/Tmp/slurm.4123709.0/base/extra/torchvision", "cache": "/Tmp/slurm.4123709.0/base/cache"}, "group": "torchvision", "install_group": "torch", "install_variant": "cuda", "run_name": "dev", "enabled": true, "capabilities": {"nodes": 1}, "max_duration": 600, "voir": {"options": {"stop": 60, "interval": "1s"}}, "validation": {"usage": {"gpu_load_threshold": 0.5, "gpu_mem_threshold": 0.5}}, "config_base": "/home/mila/d/delaunap/milabench/config", "config_file": "/home/mila/d/delaunap/milabench/config/standard.yaml", "definition": "/home/mila/d/delaunap/milabench/benchmarks/torchvision", "plan": {"method": "per_gpu"}, "argv": {"--precision": "tf32-fp16", "--lr": 0.01, "--no-stdout": true, "--epochs": 50, "--model": "resnet50", "--batch-size": 64}, "tags": ["classification", "convnet", "resnet", "vision"], "weight": 1.0, "name": "resnet50", "tag": ["resnet50"]}'

source $VIRTUAL_ENV/bin/activate

# ---
# resnet50
# ========
(
  python  -m cProfile -- /home/mila/d/delaunap/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 10 --model resnet50 --batch-size 64
)
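The script above is what a "dry" run emits: exports for the resolved directories, the full config as a JSON blob, then the benchmark command in a subshell. A minimal sketch of such a generator could look like the following; the helper name `emit_dry_script` and its parameters are hypothetical, not milabench's actual API, and only the MILABENCH_* variable names mirror the generated output.

```python
import io
import json
import shlex

def emit_dry_script(dirs, config, argv, out):
    """Write a standalone bash script instead of executing the commands.

    Hypothetical sketch: `dirs`, `config`, and `argv` are assumed inputs
    matching the shape of the generated script above.
    """
    lines = ["#!/bin/bash", ""]
    for name, path in dirs.items():
        # One export per resolved directory, shell-quoted to stay safe.
        lines.append(f"export MILABENCH_DIR_{name.upper()}={shlex.quote(path)}")
    # The full benchmark config is embedded as a single-quoted JSON blob.
    lines.append(f"export MILABENCH_CONFIG={shlex.quote(json.dumps(config))}")
    lines.append("")
    lines.append('source "$MILABENCH_DIR_VENV/bin/activate"')
    # The benchmark command runs in a subshell, as in the generated script.
    lines.append("(")
    lines.append("  " + " ".join(shlex.quote(a) for a in argv))
    lines.append(")")
    out.write("\n".join(lines) + "\n")

buffer = io.StringIO()
emit_dry_script(
    dirs={"base": "/Tmp/base", "venv": "/Tmp/base/venv/torch"},
    config={"name": "resnet50"},
    argv=["python", "main.py", "--epochs", "10"],
    out=buffer,
)
print(buffer.getvalue())
```

`shlex.quote` keeps the JSON config safe as a single shell word, which is why the generated script can carry the whole config in one environment variable.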


  Epoch: 691.5778216147169: 5.9226884841918945
  Epoch: 765.4521793476132: 5.351085424423218
  Epoch: 737.8570982393342: 5.551210403442383
  Epoch: 729.1394473683574: 5.617581129074097
  Epoch: 713.6589541618099: 5.739436149597168
  Epoch: 732.8301529058771: 5.589289665222168
  Epoch: 719.4321779253913: 5.693378925323486
  Epoch: 729.3648331902754: 5.615845203399658
  Epoch: 724.8426908086968: 5.650881290435791
  Epoch: 719.4161505716805: 5.693505764007568
Train speed: 725.7633960699875
         6232948 function calls (5938733 primitive calls) in 59.084 seconds

   Ordered by: cumulative time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
      160    0.002    0.000  107.938    0.675 util.py:205(__call__)
       80    0.012    0.000  104.343    1.304 util.py:463(close_fds)
        1    0.005    0.005   56.927   56.927 main.py:119(main)
3060/2853    0.027    0.000   56.424    0.020 {built-in method posix.close}
       10    0.001    0.000   56.421    5.642 main.py:62(train_epoch)
    80/10    0.011    0.000   56.409    5.641 threading.py:1016(_bootstrap)
    80/10    0.050    0.001   56.409    5.641 threading.py:1056(_bootstrap_inner)
    80/10    0.020    0.000   53.051    5.305 threading.py:999(run)
    80/10    0.045    0.001   53.050    5.305 queues.py:226(_feed)
1440/1300    0.179    0.000   47.555    0.037 connection.py:174(close)
1460/1320    0.003    0.000   47.397    0.036 connection.py:376(_close)
      640    0.009    0.000   17.576    0.027 grad_scaler.py:353(step)
      640    0.005    0.000   17.082    0.027 grad_scaler.py:341(_maybe_opt_step)
      642    0.002    0.000   16.484    0.026 {built-in method builtins.sum}
     1280    0.001    0.000   16.482    0.013 grad_scaler.py:349(<genexpr>)
      660   16.481    0.025   16.481    0.025 {method 'item' of 'torch._C.TensorBase' objects}
   790/22    0.078    0.000   15.034    0.683 threading.py:323(wait)
  4912/42    0.247    0.000   15.020    0.358 {method 'acquire' of '_thread.lock' objects}
      661    8.565    0.013   12.198    0.018 helpers.py:26(iterate)
4560/4471    0.032    0.000    9.338    0.002 connection.py:182(send_bytes)
     4160    6.669    0.002    6.683    0.002 {method 'to' of 'torch._C.TensorBase' objects}
117760/1280    0.105    0.000    6.085    0.005 module.py:1507(_wrapped_call_impl)
117760/1280    0.222    0.000    6.082    0.005 module.py:1513(_call_impl)
      640    0.001    0.000    5.654    0.009 resnet.py:284(forward)
      640    0.014    0.000    5.653    0.009 resnet.py:266(_forward_impl)
      640    0.005    0.000    5.222    0.008 _tensor.py:463(backward)
      640    0.006    0.000    5.216    0.008 __init__.py:164(backward)
      640    5.192    0.008    5.192    0.008 {method 'run_backward' of 'torch._C._EngineBase' objects}
5120/2560    0.021    0.000    5.107    0.002 container.py:215(forward)
    10240    0.247    0.000    5.057    0.000 resnet.py:143(forward)
      248    0.006    0.000    4.588    0.019 __init__.py:1(<module>)
      720    0.007    0.000    4.210    0.006 connection.py:1121(wait)
      650    0.002    0.000    4.161    0.006 helpers.py:126(get_batch)
      720    0.004    0.000    4.099    0.006 selectors.py:402(select)
      720    4.092    0.006    4.093    0.006 {method 'poll' of 'select.poll' objects}
      640    0.005    0.000    3.938    0.006 dataloader.py:1299(_next_data)
      640    0.002    0.000    3.893    0.006 dataloader.py:1266(_get_data)
      640    0.002    0.000    3.891    0.006 dataloader.py:1120(_try_get_data)
      640    0.006    0.000    3.889    0.006 queues.py:98(get)
12859/11099    0.021    0.000    3.822    0.000 {built-in method builtins.next}
      640    0.008    0.000    3.362    0.005 dataloader.py:626(__next__)
      640    0.002    0.000    2.850    0.004 connection.py:253(poll)
      640    0.002    0.000    2.848    0.004 connection.py:439(_poll)
       80    0.003    0.000    2.837    0.035 process.py:110(start)
       80    0.001    0.000    2.833    0.035 context.py:222(_Popen)
       80    0.001    0.000    2.832    0.035 context.py:279(_Popen)
       80    0.001    0.000    2.831    0.035 popen_fork.py:15(__init__)
       80    0.002    0.000    2.827    0.035 popen_fork.py:62(_launch)
       80    2.821    0.035    2.822    0.035 {built-in method posix.fork}
    33920    0.038    0.000    2.266    0.000 conv.py:459(forward)
    33920    0.044    0.000    2.201    0.000 conv.py:451(_conv_forward)
      800    0.009    0.000    2.195    0.003 dataloader.py:1348(_try_put_index)
      720    2.068    0.003    2.172    0.003 queues.py:86(put)
    33920    2.156    0.000    2.157    0.000 {built-in method torch.conv2d}
    33920    0.123    0.000    2.034    0.000 batchnorm.py:141(forward)
       10    0.000    0.000    1.385    0.138 dataloader.py:1478(__del__)
       10    0.001    0.000    1.384    0.138 dataloader.py:1401(_shutdown_workers)
       80    0.001    0.000    1.379    0.017 process.py:142(join)
       80    0.000    0.000    1.378    0.017 popen_fork.py:36(wait)
    33920    0.086    0.000    1.318    0.000 functional.py:2451(batch_norm)
5840/5751    0.013    0.000    1.131    0.000 connection.py:406(_send_bytes)
    33920    1.125    0.000    1.125    0.000 {built-in method torch.batch_norm}
5840/5751    0.018    0.000    1.080    0.000 connection.py:381(_send)
      640    0.030    0.000    1.018    0.002 {built-in method _pickle.loads}
     1280    0.010    0.000    0.940    0.001 reductions.py:494(rebuild_storage_fd)
     1280    0.005    0.000    0.883    0.001 resource_sharer.py:55(detach)
     1280    0.009    0.000    0.790    0.001 resource_sharer.py:81(get_connection)
        1    0.000    0.000    0.766    0.766 convnext.py:1(<module>)
        1    0.000    0.000    0.760    0.760 poolers.py:1(<module>)
5840/5751    0.532    0.000    0.689    0.000 {built-in method posix.write}
     1280    0.005    0.000    0.688    0.001 connection.py:509(Client)
        1    0.000    0.000    0.684    0.684 roi_align.py:1(<module>)
        1    0.000    0.000    0.683    0.683 convert_frame.py:1(<module>)
      637    0.015    0.000    0.592    0.001 optimizer.py:368(wrapper)
      637    0.013    0.000    0.530    0.001 optimizer.py:58(_use_grad)
       12    0.000    0.000    0.514    0.043 _jit_internal.py:890(_overload)
      637    0.024    0.000    0.514    0.001 sgd.py:55(step)
      640    0.017    0.000    0.482    0.001 grad_scaler.py:285(unscale_)
1960/1832    0.003    0.000    0.469    0.000 <frozen importlib._bootstrap>:806(module_from_spec)
    33920    0.468    0.000    0.468    0.000 {method 'add_' of 'torch._C.TensorBase' objects}
     1280    0.006    0.000    0.464    0.000 connection.py:948(answer_challenge)
       29    0.001    0.000    0.454    0.016 utils.py:1(<module>)
     4480    0.006    0.000    0.450    0.000 connection.py:208(recv_bytes)
    31360    0.021    0.000    0.449    0.000 activation.py:100(forward)
     4480    0.008    0.000    0.443    0.000 connection.py:429(_recv_bytes)
    53/32    0.000    0.000    0.438    0.014 <frozen importlib._bootstrap_external>:1286(create_module)
    53/32    0.369    0.007    0.438    0.014 {built-in method _imp.create_dynamic}
     8960    0.011    0.000    0.431    0.000 connection.py:390(_recv)
    31360    0.021    0.000    0.428    0.000 functional.py:1462(relu)
     8960    0.416    0.000    0.416    0.000 {built-in method posix.read}
      640    0.254    0.000    0.412    0.001 grad_scaler.py:232(_unscale_grads_)
      641    0.007    0.000    0.411    0.001 _compile.py:20(inner)
    31360    0.397    0.000    0.397    0.000 {built-in method torch.relu_}
 2099/222    0.003    0.000    0.387    0.002 <frozen importlib._bootstrap>:1390(_handle_fromlist)
        1    0.000    0.000    0.357    0.357 allowed_functions.py:1(<module>)
        1    0.000    0.000    0.337    0.337 symbolic_shapes.py:1(<module>)
        5    0.000    0.000    0.325    0.065 functions.py:1(<module>)
      642    0.009    0.000    0.311    0.000 eval_frame.py:454(_fn)
     1902    0.011    0.000    0.306    0.000 <frozen importlib._bootstrap_external>:1061(get_code)
       90    0.003    0.000    0.296    0.003 context.py:100(Queue)
       90    0.002    0.000    0.292    0.003 queues.py:37(__init__)
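The trace above comes from wrapping the benchmark in `python -m cProfile`. For anyone reproducing this analysis, a small self-contained sketch of collecting and printing the same "Ordered by: cumulative time" view with the standard library (the `busy_work` function is a stand-in workload, not the benchmark itself):

```python
import cProfile
import io
import pstats

def busy_work():
    # Stand-in workload; the real trace above profiles the resnet50 benchmark.
    return sum(i * i for i in range(100_000))

profiler = cProfile.Profile()
profiler.enable()
busy_work()
profiler.disable()

stream = io.StringIO()
stats = pstats.Stats(profiler, stream=stream)
# Same ordering as the table above ("Ordered by: cumulative time").
stats.sort_stats("cumulative").print_stats(10)
report = stream.getvalue()
print(report)
```

Sorting by cumulative time is what surfaces wrappers like `util.py:205(__call__)` at the top even when their own `tottime` is negligible.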

pierre.delaunay added 2 commits February 9, 2024 14:12
@Delaunay (Collaborator, Author) commented:

Possible benchmarks whose standard deviation could be reduced with an Infinite Random Sampler:

  • stargan
  • dlrm
  • convnext_large-fp16
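The idea behind an infinite random sampler is to remove epoch boundaries: the loader yields random indices forever and the benchmark stops on a fixed step count, avoiding per-epoch worker setup/teardown noise. A minimal sketch, with a hypothetical class name and parameters (not milabench's or PyTorch's actual API):

```python
import random
from itertools import islice

class InfiniteRandomSampler:
    """Yield dataset indices forever instead of epoch-sized passes.

    Hypothetical sketch of the "Infinite Random Sampler" idea mentioned
    above; a real DataLoader sampler would follow the same __iter__ shape.
    """

    def __init__(self, dataset_size, seed=0):
        self.dataset_size = dataset_size
        self.rng = random.Random(seed)

    def __iter__(self):
        while True:  # never exhausts: the consumer decides when to stop
            yield self.rng.randrange(self.dataset_size)

sampler = InfiniteRandomSampler(dataset_size=1_000, seed=42)
indices = list(islice(sampler, 8))  # draw a fixed number of steps
print(indices)
```

Because the stream never ends, run length is controlled entirely by the consumer (`islice` here, a step counter in a benchmark), which makes timing windows uniform across runs.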

@Delaunay (Collaborator, Author) commented:

example.bash.txt

@Delaunay Delaunay closed this Jul 3, 2024
@Delaunay Delaunay deleted the overhead branch July 3, 2024 19:16