diff --git a/.pin/constraints-hpu-torch.txt b/.pin/constraints-hpu-torch.txt index 20f5f2672..de77f3bfd 100644 --- a/.pin/constraints-hpu-torch.txt +++ b/.pin/constraints-hpu-torch.txt @@ -587,7 +587,7 @@ urllib3==1.26.18 # torchx varname==0.10.0 # via giving -voir==0.2.14 +voir==0.2.15 # via # -c .pin/../constraints/hpu.txt # -r benchmarks/accelerate_opt/requirements.in diff --git a/.pin/constraints-xpu-torch.txt b/.pin/constraints-xpu-torch.txt index 1d12ca32a..1ffcf44a1 100644 --- a/.pin/constraints-xpu-torch.txt +++ b/.pin/constraints-xpu-torch.txt @@ -592,11 +592,7 @@ urllib3==1.26.18 # torchx varname==0.10.0 # via giving -<<<<<<< HEAD voir==0.2.15 -======= -voir==0.2.14 ->>>>>>> baa6757f78c08eb64ed139ebec96250f9ef6f180 # via # -c .pin/../constraints/xpu.txt # -r benchmarks/accelerate_opt/requirements.in diff --git a/benchmarks/brax/requirements.cuda.txt b/benchmarks/brax/requirements.cuda.txt index 5e7dc7c3d..38a54509a 100644 --- a/benchmarks/brax/requirements.cuda.txt +++ b/benchmarks/brax/requirements.cuda.txt @@ -432,7 +432,7 @@ varname==0.10.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # giving -voir==0.2.14 +voir==0.2.15 # via # -c .pin/../.pin/constraints-cuda-torch.txt # -c .pin/../constraints/cuda.txt diff --git a/benchmarks/brax/requirements.hpu.txt b/benchmarks/brax/requirements.hpu.txt index ed3084061..d79e7242c 100644 --- a/benchmarks/brax/requirements.hpu.txt +++ b/benchmarks/brax/requirements.hpu.txt @@ -431,7 +431,7 @@ varname==0.10.0 # via # -c .pin/../.pin/constraints-hpu-torch.txt # giving -voir==0.2.14 +voir==0.2.15 # via # -c .pin/../.pin/constraints-hpu-torch.txt # -c .pin/../constraints/hpu.txt diff --git a/benchmarks/brax/requirements.rocm.txt b/benchmarks/brax/requirements.rocm.txt index 6e1503248..c77018b5e 100644 --- a/benchmarks/brax/requirements.rocm.txt +++ b/benchmarks/brax/requirements.rocm.txt @@ -417,7 +417,7 @@ varname==0.10.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # giving -voir==0.2.14 +voir==0.2.15 # 
via # -c .pin/../.pin/constraints-rocm-torch.txt # -c .pin/../constraints/rocm.txt diff --git a/benchmarks/brax/requirements.xpu.txt b/benchmarks/brax/requirements.xpu.txt index 41b63f8a5..75a03f5aa 100644 --- a/benchmarks/brax/requirements.xpu.txt +++ b/benchmarks/brax/requirements.xpu.txt @@ -433,7 +433,7 @@ varname==0.10.0 # via # -c .pin/../.pin/constraints-xpu-torch.txt # giving -voir==0.2.14 +voir==0.2.15 # via # -c .pin/../.pin/constraints-xpu-torch.txt # -c .pin/../constraints/xpu.txt diff --git a/benchmate/benchmate/metrics.py b/benchmate/benchmate/metrics.py index 53bef1ad1..d46c500ab 100644 --- a/benchmate/benchmate/metrics.py +++ b/benchmate/benchmate/metrics.py @@ -54,7 +54,7 @@ def __init__(self, task): self.delayed = [] def append(self, *args, **kwargs): - self.delayed.append(args, kwargs) + self.delayed.append((args, kwargs)) def record(self, *args, **kwargs): """Record data for a future metric. @@ -230,6 +230,7 @@ def __init__( self.raise_stop_program = raise_stop_program # Does TimedIterator raise StopProgram self.profile_instrumentation = False self.overhead = [] + self.previous_overhead = 0 self.loader_init_time = [] self.sub_overhead = 0 @@ -259,7 +260,8 @@ def wrapped(self, iterator): # Time IO wait + batch compute start = self.event_fn(enable_timing=True) start.record() - + self.previous_overhead = 0 + for data in iterator: yield data @@ -268,7 +270,7 @@ def wrapped(self, iterator): end.record() bs = self.deduce_batch_size(data) - self.events.append((start, end, bs, self.overhead[-1])) + self.events.append((start, end, bs, self.previous_overhead)) # Log progress so it looks somewhat responsive self.log_progress() @@ -279,8 +281,15 @@ def wrapped(self, iterator): break start = end - self.overhead.append(ct.elapsed()) - + + # Note: first step does not have overhead because end event is recorded + # before the overhead starts + # Note: It is not sure if the CPU overhead impacts the device at all + # since we avoid sync it is possible the device 
is working during + # the overhead section and that the effective overhead ends up being minimal + self.previous_overhead = ct.elapsed() + self.overhead.append(self.previous_overhead) + self._push() self.earlystop() @@ -323,7 +332,7 @@ def batch_size(self, bs): def _push_time_steps(self): for start, end, bs, overhead in self.events: end.synchronize() - elapsed = (start.elapsed_time(end) - self.sub_overhead * overhead) / self.unit + elapsed = (start.elapsed_time(end)) / self.unit rate = self.batch_size(bs) / elapsed self.log_rate(rate) @@ -337,6 +346,7 @@ def _push_profile_metrics(self): for iterinit in self.loader_init_time: self.message(__iter__=iterinit, units="s", task=self.task) + self.previous_overhead = 0 self.overhead = [] self.loader_init_time = [] diff --git a/constraints/cuda.txt b/constraints/cuda.txt index 56109d809..41588f46a 100644 --- a/constraints/cuda.txt +++ b/constraints/cuda.txt @@ -2,5 +2,5 @@ # # -voir > 0.2.10 +voir >= 0.2.15 torchcompat >= 1.0.0 \ No newline at end of file diff --git a/constraints/hpu.txt b/constraints/hpu.txt index 1dba3a1ee..3cc36920d 100644 --- a/constraints/hpu.txt +++ b/constraints/hpu.txt @@ -3,6 +3,6 @@ # # -voir > 0.2.10 +voir >= 0.2.15 torchcompat >= 1.0.0 diff --git a/constraints/rocm.txt b/constraints/rocm.txt index 1bf0919e8..cdde4e6a1 100644 --- a/constraints/rocm.txt +++ b/constraints/rocm.txt @@ -2,5 +2,5 @@ # # -voir > 0.2.10 +voir >= 0.2.15 torchcompat >= 1.0.0 \ No newline at end of file diff --git a/constraints/xpu.txt b/constraints/xpu.txt index d0cf6bdac..8b8b39db7 100644 --- a/constraints/xpu.txt +++ b/constraints/xpu.txt @@ -10,8 +10,9 @@ torchaudio>=2.1.0a0 intel-extension-for-pytorch>=2.1.10+xpu oneccl_bind_pt==2.1.100+xpu intel-extension-for-pytorch-deepspeed>=2.1.30 +intel-extension-for-openxla>=0.3.0 # # -voir > 0.2.10 +voir >= 0.2.15 torchcompat >= 1.0.0 \ No newline at end of file diff --git a/milabench/_version.py b/milabench/_version.py index 57c79d91e..ced2a5852 100644 --- 
a/milabench/_version.py +++ b/milabench/_version.py @@ -1,5 +1,5 @@ """This file is generated, do not modify""" -__tag__ = "v0.0.6-140-g57343f1" -__commit__ = "57343f10ef2b4ce598011ee308ebd06b4c654495" -__date__ = "2024-06-10 11:52:37 -0400" +__tag__ = "v0.0.10-145-gc151b985" +__commit__ = "c151b98546f32d9c0671507f8526ed13598e3407" +__date__ = "2024-06-11 14:30:04 -0400" diff --git a/milabench/sizer.py b/milabench/sizer.py index 5c206b7a8..bd0bc82c6 100644 --- a/milabench/sizer.py +++ b/milabench/sizer.py @@ -284,6 +284,9 @@ def resolve_argv(pack, argv): "cpu": "gloo" } + if device_count <= 0: + device_count = 1 + context["arch"] = arch context["ccl"] = ccl.get(arch, "gloo") context["cpu_count"] = multiprocessing.cpu_count() diff --git a/scripts/article/run_hpu.sh b/scripts/article/run_hpu.sh index c732b09a4..f6add4850 100644 --- a/scripts/article/run_hpu.sh +++ b/scripts/article/run_hpu.sh @@ -60,6 +60,7 @@ install_prepare() { # # Generate/download datasets, download models etc... + # milabench prepare } diff --git a/scripts/article/run_rocm.sh b/scripts/article/run_rocm.sh index eaafd522f..819374e66 100644 --- a/scripts/article/run_rocm.sh +++ b/scripts/article/run_rocm.sh @@ -27,16 +27,26 @@ install_prepare() { # milabench install + # + # Override/add package to milabench venv here + # which pip + # pip install ... ( . $BENCHMARK_VENV/bin/activate + + # + # Override/add package to the benchmark venv here + # which pip + pip uninstall torch torchvision torchaudio pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.0 ) # # Generate/download datasets, download models etc... + # milabench prepare } diff --git a/scripts/article/run_xpu.sh b/scripts/article/run_xpu.sh index 9e51b0bc0..86c741107 100644 --- a/scripts/article/run_xpu.sh +++ b/scripts/article/run_xpu.sh @@ -44,6 +44,7 @@ install_prepare() { # # Generate/download datasets, download models etc... 
+ # milabench prepare } diff --git a/tests/benchmate/test_timed_iterator.py b/tests/benchmate/test_timed_iterator.py new file mode 100644 index 000000000..0c581a704 --- /dev/null +++ b/tests/benchmate/test_timed_iterator.py @@ -0,0 +1,55 @@ +import time + +import pytest + +from benchmate.metrics import TimedIterator, StopProgram + + +class CPUEvent: + def __init__(self, **kwargs): + self.start = 0 + + def record(self): + self.start = time.time() + + def elapsed_time(self, end): + # should return ms + return (end.start - self.start) * 1000 + + def synchronize(self): + pass + + +def test_wrapper(): + batch = [1, 2] + process_time = 0.1 + + iterable = [(batch, 3) for i in range(10)] + messages = [] + + def push(**kwargs): + nonlocal messages + messages.append(kwargs) + + loader = TimedIterator( + iterable, event_fn=CPUEvent, earlystop=50, raise_stop_program=True, push=push + ) + + with pytest.raises(StopProgram): + for e in range(200): + for i in loader: + time.sleep(process_time) + + assert len(messages) == 117 + + rate_acc = 0 + rate_count = 0 + for msg in messages: + if rate := msg.get("rate"): + rate_acc += rate + rate_count += 1 + + assert rate_count == 50, "Program should stop once we reached the necessary count" + assert ( + abs((rate_acc / rate_count) - len(batch) / process_time) < 0.5 + ), "Computed rate should be close to theorical rate" \ No newline at end of file diff --git a/tests/conftest.py b/tests/conftest.py index 2ff6efc34..10e2d28f6 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,5 +1,6 @@ import os from pathlib import Path +import stat import voir.instruments.gpu as voirgpu @@ -88,6 +89,18 @@ def set_env(): os.environ["MILABENCH_DASH"] = "no" os.environ["MILABENCH_GPU_ARCH"] = backend + # + # milabench expects voir to be installed in the bench venv + # we fake one to use the one we have in the current env + os.makedirs("output/venv/benchio/bin/", exist_ok=True) + voirexec = "output/venv/benchio/bin/voir" + with open(voirexec, "w") as 
fp: + fp.write("#!/bin/bash\n") + fp.write("python -m voir \"$@\"") + + current_permissions = stat.S_IMODE(os.lstat(voirexec).st_mode) + os.chmod(voirexec, current_permissions | (stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH)) + if backend == "mock": oldsmi = voirgpu.DEVICESMI voirgpu.DEVICESMI = MockDeviceSMI() diff --git a/tests/test_capabilities.py b/tests/test_capabilities.py index 468ef0712..6fa7236f0 100644 --- a/tests/test_capabilities.py +++ b/tests/test_capabilities.py @@ -21,7 +21,7 @@ def fake_config(n): def test_capabilties_ok(): pack = BasePackage(fake_config(10)) - assert sync_is_system_capable(pack) + assert sync_is_system_capable(pack) is True def test_capabilties_not_ok(): diff --git a/tests/test_executors.py b/tests/test_executors.py index 4a4bf598a..c6837b005 100644 --- a/tests/test_executors.py +++ b/tests/test_executors.py @@ -197,7 +197,7 @@ def test_njobs_gpus_executor(): acc = 0 for r in proceed(njobs.execute()): if r.event == "start": - assert r.data["command"][0] == "torchrun" + assert r.data["command"][0].endswith("torchrun") acc += 1 print(r) @@ -218,7 +218,7 @@ def test_njobs_gpu_executor(): print(r) if r.event == "start": - assert r.data["command"][0] == "voir" + assert r.data["command"][0].endswith("voir") acc += 1 diff --git a/tests/test_scaler.py b/tests/test_scaler.py index 283048c8b..5d8d561b4 100644 --- a/tests/test_scaler.py +++ b/tests/test_scaler.py @@ -66,6 +66,13 @@ def test_scaler_disabled(multipack): assert pack.argv == [] +def fakeexec(pack): + from milabench.sizer import resolve_argv, scale_argv + sized_args = scale_argv(pack, pack.argv) + final_args = resolve_argv(pack, sized_args) + return final_args + + def test_scaler_enabled(multipack, config): from milabench.config import system_global import contextvars @@ -83,12 +90,13 @@ def update_ctx(): ) sizer_global.set(sizer) system = system_global.get() - system["gpu"]["capacity"] = "41920 MiB" + gpu = system.setdefault("gpu", dict()) + gpu["capacity"] = "41920 MiB" 
ctx.run(update_ctx) for k, pack in multipack.packs.items(): - assert ctx.run(lambda: pack.argv) == ["--batch_size", "232"] + assert ctx.run(lambda: fakeexec(pack)) == ["--batch_size", "232"] # Sizer is only enabled inside the context - assert pack.argv == [] + assert fakeexec(pack) == [] diff --git a/tests/test_summary/test_report.txt b/tests/test_summary/test_report.txt index 7d8474e01..937a59561 100644 --- a/tests/test_summary/test_report.txt +++ b/tests/test_summary/test_report.txt @@ -2,8 +2,8 @@ Source: XXX ================= Benchmark results ================= - fail n perf sem% std% peak_memory score weight -benchio 0 4 7979.82 2.9% 17.2% -1 7979.81831 2.00 +bench | fail | n | perf | sem% | std% | peak_memory | score | weight +benchio | 0 | 4 | 7979.82 | 2.9% | 17.2% | -1 | 7979.82 | 2.00 Scores ------ diff --git a/tests/test_summary/test_report_folder_does_average.txt b/tests/test_summary/test_report_folder_does_average.txt index 3cc299dbf..5abe96e68 100644 --- a/tests/test_summary/test_report_folder_does_average.txt +++ b/tests/test_summary/test_report_folder_does_average.txt @@ -2,8 +2,8 @@ Source: XXX ================= Benchmark results ================= - fail n perf sem% std% peak_memory score weight -benchio 0 6 7878.45 2.5% 18.0% 24456 7878.451302 2.00 +bench | fail | n | perf | sem% | std% | peak_memory | score | weight +benchio | 0 | 6 | 7878.45 | 2.5% | 18.0% | 24456 | 7878.45 | 2.00 Scores ------