From aa2a4990fd52633aadb350a590086539b70812d3 Mon Sep 17 00:00:00 2001
From: Setepenre
Date: Fri, 21 Jun 2024 13:03:09 -0400
Subject: [PATCH 1/9] Update observer.py

---
 benchmate/benchmate/observer.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/benchmate/benchmate/observer.py b/benchmate/benchmate/observer.py
index 5ead66a5b..aae642511 100644
--- a/benchmate/benchmate/observer.py
+++ b/benchmate/benchmate/observer.py
@@ -59,6 +59,9 @@ def override_return_value(self, function, override):
         else:
             raise RuntimeError("Not running through voir")
 
+    def iterate(self, iterator):
+        return self.loader(iterator)
+
     def loader(self, loader):
         """Wrap a dataloader or an iterable which enable accurate measuring of time spent in the loop's body"""
         self.wrapped = TimedIterator(

From fb56c6f216a4bb359da2162ac3a099b45af9fcfb Mon Sep 17 00:00:00 2001
From: Setepenre
Date: Fri, 21 Jun 2024 13:05:00 -0400
Subject: [PATCH 2/9] Update metrics.py

---
 benchmate/benchmate/metrics.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/benchmate/benchmate/metrics.py b/benchmate/benchmate/metrics.py
index b6ca483c7..4db61c8f9 100644
--- a/benchmate/benchmate/metrics.py
+++ b/benchmate/benchmate/metrics.py
@@ -84,7 +84,10 @@ def record(self, loss):
 
     def materialize(self, loss):
         # synch here is fine
-        return {"loss": loss.item(), "task": self.task}
+        value = loss
+        if hasattr(loss, "item"):
+            value = loss.item()
+        return {"loss": value, "task": self.task}
 
 
 class CPUTimer:

From 0d721a5bc1384de845763ba4b6b7c27ee70d6d70 Mon Sep 17 00:00:00 2001
From: Setepenre
Date: Fri, 21 Jun 2024 13:09:11 -0400
Subject: [PATCH 3/9] Update main.py

---
 benchmarks/_template/main.py | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/benchmarks/_template/main.py b/benchmarks/_template/main.py
index 9169a0f07..ea96a3830 100644
--- a/benchmarks/_template/main.py
+++ b/benchmarks/_template/main.py
@@ -6,14 +6,23 @@
 
 import time
 
-import voir
-from giving import give
+import torchcompat.core as accelerator
+from benchmate.observer import BenchObserver
 
 
 def main():
-    for i in voir.iterate("train", range(10000), report_batch=True, batch_size=64):
-        give(loss=1 / (i + 1))
-        time.sleep(0.1)
+    device = accelerator.fetch_device(0)  # <= This is your cuda device
+
+    observer = BenchObserver()
+    dataloader = [1, 2, 3, 4]
+
+    for epoch in range(10):
+        for i in observer.iterate(dataloader):
+            # avoid .item()
+            # avoid torch.cuda; use accelerator from torchcompat instead
+            # avoid torch.cuda.synchronize or accelerator.synchronize
+            observer.record_loss(loss=1 / (i + 1))
+            time.sleep(0.1)
 
 
 if __name__ == "__main__":

From 75cdf6cecd06962e8f20d12e0df627764967c0ae Mon Sep 17 00:00:00 2001
From: Setepenre
Date: Fri, 21 Jun 2024 13:11:51 -0400
Subject: [PATCH 4/9] Update main.py

---
 benchmarks/_template/main.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/benchmarks/_template/main.py b/benchmarks/_template/main.py
index ea96a3830..5cf8ef76f 100644
--- a/benchmarks/_template/main.py
+++ b/benchmarks/_template/main.py
@@ -13,7 +13,10 @@
 def main():
     device = accelerator.fetch_device(0)  # <= This is your cuda device
 
-    observer = BenchObserver()
+    observer = BenchObserver(
+        event_fn=accelerator.Event,
+        batch_size_fn=lambda batch: 1
+    )
     dataloader = [1, 2, 3, 4]
 
     for epoch in range(10):

From 9919ce4bd978ddf9ca1f1749efc7c500409a3075 Mon Sep 17 00:00:00 2001
From: Setepenre
Date: Fri, 21 Jun 2024 13:14:19 -0400
Subject: [PATCH 5/9] Update metrics.py

---
 benchmate/benchmate/metrics.py | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/benchmate/benchmate/metrics.py b/benchmate/benchmate/metrics.py
index 4db61c8f9..a73c8e03d 100644
--- a/benchmate/benchmate/metrics.py
+++ b/benchmate/benchmate/metrics.py
@@ -84,10 +84,7 @@ def record(self, loss):
 
     def materialize(self, loss):
         # synch here is fine
-        value = loss
-        if hasattr(loss, "item"):
-            value = loss.item()
-        return {"loss": value, "task": self.task}
+        return {"loss": loss.item(), "task": self.task}
 
 
 class CPUTimer:
@@ -135,6 +132,15 @@ def elapsed(self):
         return self._start.elapsed_time(self._end)
 
 
+def default_event():
+    try:
+        import torchcompat.core as accelerator
+        return accelerator.Event
+    except Exception:
+        print("Could not find a device timer")
+        return CPUTimer
+
+
 class TimedIterator:
     """Time the body of a loop, ignoring the time it took to initialize the iterator.`
     The timings are measured using `torch.cuda.Event` to avoid explicit sync.
@@ -200,7 +206,7 @@ def with_give(cls, *args, push=None, **kwargs):
     def __init__(
         self,
        loader,
-        event_fn,
+        event_fn=default_event(),
         rank=0,
         push=file_push(),
         device=None,

From c70edc14ddece55989dd5884e19f411301eb06b6 Mon Sep 17 00:00:00 2001
From: Setepenre
Date: Fri, 21 Jun 2024 13:15:09 -0400
Subject: [PATCH 6/9] Update main.py

---
 benchmarks/_template/main.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/benchmarks/_template/main.py b/benchmarks/_template/main.py
index 5cf8ef76f..5a684896e 100644
--- a/benchmarks/_template/main.py
+++ b/benchmarks/_template/main.py
@@ -13,10 +13,7 @@
 def main():
     device = accelerator.fetch_device(0)  # <= This is your cuda device
 
-    observer = BenchObserver(
-        event_fn=accelerator.Event,
-        batch_size_fn=lambda batch: 1
-    )
+    observer = BenchObserver(batch_size_fn=lambda batch: 1)
     dataloader = [1, 2, 3, 4]
 
     for epoch in range(10):

From 32239f9ae073921a093d88a24611ad260d401d38 Mon Sep 17 00:00:00 2001
From: Setepenre
Date: Fri, 21 Jun 2024 13:19:55 -0400
Subject: [PATCH 7/9] Update main.py

---
 benchmarks/_template/main.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/benchmarks/_template/main.py b/benchmarks/_template/main.py
index 5a684896e..46fed8e8e 100644
--- a/benchmarks/_template/main.py
+++ b/benchmarks/_template/main.py
@@ -14,6 +14,9 @@ def main():
     device = accelerator.fetch_device(0)  # <= This is your cuda device
 
     observer = BenchObserver(batch_size_fn=lambda batch: 1)
+    # optimizer = observer.optimizer(optimizer)
+    # criterion = observer.optimizer(criterion)
+
     dataloader = [1, 2, 3, 4]
 
     for epoch in range(10):
@@ -21,7 +24,12 @@ def main():
             # avoid .item()
             # avoid torch.cuda; use accelerator from torchcompat instead
             # avoid torch.cuda.synchronize or accelerator.synchronize
-            observer.record_loss(loss=1 / (i + 1))
+
+            # y = model(i)
+            # loss = criterion(y)
+            # loss.backward()
+            # optimizer.step()
+
             time.sleep(0.1)
 
 

From 4321c196ab6c6bf796589cee30445105e7ba8990 Mon Sep 17 00:00:00 2001
From: Setepenre
Date: Fri, 21 Jun 2024 13:23:38 -0400
Subject: [PATCH 8/9] Update main.py

---
 benchmarks/_template/main.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/_template/main.py b/benchmarks/_template/main.py
index 46fed8e8e..99f0f0adc 100644
--- a/benchmarks/_template/main.py
+++ b/benchmarks/_template/main.py
@@ -15,7 +15,7 @@ def main():
 
     observer = BenchObserver(batch_size_fn=lambda batch: 1)
     # optimizer = observer.optimizer(optimizer)
-    # criterion = observer.optimizer(criterion)
+    # criterion = observer.criterion(criterion)
 
     dataloader = [1, 2, 3, 4]
 
From 5b2ff63178e457321fc155df41fdd87ebf00f9a1 Mon Sep 17 00:00:00 2001
From: Setepenre
Date: Fri, 21 Jun 2024 13:29:30 -0400
Subject: [PATCH 9/9] Update new_benchmarks.rst

---
 docs/new_benchmarks.rst | 29 +++++++++++++++++++++--------
 1 file changed, 21 insertions(+), 8 deletions(-)

diff --git a/docs/new_benchmarks.rst b/docs/new_benchmarks.rst
index 26356f8c4..058b99c0d 100644
--- a/docs/new_benchmarks.rst
+++ b/docs/new_benchmarks.rst
@@ -78,15 +78,28 @@ The template ``main.py`` demonstrates a simple loop that you can adapt to any sc
 .. code-block:: python
 
+
     def main():
-        for i in voir.iterate("train", range(100), report_batch=True, batch_size=64):
-            give(loss=1/(i + 1))
-            time.sleep(0.1)
-
-* Wrap the training loop with ``voir.iterate``.
-  * ``report_batch=True`` triggers the computation of the number of training samples per second.
-  * Set ``batch_size`` to the batch_size. milabench can also figure it out automatically if you are iterating over the input batches (it will use the first number in the tensor's shape).
-* ``give(loss=loss.item())`` will forward the value of the loss to milabench. Make sure the value is a plain Python ``float``.
+        observer = BenchObserver(batch_size_fn=lambda batch: 1)
+        criterion = observer.criterion(criterion)
+        optimizer = observer.optimizer(optimizer)
+
+        for epoch in range(10):
+            for i in observer.iterate(dataloader):
+                # ...
+                time.sleep(0.1)
+
+* Create a new bench observer; it is used to time the benchmark and measure batch times.
+  * Set ``batch_size_fn`` to a function that computes the right batch size given a batch.
+* ``observer.criterion(criterion)`` will wrap the criterion function so the loss is reported automatically.
+* ``observer.optimizer(optimizer)`` will wrap the optimizer so devices that need special handling can have their logic executed there.
+* Wrap the batch loop with ``observer.iterate``; it takes care of timing the body of the loop and handles early stopping if necessary.
+
+.. note::
+
+   Avoid calls to ``.item()``, ``torch.cuda`` and ``torch.cuda.synchronize()``.
+   To access ``cuda``-related features, use ``accelerator`` from torchcompat.
+   ``accelerator`` is a light wrapper around ``torch.cuda`` that allows a wider range of devices to be used.
 
 If the script takes command line arguments, you can parse them however you like, for
 example with ``argparse.ArgumentParser``. Then, you can add an ``argv`` section in
 ``dev.yaml``, just like this: