From 17d2db5b3f6c4bd7a7c36278a628b318688fba56 Mon Sep 17 00:00:00 2001
From: "pierre.delaunay"
Date: Thu, 6 Jun 2024 11:23:22 -0400
Subject: [PATCH] Install benchmate after requirements

---
 docs/flow.rst     | 234 ++++++++++++++++++++++++++++++++++++----------
 milabench/pack.py |   6 +-
 2 files changed, 186 insertions(+), 54 deletions(-)

diff --git a/docs/flow.rst b/docs/flow.rst
index b196f1b29..d4c00138d 100644
--- a/docs/flow.rst
+++ b/docs/flow.rst
@@ -1,21 +1,22 @@
-Milabench Setup Overview
-------------------------
-
-
-
+Milabench Overview
+------------------
 
 .. code-block:: txt
 
-    MILABENCH_BASE=workdir
+    MILABENCH_BASE=workdir/results
 
     workdir
    ├── milabench
    │   ├── benchmarks                        # Each benchmark is inside a folder
    │   │   ├── torchvision
-   |   |   |    ├── benchfile.py             # Benchmark configuration (source to checkout, script to runs etc...)
-   |   |   |    ├── voirfile.py              # Instrumentation to insert timers
-   |   |   |    ├── prepare.py               # Prepare script executed to fetch datasets, download pretrained models
-   |   |   |    └── main.py                  # benchmark script to be ran
+   |   |   |    ├── benchfile.py             # Benchmark configuration (source to checkout, script to run, etc.)
+   |   |   |    ├── voirfile.py              # Instrumentation to insert timers
+   |   |   |    ├── prepare.py               # Prepare script executed to fetch datasets, download pretrained models
+   |   |   |    ├── main.py                  # Benchmark script to be run
+   |   |   |    ├── requirements.in          # base requirements
+   |   |   |    ├── requirements.cuda.txt    # pinned requirements for cuda
+   |   |   |    ├── requirements.rocm.txt    # pinned requirements for rocm
+   |   |   |    └── requirements.xpu.txt     # pinned requirements for xpu
    |   |   └── timm
    |   |        ├── benchfile.py             # Benchmark configuration (source to checkout, script to runs etc...)
    |   |        ├── voirfile.py              # Instrumentation to insert timers
    |   |        ├── prepare.py               # Prepare script executed to fetch datasets, download pretrained models
@@ -24,9 +25,10 @@ Milabench Setup Overview
    │   ├── benchmate                         # benchmate module
    │   ├── milabench                         # milabench module
    │   ├── constraints                       # pip constraints for different vendors
-   │   └── config                            # benchmark suite configuration
+   │   └── config                            # benchmark suite configurations
+   │        └── standard.yaml                # <= MILABENCH_CONFIG
    ├── env                                   # virtual environment where milabench is installed
-   └── results
+   └── results                               # <= MILABENCH_BASE
        ├── data                              # Datasets, pre-trained models
        ├── extra
        ├── venv                              # Benchmark virtual environments
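
The two markers in the tree, `MILABENCH_CONFIG` (the benchmark suite configuration, e.g. `config/standard.yaml`) and `MILABENCH_BASE` (where datasets, virtual environments and run outputs are written), are the main entry points a user sets. The sketch below only illustrates how the folders above hang off `$MILABENCH_BASE`; the `layout` helper is hypothetical and is not milabench code.

.. code-block:: python

    # Illustrative sketch only: mirror the folder layout shown in the tree above.
    import os
    from pathlib import Path

    def layout(env=None):
        env = os.environ if env is None else env
        base = Path(env["MILABENCH_BASE"]).expanduser()
        return {
            "config": Path(env["MILABENCH_CONFIG"]).expanduser(),  # e.g. config/standard.yaml
            "data": base / "data",      # datasets, pre-trained models
            "venv": base / "venv",      # per-benchmark virtual environments
            "extra": base / "extra",
            "runs": base / "runs",      # logs and metrics, one folder per run
        }

    print(layout({"MILABENCH_BASE": "workdir/results",
                  "MILABENCH_CONFIG": "workdir/milabench/config/standard.yaml"}))
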
@@ -52,45 +54,175 @@ Milabench is configured using a yaml file that specify where are the benchmark a
 
 .. code-block:: yaml
 
-  _defaults:
-    max_duration: 600                 # Bench time out
-    voir:
-      options:
-        stop: 60                      # Bench stops after gathering 60 observations
-        interval: "1s"                # Gathering interval
-
-    validation:                       # Validation (disabled by default)
-      usage:
-        gpu_load_threshold: 0.5       # ensure GPU load is higher than 50%
-        gpu_mem_threshold: 0.5        # ensure GPU memory is higher than 50%
-
-  _torchvision:
-    inherits: _defaults               # base configuration
-    definition: ../benchmarks/torchvision   # benchmark definition location
-    group: torchvision
-    install_group: torch              # venv name to use for this benchmark
-    plan:                             # Specify how the benchmark is scheduled
-      method: per_gpu                 # `per_gpu` means it will spawn one bench per GPU
-    argv:                             # arguments to forward
-      --precision: 'tf32-fp16'
-      --lr: 0.01
-      --no-stdout: true
-      --epochs: 50
-      --num-workers: 8
-
-  resnet50:                           # benchmark name "_" are "private" and never run
-    inherits: _torchvision
-    tags:                             # selection tags
-      - vision
-      - classification
-      - convnet
-      - resnet
-
-    argv:
-      --model: resnet50
-      --batch-size: 64
-      --num-workers: "{cpu_per_gpu}"  # Placeholder variable to be resolved
+    _defaults:
+      max_duration: 600               # Bench timeout
+      voir:
+        options:
+          stop: 60                    # Bench stops after gathering 60 observations
+          interval: "1s"              # Gathering interval
+
+      validation:                     # Validation (disabled by default)
+        usage:
+          gpu_load_threshold: 0.5     # ensure GPU load is higher than 50%
+          gpu_mem_threshold: 0.5      # ensure GPU memory is higher than 50%
+
+    _torchvision:
+      inherits: _defaults             # base configuration
+      definition: ../benchmarks/torchvision   # benchmark definition location
+      group: torchvision
+      install_group: torch            # venv name to use for this benchmark
+      plan:                           # Specify how the benchmark is scheduled
+        method: per_gpu               # `per_gpu` means it will spawn one bench per GPU
+      argv:                           # arguments to forward
+        --precision: 'tf32-fp16'
+        --lr: 0.01
+        --no-stdout: true
+        --epochs: 50
+        --num-workers: 8
+
+    resnet50:                         # benchmark name; names starting with "_" are "private" and never run
+      inherits: _torchvision
+      tags:                           # selection tags
+        - vision
+        - classification
+        - convnet
+        - resnet
+
+      argv:
+        --model: resnet50
+        --batch-size: 64
+        --num-workers: "{cpu_per_gpu}"   # Placeholder variable to be resolved
+
+    # milabench can also define matrix jobs
+    resnet-matrix-noio:
+      matrix:
+        batch-size: [32, 64, 128, 256, 512, 1024]
+
+      job:
+        name: 'resnet50-noio-bs{batch-size}'
+        inherits: _resnet50
+        argv:
+          --batch-size: '{batch-size}'
+          --synthetic-data: true
+          --fixed-batch: true
+
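
The `resnet-matrix-noio` entry above relies on milabench expanding the `matrix` values and resolving `{batch-size}`-style placeholders (the same mechanism behind `{cpu_per_gpu}` earlier). The snippet below is a rough, self-contained illustration of such an expansion; the `resolve` and `expand_matrix` helpers are hypothetical and do not reproduce milabench's actual configuration code.

.. code-block:: python

    # Illustrative sketch only: expand a matrix entry into concrete jobs.
    import re
    from itertools import product

    def resolve(value, context):
        """Substitute {name} placeholders in strings with values from context."""
        if not isinstance(value, str):
            return value
        return re.sub(r"\{([^}]+)\}", lambda m: str(context[m.group(1)]), value)

    def expand_matrix(entry):
        """Yield one concrete job definition per combination of matrix values."""
        keys = sorted(entry["matrix"])
        for combo in product(*(entry["matrix"][k] for k in keys)):
            context = dict(zip(keys, combo))
            job = entry["job"]
            yield {
                "name": resolve(job["name"], context),
                "inherits": job["inherits"],
                "argv": {k: resolve(v, context) for k, v in job["argv"].items()},
            }

    resnet_matrix_noio = {
        "matrix": {"batch-size": [32, 64, 128, 256, 512, 1024]},
        "job": {
            "name": "resnet50-noio-bs{batch-size}",
            "inherits": "_resnet50",
            "argv": {
                "--batch-size": "{batch-size}",
                "--synthetic-data": True,
                "--fixed-batch": True,
            },
        },
    }

    for job in expand_matrix(resnet_matrix_noio):
        print(job["name"], job["argv"])
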
 System Configuration
---------------------
\ No newline at end of file
+--------------------
+
+milabench can run benchmarks across multiple nodes. To do so, a system configuration needs to be provided.
+This file defines all the nodes accessible to milabench.
+
+.. code-block:: yaml
+
+    system:
+      arch: cuda                      # Default arch
+      sshkey:                         # ssh key used for remote milabench operations
+
+      # Docker image to use
+      docker_image: ghcr.io/mila-iqia/milabench:${system.arch}-nightly
+
+      # Nodes list
+      nodes:
+        # Alias used to reference the node
+        - name: manager
+          ip: 192.168.11.11
+          port: 5000
+          main: true                  # Use this node as the rank=0 node or not
+          user: manager               # User to use in remote milabench operations
+
+        - name: node1
+          ip: 192.168.11.12
+          main: false
+          user: username
+
+Multinode
+*********
+
+Milabench takes care of sending the commands to all the nodes when appropriate.
+
+
+Methodology
+-----------
+
+.. code-block:: python
+
+    for i in range(epochs):
+        events = []
+
+        # Creating the iterator from the dataloader is time consuming.
+        # Its cost would get amortized across many batches during real training,
+        # but we want benchmarking to be fast, so it is something we cannot afford.
+        batch_iter = iter(loader)
+        total_obs = 0
+
+        # Avoid sync in the batch loop
+        start = Event()
+        start.record()
+
+        for batch in batch_iter:
+            pred = model(batch)
+            loss = fn(pred, target)
+
+            end = Event()                                            # +->
+            end.record()                                             # |
+            events.append((start, end, len(batch), loss.detach()))   # | Limited overhead
+            if len(events) + total_obs >= 60:                        # |
+                break                                                # |
+            start = end                                              # +->
+
+        # Force sync at the end of the epoch                         # +->
+        for start, end, bs, loss in events:                          # | Timer is off; does not impact perf measures
+            end.wait()                                               # |
+            log(loss=loss.item())                                    # |
+            log(rate=bs / (end - start))                             # |
+                                                                     # |
+        total_obs += len(events)                                     # |
+        if total_obs >= 60:                                          # |
+            raise StopProgram()                                      # +->
+
+Instrumentations
+****************
+
+To minimize code changes, milabench uses `ptera <https://github.com/breuleux/ptera>`_ to modify
+the code that will be run and to insert the hooks needed to measure performance.
+
+The hooks are defined inside the `voirfile.py`.
+The example below overrides the return value of the `dataloader()` function, which is defined in the `__main__` module.
+It wraps the original object with a custom wrapper that measures the time between `__next__` calls.
+
+This allows milabench to integrate benchmarks built on third-party code without modifying that code directly.
+
+.. code-block:: python
+
+    def wrapper(loader):
+        print("received loader obj")
+        return Wrapper(loader)
+
+    probe = ov.probe("//dataloader() as loader", overridable=True)
+    probe['loader'].override(wrapper)
+
+
+Execution Flow
+--------------
+
+* `milabench install`
+
+  * Creates the virtual environments for the benchmarks and installs their dependencies
+  * Modifies: `$MILABENCH_BASE/venv/{bench}`
+
+* `milabench prepare`
+
+  * Calls the prepare script of each benchmark to download or generate its dataset
+  * Modifies: `$MILABENCH_BASE/data/{dataset}`
+
+* `milabench run`
+
+  * Executes each benchmark
+  * Modifies: `$MILABENCH_BASE/runs/{runame}.{time}`
+
+
+How do I
+--------
+
+* I want to run a benchmark without milabench for debugging purposes
+
+  * `milabench dev {benchname}` will open a bash shell with the benchmark venv activated
+  * alternatively: `source $MILABENCH_BASE/venv/torch/bin/activate`
+
+
diff --git a/milabench/pack.py b/milabench/pack.py
index 66e7562d4..0760f2208 100644
--- a/milabench/pack.py
+++ b/milabench/pack.py
@@ -374,15 +374,15 @@ async def install(self):
         """
         assert self.phase == "install"
 
-        if is_editable_install():
-            await install_benchmate(self)
-
         for reqs in self.requirements_files(self.config.get("install_variant", None)):
             if reqs.exists():
                 await self.pip_install("-r", reqs)
             else:
                 raise FileNotFoundError(f"Requirements file not found: {reqs}")
 
+        if is_editable_install():
+            await install_benchmate(self)
+
     async def pin(
         self,
         clear_previous: bool = True,
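
The pack.py hunk above only changes ordering: the pinned requirements files are installed first, and benchmate afterwards, as the commit title states. Below is a minimal, standalone sketch of that ordering; the helpers and paths are hypothetical and do not reproduce milabench's real `pip_install` / `install_benchmate` machinery.

.. code-block:: python

    # Illustrative sketch only: install pinned requirements, then benchmate.
    import subprocess
    import sys
    from pathlib import Path

    def pip_install(*args):
        """Run pip inside the current environment."""
        subprocess.run([sys.executable, "-m", "pip", "install", *args], check=True)

    def install(requirement_files, benchmate_dir):
        # 1. Install the pinned requirements for the selected variant (cuda, rocm, xpu, ...).
        for reqs in requirement_files:
            path = Path(reqs)
            if not path.exists():
                raise FileNotFoundError(f"Requirements file not found: {path}")
            pip_install("-r", str(path))

        # 2. Install benchmate only once the requirements are in place,
        #    matching the order established by this patch.
        pip_install(str(benchmate_dir))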