From 8d48221129fc00e68b06d956c088be2a1afc88c2 Mon Sep 17 00:00:00 2001 From: Setepenre Date: Thu, 1 Aug 2024 09:40:52 -0400 Subject: [PATCH 1/9] Create execution_modes.rst --- docs/execution_modes.rst | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 docs/execution_modes.rst diff --git a/docs/execution_modes.rst b/docs/execution_modes.rst new file mode 100644 index 000000000..4318130d6 --- /dev/null +++ b/docs/execution_modes.rst @@ -0,0 +1,38 @@ +Plan +==== + +* ``per_gpu``: used for mono gpu benchmarks, spawn one process per gpu and run the same benchmark + +.. code-block:: yaml + + _torchvision: + inherits: _defaults + definition: ../benchmarks/torchvision + group: torchvision + install_group: torch + plan: + method: per_gpu + +* ``njobs``: used to launch a single jobs that can see all the gpus. + +.. code-block:: yaml + + _torchvision_ddp: + inherits: _defaults + definition: ../benchmarks/torchvision_ddp + group: torchvision + install_group: torch + plan: + method: njobs + n: 1 + + +Milabench processes overview +---------------------------- + +* milabench main process + * gather metrics + +* milabench launches a new benchmark process + * milabench launch monitoring processes + * torchrun will launch one process per GPU From d9f037322adb91b5058cb31fc8b88e47e11133a2 Mon Sep 17 00:00:00 2001 From: Setepenre Date: Thu, 1 Aug 2024 09:50:50 -0400 Subject: [PATCH 2/9] Update execution_modes.rst --- docs/execution_modes.rst | 51 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 47 insertions(+), 4 deletions(-) diff --git a/docs/execution_modes.rst b/docs/execution_modes.rst index 4318130d6..7b3fbd915 100644 --- a/docs/execution_modes.rst +++ b/docs/execution_modes.rst @@ -13,6 +13,23 @@ Plan plan: method: per_gpu +.. 
code-block:: bash + + echo "---" + echo "fp16" + echo "====" + time ( + CUDA_VISIBLE_DEVICES=0 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + CUDA_VISIBLE_DEVICES=1 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + CUDA_VISIBLE_DEVICES=2 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + CUDA_VISIBLE_DEVICES=3 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + CUDA_VISIBLE_DEVICES=4 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + CUDA_VISIBLE_DEVICES=5 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + CUDA_VISIBLE_DEVICES=6 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + CUDA_VISIBLE_DEVICES=7 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + wait + ) + * ``njobs``: used to launch a single jobs that can see all the gpus. .. code-block:: yaml @@ -26,13 +43,39 @@ Plan method: njobs n: 1 +.. 
code-block:: bash + + echo "---" + echo "lightning-gpus" + echo "==============" + time ( + $BASE/venv/torch/bin/benchrun --nnodes=1 --rdzv-backend=c10d --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --nproc-per-node=8 --no-python -- python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 16 & + wait + ) + Milabench processes overview ---------------------------- * milabench main process - * gather metrics + * gather metrics from benchmark processes + + * if ``per_gpu`` is used, milabench will launch one process per GPU (sets ``CUDA_VISIBLE_DEVCES``) + * each processes log their GPU data + * might spawn a monitor process + * will init pynvml + * dataloader will also spawn process workers + * usually not using GPU + + * if ``njobs`` is used, milabench will launch a single process (torchrun) + * torchrun in turn will spawn one process per GPU + * RANK 0 is used for logging + * RANK 0 might spawn a monitor process + * will init pynvml + * dataloader will also spawn process workers + * usually not using GPU + + + + -* milabench launches a new benchmark process - * milabench launch monitoring processes - * torchrun will launch one process per GPU From 3d838c58decccc557063c67fd7ac5241917f2c0c Mon Sep 17 00:00:00 2001 From: Setepenre Date: Thu, 1 Aug 2024 09:51:21 -0400 Subject: [PATCH 3/9] Update execution_modes.rst --- docs/execution_modes.rst | 42 ++++++++++++++++++++-------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/docs/execution_modes.rst b/docs/execution_modes.rst index 7b3fbd915..1511ff24b 100644 --- a/docs/execution_modes.rst +++ b/docs/execution_modes.rst @@ -15,20 +15,20 @@ Plan .. 
code-block:: bash - echo "---" - echo "fp16" - echo "====" - time ( - CUDA_VISIBLE_DEVICES=0 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & - CUDA_VISIBLE_DEVICES=1 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & - CUDA_VISIBLE_DEVICES=2 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & - CUDA_VISIBLE_DEVICES=3 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & - CUDA_VISIBLE_DEVICES=4 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & - CUDA_VISIBLE_DEVICES=5 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & - CUDA_VISIBLE_DEVICES=6 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & - CUDA_VISIBLE_DEVICES=7 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & - wait - ) + echo "---" + echo "fp16" + echo "====" + time ( + CUDA_VISIBLE_DEVICES=0 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + CUDA_VISIBLE_DEVICES=1 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + CUDA_VISIBLE_DEVICES=2 
$SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + CUDA_VISIBLE_DEVICES=3 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + CUDA_VISIBLE_DEVICES=4 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + CUDA_VISIBLE_DEVICES=5 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + CUDA_VISIBLE_DEVICES=6 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + CUDA_VISIBLE_DEVICES=7 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + wait + ) * ``njobs``: used to launch a single jobs that can see all the gpus. @@ -45,13 +45,13 @@ Plan .. 
code-block:: bash - echo "---" - echo "lightning-gpus" - echo "==============" - time ( - $BASE/venv/torch/bin/benchrun --nnodes=1 --rdzv-backend=c10d --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --nproc-per-node=8 --no-python -- python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 16 & - wait - ) + echo "---" + echo "lightning-gpus" + echo "==============" + time ( + $BASE/venv/torch/bin/benchrun --nnodes=1 --rdzv-backend=c10d --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --nproc-per-node=8 --no-python -- python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 16 & + wait + ) Milabench processes overview From 677f096c48a5f2acf974cc38fe16e40a64e73940 Mon Sep 17 00:00:00 2001 From: Setepenre Date: Thu, 1 Aug 2024 09:51:46 -0400 Subject: [PATCH 4/9] Update execution_modes.rst --- docs/execution_modes.rst | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/docs/execution_modes.rst b/docs/execution_modes.rst index 1511ff24b..2fd955f6f 100644 --- a/docs/execution_modes.rst +++ b/docs/execution_modes.rst @@ -5,13 +5,13 @@ Plan .. code-block:: yaml - _torchvision: - inherits: _defaults - definition: ../benchmarks/torchvision - group: torchvision - install_group: torch - plan: - method: per_gpu + _torchvision: + inherits: _defaults + definition: ../benchmarks/torchvision + group: torchvision + install_group: torch + plan: + method: per_gpu .. code-block:: bash @@ -34,14 +34,14 @@ Plan .. 
code-block:: yaml - _torchvision_ddp: - inherits: _defaults - definition: ../benchmarks/torchvision_ddp - group: torchvision - install_group: torch - plan: - method: njobs - n: 1 + _torchvision_ddp: + inherits: _defaults + definition: ../benchmarks/torchvision_ddp + group: torchvision + install_group: torch + plan: + method: njobs + n: 1 .. code-block:: bash From a40fdc5fb9374daa8d5f5f92c66a016a543f8bbb Mon Sep 17 00:00:00 2001 From: Setepenre Date: Thu, 1 Aug 2024 09:52:43 -0400 Subject: [PATCH 5/9] Update execution_modes.rst --- docs/execution_modes.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/execution_modes.rst b/docs/execution_modes.rst index 2fd955f6f..e9412b936 100644 --- a/docs/execution_modes.rst +++ b/docs/execution_modes.rst @@ -58,7 +58,8 @@ Milabench processes overview ---------------------------- * milabench main process - * gather metrics from benchmark processes + * gather metrics from benchmark processes, save them to file + * manages the benchmarks (timeout etc...) * if ``per_gpu`` is used, milabench will launch one process per GPU (sets ``CUDA_VISIBLE_DEVCES``) * each processes log their GPU data From d5a0a9e54b48b1bcf51badcfef4df9be6046d93c Mon Sep 17 00:00:00 2001 From: Setepenre Date: Thu, 1 Aug 2024 09:53:26 -0400 Subject: [PATCH 6/9] Update execution_modes.rst --- docs/execution_modes.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/execution_modes.rst b/docs/execution_modes.rst index e9412b936..998513d8f 100644 --- a/docs/execution_modes.rst +++ b/docs/execution_modes.rst @@ -13,6 +13,8 @@ Plan plan: method: per_gpu +Milabench will essentially execute something akin to below. + .. code-block:: bash echo "---" @@ -43,6 +45,8 @@ Plan method: njobs n: 1 +Milabench will essentially execute something akin to below. + .. 
code-block:: bash echo "---" From b8a2ac9b211e4bf1b18a96666f88616628cabc78 Mon Sep 17 00:00:00 2001 From: Setepenre Date: Thu, 1 Aug 2024 09:54:10 -0400 Subject: [PATCH 7/9] Update execution_modes.rst --- docs/execution_modes.rst | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/docs/execution_modes.rst b/docs/execution_modes.rst index 998513d8f..2a664ec67 100644 --- a/docs/execution_modes.rst +++ b/docs/execution_modes.rst @@ -1,7 +1,10 @@ Plan ==== -* ``per_gpu``: used for mono gpu benchmarks, spawn one process per gpu and run the same benchmark +per_gpu +------- + +``per_gpu`: used for mono gpu benchmarks, spawn one process per gpu and run the same benchmark .. code-block:: yaml @@ -32,7 +35,10 @@ Milabench will essentially execute something akin to below. wait ) -* ``njobs``: used to launch a single jobs that can see all the gpus. +njobs +----- + +``njobs`` used to launch a single jobs that can see all the gpus. .. code-block:: yaml From a59068616b06272a08519db3c9bd88dedcd15626 Mon Sep 17 00:00:00 2001 From: Setepenre Date: Thu, 1 Aug 2024 09:54:28 -0400 Subject: [PATCH 8/9] Update execution_modes.rst --- docs/execution_modes.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/execution_modes.rst b/docs/execution_modes.rst index 2a664ec67..ed1bc7275 100644 --- a/docs/execution_modes.rst +++ b/docs/execution_modes.rst @@ -4,7 +4,7 @@ Plan per_gpu ------- -``per_gpu`: used for mono gpu benchmarks, spawn one process per gpu and run the same benchmark +``per_gpu``: used for mono gpu benchmarks, spawn one process per gpu and run the same benchmark .. 
code-block:: yaml From 4d2f716d282afe718f7c180971eee3e1a65b0d1a Mon Sep 17 00:00:00 2001 From: Setepenre Date: Thu, 1 Aug 2024 09:56:43 -0400 Subject: [PATCH 9/9] Update execution_modes.rst --- docs/execution_modes.rst | 49 ++++++++++++++++++++-------------------- 1 file changed, 25 insertions(+), 24 deletions(-) diff --git a/docs/execution_modes.rst b/docs/execution_modes.rst index ed1bc7275..8d40fc44d 100644 --- a/docs/execution_modes.rst +++ b/docs/execution_modes.rst @@ -1,8 +1,30 @@ +Milabench processes overview +============================ + +* milabench main process + * gather metrics from benchmark processes, save them to file + * manages the benchmarks (timeout etc...) + + * if ``per_gpu`` is used, milabench will launch one process per GPU (sets ``CUDA_VISIBLE_DEVICES``) + * each process logs its GPU data + * might spawn a monitor process + * will init pynvml + * dataloader will also spawn process workers + * usually not using GPU + + * if ``njobs`` is used, milabench will launch a single process (torchrun) + * torchrun in turn will spawn one process per GPU + * RANK 0 is used for logging + * RANK 0 might spawn a monitor process + * will init pynvml + * dataloader will also spawn process workers + * usually not using GPU + Plan -==== +---- per_gpu -------- ++++++++ ``per_gpu``: used for mono gpu benchmarks, spawn one process per gpu and run the same benchmark @@ -36,7 +58,7 @@ Milabench will essentially execute something akin to below. ) njobs ------ ++++++ ``njobs`` used to launch a single jobs that can see all the gpus. @@ -64,27 +86,6 @@ Milabench will essentially execute something akin to below. ) -Milabench processes overview ----------------------------- - -* milabench main process - * gather metrics from benchmark processes, save them to file - * manages the benchmarks (timeout etc...) 
- - * if ``per_gpu`` is used, milabench will launch one process per GPU (sets ``CUDA_VISIBLE_DEVCES``) - * each processes log their GPU data - * might spawn a monitor process - * will init pynvml - * dataloader will also spawn process workers - * usually not using GPU - - * if ``njobs`` is used, milabench will launch a single process (torchrun) - * torchrun in turn will spawn one process per GPU - * RANK 0 is used for logging - * RANK 0 might spawn a monitor process - * will init pynvml - * dataloader will also spawn process workers - * usually not using GPU