diff --git a/.gitignore b/.gitignore index 482e776df..90a1e78d7 100644 --- a/.gitignore +++ b/.gitignore @@ -32,7 +32,8 @@ scripts/article/cuda/ scripts/article/xpu/ dependencies/ -benchmarks/gflownet/gflownet +benchmarks/recursiongfn/gflownet +benchmarks/recursiongfn/logs/ scripts/inventory.yaml output/ diff --git a/.pin/constraints-cuda-gnn.txt b/.pin/constraints-cuda-gnn.txt new file mode 100644 index 000000000..cc12cdab2 --- /dev/null +++ b/.pin/constraints-cuda-gnn.txt @@ -0,0 +1,337 @@ +# +# This file is autogenerated by pip-compile with Python 3.10 +# by the following command: +# +# pip-compile --output-file=.pin/constraints-cuda-gnn.txt .pin/tmp-constraints.txt benchmarks/geo_gnn/requirements-pre.in benchmarks/geo_gnn/requirements.in benchmarks/recursiongfn/requirements.in constraints/extra/gnn.cuda.txt +# +--extra-index-url https://pypi.ngc.nvidia.com +--extra-index-url https://download.pytorch.org/whl/cu121 +--find-links https://data.pyg.org/whl/torch-2.3.0+cu121.html +--trusted-host pypi.ngc.nvidia.com + +absl-py==2.1.0 + # via tensorboard +aiohappyeyeballs==2.4.0 + # via aiohttp +aiohttp==3.10.5 + # via torch-geometric +aiosignal==1.3.1 + # via aiohttp +antlr4-python3-runtime==4.9.3 + # via omegaconf +asttokens==2.4.1 + # via giving +async-timeout==4.0.3 + # via aiohttp +attrs==24.2.0 + # via aiohttp +blosc2==2.7.1 + # via tables +botorch==0.11.3 + # via -r benchmarks/recursiongfn/requirements.in +certifi==2024.7.4 + # via + # requests + # sentry-sdk +charset-normalizer==3.3.2 + # via requests +click==8.1.7 + # via wandb +codefind==0.1.6 + # via ptera +cvxopt==1.3.2 + # via -r benchmarks/recursiongfn/requirements.in +docker-pycreds==0.4.0 + # via wandb +executing==1.2.0 + # via varname +filelock==3.15.4 + # via + # torch + # triton +frozenlist==1.4.1 + # via + # aiohttp + # aiosignal +fsspec==2024.6.1 + # via + # torch + # torch-geometric +gitdb==4.0.11 + # via gitpython +gitpython==3.1.43 + # via + # -r benchmarks/recursiongfn/requirements.in + # wandb +giving==0.4.2 + # via + # ptera + # voir +gpytorch==1.12 + # via + # -r benchmarks/recursiongfn/requirements.in + # botorch +grpcio==1.66.0 + # via tensorboard +idna==3.8 + # via + # requests + # yarl +jaxtyping==0.2.33 + # via linear-operator +jinja2==3.1.4 + # via + # torch + # torch-geometric +joblib==1.4.2 + # via scikit-learn +linear-operator==0.5.2 + # via + # botorch + # gpytorch +markdown==3.7 + # via tensorboard +markdown-it-py==3.0.0 + # via rich +markupsafe==2.1.5 + # via + # jinja2 + # werkzeug +mdurl==0.1.2 + # via markdown-it-py +mpmath==1.3.0 + # via + # botorch + # gpytorch + # sympy +msgpack==1.0.8 + # via blosc2 +multidict==6.0.5 + # via + # aiohttp + # yarl +multipledispatch==1.0.0 + # via botorch +ndindex==1.8 + # via blosc2 +networkx==3.3 + # via + # -r benchmarks/recursiongfn/requirements.in + # torch +numexpr==2.10.1 + # via + # blosc2 + # tables +numpy==1.26.4 + # via + # -r benchmarks/geo_gnn/requirements.in + # blosc2 + # botorch + # numexpr + # opt-einsum + # pandas + # pyarrow + # pyro-ppl + # rdkit + # scikit-learn + # scipy + # tables + # tensorboard + # torch-geometric +nvidia-cublas-cu12==12.1.3.1 + # via + # nvidia-cudnn-cu12 + # nvidia-cusolver-cu12 + # torch +nvidia-cuda-cupti-cu12==12.1.105 + # via torch +nvidia-cuda-nvrtc-cu12==12.1.105 + # via torch +nvidia-cuda-runtime-cu12==12.1.105 + # via torch +nvidia-cudnn-cu12==8.9.2.26 + # via torch +nvidia-cufft-cu12==11.0.2.54 + # via torch +nvidia-curand-cu12==10.3.2.106 + # via torch +nvidia-cusolver-cu12==11.4.5.107 + # via torch 
+nvidia-cusparse-cu12==12.1.0.106 + # via + # nvidia-cusolver-cu12 + # torch +nvidia-ml-py==12.560.30 + # via voir +nvidia-nccl-cu12==2.20.5 + # via torch +nvidia-nvjitlink-cu12==12.6.20 + # via + # nvidia-cusolver-cu12 + # nvidia-cusparse-cu12 +nvidia-nvtx-cu12==12.1.105 + # via torch +omegaconf==2.3.0 + # via + # -r benchmarks/recursiongfn/requirements.in + # voir +opt-einsum==3.3.0 + # via pyro-ppl +ovld==0.3.9 + # via voir +packaging==24.1 + # via + # tables + # tensorboard +pandas==2.2.2 + # via + # -r benchmarks/geo_gnn/requirements.in + # -r benchmarks/recursiongfn/requirements.in +pillow==10.4.0 + # via rdkit +platformdirs==4.2.2 + # via wandb +protobuf==5.27.3 + # via + # tensorboard + # wandb +psutil==5.9.8 + # via + # torch-geometric + # voir + # wandb +ptera==1.4.1 + # via voir +py-cpuinfo==9.0.0 + # via + # blosc2 + # tables +pyarrow==17.0.0 + # via -r benchmarks/recursiongfn/requirements.in +pygments==2.18.0 + # via rich +pyparsing==3.1.4 + # via torch-geometric +pyro-api==0.1.2 + # via pyro-ppl +pyro-ppl==1.9.1 + # via + # -r benchmarks/recursiongfn/requirements.in + # botorch +python-dateutil==2.9.0.post0 + # via pandas +pytz==2024.1 + # via pandas +pyyaml==6.0.2 + # via + # omegaconf + # wandb +rdkit==2024.3.5 + # via + # -r benchmarks/geo_gnn/requirements.in + # -r benchmarks/recursiongfn/requirements.in +reactivex==4.0.4 + # via giving +requests==2.32.3 + # via + # torch-geometric + # wandb +rich==13.8.0 + # via voir +scikit-learn==1.5.1 + # via + # gpytorch + # torch-geometric +scipy==1.14.1 + # via + # -r benchmarks/recursiongfn/requirements.in + # botorch + # gpytorch + # linear-operator + # scikit-learn + # torch-cluster + # torch-geometric + # torch-sparse +sentry-sdk==2.13.0 + # via wandb +setproctitle==1.3.3 + # via wandb +six==1.16.0 + # via + # asttokens + # docker-pycreds + # python-dateutil + # tensorboard +smmap==5.0.1 + # via gitdb +sympy==1.13.2 + # via torch +tables==3.10.1 + # via -r benchmarks/recursiongfn/requirements.in +tensorboard==2.17.1 + # via -r benchmarks/recursiongfn/requirements.in +tensorboard-data-server==0.7.2 + # via tensorboard +threadpoolctl==3.5.0 + # via scikit-learn +torch==2.3.1+cu121 + # via + # -r benchmarks/geo_gnn/requirements-pre.in + # -r benchmarks/recursiongfn/requirements.in + # -r constraints/extra/gnn.cuda.txt + # botorch + # linear-operator + # pyro-ppl +torch-cluster==1.6.3+pt23cu121 + # via + # -r benchmarks/geo_gnn/requirements.in + # -r benchmarks/recursiongfn/requirements.in +torch-geometric==2.5.3 + # via + # -r benchmarks/geo_gnn/requirements.in + # -r benchmarks/recursiongfn/requirements.in +torch-scatter==2.1.2+pt23cu121 + # via + # -r benchmarks/geo_gnn/requirements.in + # -r benchmarks/recursiongfn/requirements.in +torch-sparse==0.6.18+pt23cu121 + # via + # -r benchmarks/geo_gnn/requirements.in + # -r benchmarks/recursiongfn/requirements.in +tqdm==4.66.5 + # via + # pyro-ppl + # torch-geometric +triton==2.3.1 + # via torch +typeguard==2.13.3 + # via + # jaxtyping + # linear-operator +typing-extensions==4.12.2 + # via + # reactivex + # tables + # torch +tzdata==2024.1 + # via pandas +urllib3==2.2.2 + # via + # requests + # sentry-sdk +varname==0.10.0 + # via giving +voir==0.2.19 + # via + # -c .pin/../constraints/cuda.txt + # -r benchmarks/geo_gnn/requirements.in + # -r benchmarks/recursiongfn/requirements.in +wandb==0.17.7 + # via -r benchmarks/recursiongfn/requirements.in +werkzeug==3.0.4 + # via tensorboard +yarl==1.9.4 + # via aiohttp + +# The following packages are considered to be unsafe in a requirements 
file: +# setuptools diff --git a/.pin/constraints-cuda-torch.txt b/.pin/constraints-cuda-torch.txt index d691495d5..09c97e23f 100644 --- a/.pin/constraints-cuda-torch.txt +++ b/.pin/constraints-cuda-torch.txt @@ -2,7 +2,7 @@ # This file is autogenerated by pip-compile with Python 3.10 # by the following command: # -# pip-compile --output-file=.pin/constraints-cuda-torch.txt .pin/tmp-constraints.txt benchmarks/brax/requirements.in benchmarks/diffusion/requirements.in benchmarks/dinov2/requirements.in benchmarks/flops/requirements.in benchmarks/huggingface/requirements.in benchmarks/lightning/requirements.in benchmarks/llama/requirements.in benchmarks/llm/requirements.in benchmarks/super-slomo/requirements.in benchmarks/timm/requirements.in benchmarks/torchvision/requirements.in benchmarks/torchvision_ddp/requirements.in +# pip-compile --output-file=.pin/constraints-cuda-torch.txt .pin/tmp-constraints.txt benchmarks/brax/requirements.in benchmarks/diffusion/requirements.in benchmarks/dinov2/requirements.in benchmarks/flops/requirements.in benchmarks/huggingface/requirements.in benchmarks/lightning/requirements.in benchmarks/llama/requirements.in benchmarks/llm/requirements.in benchmarks/super-slomo/requirements.in benchmarks/timm/requirements.in benchmarks/torchatari/requirements.in benchmarks/torchvision/requirements.in benchmarks/torchvision_ddp/requirements.in constraints/extra/torch.cuda.txt # --extra-index-url https://pypi.ngc.nvidia.com --extra-index-url https://download.pytorch.org/whl/cu121 @@ -19,13 +19,14 @@ absl-py==2.1.0 # mujoco-mjx # optax # orbax-checkpoint + # tensorboard accelerate==0.33.0 # via # -r benchmarks/diffusion/requirements.in # diffusers -aiohappyeyeballs==2.3.4 +aiohappyeyeballs==2.4.0 # via aiohttp -aiohttp==3.10.0 +aiohttp==3.10.5 # via # datasets # fsspec @@ -33,6 +34,8 @@ aiosignal==1.3.1 # via aiohttp antlr4-python3-runtime==4.9.3 # via omegaconf +appdirs==1.4.4 + # via cantilever argklass==1.4.4 # via # -r benchmarks/diffusion/requirements.in @@ -41,14 +44,16 @@ asttokens==2.4.1 # via giving async-timeout==4.0.3 # via aiohttp -attrs==23.2.0 +attrs==24.2.0 # via aiohttp blinker==1.8.2 # via flask -blobfile==2.1.1 +blobfile==3.0.0 # via torchtune brax==0.10.5 # via -r benchmarks/brax/requirements.in +cantilever==0.1.0 + # via -r benchmarks/torchatari/requirements.in certifi==2024.7.4 # via requests charset-normalizer==3.3.2 @@ -60,26 +65,33 @@ click==8.1.7 cloudpickle==3.0.0 # via # gym + # gymnasium # submitit codefind==0.1.6 # via ptera contextlib2==21.6.0 # via ml-collections -datasets==2.20.0 +datasets==2.21.0 # via # -r benchmarks/diffusion/requirements.in # -r benchmarks/llama/requirements.in # torchtune -diffusers[torch]==0.29.2 +diffusers[torch]==0.30.1 # via -r benchmarks/diffusion/requirements.in dill==0.3.8 # via # datasets # multiprocess dm-env==1.6 - # via brax + # via + # brax + # envpool dm-tree==0.1.8 # via dm-env +docstring-parser==0.16 + # via tyro +envpool==0.8.4 + # via -r benchmarks/torchatari/requirements.in etils[epath,epy]==1.7.0 # via # brax @@ -91,6 +103,8 @@ executing==1.2.0 # via varname fairscale==0.4.13 # via -r benchmarks/llama/requirements.in +farama-notifications==0.0.4 + # via gymnasium filelock==3.15.4 # via # blobfile @@ -108,13 +122,13 @@ flask==3.0.3 # flask-cors flask-cors==4.0.1 # via brax -flax==0.8.5 +flax==0.9.0 # via brax frozenlist==1.4.1 # via # aiohttp # aiosignal -fsspec[http]==2024.5.0 +fsspec[http]==2024.6.1 # via # datasets # etils @@ -130,15 +144,22 @@ giving==0.4.2 # voir glfw==2.7.0 # via mujoco 
-grpcio==1.65.2 - # via brax -gym==0.26.2 - # via brax +grpcio==1.66.0 + # via + # brax + # tensorboard +gym==0.23.1 + # via + # -r benchmarks/torchatari/requirements.in + # brax + # envpool gym-notices==0.0.8 # via gym +gymnasium==0.29.1 + # via envpool hjson==3.1.0 # via argklass -huggingface-hub==0.24.5 +huggingface-hub==0.24.6 # via # -r benchmarks/timm/requirements.in # accelerate @@ -147,15 +168,18 @@ huggingface-hub==0.24.5 # tokenizers # torchtune # transformers -idna==3.7 +humanize==4.10.0 + # via orbax-checkpoint +idna==3.8 # via # requests # yarl -importlib-metadata==8.2.0 +importlib-metadata==8.4.0 # via diffusers -importlib-resources==6.4.0 +importlib-resources==6.4.4 # via # argklass + # cantilever # etils # torchcompat iopath==0.1.10 @@ -167,6 +191,7 @@ itsdangerous==2.2.0 jax[cuda12]==0.4.31 # via # -r benchmarks/brax/requirements.in + # -r constraints/extra/torch.cuda.txt # brax # chex # flax @@ -194,15 +219,17 @@ jinja2==3.1.4 # brax # flask # torch -lightning==2.3.3 +lightning==2.4.0 # via -r benchmarks/lightning/requirements.in lightning-utilities==0.11.6 # via # lightning # pytorch-lightning # torchmetrics -lxml==4.9.4 +lxml==5.3.0 # via blobfile +markdown==3.7 + # via tensorboard markdown-it-py==3.0.0 # via rich markupsafe==2.1.5 @@ -224,11 +251,11 @@ msgpack==1.0.8 # via # flax # orbax-checkpoint -mujoco==3.2.0 +mujoco==3.2.2 # via # brax # mujoco-mjx -mujoco-mjx==3.2.0 +mujoco-mjx==3.2.2 # via brax multidict==6.0.5 # via @@ -243,20 +270,21 @@ networkx==3.3 numpy==1.26.4 # via # -r benchmarks/super-slomo/requirements.in + # -r benchmarks/torchatari/requirements.in # accelerate # brax # chex # datasets # diffusers # dm-env + # envpool # fairscale - # flax # fvcore # gym + # gymnasium # jax # jaxlib # jaxopt - # lightning # ml-dtypes # mujoco # opencv-python @@ -265,8 +293,8 @@ numpy==1.26.4 # orbax-checkpoint # pandas # pyarrow - # pytorch-lightning # scipy + # tensorboard # tensorboardx # tensorstore # torchmetrics @@ -312,6 +340,8 @@ nvidia-cusparse-cu12==12.1.0.106 # jax-cuda12-plugin # nvidia-cusolver-cu12 # torch +nvidia-ml-py==12.560.30 + # via voir nvidia-nccl-cu12==2.20.5 # via # jax-cuda12-plugin @@ -336,20 +366,24 @@ optax==0.2.3 # via # brax # flax -orbax-checkpoint==0.5.23 +optree==0.12.1 + # via envpool +orbax-checkpoint==0.6.1 # via # brax # flax -ovld==0.3.6 +ovld==0.3.9 # via voir packaging==24.1 # via # accelerate # datasets + # envpool # huggingface-hub # lightning # lightning-utilities # pytorch-lightning + # tensorboard # tensorboardx # torchmetrics # transformers @@ -367,6 +401,7 @@ portalocker==2.10.1 protobuf==5.27.3 # via # orbax-checkpoint + # tensorboard # tensorboardx psutil==5.9.8 # via @@ -376,25 +411,21 @@ ptera==1.4.1 # via voir pyarrow==17.0.0 # via datasets -pyarrow-hotfix==0.6 - # via datasets pycryptodomex==3.20.0 # via blobfile pygments==2.18.0 # via rich -pynvml==11.5.3 - # via voir pyopengl==3.1.7 # via mujoco python-dateutil==2.9.0.post0 # via pandas pytinyrenderer==0.0.14 # via brax -pytorch-lightning==2.3.3 +pytorch-lightning==2.4.0 # via lightning pytz==2024.1 # via pandas -pyyaml==6.0.1 +pyyaml==6.0.2 # via # -r benchmarks/llm/requirements.in # -r benchmarks/timm/requirements.in @@ -424,18 +455,19 @@ requests==2.32.3 # huggingface-hub # tiktoken # transformers -rich==13.7.1 +rich==13.8.0 # via # flax + # tyro # voir -safetensors==0.4.3 +safetensors==0.4.4 # via # -r benchmarks/timm/requirements.in # accelerate # diffusers # torchtune # transformers -scipy==1.14.0 +scipy==1.14.1 # via # -r benchmarks/dinov2/requirements.in # 
brax @@ -447,21 +479,28 @@ sentencepiece==0.2.0 # via # -r benchmarks/llama/requirements.in # torchtune +shtab==1.7.1 + # via tyro six==1.16.0 # via # asttokens # fire # ml-collections # python-dateutil + # tensorboard submitit==1.5.1 # via -r benchmarks/dinov2/requirements.in -sympy==1.13.1 +sympy==1.13.2 # via torch tabulate==0.9.0 # via fvcore +tensorboard==2.17.1 + # via -r benchmarks/torchatari/requirements.in +tensorboard-data-server==0.7.2 + # via tensorboard tensorboardx==2.6.2.2 # via brax -tensorstore==0.1.63 +tensorstore==0.1.64 # via # flax # orbax-checkpoint @@ -486,6 +525,7 @@ torch==2.4.0+cu121 # -r benchmarks/llm/requirements.in # -r benchmarks/super-slomo/requirements.in # -r benchmarks/timm/requirements.in + # -r benchmarks/torchatari/requirements.in # -r benchmarks/torchvision/requirements.in # -r benchmarks/torchvision_ddp/requirements.in # accelerate @@ -503,9 +543,10 @@ torchcompat==1.1.4 # -c .pin/../constraints/cuda.txt # -r benchmarks/flops/requirements.in # -r benchmarks/lightning/requirements.in + # -r benchmarks/torchatari/requirements.in # -r benchmarks/torchvision/requirements.in # -r benchmarks/torchvision_ddp/requirements.in -torchmetrics==1.4.0.post0 +torchmetrics==1.4.1 # via # -r benchmarks/dinov2/requirements.in # lightning @@ -522,7 +563,7 @@ torchvision==0.19.0+cu121 # -r benchmarks/timm/requirements.in # -r benchmarks/torchvision/requirements.in # -r benchmarks/torchvision_ddp/requirements.in -tqdm==4.66.4 +tqdm==4.66.5 # via # -r benchmarks/diffusion/requirements.in # -r benchmarks/flops/requirements.in @@ -537,32 +578,40 @@ tqdm==4.66.4 # pytorch-lightning # torchtune # transformers -transformers==4.43.3 +transformers==4.44.2 # via # -r benchmarks/diffusion/requirements.in # -r benchmarks/huggingface/requirements.in # -r benchmarks/llama/requirements.in -trimesh==4.4.3 +trimesh==4.4.7 # via # brax # mujoco-mjx triton==3.0.0 # via torch +types-protobuf==5.27.0.20240626 + # via envpool typing-extensions==4.12.2 # via # brax # chex + # envpool # etils # flax + # gymnasium # huggingface-hub # iopath # lightning # lightning-utilities + # optree # orbax-checkpoint # pytorch-lightning # reactivex # submitit # torch + # tyro +tyro==0.8.10 + # via -r benchmarks/torchatari/requirements.in tzdata==2024.1 # via pandas urllib3==2.2.2 @@ -584,19 +633,22 @@ voir==0.2.19 # -r benchmarks/llm/requirements.in # -r benchmarks/super-slomo/requirements.in # -r benchmarks/timm/requirements.in + # -r benchmarks/torchatari/requirements.in # -r benchmarks/torchvision/requirements.in # -r benchmarks/torchvision_ddp/requirements.in -werkzeug==3.0.3 - # via flask +werkzeug==3.0.4 + # via + # flask + # tensorboard xformers==0.0.27.post2 # via -r benchmarks/dinov2/requirements.in -xxhash==3.4.1 +xxhash==3.5.0 # via datasets yacs==0.1.8 # via fvcore yarl==1.9.4 # via aiohttp -zipp==3.19.2 +zipp==3.20.1 # via # etils # importlib-metadata diff --git a/.pin/constraints-rocm-gnn.txt b/.pin/constraints-rocm-gnn.txt new file mode 100644 index 000000000..dd945fc95 --- /dev/null +++ b/.pin/constraints-rocm-gnn.txt @@ -0,0 +1,305 @@ +# +# This file is autogenerated by pip-compile with Python 3.10 +# by the following command: +# +# pip-compile --output-file=.pin/constraints-rocm-gnn.txt .pin/tmp-constraints.txt benchmarks/geo_gnn/requirements-pre.in benchmarks/geo_gnn/requirements.in benchmarks/recursiongfn/requirements.in +# +--extra-index-url https://download.pytorch.org/whl/rocm6.0 + +absl-py==2.1.0 + # via tensorboard +aiohappyeyeballs==2.4.0 + # via aiohttp +aiohttp==3.10.5 + # via 
torch-geometric +aiosignal==1.3.1 + # via aiohttp +antlr4-python3-runtime==4.9.3 + # via omegaconf +asttokens==2.4.1 + # via giving +async-timeout==4.0.3 + # via aiohttp +attrs==24.2.0 + # via aiohttp +blosc2==2.7.1 + # via tables +botorch==0.11.3 + # via gflownet +certifi==2024.7.4 + # via + # requests + # sentry-sdk +charset-normalizer==3.3.2 + # via requests +click==8.1.7 + # via wandb +codefind==0.1.6 + # via ptera +cvxopt==1.3.2 + # via gflownet +docker-pycreds==0.4.0 + # via wandb +executing==1.2.0 + # via varname +filelock==3.15.4 + # via + # pytorch-triton-rocm + # torch +frozenlist==1.4.1 + # via + # aiohttp + # aiosignal +fsspec==2024.6.1 + # via + # torch + # torch-geometric +gflownet @ git+https://github.com/Delaunay/gflownet@milabench + # via -r benchmarks/recursiongfn/requirements.in +gitdb==4.0.11 + # via gitpython +gitpython==3.1.43 + # via + # gflownet + # wandb +giving==0.4.2 + # via + # ptera + # voir +gpytorch==1.12 + # via + # botorch + # gflownet +grpcio==1.65.5 + # via tensorboard +idna==3.7 + # via + # requests + # yarl +jaxtyping==0.2.33 + # via linear-operator +jinja2==3.1.4 + # via + # torch + # torch-geometric +joblib==1.4.2 + # via scikit-learn +linear-operator==0.5.2 + # via + # botorch + # gpytorch +markdown==3.7 + # via tensorboard +markdown-it-py==3.0.0 + # via rich +markupsafe==2.1.5 + # via + # jinja2 + # werkzeug +mdurl==0.1.2 + # via markdown-it-py +mpmath==1.3.0 + # via + # botorch + # gpytorch + # sympy +msgpack==1.0.8 + # via blosc2 +multidict==6.0.5 + # via + # aiohttp + # yarl +multipledispatch==1.0.0 + # via botorch +ndindex==1.8 + # via blosc2 +networkx==3.3 + # via + # gflownet + # torch +numexpr==2.10.1 + # via + # blosc2 + # tables +numpy==1.26.4 + # via + # -r benchmarks/geo_gnn/requirements.in + # blosc2 + # botorch + # numexpr + # opt-einsum + # pandas + # pyarrow + # pyro-ppl + # rdkit + # scikit-learn + # scipy + # tables + # tensorboard + # torch-geometric +omegaconf==2.3.0 + # via + # gflownet + # voir +opt-einsum==3.3.0 + # via pyro-ppl +ovld==0.3.8 + # via voir +packaging==24.1 + # via + # tables + # tensorboard +pandas==2.2.2 + # via + # -r benchmarks/geo_gnn/requirements.in + # gflownet +pillow==10.4.0 + # via rdkit +platformdirs==4.2.2 + # via wandb +protobuf==5.27.3 + # via + # tensorboard + # wandb +psutil==5.9.8 + # via + # torch-geometric + # voir + # wandb +ptera==1.4.1 + # via voir +py-cpuinfo==9.0.0 + # via + # blosc2 + # tables +pyarrow==17.0.0 + # via gflownet +pygments==2.18.0 + # via rich +pynvml==11.5.3 + # via voir +pyparsing==3.1.2 + # via torch-geometric +pyro-api==0.1.2 + # via pyro-ppl +pyro-ppl==1.9.1 + # via + # botorch + # gflownet +python-dateutil==2.9.0.post0 + # via pandas +pytorch-triton-rocm==3.0.0 + # via torch +pytz==2024.1 + # via pandas +pyyaml==6.0.2 + # via + # omegaconf + # wandb +rdkit==2024.3.5 + # via + # -r benchmarks/geo_gnn/requirements.in + # gflownet +reactivex==4.0.4 + # via giving +requests==2.32.3 + # via + # torch-geometric + # wandb +rich==13.7.1 + # via voir +scikit-learn==1.5.1 + # via + # gpytorch + # torch-geometric +scipy==1.14.0 + # via + # botorch + # gflownet + # gpytorch + # linear-operator + # scikit-learn + # torch-cluster + # torch-geometric + # torch-sparse +sentry-sdk==2.13.0 + # via wandb +setproctitle==1.3.3 + # via wandb +six==1.16.0 + # via + # asttokens + # docker-pycreds + # python-dateutil + # tensorboard +smmap==5.0.1 + # via gitdb +sympy==1.13.2 + # via torch +tables==3.10.1 + # via gflownet +tensorboard==2.17.1 + # via gflownet +tensorboard-data-server==0.7.2 + # 
via tensorboard +threadpoolctl==3.5.0 + # via scikit-learn +torch==2.4.0+rocm6.0 + # via + # -r benchmarks/geo_gnn/requirements-pre.in + # -r benchmarks/recursiongfn/requirements.in + # botorch + # gflownet + # linear-operator + # pyro-ppl +torch-cluster==1.6.3 + # via + # -r benchmarks/geo_gnn/requirements.in + # gflownet +torch-geometric==2.5.3 + # via + # -r benchmarks/geo_gnn/requirements.in + # gflownet +torch-scatter==2.1.2 + # via + # -r benchmarks/geo_gnn/requirements.in + # gflownet +torch-sparse==0.6.18 + # via + # -r benchmarks/geo_gnn/requirements.in + # gflownet +tqdm==4.66.5 + # via + # pyro-ppl + # torch-geometric +typeguard==2.13.3 + # via + # jaxtyping + # linear-operator +typing-extensions==4.12.2 + # via + # reactivex + # tables + # torch +tzdata==2024.1 + # via pandas +urllib3==2.2.2 + # via + # requests + # sentry-sdk +varname==0.10.0 + # via giving +voir==0.2.17 + # via + # -c .pin/../constraints/rocm.txt + # -r benchmarks/geo_gnn/requirements.in + # -r benchmarks/recursiongfn/requirements.in +wandb==0.17.7 + # via gflownet +werkzeug==3.0.3 + # via tensorboard +yarl==1.9.4 + # via aiohttp + +# The following packages are considered to be unsafe in a requirements file: +# setuptools diff --git a/.pin/constraints-rocm-torch.txt b/.pin/constraints-rocm-torch.txt index f2a057ae5..4fe6ae9da 100644 --- a/.pin/constraints-rocm-torch.txt +++ b/.pin/constraints-rocm-torch.txt @@ -2,12 +2,9 @@ # This file is autogenerated by pip-compile with Python 3.10 # by the following command: # -# pip-compile --output-file=.pin/constraints-rocm-torch.txt .pin/tmp-constraints.txt benchmarks/accelerate_opt/requirements.in benchmarks/brax/requirements.in benchmarks/dlrm/requirements.in benchmarks/flops/requirements.in benchmarks/huggingface/requirements.in benchmarks/llama/requirements.in benchmarks/stargan/requirements.in benchmarks/super-slomo/requirements.in benchmarks/timm/requirements.in benchmarks/torchvision/requirements.in benchmarks/torchvision_ddp/requirements.in +# pip-compile --output-file=.pin/constraints-rocm-torch.txt .pin/tmp-constraints.txt benchmarks/brax/requirements.in benchmarks/diffusion/requirements.in benchmarks/dinov2/requirements.in benchmarks/flops/requirements.in benchmarks/huggingface/requirements.in benchmarks/lightning/requirements.in benchmarks/llama/requirements.in benchmarks/llm/requirements.in benchmarks/super-slomo/requirements.in benchmarks/timm/requirements.in benchmarks/torchatari/requirements.in benchmarks/torchvision/requirements.in benchmarks/torchvision_ddp/requirements.in # ---extra-index-url https://pypi.ngc.nvidia.com --extra-index-url https://download.pytorch.org/whl/rocm6.0 ---find-links https://storage.googleapis.com/jax-releases/jax_cuda_releases.html ---trusted-host pypi.ngc.nvidia.com absl-py==2.1.0 # via @@ -20,31 +17,41 @@ absl-py==2.1.0 # optax # orbax-checkpoint # tensorboard -accelerate==0.32.1 - # via -r benchmarks/accelerate_opt/requirements.in -aiohttp==3.9.5 +accelerate==0.33.0 + # via + # -r benchmarks/diffusion/requirements.in + # diffusers +aiohappyeyeballs==2.4.0 + # via aiohttp +aiohttp==3.10.5 # via # datasets # fsspec aiosignal==1.3.1 # via aiohttp -annotated-types==0.7.0 - # via pydantic antlr4-python3-runtime==4.9.3 # via omegaconf +appdirs==1.4.4 + # via cantilever +argklass==1.4.4 + # via + # -r benchmarks/diffusion/requirements.in + # -r benchmarks/llm/requirements.in asttokens==2.4.1 # via giving async-timeout==4.0.3 # via aiohttp -attrs==23.2.0 +attrs==24.2.0 # via aiohttp -beautifulsoup4==4.12.3 - # via gdown 
blinker==1.8.2 # via flask +blobfile==2.1.1 + # via torchtune brax==0.10.5 # via -r benchmarks/brax/requirements.in -certifi==2024.6.2 +cantilever==0.1.0 + # via -r benchmarks/torchatari/requirements.in +certifi==2024.7.4 # via requests charset-normalizer==3.3.2 # via requests @@ -53,31 +60,35 @@ chex==0.1.86 click==8.1.7 # via flask cloudpickle==3.0.0 - # via gym + # via + # gym + # gymnasium + # submitit codefind==0.1.6 # via ptera contextlib2==21.6.0 # via ml-collections -datasets==2.20.0 +datasets==2.21.0 # via - # -r benchmarks/accelerate_opt/requirements.in + # -r benchmarks/diffusion/requirements.in # -r benchmarks/llama/requirements.in - # evaluate -deepspeed==0.14.4 - # via -r benchmarks/accelerate_opt/requirements.in + # torchtune +diffusers[torch]==0.30.0 + # via -r benchmarks/diffusion/requirements.in dill==0.3.8 # via # datasets - # evaluate # multiprocess dm-env==1.6 - # via brax + # via + # brax + # envpool dm-tree==0.1.8 # via dm-env -docker==7.1.0 - # via torchx docstring-parser==0.16 - # via torchx + # via tyro +envpool==0.8.4 + # via -r benchmarks/torchatari/requirements.in etils[epath,epy]==1.7.0 # via # brax @@ -85,22 +96,20 @@ etils[epath,epy]==1.7.0 # mujoco-mjx # optax # orbax-checkpoint -evaluate==0.4.2 - # via -r benchmarks/accelerate_opt/requirements.in executing==1.2.0 # via varname fairscale==0.4.13 # via -r benchmarks/llama/requirements.in -fbgemm-gpu==0.7.0+rocm6.0 - # via torchrec +farama-notifications==0.0.4 + # via gymnasium filelock==3.15.4 # via + # blobfile # datasets - # gdown + # diffusers # huggingface-hub # pytorch-triton-rocm # torch - # torchx # transformers fire==0.6.0 # via -r benchmarks/llama/requirements.in @@ -116,57 +125,67 @@ frozenlist==1.4.1 # via # aiohttp # aiosignal -fsspec[http]==2024.5.0 +fsspec[http]==2024.6.1 # via # datasets # etils - # evaluate # huggingface-hub + # lightning + # pytorch-lightning # torch - # torchx -future==1.0.0 - # via -r benchmarks/dlrm/requirements.in -gdown==5.2.0 - # via -r benchmarks/stargan/requirements.in +fvcore==0.1.5.post20221221 + # via -r benchmarks/dinov2/requirements.in giving==0.4.2 # via # ptera # voir glfw==2.7.0 # via mujoco -graphviz==0.20.3 - # via torchviz -grpcio==1.65.1 +grpcio==1.65.5 # via # brax # tensorboard -gym==0.26.2 - # via brax +gym==0.23.1 + # via + # -r benchmarks/torchatari/requirements.in + # brax + # envpool gym-notices==0.0.8 # via gym +gymnasium==0.29.1 + # via envpool hjson==3.1.0 - # via deepspeed -huggingface-hub==0.23.5 + # via argklass +huggingface-hub==0.24.6 # via # -r benchmarks/timm/requirements.in # accelerate # datasets - # evaluate + # diffusers # tokenizers + # torchtune # transformers +humanize==4.10.0 + # via orbax-checkpoint idna==3.7 # via # requests # yarl -importlib-metadata==8.0.0 - # via torchx -importlib-resources==6.4.0 +importlib-metadata==8.4.0 + # via diffusers +importlib-resources==6.4.3 # via + # argklass + # cantilever # etils # torchcompat +iopath==0.1.10 + # via + # -r benchmarks/dinov2/requirements.in + # fvcore itsdangerous==2.2.0 # via flask -jax[cuda12]==0.4.30 +jax==0.4.31 # via # -r benchmarks/brax/requirements.in # brax @@ -176,11 +195,7 @@ jax[cuda12]==0.4.30 # mujoco-mjx # optax # orbax-checkpoint -jax-cuda12-pjrt==0.4.30 - # via jax-cuda12-plugin -jax-cuda12-plugin[with-cuda]==0.4.30 - # via jax -jaxlib==0.4.30 +jaxlib==0.4.31 # via # brax # chex @@ -196,11 +211,16 @@ jinja2==3.1.4 # brax # flask # torch -joblib==1.4.2 - # via scikit-learn -lightning-utilities==0.11.5 - # via torchmetrics -markdown==3.6 +lightning==2.4.0 + # via 
-r benchmarks/lightning/requirements.in +lightning-utilities==0.11.6 + # via + # lightning + # pytorch-lightning + # torchmetrics +lxml==4.9.4 + # via blobfile +markdown==3.7 # via tensorboard markdown-it-py==3.0.0 # via rich @@ -223,100 +243,64 @@ msgpack==1.0.8 # via # flax # orbax-checkpoint -mujoco==3.2.0 +mujoco==3.2.2 # via # brax # mujoco-mjx -mujoco-mjx==3.2.0 +mujoco-mjx==3.2.2 # via brax multidict==6.0.5 # via # aiohttp # yarl multiprocess==0.70.16 - # via - # datasets - # evaluate -mypy-extensions==1.0.0 - # via typing-inspect + # via datasets nest-asyncio==1.6.0 # via orbax-checkpoint networkx==3.3 # via torch -ninja==1.11.1.1 - # via deepspeed numpy==1.26.4 # via - # -r benchmarks/dlrm/requirements.in - # -r benchmarks/stargan/requirements.in # -r benchmarks/super-slomo/requirements.in + # -r benchmarks/torchatari/requirements.in # accelerate # brax # chex # datasets - # deepspeed + # diffusers # dm-env - # evaluate + # envpool # fairscale - # fbgemm-gpu # flax + # fvcore # gym + # gymnasium # jax # jaxlib # jaxopt # ml-dtypes # mujoco - # onnx # opencv-python # opt-einsum # optax # orbax-checkpoint # pandas # pyarrow - # scikit-learn # scipy # tensorboard # tensorboardx # tensorstore # torchmetrics + # torchtune # torchvision # transformers # trimesh -nvidia-cublas-cu12==12.5.3.2 - # via - # jax-cuda12-plugin - # nvidia-cudnn-cu12 - # nvidia-cusolver-cu12 -nvidia-cuda-cupti-cu12==12.5.82 - # via jax-cuda12-plugin -nvidia-cuda-nvcc-cu12==12.5.82 - # via jax-cuda12-plugin -nvidia-cuda-runtime-cu12==12.5.82 - # via jax-cuda12-plugin -nvidia-cudnn-cu12==9.2.1.18 - # via jax-cuda12-plugin -nvidia-cufft-cu12==11.2.3.61 - # via jax-cuda12-plugin -nvidia-cusolver-cu12==11.6.3.83 - # via jax-cuda12-plugin -nvidia-cusparse-cu12==12.5.1.3 - # via - # jax-cuda12-plugin - # nvidia-cusolver-cu12 -nvidia-ml-py==12.555.43 - # via deepspeed -nvidia-nccl-cu12==2.22.3 - # via jax-cuda12-plugin -nvidia-nvjitlink-cu12==12.5.82 - # via - # jax-cuda12-plugin - # nvidia-cufft-cu12 - # nvidia-cusolver-cu12 - # nvidia-cusparse-cu12 + # xformers omegaconf==2.3.0 - # via voir -onnx==1.16.1 - # via -r benchmarks/dlrm/requirements.in + # via + # -r benchmarks/dinov2/requirements.in + # torchtune + # voir opencv-python==4.10.0.84 # via -r benchmarks/super-slomo/requirements.in opt-einsum==3.3.0 @@ -325,122 +309,125 @@ optax==0.2.3 # via # brax # flax -orbax-checkpoint==0.5.21 +optree==0.12.1 + # via envpool +orbax-checkpoint==0.6.0 # via # brax # flax -ovld==0.3.5 +ovld==0.3.8 # via voir packaging==24.1 # via # accelerate # datasets - # deepspeed - # evaluate + # envpool # huggingface-hub + # lightning # lightning-utilities + # pytorch-lightning + # tensorboard # tensorboardx # torchmetrics # transformers pandas==2.2.2 - # via - # datasets - # evaluate + # via datasets pillow==10.4.0 # via + # -r benchmarks/huggingface/requirements.in # brax + # diffusers + # fvcore # torchvision -protobuf==4.25.3 +portalocker==2.10.1 + # via iopath +protobuf==5.27.3 # via - # onnx # orbax-checkpoint # tensorboard # tensorboardx psutil==5.9.8 # via # accelerate - # deepspeed # voir ptera==1.4.1 # via voir -py-cpuinfo==9.0.0 - # via deepspeed pyarrow==17.0.0 # via datasets -pyarrow-hotfix==0.6 - # via datasets -pydantic==2.7.4 - # via deepspeed -pydantic-core==2.18.4 - # via pydantic -pydot==3.0.1 - # via -r benchmarks/dlrm/requirements.in +pycryptodomex==3.20.0 + # via blobfile pygments==2.18.0 # via rich pynvml==11.5.3 # via voir pyopengl==3.1.7 # via mujoco -pyparsing==3.1.2 - # via pydot -pyre-extensions==0.0.30 - # via 
torchx -pysocks==1.7.1 - # via requests python-dateutil==2.9.0.post0 # via pandas pytinyrenderer==0.0.14 # via brax -pytorch-triton-rocm==2.3.1 +pytorch-lightning==2.4.0 + # via lightning +pytorch-triton-rocm==3.0.0 # via torch pytz==2024.1 # via pandas -pyyaml==6.0.1 +pyyaml==6.0.2 # via + # -r benchmarks/llm/requirements.in # -r benchmarks/timm/requirements.in # accelerate # datasets # flax + # fvcore # huggingface-hub + # lightning # ml-collections # omegaconf # orbax-checkpoint - # torchx + # pytorch-lightning # transformers + # yacs reactivex==4.0.4 # via giving -regex==2024.5.15 - # via transformers -requests[socks]==2.32.3 +regex==2024.7.24 + # via + # diffusers + # tiktoken + # transformers +requests==2.32.3 # via # datasets - # docker - # evaluate - # gdown + # diffusers # huggingface-hub + # tiktoken # transformers rich==13.7.1 # via - # -r benchmarks/accelerate_opt/requirements.in # flax + # tyro # voir -safetensors==0.4.3 +safetensors==0.4.4 # via # -r benchmarks/timm/requirements.in # accelerate + # diffusers + # torchtune # transformers -scikit-learn==1.5.1 - # via -r benchmarks/dlrm/requirements.in scipy==1.14.0 # via + # -r benchmarks/dinov2/requirements.in # brax # jax # jaxlib # jaxopt # mujoco-mjx - # scikit-learn sentencepiece==0.2.0 - # via -r benchmarks/llama/requirements.in + # via + # -r benchmarks/llama/requirements.in + # torchtune +shtab==1.7.1 + # via tyro six==1.16.0 # via # asttokens @@ -448,104 +435,165 @@ six==1.16.0 # ml-collections # python-dateutil # tensorboard -soupsieve==2.5 - # via beautifulsoup4 -sympy==1.13.0 +submitit==1.5.1 + # via -r benchmarks/dinov2/requirements.in +sympy==1.13.2 # via torch tabulate==0.9.0 - # via torchx -tensorboard==2.17.0 - # via -r benchmarks/dlrm/requirements.in + # via fvcore +tensorboard==2.17.1 + # via -r benchmarks/torchatari/requirements.in tensorboard-data-server==0.7.2 # via tensorboard tensorboardx==2.6.2.2 # via brax -tensorstore==0.1.63 +tensorstore==0.1.64 # via # flax # orbax-checkpoint termcolor==2.4.0 - # via fire -threadpoolctl==3.5.0 - # via scikit-learn + # via + # fire + # fvcore +tiktoken==0.7.0 + # via torchtune tokenizers==0.19.1 # via transformers toolz==0.12.1 # via chex -tqdm==4.66.4 +torch==2.4.0+rocm6.0 # via - # -r benchmarks/dlrm/requirements.in + # -r benchmarks/brax/requirements.in + # -r benchmarks/dinov2/requirements.in + # -r benchmarks/flops/requirements.in + # -r benchmarks/huggingface/requirements.in + # -r benchmarks/lightning/requirements.in + # -r benchmarks/llama/requirements.in + # -r benchmarks/llm/requirements.in + # -r benchmarks/super-slomo/requirements.in + # -r benchmarks/timm/requirements.in + # -r benchmarks/torchatari/requirements.in + # -r benchmarks/torchvision/requirements.in + # -r benchmarks/torchvision_ddp/requirements.in + # accelerate + # diffusers + # fairscale + # lightning + # pytorch-lightning + # torchmetrics + # torchvision + # xformers +torchao==0.3.1 + # via torchtune +torchcompat==1.1.4 + # via + # -c .pin/../constraints/rocm.txt + # -r benchmarks/flops/requirements.in + # -r benchmarks/lightning/requirements.in + # -r benchmarks/torchatari/requirements.in + # -r benchmarks/torchvision/requirements.in + # -r benchmarks/torchvision_ddp/requirements.in +torchmetrics==1.4.1 + # via + # -r benchmarks/dinov2/requirements.in + # lightning + # pytorch-lightning +torchtune==0.2.1 + # via -r benchmarks/llm/requirements.in +torchvision==0.19.0+rocm6.0 + # via + # -r benchmarks/diffusion/requirements.in + # -r benchmarks/dinov2/requirements.in + # -r 
benchmarks/flops/requirements.in + # -r benchmarks/lightning/requirements.in + # -r benchmarks/super-slomo/requirements.in + # -r benchmarks/timm/requirements.in + # -r benchmarks/torchvision/requirements.in + # -r benchmarks/torchvision_ddp/requirements.in +tqdm==4.66.5 + # via + # -r benchmarks/diffusion/requirements.in # -r benchmarks/flops/requirements.in # -r benchmarks/super-slomo/requirements.in # -r benchmarks/torchvision/requirements.in # -r benchmarks/torchvision_ddp/requirements.in # datasets - # deepspeed - # evaluate - # gdown + # fvcore # huggingface-hub - # torchrec + # iopath + # lightning + # pytorch-lightning + # torchtune # transformers -transformers==4.42.4 +transformers==4.44.1 # via - # -r benchmarks/accelerate_opt/requirements.in + # -r benchmarks/diffusion/requirements.in # -r benchmarks/huggingface/requirements.in # -r benchmarks/llama/requirements.in -trimesh==4.4.3 +trimesh==4.4.7 # via # brax # mujoco-mjx +types-protobuf==5.27.0.20240626 + # via envpool typing-extensions==4.12.2 # via # brax # chex + # envpool # etils # flax + # gymnasium # huggingface-hub + # iopath + # lightning # lightning-utilities + # optree # orbax-checkpoint - # pydantic - # pydantic-core - # pyre-extensions + # pytorch-lightning # reactivex + # submitit # torch - # typing-inspect -typing-inspect==0.9.0 - # via pyre-extensions + # tyro +tyro==0.8.8 + # via -r benchmarks/torchatari/requirements.in tzdata==2024.1 # via pandas -urllib3==1.26.19 +urllib3==2.2.2 # via - # docker + # blobfile # requests - # torchx varname==0.10.0 # via giving voir==0.2.19 # via # -c .pin/../constraints/rocm.txt - # -r benchmarks/accelerate_opt/requirements.in # -r benchmarks/brax/requirements.in - # -r benchmarks/dlrm/requirements.in + # -r benchmarks/diffusion/requirements.in + # -r benchmarks/dinov2/requirements.in # -r benchmarks/flops/requirements.in # -r benchmarks/huggingface/requirements.in + # -r benchmarks/lightning/requirements.in # -r benchmarks/llama/requirements.in - # -r benchmarks/stargan/requirements.in + # -r benchmarks/llm/requirements.in # -r benchmarks/super-slomo/requirements.in # -r benchmarks/timm/requirements.in + # -r benchmarks/torchatari/requirements.in # -r benchmarks/torchvision/requirements.in # -r benchmarks/torchvision_ddp/requirements.in werkzeug==3.0.3 # via # flask # tensorboard -xxhash==3.4.1 - # via - # datasets - # evaluate +xformers==0.0.27.post2 + # via -r benchmarks/dinov2/requirements.in +xxhash==3.5.0 + # via datasets +yacs==0.1.8 + # via fvcore yarl==1.9.4 # via aiohttp -zipp==3.19.2 +zipp==3.20.0 # via # etils # importlib-metadata diff --git a/.pin/constraints-xpu-torch.txt b/.pin/constraints-xpu-torch.txt index 71a3d6f33..9e4276398 100644 --- a/.pin/constraints-xpu-torch.txt +++ b/.pin/constraints-xpu-torch.txt @@ -2,12 +2,10 @@ # This file is autogenerated by pip-compile with Python 3.10 # by the following command: # -# pip-compile --output-file=.pin/constraints-xpu-torch.txt .pin/tmp-constraints.txt benchmarks/accelerate_opt/requirements.in benchmarks/brax/requirements.in benchmarks/dlrm/requirements.in benchmarks/flops/requirements.in benchmarks/huggingface/requirements.in benchmarks/llama/requirements.in benchmarks/stargan/requirements.in benchmarks/super-slomo/requirements.in benchmarks/timm/requirements.in benchmarks/torchvision/requirements.in benchmarks/torchvision_ddp/requirements.in +# pip-compile --output-file=.pin/constraints-xpu-torch.txt .pin/tmp-constraints.txt benchmarks/brax/requirements.in benchmarks/diffusion/requirements.in 
benchmarks/dinov2/requirements.in benchmarks/flops/requirements.in benchmarks/huggingface/requirements.in benchmarks/lightning/requirements.in benchmarks/llama/requirements.in benchmarks/llm/requirements.in benchmarks/super-slomo/requirements.in benchmarks/timm/requirements.in benchmarks/torchvision/requirements.in benchmarks/torchvision_ddp/requirements.in constraints/extra/torch.xpu.txt # ---extra-index-url https://pypi.ngc.nvidia.com +--extra-index-url https://download.pytorch.org/whl/cpu --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ ---find-links https://storage.googleapis.com/jax-releases/jax_cuda_releases.html ---trusted-host pypi.ngc.nvidia.com absl-py==2.1.0 # via @@ -19,32 +17,37 @@ absl-py==2.1.0 # mujoco-mjx # optax # orbax-checkpoint - # tensorboard -accelerate==0.32.1 - # via -r benchmarks/accelerate_opt/requirements.in -aiohttp==3.9.5 +accelerate==0.33.0 + # via + # -r benchmarks/diffusion/requirements.in + # diffusers +aiohappyeyeballs==2.3.5 + # via aiohttp +aiohttp==3.10.2 # via # datasets # fsspec aiosignal==1.3.1 # via aiohttp -annotated-types==0.7.0 - # via pydantic antlr4-python3-runtime==4.9.3 # via omegaconf +argklass==1.4.4 + # via + # -r benchmarks/diffusion/requirements.in + # -r benchmarks/llm/requirements.in asttokens==2.4.1 # via giving async-timeout==4.0.3 # via aiohttp -attrs==23.2.0 +attrs==24.2.0 # via aiohttp -beautifulsoup4==4.12.3 - # via gdown blinker==1.8.2 # via flask +blobfile==2.1.1 + # via torchtune brax==0.10.5 # via -r benchmarks/brax/requirements.in -certifi==2024.6.2 +certifi==2024.7.4 # via requests charset-normalizer==3.3.2 # via requests @@ -53,31 +56,28 @@ chex==0.1.86 click==8.1.7 # via flask cloudpickle==3.0.0 - # via gym + # via + # gym + # submitit codefind==0.1.6 # via ptera contextlib2==21.6.0 # via ml-collections datasets==2.20.0 # via - # -r benchmarks/accelerate_opt/requirements.in + # -r benchmarks/diffusion/requirements.in # -r benchmarks/llama/requirements.in - # evaluate -deepspeed==0.14.4 - # via -r benchmarks/accelerate_opt/requirements.in + # torchtune +diffusers[torch]==0.30.0 + # via -r benchmarks/diffusion/requirements.in dill==0.3.8 # via # datasets - # evaluate # multiprocess dm-env==1.6 # via brax dm-tree==0.1.8 # via dm-env -docker==7.1.0 - # via torchx -docstring-parser==0.16 - # via torchx etils[epath,epy]==1.7.0 # via # brax @@ -85,21 +85,17 @@ etils[epath,epy]==1.7.0 # mujoco-mjx # optax # orbax-checkpoint -evaluate==0.4.2 - # via -r benchmarks/accelerate_opt/requirements.in executing==1.2.0 # via varname fairscale==0.4.13 # via -r benchmarks/llama/requirements.in -fbgemm-gpu==0.7.0 - # via torchrec filelock==3.15.4 # via + # blobfile # datasets - # gdown + # diffusers # huggingface-hub # torch - # torchx # transformers fire==0.6.0 # via -r benchmarks/llama/requirements.in @@ -119,53 +115,65 @@ fsspec[http]==2024.5.0 # via # datasets # etils - # evaluate # huggingface-hub + # lightning + # pytorch-lightning # torch - # torchx -future==1.0.0 - # via -r benchmarks/dlrm/requirements.in -gdown==5.2.0 - # via -r benchmarks/stargan/requirements.in +fvcore==0.1.5.post20221221 + # via -r benchmarks/dinov2/requirements.in giving==0.4.2 # via # ptera # voir glfw==2.7.0 # via mujoco -graphviz==0.20.3 - # via torchviz -grpcio==1.65.1 - # via - # brax - # tensorboard +grpcio==1.65.4 + # via brax gym==0.26.2 # via brax gym-notices==0.0.8 # via gym hjson==3.1.0 - # via deepspeed -huggingface-hub==0.24.0 + # via argklass +huggingface-hub==0.24.5 # via # -r benchmarks/timm/requirements.in # 
accelerate # datasets - # evaluate + # diffusers # tokenizers + # torchtune # transformers idna==3.7 # via # requests # yarl -importlib-metadata==8.0.0 - # via torchx +importlib-metadata==8.2.0 + # via diffusers importlib-resources==6.4.0 # via + # argklass # etils # torchcompat +intel-extension-for-openxla==0.3.0 + # via + # -c .pin/../constraints/xpu.txt + # -r constraints/extra/torch.xpu.txt +intel-extension-for-pytorch==2.3.100 + # via + # -c .pin/../constraints/xpu.txt + # -r constraints/extra/torch.xpu.txt +intel-extension-for-pytorch-deepspeed==2.1.40 + # via + # -c .pin/../constraints/xpu.txt + # -r constraints/extra/torch.xpu.txt +iopath==0.1.10 + # via + # -r benchmarks/dinov2/requirements.in + # fvcore itsdangerous==2.2.0 # via flask -jax[cuda12]==0.4.30 +jax==0.4.31 # via # -r benchmarks/brax/requirements.in # brax @@ -175,11 +183,7 @@ jax[cuda12]==0.4.30 # mujoco-mjx # optax # orbax-checkpoint -jax-cuda12-pjrt==0.4.30 - # via jax-cuda12-plugin -jax-cuda12-plugin[with-cuda]==0.4.30 - # via jax -jaxlib==0.4.30 +jaxlib==0.4.31 # via # brax # chex @@ -195,12 +199,15 @@ jinja2==3.1.4 # brax # flask # torch -joblib==1.4.2 - # via scikit-learn -lightning-utilities==0.11.5 - # via torchmetrics -markdown==3.6 - # via tensorboard +lightning==2.4.0 + # via -r benchmarks/lightning/requirements.in +lightning-utilities==0.11.6 + # via + # lightning + # pytorch-lightning + # torchmetrics +lxml==4.9.4 + # via blobfile markdown-it-py==3.0.0 # via rich markupsafe==2.1.5 @@ -222,100 +229,66 @@ msgpack==1.0.8 # via # flax # orbax-checkpoint -mujoco==3.2.0 +mujoco==3.2.2 # via # brax # mujoco-mjx -mujoco-mjx==3.2.0 +mujoco-mjx==3.2.2 # via brax multidict==6.0.5 # via # aiohttp # yarl multiprocess==0.70.16 - # via - # datasets - # evaluate -mypy-extensions==1.0.0 - # via typing-inspect + # via datasets nest-asyncio==1.6.0 # via orbax-checkpoint networkx==3.3 # via torch -ninja==1.11.1.1 - # via deepspeed numpy==1.26.4 # via - # -r benchmarks/dlrm/requirements.in - # -r benchmarks/stargan/requirements.in # -r benchmarks/super-slomo/requirements.in # accelerate # brax # chex # datasets - # deepspeed + # diffusers # dm-env - # evaluate # fairscale - # fbgemm-gpu # flax + # fvcore # gym + # intel-extension-for-openxla + # intel-extension-for-pytorch # jax # jaxlib # jaxopt # ml-dtypes # mujoco - # onnx # opencv-python # opt-einsum # optax # orbax-checkpoint # pandas # pyarrow - # scikit-learn # scipy - # tensorboard # tensorboardx # tensorstore # torchmetrics + # torchtune # torchvision # transformers # trimesh -nvidia-cublas-cu12==12.5.3.2 - # via - # jax-cuda12-plugin - # nvidia-cudnn-cu12 - # nvidia-cusolver-cu12 -nvidia-cuda-cupti-cu12==12.5.82 - # via jax-cuda12-plugin -nvidia-cuda-nvcc-cu12==12.5.82 - # via jax-cuda12-plugin -nvidia-cuda-runtime-cu12==12.5.82 - # via jax-cuda12-plugin -nvidia-cudnn-cu12==9.2.1.18 - # via jax-cuda12-plugin -nvidia-cufft-cu12==11.2.3.61 - # via jax-cuda12-plugin -nvidia-cusolver-cu12==11.6.3.83 - # via jax-cuda12-plugin -nvidia-cusparse-cu12==12.5.1.3 - # via - # jax-cuda12-plugin - # nvidia-cusolver-cu12 -nvidia-ml-py==12.555.43 - # via deepspeed -nvidia-nccl-cu12==2.22.3 - # via jax-cuda12-plugin -nvidia-nvjitlink-cu12==12.5.82 - # via - # jax-cuda12-plugin - # nvidia-cufft-cu12 - # nvidia-cusolver-cu12 - # nvidia-cusparse-cu12 + # xformers omegaconf==2.3.0 - # via voir -onnx==1.16.1 - # via -r benchmarks/dlrm/requirements.in + # via + # -r benchmarks/dinov2/requirements.in + # torchtune + # voir +oneccl-bind-pt==2.1.400+xpu + # via + # -c 
.pin/../constraints/xpu.txt + # -r constraints/extra/torch.xpu.txt opencv-python==4.10.0.84 # via -r benchmarks/super-slomo/requirements.in opt-einsum==3.3.0 @@ -324,221 +297,226 @@ optax==0.2.3 # via # brax # flax -orbax-checkpoint==0.5.21 +orbax-checkpoint==0.5.23 # via # brax # flax -ovld==0.3.5 +ovld==0.3.8 # via voir packaging==24.1 # via # accelerate # datasets - # deepspeed - # evaluate # huggingface-hub + # intel-extension-for-pytorch + # lightning # lightning-utilities + # pytorch-lightning # tensorboardx # torchmetrics # transformers pandas==2.2.2 - # via - # datasets - # evaluate + # via datasets pillow==10.4.0 # via + # -r benchmarks/huggingface/requirements.in # brax + # diffusers + # fvcore # torchvision -protobuf==4.25.3 +portalocker==2.10.1 + # via iopath +protobuf==5.27.3 # via - # onnx # orbax-checkpoint - # tensorboard # tensorboardx psutil==5.9.8 # via # accelerate - # deepspeed + # intel-extension-for-pytorch # voir ptera==1.4.1 # via voir -py-cpuinfo==9.0.0 - # via deepspeed pyarrow==17.0.0 # via datasets pyarrow-hotfix==0.6 # via datasets -pydantic==2.7.4 - # via deepspeed -pydantic-core==2.18.4 - # via pydantic -pydot==3.0.1 - # via -r benchmarks/dlrm/requirements.in +pycryptodomex==3.20.0 + # via blobfile pygments==2.18.0 # via rich pynvml==11.5.3 # via voir pyopengl==3.1.7 # via mujoco -pyparsing==3.1.2 - # via pydot -pyre-extensions==0.0.30 - # via torchx -pysocks==1.7.1 - # via requests python-dateutil==2.9.0.post0 # via pandas pytinyrenderer==0.0.14 # via brax +pytorch-lightning==2.4.0 + # via lightning pytz==2024.1 # via pandas -pyyaml==6.0.1 +pyyaml==6.0.2 # via + # -r benchmarks/llm/requirements.in # -r benchmarks/timm/requirements.in # accelerate # datasets # flax + # fvcore # huggingface-hub + # lightning # ml-collections # omegaconf # orbax-checkpoint - # torchx + # pytorch-lightning # transformers + # yacs reactivex==4.0.4 # via giving -regex==2024.5.15 - # via transformers -requests[socks]==2.32.3 +regex==2024.7.24 + # via + # diffusers + # tiktoken + # transformers +requests==2.32.3 # via # datasets - # docker - # evaluate - # gdown + # diffusers # huggingface-hub - # torchvision + # tiktoken # transformers rich==13.7.1 # via - # -r benchmarks/accelerate_opt/requirements.in # flax # voir -safetensors==0.4.3 +safetensors==0.4.4 # via # -r benchmarks/timm/requirements.in # accelerate + # diffusers + # torchtune # transformers -scikit-learn==1.5.1 - # via -r benchmarks/dlrm/requirements.in -scipy==1.14.0 +scipy==1.11.4 # via + # -r benchmarks/dinov2/requirements.in # brax + # intel-extension-for-openxla # jax # jaxlib # jaxopt # mujoco-mjx - # scikit-learn sentencepiece==0.2.0 - # via -r benchmarks/llama/requirements.in + # via + # -r benchmarks/llama/requirements.in + # torchtune six==1.16.0 # via # asttokens # fire # ml-collections # python-dateutil - # tensorboard -soupsieve==2.5 - # via beautifulsoup4 -sympy==1.13.0 +submitit==1.5.1 + # via -r benchmarks/dinov2/requirements.in +sympy==1.13.1 # via torch tabulate==0.9.0 - # via torchx -tensorboard==2.17.0 - # via -r benchmarks/dlrm/requirements.in -tensorboard-data-server==0.7.2 - # via tensorboard + # via fvcore tensorboardx==2.6.2.2 # via brax -tensorstore==0.1.63 +tensorstore==0.1.64 # via # flax # orbax-checkpoint termcolor==2.4.0 - # via fire -threadpoolctl==3.5.0 - # via scikit-learn + # via + # fire + # fvcore +tiktoken==0.7.0 + # via torchtune tokenizers==0.19.1 # via transformers toolz==0.12.1 # via chex -torch==2.1.0.post2+cxx11.abi +torch==2.4.0+cpu # via # -c .pin/../constraints/xpu.txt - # 
-r benchmarks/accelerate_opt/requirements.in # -r benchmarks/brax/requirements.in - # -r benchmarks/dlrm/requirements.in + # -r benchmarks/dinov2/requirements.in # -r benchmarks/flops/requirements.in # -r benchmarks/huggingface/requirements.in + # -r benchmarks/lightning/requirements.in # -r benchmarks/llama/requirements.in - # -r benchmarks/stargan/requirements.in + # -r benchmarks/llm/requirements.in # -r benchmarks/super-slomo/requirements.in # -r benchmarks/timm/requirements.in # -r benchmarks/torchvision/requirements.in # -r benchmarks/torchvision_ddp/requirements.in + # -r constraints/extra/torch.xpu.txt # accelerate - # deepspeed + # diffusers # fairscale + # lightning + # pytorch-lightning # torchaudio # torchmetrics # torchvision - # torchviz -torchaudio==2.1.0.post2+cxx11.abi + # xformers +torchao==0.3.1+cpu + # via torchtune +torchaudio==2.4.0+cpu # via # -c .pin/../constraints/xpu.txt - # -r benchmarks/accelerate_opt/requirements.in + # -r constraints/extra/torch.xpu.txt torchcompat==1.1.4 # via # -c .pin/../constraints/xpu.txt # -r benchmarks/flops/requirements.in + # -r benchmarks/lightning/requirements.in # -r benchmarks/torchvision/requirements.in # -r benchmarks/torchvision_ddp/requirements.in -torchmetrics==1.0.3 - # via torchrec -torchrec==0.7.0 - # via -r benchmarks/dlrm/requirements.in -torchvision==0.16.0.post2+cxx11.abi + # -r constraints/extra/torch.xpu.txt +torchmetrics==1.4.1 + # via + # -r benchmarks/dinov2/requirements.in + # lightning + # pytorch-lightning +torchtune==0.2.1+cpu + # via -r benchmarks/llm/requirements.in +torchvision==0.19.0+cpu # via # -c .pin/../constraints/xpu.txt - # -r benchmarks/accelerate_opt/requirements.in + # -r benchmarks/diffusion/requirements.in + # -r benchmarks/dinov2/requirements.in # -r benchmarks/flops/requirements.in - # -r benchmarks/stargan/requirements.in + # -r benchmarks/lightning/requirements.in # -r benchmarks/super-slomo/requirements.in # -r benchmarks/timm/requirements.in # -r benchmarks/torchvision/requirements.in # -r benchmarks/torchvision_ddp/requirements.in -torchviz==0.0.2 - # via -r benchmarks/dlrm/requirements.in -torchx==0.7.0 - # via -r benchmarks/dlrm/requirements.in -tqdm==4.66.4 + # -r constraints/extra/torch.xpu.txt +tqdm==4.66.5 # via - # -r benchmarks/dlrm/requirements.in + # -r benchmarks/diffusion/requirements.in # -r benchmarks/flops/requirements.in # -r benchmarks/super-slomo/requirements.in # -r benchmarks/torchvision/requirements.in # -r benchmarks/torchvision_ddp/requirements.in # datasets - # deepspeed - # evaluate - # gdown + # fvcore # huggingface-hub - # torchrec + # iopath + # lightning + # pytorch-lightning + # torchtune # transformers -transformers==4.42.4 +transformers==4.44.0 # via - # -r benchmarks/accelerate_opt/requirements.in + # -r benchmarks/diffusion/requirements.in # -r benchmarks/huggingface/requirements.in # -r benchmarks/llama/requirements.in -trimesh==4.4.3 +trimesh==4.4.4 # via # brax # mujoco-mjx @@ -549,47 +527,48 @@ typing-extensions==4.12.2 # etils # flax # huggingface-hub + # iopath + # lightning # lightning-utilities # orbax-checkpoint - # pydantic - # pydantic-core - # pyre-extensions + # pytorch-lightning # reactivex + # submitit # torch - # typing-inspect -typing-inspect==0.9.0 - # via pyre-extensions tzdata==2024.1 # via pandas -urllib3==1.26.19 +urllib3==2.2.2 # via - # docker + # blobfile # requests - # torchx varname==0.10.0 # via giving voir==0.2.19 # via # -c .pin/../constraints/xpu.txt - # -r benchmarks/accelerate_opt/requirements.in # -r 
benchmarks/brax/requirements.in - # -r benchmarks/dlrm/requirements.in + # -r benchmarks/diffusion/requirements.in + # -r benchmarks/dinov2/requirements.in # -r benchmarks/flops/requirements.in # -r benchmarks/huggingface/requirements.in + # -r benchmarks/lightning/requirements.in # -r benchmarks/llama/requirements.in - # -r benchmarks/stargan/requirements.in + # -r benchmarks/llm/requirements.in # -r benchmarks/super-slomo/requirements.in # -r benchmarks/timm/requirements.in # -r benchmarks/torchvision/requirements.in # -r benchmarks/torchvision_ddp/requirements.in + # -r constraints/extra/torch.xpu.txt werkzeug==3.0.3 - # via - # flask - # tensorboard + # via flask +wheel==0.44.0 + # via intel-extension-for-openxla +xformers==0.0.27.post2 + # via -r benchmarks/dinov2/requirements.in xxhash==3.4.1 - # via - # datasets - # evaluate + # via datasets +yacs==0.1.8 + # via fvcore yarl==1.9.4 # via aiohttp zipp==3.19.2 diff --git a/README.md b/README.md index 5531a1253..526398938 100644 --- a/README.md +++ b/README.md @@ -39,7 +39,7 @@ It will include all of the necessary data docker run -it --rm --ipc=host --gpus=all \ -v $(pwd)/results:/milabench/envs/runs \ $MILABENCH_IMAGE \ - milabench run + bash -c "milabench prepare && milabench run" ================= Benchmark results diff --git a/benchmarks/brax/requirements.cuda.txt b/benchmarks/brax/requirements.cuda.txt index db4ddb1c7..21e15c3bc 100644 --- a/benchmarks/brax/requirements.cuda.txt +++ b/benchmarks/brax/requirements.cuda.txt @@ -90,11 +90,11 @@ flask-cors==4.0.1 # via # -c .pin/../.pin/constraints-cuda-torch.txt # brax -flax==0.8.5 +flax==0.9.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # brax -fsspec==2024.5.0 +fsspec==2024.6.1 # via # -c .pin/../.pin/constraints-cuda-torch.txt # etils @@ -108,11 +108,11 @@ glfw==2.7.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # mujoco -grpcio==1.65.2 +grpcio==1.66.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # brax -gym==0.26.2 +gym==0.23.1 # via # -c .pin/../.pin/constraints-cuda-torch.txt # brax @@ -120,7 +120,11 @@ gym-notices==0.0.8 # via # -c .pin/../.pin/constraints-cuda-torch.txt # gym -importlib-resources==6.4.0 +humanize==4.10.0 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # orbax-checkpoint +importlib-resources==6.4.4 # via # -c .pin/../.pin/constraints-cuda-torch.txt # etils @@ -128,7 +132,7 @@ itsdangerous==2.2.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # flask -jax[cuda12]==0.4.31 +jax==0.4.31 # via # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/brax/requirements.in @@ -139,14 +143,6 @@ jax[cuda12]==0.4.31 # mujoco-mjx # optax # orbax-checkpoint -jax-cuda12-pjrt==0.4.31 - # via - # -c .pin/../.pin/constraints-cuda-torch.txt - # jax-cuda12-plugin -jax-cuda12-plugin[with-cuda]==0.4.31 - # via - # -c .pin/../.pin/constraints-cuda-torch.txt - # jax jaxlib==0.4.31 # via # -c .pin/../.pin/constraints-cuda-torch.txt @@ -199,12 +195,12 @@ msgpack==1.0.8 # -c .pin/../.pin/constraints-cuda-torch.txt # flax # orbax-checkpoint -mujoco==3.2.0 +mujoco==3.2.2 # via # -c .pin/../.pin/constraints-cuda-torch.txt # brax # mujoco-mjx -mujoco-mjx==3.2.0 +mujoco-mjx==3.2.2 # via # -c .pin/../.pin/constraints-cuda-torch.txt # brax @@ -222,7 +218,6 @@ numpy==1.26.4 # brax # chex # dm-env - # flax # gym # jax # jaxlib @@ -239,19 +234,13 @@ numpy==1.26.4 nvidia-cublas-cu12==12.1.3.1 # via # -c .pin/../.pin/constraints-cuda-torch.txt - # jax-cuda12-plugin # nvidia-cudnn-cu12 # nvidia-cusolver-cu12 # torch nvidia-cuda-cupti-cu12==12.1.105 # via # -c 
.pin/../.pin/constraints-cuda-torch.txt - # jax-cuda12-plugin # torch -nvidia-cuda-nvcc-cu12==12.6.20 - # via - # -c .pin/../.pin/constraints-cuda-torch.txt - # jax-cuda12-plugin nvidia-cuda-nvrtc-cu12==12.1.105 # via # -c .pin/../.pin/constraints-cuda-torch.txt @@ -259,17 +248,14 @@ nvidia-cuda-nvrtc-cu12==12.1.105 nvidia-cuda-runtime-cu12==12.1.105 # via # -c .pin/../.pin/constraints-cuda-torch.txt - # jax-cuda12-plugin # torch nvidia-cudnn-cu12==9.1.0.70 # via # -c .pin/../.pin/constraints-cuda-torch.txt - # jax-cuda12-plugin # torch nvidia-cufft-cu12==11.0.2.54 # via # -c .pin/../.pin/constraints-cuda-torch.txt - # jax-cuda12-plugin # torch nvidia-curand-cu12==10.3.2.106 # via @@ -278,23 +264,23 @@ nvidia-curand-cu12==10.3.2.106 nvidia-cusolver-cu12==11.4.5.107 # via # -c .pin/../.pin/constraints-cuda-torch.txt - # jax-cuda12-plugin # torch nvidia-cusparse-cu12==12.1.0.106 # via # -c .pin/../.pin/constraints-cuda-torch.txt - # jax-cuda12-plugin # nvidia-cusolver-cu12 # torch +nvidia-ml-py==12.560.30 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # voir nvidia-nccl-cu12==2.20.5 # via # -c .pin/../.pin/constraints-cuda-torch.txt - # jax-cuda12-plugin # torch nvidia-nvjitlink-cu12==12.6.20 # via # -c .pin/../.pin/constraints-cuda-torch.txt - # jax-cuda12-plugin # nvidia-cusolver-cu12 # nvidia-cusparse-cu12 nvidia-nvtx-cu12==12.1.105 @@ -314,12 +300,12 @@ optax==0.2.3 # -c .pin/../.pin/constraints-cuda-torch.txt # brax # flax -orbax-checkpoint==0.5.23 +orbax-checkpoint==0.6.1 # via # -c .pin/../.pin/constraints-cuda-torch.txt # brax # flax -ovld==0.3.6 +ovld==0.3.9 # via # -c .pin/../.pin/constraints-cuda-torch.txt # voir @@ -348,10 +334,6 @@ pygments==2.18.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # rich -pynvml==11.5.3 - # via - # -c .pin/../.pin/constraints-cuda-torch.txt - # voir pyopengl==3.1.7 # via # -c .pin/../.pin/constraints-cuda-torch.txt @@ -360,7 +342,7 @@ pytinyrenderer==0.0.14 # via # -c .pin/../.pin/constraints-cuda-torch.txt # brax -pyyaml==6.0.1 +pyyaml==6.0.2 # via # -c .pin/../.pin/constraints-cuda-torch.txt # flax @@ -371,12 +353,12 @@ reactivex==4.0.4 # via # -c .pin/../.pin/constraints-cuda-torch.txt # giving -rich==13.7.1 +rich==13.8.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # flax # voir -scipy==1.14.0 +scipy==1.14.1 # via # -c .pin/../.pin/constraints-cuda-torch.txt # brax @@ -389,7 +371,7 @@ six==1.16.0 # -c .pin/../.pin/constraints-cuda-torch.txt # asttokens # ml-collections -sympy==1.13.1 +sympy==1.13.2 # via # -c .pin/../.pin/constraints-cuda-torch.txt # torch @@ -397,7 +379,7 @@ tensorboardx==2.6.2.2 # via # -c .pin/../.pin/constraints-cuda-torch.txt # brax -tensorstore==0.1.63 +tensorstore==0.1.64 # via # -c .pin/../.pin/constraints-cuda-torch.txt # flax @@ -410,7 +392,7 @@ torch==2.4.0+cu121 # via # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/brax/requirements.in -trimesh==4.4.3 +trimesh==4.4.7 # via # -c .pin/../.pin/constraints-cuda-torch.txt # brax @@ -438,11 +420,11 @@ voir==0.2.19 # -c .pin/../.pin/constraints-cuda-torch.txt # -c .pin/../constraints/cuda.txt # -r benchmarks/brax/requirements.in -werkzeug==3.0.3 +werkzeug==3.0.4 # via # -c .pin/../.pin/constraints-cuda-torch.txt # flask -zipp==3.19.2 +zipp==3.20.1 # via # -c .pin/../.pin/constraints-cuda-torch.txt # etils diff --git a/benchmarks/brax/requirements.in b/benchmarks/brax/requirements.in index 8221238cf..cb8584f98 100644 --- a/benchmarks/brax/requirements.in +++ b/benchmarks/brax/requirements.in @@ -1,5 +1,4 @@ -jax[cuda12] ---find-links 
https://storage.googleapis.com/jax-releases/jax_cuda_releases.html +jax torch brax voir>=0.2.19,<0.3 diff --git a/benchmarks/brax/requirements.rocm.txt b/benchmarks/brax/requirements.rocm.txt index 22646c6c3..0c14e04d9 100644 --- a/benchmarks/brax/requirements.rocm.txt +++ b/benchmarks/brax/requirements.rocm.txt @@ -4,10 +4,7 @@ # # pip-compile --output-file=benchmarks/brax/requirements.rocm.txt .pin/tmp-constraints-rocm-brax.txt benchmarks/brax/requirements.in # ---extra-index-url https://pypi.ngc.nvidia.com --extra-index-url https://download.pytorch.org/whl/rocm6.0 ---find-links https://storage.googleapis.com/jax-releases/jax_cuda_releases.html ---trusted-host pypi.ngc.nvidia.com absl-py==2.1.0 # via @@ -94,7 +91,7 @@ flax==0.8.5 # via # -c .pin/../.pin/constraints-rocm-torch.txt # brax -fsspec==2024.5.0 +fsspec==2024.6.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # etils @@ -108,11 +105,11 @@ glfw==2.7.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # mujoco -grpcio==1.65.1 +grpcio==1.65.5 # via # -c .pin/../.pin/constraints-rocm-torch.txt # brax -gym==0.26.2 +gym==0.23.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # brax @@ -120,7 +117,11 @@ gym-notices==0.0.8 # via # -c .pin/../.pin/constraints-rocm-torch.txt # gym -importlib-resources==6.4.0 +humanize==4.10.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # orbax-checkpoint +importlib-resources==6.4.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # etils @@ -128,7 +129,7 @@ itsdangerous==2.2.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # flask -jax[cuda12]==0.4.30 +jax==0.4.31 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/brax/requirements.in @@ -139,15 +140,7 @@ jax[cuda12]==0.4.30 # mujoco-mjx # optax # orbax-checkpoint -jax-cuda12-pjrt==0.4.30 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # jax-cuda12-plugin -jax-cuda12-plugin[with-cuda]==0.4.30 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # jax -jaxlib==0.4.30 +jaxlib==0.4.31 # via # -c .pin/../.pin/constraints-rocm-torch.txt # brax @@ -199,12 +192,12 @@ msgpack==1.0.8 # -c .pin/../.pin/constraints-rocm-torch.txt # flax # orbax-checkpoint -mujoco==3.2.0 +mujoco==3.2.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # brax # mujoco-mjx -mujoco-mjx==3.2.0 +mujoco-mjx==3.2.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # brax @@ -236,52 +229,6 @@ numpy==1.26.4 # tensorboardx # tensorstore # trimesh -nvidia-cublas-cu12==12.5.3.2 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # jax-cuda12-plugin - # nvidia-cudnn-cu12 - # nvidia-cusolver-cu12 -nvidia-cuda-cupti-cu12==12.5.82 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # jax-cuda12-plugin -nvidia-cuda-nvcc-cu12==12.5.82 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # jax-cuda12-plugin -nvidia-cuda-runtime-cu12==12.5.82 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # jax-cuda12-plugin -nvidia-cudnn-cu12==9.2.1.18 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # jax-cuda12-plugin -nvidia-cufft-cu12==11.2.3.61 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # jax-cuda12-plugin -nvidia-cusolver-cu12==11.6.3.83 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # jax-cuda12-plugin -nvidia-cusparse-cu12==12.5.1.3 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # jax-cuda12-plugin - # nvidia-cusolver-cu12 -nvidia-nccl-cu12==2.22.3 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # jax-cuda12-plugin -nvidia-nvjitlink-cu12==12.5.82 - # via - # -c 
.pin/../.pin/constraints-rocm-torch.txt - # jax-cuda12-plugin - # nvidia-cufft-cu12 - # nvidia-cusolver-cu12 - # nvidia-cusparse-cu12 omegaconf==2.3.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt @@ -295,12 +242,12 @@ optax==0.2.3 # -c .pin/../.pin/constraints-rocm-torch.txt # brax # flax -orbax-checkpoint==0.5.21 +orbax-checkpoint==0.6.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # brax # flax -ovld==0.3.5 +ovld==0.3.8 # via # -c .pin/../.pin/constraints-rocm-torch.txt # voir @@ -312,7 +259,7 @@ pillow==10.4.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # brax -protobuf==4.25.3 +protobuf==5.27.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # orbax-checkpoint @@ -341,11 +288,11 @@ pytinyrenderer==0.0.14 # via # -c .pin/../.pin/constraints-rocm-torch.txt # brax -pytorch-triton-rocm==2.3.1 +pytorch-triton-rocm==3.0.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torch -pyyaml==6.0.1 +pyyaml==6.0.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # flax @@ -374,7 +321,7 @@ six==1.16.0 # -c .pin/../.pin/constraints-rocm-torch.txt # asttokens # ml-collections -sympy==1.13.0 +sympy==1.13.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torch @@ -382,7 +329,7 @@ tensorboardx==2.6.2.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # brax -tensorstore==0.1.63 +tensorstore==0.1.64 # via # -c .pin/../.pin/constraints-rocm-torch.txt # flax @@ -391,11 +338,11 @@ toolz==0.12.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # chex -torch==2.3.1+rocm6.0 +torch==2.4.0+rocm6.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/brax/requirements.in -trimesh==4.4.3 +trimesh==4.4.7 # via # -c .pin/../.pin/constraints-rocm-torch.txt # brax @@ -423,7 +370,7 @@ werkzeug==3.0.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # flask -zipp==3.19.2 +zipp==3.20.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # etils diff --git a/benchmarks/brax/requirements.xpu.txt b/benchmarks/brax/requirements.xpu.txt index 7a2405d7d..5e7dbe294 100644 --- a/benchmarks/brax/requirements.xpu.txt +++ b/benchmarks/brax/requirements.xpu.txt @@ -4,10 +4,8 @@ # # pip-compile --output-file=benchmarks/brax/requirements.xpu.txt .pin/tmp-constraints-xpu-brax.txt benchmarks/brax/requirements.in # ---extra-index-url https://pypi.ngc.nvidia.com +--extra-index-url https://download.pytorch.org/whl/cpu --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ ---find-links https://storage.googleapis.com/jax-releases/jax_cuda_releases.html ---trusted-host pypi.ngc.nvidia.com absl-py==2.1.0 # via @@ -107,7 +105,7 @@ glfw==2.7.0 # via # -c .pin/../.pin/constraints-xpu-torch.txt # mujoco -grpcio==1.65.1 +grpcio==1.65.4 # via # -c .pin/../.pin/constraints-xpu-torch.txt # brax @@ -127,7 +125,7 @@ itsdangerous==2.2.0 # via # -c .pin/../.pin/constraints-xpu-torch.txt # flask -jax[cuda12]==0.4.30 +jax==0.4.31 # via # -c .pin/../.pin/constraints-xpu-torch.txt # -r benchmarks/brax/requirements.in @@ -138,15 +136,7 @@ jax[cuda12]==0.4.30 # mujoco-mjx # optax # orbax-checkpoint -jax-cuda12-pjrt==0.4.30 - # via - # -c .pin/../.pin/constraints-xpu-torch.txt - # jax-cuda12-plugin -jax-cuda12-plugin[with-cuda]==0.4.30 - # via - # -c .pin/../.pin/constraints-xpu-torch.txt - # jax -jaxlib==0.4.30 +jaxlib==0.4.31 # via # -c .pin/../.pin/constraints-xpu-torch.txt # brax @@ -198,12 +188,12 @@ msgpack==1.0.8 # -c .pin/../.pin/constraints-xpu-torch.txt # flax # orbax-checkpoint -mujoco==3.2.0 +mujoco==3.2.2 # via # -c .pin/../.pin/constraints-xpu-torch.txt # brax # mujoco-mjx 
-mujoco-mjx==3.2.0 +mujoco-mjx==3.2.2 # via # -c .pin/../.pin/constraints-xpu-torch.txt # brax @@ -235,52 +225,6 @@ numpy==1.26.4 # tensorboardx # tensorstore # trimesh -nvidia-cublas-cu12==12.5.3.2 - # via - # -c .pin/../.pin/constraints-xpu-torch.txt - # jax-cuda12-plugin - # nvidia-cudnn-cu12 - # nvidia-cusolver-cu12 -nvidia-cuda-cupti-cu12==12.5.82 - # via - # -c .pin/../.pin/constraints-xpu-torch.txt - # jax-cuda12-plugin -nvidia-cuda-nvcc-cu12==12.5.82 - # via - # -c .pin/../.pin/constraints-xpu-torch.txt - # jax-cuda12-plugin -nvidia-cuda-runtime-cu12==12.5.82 - # via - # -c .pin/../.pin/constraints-xpu-torch.txt - # jax-cuda12-plugin -nvidia-cudnn-cu12==9.2.1.18 - # via - # -c .pin/../.pin/constraints-xpu-torch.txt - # jax-cuda12-plugin -nvidia-cufft-cu12==11.2.3.61 - # via - # -c .pin/../.pin/constraints-xpu-torch.txt - # jax-cuda12-plugin -nvidia-cusolver-cu12==11.6.3.83 - # via - # -c .pin/../.pin/constraints-xpu-torch.txt - # jax-cuda12-plugin -nvidia-cusparse-cu12==12.5.1.3 - # via - # -c .pin/../.pin/constraints-xpu-torch.txt - # jax-cuda12-plugin - # nvidia-cusolver-cu12 -nvidia-nccl-cu12==2.22.3 - # via - # -c .pin/../.pin/constraints-xpu-torch.txt - # jax-cuda12-plugin -nvidia-nvjitlink-cu12==12.5.82 - # via - # -c .pin/../.pin/constraints-xpu-torch.txt - # jax-cuda12-plugin - # nvidia-cufft-cu12 - # nvidia-cusolver-cu12 - # nvidia-cusparse-cu12 omegaconf==2.3.0 # via # -c .pin/../.pin/constraints-xpu-torch.txt @@ -294,12 +238,12 @@ optax==0.2.3 # -c .pin/../.pin/constraints-xpu-torch.txt # brax # flax -orbax-checkpoint==0.5.21 +orbax-checkpoint==0.5.23 # via # -c .pin/../.pin/constraints-xpu-torch.txt # brax # flax -ovld==0.3.5 +ovld==0.3.8 # via # -c .pin/../.pin/constraints-xpu-torch.txt # voir @@ -311,7 +255,7 @@ pillow==10.4.0 # via # -c .pin/../.pin/constraints-xpu-torch.txt # brax -protobuf==4.25.3 +protobuf==5.27.3 # via # -c .pin/../.pin/constraints-xpu-torch.txt # orbax-checkpoint @@ -340,7 +284,7 @@ pytinyrenderer==0.0.14 # via # -c .pin/../.pin/constraints-xpu-torch.txt # brax -pyyaml==6.0.1 +pyyaml==6.0.2 # via # -c .pin/../.pin/constraints-xpu-torch.txt # flax @@ -356,7 +300,7 @@ rich==13.7.1 # -c .pin/../.pin/constraints-xpu-torch.txt # flax # voir -scipy==1.14.0 +scipy==1.11.4 # via # -c .pin/../.pin/constraints-xpu-torch.txt # brax @@ -369,7 +313,7 @@ six==1.16.0 # -c .pin/../.pin/constraints-xpu-torch.txt # asttokens # ml-collections -sympy==1.13.0 +sympy==1.13.1 # via # -c .pin/../.pin/constraints-xpu-torch.txt # torch @@ -377,7 +321,7 @@ tensorboardx==2.6.2.2 # via # -c .pin/../.pin/constraints-xpu-torch.txt # brax -tensorstore==0.1.63 +tensorstore==0.1.64 # via # -c .pin/../.pin/constraints-xpu-torch.txt # flax @@ -386,12 +330,13 @@ toolz==0.12.1 # via # -c .pin/../.pin/constraints-xpu-torch.txt # chex -torch==2.1.0.post2+cxx11.abi +torch==2.4.0+cpu # via # -c .pin/../.pin/constraints-xpu-torch.txt + # -c .pin/../constraints/extra/torch.xpu.txt # -c .pin/../constraints/xpu.txt # -r benchmarks/brax/requirements.in -trimesh==4.4.3 +trimesh==4.4.4 # via # -c .pin/../.pin/constraints-xpu-torch.txt # brax @@ -413,6 +358,7 @@ varname==0.10.0 voir==0.2.19 # via # -c .pin/../.pin/constraints-xpu-torch.txt + # -c .pin/../constraints/extra/torch.xpu.txt # -c .pin/../constraints/xpu.txt # -r benchmarks/brax/requirements.in werkzeug==3.0.3 diff --git a/benchmarks/diffusion/main.py b/benchmarks/diffusion/main.py index bd6668dab..2b4fe9bfd 100644 --- a/benchmarks/diffusion/main.py +++ b/benchmarks/diffusion/main.py @@ -4,8 +4,6 @@ import math import random -from 
contextlib import nullcontext -from pathlib import Path import numpy as np import torch @@ -14,7 +12,6 @@ from accelerate import Accelerator from datasets import load_dataset from torchvision import transforms -from tqdm.auto import tqdm from transformers import CLIPTextModel, CLIPTokenizer from diffusers import AutoencoderKL, DDPMScheduler, UNet2DConditionModel @@ -44,6 +41,7 @@ class Arguments: lr_scheduler: str = "constant" lr_warmup_steps: int = 500 epochs: int = 10 + cache: str = None def models(accelerator, args: Arguments): diff --git a/benchmarks/diffusion/prepare.py b/benchmarks/diffusion/prepare.py index be7de0312..ed9e3f333 100755 --- a/benchmarks/diffusion/prepare.py +++ b/benchmarks/diffusion/prepare.py @@ -2,10 +2,6 @@ from dataclasses import dataclass import os -from transformers import CLIPTextModel, CLIPTokenizer - -from diffusers import AutoencoderKL, UNet2DConditionModel, DDPMScheduler -from datasets import load_dataset @dataclass @@ -14,6 +10,7 @@ class TrainingConfig: dataset: str = "lambdalabs/naruto-blip-captions" revision: str = None variant: str = None + cache: str = None def main(): @@ -22,6 +19,16 @@ def main(): parser = ArgumentParser() parser.add_arguments(TrainingConfig) args, _ = parser.parse_known_args() + # -- + + if args.cache: + os.environ["XDG_CACHE_HOME"] = str(args.cache) + + # -- + from transformers import CLIPTextModel, CLIPTokenizer + from diffusers import AutoencoderKL, UNet2DConditionModel, DDPMScheduler + from datasets import load_dataset + _ = load_dataset(args.dataset) diff --git a/benchmarks/diffusion/requirements.cuda.txt b/benchmarks/diffusion/requirements.cuda.txt index 250051be7..74ce5bd0a 100644 --- a/benchmarks/diffusion/requirements.cuda.txt +++ b/benchmarks/diffusion/requirements.cuda.txt @@ -14,11 +14,11 @@ accelerate==0.33.0 # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/diffusion/requirements.in # diffusers -aiohappyeyeballs==2.3.4 +aiohappyeyeballs==2.4.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # aiohttp -aiohttp==3.10.0 +aiohttp==3.10.5 # via # -c .pin/../.pin/constraints-cuda-torch.txt # datasets @@ -43,7 +43,7 @@ async-timeout==4.0.3 # via # -c .pin/../.pin/constraints-cuda-torch.txt # aiohttp -attrs==23.2.0 +attrs==24.2.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # aiohttp @@ -59,11 +59,11 @@ codefind==0.1.6 # via # -c .pin/../.pin/constraints-cuda-torch.txt # ptera -datasets==2.20.0 +datasets==2.21.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/diffusion/requirements.in -diffusers[torch]==0.29.2 +diffusers[torch]==0.30.1 # via # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/diffusion/requirements.in @@ -90,7 +90,7 @@ frozenlist==1.4.1 # -c .pin/../.pin/constraints-cuda-torch.txt # aiohttp # aiosignal -fsspec[http]==2024.5.0 +fsspec[http]==2024.6.1 # via # -c .pin/../.pin/constraints-cuda-torch.txt # datasets @@ -105,7 +105,7 @@ hjson==3.1.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # argklass -huggingface-hub==0.24.5 +huggingface-hub==0.24.6 # via # -c .pin/../.pin/constraints-cuda-torch.txt # accelerate @@ -113,16 +113,16 @@ huggingface-hub==0.24.5 # diffusers # tokenizers # transformers -idna==3.7 +idna==3.8 # via # -c .pin/../.pin/constraints-cuda-torch.txt # requests # yarl -importlib-metadata==8.2.0 +importlib-metadata==8.4.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # diffusers -importlib-resources==6.4.0 +importlib-resources==6.4.4 # via # -c .pin/../.pin/constraints-cuda-torch.txt # argklass @@ -208,6 +208,10 @@ 
nvidia-cusparse-cu12==12.1.0.106 # -c .pin/../.pin/constraints-cuda-torch.txt # nvidia-cusolver-cu12 # torch +nvidia-ml-py==12.560.30 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # voir nvidia-nccl-cu12==2.20.5 # via # -c .pin/../.pin/constraints-cuda-torch.txt @@ -225,7 +229,7 @@ omegaconf==2.3.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # voir -ovld==0.3.6 +ovld==0.3.9 # via # -c .pin/../.pin/constraints-cuda-torch.txt # voir @@ -258,18 +262,10 @@ pyarrow==17.0.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # datasets -pyarrow-hotfix==0.6 - # via - # -c .pin/../.pin/constraints-cuda-torch.txt - # datasets pygments==2.18.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # rich -pynvml==11.5.3 - # via - # -c .pin/../.pin/constraints-cuda-torch.txt - # voir python-dateutil==2.9.0.post0 # via # -c .pin/../.pin/constraints-cuda-torch.txt @@ -278,7 +274,7 @@ pytz==2024.1 # via # -c .pin/../.pin/constraints-cuda-torch.txt # pandas -pyyaml==6.0.1 +pyyaml==6.0.2 # via # -c .pin/../.pin/constraints-cuda-torch.txt # accelerate @@ -302,11 +298,11 @@ requests==2.32.3 # diffusers # huggingface-hub # transformers -rich==13.7.1 +rich==13.8.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # voir -safetensors==0.4.3 +safetensors==0.4.4 # via # -c .pin/../.pin/constraints-cuda-torch.txt # accelerate @@ -317,7 +313,7 @@ six==1.16.0 # -c .pin/../.pin/constraints-cuda-torch.txt # asttokens # python-dateutil -sympy==1.13.1 +sympy==1.13.2 # via # -c .pin/../.pin/constraints-cuda-torch.txt # torch @@ -335,14 +331,14 @@ torchvision==0.19.0+cu121 # via # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/diffusion/requirements.in -tqdm==4.66.4 +tqdm==4.66.5 # via # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/diffusion/requirements.in # datasets # huggingface-hub # transformers -transformers==4.43.3 +transformers==4.44.2 # via # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/diffusion/requirements.in @@ -373,7 +369,7 @@ voir==0.2.19 # -c .pin/../.pin/constraints-cuda-torch.txt # -c .pin/../constraints/cuda.txt # -r benchmarks/diffusion/requirements.in -xxhash==3.4.1 +xxhash==3.5.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # datasets @@ -381,7 +377,7 @@ yarl==1.9.4 # via # -c .pin/../.pin/constraints-cuda-torch.txt # aiohttp -zipp==3.19.2 +zipp==3.20.1 # via # -c .pin/../.pin/constraints-cuda-torch.txt # importlib-metadata diff --git a/benchmarks/diffusion/requirements.rocm.txt b/benchmarks/diffusion/requirements.rocm.txt new file mode 100644 index 000000000..5d0fd6e3f --- /dev/null +++ b/benchmarks/diffusion/requirements.rocm.txt @@ -0,0 +1,328 @@ +# +# This file is autogenerated by pip-compile with Python 3.10 +# by the following command: +# +# pip-compile --output-file=benchmarks/diffusion/requirements.rocm.txt .pin/tmp-constraints-rocm-diffusion-nodes.txt benchmarks/diffusion/requirements.in +# +--extra-index-url https://download.pytorch.org/whl/rocm6.0 + +accelerate==0.33.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/diffusion/requirements.in + # diffusers +aiohappyeyeballs==2.4.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # aiohttp +aiohttp==3.10.5 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # datasets + # fsspec +aiosignal==1.3.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # aiohttp +antlr4-python3-runtime==4.9.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # omegaconf +argklass==1.4.4 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r 
benchmarks/diffusion/requirements.in +asttokens==2.4.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # giving +async-timeout==4.0.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # aiohttp +attrs==24.2.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # aiohttp +certifi==2024.7.4 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # requests +charset-normalizer==3.3.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # requests +codefind==0.1.6 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # ptera +datasets==2.21.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/diffusion/requirements.in +diffusers[torch]==0.30.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/diffusion/requirements.in +dill==0.3.8 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # datasets + # multiprocess +executing==1.2.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # varname +filelock==3.15.4 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # datasets + # diffusers + # huggingface-hub + # pytorch-triton-rocm + # torch + # transformers +frozenlist==1.4.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # aiohttp + # aiosignal +fsspec[http]==2024.6.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # datasets + # huggingface-hub + # torch +giving==0.4.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # ptera + # voir +hjson==3.1.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # argklass +huggingface-hub==0.24.6 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # accelerate + # datasets + # diffusers + # tokenizers + # transformers +idna==3.7 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # requests + # yarl +importlib-metadata==8.4.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # diffusers +importlib-resources==6.4.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # argklass +jinja2==3.1.4 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # torch +markdown-it-py==3.0.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # rich +markupsafe==2.1.5 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # jinja2 +mdurl==0.1.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # markdown-it-py +mpmath==1.3.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # sympy +multidict==6.0.5 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # aiohttp + # yarl +multiprocess==0.70.16 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # datasets +networkx==3.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # torch +numpy==1.26.4 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # accelerate + # datasets + # diffusers + # pandas + # pyarrow + # torchvision + # transformers +omegaconf==2.3.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir +ovld==0.3.8 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir +packaging==24.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # accelerate + # datasets + # huggingface-hub + # transformers +pandas==2.2.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # datasets +pillow==10.4.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # diffusers + # torchvision +psutil==5.9.8 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # accelerate + # voir +ptera==1.4.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir +pyarrow==17.0.0 + # via + # -c 
.pin/../.pin/constraints-rocm-torch.txt + # datasets +pygments==2.18.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # rich +pynvml==11.5.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir +python-dateutil==2.9.0.post0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # pandas +pytorch-triton-rocm==3.0.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # torch +pytz==2024.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # pandas +pyyaml==6.0.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # accelerate + # datasets + # huggingface-hub + # omegaconf + # transformers +reactivex==4.0.4 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # giving +regex==2024.7.24 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # diffusers + # transformers +requests==2.32.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # datasets + # diffusers + # huggingface-hub + # transformers +rich==13.7.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir +safetensors==0.4.4 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # accelerate + # diffusers + # transformers +six==1.16.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # asttokens + # python-dateutil +sympy==1.13.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # torch +tokenizers==0.19.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # transformers +torch==2.4.0+rocm6.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # accelerate + # diffusers + # torchvision +torchvision==0.19.0+rocm6.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/diffusion/requirements.in +tqdm==4.66.5 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/diffusion/requirements.in + # datasets + # huggingface-hub + # transformers +transformers==4.44.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/diffusion/requirements.in +typing-extensions==4.12.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # huggingface-hub + # reactivex + # torch +tzdata==2024.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # pandas +urllib3==2.2.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # requests +varname==0.10.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # giving +voir==0.2.17 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -c .pin/../constraints/rocm.txt + # -r benchmarks/diffusion/requirements.in +xxhash==3.5.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # datasets +yarl==1.9.4 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # aiohttp +zipp==3.20.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # importlib-metadata diff --git a/benchmarks/diffusion/requirements.xpu.txt b/benchmarks/diffusion/requirements.xpu.txt new file mode 100644 index 000000000..62a1aba1e --- /dev/null +++ b/benchmarks/diffusion/requirements.xpu.txt @@ -0,0 +1,333 @@ +# +# This file is autogenerated by pip-compile with Python 3.10 +# by the following command: +# +# pip-compile --output-file=benchmarks/diffusion/requirements.xpu.txt .pin/tmp-constraints-xpu-diffusion-nodes.txt benchmarks/diffusion/requirements.in +# +--extra-index-url https://download.pytorch.org/whl/cpu +--extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ + +accelerate==0.33.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # -r benchmarks/diffusion/requirements.in + # diffusers +aiohappyeyeballs==2.3.5 + # via + # -c 
.pin/../.pin/constraints-xpu-torch.txt + # aiohttp +aiohttp==3.10.2 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # datasets + # fsspec +aiosignal==1.3.1 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # aiohttp +antlr4-python3-runtime==4.9.3 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # omegaconf +argklass==1.4.4 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # -r benchmarks/diffusion/requirements.in +asttokens==2.4.1 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # giving +async-timeout==4.0.3 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # aiohttp +attrs==24.2.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # aiohttp +certifi==2024.7.4 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # requests +charset-normalizer==3.3.2 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # requests +codefind==0.1.6 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # ptera +datasets==2.20.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # -r benchmarks/diffusion/requirements.in +diffusers[torch]==0.30.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # -r benchmarks/diffusion/requirements.in +dill==0.3.8 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # datasets + # multiprocess +executing==1.2.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # varname +filelock==3.15.4 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # datasets + # diffusers + # huggingface-hub + # torch + # transformers +frozenlist==1.4.1 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # aiohttp + # aiosignal +fsspec[http]==2024.5.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # datasets + # huggingface-hub + # torch +giving==0.4.2 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # ptera + # voir +hjson==3.1.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # argklass +huggingface-hub==0.24.5 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # accelerate + # datasets + # diffusers + # tokenizers + # transformers +idna==3.7 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # requests + # yarl +importlib-metadata==8.2.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # diffusers +importlib-resources==6.4.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # argklass +jinja2==3.1.4 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # torch +markdown-it-py==3.0.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # rich +markupsafe==2.1.5 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # jinja2 +mdurl==0.1.2 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # markdown-it-py +mpmath==1.3.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # sympy +multidict==6.0.5 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # aiohttp + # yarl +multiprocess==0.70.16 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # datasets +networkx==3.3 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # torch +numpy==1.26.4 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # accelerate + # datasets + # diffusers + # pandas + # pyarrow + # torchvision + # transformers +omegaconf==2.3.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # voir +ovld==0.3.8 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # voir +packaging==24.1 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # accelerate + # datasets + # huggingface-hub + # transformers +pandas==2.2.2 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # 
datasets +pillow==10.4.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # diffusers + # torchvision +psutil==5.9.8 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # accelerate + # voir +ptera==1.4.1 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # voir +pyarrow==17.0.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # datasets +pyarrow-hotfix==0.6 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # datasets +pygments==2.18.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # rich +pynvml==11.5.3 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # voir +python-dateutil==2.9.0.post0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # pandas +pytz==2024.1 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # pandas +pyyaml==6.0.2 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # accelerate + # datasets + # huggingface-hub + # omegaconf + # transformers +reactivex==4.0.4 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # giving +regex==2024.7.24 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # diffusers + # transformers +requests==2.32.3 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # datasets + # diffusers + # huggingface-hub + # transformers +rich==13.7.1 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # voir +safetensors==0.4.4 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # accelerate + # diffusers + # transformers +six==1.16.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # asttokens + # python-dateutil +sympy==1.13.1 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # torch +tokenizers==0.19.1 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # transformers +torch==2.4.0+cpu + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # -c .pin/../constraints/extra/torch.xpu.txt + # -c .pin/../constraints/xpu.txt + # accelerate + # diffusers + # torchvision +torchvision==0.19.0+cpu + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # -c .pin/../constraints/extra/torch.xpu.txt + # -c .pin/../constraints/xpu.txt + # -r benchmarks/diffusion/requirements.in +tqdm==4.66.5 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # -r benchmarks/diffusion/requirements.in + # datasets + # huggingface-hub + # transformers +transformers==4.44.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # -r benchmarks/diffusion/requirements.in +typing-extensions==4.12.2 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # huggingface-hub + # reactivex + # torch +tzdata==2024.1 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # pandas +urllib3==2.2.2 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # requests +varname==0.10.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # giving +voir==0.2.17 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # -c .pin/../constraints/extra/torch.xpu.txt + # -c .pin/../constraints/xpu.txt + # -r benchmarks/diffusion/requirements.in +xxhash==3.4.1 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # datasets +yarl==1.9.4 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # aiohttp +zipp==3.19.2 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # importlib-metadata diff --git a/benchmarks/dinov2/benchfile.py b/benchmarks/dinov2/benchfile.py index ddfc4bc06..214a013f8 100644 --- a/benchmarks/dinov2/benchfile.py +++ b/benchmarks/dinov2/benchfile.py @@ -3,8 +3,8 @@ SOURCE_DIR = "src" -REPO_URL = "https://github.com/facebookresearch/dinov2" -BRANCH = 
"e1277af2ba9496fbadf7aec6eba56e8d882d1e35" +REPO_URL = "https://github.com/Delaunay/dinov2" +BRANCH = "451bc15a084f42cc97c21e3bc0be9e9158f9049c" class Dinov2(Package): @@ -28,7 +28,8 @@ def working_directory(self): def make_env(self): # Return a dict of environment variables for prepare_script and # main_script. - return super().make_env() + env = super().make_env() + return env async def install(self): await super().install() diff --git a/benchmarks/dinov2/requirements.cuda.txt b/benchmarks/dinov2/requirements.cuda.txt index 4a29be4a7..16e579e9d 100644 --- a/benchmarks/dinov2/requirements.cuda.txt +++ b/benchmarks/dinov2/requirements.cuda.txt @@ -2,7 +2,7 @@ # This file is autogenerated by pip-compile with Python 3.10 # by the following command: # -# pip-compile --output-file=benchmarks/dinov2/requirements.cuda.txt .pin/tmp-constraints-cuda-dinov2-giant-nodes.txt benchmarks/dinov2/requirements.in +# pip-compile --output-file=benchmarks/dinov2/requirements.cuda.txt .pin/tmp-constraints-cuda-dinov2-giant-gpus.txt benchmarks/dinov2/requirements.in # --extra-index-url https://pypi.ngc.nvidia.com --extra-index-url https://download.pytorch.org/whl/cu121 @@ -34,7 +34,7 @@ filelock==3.15.4 # -c .pin/../.pin/constraints-cuda-torch.txt # torch # triton -fsspec==2024.5.0 +fsspec==2024.6.1 # via # -c .pin/../.pin/constraints-cuda-torch.txt # torch @@ -127,6 +127,10 @@ nvidia-cusparse-cu12==12.1.0.106 # -c .pin/../.pin/constraints-cuda-torch.txt # nvidia-cusolver-cu12 # torch +nvidia-ml-py==12.560.30 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # voir nvidia-nccl-cu12==2.20.5 # via # -c .pin/../.pin/constraints-cuda-torch.txt @@ -145,7 +149,7 @@ omegaconf==2.3.0 # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/dinov2/requirements.in # voir -ovld==0.3.6 +ovld==0.3.9 # via # -c .pin/../.pin/constraints-cuda-torch.txt # voir @@ -175,11 +179,7 @@ pygments==2.18.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # rich -pynvml==11.5.3 - # via - # -c .pin/../.pin/constraints-cuda-torch.txt - # voir -pyyaml==6.0.1 +pyyaml==6.0.2 # via # -c .pin/../.pin/constraints-cuda-torch.txt # fvcore @@ -189,11 +189,11 @@ reactivex==4.0.4 # via # -c .pin/../.pin/constraints-cuda-torch.txt # giving -rich==13.7.1 +rich==13.8.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # voir -scipy==1.14.0 +scipy==1.14.1 # via # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/dinov2/requirements.in @@ -205,7 +205,7 @@ submitit==1.5.1 # via # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/dinov2/requirements.in -sympy==1.13.1 +sympy==1.13.2 # via # -c .pin/../.pin/constraints-cuda-torch.txt # torch @@ -224,7 +224,7 @@ torch==2.4.0+cu121 # torchmetrics # torchvision # xformers -torchmetrics==1.4.0.post0 +torchmetrics==1.4.1 # via # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/dinov2/requirements.in @@ -232,7 +232,7 @@ torchvision==0.19.0+cu121 # via # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/dinov2/requirements.in -tqdm==4.66.4 +tqdm==4.66.5 # via # -c .pin/../.pin/constraints-cuda-torch.txt # fvcore diff --git a/benchmarks/dinov2/requirements.rocm.txt b/benchmarks/dinov2/requirements.rocm.txt new file mode 100644 index 000000000..c46ba9819 --- /dev/null +++ b/benchmarks/dinov2/requirements.rocm.txt @@ -0,0 +1,216 @@ +# +# This file is autogenerated by pip-compile with Python 3.10 +# by the following command: +# +# pip-compile --output-file=benchmarks/dinov2/requirements.rocm.txt .pin/tmp-constraints-rocm-dinov2-giant-gpus.txt 
benchmarks/dinov2/requirements.in +# +--extra-index-url https://download.pytorch.org/whl/rocm6.0 + +antlr4-python3-runtime==4.9.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # omegaconf +asttokens==2.4.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # giving +cloudpickle==3.0.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # submitit +codefind==0.1.6 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # ptera +executing==1.2.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # varname +filelock==3.15.4 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # pytorch-triton-rocm + # torch +fsspec==2024.6.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # torch +fvcore==0.1.5.post20221221 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/dinov2/requirements.in +giving==0.4.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # ptera + # voir +iopath==0.1.10 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/dinov2/requirements.in + # fvcore +jinja2==3.1.4 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # torch +lightning-utilities==0.11.6 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # torchmetrics +markdown-it-py==3.0.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # rich +markupsafe==2.1.5 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # jinja2 +mdurl==0.1.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # markdown-it-py +mpmath==1.3.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # sympy +networkx==3.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # torch +numpy==1.26.4 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # fvcore + # scipy + # torchmetrics + # torchvision + # xformers +omegaconf==2.3.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/dinov2/requirements.in + # voir +ovld==0.3.8 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir +packaging==24.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # lightning-utilities + # torchmetrics +pillow==10.4.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # fvcore + # torchvision +portalocker==2.10.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # iopath +psutil==5.9.8 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir +ptera==1.4.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir +pygments==2.18.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # rich +pynvml==11.5.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir +pytorch-triton-rocm==3.0.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # torch +pyyaml==6.0.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # fvcore + # omegaconf + # yacs +reactivex==4.0.4 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # giving +rich==13.7.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir +scipy==1.14.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/dinov2/requirements.in +six==1.16.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # asttokens +submitit==1.5.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/dinov2/requirements.in +sympy==1.13.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # torch +tabulate==0.9.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # fvcore +termcolor==2.4.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # fvcore +torch==2.4.0+rocm6.0 + 
# via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/dinov2/requirements.in + # torchmetrics + # torchvision + # xformers +torchmetrics==1.4.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/dinov2/requirements.in +torchvision==0.19.0+rocm6.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/dinov2/requirements.in +tqdm==4.66.5 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # fvcore + # iopath +typing-extensions==4.12.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # iopath + # lightning-utilities + # reactivex + # submitit + # torch +varname==0.10.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # giving +voir==0.2.17 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -c .pin/../constraints/rocm.txt + # -r benchmarks/dinov2/requirements.in +xformers==0.0.27.post2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/dinov2/requirements.in +yacs==0.1.8 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # fvcore + +# The following packages are considered to be unsafe in a requirements file: +# setuptools diff --git a/benchmarks/dinov2/requirements.xpu.txt b/benchmarks/dinov2/requirements.xpu.txt new file mode 100644 index 000000000..032296c6f --- /dev/null +++ b/benchmarks/dinov2/requirements.xpu.txt @@ -0,0 +1,217 @@ +# +# This file is autogenerated by pip-compile with Python 3.10 +# by the following command: +# +# pip-compile --output-file=benchmarks/dinov2/requirements.xpu.txt .pin/tmp-constraints-xpu-dinov2-giant-nodes.txt benchmarks/dinov2/requirements.in +# +--extra-index-url https://download.pytorch.org/whl/cpu +--extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ + +antlr4-python3-runtime==4.9.3 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # omegaconf +asttokens==2.4.1 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # giving +cloudpickle==3.0.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # submitit +codefind==0.1.6 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # ptera +executing==1.2.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # varname +filelock==3.15.4 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # torch +fsspec==2024.5.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # torch +fvcore==0.1.5.post20221221 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # -r benchmarks/dinov2/requirements.in +giving==0.4.2 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # ptera + # voir +iopath==0.1.10 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # -r benchmarks/dinov2/requirements.in + # fvcore +jinja2==3.1.4 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # torch +lightning-utilities==0.11.6 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # torchmetrics +markdown-it-py==3.0.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # rich +markupsafe==2.1.5 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # jinja2 +mdurl==0.1.2 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # markdown-it-py +mpmath==1.3.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # sympy +networkx==3.3 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # torch +numpy==1.26.4 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # fvcore + # scipy + # torchmetrics + # torchvision + # xformers +omegaconf==2.3.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # -r benchmarks/dinov2/requirements.in + # voir +ovld==0.3.8 + 
# via + # -c .pin/../.pin/constraints-xpu-torch.txt + # voir +packaging==24.1 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # lightning-utilities + # torchmetrics +pillow==10.4.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # fvcore + # torchvision +portalocker==2.10.1 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # iopath +psutil==5.9.8 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # voir +ptera==1.4.1 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # voir +pygments==2.18.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # rich +pynvml==11.5.3 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # voir +pyyaml==6.0.2 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # fvcore + # omegaconf + # yacs +reactivex==4.0.4 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # giving +rich==13.7.1 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # voir +scipy==1.11.4 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # -r benchmarks/dinov2/requirements.in +six==1.16.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # asttokens +submitit==1.5.1 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # -r benchmarks/dinov2/requirements.in +sympy==1.13.1 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # torch +tabulate==0.9.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # fvcore +termcolor==2.4.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # fvcore +torch==2.4.0+cpu + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # -c .pin/../constraints/extra/torch.xpu.txt + # -c .pin/../constraints/xpu.txt + # -r benchmarks/dinov2/requirements.in + # torchmetrics + # torchvision + # xformers +torchmetrics==1.4.1 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # -r benchmarks/dinov2/requirements.in +torchvision==0.19.0+cpu + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # -c .pin/../constraints/extra/torch.xpu.txt + # -c .pin/../constraints/xpu.txt + # -r benchmarks/dinov2/requirements.in +tqdm==4.66.5 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # fvcore + # iopath +typing-extensions==4.12.2 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # iopath + # lightning-utilities + # reactivex + # submitit + # torch +varname==0.10.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # giving +voir==0.2.17 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # -c .pin/../constraints/extra/torch.xpu.txt + # -c .pin/../constraints/xpu.txt + # -r benchmarks/dinov2/requirements.in +xformers==0.0.27.post2 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # -r benchmarks/dinov2/requirements.in +yacs==0.1.8 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # fvcore + +# The following packages are considered to be unsafe in a requirements file: +# setuptools diff --git a/benchmarks/dinov2/voirfile.py b/benchmarks/dinov2/voirfile.py index f358914dc..fdc616b83 100644 --- a/benchmarks/dinov2/voirfile.py +++ b/benchmarks/dinov2/voirfile.py @@ -26,11 +26,30 @@ class Config: gpu_poll: int = 3 +def populate_slurm(): + import json + import os + + config = json.loads(os.environ["MILABENCH_CONFIG"]) + + nodes = [n["name"] for n in config["system"]["nodes"]] + + env = { + "SLURM_JOB_ID": "123", + "SLURM_JOB_NUM_NODES": "2", + "SLURM_JOB_NODELIST": ",".join(nodes), + "SLURM_NTASKS": str(len(config["system"]["nodes"])), + "SLURM_PROCID": "2", # RANK + "SLURM_LOCALID": "1", # Local RANK + } + + @configurable def instrument_main(ov, options: Config): + import os + 
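+    # NOTE: populate_slurm() above builds the env dict of mock SLURM
+    # variables but never applies or returns it, so as written calling it
+    # has no effect; it would presumably need something like
+    # os.environ.update(env) (a hypothetical fix, not part of this patch)
+    # before the mocked variables become visible to SLURM detection.
+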
yield ov.phases.init - import os import sys sys.path.append(os.path.dirname(__file__) + "/src/") diff --git a/benchmarks/flops/activator b/benchmarks/flops/activator deleted file mode 100755 index 083c28cb1..000000000 --- a/benchmarks/flops/activator +++ /dev/null @@ -1,7 +0,0 @@ -#!/bin/bash - -venv="$1" -shift - -source "$venv"/bin/activate -exec "$@" diff --git a/benchmarks/flops/requirements.cuda.txt b/benchmarks/flops/requirements.cuda.txt index 75ea2eb79..8553edece 100644 --- a/benchmarks/flops/requirements.cuda.txt +++ b/benchmarks/flops/requirements.cuda.txt @@ -30,7 +30,7 @@ filelock==3.15.4 # -c .pin/../.pin/constraints-cuda-torch.txt # torch # triton -fsspec==2024.5.0 +fsspec==2024.6.1 # via # -c .pin/../.pin/constraints-cuda-torch.txt # torch @@ -39,7 +39,7 @@ giving==0.4.2 # -c .pin/../.pin/constraints-cuda-torch.txt # ptera # voir -importlib-resources==6.4.0 +importlib-resources==6.4.4 # via # -c .pin/../.pin/constraints-cuda-torch.txt # torchcompat @@ -110,6 +110,10 @@ nvidia-cusparse-cu12==12.1.0.106 # -c .pin/../.pin/constraints-cuda-torch.txt # nvidia-cusolver-cu12 # torch +nvidia-ml-py==12.560.30 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # voir nvidia-nccl-cu12==2.20.5 # via # -c .pin/../.pin/constraints-cuda-torch.txt @@ -127,7 +131,7 @@ omegaconf==2.3.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # voir -ovld==0.3.6 +ovld==0.3.9 # via # -c .pin/../.pin/constraints-cuda-torch.txt # voir @@ -147,11 +151,7 @@ pygments==2.18.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # rich -pynvml==11.5.3 - # via - # -c .pin/../.pin/constraints-cuda-torch.txt - # voir -pyyaml==6.0.1 +pyyaml==6.0.2 # via # -c .pin/../.pin/constraints-cuda-torch.txt # omegaconf @@ -159,7 +159,7 @@ reactivex==4.0.4 # via # -c .pin/../.pin/constraints-cuda-torch.txt # giving -rich==13.7.1 +rich==13.8.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # voir @@ -167,7 +167,7 @@ six==1.16.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # asttokens -sympy==1.13.1 +sympy==1.13.2 # via # -c .pin/../.pin/constraints-cuda-torch.txt # torch @@ -185,7 +185,7 @@ torchvision==0.19.0+cu121 # via # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/flops/requirements.in -tqdm==4.66.4 +tqdm==4.66.5 # via # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/flops/requirements.in diff --git a/benchmarks/flops/requirements.rocm.txt b/benchmarks/flops/requirements.rocm.txt index 953732347..d9ac15eb5 100644 --- a/benchmarks/flops/requirements.rocm.txt +++ b/benchmarks/flops/requirements.rocm.txt @@ -4,10 +4,7 @@ # # pip-compile --output-file=benchmarks/flops/requirements.rocm.txt .pin/tmp-constraints-rocm-flops.txt benchmarks/flops/requirements.in # ---extra-index-url https://pypi.ngc.nvidia.com --extra-index-url https://download.pytorch.org/whl/rocm6.0 ---find-links https://storage.googleapis.com/jax-releases/jax_cuda_releases.html ---trusted-host pypi.ngc.nvidia.com antlr4-python3-runtime==4.9.3 # via @@ -30,7 +27,7 @@ filelock==3.15.4 # -c .pin/../.pin/constraints-rocm-torch.txt # pytorch-triton-rocm # torch -fsspec==2024.5.0 +fsspec==2024.6.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torch @@ -39,7 +36,7 @@ giving==0.4.2 # -c .pin/../.pin/constraints-rocm-torch.txt # ptera # voir -importlib-resources==6.4.0 +importlib-resources==6.4.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torchcompat @@ -75,7 +72,7 @@ omegaconf==2.3.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # voir -ovld==0.3.5 +ovld==0.3.8 # via # -c 
.pin/../.pin/constraints-rocm-torch.txt # voir @@ -99,11 +96,11 @@ pynvml==11.5.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # voir -pytorch-triton-rocm==2.3.1 +pytorch-triton-rocm==3.0.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torch -pyyaml==6.0.1 +pyyaml==6.0.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # omegaconf @@ -119,11 +116,11 @@ six==1.16.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # asttokens -sympy==1.13.0 +sympy==1.13.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torch -torch==2.3.1+rocm6.0 +torch==2.4.0+rocm6.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/flops/requirements.in @@ -133,11 +130,11 @@ torchcompat==1.1.4 # -c .pin/../.pin/constraints-rocm-torch.txt # -c .pin/../constraints/rocm.txt # -r benchmarks/flops/requirements.in -torchvision==0.18.1+rocm6.0 +torchvision==0.19.0+rocm6.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/flops/requirements.in -tqdm==4.66.4 +tqdm==4.66.5 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/flops/requirements.in diff --git a/benchmarks/flops/requirements.xpu.txt b/benchmarks/flops/requirements.xpu.txt index ed57d25f6..087e29b9a 100644 --- a/benchmarks/flops/requirements.xpu.txt +++ b/benchmarks/flops/requirements.xpu.txt @@ -4,10 +4,8 @@ # # pip-compile --output-file=benchmarks/flops/requirements.xpu.txt .pin/tmp-constraints-xpu-flops.txt benchmarks/flops/requirements.in # ---extra-index-url https://pypi.ngc.nvidia.com +--extra-index-url https://download.pytorch.org/whl/cpu --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ ---find-links https://storage.googleapis.com/jax-releases/jax_cuda_releases.html ---trusted-host pypi.ngc.nvidia.com antlr4-python3-runtime==4.9.3 # via @@ -17,14 +15,6 @@ asttokens==2.4.1 # via # -c .pin/../.pin/constraints-xpu-torch.txt # giving -certifi==2024.6.2 - # via - # -c .pin/../.pin/constraints-xpu-torch.txt - # requests -charset-normalizer==3.3.2 - # via - # -c .pin/../.pin/constraints-xpu-torch.txt - # requests codefind==0.1.6 # via # -c .pin/../.pin/constraints-xpu-torch.txt @@ -46,10 +36,6 @@ giving==0.4.2 # -c .pin/../.pin/constraints-xpu-torch.txt # ptera # voir -idna==3.7 - # via - # -c .pin/../.pin/constraints-xpu-torch.txt - # requests importlib-resources==6.4.0 # via # -c .pin/../.pin/constraints-xpu-torch.txt @@ -86,7 +72,7 @@ omegaconf==2.3.0 # via # -c .pin/../.pin/constraints-xpu-torch.txt # voir -ovld==0.3.5 +ovld==0.3.8 # via # -c .pin/../.pin/constraints-xpu-torch.txt # voir @@ -110,7 +96,7 @@ pynvml==11.5.3 # via # -c .pin/../.pin/constraints-xpu-torch.txt # voir -pyyaml==6.0.1 +pyyaml==6.0.2 # via # -c .pin/../.pin/constraints-xpu-torch.txt # omegaconf @@ -118,10 +104,6 @@ reactivex==4.0.4 # via # -c .pin/../.pin/constraints-xpu-torch.txt # giving -requests==2.32.3 - # via - # -c .pin/../.pin/constraints-xpu-torch.txt - # torchvision rich==13.7.1 # via # -c .pin/../.pin/constraints-xpu-torch.txt @@ -130,27 +112,30 @@ six==1.16.0 # via # -c .pin/../.pin/constraints-xpu-torch.txt # asttokens -sympy==1.13.0 +sympy==1.13.1 # via # -c .pin/../.pin/constraints-xpu-torch.txt # torch -torch==2.1.0.post2+cxx11.abi +torch==2.4.0+cpu # via # -c .pin/../.pin/constraints-xpu-torch.txt + # -c .pin/../constraints/extra/torch.xpu.txt # -c .pin/../constraints/xpu.txt # -r benchmarks/flops/requirements.in # torchvision torchcompat==1.1.4 # via # -c .pin/../.pin/constraints-xpu-torch.txt + # -c .pin/../constraints/extra/torch.xpu.txt # -c .pin/../constraints/xpu.txt # 
-r benchmarks/flops/requirements.in -torchvision==0.16.0.post2+cxx11.abi +torchvision==0.19.0+cpu # via # -c .pin/../.pin/constraints-xpu-torch.txt + # -c .pin/../constraints/extra/torch.xpu.txt # -c .pin/../constraints/xpu.txt # -r benchmarks/flops/requirements.in -tqdm==4.66.4 +tqdm==4.66.5 # via # -c .pin/../.pin/constraints-xpu-torch.txt # -r benchmarks/flops/requirements.in @@ -159,10 +144,6 @@ typing-extensions==4.12.2 # -c .pin/../.pin/constraints-xpu-torch.txt # reactivex # torch -urllib3==1.26.19 - # via - # -c .pin/../.pin/constraints-xpu-torch.txt - # requests varname==0.10.0 # via # -c .pin/../.pin/constraints-xpu-torch.txt @@ -170,5 +151,6 @@ varname==0.10.0 voir==0.2.19 # via # -c .pin/../.pin/constraints-xpu-torch.txt + # -c .pin/../constraints/extra/torch.xpu.txt # -c .pin/../constraints/xpu.txt # -r benchmarks/flops/requirements.in diff --git a/benchmarks/llm/main.py b/benchmarks/geo_gnn/.pin/tmp-constraints-cuda-pna.txt similarity index 100% rename from benchmarks/llm/main.py rename to benchmarks/geo_gnn/.pin/tmp-constraints-cuda-pna.txt diff --git a/benchmarks/geo_gnn/Makefile b/benchmarks/geo_gnn/Makefile new file mode 100644 index 000000000..7645407e2 --- /dev/null +++ b/benchmarks/geo_gnn/Makefile @@ -0,0 +1,35 @@ +# Use global base if possible +ifndef MILABENCH_BASE + MILABENCH_BASE="base" +endif + +export MILABENCH_BASE +export MILABENCH_GPU_ARCH=cuda + +BENCH_NAME=geo_gnn +MILABENCH_CONFIG=dev.yaml +MILABENCH_ARGS=--config $(MILABENCH_CONFIG) --base $(MILABENCH_BASE) + +all: + install prepare single gpus nodes + +install: + milabench install $(MILABENCH_ARGS) --update + +prepare: + milabench prepare $(MILABENCH_ARGS) + +tests: # install + milabench run $(MILABENCH_ARGS) --select pna + +single: + milabench run $(MILABENCH_ARGS) --select $(BENCH_NAME)-single + +gpus: + milabench run $(MILABENCH_ARGS) --select $(BENCH_NAME)-gpus + +nodes: + milabench run $(MILABENCH_ARGS) --select $(BENCH_NAME)-nodes + +pin: + milabench pin $(MILABENCH_ARGS) diff --git a/benchmarks/geo_gnn/README.md b/benchmarks/geo_gnn/README.md new file mode 100644 index 000000000..b82e3c019 --- /dev/null +++ b/benchmarks/geo_gnn/README.md @@ -0,0 +1,4 @@ + +# Dimenet + +Rewrite this README to explain what the benchmark is! diff --git a/benchmarks/geo_gnn/bench/models.py b/benchmarks/geo_gnn/bench/models.py new file mode 100644 index 000000000..0868724a0 --- /dev/null +++ b/benchmarks/geo_gnn/bench/models.py @@ -0,0 +1,68 @@ +from types import SimpleNamespace as NS + +from torch_geometric.nn.models import PNA as _PNA, DimeNet as _DimeNet + +models = {} + + +def register_model(fn): + models[fn.__name__] = fn + return fn + + +@register_model +def DimeNet(args, sample, **extras): + # The directional message passing neural network (DimeNet) from the “Directional Message Passing for Molecular Graphs” paper. + # DimeNet transforms messages based on the angle between them in a rotation-equivariant fashion. 
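+    # A minimal sketch of how this registry is consumed (model names as
+    # registered in this module; `args` and `sample` supplied by the
+    # caller, extra keywords are absorbed by **extras):
+    #
+    #     info = models["DimeNet"](args, sample=sample, degree=None)
+    #     net = info.model
+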
+ + # PCQM4Mv2Subset: Data(x=[18, 9], edge_index=[2, 40], edge_attr=[40, 3], y=3.0476751256, pos=[18, 3], smiles='Cc1ccc([C@H]2[CH]c3cnccc3[N]C2=O)cc1') + # QM9: Data(x=[5, 11], edge_index=[2, 8], edge_attr=[8, 4], y=[1, 19], pos=[5, 3], z=[5], smiles='[H]C([H])([H])[H]', name='gdb_1', idx=[1]) + try: + batch_size, out_channels = sample.y.shape + except: + out_channels = 1 + + return NS( + category="3d", + model=_DimeNet( + hidden_channels=64, + out_channels=out_channels, + num_blocks=6, + num_bilinear=8, + num_spherical=7, + num_radial=6, + cutoff=10.0, + envelope_exponent=5, + num_before_skip=1, + num_after_skip=2, + num_output_layers=3, + ), + ) + + +@register_model +def PNA(args, sample, degree): + # The Graph Neural Network from the “Principal Neighbourhood Aggregation for Graph Nets” paper, + # using the PNAConv operator for message passing. + + out_channels = 1 + if hasattr(sample.y, "shape") and len(sample.y.shape) > 1: + out_channels = sample.y.shape[-1] + + _, in_channels = sample.x.shape + + return NS( + category="2d", + model=_PNA( + # Basic GCNN setup + in_channels=in_channels, + out_channels=out_channels, + hidden_channels=64, + num_layers=64, + # https://pytorch-geometric.readthedocs.io/en/latest/generated/torch_geometric.nn.conv.PNAConv.html + aggregators=['mean', 'min', 'max', 'std'], + scalers=['identity', 'amplification', 'attenuation'], + # Histogram of in-degrees of nodes in the training set, used by scalers to normalize + deg=degree(), + ), + ) diff --git a/benchmarks/geo_gnn/benchfile.py b/benchmarks/geo_gnn/benchfile.py new file mode 100644 index 000000000..cb6565b68 --- /dev/null +++ b/benchmarks/geo_gnn/benchfile.py @@ -0,0 +1,30 @@ +from milabench.pack import Package + + +class Dimenet(Package): + # Requirements file installed by install(). It can be empty or absent. + base_requirements = ["requirements-pre.in", "requirements.in"] + + # The preparation script called by prepare(). It must be executable, + # but it can be any type of script. It can be empty or absent. + prepare_script = "prepare.py" + + # The main script called by run(). It must be a Python file. It has to + # be present. + main_script = "main.py" + + # You can remove the functions below if you don't need to modify them. + + def make_env(self): + # Return a dict of environment variables for prepare_script and + # main_script. + return super().make_env() + + async def install(self): + await super().install() # super() call installs the requirements + + async def prepare(self): + await super().prepare() # super() call executes prepare_script + + +__pack__ = Dimenet diff --git a/benchmarks/geo_gnn/dev.yaml b/benchmarks/geo_gnn/dev.yaml new file mode 100644 index 000000000..7fadaea5f --- /dev/null +++ b/benchmarks/geo_gnn/dev.yaml @@ -0,0 +1,22 @@ +dimenet: + inherits: _defaults + definition: . + install-variant: cuda + install_group: torch + plan: + method: per_gpu + argv: + --model: 'DimeNet' + --num-samples: 10000 + --use3d: True + +pna: + inherits: _defaults + definition: . 
diff --git a/benchmarks/geo_gnn/dev.yaml b/benchmarks/geo_gnn/dev.yaml
new file mode 100644
index 000000000..7fadaea5f
--- /dev/null
+++ b/benchmarks/geo_gnn/dev.yaml
@@ -0,0 +1,22 @@
+dimenet:
+  inherits: _defaults
+  definition: .
+  install-variant: cuda
+  install_group: torch
+  plan:
+    method: per_gpu
+  argv:
+    --model: 'DimeNet'
+    --num-samples: 10000
+    --use3d: True
+
+pna:
+  inherits: _defaults
+  definition: .
+  install-variant: cuda
+  install_group: torch
+  plan:
+    method: per_gpu
+  argv:
+    --model: 'PNA'
+    --num-samples: 10000
\ No newline at end of file
diff --git a/benchmarks/geo_gnn/main.py b/benchmarks/geo_gnn/main.py
new file mode 100644
index 000000000..714707f65
--- /dev/null
+++ b/benchmarks/geo_gnn/main.py
@@ -0,0 +1,202 @@
+import argparse
+import os
+
+import torch
+import torch.nn as nn
+import torch.optim as optim
+import torchcompat.core as accelerator
+from bench.models import models
+from pcqm4m_subset import PCQM4Mv2Subset
+from torch_geometric.datasets import QM9
+from torch_geometric.loader import DataLoader
+
+from benchmate.observer import BenchObserver
+
+
+def parser():
+    parser = argparse.ArgumentParser(description="Geometric GNN")
+    parser.add_argument(
+        "--batch-size",
+        type=int,
+        default=16,
+        metavar="N",
+        help="input batch size for training (default: 16)",
+    )
+    parser.add_argument(
+        "--epochs",
+        type=int,
+        default=20,
+        metavar="N",
+        help="Number of epochs to train (default: 20)",
+    )
+    parser.add_argument("--model", type=str, help="GNN name", required=True)
+    parser.add_argument(
+        "--num-samples",
+        type=int,
+        help="Number of samples to process in the dataset",
+        default=10000,
+    )
+    parser.add_argument(
+        "--lr",
+        type=float,
+        default=1e-4,
+        metavar="LR",
+        help="learning rate (default: 0.0001)",
+    )
+    parser.add_argument(
+        "--seed",
+        type=int,
+        default=1234,
+        metavar="S",
+        help="random seed (default: 1234)",
+    )
+    parser.add_argument(
+        "--num-workers",
+        type=int,
+        default=0,
+        help="number of workers for data loading",
+    )
+    parser.add_argument(
+        "--use3d",
+        action="store_true",
+        default=False,
+        help="Use 3D coordinates with data",
+    )
+    parser.add_argument(
+        "--root",
+        type=str,
+        default=os.environ["MILABENCH_DIR_DATA"],
+        help="Dataset path",
+    )
+    return parser
+
+
+def train_degree(train_dataset):
+    from torch_geometric.utils import degree
+
+    # Compute the maximum in-degree in the training data.
+    max_degree = -1
+    for data in train_dataset:
+        d = degree(data.edge_index[1], num_nodes=data.num_nodes, dtype=torch.long)
+        max_degree = max(max_degree, int(d.max()))
+
+    # Compute the in-degree histogram tensor
+    deg = torch.zeros(max_degree + 1, dtype=torch.long)
+    for data in train_dataset:
+        d = degree(data.edge_index[1], num_nodes=data.num_nodes, dtype=torch.long)
+        deg += torch.bincount(d, minlength=deg.numel())
+
+    return deg
+
+
+def mean(dataset):
+    import numpy as np
+    return np.mean([dataset.get(i).y for i in range(len(dataset))])
+
+def std(dataset):
+    import numpy as np
+    return np.std([dataset.get(i).y for i in range(len(dataset))])
+
+
+def main():
+    args = parser().parse_args()
+
+    def batch_size(x):
+        shape = x.y.shape
+        return shape[0]
+
+    observer = BenchObserver(batch_size_fn=batch_size)
+
+    # train_dataset = PCQM4Mv2Subset(args.num_samples, args.root)
+    train_dataset = QM9(args.root)
+
+    sample = next(iter(train_dataset))
+
+    info = models[args.model](args,
+        sample=sample,
+        degree=lambda: train_degree(train_dataset),
+    )
+
+    TRAIN_mean, TRAIN_std = (
+        mean(train_dataset).item(),
+        std(train_dataset).item(),
+    )
+    print("Train mean: {}\tTrain std: {}".format(TRAIN_mean, TRAIN_std))
+
+    DataLoaderClass = DataLoader
+    dataloader_kwargs = {}
+
+    train_loader = DataLoaderClass(
+        train_dataset,
+        batch_size=args.batch_size,
+        shuffle=True,
+        num_workers=args.num_workers,
+        **dataloader_kwargs
+    )
+
+    device = accelerator.fetch_device(0)
+    model = info.model.to(device)
+
+    criterion = nn.L1Loss()
+
+    # set up optimizer
+    # different learning rate for different part of GNN
+    model_param_group = [{"params": model.parameters(), "lr": args.lr}]
+    optimizer = optim.Adam(model_param_group, lr=args.lr, weight_decay=0)
+
+    lr_scheduler = None
+    lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, args.epochs)
+
+    num_batches = len(train_loader)
+    for epoch in range(1, args.epochs + 1):
+        model.train()
+
+        for step, batch in enumerate(observer.iterate(train_loader)):
+            # QM9 => DataBatch(x=[290, 11], edge_index=[2, 602], edge_attr=[602, 4], y=[16, 19], pos=[290, 3], z=[290], smiles=[16], name=[16], idx=[16], batch=[290], ptr=[17])
+            # PCQM4Mv2Subset => DataBatch(x=[229, 9], edge_index=[2, 476], edge_attr=[476, 3], y=[16], pos=[229, 3], smiles=[16], batch=[229], ptr=[17])
+            batch = batch.to(device)
+
+            if args.use3d:
+
+                if hasattr(batch, "z"):
+                    z = batch.z
+                else:
+                    z = batch.batch
+
+                molecule_repr = model(z=z, pos=batch.pos, batch=batch.batch)
+            else:
+                molecule_repr = model(x=batch.x, batch=batch.batch, edge_index=batch.edge_index, batch_size=batch_size(batch))
+
+            pred = molecule_repr.squeeze()
+
+            # Dimenet : pred: torch.Size([ 16, 19])
+            # PNA     : pred: torch.Size([292, 19])  (per-node output when fed x=batch.x; 292 = batch.x.shape[0])
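+            # NOTE: if pred is per-node while batch.y is per-graph, the view()
+            # below cannot line up (304 target values vs 292 rows in the example
+            # above); the 2d path likely needs a graph-level readout first.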
+            # batch   : torch.Size([ 16, 19])
+            # print(molecule_repr.shape)
+            # print(batch.y.shape)
+
+            B = pred.size()[0]
+            y = batch.y.view(B, -1)
+            # normalize
+            y = (y - TRAIN_mean) / TRAIN_std
+
+            loss = criterion(pred, y)
+
+            optimizer.zero_grad()
+            loss.backward()
+            optimizer.step()
+
+            lr_scheduler.step(epoch - 1 + step / num_batches)
+
+            observer.record_loss(loss)
+
+        lr_scheduler.step()
+
+        print("Epoch: {}\tLoss: {}".format(epoch, loss.item()))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/benchmarks/geo_gnn/pcqm4m_subset.py b/benchmarks/geo_gnn/pcqm4m_subset.py
new file mode 100644
index 000000000..615aea2bb
--- /dev/null
+++ b/benchmarks/geo_gnn/pcqm4m_subset.py
@@ -0,0 +1,108 @@
+import os
+import os.path as osp
+from typing import Any, Callable, Dict, List, Optional
+
+import numpy as np
+import torch
+from rdkit import Chem
+from torch_geometric.data import Data, download_url, extract_tar
+from torch_geometric.data.data import BaseData
+from torch_geometric.datasets import PCQM4Mv2
+from torch_geometric.utils import from_smiles as _from_smiles
+from tqdm import tqdm
+
+
+class PCQM4Mv2Subset(PCQM4Mv2):
+    suppl_url = "http://ogb-data.stanford.edu/data/lsc/pcqm4m-v2-train.sdf.tar.gz"
+
+    def __init__(
+        self,
+        size: int,
+        root: str,
+        split: str = "train",
+        transform: Optional[Callable] = None,
+        backend: str = "sqlite",
+        from_smiles: Optional[Callable] = None,
+    ) -> None:
+        assert split in ["train", "val", "test", "holdout"]
+
+        self.size = size
+
+        schema = {
+            "x": dict(dtype=torch.int64, size=(-1, 9)),
+            "edge_index": dict(dtype=torch.int64, size=(2, -1)),
+            "edge_attr": dict(dtype=torch.int64, size=(-1, 3)),
+            "smiles": str,
+            "pos": dict(dtype=torch.float32, size=(-1, 3)),
+            "y": float,
+        }
+
+        self.from_smiles = from_smiles or _from_smiles
+        super(PCQM4Mv2, self).__init__(root, transform, backend=backend, schema=schema)
+
+        split_idx = torch.load(self.raw_paths[1])
+        self._indices = split_idx[self.split_mapping[split]].tolist()
+
+    @property
+    def raw_file_names(self):
+        return super().raw_file_names + [
+            osp.join("pcqm4m-v2", "raw", "pcqm4m-v2-train.sdf")
+        ]
+
+    def download(self):
+        print(self.raw_paths)
+        if all(os.path.exists(path) for path in self.raw_paths):
+            return
+
+        # Download 2d graphs
+        print(self.raw_dir)
+        super().download()
+
+        # Download 3D coordinates
+        file_path = download_url(self.suppl_url, self.raw_dir)
+        # md5sum: fd72bce606e7ddf36c2a832badeec6ab
+        extract_tar(file_path, osp.join(self.raw_dir, "pcqm4m-v2", "raw"), mode="r:gz")
+        os.unlink(file_path)
+
+    def process(self) -> None:
+        import pandas as pd
+
+        df = pd.read_csv(self.raw_paths[0])
+
+        data_list: List[Data] = []
+        suppl = Chem.SDMolSupplier(self.raw_paths[-1])
+        iterator = enumerate(zip(df["smiles"], df["homolumogap"], suppl))
+        for i, (smiles, y, extra) in tqdm(iterator, total=min(len(df), self.size)):
+            # data = from_smiles(smiles)
+            data = self.from_smiles(Chem.MolToSmiles(extra))
+            data.y = y
+            data.pos = torch.tensor(
+                extra.GetConformer().GetPositions(), dtype=torch.float
+            )
+
+            data_list.append(data)
+            if (
+                i + 1 == len(df) or (i + 1) % 1000 == 0 or i >= self.size
+            ):  # Write batch-wise:
+                self.extend(data_list)
+                data_list = []
+
+            if i >= self.size:
+                break
+
+    def __len__(self):
+        return min(super().__len__(), self.size)
+
+    def len(self):
+        return min(super().len(), self.size)
+
+    def mean(self):
+        return np.mean([self.get(i).y for i in range(len(self))])
+
+    def std(self):
+        return np.std([self.get(i).y for i in range(len(self))])
+
+    def serialize(self, data:
BaseData) -> Dict[str, Any]: + rval = super().serialize(data) + rval["pos"] = data.pos + return rval diff --git a/benchmarks/geo_gnn/prepare.py b/benchmarks/geo_gnn/prepare.py new file mode 100755 index 000000000..2b352f8ce --- /dev/null +++ b/benchmarks/geo_gnn/prepare.py @@ -0,0 +1,32 @@ +#!/usr/bin/env python +import argparse +import os + +from pcqm4m_subset import PCQM4Mv2Subset +from torch_geometric.datasets import QM9 + + +def parser(): + parser = argparse.ArgumentParser(description="Geometric GNN") + parser.add_argument( + "--num-samples", + type=int, + help="Number of samples to process in the dataset", + default=10000, + ) + parser.add_argument( + "--root", + type=str, + default=os.environ["MILABENCH_DIR_DATA"], + help="Dataset path", + ) + return parser + + +if __name__ == "__main__": + args, _ = parser().parse_known_args() + + # TODO: Handle argument for the number of samples + train_dataset = QM9(args.root) + # dataset = PCQM4Mv2Subset(args.num_samples, root=args.root) + diff --git a/benchmarks/geo_gnn/requirements-pre.cuda.txt b/benchmarks/geo_gnn/requirements-pre.cuda.txt new file mode 100644 index 000000000..3b11e80df --- /dev/null +++ b/benchmarks/geo_gnn/requirements-pre.cuda.txt @@ -0,0 +1,104 @@ +# +# This file is autogenerated by pip-compile with Python 3.10 +# by the following command: +# +# pip-compile --output-file=benchmarks/geo_gnn/requirements-pre.cuda.txt .pin/tmp-constraints-cuda-geo_gnn.txt benchmarks/geo_gnn/requirements-pre.in +# +--extra-index-url https://pypi.ngc.nvidia.com +--extra-index-url https://download.pytorch.org/whl/cu121 +--find-links https://data.pyg.org/whl/torch-2.3.0+cu121.html +--trusted-host pypi.ngc.nvidia.com + +filelock==3.15.4 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # torch + # triton +fsspec==2024.6.1 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # torch +jinja2==3.1.4 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # torch +markupsafe==2.1.5 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # jinja2 +mpmath==1.3.0 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # sympy +networkx==3.3 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # torch +nvidia-cublas-cu12==12.1.3.1 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # nvidia-cudnn-cu12 + # nvidia-cusolver-cu12 + # torch +nvidia-cuda-cupti-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # torch +nvidia-cuda-nvrtc-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # torch +nvidia-cuda-runtime-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # torch +nvidia-cudnn-cu12==8.9.2.26 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # torch +nvidia-cufft-cu12==11.0.2.54 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # torch +nvidia-curand-cu12==10.3.2.106 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # torch +nvidia-cusolver-cu12==11.4.5.107 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # torch +nvidia-cusparse-cu12==12.1.0.106 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # nvidia-cusolver-cu12 + # torch +nvidia-nccl-cu12==2.20.5 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # torch +nvidia-nvjitlink-cu12==12.6.20 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # nvidia-cusolver-cu12 + # nvidia-cusparse-cu12 +nvidia-nvtx-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # torch +sympy==1.13.2 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # torch +torch==2.3.1+cu121 + # via + # -c 
.pin/../.pin/constraints-cuda-gnn.txt + # -r benchmarks/geo_gnn/requirements-pre.in +triton==2.3.1 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # torch +typing-extensions==4.12.2 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # torch diff --git a/benchmarks/geo_gnn/requirements-pre.in b/benchmarks/geo_gnn/requirements-pre.in new file mode 100644 index 000000000..08ed5eeb4 --- /dev/null +++ b/benchmarks/geo_gnn/requirements-pre.in @@ -0,0 +1 @@ +torch \ No newline at end of file diff --git a/benchmarks/geo_gnn/requirements-pre.rocm.txt b/benchmarks/geo_gnn/requirements-pre.rocm.txt new file mode 100644 index 000000000..3aded346f --- /dev/null +++ b/benchmarks/geo_gnn/requirements-pre.rocm.txt @@ -0,0 +1,49 @@ +# +# This file is autogenerated by pip-compile with Python 3.10 +# by the following command: +# +# pip-compile --output-file=benchmarks/geo_gnn/requirements-pre.rocm.txt .pin/tmp-constraints-rocm-geo_gnn.txt benchmarks/geo_gnn/requirements-pre.in +# +--extra-index-url https://download.pytorch.org/whl/rocm6.0 + +filelock==3.15.4 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # pytorch-triton-rocm + # torch +fsspec==2024.6.1 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # torch +jinja2==3.1.4 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # torch +markupsafe==2.1.5 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # jinja2 +mpmath==1.3.0 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # sympy +networkx==3.3 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # torch +pytorch-triton-rocm==3.0.0 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # torch +sympy==1.13.2 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # torch +torch==2.4.0+rocm6.0 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # -r benchmarks/geo_gnn/requirements-pre.in +typing-extensions==4.12.2 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # torch diff --git a/benchmarks/geo_gnn/requirements.cuda.txt b/benchmarks/geo_gnn/requirements.cuda.txt new file mode 100644 index 000000000..8965d9007 --- /dev/null +++ b/benchmarks/geo_gnn/requirements.cuda.txt @@ -0,0 +1,339 @@ +# +# This file is autogenerated by pip-compile with Python 3.10 +# by the following command: +# +# pip-compile --output-file=benchmarks/geo_gnn/requirements.cuda.txt .pin/tmp-constraints-cuda-geo_gnn.txt benchmarks/geo_gnn/requirements-pre.cuda.txt benchmarks/geo_gnn/requirements.in +# +--extra-index-url https://pypi.ngc.nvidia.com +--extra-index-url https://download.pytorch.org/whl/cu121 +--find-links https://data.pyg.org/whl/torch-2.3.0+cu121.html +--trusted-host pypi.ngc.nvidia.com + +aiohappyeyeballs==2.4.0 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # aiohttp +aiohttp==3.10.5 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # torch-geometric +aiosignal==1.3.1 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # aiohttp +antlr4-python3-runtime==4.9.3 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # omegaconf +asttokens==2.4.1 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # giving +async-timeout==4.0.3 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # aiohttp +attrs==24.2.0 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # aiohttp +certifi==2024.7.4 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # requests +charset-normalizer==3.3.2 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # requests +codefind==0.1.6 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # ptera +executing==1.2.0 + # via + # -c 
.pin/../.pin/constraints-cuda-gnn.txt + # varname +filelock==3.15.4 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # -r benchmarks/geo_gnn/requirements-pre.cuda.txt + # torch + # triton +frozenlist==1.4.1 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # aiohttp + # aiosignal +fsspec==2024.6.1 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # -r benchmarks/geo_gnn/requirements-pre.cuda.txt + # torch + # torch-geometric +giving==0.4.2 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # ptera + # voir +idna==3.8 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # requests + # yarl +jinja2==3.1.4 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # -r benchmarks/geo_gnn/requirements-pre.cuda.txt + # torch + # torch-geometric +joblib==1.4.2 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # scikit-learn +markdown-it-py==3.0.0 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # rich +markupsafe==2.1.5 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # -r benchmarks/geo_gnn/requirements-pre.cuda.txt + # jinja2 +mdurl==0.1.2 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # markdown-it-py +mpmath==1.3.0 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # -r benchmarks/geo_gnn/requirements-pre.cuda.txt + # sympy +multidict==6.0.5 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # aiohttp + # yarl +networkx==3.3 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # -r benchmarks/geo_gnn/requirements-pre.cuda.txt + # torch +numpy==1.26.4 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # -r benchmarks/geo_gnn/requirements.in + # pandas + # rdkit + # scikit-learn + # scipy + # torch-geometric +nvidia-cublas-cu12==12.1.3.1 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # -r benchmarks/geo_gnn/requirements-pre.cuda.txt + # nvidia-cudnn-cu12 + # nvidia-cusolver-cu12 + # torch +nvidia-cuda-cupti-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # -r benchmarks/geo_gnn/requirements-pre.cuda.txt + # torch +nvidia-cuda-nvrtc-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # -r benchmarks/geo_gnn/requirements-pre.cuda.txt + # torch +nvidia-cuda-runtime-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # -r benchmarks/geo_gnn/requirements-pre.cuda.txt + # torch +nvidia-cudnn-cu12==8.9.2.26 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # -r benchmarks/geo_gnn/requirements-pre.cuda.txt + # torch +nvidia-cufft-cu12==11.0.2.54 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # -r benchmarks/geo_gnn/requirements-pre.cuda.txt + # torch +nvidia-curand-cu12==10.3.2.106 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # -r benchmarks/geo_gnn/requirements-pre.cuda.txt + # torch +nvidia-cusolver-cu12==11.4.5.107 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # -r benchmarks/geo_gnn/requirements-pre.cuda.txt + # torch +nvidia-cusparse-cu12==12.1.0.106 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # -r benchmarks/geo_gnn/requirements-pre.cuda.txt + # nvidia-cusolver-cu12 + # torch +nvidia-ml-py==12.560.30 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # voir +nvidia-nccl-cu12==2.20.5 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # -r benchmarks/geo_gnn/requirements-pre.cuda.txt + # torch +nvidia-nvjitlink-cu12==12.6.20 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # -r benchmarks/geo_gnn/requirements-pre.cuda.txt + # nvidia-cusolver-cu12 + # nvidia-cusparse-cu12 +nvidia-nvtx-cu12==12.1.105 + # via + # -c 
.pin/../.pin/constraints-cuda-gnn.txt + # -r benchmarks/geo_gnn/requirements-pre.cuda.txt + # torch +omegaconf==2.3.0 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # voir +ovld==0.3.9 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # voir +pandas==2.2.2 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # -r benchmarks/geo_gnn/requirements.in +pillow==10.4.0 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # rdkit +psutil==5.9.8 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # torch-geometric + # voir +ptera==1.4.1 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # voir +pygments==2.18.0 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # rich +pyparsing==3.1.4 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # torch-geometric +python-dateutil==2.9.0.post0 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # pandas +pytz==2024.1 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # pandas +pyyaml==6.0.2 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # omegaconf +rdkit==2024.3.5 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # -r benchmarks/geo_gnn/requirements.in +reactivex==4.0.4 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # giving +requests==2.32.3 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # torch-geometric +rich==13.8.0 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # voir +scikit-learn==1.5.1 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # torch-geometric +scipy==1.14.1 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # scikit-learn + # torch-cluster + # torch-geometric + # torch-sparse +six==1.16.0 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # asttokens + # python-dateutil +sympy==1.13.2 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # -r benchmarks/geo_gnn/requirements-pre.cuda.txt + # torch +threadpoolctl==3.5.0 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # scikit-learn +torch==2.3.1+cu121 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # -r benchmarks/geo_gnn/requirements-pre.cuda.txt +torch-cluster==1.6.3+pt23cu121 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # -r benchmarks/geo_gnn/requirements.in +torch-geometric==2.5.3 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # -r benchmarks/geo_gnn/requirements.in +torch-scatter==2.1.2+pt23cu121 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # -r benchmarks/geo_gnn/requirements.in +torch-sparse==0.6.18+pt23cu121 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # -r benchmarks/geo_gnn/requirements.in +tqdm==4.66.5 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # torch-geometric +triton==2.3.1 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # -r benchmarks/geo_gnn/requirements-pre.cuda.txt + # torch +typing-extensions==4.12.2 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # -r benchmarks/geo_gnn/requirements-pre.cuda.txt + # reactivex + # torch +tzdata==2024.1 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # pandas +urllib3==2.2.2 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # requests +varname==0.10.0 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # giving +voir==0.2.19 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../constraints/cuda.txt + # -r benchmarks/geo_gnn/requirements.in +yarl==1.9.4 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # aiohttp diff --git a/benchmarks/geo_gnn/requirements.in b/benchmarks/geo_gnn/requirements.in new file mode 100644 index 
000000000..6fbdd7dea --- /dev/null +++ b/benchmarks/geo_gnn/requirements.in @@ -0,0 +1,9 @@ +voir>=0.2.17,<0.3 +torch-geometric +torch-cluster +torch-sparse +torch-scatter +pandas +rdkit +numpy<2.0 + diff --git a/benchmarks/geo_gnn/requirements.rocm.txt b/benchmarks/geo_gnn/requirements.rocm.txt new file mode 100644 index 000000000..60246f795 --- /dev/null +++ b/benchmarks/geo_gnn/requirements.rocm.txt @@ -0,0 +1,272 @@ +# +# This file is autogenerated by pip-compile with Python 3.10 +# by the following command: +# +# pip-compile --output-file=benchmarks/geo_gnn/requirements.rocm.txt .pin/tmp-constraints-rocm-geo_gnn.txt benchmarks/geo_gnn/requirements-pre.rocm.txt benchmarks/geo_gnn/requirements.in +# +--extra-index-url https://download.pytorch.org/whl/rocm6.0 + +aiohappyeyeballs==2.4.0 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # aiohttp +aiohttp==3.10.5 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # torch-geometric +aiosignal==1.3.1 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # aiohttp +antlr4-python3-runtime==4.9.3 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # omegaconf +asttokens==2.4.1 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # giving +async-timeout==4.0.3 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # aiohttp +attrs==24.2.0 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # aiohttp +certifi==2024.7.4 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # requests +charset-normalizer==3.3.2 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # requests +codefind==0.1.6 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # ptera +executing==1.2.0 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # varname +filelock==3.15.4 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # -r benchmarks/geo_gnn/requirements-pre.rocm.txt + # pytorch-triton-rocm + # torch +frozenlist==1.4.1 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # aiohttp + # aiosignal +fsspec==2024.6.1 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # -r benchmarks/geo_gnn/requirements-pre.rocm.txt + # torch + # torch-geometric +giving==0.4.2 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # ptera + # voir +idna==3.7 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # requests + # yarl +jinja2==3.1.4 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # -r benchmarks/geo_gnn/requirements-pre.rocm.txt + # torch + # torch-geometric +joblib==1.4.2 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # scikit-learn +markdown-it-py==3.0.0 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # rich +markupsafe==2.1.5 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # -r benchmarks/geo_gnn/requirements-pre.rocm.txt + # jinja2 +mdurl==0.1.2 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # markdown-it-py +mpmath==1.3.0 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # -r benchmarks/geo_gnn/requirements-pre.rocm.txt + # sympy +multidict==6.0.5 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # aiohttp + # yarl +networkx==3.3 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # -r benchmarks/geo_gnn/requirements-pre.rocm.txt + # torch +numpy==1.26.4 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # -r benchmarks/geo_gnn/requirements.in + # pandas + # rdkit + # scikit-learn + # scipy + # torch-geometric +omegaconf==2.3.0 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # voir +ovld==0.3.8 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # voir +pandas==2.2.2 + # via 
+ # -c .pin/../.pin/constraints-rocm-gnn.txt + # -r benchmarks/geo_gnn/requirements.in +pillow==10.4.0 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # rdkit +psutil==5.9.8 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # torch-geometric + # voir +ptera==1.4.1 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # voir +pygments==2.18.0 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # rich +pynvml==11.5.3 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # voir +pyparsing==3.1.2 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # torch-geometric +python-dateutil==2.9.0.post0 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # pandas +pytorch-triton-rocm==3.0.0 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # -r benchmarks/geo_gnn/requirements-pre.rocm.txt + # torch +pytz==2024.1 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # pandas +pyyaml==6.0.2 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # omegaconf +rdkit==2024.3.5 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # -r benchmarks/geo_gnn/requirements.in +reactivex==4.0.4 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # giving +requests==2.32.3 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # torch-geometric +rich==13.7.1 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # voir +scikit-learn==1.5.1 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # torch-geometric +scipy==1.14.0 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # scikit-learn + # torch-cluster + # torch-geometric + # torch-sparse +six==1.16.0 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # asttokens + # python-dateutil +sympy==1.13.2 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # -r benchmarks/geo_gnn/requirements-pre.rocm.txt + # torch +threadpoolctl==3.5.0 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # scikit-learn +torch==2.4.0+rocm6.0 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # -r benchmarks/geo_gnn/requirements-pre.rocm.txt +torch-cluster==1.6.3 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # -r benchmarks/geo_gnn/requirements.in +torch-geometric==2.5.3 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # -r benchmarks/geo_gnn/requirements.in +torch-scatter==2.1.2 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # -r benchmarks/geo_gnn/requirements.in +torch-sparse==0.6.18 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # -r benchmarks/geo_gnn/requirements.in +tqdm==4.66.5 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # torch-geometric +typing-extensions==4.12.2 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # -r benchmarks/geo_gnn/requirements-pre.rocm.txt + # reactivex + # torch +tzdata==2024.1 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # pandas +urllib3==2.2.2 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # requests +varname==0.10.0 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # giving +voir==0.2.17 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../constraints/rocm.txt + # -r benchmarks/geo_gnn/requirements.in +yarl==1.9.4 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # aiohttp diff --git a/benchmarks/geo_gnn/voirfile.py b/benchmarks/geo_gnn/voirfile.py new file mode 100644 index 000000000..d93f886cd --- /dev/null +++ b/benchmarks/geo_gnn/voirfile.py @@ -0,0 +1,38 @@ +from dataclasses import dataclass + +from voir import configurable +from voir.instruments import dash, early_stop, log, rate +from benchmate.monitor 
import monitor_monogpu + +@dataclass +class Config: + """voir configuration""" + + # Whether to display the dash or not + dash: bool = False + + # How often to log the rates + interval: str = "1s" + + # Number of rates to skip before logging + skip: int = 5 + + # Number of rates to log before stopping + stop: int = 20 + + # Number of seconds between each gpu poll + gpu_poll: int = 3 + + +@configurable +def instrument_main(ov, options: Config): + yield ov.phases.init + + if options.dash: + ov.require(dash) + + ov.require( + log("value", "progress", "rate", "units", "loss", "gpudata", context="task"), + early_stop(n=options.stop, key="rate", task="train"), + monitor_monogpu(poll_interval=options.gpu_poll), + ) diff --git a/benchmarks/huggingface/requirements.cuda.txt b/benchmarks/huggingface/requirements.cuda.txt index 3f2e112b6..c9ba4dbdb 100644 --- a/benchmarks/huggingface/requirements.cuda.txt +++ b/benchmarks/huggingface/requirements.cuda.txt @@ -40,7 +40,7 @@ filelock==3.15.4 # torch # transformers # triton -fsspec==2024.5.0 +fsspec==2024.6.1 # via # -c .pin/../.pin/constraints-cuda-torch.txt # huggingface-hub @@ -50,12 +50,12 @@ giving==0.4.2 # -c .pin/../.pin/constraints-cuda-torch.txt # ptera # voir -huggingface-hub==0.24.5 +huggingface-hub==0.24.6 # via # -c .pin/../.pin/constraints-cuda-torch.txt # tokenizers # transformers -idna==3.7 +idna==3.8 # via # -c .pin/../.pin/constraints-cuda-torch.txt # requests @@ -126,6 +126,10 @@ nvidia-cusparse-cu12==12.1.0.106 # -c .pin/../.pin/constraints-cuda-torch.txt # nvidia-cusolver-cu12 # torch +nvidia-ml-py==12.560.30 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # voir nvidia-nccl-cu12==2.20.5 # via # -c .pin/../.pin/constraints-cuda-torch.txt @@ -143,7 +147,7 @@ omegaconf==2.3.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # voir -ovld==0.3.6 +ovld==0.3.9 # via # -c .pin/../.pin/constraints-cuda-torch.txt # voir @@ -168,11 +172,7 @@ pygments==2.18.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # rich -pynvml==11.5.3 - # via - # -c .pin/../.pin/constraints-cuda-torch.txt - # voir -pyyaml==6.0.1 +pyyaml==6.0.2 # via # -c .pin/../.pin/constraints-cuda-torch.txt # huggingface-hub @@ -191,11 +191,11 @@ requests==2.32.3 # -c .pin/../.pin/constraints-cuda-torch.txt # huggingface-hub # transformers -rich==13.7.1 +rich==13.8.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # voir -safetensors==0.4.3 +safetensors==0.4.4 # via # -c .pin/../.pin/constraints-cuda-torch.txt # transformers @@ -203,7 +203,7 @@ six==1.16.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # asttokens -sympy==1.13.1 +sympy==1.13.2 # via # -c .pin/../.pin/constraints-cuda-torch.txt # torch @@ -215,12 +215,12 @@ torch==2.4.0+cu121 # via # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/huggingface/requirements.in -tqdm==4.66.4 +tqdm==4.66.5 # via # -c .pin/../.pin/constraints-cuda-torch.txt # huggingface-hub # transformers -transformers==4.43.3 +transformers==4.44.2 # via # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/huggingface/requirements.in diff --git a/benchmarks/huggingface/requirements.rocm.txt b/benchmarks/huggingface/requirements.rocm.txt index b5b910f65..1f54d841a 100644 --- a/benchmarks/huggingface/requirements.rocm.txt +++ b/benchmarks/huggingface/requirements.rocm.txt @@ -4,10 +4,7 @@ # # pip-compile --output-file=benchmarks/huggingface/requirements.rocm.txt .pin/tmp-constraints-rocm-hf.txt benchmarks/huggingface/requirements.in # ---extra-index-url https://pypi.ngc.nvidia.com --extra-index-url 
https://download.pytorch.org/whl/rocm6.0 ---find-links https://storage.googleapis.com/jax-releases/jax_cuda_releases.html ---trusted-host pypi.ngc.nvidia.com antlr4-python3-runtime==4.9.3 # via @@ -17,7 +14,7 @@ asttokens==2.4.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # giving -certifi==2024.6.2 +certifi==2024.7.4 # via # -c .pin/../.pin/constraints-rocm-torch.txt # requests @@ -40,7 +37,7 @@ filelock==3.15.4 # pytorch-triton-rocm # torch # transformers -fsspec==2024.5.0 +fsspec==2024.6.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # huggingface-hub @@ -50,7 +47,7 @@ giving==0.4.2 # -c .pin/../.pin/constraints-rocm-torch.txt # ptera # voir -huggingface-hub==0.23.5 +huggingface-hub==0.24.6 # via # -c .pin/../.pin/constraints-rocm-torch.txt # tokenizers @@ -91,7 +88,7 @@ omegaconf==2.3.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # voir -ovld==0.3.5 +ovld==0.3.8 # via # -c .pin/../.pin/constraints-rocm-torch.txt # voir @@ -100,6 +97,10 @@ packaging==24.1 # -c .pin/../.pin/constraints-rocm-torch.txt # huggingface-hub # transformers +pillow==10.4.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/huggingface/requirements.in psutil==5.9.8 # via # -c .pin/../.pin/constraints-rocm-torch.txt @@ -116,11 +117,11 @@ pynvml==11.5.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # voir -pytorch-triton-rocm==2.3.1 +pytorch-triton-rocm==3.0.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torch -pyyaml==6.0.1 +pyyaml==6.0.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # huggingface-hub @@ -130,7 +131,7 @@ reactivex==4.0.4 # via # -c .pin/../.pin/constraints-rocm-torch.txt # giving -regex==2024.5.15 +regex==2024.7.24 # via # -c .pin/../.pin/constraints-rocm-torch.txt # transformers @@ -143,7 +144,7 @@ rich==13.7.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # voir -safetensors==0.4.3 +safetensors==0.4.4 # via # -c .pin/../.pin/constraints-rocm-torch.txt # transformers @@ -151,7 +152,7 @@ six==1.16.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # asttokens -sympy==1.13.0 +sympy==1.13.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torch @@ -159,16 +160,16 @@ tokenizers==0.19.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # transformers -torch==2.3.1+rocm6.0 +torch==2.4.0+rocm6.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/huggingface/requirements.in -tqdm==4.66.4 +tqdm==4.66.5 # via # -c .pin/../.pin/constraints-rocm-torch.txt # huggingface-hub # transformers -transformers==4.42.4 +transformers==4.44.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/huggingface/requirements.in @@ -178,7 +179,7 @@ typing-extensions==4.12.2 # huggingface-hub # reactivex # torch -urllib3==1.26.19 +urllib3==2.2.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # requests diff --git a/benchmarks/huggingface/requirements.xpu.txt b/benchmarks/huggingface/requirements.xpu.txt index bcebec727..cb14810eb 100644 --- a/benchmarks/huggingface/requirements.xpu.txt +++ b/benchmarks/huggingface/requirements.xpu.txt @@ -4,10 +4,8 @@ # # pip-compile --output-file=benchmarks/huggingface/requirements.xpu.txt .pin/tmp-constraints-xpu-hf.txt benchmarks/huggingface/requirements.in # ---extra-index-url https://pypi.ngc.nvidia.com +--extra-index-url https://download.pytorch.org/whl/cpu --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ ---find-links https://storage.googleapis.com/jax-releases/jax_cuda_releases.html ---trusted-host pypi.ngc.nvidia.com antlr4-python3-runtime==4.9.3 # 
via @@ -17,7 +15,7 @@ asttokens==2.4.1 # via # -c .pin/../.pin/constraints-xpu-torch.txt # giving -certifi==2024.6.2 +certifi==2024.7.4 # via # -c .pin/../.pin/constraints-xpu-torch.txt # requests @@ -49,7 +47,7 @@ giving==0.4.2 # -c .pin/../.pin/constraints-xpu-torch.txt # ptera # voir -huggingface-hub==0.24.0 +huggingface-hub==0.24.5 # via # -c .pin/../.pin/constraints-xpu-torch.txt # tokenizers @@ -90,7 +88,7 @@ omegaconf==2.3.0 # via # -c .pin/../.pin/constraints-xpu-torch.txt # voir -ovld==0.3.5 +ovld==0.3.8 # via # -c .pin/../.pin/constraints-xpu-torch.txt # voir @@ -99,6 +97,10 @@ packaging==24.1 # -c .pin/../.pin/constraints-xpu-torch.txt # huggingface-hub # transformers +pillow==10.4.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # -r benchmarks/huggingface/requirements.in psutil==5.9.8 # via # -c .pin/../.pin/constraints-xpu-torch.txt @@ -115,7 +117,7 @@ pynvml==11.5.3 # via # -c .pin/../.pin/constraints-xpu-torch.txt # voir -pyyaml==6.0.1 +pyyaml==6.0.2 # via # -c .pin/../.pin/constraints-xpu-torch.txt # huggingface-hub @@ -125,7 +127,7 @@ reactivex==4.0.4 # via # -c .pin/../.pin/constraints-xpu-torch.txt # giving -regex==2024.5.15 +regex==2024.7.24 # via # -c .pin/../.pin/constraints-xpu-torch.txt # transformers @@ -138,7 +140,7 @@ rich==13.7.1 # via # -c .pin/../.pin/constraints-xpu-torch.txt # voir -safetensors==0.4.3 +safetensors==0.4.4 # via # -c .pin/../.pin/constraints-xpu-torch.txt # transformers @@ -146,7 +148,7 @@ six==1.16.0 # via # -c .pin/../.pin/constraints-xpu-torch.txt # asttokens -sympy==1.13.0 +sympy==1.13.1 # via # -c .pin/../.pin/constraints-xpu-torch.txt # torch @@ -154,17 +156,18 @@ tokenizers==0.19.1 # via # -c .pin/../.pin/constraints-xpu-torch.txt # transformers -torch==2.1.0.post2+cxx11.abi +torch==2.4.0+cpu # via # -c .pin/../.pin/constraints-xpu-torch.txt + # -c .pin/../constraints/extra/torch.xpu.txt # -c .pin/../constraints/xpu.txt # -r benchmarks/huggingface/requirements.in -tqdm==4.66.4 +tqdm==4.66.5 # via # -c .pin/../.pin/constraints-xpu-torch.txt # huggingface-hub # transformers -transformers==4.42.4 +transformers==4.44.0 # via # -c .pin/../.pin/constraints-xpu-torch.txt # -r benchmarks/huggingface/requirements.in @@ -174,7 +177,7 @@ typing-extensions==4.12.2 # huggingface-hub # reactivex # torch -urllib3==1.26.19 +urllib3==2.2.2 # via # -c .pin/../.pin/constraints-xpu-torch.txt # requests @@ -185,5 +188,6 @@ varname==0.10.0 voir==0.2.19 # via # -c .pin/../.pin/constraints-xpu-torch.txt + # -c .pin/../constraints/extra/torch.xpu.txt # -c .pin/../constraints/xpu.txt # -r benchmarks/huggingface/requirements.in diff --git a/benchmarks/huggingface/tunableop_results0.csv b/benchmarks/huggingface/tunableop_results0.csv new file mode 100644 index 000000000..6a38d561a --- /dev/null +++ b/benchmarks/huggingface/tunableop_results0.csv @@ -0,0 +1,17 @@ +Validator,PT_VERSION,2.4.0 +Validator,ROCBLAS_VERSION,4.0.0-88df9726-dirty +Validator,HIPBLASLT_VERSION,0.6.0-592518e7 +Validator,ROCM_VERSION,6.0.0.0-91-08e5094 +Validator,GCN_ARCH_NAME,gfx942:sramecc+:xnack- +GemmTunableOp_float_NT,nt_768_3072_16384,Gemm_Rocblas_69720,0.751226 +GemmTunableOp_float_NT,nt_3072_768_16384,Gemm_Rocblas_69733,0.684042 +GemmTunableOp_float_NT,nt_768_768_16384,Gemm_Hipblaslt_NT_28806,0.264226 +GemmTunableOp_float_NT,nt_768_30522_16384,Gemm_Hipblaslt_NT_27808,5.73919 +GemmTunableOp_float_NN,nn_768_16384_3072,Gemm_Hipblaslt_NN_33293,0.701076 +GemmTunableOp_float_NN,nn_768_16384_768,Gemm_Hipblaslt_NN_33685,0.209309 
+GemmTunableOp_float_NN,nn_3072_16384_768,Gemm_Hipblaslt_NN_33225,0.69655 +GemmTunableOp_float_NN,nn_768_16384_30522,Gemm_Hipblaslt_NN_33924,5.81957 +GemmTunableOp_float_TN,tn_30522_16384_768,Default,6.06459 +GemmTunableOp_float_TN,tn_768_16384_3072,Gemm_Hipblaslt_TN_34830,0.584625 +GemmTunableOp_float_TN,tn_3072_16384_768,Gemm_Rocblas_69037,0.742789 +GemmTunableOp_float_TN,tn_768_16384_768,Gemm_Rocblas_69047,0.211827 diff --git a/benchmarks/huggingface/voirfile.py b/benchmarks/huggingface/voirfile.py index 0ed042a80..b2e3ddd14 100644 --- a/benchmarks/huggingface/voirfile.py +++ b/benchmarks/huggingface/voirfile.py @@ -25,7 +25,7 @@ class Config: stop: int = 20 # Number of seconds between each gpu poll - gpu_poll: int = 3 + gpu_poll: int = 1 @configurable diff --git a/benchmarks/lightning/requirements.cuda.txt b/benchmarks/lightning/requirements.cuda.txt index 88c732317..80f6b7f42 100644 --- a/benchmarks/lightning/requirements.cuda.txt +++ b/benchmarks/lightning/requirements.cuda.txt @@ -9,11 +9,11 @@ --find-links https://storage.googleapis.com/jax-releases/jax_cuda_releases.html --trusted-host pypi.ngc.nvidia.com -aiohappyeyeballs==2.3.4 +aiohappyeyeballs==2.4.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # aiohttp -aiohttp==3.10.0 +aiohttp==3.10.5 # via # -c .pin/../.pin/constraints-cuda-torch.txt # fsspec @@ -33,7 +33,7 @@ async-timeout==4.0.3 # via # -c .pin/../.pin/constraints-cuda-torch.txt # aiohttp -attrs==23.2.0 +attrs==24.2.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # aiohttp @@ -55,7 +55,7 @@ frozenlist==1.4.1 # -c .pin/../.pin/constraints-cuda-torch.txt # aiohttp # aiosignal -fsspec[http]==2024.5.0 +fsspec[http]==2024.6.1 # via # -c .pin/../.pin/constraints-cuda-torch.txt # lightning @@ -66,11 +66,11 @@ giving==0.4.2 # -c .pin/../.pin/constraints-cuda-torch.txt # ptera # voir -idna==3.7 +idna==3.8 # via # -c .pin/../.pin/constraints-cuda-torch.txt # yarl -importlib-resources==6.4.0 +importlib-resources==6.4.4 # via # -c .pin/../.pin/constraints-cuda-torch.txt # torchcompat @@ -78,7 +78,7 @@ jinja2==3.1.4 # via # -c .pin/../.pin/constraints-cuda-torch.txt # torch -lightning==2.3.3 +lightning==2.4.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/lightning/requirements.in @@ -116,8 +116,6 @@ networkx==3.3 numpy==1.26.4 # via # -c .pin/../.pin/constraints-cuda-torch.txt - # lightning - # pytorch-lightning # torchmetrics # torchvision nvidia-cublas-cu12==12.1.3.1 @@ -159,6 +157,10 @@ nvidia-cusparse-cu12==12.1.0.106 # -c .pin/../.pin/constraints-cuda-torch.txt # nvidia-cusolver-cu12 # torch +nvidia-ml-py==12.560.30 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # voir nvidia-nccl-cu12==2.20.5 # via # -c .pin/../.pin/constraints-cuda-torch.txt @@ -176,7 +178,7 @@ omegaconf==2.3.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # voir -ovld==0.3.6 +ovld==0.3.9 # via # -c .pin/../.pin/constraints-cuda-torch.txt # voir @@ -203,15 +205,11 @@ pygments==2.18.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # rich -pynvml==11.5.3 - # via - # -c .pin/../.pin/constraints-cuda-torch.txt - # voir -pytorch-lightning==2.3.3 +pytorch-lightning==2.4.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # lightning -pyyaml==6.0.1 +pyyaml==6.0.2 # via # -c .pin/../.pin/constraints-cuda-torch.txt # lightning @@ -221,7 +219,7 @@ reactivex==4.0.4 # via # -c .pin/../.pin/constraints-cuda-torch.txt # giving -rich==13.7.1 +rich==13.8.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # voir @@ -229,7 +227,7 @@ six==1.16.0 # via # -c 
.pin/../.pin/constraints-cuda-torch.txt # asttokens -sympy==1.13.1 +sympy==1.13.2 # via # -c .pin/../.pin/constraints-cuda-torch.txt # torch @@ -246,7 +244,7 @@ torchcompat==1.1.4 # -c .pin/../.pin/constraints-cuda-torch.txt # -c .pin/../constraints/cuda.txt # -r benchmarks/lightning/requirements.in -torchmetrics==1.4.0.post0 +torchmetrics==1.4.1 # via # -c .pin/../.pin/constraints-cuda-torch.txt # lightning @@ -255,7 +253,7 @@ torchvision==0.19.0+cu121 # via # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/lightning/requirements.in -tqdm==4.66.4 +tqdm==4.66.5 # via # -c .pin/../.pin/constraints-cuda-torch.txt # lightning diff --git a/benchmarks/lightning/requirements.rocm.txt b/benchmarks/lightning/requirements.rocm.txt new file mode 100644 index 000000000..26fdcedfa --- /dev/null +++ b/benchmarks/lightning/requirements.rocm.txt @@ -0,0 +1,233 @@ +# +# This file is autogenerated by pip-compile with Python 3.10 +# by the following command: +# +# pip-compile --output-file=benchmarks/lightning/requirements.rocm.txt .pin/tmp-constraints-rocm-lightning-gpus.txt benchmarks/lightning/requirements.in +# +--extra-index-url https://download.pytorch.org/whl/rocm6.0 + +aiohappyeyeballs==2.4.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # aiohttp +aiohttp==3.10.5 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # fsspec +aiosignal==1.3.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # aiohttp +antlr4-python3-runtime==4.9.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # omegaconf +asttokens==2.4.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # giving +async-timeout==4.0.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # aiohttp +attrs==24.2.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # aiohttp +codefind==0.1.6 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # ptera +executing==1.2.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # varname +filelock==3.15.4 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # pytorch-triton-rocm + # torch +frozenlist==1.4.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # aiohttp + # aiosignal +fsspec[http]==2024.6.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # lightning + # pytorch-lightning + # torch +giving==0.4.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # ptera + # voir +idna==3.7 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # yarl +importlib-resources==6.4.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # torchcompat +jinja2==3.1.4 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # torch +lightning==2.4.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/lightning/requirements.in +lightning-utilities==0.11.6 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # lightning + # pytorch-lightning + # torchmetrics +markdown-it-py==3.0.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # rich +markupsafe==2.1.5 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # jinja2 +mdurl==0.1.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # markdown-it-py +mpmath==1.3.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # sympy +multidict==6.0.5 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # aiohttp + # yarl +networkx==3.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # torch +numpy==1.26.4 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # torchmetrics + # torchvision +omegaconf==2.3.0 + # via + # -c 
.pin/../.pin/constraints-rocm-torch.txt + # voir +ovld==0.3.8 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir +packaging==24.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # lightning + # lightning-utilities + # pytorch-lightning + # torchmetrics +pillow==10.4.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # torchvision +psutil==5.9.8 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir +ptera==1.4.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir +pygments==2.18.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # rich +pynvml==11.5.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir +pytorch-lightning==2.4.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # lightning +pytorch-triton-rocm==3.0.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # torch +pyyaml==6.0.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # lightning + # omegaconf + # pytorch-lightning +reactivex==4.0.4 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # giving +rich==13.7.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir +six==1.16.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # asttokens +sympy==1.13.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # torch +torch==2.4.0+rocm6.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/lightning/requirements.in + # lightning + # pytorch-lightning + # torchmetrics + # torchvision +torchcompat==1.1.4 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -c .pin/../constraints/rocm.txt + # -r benchmarks/lightning/requirements.in +torchmetrics==1.4.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # lightning + # pytorch-lightning +torchvision==0.19.0+rocm6.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/lightning/requirements.in +tqdm==4.66.5 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # lightning + # pytorch-lightning +typing-extensions==4.12.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # lightning + # lightning-utilities + # pytorch-lightning + # reactivex + # torch +varname==0.10.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # giving +voir==0.2.17 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -c .pin/../constraints/rocm.txt + # -r benchmarks/lightning/requirements.in +yarl==1.9.4 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # aiohttp + +# The following packages are considered to be unsafe in a requirements file: +# setuptools diff --git a/benchmarks/lightning/requirements.xpu.txt b/benchmarks/lightning/requirements.xpu.txt new file mode 100644 index 000000000..338ee0fb9 --- /dev/null +++ b/benchmarks/lightning/requirements.xpu.txt @@ -0,0 +1,235 @@ +# +# This file is autogenerated by pip-compile with Python 3.10 +# by the following command: +# +# pip-compile --output-file=benchmarks/lightning/requirements.xpu.txt .pin/tmp-constraints-xpu-lightning-gpus.txt benchmarks/lightning/requirements.in +# +--extra-index-url https://download.pytorch.org/whl/cpu +--extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ + +aiohappyeyeballs==2.3.5 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # aiohttp +aiohttp==3.10.2 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # fsspec +aiosignal==1.3.1 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # aiohttp +antlr4-python3-runtime==4.9.3 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # omegaconf 
+asttokens==2.4.1 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # giving +async-timeout==4.0.3 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # aiohttp +attrs==24.2.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # aiohttp +codefind==0.1.6 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # ptera +executing==1.2.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # varname +filelock==3.15.4 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # torch +frozenlist==1.4.1 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # aiohttp + # aiosignal +fsspec[http]==2024.5.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # lightning + # pytorch-lightning + # torch +giving==0.4.2 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # ptera + # voir +idna==3.7 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # yarl +importlib-resources==6.4.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # torchcompat +jinja2==3.1.4 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # torch +lightning==2.4.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # -r benchmarks/lightning/requirements.in +lightning-utilities==0.11.6 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # lightning + # pytorch-lightning + # torchmetrics +markdown-it-py==3.0.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # rich +markupsafe==2.1.5 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # jinja2 +mdurl==0.1.2 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # markdown-it-py +mpmath==1.3.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # sympy +multidict==6.0.5 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # aiohttp + # yarl +networkx==3.3 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # torch +numpy==1.26.4 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # torchmetrics + # torchvision +omegaconf==2.3.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # voir +ovld==0.3.8 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # voir +packaging==24.1 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # lightning + # lightning-utilities + # pytorch-lightning + # torchmetrics +pillow==10.4.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # torchvision +psutil==5.9.8 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # voir +ptera==1.4.1 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # voir +pygments==2.18.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # rich +pynvml==11.5.3 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # voir +pytorch-lightning==2.4.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # lightning +pyyaml==6.0.2 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # lightning + # omegaconf + # pytorch-lightning +reactivex==4.0.4 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # giving +rich==13.7.1 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # voir +six==1.16.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # asttokens +sympy==1.13.1 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # torch +torch==2.4.0+cpu + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # -c .pin/../constraints/extra/torch.xpu.txt + # -c .pin/../constraints/xpu.txt + # -r benchmarks/lightning/requirements.in + # lightning + # pytorch-lightning + # torchmetrics + # torchvision +torchcompat==1.1.4 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # -c .pin/../constraints/extra/torch.xpu.txt + # 
-c .pin/../constraints/xpu.txt + # -r benchmarks/lightning/requirements.in +torchmetrics==1.4.1 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # lightning + # pytorch-lightning +torchvision==0.19.0+cpu + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # -c .pin/../constraints/extra/torch.xpu.txt + # -c .pin/../constraints/xpu.txt + # -r benchmarks/lightning/requirements.in +tqdm==4.66.5 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # lightning + # pytorch-lightning +typing-extensions==4.12.2 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # lightning + # lightning-utilities + # pytorch-lightning + # reactivex + # torch +varname==0.10.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # giving +voir==0.2.17 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # -c .pin/../constraints/extra/torch.xpu.txt + # -c .pin/../constraints/xpu.txt + # -r benchmarks/lightning/requirements.in +yarl==1.9.4 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # aiohttp + +# The following packages are considered to be unsafe in a requirements file: +# setuptools diff --git a/benchmarks/llama/main.py b/benchmarks/llama/main.py index 696d16288..a17053296 100755 --- a/benchmarks/llama/main.py +++ b/benchmarks/llama/main.py @@ -83,6 +83,7 @@ def huggingface_main(args, model, config): # we just instantiate an untrained one println("Model") device = accelerator.fetch_device(0) + print(device) if args.pretrained: model = LlamaForCausalLM.from_pretrained(config["_name_or_path"]).to(device=device) @@ -93,7 +94,7 @@ def huggingface_main(args, model, config): pipeline = transformers.pipeline( "text-generation", model=model, - torch_dtype=torch.float16, + torch_dtype=torch.bfloat16, # device_map="cuda", tokenizer=tokenizer, device=device, diff --git a/benchmarks/llama/requirements.cuda.txt b/benchmarks/llama/requirements.cuda.txt index 98e990076..a26e5bec4 100644 --- a/benchmarks/llama/requirements.cuda.txt +++ b/benchmarks/llama/requirements.cuda.txt @@ -9,11 +9,11 @@ --find-links https://storage.googleapis.com/jax-releases/jax_cuda_releases.html --trusted-host pypi.ngc.nvidia.com -aiohappyeyeballs==2.3.4 +aiohappyeyeballs==2.4.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # aiohttp -aiohttp==3.10.0 +aiohttp==3.10.5 # via # -c .pin/../.pin/constraints-cuda-torch.txt # datasets @@ -34,7 +34,7 @@ async-timeout==4.0.3 # via # -c .pin/../.pin/constraints-cuda-torch.txt # aiohttp -attrs==23.2.0 +attrs==24.2.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # aiohttp @@ -50,7 +50,7 @@ codefind==0.1.6 # via # -c .pin/../.pin/constraints-cuda-torch.txt # ptera -datasets==2.20.0 +datasets==2.21.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/llama/requirements.in @@ -84,7 +84,7 @@ frozenlist==1.4.1 # -c .pin/../.pin/constraints-cuda-torch.txt # aiohttp # aiosignal -fsspec[http]==2024.5.0 +fsspec[http]==2024.6.1 # via # -c .pin/../.pin/constraints-cuda-torch.txt # datasets @@ -95,13 +95,13 @@ giving==0.4.2 # -c .pin/../.pin/constraints-cuda-torch.txt # ptera # voir -huggingface-hub==0.24.5 +huggingface-hub==0.24.6 # via # -c .pin/../.pin/constraints-cuda-torch.txt # datasets # tokenizers # transformers -idna==3.7 +idna==3.8 # via # -c .pin/../.pin/constraints-cuda-torch.txt # requests @@ -186,6 +186,10 @@ nvidia-cusparse-cu12==12.1.0.106 # -c .pin/../.pin/constraints-cuda-torch.txt # nvidia-cusolver-cu12 # torch +nvidia-ml-py==12.560.30 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # voir nvidia-nccl-cu12==2.20.5 # via # -c 
.pin/../.pin/constraints-cuda-torch.txt @@ -203,7 +207,7 @@ omegaconf==2.3.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # voir -ovld==0.3.6 +ovld==0.3.9 # via # -c .pin/../.pin/constraints-cuda-torch.txt # voir @@ -229,18 +233,10 @@ pyarrow==17.0.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # datasets -pyarrow-hotfix==0.6 - # via - # -c .pin/../.pin/constraints-cuda-torch.txt - # datasets pygments==2.18.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # rich -pynvml==11.5.3 - # via - # -c .pin/../.pin/constraints-cuda-torch.txt - # voir python-dateutil==2.9.0.post0 # via # -c .pin/../.pin/constraints-cuda-torch.txt @@ -249,7 +245,7 @@ pytz==2024.1 # via # -c .pin/../.pin/constraints-cuda-torch.txt # pandas -pyyaml==6.0.1 +pyyaml==6.0.2 # via # -c .pin/../.pin/constraints-cuda-torch.txt # datasets @@ -270,11 +266,11 @@ requests==2.32.3 # datasets # huggingface-hub # transformers -rich==13.7.1 +rich==13.8.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # voir -safetensors==0.4.3 +safetensors==0.4.4 # via # -c .pin/../.pin/constraints-cuda-torch.txt # transformers @@ -288,7 +284,7 @@ six==1.16.0 # asttokens # fire # python-dateutil -sympy==1.13.1 +sympy==1.13.2 # via # -c .pin/../.pin/constraints-cuda-torch.txt # torch @@ -305,13 +301,13 @@ torch==2.4.0+cu121 # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/llama/requirements.in # fairscale -tqdm==4.66.4 +tqdm==4.66.5 # via # -c .pin/../.pin/constraints-cuda-torch.txt # datasets # huggingface-hub # transformers -transformers==4.43.3 +transformers==4.44.2 # via # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/llama/requirements.in @@ -342,7 +338,7 @@ voir==0.2.19 # -c .pin/../.pin/constraints-cuda-torch.txt # -c .pin/../constraints/cuda.txt # -r benchmarks/llama/requirements.in -xxhash==3.4.1 +xxhash==3.5.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # datasets diff --git a/benchmarks/llama/requirements.rocm.txt b/benchmarks/llama/requirements.rocm.txt index 25293406d..97c44bb0c 100644 --- a/benchmarks/llama/requirements.rocm.txt +++ b/benchmarks/llama/requirements.rocm.txt @@ -4,12 +4,13 @@ # # pip-compile --output-file=benchmarks/llama/requirements.rocm.txt .pin/tmp-constraints-rocm-llm.txt benchmarks/llama/requirements.in # ---extra-index-url https://pypi.ngc.nvidia.com --extra-index-url https://download.pytorch.org/whl/rocm6.0 ---find-links https://storage.googleapis.com/jax-releases/jax_cuda_releases.html ---trusted-host pypi.ngc.nvidia.com -aiohttp==3.9.5 +aiohappyeyeballs==2.4.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # aiohttp +aiohttp==3.10.5 # via # -c .pin/../.pin/constraints-rocm-torch.txt # datasets @@ -30,11 +31,11 @@ async-timeout==4.0.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # aiohttp -attrs==23.2.0 +attrs==24.2.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # aiohttp -certifi==2024.6.2 +certifi==2024.7.4 # via # -c .pin/../.pin/constraints-rocm-torch.txt # requests @@ -46,7 +47,7 @@ codefind==0.1.6 # via # -c .pin/../.pin/constraints-rocm-torch.txt # ptera -datasets==2.20.0 +datasets==2.21.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/llama/requirements.in @@ -80,7 +81,7 @@ frozenlist==1.4.1 # -c .pin/../.pin/constraints-rocm-torch.txt # aiohttp # aiosignal -fsspec[http]==2024.5.0 +fsspec[http]==2024.6.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # datasets @@ -91,7 +92,7 @@ giving==0.4.2 # -c .pin/../.pin/constraints-rocm-torch.txt # ptera # voir -huggingface-hub==0.23.5 +huggingface-hub==0.24.6 # via # -c 
.pin/../.pin/constraints-rocm-torch.txt # datasets @@ -147,7 +148,7 @@ omegaconf==2.3.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # voir -ovld==0.3.5 +ovld==0.3.8 # via # -c .pin/../.pin/constraints-rocm-torch.txt # voir @@ -173,10 +174,6 @@ pyarrow==17.0.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # datasets -pyarrow-hotfix==0.6 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # datasets pygments==2.18.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt @@ -189,7 +186,7 @@ python-dateutil==2.9.0.post0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # pandas -pytorch-triton-rocm==2.3.1 +pytorch-triton-rocm==3.0.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torch @@ -197,7 +194,7 @@ pytz==2024.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # pandas -pyyaml==6.0.1 +pyyaml==6.0.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # datasets @@ -208,7 +205,7 @@ reactivex==4.0.4 # via # -c .pin/../.pin/constraints-rocm-torch.txt # giving -regex==2024.5.15 +regex==2024.7.24 # via # -c .pin/../.pin/constraints-rocm-torch.txt # transformers @@ -222,7 +219,7 @@ rich==13.7.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # voir -safetensors==0.4.3 +safetensors==0.4.4 # via # -c .pin/../.pin/constraints-rocm-torch.txt # transformers @@ -236,7 +233,7 @@ six==1.16.0 # asttokens # fire # python-dateutil -sympy==1.13.0 +sympy==1.13.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torch @@ -248,18 +245,18 @@ tokenizers==0.19.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # transformers -torch==2.3.1+rocm6.0 +torch==2.4.0+rocm6.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/llama/requirements.in # fairscale -tqdm==4.66.4 +tqdm==4.66.5 # via # -c .pin/../.pin/constraints-rocm-torch.txt # datasets # huggingface-hub # transformers -transformers==4.42.4 +transformers==4.44.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/llama/requirements.in @@ -273,7 +270,7 @@ tzdata==2024.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # pandas -urllib3==1.26.19 +urllib3==2.2.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # requests @@ -286,7 +283,7 @@ voir==0.2.19 # -c .pin/../.pin/constraints-rocm-torch.txt # -c .pin/../constraints/rocm.txt # -r benchmarks/llama/requirements.in -xxhash==3.4.1 +xxhash==3.5.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # datasets diff --git a/benchmarks/llama/requirements.xpu.txt b/benchmarks/llama/requirements.xpu.txt index 9ef2b6f8d..b852606fd 100644 --- a/benchmarks/llama/requirements.xpu.txt +++ b/benchmarks/llama/requirements.xpu.txt @@ -4,12 +4,14 @@ # # pip-compile --output-file=benchmarks/llama/requirements.xpu.txt .pin/tmp-constraints-xpu-llm.txt benchmarks/llama/requirements.in # ---extra-index-url https://pypi.ngc.nvidia.com +--extra-index-url https://download.pytorch.org/whl/cpu --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ ---find-links https://storage.googleapis.com/jax-releases/jax_cuda_releases.html ---trusted-host pypi.ngc.nvidia.com -aiohttp==3.9.5 +aiohappyeyeballs==2.3.5 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # aiohttp +aiohttp==3.10.2 # via # -c .pin/../.pin/constraints-xpu-torch.txt # datasets @@ -30,11 +32,11 @@ async-timeout==4.0.3 # via # -c .pin/../.pin/constraints-xpu-torch.txt # aiohttp -attrs==23.2.0 +attrs==24.2.0 # via # -c .pin/../.pin/constraints-xpu-torch.txt # aiohttp -certifi==2024.6.2 +certifi==2024.7.4 # via # -c .pin/../.pin/constraints-xpu-torch.txt # requests @@ -90,7 +92,7 @@ 
giving==0.4.2 # -c .pin/../.pin/constraints-xpu-torch.txt # ptera # voir -huggingface-hub==0.24.0 +huggingface-hub==0.24.5 # via # -c .pin/../.pin/constraints-xpu-torch.txt # datasets @@ -146,7 +148,7 @@ omegaconf==2.3.0 # via # -c .pin/../.pin/constraints-xpu-torch.txt # voir -ovld==0.3.5 +ovld==0.3.8 # via # -c .pin/../.pin/constraints-xpu-torch.txt # voir @@ -192,7 +194,7 @@ pytz==2024.1 # via # -c .pin/../.pin/constraints-xpu-torch.txt # pandas -pyyaml==6.0.1 +pyyaml==6.0.2 # via # -c .pin/../.pin/constraints-xpu-torch.txt # datasets @@ -203,7 +205,7 @@ reactivex==4.0.4 # via # -c .pin/../.pin/constraints-xpu-torch.txt # giving -regex==2024.5.15 +regex==2024.7.24 # via # -c .pin/../.pin/constraints-xpu-torch.txt # transformers @@ -217,7 +219,7 @@ rich==13.7.1 # via # -c .pin/../.pin/constraints-xpu-torch.txt # voir -safetensors==0.4.3 +safetensors==0.4.4 # via # -c .pin/../.pin/constraints-xpu-torch.txt # transformers @@ -231,7 +233,7 @@ six==1.16.0 # asttokens # fire # python-dateutil -sympy==1.13.0 +sympy==1.13.1 # via # -c .pin/../.pin/constraints-xpu-torch.txt # torch @@ -243,19 +245,20 @@ tokenizers==0.19.1 # via # -c .pin/../.pin/constraints-xpu-torch.txt # transformers -torch==2.1.0.post2+cxx11.abi +torch==2.4.0+cpu # via # -c .pin/../.pin/constraints-xpu-torch.txt + # -c .pin/../constraints/extra/torch.xpu.txt # -c .pin/../constraints/xpu.txt # -r benchmarks/llama/requirements.in # fairscale -tqdm==4.66.4 +tqdm==4.66.5 # via # -c .pin/../.pin/constraints-xpu-torch.txt # datasets # huggingface-hub # transformers -transformers==4.42.4 +transformers==4.44.0 # via # -c .pin/../.pin/constraints-xpu-torch.txt # -r benchmarks/llama/requirements.in @@ -269,7 +272,7 @@ tzdata==2024.1 # via # -c .pin/../.pin/constraints-xpu-torch.txt # pandas -urllib3==1.26.19 +urllib3==2.2.2 # via # -c .pin/../.pin/constraints-xpu-torch.txt # requests @@ -280,6 +283,7 @@ varname==0.10.0 voir==0.2.19 # via # -c .pin/../.pin/constraints-xpu-torch.txt + # -c .pin/../constraints/extra/torch.xpu.txt # -c .pin/../constraints/xpu.txt # -r benchmarks/llama/requirements.in xxhash==3.4.1 diff --git a/benchmarks/llm/benchfile.py b/benchmarks/llm/benchfile.py index 1537ad556..6f8cadeee 100644 --- a/benchmarks/llm/benchfile.py +++ b/benchmarks/llm/benchfile.py @@ -1,7 +1,7 @@ from milabench.pack import Package -from milabench.commands import TorchrunAllGPU +from milabench.commands import TorchrunAllGPU, TorchrunAllNodes, ForeachNode from milabench.pack import BasePackage from milabench.commands import SimpleCommand @@ -15,7 +15,18 @@ def executable(self): # return True def __init__(self, pack: BasePackage, *torchrun_args, **kwargs): - super().__init__(pack, *torchrun_args, module=False, **kwargs) + super().__init__(pack, "run", *torchrun_args, module=False, **kwargs) + + +class TorchtuneAllNodes(TorchrunAllNodes): + def __init__(self, executor, *args, **kwargs) -> None: + base_exec = TorchrunAllNodes.make_base_executor( + Torchtune, + executor, + *args, + **kwargs + ) + ForeachNode.__init__(self, base_exec) class Llm(Package): @@ -31,7 +42,7 @@ async def install(self): def build_run_plan(self): exec = SimpleCommand(self) - return Torchtune(exec, "run").use_stdout() + return TorchtuneAllNodes(exec).use_stdout() __pack__ = Llm diff --git a/benchmarks/llm/dev.yaml b/benchmarks/llm/dev.yaml index 44386f209..e965769b1 100644 --- a/benchmarks/llm/dev.yaml +++ b/benchmarks/llm/dev.yaml @@ -13,6 +13,27 @@ _llm: method: per_gpu +llm-rlhf-single: + inherits: _llm + definition: . 
+  install-variant: unpinned
+  plan:
+    method: per_gpu
+
+  argv:
+    "{milabench_code}/recipes/lora_finetune_single_device.py": true
+    --config: "{milabench_code}/configs/llama3_8B_lora_single_device.yaml"
+    epochs=1: true
+    output_dir={milabench_extra}/output: true
+    tokenizer.path={milabench_data}/llama3_8B/original/tokenizer.model: true
+    checkpointer.checkpoint_dir={milabench_data}/llama3_8B/original: true
+    checkpointer.output_dir={milabench_data}/llama3_8B/: true
+    metric_logger.log_dir={milabench_extra}/metrics: true
+    repo_id="meta-llama/Meta-Llama-3.1-8B": true
+    batch_size=8: true
+    gradient_accumulation_steps=8: true
+
+
 llm-lora-single:
   inherits: _llm
   definition: .
diff --git a/benchmarks/llm/recipes/ppo_full_finetune_single_device.py b/benchmarks/llm/recipes/ppo_full_finetune_single_device.py
new file mode 100644
index 000000000..8ee77c06a
--- /dev/null
+++ b/benchmarks/llm/recipes/ppo_full_finetune_single_device.py
@@ -0,0 +1,1084 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+import os
+import sys
+from functools import partial
+from itertools import chain
+from typing import Any, Dict, List, Optional, Tuple
+from warnings import warn
+
+import torch
+from omegaconf import DictConfig, ListConfig
+from torch import nn
+from torch.optim import Optimizer
+from torch.utils.data import DataLoader, DistributedSampler
+from torchtune import config, modules, utils
+from torchtune.datasets import ConcatDataset
+from torchtune.modules import rlhf
+from torchtune.modules.rlhf import PPOStats, Trajectory
+from torchtune.recipe_interfaces import FTRecipeInterface
+from tqdm import tqdm
+
+
+log = utils.get_logger("DEBUG")
+
+
+class PPOFullFinetuneRecipeSingleDevice(FTRecipeInterface):
+    """
+    Full finetuning recipe for RLHF with PPO for dense transformer-based LLMs such as Llama2. This recipe is optimized
+    for single GPU training. Training on CPU is not supported.
+
+    This implementation is based on `Learning to summarize from human feedback
+    <https://arxiv.org/abs/2009.01325>`_.
+    """
+
+    def __init__(self, cfg: DictConfig) -> None:
+
+        self._device = utils.get_device(device=cfg.device)
+        self._dtype = utils.get_dtype(cfg.dtype, device=self._device)
+
+        # Disable for fp16, as we haven't validated "full" fp16 with this recipe, nor
+        # enabled necessary features such as gradient scaling.
+        if self._dtype == torch.float16:
+            raise RuntimeError(
+                "full fp16 training is not supported with this recipe. Please use bf16 or fp32 instead."
+            )
+
+        # logging attributes
+        self._output_dir = cfg.output_dir
+        self._log_every_n_steps = cfg.get("log_every_n_steps", 1)
+        self._log_peak_memory_stats = cfg.get("log_peak_memory_stats", False)
+
+        # These are public properties which are updated by the checkpoint loader
+        # when ``resume_from_checkpoint`` is `True` or validated in tests
+        self.seed = utils.set_seed(seed=cfg.seed)
+        # manually setting up a generator for the recipe
+        self._rng = torch.Generator(self._device).manual_seed(self.seed)
+        self._total_steps = 0
+        self._steps_run = 0
+        self._total_epochs = 0
+        self._epochs_run = 0
+        self.global_step = 0
+
+        # Training cfg
+        self._resume_from_checkpoint = cfg.resume_from_checkpoint
+        self._gradient_accumulation_steps = cfg.gradient_accumulation_steps
+
+    def setup(self, cfg: DictConfig) -> None:
+        """
+        Sets up the recipe state correctly. This includes setting recipe attributes based
+        on the ``resume_from_checkpoint`` flag.
+ """ + self._metric_logger = config.instantiate(cfg.metric_logger) + + # log config with parameter override + self._metric_logger.log_config(cfg) + + # setup checkpointers + ( + self._policy_checkpointer, + ref_policy_checkpointer, + self._value_checkpointer, + reward_checkpointer, + ) = self._setup_checkpointers( + cfg.checkpointer, + cfg.ref_policy_checkpointer, + cfg.value_checkpointer, + cfg.reward_checkpointer, + ) + + # load policy checkpoints + policy_model_checkpoint_dict = self._policy_checkpointer.load_checkpoint() + ref_policy_state_dict = ref_policy_checkpointer.load_checkpoint() + + # load reward and value model checkpoints + value_model_checkpoint_dict = self._value_checkpointer.load_checkpoint() + reward_model_state_dict = reward_checkpointer.load_checkpoint() + + # update recipe state + # ``_setup_model`` handles initialization and loading the state dict. This method + # should be called before ``_setup_optimizer`` since transforming the optimizer + # state dict requires the model + self._model_compile = cfg.compile + self._optimizer_in_bwd = cfg.optimizer_in_bwd + ( + self._policy_model, + self._value_model, + self._reward_model, + self._ref_policy_model, + ) = self._setup_model( + cfg_model=cfg.policy_model, + cfg_reward_value_model=cfg.reward_and_value_model, + enable_activation_checkpointing=cfg.enable_activation_checkpointing, + compile_model=self._model_compile, + policy_state_dict=policy_model_checkpoint_dict[utils.MODEL_KEY], + ref_policy_state_dict=ref_policy_state_dict[utils.MODEL_KEY], + value_model_state_dict=value_model_checkpoint_dict[utils.MODEL_KEY], + reward_model_state_dict=reward_model_state_dict[utils.MODEL_KEY], + ) + + # setup tokenizer + self._tokenizer = config.instantiate(cfg.tokenizer) + log.info("Tokenizer is initialized from file.") + + # _setup_optimizer should take in ckpt_dict only if training is resumed from + # checkpoint. Transforming the opt state dict is handled by this method + self._optimizer = self._setup_optimizer( + cfg_optimizer=cfg.optimizer, + optimizer_in_bwd=cfg.optimizer_in_bwd, + opt_state_dict=( + policy_model_checkpoint_dict[utils.OPT_KEY] + if self._resume_from_checkpoint + else None + ), + ) + + self._loss_fn = config.instantiate(cfg.loss) + log.info("Loss is initialized.") + + # sampler and dataloader depends on the tokenizer and should be set + # setup afterit is initialized + self._sampler, self._dataloader = self._setup_data( + cfg_dataset=cfg.dataset, + shuffle=cfg.shuffle, + batch_size=cfg.batch_size, + ) + + self._setup_training_parameters(cfg) + self._setup_training_hyperparameters(cfg) + + if self._resume_from_checkpoint: + self._update_recipe_state(policy_model_checkpoint_dict) + + # one "step" is a single gradient update update over a minibatch of trajectories + self.global_step = ( + self._steps_run + * self._ppo_epochs + * (self.batch_size // self._ppo_batch_size) + ) + + def _setup_training_hyperparameters(self, cfg) -> None: + """ + Sets up the training hyperparameters for the recipe. This includes the GAE hyperparameters, + generation hyperparameters, reward masking hyperparameters, and stop token ids. 
+ """ + + self._kl_coeff = cfg.kl_coeff + # GAE hyperparameters + self._gamma = cfg.gamma + self._lmbda = cfg.lmbda + self._whiten_rewards = cfg.whiten_rewards + + # trajectory generation args + self._temperature = cfg.temperature + self._top_k = cfg.top_k + self._max_generated_tokens = cfg.max_generated_tokens + + # reward masking args + self._min_response_length = cfg.min_response_length + self._penalise_no_eos = cfg.penalise_no_eos + self._reward_penalty = cfg.reward_penalty + + # lots of hand holding for stop tokens + if cfg.get("stop_token_ids", False): + stop_token_ids = cfg.stop_token_ids + if self._tokenizer.eos_id not in stop_token_ids: + warn( + f"tokenizer eos_id ({self._tokenizer.eos_id}) is not in stop_token_ids ({stop_token_ids})." + "This may lead to unexpected behaviour." + ) + else: + if not hasattr(self._tokenizer.stop_tokens): + warn( + "No stop tokens defined in tokenizer, and no stop_token_ids provided. This may lead to unexpected behaviour." + ) + stop_token_ids = [] + else: + stop_token_ids = self._tokenizer.stop_tokens + self._stop_token_ids = torch.tensor(stop_token_ids, device=self._device) + + def _setup_training_parameters(self, cfg: DictConfig) -> None: + """ + Validates and sets up parameters for used during training and for tracking training state, + batch sizes for model forward passes during trajectory generation, PPO minibatches, and + PPO microbatches for gradient accumulation. + + Raises + - ValueError if: + - batch_size is not divisible by forward_batch_size + - batch_size is not divisible by ppo_batch_size + - ppo_batch_size is not divisible by gradient_accumulation_steps + - num_steps is less than batch_size + - gradient_accumulation_steps > 1 and optimizer_in_bwd is True + """ + self.batch_size = cfg.batch_size + self._forward_batch_size = cfg.forward_batch_size + self._ppo_epochs = cfg.ppo_epochs + self._ppo_batch_size = cfg.ppo_batch_size + self._gradient_accumulation_steps = cfg.gradient_accumulation_steps + self._ppo_backward_batch_size = ( + cfg.ppo_batch_size // self._gradient_accumulation_steps + ) + + if self.batch_size % self._forward_batch_size != 0: + raise ValueError( + f"batch_size ({self.batch_size}) must be exactly divisible by " + f"forward_batch_size ({self._forward_batch_size})." + ) + if self.batch_size % self._ppo_batch_size != 0: + raise ValueError( + f"batch_size ({self.batch_size}) must be exactly divisible by " + f"ppo_batch_size ({self._ppo_batch_size})." + ) + if self._ppo_batch_size % self._gradient_accumulation_steps != 0: + raise ValueError( + f"ppo_batch_size ({self._ppo_batch_size}) must be exactly divisible " + f"by gradient_accumulation_steps ({self._gradient_accumulation_steps})." + ) + + if self._gradient_accumulation_steps > 1 and self._optimizer_in_bwd: + raise RuntimeError( + "Gradient accumulation is not supported with optimizer in bwd." + "Please set gradient_accumulation_steps=1, or optimizer_in_bwd=False." + ) + + self._total_steps = cfg.num_steps // self.batch_size + batches_per_epoch = max( + 1, len(self._dataloader) + ) # when we only have a single batch in the dataset + + self._total_epochs = math.ceil(self._total_steps / batches_per_epoch) + if self._total_steps == 0: + raise ValueError( + f"num_steps {cfg.num_steps} must be greater than the batch size {self.batch_size}." + ) + if self._total_steps < len(self._dataloader): + warn( + f"There are fewer total steps ({self._total_steps}, (num_steps//batch_size) " + f"than there are batches ({len(self._dataloader)}) in the dataset. 
" + f"Training will stop after ({self._total_steps}) steps without saving intermediate checkpoints" + ) + if (self._total_steps > batches_per_epoch) and ( + self._total_steps % batches_per_epoch != 0 + ): + warn( + f"num_steps ({cfg.num_steps}) is not exactly divisible by " + f"the number of batches in the dataset ({batches_per_epoch}). " + f"Intermediate checkpoints will only be saved every {batches_per_epoch} steps." + ) + log.info( + f"Total steps to run: {self._total_steps}, Total epochs to run: {self._total_epochs}" + ) + + def _setup_checkpointers( + self, + policy_cfg: DictConfig, + ref_policy_cfg: DictConfig, + value_cfg: DictConfig, + reward_cfg: DictConfig, + ) -> Tuple[ + utils.Checkpointer, utils.Checkpointer, utils.Checkpointer, utils.Checkpointer + ]: + """ + Sets up checkpointers for policy, reference policy, value, and reward models. + Only the policy checkpoint handles recipe state for resuming from checkpoints. + """ + + if not self._resume_from_checkpoint: + assert policy_cfg.checkpoint_dir == ref_policy_cfg.checkpoint_dir, ( + "Policy and reference policy should be loaded from the same checkpoint directories" + f"at the start of training. Found: {policy_cfg.checkpoint_dir} and" + f"{ref_policy_cfg.checkpoint_dir}" + ) + assert policy_cfg.checkpoint_files == ref_policy_cfg.checkpoint_files, ( + "Policy and reference policy should be loaded from the same checkpoint files" + f"at the start of training. Found: {policy_cfg.checkpoint_files} and" + f"{ref_policy_cfg.checkpoint_files}" + ) + + policy_checkpointer = config.instantiate( + policy_cfg, + resume_from_checkpoint=self._resume_from_checkpoint, + ) + + ref_policy_checkpointer = config.instantiate( + ref_policy_cfg, + resume_from_checkpoint=False, + ) + + value_checkpointer = config.instantiate( + value_cfg, + resume_from_checkpoint=False, + ) + + reward_checkpointer = config.instantiate( + reward_cfg, + resume_from_checkpoint=False, + ) + + return ( + policy_checkpointer, + ref_policy_checkpointer, + value_checkpointer, + reward_checkpointer, + ) + + def _setup_model( + self, + cfg_model: DictConfig, + cfg_reward_value_model: DictConfig, + enable_activation_checkpointing: bool, + compile_model: bool, + policy_state_dict: Dict[str, Any], + ref_policy_state_dict: Dict[str, Any], + value_model_state_dict: Dict[str, Any], + reward_model_state_dict: Dict[str, Any], + ) -> Tuple[nn.Module, nn.Module, nn.Module]: + """ + Sets up the policy model, reference policy model, reward model, and value model. 
+ """ + + with utils.set_default_dtype(self._dtype), self._device: + policy_model = config.instantiate(cfg_model) + ref_policy_model = config.instantiate(cfg_model) + reward_model = config.instantiate(cfg_reward_value_model) + value_model = config.instantiate(cfg_reward_value_model) + + if enable_activation_checkpointing: + utils.set_activation_checkpointing( + policy_model, auto_wrap_policy={modules.TransformerDecoderLayer} + ) + utils.set_activation_checkpointing( + value_model, auto_wrap_policy={modules.TransformerDecoderLayer} + ) + + policy_model.load_state_dict(policy_state_dict) + ref_policy_model.load_state_dict(ref_policy_state_dict) + + reward_missing, reward_unexpected = reward_model.load_state_dict( + reward_model_state_dict, strict=False + ) + value_missing, value_unexpected = value_model.load_state_dict( + value_model_state_dict, strict=False + ) + + # some extra validation for HF classifier checkpoints with a `score.bias` present + assert ( + reward_missing == value_missing == [] + ), f"Missing keys in reward ({reward_missing}) and value model ({value_missing}) state dicts." + + if reward_unexpected or value_unexpected: + # the only unexpected keys should be when pre-trained HF models were saved with + # bias=True in final classification layers. This happens when training a reward model with TRL. + assert ( + reward_unexpected == value_unexpected == ["output.bias"] + ), f"Unexpected keys in reward ({reward_unexpected}) and value model ({value_unexpected}) state dicts." + + # Validate models were loaded in with the expected dtype. + utils.validate_expected_param_dtype( + value_model.named_parameters(), dtype=self._dtype + ) + utils.validate_expected_param_dtype( + reward_model.named_parameters(), dtype=self._dtype + ) + utils.validate_expected_param_dtype( + value_model.named_parameters(), dtype=self._dtype + ) + utils.validate_expected_param_dtype( + ref_policy_model.named_parameters(), dtype=self._dtype + ) + + log.info(f"Models are initialized with precision {self._dtype}.") + + # disabling dropout if found - non-determinism leads to issues in e.g. comparing logprobs + # between ref policy and current policy + for module in policy_model.modules(): + if isinstance(module, torch.nn.Dropout): + warn( + f"Dropout found in {module}. This is likely to cause issues during training. Disabling." + ) + module.p = 0 + for module in value_model.modules(): + if isinstance(module, torch.nn.Dropout): + warn( + f"Dropout found in {module}. This is likely to cause issues during training. Disabling." + ) + module.p = 0 + + # disabling grad and dropout in reward and reference policy models + reward_model.eval() + ref_policy_model.eval() + + for p in reward_model.parameters(): + p.requires_grad = False + + for p in ref_policy_model.parameters(): + p.requires_grad = False + + # Compile model, if enabled. 
+        if compile_model:
+            backend = os.environ.get("TORCH_COMPILE_BACKEND", "inductor")
+            log.info("Compiling models with torch.compile...")
+
+            policy_model.compile(backend=backend)
+            reward_model.compile(backend=backend)
+            ref_policy_model.compile(backend=backend)
+            value_model.compile(backend=backend)
+
+        if self._device.type == "cuda":
+            memory_stats = utils.get_memory_stats(device=self._device)
+            utils.log_memory_stats(memory_stats)
+
+        return policy_model, value_model, reward_model, ref_policy_model
+
+    def _setup_optimizer(
+        self,
+        cfg_optimizer: DictConfig,
+        optimizer_in_bwd: bool = False,
+        opt_state_dict: Optional[Dict[str, Any]] = None,
+    ) -> Optimizer:
+
+        if optimizer_in_bwd:
+            # Maintain a dict of optims for every parameter.
+            optim_dict = {
+                p: config.instantiate(cfg_optimizer, [p])
+                for p in chain(
+                    self._policy_model.parameters(), self._value_model.parameters()
+                )
+            }
+            # Register optimizer step hooks on the models to run optimizer in backward.
+            utils.register_optim_in_bwd_hooks(
+                model=self._policy_model, optim_dict=optim_dict
+            )
+            utils.register_optim_in_bwd_hooks(
+                model=self._value_model, optim_dict=optim_dict
+            )
+            # Create a wrapper for checkpoint save/load of optimizer states when running in backward.
+            self._optim_ckpt_wrapper = utils.create_optim_in_bwd_wrapper(
+                model=self._policy_model, optim_dict=optim_dict
+            )
+            self._optim_ckpt_wrapper = utils.create_optim_in_bwd_wrapper(
+                model=self._value_model, optim_dict=optim_dict
+            )
+            # Load optimizer states. If optimizer states are being restored in an optimizer in backward
+            # run, these need to have been saved with the same setting. Cannot restore from runs that did not
+            # use optimizer in backward.
+            if opt_state_dict is not None:
+                try:
+                    self._optim_ckpt_wrapper.load_state_dict(opt_state_dict)
+                except BaseException as e:
+                    raise RuntimeError(
+                        "Failed loading in-backward optimizer checkpoints. "
+                        "Please make sure the run being restored from was using the in-backward optimizer."
+                    ) from e
+            log.info("In-backward optimizers are set up.")
+            return None
+        else:
+            optimizer = config.instantiate(
+                cfg_optimizer,
+                chain(self._policy_model.parameters(), self._value_model.parameters()),
+            )
+            if opt_state_dict:
+                optimizer.load_state_dict(opt_state_dict)
+
+            log.info("Optimizer is initialized.")
+            return optimizer
+
+    def _setup_data(
+        self, cfg_dataset: DictConfig, shuffle: bool, batch_size: int
+    ) -> Tuple[DistributedSampler, DataLoader]:
+        """
+        All data related setup happens here.
+        """
+        if isinstance(cfg_dataset, ListConfig):
+            datasets = [
+                config.instantiate(single_cfg_dataset, tokenizer=self._tokenizer)
+                for single_cfg_dataset in cfg_dataset
+            ]
+            ds = ConcatDataset(datasets=datasets)
+        else:
+            ds = config.instantiate(cfg_dataset, tokenizer=self._tokenizer)
+
+        sampler = DistributedSampler(
+            ds,
+            num_replicas=1,
+            rank=0,
+            shuffle=shuffle,
+            seed=0,
+        )
+        dataloader = DataLoader(
+            dataset=ds,
+            sampler=sampler,
+            batch_size=batch_size,
+            collate_fn=partial(
+                rlhf.left_padded_collate,
+                padding_idx=self._tokenizer.pad_id,
+            ),
+            drop_last=True,
+        )
+
+        return sampler, dataloader
+
+    def save_checkpoint(
+        self, epoch: int, is_intermediate_checkpoint: bool = False
+    ) -> None:
+        """
+        Save state dict to file. The recipe save_checkpoint method is responsible for
+        correctly creating the checkpoint dict and passing it to the checkpointer.
+ """ + policy_ckpt_dict = {utils.MODEL_KEY: self._policy_model.state_dict()} + value_ckpt_dict = {utils.MODEL_KEY: self._value_model.state_dict()} + + # if training is in-progress, checkpoint the optimizer state and rng state as well + if is_intermediate_checkpoint: + policy_ckpt_dict.update( + { + utils.SEED_KEY: self.seed, + utils.EPOCHS_KEY: self._epochs_run, + utils.TOTAL_EPOCHS_KEY: self._total_epochs, + utils.MAX_STEPS_KEY: self._total_steps, + utils.STEPS_KEY: self._steps_run, + utils.RNG_KEY: self._rng.get_state(), + } + ) + if not self._optimizer_in_bwd: + policy_ckpt_dict[utils.OPT_KEY] = self._optimizer.state_dict() + else: + policy_ckpt_dict[utils.OPT_KEY] = self._optim_ckpt_wrapper.state_dict() + + self._policy_checkpointer.save_checkpoint( + policy_ckpt_dict, + epoch=epoch, + intermediate_checkpoint=is_intermediate_checkpoint, + ) + + self._value_checkpointer.save_checkpoint( + value_ckpt_dict, + epoch=epoch, + intermediate_checkpoint=False, + ) + + def _update_recipe_state(self, ckpt_dict: Dict[str, Any]) -> None: + """ + Updates the recipe state from checkpoint. + """ + # If seed or total_steps, or total_epochs don't match, + # warn the user and overwrite. + try: + if ( + self.seed != ckpt_dict[utils.SEED_KEY] + or self._total_steps != ckpt_dict[utils.MAX_STEPS_KEY] + or self._total_epochs != ckpt_dict[utils.TOTAL_EPOCHS_KEY] + ): + warn( + message="""Configured value for seed, total_steps, or total_epochs + does not match the value stored in checkpoint.""" + ) + self.seed = utils.set_seed(seed=ckpt_dict[utils.SEED_KEY]) + self._rng.set_state(ckpt_dict[utils.RNG_KEY]) + self._steps_run = ckpt_dict[utils.STEPS_KEY] + self._total_steps = ckpt_dict[utils.MAX_STEPS_KEY] + self._total_epochs = ckpt_dict[utils.TOTAL_EPOCHS_KEY] + self._epochs_run = ckpt_dict[utils.EPOCHS_KEY] + + except KeyError as e: + raise KeyError from e( + "Checkpoint does not contain the required keys needed for updating recipe state." + "Are you sure you passed in the right recipe checkpoint?" + ) + + def generate_trajectory(self, input_ids: torch.Tensor) -> Trajectory: + """ + Generates a trajectory given the current policy and value models, the reference policy model, the reward model, + and batch of inputs. This is done over the following steps: + + 1: Generate responses, and logits corresponding to the responses using the current policy, + generating (query, response) pairs. + 2. Estimate logprobs of the generated responses using the current policy. + 3. Estimate values from the generated responses using the current value function. + 4. Replace any tokens in the response after the first stop token (usually EOS token) with padding, + producting truncated responses. + 5. Run the reward model on the (query, truncated-response) pairs. + 6. Mask out all the invalid values in the trajectory due to padding tokens. + + Args: + input_ids (torch.Tensor): tensor of input token IDs with shape [b, seq_length] + + Returns: + Trajectory: An instance of :class:`~torchtune.modules.rlhf.Trajectory` comprising + the current trajectory. 
+ """ + batch_size, context_length = input_ids.shape + + # step 1: generate responses, and logits corresponding to the responses using the current policy + query_responses, logits = rlhf.generate_with_logits( + model=self._policy_model, + prompt=input_ids, + max_generated_tokens=self._max_generated_tokens, + temperature=self._temperature, + top_k=self._top_k, + pad_id=self._tokenizer.pad_id, + rng=self._rng, + ) + + responses = query_responses[:, context_length:].clone() + query_response_padding_masks = query_responses == self._tokenizer.pad_id + + # step 1.1 create attention masks and position IDs for any padding tokens in inputs, used for future forward passes + masks = rlhf.get_causal_mask(~(query_response_padding_masks)) + position_ids = (~query_response_padding_masks).cumsum(-1) - ( + ~query_response_padding_masks + ).long() + position_ids = position_ids.type(torch.int) + + del query_response_padding_masks + + # step 2. estimate logprobs of the responses using the current policy + logits = logits[:, context_length - 1 :] + logprobs = rlhf.logits_to_logprobs(logits, responses, self._temperature) + + del logits + + # step 2.1 estimate logprobs of the responses using the reference policy + ref_logits = self._ref_policy_model( + query_responses, input_pos=position_ids, mask=masks + ) + ref_logits = rlhf.truncate_sequence_for_logprobs(ref_logits, context_length) + ref_logprobs = rlhf.logits_to_logprobs(ref_logits, responses, self._temperature) + + del ref_logits + + # step 3. estimate values from the responses using the value function + values = self._value_model(query_responses, input_pos=position_ids, mask=masks) + values = rlhf.truncate_sequence_for_logprobs(values, context_length).squeeze(-1) + + # step 4. replace any tokens in the responses after the first stop token (usually EOS token) with padding + # resulting in truncated responses + response_padding_masks, responses = rlhf.truncate_sequence_at_first_stop_token( + responses, self._stop_token_ids, self._tokenizer.pad_id + ) + + # step 5. run the reward model on the (query, truncated-response) pairs + scores = self._reward_model( + torch.cat([input_ids, responses], dim=1), + input_pos=position_ids, + mask=masks, + ) + + del responses + + # step 5.1 the scores from the reward model are the logits for the last non-padding token in + # each (query, truncated-response) pair + seq_lens = utils.get_unmasked_sequence_lengths(response_padding_masks) + scores = scores[torch.arange(batch_size), seq_lens + context_length].squeeze(-1) + + # step 5.2 if configured, apply any penalties for sequences without EOS tokens + # or shorter than a certain length + if self._penalise_no_eos or self._min_response_length: + reward_penalty_mask = rlhf.get_reward_penalty_mask( + response_padding_masks, + seq_lens, + self._penalise_no_eos, + self._min_response_length, + ) + scores[reward_penalty_mask] = self._reward_penalty + + # step 6. 
mask out all the invalid values in the trajectory due to padding tokens
+        logprobs[response_padding_masks] = 1.0
+        ref_logprobs[response_padding_masks] = 1.0
+
+        # step 6.1 values are masked out *after* the last valid token in the response
+        value_seq_idxs = torch.where(
+            (seq_lens > 0) & (seq_lens < self._max_generated_tokens - 1),
+            seq_lens + 1,
+            seq_lens,
+        )
+        value_padding_masks = response_padding_masks.clone()
+        value_padding_masks[
+            torch.arange(batch_size, device=value_padding_masks.device),
+            value_seq_idxs,
+        ] = False
+
+        values[value_padding_masks] = 0.0
+
+        return Trajectory(
+            query_responses=query_responses,
+            logprobs=logprobs,
+            ref_logprobs=ref_logprobs,
+            values=values,
+            masks=masks,
+            position_ids=position_ids,
+            response_padding_masks=response_padding_masks,
+            value_padding_masks=value_padding_masks,
+            value_seq_idxs=value_seq_idxs,
+            scores=scores,
+            seq_lens=seq_lens,
+        )
+
+    def generate_trajectory_batched(self, input_ids: torch.Tensor) -> Trajectory:
+        """
+        Generates a ``self.batch_size`` batch of trajectories using ``self._forward_batch_size``-sized sub-batches.
+        See ``generate_trajectory`` for more details.
+
+        Args:
+            input_ids (torch.Tensor): tensor of input token IDs with shape [b, seq_length]
+
+        Returns:
+            Trajectory: An instance of :class:`~torchtune.modules.rlhf.Trajectory`, comprising
+                the current trajectory.
+        """
+        trajectories: List[Trajectory] = []
+        with torch.no_grad():
+            for batch_start in range(0, self.batch_size, self._forward_batch_size):
+                batch_input_ids = input_ids[
+                    batch_start : batch_start + self._forward_batch_size
+                ]
+                trajectories.append(self.generate_trajectory(batch_input_ids))
+        return Trajectory(*map(torch.cat, zip(*trajectories)))
+
+    def train(self) -> None:
+        """
+        The core training loop."""
+
+        if self._model_compile:
+            log.info(
+                "NOTE: torch.compile is enabled and model is compiled in first forward. "
+                "Expect a relatively slow first iteration."
+            )
+        # zero out the gradients before starting training
+        if not self._optimizer_in_bwd:
+            self._optimizer.zero_grad()
+
+        training_completed = False
+        pbar = tqdm(total=self._total_steps, initial=self._steps_run)
+        for curr_epoch in range(self._epochs_run, self._total_epochs):
+            # Update the sampler to ensure data is correctly shuffled across epochs
+            # in case shuffle is True
+            self._sampler.set_epoch(curr_epoch)
+
+            for _, batch in enumerate(self._dataloader):
+                batch = batch.to(self._device)
+                _, context_length = batch.shape
+
+                # step 1. generate the trajectory using:
+                # - the current policy (pi_theta)
+                # - the current value function (V_phi)
+                # - the reference frozen policy model (pi_theta_0)
+                trajectory = self.generate_trajectory_batched(batch)
+
+                # step 2. get the rewards for the current trajectory. these are based on:
+                # - the divergence between the current policy and the reference policy
+                # - the scores from the reward model
+                rewards, kl, kl_rewards = rlhf.get_rewards_ppo(
+                    trajectory.scores,
+                    trajectory.logprobs,
+                    trajectory.ref_logprobs,
+                    self._kl_coeff,
+                    trajectory.value_seq_idxs,
+                )
+
+                # step 3. estimate the advantages using Generalized Advantage Estimation (GAE)
+                advantages, returns = rlhf.estimate_advantages(
+                    trajectory.values,
+                    rewards,
+                    self._gamma,
+                    self._lmbda,
+                    masks=~trajectory.response_padding_masks,
+                )
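+                # For reference, standard GAE (a sketch of what estimate_advantages
+                # computes, not this diff's code): with temporal differences
+                #   d_t = r_t + gamma * V(s_{t+1}) - V(s_t)
+                # advantages are the discounted sums
+                #   A_t = d_t + (gamma * lmbda) * A_{t+1}
+                # and returns = advantages + values, masked over padding tokens.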
+                # step 4. optimise using the PPO objective over multiple epochs
+                ppo_stats: List[PPOStats] = []
+                for _ in range(self._ppo_epochs):
+                    batch_idxs = torch.randperm(self.batch_size, device=self._device)
+                    for i in range(0, self.batch_size, self._ppo_batch_size):
+                        mini_batch_idxs = batch_idxs[i : i + self._ppo_batch_size]
+
+                        batch_ppo_stats: List[PPOStats] = []
+                        for j in range(
+                            0, self._ppo_batch_size, self._ppo_backward_batch_size
+                        ):
+                            backward_batch_idxs = mini_batch_idxs[
+                                j : j + self._ppo_backward_batch_size
+                            ]
+
+                            batch_trajectory = Trajectory(
+                                *map(
+                                    partial(
+                                        torch.index_select,
+                                        dim=0,
+                                        index=backward_batch_idxs,
+                                    ),
+                                    trajectory,
+                                )
+                            )
+                            batch_ppo_stats.append(
+                                self._ppo_step(
+                                    batch_trajectory,
+                                    advantages[backward_batch_idxs],
+                                    returns[backward_batch_idxs],
+                                    context_length,
+                                )
+                            )
+                            del batch_trajectory
+
+                        ppo_stats.append(PPOStats(*map(sum, zip(*batch_ppo_stats))))
+
+                        if not self._optimizer_in_bwd:
+                            self._optimizer.step()
+                            self._optimizer.zero_grad(set_to_none=True)
+
+                        self.global_step += 1
+
+                # step 5. profit
+                self._steps_run += 1
+                if self._steps_run % self._log_every_n_steps == 0:
+                    self.log_metrics(
+                        trajectory,
+                        PPOStats(*map(torch.stack, zip(*ppo_stats))),
+                        kl,
+                        kl_rewards,
+                    )
+                self.cleanup_after_step(
+                    trajectory, ppo_stats, advantages, returns, kl, kl_rewards
+                )
+                pbar.update(1)
+                if self._steps_run == self._total_steps:
+                    training_completed = True
+                    break
+
+            # save checkpoint at current epoch
+            self._epochs_run += 1
+
+            self.save_checkpoint(
+                curr_epoch, is_intermediate_checkpoint=not training_completed
+            )
+            if training_completed:
+                return
+
+    def _ppo_step(
+        self,
+        trajectory: Trajectory,
+        advantages: torch.Tensor,
+        returns: torch.Tensor,
+        context_length: int,
+    ) -> PPOStats:
+        """
+        Perform a single PPO optimisation step over a batch of trajectories and corresponding advantages and returns.
+
+        Args:
+            trajectory (Trajectory): a batch of trajectories
+            advantages (torch.Tensor): advantages corresponding to the trajectories
+            returns (torch.Tensor): returns corresponding to the trajectories
+            context_length (int): input ids sequence length
+
+        Returns:
+            PPOStats: An instance of :class:`~torchtune.modules.rlhf.PPOStats`, a NamedTuple containing:
+                - loss (torch.Tensor): The total PPO loss.
+                - policy_loss (torch.Tensor): The policy function loss.
+                - value_loss (torch.Tensor): The value function loss.
+                - ratios (torch.Tensor): The ratio between the current and old policy probabilities.
+                - clipfrac (torch.Tensor): The fraction of ratios that were clipped.
+                - approx_policy_kls: Average estimated KL divergence between the policy before and after the optimisation step.
+
+        """
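+        # The configured loss_fn is expected to implement a clipped PPO objective;
+        # schematically (a sketch, not this diff's code):
+        #   ratio = exp(pi_logprobs - old_logprobs)
+        #   policy_loss = -min(ratio * A, clamp(ratio, 1 - eps, 1 + eps) * A)
+        #   value_loss  = (phi_values - returns) ** 2   (possibly also clipped)
+        # with both terms masked over padding tokens.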
+        # estimate logprobs from the policy at the current optimisation step
+        pi_logits = self._policy_model(
+            trajectory.query_responses,
+            input_pos=trajectory.position_ids,
+            mask=trajectory.masks,
+        )
+        pi_logits = rlhf.truncate_sequence_for_logprobs(pi_logits, context_length)
+        pi_logprobs = rlhf.logits_to_logprobs(
+            pi_logits, trajectory.query_responses[:, context_length:], self._temperature
+        )
+        pi_logprobs[trajectory.response_padding_masks] = 1.0
+
+        del pi_logits
+
+        # estimate the values from the value function at the current optimisation step
+        phi_values = self._value_model(
+            trajectory.query_responses,
+            input_pos=trajectory.position_ids,
+            mask=trajectory.masks,
+        )
+
+        phi_values = rlhf.truncate_sequence_for_logprobs(
+            phi_values, context_length
+        ).squeeze(-1)
+        phi_values[trajectory.value_padding_masks] = 0.0
+
+        # calculate ppo loss
+        loss, policy_loss, value_loss, ratios, clipfrac = self._loss_fn(
+            trajectory.logprobs,
+            pi_logprobs,
+            advantages,
+            trajectory.values,
+            phi_values,
+            returns,
+            padding_masks=~trajectory.response_padding_masks,
+            value_padding_masks=~trajectory.value_padding_masks,
+        )
+
+        loss /= self._gradient_accumulation_steps
+        loss.backward()
+
+        with torch.no_grad():
+            approx_policy_kls = (
+                0.5 * (pi_logprobs - trajectory.logprobs).pow(2)
+            ).mean()
+
+        return PPOStats(
+            loss,
+            policy_loss / self._gradient_accumulation_steps,
+            value_loss / self._gradient_accumulation_steps,
+            ratios / self._gradient_accumulation_steps,
+            clipfrac / self._gradient_accumulation_steps,
+            approx_policy_kls / self._gradient_accumulation_steps,
+        )
+
+    def log_metrics(
+        self,
+        trajectory: Trajectory,
+        ppo_stats: PPOStats,
+        kl: torch.Tensor,
+        kl_rewards: torch.Tensor,
+    ) -> None:
+        """
+        Log metrics and statistics for the current step to the metric logger.
+        """
+        log_dict = {
+            "scores": trajectory.scores.mean(),
+            "num_stop_tokens": trajectory.response_padding_masks.any(-1).sum(),
+            "rlhf_reward": trajectory.scores.mean() + kl_rewards.sum(1).mean(),
+            "kl": kl.sum(1).mean(),
+            "kl_reward": kl_rewards.sum(1).mean(),
+            "loss": ppo_stats.loss.mean(),
+            "policy_loss": ppo_stats.policy_loss.mean(),
+            "value_loss": ppo_stats.value_loss.mean(),
+            "clipfrac": ppo_stats.clipfrac.mean(),
+            "ratios": ppo_stats.ratios.mean(),
+            "approx_policy_kl": ppo_stats.approx_policy_kls.mean(),
+            "response_lengths": trajectory.seq_lens.float().mean(),
+        }
+        if self._device.type == "cuda" and self._log_peak_memory_stats:
+            log_dict.update(utils.get_memory_stats(device=self._device))
+
+        self._metric_logger.log_dict(log_dict, step=self.global_step)
+
+    def cleanup_after_step(
+        self,
+        trajectory: Trajectory,
+        ppo_stats: PPOStats,
+        advantages: torch.Tensor,
+        returns: torch.Tensor,
+        kl: torch.Tensor,
+        kl_rewards: torch.Tensor,
+    ) -> None:
+        """
+        Cleanup tensors after each PPO step to free up memory.
+        """
+        # there shouldn't be any floating references to the individual tensors at this point, so gc can do its thing
+        for v in trajectory:
+            del v
+        del trajectory
+        for v in ppo_stats:
+            del v
+        del ppo_stats
+        del advantages
+        del returns
+        del kl
+        del kl_rewards
+
+    def cleanup(self, **kwargs) -> None:
+        self._metric_logger.close()
+
+
+@config.parse
+def recipe_main(cfg: DictConfig) -> None:
+    """
+    Entry point for the recipe.
+ + Configurable parameters are read in the following order: + - Parameters specified in config (see available configs through ``tune ls``) + - Overwritten by arguments from the command-line + """ + config.log_config(recipe_name="PPOFullFinetuneRecipeSingleDevice", cfg=cfg) + recipe = PPOFullFinetuneRecipeSingleDevice(cfg=cfg) + recipe.setup(cfg=cfg) + recipe.train() + recipe.cleanup() + + +if __name__ == "__main__": + sys.exit(recipe_main()) diff --git a/benchmarks/llm/requirements.cuda.txt b/benchmarks/llm/requirements.cuda.txt index b6c9752f0..9b54a8464 100644 --- a/benchmarks/llm/requirements.cuda.txt +++ b/benchmarks/llm/requirements.cuda.txt @@ -9,11 +9,11 @@ --find-links https://storage.googleapis.com/jax-releases/jax_cuda_releases.html --trusted-host pypi.ngc.nvidia.com -aiohappyeyeballs==2.3.4 +aiohappyeyeballs==2.4.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # aiohttp -aiohttp==3.10.0 +aiohttp==3.10.5 # via # -c .pin/../.pin/constraints-cuda-torch.txt # datasets @@ -38,11 +38,11 @@ async-timeout==4.0.3 # via # -c .pin/../.pin/constraints-cuda-torch.txt # aiohttp -attrs==23.2.0 +attrs==24.2.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # aiohttp -blobfile==2.1.1 +blobfile==3.0.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # torchtune @@ -58,7 +58,7 @@ codefind==0.1.6 # via # -c .pin/../.pin/constraints-cuda-torch.txt # ptera -datasets==2.20.0 +datasets==2.21.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # torchtune @@ -84,7 +84,7 @@ frozenlist==1.4.1 # -c .pin/../.pin/constraints-cuda-torch.txt # aiohttp # aiosignal -fsspec[http]==2024.5.0 +fsspec[http]==2024.6.1 # via # -c .pin/../.pin/constraints-cuda-torch.txt # datasets @@ -99,17 +99,17 @@ hjson==3.1.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # argklass -huggingface-hub==0.24.5 +huggingface-hub==0.24.6 # via # -c .pin/../.pin/constraints-cuda-torch.txt # datasets # torchtune -idna==3.7 +idna==3.8 # via # -c .pin/../.pin/constraints-cuda-torch.txt # requests # yarl -importlib-resources==6.4.0 +importlib-resources==6.4.4 # via # -c .pin/../.pin/constraints-cuda-torch.txt # argklass @@ -117,7 +117,7 @@ jinja2==3.1.4 # via # -c .pin/../.pin/constraints-cuda-torch.txt # torch -lxml==4.9.4 +lxml==5.3.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # blobfile @@ -196,6 +196,10 @@ nvidia-cusparse-cu12==12.1.0.106 # -c .pin/../.pin/constraints-cuda-torch.txt # nvidia-cusolver-cu12 # torch +nvidia-ml-py==12.560.30 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # voir nvidia-nccl-cu12==2.20.5 # via # -c .pin/../.pin/constraints-cuda-torch.txt @@ -214,7 +218,7 @@ omegaconf==2.3.0 # -c .pin/../.pin/constraints-cuda-torch.txt # torchtune # voir -ovld==0.3.6 +ovld==0.3.9 # via # -c .pin/../.pin/constraints-cuda-torch.txt # voir @@ -239,10 +243,6 @@ pyarrow==17.0.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # datasets -pyarrow-hotfix==0.6 - # via - # -c .pin/../.pin/constraints-cuda-torch.txt - # datasets pycryptodomex==3.20.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt @@ -251,10 +251,6 @@ pygments==2.18.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # rich -pynvml==11.5.3 - # via - # -c .pin/../.pin/constraints-cuda-torch.txt - # voir python-dateutil==2.9.0.post0 # via # -c .pin/../.pin/constraints-cuda-torch.txt @@ -263,7 +259,7 @@ pytz==2024.1 # via # -c .pin/../.pin/constraints-cuda-torch.txt # pandas -pyyaml==6.0.1 +pyyaml==6.0.2 # via # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/llm/requirements.in @@ -284,11 +280,11 @@ requests==2.32.3 # 
datasets # huggingface-hub # tiktoken -rich==13.7.1 +rich==13.8.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # voir -safetensors==0.4.3 +safetensors==0.4.4 # via # -c .pin/../.pin/constraints-cuda-torch.txt # torchtune @@ -301,7 +297,7 @@ six==1.16.0 # -c .pin/../.pin/constraints-cuda-torch.txt # asttokens # python-dateutil -sympy==1.13.1 +sympy==1.13.2 # via # -c .pin/../.pin/constraints-cuda-torch.txt # torch @@ -321,7 +317,7 @@ torchtune==0.2.1+cu121 # via # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/llm/requirements.in -tqdm==4.66.4 +tqdm==4.66.5 # via # -c .pin/../.pin/constraints-cuda-torch.txt # datasets @@ -355,7 +351,7 @@ voir==0.2.19 # -c .pin/../.pin/constraints-cuda-torch.txt # -c .pin/../constraints/cuda.txt # -r benchmarks/llm/requirements.in -xxhash==3.4.1 +xxhash==3.5.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # datasets diff --git a/benchmarks/llm/requirements.rocm.txt b/benchmarks/llm/requirements.rocm.txt new file mode 100644 index 000000000..ab5098d08 --- /dev/null +++ b/benchmarks/llm/requirements.rocm.txt @@ -0,0 +1,306 @@ +# +# This file is autogenerated by pip-compile with Python 3.10 +# by the following command: +# +# pip-compile --output-file=benchmarks/llm/requirements.rocm.txt .pin/tmp-constraints-rocm-llm-full-mp-nodes.txt benchmarks/llm/requirements.in +# +--extra-index-url https://download.pytorch.org/whl/rocm6.0 + +aiohappyeyeballs==2.4.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # aiohttp +aiohttp==3.10.5 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # datasets + # fsspec +aiosignal==1.3.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # aiohttp +antlr4-python3-runtime==4.9.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # omegaconf +argklass==1.4.4 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/llm/requirements.in +asttokens==2.4.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # giving +async-timeout==4.0.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # aiohttp +attrs==24.2.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # aiohttp +blobfile==2.1.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # torchtune +certifi==2024.7.4 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # requests +charset-normalizer==3.3.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # requests +codefind==0.1.6 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # ptera +datasets==2.21.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # torchtune +dill==0.3.8 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # datasets + # multiprocess +executing==1.2.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # varname +filelock==3.15.4 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # blobfile + # datasets + # huggingface-hub + # pytorch-triton-rocm + # torch +frozenlist==1.4.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # aiohttp + # aiosignal +fsspec[http]==2024.6.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # datasets + # huggingface-hub + # torch +giving==0.4.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # ptera + # voir +hjson==3.1.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # argklass +huggingface-hub==0.24.6 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # datasets + # torchtune +idna==3.7 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # requests + # yarl +importlib-resources==6.4.3 + # via + # -c 
.pin/../.pin/constraints-rocm-torch.txt + # argklass +jinja2==3.1.4 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # torch +lxml==4.9.4 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # blobfile +markdown-it-py==3.0.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # rich +markupsafe==2.1.5 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # jinja2 +mdurl==0.1.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # markdown-it-py +mpmath==1.3.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # sympy +multidict==6.0.5 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # aiohttp + # yarl +multiprocess==0.70.16 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # datasets +networkx==3.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # torch +numpy==1.26.4 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # datasets + # pandas + # pyarrow + # torchtune +omegaconf==2.3.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # torchtune + # voir +ovld==0.3.8 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir +packaging==24.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # datasets + # huggingface-hub +pandas==2.2.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # datasets +psutil==5.9.8 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir +ptera==1.4.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir +pyarrow==17.0.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # datasets +pycryptodomex==3.20.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # blobfile +pygments==2.18.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # rich +pynvml==11.5.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir +python-dateutil==2.9.0.post0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # pandas +pytorch-triton-rocm==3.0.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # torch +pytz==2024.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # pandas +pyyaml==6.0.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/llm/requirements.in + # datasets + # huggingface-hub + # omegaconf +reactivex==4.0.4 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # giving +regex==2024.7.24 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # tiktoken +requests==2.32.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # datasets + # huggingface-hub + # tiktoken +rich==13.7.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir +safetensors==0.4.4 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # torchtune +sentencepiece==0.2.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # torchtune +six==1.16.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # asttokens + # python-dateutil +sympy==1.13.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # torch +tiktoken==0.7.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # torchtune +torch==2.4.0+rocm6.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/llm/requirements.in +torchao==0.3.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # torchtune +torchtune==0.2.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/llm/requirements.in +tqdm==4.66.5 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # datasets + # huggingface-hub + # torchtune +typing-extensions==4.12.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # 
huggingface-hub
+    #   reactivex
+    #   torch
+tzdata==2024.1
+    # via
+    #   -c .pin/../.pin/constraints-rocm-torch.txt
+    #   pandas
+urllib3==2.2.2
+    # via
+    #   -c .pin/../.pin/constraints-rocm-torch.txt
+    #   blobfile
+    #   requests
+varname==0.10.0
+    # via
+    #   -c .pin/../.pin/constraints-rocm-torch.txt
+    #   giving
+voir==0.2.17
+    # via
+    #   -c .pin/../.pin/constraints-rocm-torch.txt
+    #   -c .pin/../constraints/rocm.txt
+    #   -r benchmarks/llm/requirements.in
+xxhash==3.5.0
+    # via
+    #   -c .pin/../.pin/constraints-rocm-torch.txt
+    #   datasets
+yarl==1.9.4
+    # via
+    #   -c .pin/../.pin/constraints-rocm-torch.txt
+    #   aiohttp
diff --git a/benchmarks/recursiongfn/Makefile b/benchmarks/recursiongfn/Makefile
new file mode 100644
index 000000000..892657d09
--- /dev/null
+++ b/benchmarks/recursiongfn/Makefile
@@ -0,0 +1,31 @@
+# Use global base if possible
+ifndef MILABENCH_BASE
+	MILABENCH_BASE="base"
+endif
+
+export MILABENCH_BASE
+
+BENCH_NAME=recursiongfn
+MILABENCH_CONFIG=dev.yaml
+MILABENCH_ARGS=--config $(MILABENCH_CONFIG) --base $(MILABENCH_BASE)
+
+all: install prepare single gpus nodes
+
+install:
+	milabench install $(MILABENCH_ARGS) --force
+
+prepare:
+	milabench prepare $(MILABENCH_ARGS)
+
+tests: # install prepare
+	milabench run $(MILABENCH_ARGS)
+
+single:
+	milabench run $(MILABENCH_ARGS) --select $(BENCH_NAME)-single
+
+gpus:
+	milabench run $(MILABENCH_ARGS) --select $(BENCH_NAME)-gpus
+
+nodes:
+	milabench run $(MILABENCH_ARGS) --select $(BENCH_NAME)-nodes
diff --git a/benchmarks/recursiongfn/README.md b/benchmarks/recursiongfn/README.md
new file mode 100644
index 000000000..b8db87236
--- /dev/null
+++ b/benchmarks/recursiongfn/README.md
@@ -0,0 +1,4 @@
+
+# Recursiongfn
+
+Rewrite this README to explain what the benchmark is!
diff --git a/benchmarks/recursiongfn/benchfile.py b/benchmarks/recursiongfn/benchfile.py
new file mode 100644
index 000000000..1d5d46351
--- /dev/null
+++ b/benchmarks/recursiongfn/benchfile.py
@@ -0,0 +1,34 @@
+from milabench.pack import Package
+
+
+URL = "https://github.com/Delaunay/gflownet/"
+BRANCH = "milabench"
+
+class Recursiongfn(Package):
+    # Requirements file installed by install(). It can be empty or absent.
+    base_requirements = "requirements.in"
+
+    # The preparation script called by prepare(). It must be executable,
+    # but it can be any type of script. It can be empty or absent.
+    prepare_script = "prepare.py"
+
+    # The main script called by run(). It must be a Python file. It has to
+    # be present.
+    main_script = "main.py"
+
+    # You can remove the functions below if you don't need to modify them.
+    def clone(self):
+        gflownet = self.dirs.code / "gflownet"
+        if not gflownet.exists():
+            gflownet.clone_subtree(URL, BRANCH)
+
+    async def install(self):
+        self.clone()
+        await super().install()  # super() call installs the requirements
+
+    async def prepare(self):
+        await super().prepare()  # super() call executes prepare_script
+
+
+
+__pack__ = Recursiongfn
diff --git a/benchmarks/recursiongfn/dev.yaml b/benchmarks/recursiongfn/dev.yaml
new file mode 100644
index 000000000..8730968ff
--- /dev/null
+++ b/benchmarks/recursiongfn/dev.yaml
@@ -0,0 +1,15 @@
+
+recursiongfn:
+  inherits: _defaults
+  definition: .
+ install-variant: unpinned + install_group: torch + plan: + method: per_gpu + + argv: + --batch_size: 128 + --num_workers: 8 + --num_steps: 100 + --layer_width: 128 + --num_layers: 4 diff --git a/benchmarks/recursiongfn/main.py b/benchmarks/recursiongfn/main.py new file mode 100644 index 000000000..81d08e8aa --- /dev/null +++ b/benchmarks/recursiongfn/main.py @@ -0,0 +1,169 @@ +# This is the script run by milabench run (by default) +# It is possible to use a script from a GitHub repo if it is cloned using +# clone_subtree in the benchfile.py, in which case this file can simply +# be deleted. + +import datetime +import os +from pathlib import Path +from typing import Callable + +import sys +sys.path.append(os.path.join(os.path.dirname(__file__), "gflownet", "src")) + +import numpy as np +import torch.nn as nn +import torchcompat.core as accelerator +from gflownet.config import Config, init_empty +from gflownet.models import bengio2021flow +from gflownet.tasks.seh_frag import SEHFragTrainer, SEHTask +from gflownet.utils.conditioning import TemperatureConditional +from gflownet.utils.misc import get_worker_device +from torch import Tensor +from torch.utils.data import DataLoader, Dataset + +from benchmate.observer import BenchObserver + + +class SEHFragTrainerMonkeyPatch(SEHFragTrainer): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + self.batch_size_in_nodes = [] + + def batch_size(x): + """Measures the batch size as the sum of all nodes in the batch.""" + return self.batch_size_in_nodes.pop() + + self.observer = BenchObserver( + accelerator.Event, + earlystop=65, + batch_size_fn=batch_size, + raise_stop_program=False, + stdout=False, + ) + + def _maybe_resolve_shared_buffer(self, *args, **kwargs): + batch = super()._maybe_resolve_shared_buffer(*args, **kwargs) + + # Accumulate the size of all graphs in the batch measured in nodes. + acc = 0 + n = len(batch) + for i in range(n): + elem = batch[i] + acc += elem.x.shape[0] + + self.batch_size_in_nodes.append(acc) + return batch + + def step(self, loss: Tensor): + original_output = super().step(loss) + self.observer.record_loss(loss) + return original_output + + def build_training_data_loader(self) -> DataLoader: + original_output = super().build_training_data_loader() + return self.observer.loader(original_output) + + def setup_task(self): + self.task = SEHTaskMonkeyPatch( + dataset=self.training_data, + cfg=self.cfg, + rng=self.rng, + wrap_model=self._wrap_for_mp, + ) + + +class SEHTaskMonkeyPatch(SEHTask): + """Allows us to specify the location of the original model download.""" + + def __init__( + self, + dataset: Dataset, + cfg: Config, + rng: np.random.Generator = None, + wrap_model: Callable[[nn.Module], nn.Module] = None, + ): + self._wrap_model = wrap_model + self.rng = rng + self.models = self._load_task_models() + self.dataset = dataset + self.temperature_conditional = TemperatureConditional(cfg, rng) + self.num_cond_dim = self.temperature_conditional.encoding_size() + + def _load_task_models(self): + xdg_cache = os.environ["XDG_CACHE_HOME"] + model = bengio2021flow.load_original_model( + cache=True, + location=Path(os.path.join(xdg_cache, "bengio2021flow_proxy.pkl.gz")), + ) + model.to(get_worker_device()) + model = self._wrap_model(model) + return {"seh": model} + + +def main( + batch_size: int, num_workers: int, num_steps: int, layer_width: int, num_layers: int +): + # This script runs on an A100 with 8 cpus and 32Gb memory, but the A100 is probably + # overkill here. 
VRAM peaks at 6Gb and GPU usage peaks at 25%.
+
+ config = init_empty(Config())
+ config.print_every = 1
+ config.log_dir = f"./logs/debug_run_seh_frag_{datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}"
+ config.device = accelerator.fetch_device(0) # This is your CUDA device.
+ config.overwrite_existing_exp = True
+
+ config.num_training_steps = num_steps # Change this to train for longer.
+ config.checkpoint_every = 5 # 500
+ config.validate_every = 0
+ config.num_final_gen_steps = 0
+ config.opt.lr_decay = 20_000
+ config.opt.clip_grad_type = "total_norm"
+ config.algo.sampling_tau = 0.9
+ config.cond.temperature.sample_dist = "constant"
+ config.cond.temperature.dist_params = [64.0]
+ config.replay.use = False
+
+ # Things it may be fun to play with.
+ config.num_workers = num_workers
+ config.model.num_emb = layer_width
+ config.model.num_layers = num_layers
+ # batch_size is consumed by the sampling configuration below.
+
+ if config.replay.use:
+ config.algo.num_from_policy = 0
+ config.replay.num_new_samples = batch_size
+ config.replay.num_from_replay = batch_size
+ else:
+ config.algo.num_from_policy = batch_size
+
+ # This may need to be adjusted if the batch_size is made bigger
+ config.mp_buffer_size = 32 * 1024**2 # 32Mb
+ trial = SEHFragTrainerMonkeyPatch(config, print_config=False)
+ trial.run()
+ trial.terminate()
+
+
+if __name__ == "__main__":
+ import argparse
+
+ parser = argparse.ArgumentParser(description="Recursion GFN benchmark")
+ parser.add_argument("-b", "--batch_size", help="Batch Size", type=int, default=128)
+ parser.add_argument("-n", "--num_workers", help="Number of Workers", type=int, default=8)
+ parser.add_argument(
+ "-s", "--num_steps", help="Number of Training Steps", type=int, default=100
+ )
+ parser.add_argument(
+ "-w", "--layer_width", help="Width of each policy hidden layer", type=int, default=128
+ )
+ parser.add_argument("-l", "--num_layers", help="Number of hidden layers", type=int, default=4)
+ args = parser.parse_args()
+
+ main(
+ args.batch_size,
+ args.num_workers,
+ args.num_steps,
+ args.layer_width,
+ args.num_layers,
+ )
diff --git a/benchmarks/recursiongfn/prepare.py b/benchmarks/recursiongfn/prepare.py
new file mode 100755
index 000000000..89cafada3
--- /dev/null
+++ b/benchmarks/recursiongfn/prepare.py
@@ -0,0 +1,25 @@
+#!/usr/bin/env python
+
+import os
+
+from pathlib import Path
+
+import sys
+sys.path.append(os.path.join(os.path.dirname(__file__), "gflownet", "src"))
+
+
+if __name__ == "__main__":
+ from gflownet.models.bengio2021flow import load_original_model
+
+ # If you need the whole configuration:
+ # config = json.loads(os.environ["MILABENCH_CONFIG"])
+ print("+ Full environment:\n{}\n***".format(os.environ))
+
+ xdg_cache = os.environ["XDG_CACHE_HOME"]
+
+ print("+ Loading proxy model weights to XDG_CACHE_HOME={}".format(xdg_cache))
+ _ = load_original_model(
+ cache=True,
+ location=Path(os.path.join(xdg_cache, "bengio2021flow_proxy.pkl.gz")),
+ )
+
diff --git a/benchmarks/recursiongfn/requirements.cuda.txt b/benchmarks/recursiongfn/requirements.cuda.txt
new file mode 100644
index 000000000..1aef7b7fb
--- /dev/null
+++ b/benchmarks/recursiongfn/requirements.cuda.txt
@@ -0,0 +1,495 @@
+#
+# This file is autogenerated by pip-compile with Python 3.10
+# by the following command:
+#
+# pip-compile --output-file=benchmarks/recursiongfn/requirements.cuda.txt .pin/tmp-constraints-cuda-recursiongfn_gnn.txt benchmarks/recursiongfn/requirements.in
+#
+--extra-index-url https://pypi.ngc.nvidia.com
+--extra-index-url
https://download.pytorch.org/whl/cu121 +--find-links https://data.pyg.org/whl/torch-2.3.0+cu121.html +--trusted-host pypi.ngc.nvidia.com + +absl-py==2.1.0 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # tensorboard +aiohappyeyeballs==2.4.0 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # aiohttp +aiohttp==3.10.5 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # torch-geometric +aiosignal==1.3.1 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # aiohttp +antlr4-python3-runtime==4.9.3 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # omegaconf +asttokens==2.4.1 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # giving +async-timeout==4.0.3 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # aiohttp +attrs==24.2.0 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # aiohttp +blosc2==2.7.1 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # tables +botorch==0.11.3 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # -r benchmarks/recursiongfn/requirements.in +certifi==2024.7.4 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # requests + # sentry-sdk +charset-normalizer==3.3.2 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # requests +click==8.1.7 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # wandb +codefind==0.1.6 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # ptera +cvxopt==1.3.2 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # -r benchmarks/recursiongfn/requirements.in +docker-pycreds==0.4.0 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # wandb +executing==1.2.0 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # varname +filelock==3.15.4 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # torch + # triton +frozenlist==1.4.1 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # aiohttp + # aiosignal +fsspec==2024.6.1 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # torch + # torch-geometric +gitdb==4.0.11 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # gitpython +gitpython==3.1.43 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # -r benchmarks/recursiongfn/requirements.in + # wandb +giving==0.4.2 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # ptera + # voir +gpytorch==1.12 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # -r benchmarks/recursiongfn/requirements.in + # botorch +grpcio==1.66.0 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # tensorboard +idna==3.8 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # requests + # yarl +jaxtyping==0.2.33 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # linear-operator +jinja2==3.1.4 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # torch + # torch-geometric +joblib==1.4.2 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # scikit-learn +linear-operator==0.5.2 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # botorch + # gpytorch +markdown==3.7 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # tensorboard +markdown-it-py==3.0.0 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # rich +markupsafe==2.1.5 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # jinja2 + # werkzeug +mdurl==0.1.2 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # markdown-it-py +mpmath==1.3.0 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # botorch + # gpytorch + # sympy +msgpack==1.0.8 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # blosc2 +multidict==6.0.5 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # aiohttp + # yarl 
+multipledispatch==1.0.0 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # botorch +ndindex==1.8 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # blosc2 +networkx==3.3 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # -r benchmarks/recursiongfn/requirements.in + # torch +numexpr==2.10.1 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # blosc2 + # tables +numpy==1.26.4 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # blosc2 + # botorch + # numexpr + # opt-einsum + # pandas + # pyarrow + # pyro-ppl + # rdkit + # scikit-learn + # scipy + # tables + # tensorboard + # torch-geometric +nvidia-cublas-cu12==12.1.3.1 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # nvidia-cudnn-cu12 + # nvidia-cusolver-cu12 + # torch +nvidia-cuda-cupti-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # torch +nvidia-cuda-nvrtc-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # torch +nvidia-cuda-runtime-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # torch +nvidia-cudnn-cu12==8.9.2.26 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # torch +nvidia-cufft-cu12==11.0.2.54 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # torch +nvidia-curand-cu12==10.3.2.106 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # torch +nvidia-cusolver-cu12==11.4.5.107 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # torch +nvidia-cusparse-cu12==12.1.0.106 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # nvidia-cusolver-cu12 + # torch +nvidia-ml-py==12.560.30 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # voir +nvidia-nccl-cu12==2.20.5 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # torch +nvidia-nvjitlink-cu12==12.6.20 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # nvidia-cusolver-cu12 + # nvidia-cusparse-cu12 +nvidia-nvtx-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # torch +omegaconf==2.3.0 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # -r benchmarks/recursiongfn/requirements.in + # voir +opt-einsum==3.3.0 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # pyro-ppl +ovld==0.3.9 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # voir +packaging==24.1 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # tables + # tensorboard +pandas==2.2.2 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # -r benchmarks/recursiongfn/requirements.in +pillow==10.4.0 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # rdkit +platformdirs==4.2.2 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # wandb +protobuf==5.27.3 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # tensorboard + # wandb +psutil==5.9.8 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # torch-geometric + # voir + # wandb +ptera==1.4.1 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # voir +py-cpuinfo==9.0.0 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # blosc2 + # tables +pyarrow==17.0.0 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # -r benchmarks/recursiongfn/requirements.in +pygments==2.18.0 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # rich +pyparsing==3.1.4 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # torch-geometric +pyro-api==0.1.2 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # pyro-ppl +pyro-ppl==1.9.1 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # -r benchmarks/recursiongfn/requirements.in + # botorch +python-dateutil==2.9.0.post0 + # via + # -c 
.pin/../.pin/constraints-cuda-gnn.txt + # pandas +pytz==2024.1 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # pandas +pyyaml==6.0.2 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # omegaconf + # wandb +rdkit==2024.3.5 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # -r benchmarks/recursiongfn/requirements.in +reactivex==4.0.4 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # giving +requests==2.32.3 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # torch-geometric + # wandb +rich==13.8.0 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # voir +scikit-learn==1.5.1 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # gpytorch + # torch-geometric +scipy==1.14.1 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # -r benchmarks/recursiongfn/requirements.in + # botorch + # gpytorch + # linear-operator + # scikit-learn + # torch-cluster + # torch-geometric + # torch-sparse +sentry-sdk==2.13.0 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # wandb +setproctitle==1.3.3 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # wandb +six==1.16.0 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # asttokens + # docker-pycreds + # python-dateutil + # tensorboard +smmap==5.0.1 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # gitdb +sympy==1.13.2 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # torch +tables==3.10.1 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # -r benchmarks/recursiongfn/requirements.in +tensorboard==2.17.1 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # -r benchmarks/recursiongfn/requirements.in +tensorboard-data-server==0.7.2 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # tensorboard +threadpoolctl==3.5.0 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # scikit-learn +torch==2.3.1+cu121 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # -r benchmarks/recursiongfn/requirements.in + # botorch + # linear-operator + # pyro-ppl +torch-cluster==1.6.3+pt23cu121 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # -r benchmarks/recursiongfn/requirements.in +torch-geometric==2.5.3 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # -r benchmarks/recursiongfn/requirements.in +torch-scatter==2.1.2+pt23cu121 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # -r benchmarks/recursiongfn/requirements.in +torch-sparse==0.6.18+pt23cu121 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # -r benchmarks/recursiongfn/requirements.in +tqdm==4.66.5 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # pyro-ppl + # torch-geometric +triton==2.3.1 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # torch +typeguard==2.13.3 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # jaxtyping + # linear-operator +typing-extensions==4.12.2 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # reactivex + # tables + # torch +tzdata==2024.1 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # pandas +urllib3==2.2.2 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # requests + # sentry-sdk +varname==0.10.0 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # giving +voir==0.2.19 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../constraints/cuda.txt + # -r benchmarks/recursiongfn/requirements.in +wandb==0.17.7 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # -r benchmarks/recursiongfn/requirements.in +werkzeug==3.0.4 + # via + # -c .pin/../.pin/constraints-cuda-gnn.txt + # tensorboard +yarl==1.9.4 + # via + # -c 
.pin/../.pin/constraints-cuda-gnn.txt + # aiohttp + +# The following packages are considered to be unsafe in a requirements file: +# setuptools diff --git a/benchmarks/recursiongfn/requirements.in b/benchmarks/recursiongfn/requirements.in new file mode 100644 index 000000000..77df598d3 --- /dev/null +++ b/benchmarks/recursiongfn/requirements.in @@ -0,0 +1,20 @@ +voir>=0.2.17,<0.3 +torch +torch-geometric +torch-scatter +torch-sparse +torch-cluster +rdkit +tables +scipy +networkx +tensorboard +cvxopt +pyarrow +gitpython +botorch +pyro-ppl +gpytorch +omegaconf>=2.3 +wandb +pandas diff --git a/benchmarks/recursiongfn/requirements.rocm.txt b/benchmarks/recursiongfn/requirements.rocm.txt new file mode 100644 index 000000000..1bc73f14e --- /dev/null +++ b/benchmarks/recursiongfn/requirements.rocm.txt @@ -0,0 +1,445 @@ +# +# This file is autogenerated by pip-compile with Python 3.10 +# by the following command: +# +# pip-compile --output-file=benchmarks/recursiongfn/requirements.rocm.txt .pin/tmp-constraints-rocm-recursiongfn_gnn.txt benchmarks/recursiongfn/requirements.in +# +--extra-index-url https://download.pytorch.org/whl/rocm6.0 + +absl-py==2.1.0 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # tensorboard +aiohappyeyeballs==2.4.0 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # aiohttp +aiohttp==3.10.5 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # torch-geometric +aiosignal==1.3.1 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # aiohttp +antlr4-python3-runtime==4.9.3 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # omegaconf +asttokens==2.4.1 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # giving +async-timeout==4.0.3 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # aiohttp +attrs==24.2.0 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # aiohttp +blosc2==2.7.1 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # tables +botorch==0.11.3 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # gflownet +certifi==2024.7.4 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # requests + # sentry-sdk +charset-normalizer==3.3.2 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # requests +click==8.1.7 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # wandb +codefind==0.1.6 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # ptera +cvxopt==1.3.2 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # gflownet +docker-pycreds==0.4.0 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # wandb +executing==1.2.0 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # varname +filelock==3.15.4 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # pytorch-triton-rocm + # torch +frozenlist==1.4.1 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # aiohttp + # aiosignal +fsspec==2024.6.1 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # torch + # torch-geometric +gflownet @ git+https://github.com/Delaunay/gflownet@milabench + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # -r benchmarks/recursiongfn/requirements.in +gitdb==4.0.11 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # gitpython +gitpython==3.1.43 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # gflownet + # wandb +giving==0.4.2 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # ptera + # voir +gpytorch==1.12 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # botorch + # gflownet +grpcio==1.65.5 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # tensorboard +idna==3.7 + # via + # -c 
.pin/../.pin/constraints-rocm-gnn.txt + # requests + # yarl +jaxtyping==0.2.33 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # linear-operator +jinja2==3.1.4 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # torch + # torch-geometric +joblib==1.4.2 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # scikit-learn +linear-operator==0.5.2 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # botorch + # gpytorch +markdown==3.7 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # tensorboard +markdown-it-py==3.0.0 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # rich +markupsafe==2.1.5 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # jinja2 + # werkzeug +mdurl==0.1.2 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # markdown-it-py +mpmath==1.3.0 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # botorch + # gpytorch + # sympy +msgpack==1.0.8 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # blosc2 +multidict==6.0.5 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # aiohttp + # yarl +multipledispatch==1.0.0 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # botorch +ndindex==1.8 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # blosc2 +networkx==3.3 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # gflownet + # torch +numexpr==2.10.1 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # blosc2 + # tables +numpy==1.26.4 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # blosc2 + # botorch + # numexpr + # opt-einsum + # pandas + # pyarrow + # pyro-ppl + # rdkit + # scikit-learn + # scipy + # tables + # tensorboard + # torch-geometric +omegaconf==2.3.0 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # gflownet + # voir +opt-einsum==3.3.0 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # pyro-ppl +ovld==0.3.8 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # voir +packaging==24.1 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # tables + # tensorboard +pandas==2.2.2 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # gflownet +pillow==10.4.0 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # rdkit +platformdirs==4.2.2 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # wandb +protobuf==5.27.3 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # tensorboard + # wandb +psutil==5.9.8 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # torch-geometric + # voir + # wandb +ptera==1.4.1 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # voir +py-cpuinfo==9.0.0 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # blosc2 + # tables +pyarrow==17.0.0 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # gflownet +pygments==2.18.0 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # rich +pynvml==11.5.3 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # voir +pyparsing==3.1.2 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # torch-geometric +pyro-api==0.1.2 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # pyro-ppl +pyro-ppl==1.9.1 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # botorch + # gflownet +python-dateutil==2.9.0.post0 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # pandas +pytorch-triton-rocm==3.0.0 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # torch +pytz==2024.1 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # pandas +pyyaml==6.0.2 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # omegaconf + # wandb +rdkit==2024.3.5 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt 
+ # gflownet +reactivex==4.0.4 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # giving +requests==2.32.3 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # torch-geometric + # wandb +rich==13.7.1 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # voir +scikit-learn==1.5.1 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # gpytorch + # torch-geometric +scipy==1.14.0 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # botorch + # gflownet + # gpytorch + # linear-operator + # scikit-learn + # torch-cluster + # torch-geometric + # torch-sparse +sentry-sdk==2.13.0 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # wandb +setproctitle==1.3.3 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # wandb +six==1.16.0 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # asttokens + # docker-pycreds + # python-dateutil + # tensorboard +smmap==5.0.1 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # gitdb +sympy==1.13.2 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # torch +tables==3.10.1 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # gflownet +tensorboard==2.17.1 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # gflownet +tensorboard-data-server==0.7.2 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # tensorboard +threadpoolctl==3.5.0 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # scikit-learn +torch==2.4.0+rocm6.0 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # -r benchmarks/recursiongfn/requirements.in + # botorch + # gflownet + # linear-operator + # pyro-ppl +torch-cluster==1.6.3 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # gflownet +torch-geometric==2.5.3 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # gflownet +torch-scatter==2.1.2 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # gflownet +torch-sparse==0.6.18 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # gflownet +tqdm==4.66.5 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # pyro-ppl + # torch-geometric +typeguard==2.13.3 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # jaxtyping + # linear-operator +typing-extensions==4.12.2 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # reactivex + # tables + # torch +tzdata==2024.1 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # pandas +urllib3==2.2.2 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # requests + # sentry-sdk +varname==0.10.0 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # giving +voir==0.2.17 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../constraints/rocm.txt + # -r benchmarks/recursiongfn/requirements.in +wandb==0.17.7 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # gflownet +werkzeug==3.0.3 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # tensorboard +yarl==1.9.4 + # via + # -c .pin/../.pin/constraints-rocm-gnn.txt + # aiohttp + +# The following packages are considered to be unsafe in a requirements file: +# setuptools diff --git a/benchmarks/recursiongfn/voirfile.py b/benchmarks/recursiongfn/voirfile.py new file mode 100644 index 000000000..d93f886cd --- /dev/null +++ b/benchmarks/recursiongfn/voirfile.py @@ -0,0 +1,38 @@ +from dataclasses import dataclass + +from voir import configurable +from voir.instruments import dash, early_stop, log, rate +from benchmate.monitor import monitor_monogpu + +@dataclass +class Config: + """voir configuration""" + + # Whether to display the dash or not + dash: bool = False + + # How often to log the rates + interval: str = "1s" + 
+ # Number of rates to skip before logging + skip: int = 5 + + # Number of rates to log before stopping + stop: int = 20 + + # Number of seconds between each gpu poll + gpu_poll: int = 3 + + +@configurable +def instrument_main(ov, options: Config): + yield ov.phases.init + + if options.dash: + ov.require(dash) + + ov.require( + log("value", "progress", "rate", "units", "loss", "gpudata", context="task"), + early_stop(n=options.stop, key="rate", task="train"), + monitor_monogpu(poll_interval=options.gpu_poll), + ) diff --git a/benchmarks/stargan/README.md b/benchmarks/retired/stargan/README.md similarity index 100% rename from benchmarks/stargan/README.md rename to benchmarks/retired/stargan/README.md diff --git a/benchmarks/stargan/benchfile.py b/benchmarks/retired/stargan/benchfile.py similarity index 100% rename from benchmarks/stargan/benchfile.py rename to benchmarks/retired/stargan/benchfile.py diff --git a/benchmarks/stargan/prepare.py b/benchmarks/retired/stargan/prepare.py similarity index 100% rename from benchmarks/stargan/prepare.py rename to benchmarks/retired/stargan/prepare.py diff --git a/benchmarks/stargan/requirements.cuda.txt b/benchmarks/retired/stargan/requirements.cuda.txt similarity index 100% rename from benchmarks/stargan/requirements.cuda.txt rename to benchmarks/retired/stargan/requirements.cuda.txt diff --git a/benchmarks/stargan/requirements.hpu.txt b/benchmarks/retired/stargan/requirements.hpu.txt similarity index 100% rename from benchmarks/stargan/requirements.hpu.txt rename to benchmarks/retired/stargan/requirements.hpu.txt diff --git a/benchmarks/stargan/requirements.in b/benchmarks/retired/stargan/requirements.in similarity index 100% rename from benchmarks/stargan/requirements.in rename to benchmarks/retired/stargan/requirements.in diff --git a/benchmarks/stargan/requirements.rocm.txt b/benchmarks/retired/stargan/requirements.rocm.txt similarity index 100% rename from benchmarks/stargan/requirements.rocm.txt rename to benchmarks/retired/stargan/requirements.rocm.txt diff --git a/benchmarks/stargan/requirements.xpu.txt b/benchmarks/retired/stargan/requirements.xpu.txt similarity index 100% rename from benchmarks/stargan/requirements.xpu.txt rename to benchmarks/retired/stargan/requirements.xpu.txt diff --git a/benchmarks/stargan/stargan/LICENSE b/benchmarks/retired/stargan/stargan/LICENSE similarity index 100% rename from benchmarks/stargan/stargan/LICENSE rename to benchmarks/retired/stargan/stargan/LICENSE diff --git a/benchmarks/stargan/stargan/ORIGIN.md b/benchmarks/retired/stargan/stargan/ORIGIN.md similarity index 100% rename from benchmarks/stargan/stargan/ORIGIN.md rename to benchmarks/retired/stargan/stargan/ORIGIN.md diff --git a/benchmarks/stargan/stargan/README.md b/benchmarks/retired/stargan/stargan/README.md similarity index 100% rename from benchmarks/stargan/stargan/README.md rename to benchmarks/retired/stargan/stargan/README.md diff --git a/benchmarks/stargan/stargan/data_loader.py b/benchmarks/retired/stargan/stargan/data_loader.py similarity index 100% rename from benchmarks/stargan/stargan/data_loader.py rename to benchmarks/retired/stargan/stargan/data_loader.py diff --git a/benchmarks/stargan/stargan/download.sh b/benchmarks/retired/stargan/stargan/download.sh similarity index 100% rename from benchmarks/stargan/stargan/download.sh rename to benchmarks/retired/stargan/stargan/download.sh diff --git a/benchmarks/stargan/stargan/logger.py b/benchmarks/retired/stargan/stargan/logger.py similarity index 100% rename from 
benchmarks/stargan/stargan/logger.py rename to benchmarks/retired/stargan/stargan/logger.py diff --git a/benchmarks/stargan/stargan/main.py b/benchmarks/retired/stargan/stargan/main.py similarity index 100% rename from benchmarks/stargan/stargan/main.py rename to benchmarks/retired/stargan/stargan/main.py diff --git a/benchmarks/stargan/stargan/model.py b/benchmarks/retired/stargan/stargan/model.py similarity index 100% rename from benchmarks/stargan/stargan/model.py rename to benchmarks/retired/stargan/stargan/model.py diff --git a/benchmarks/stargan/stargan/solver.py b/benchmarks/retired/stargan/stargan/solver.py similarity index 100% rename from benchmarks/stargan/stargan/solver.py rename to benchmarks/retired/stargan/stargan/solver.py diff --git a/benchmarks/stargan/stargan/synth.py b/benchmarks/retired/stargan/stargan/synth.py similarity index 100% rename from benchmarks/stargan/stargan/synth.py rename to benchmarks/retired/stargan/stargan/synth.py diff --git a/benchmarks/stargan/voirfile.py b/benchmarks/retired/stargan/voirfile.py similarity index 100% rename from benchmarks/stargan/voirfile.py rename to benchmarks/retired/stargan/voirfile.py diff --git a/benchmarks/super-slomo/requirements.cuda.txt b/benchmarks/super-slomo/requirements.cuda.txt index e4e207911..88c4880e5 100644 --- a/benchmarks/super-slomo/requirements.cuda.txt +++ b/benchmarks/super-slomo/requirements.cuda.txt @@ -30,7 +30,7 @@ filelock==3.15.4 # -c .pin/../.pin/constraints-cuda-torch.txt # torch # triton -fsspec==2024.5.0 +fsspec==2024.6.1 # via # -c .pin/../.pin/constraints-cuda-torch.txt # torch @@ -108,6 +108,10 @@ nvidia-cusparse-cu12==12.1.0.106 # -c .pin/../.pin/constraints-cuda-torch.txt # nvidia-cusolver-cu12 # torch +nvidia-ml-py==12.560.30 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # voir nvidia-nccl-cu12==2.20.5 # via # -c .pin/../.pin/constraints-cuda-torch.txt @@ -129,7 +133,7 @@ opencv-python==4.10.0.84 # via # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/super-slomo/requirements.in -ovld==0.3.6 +ovld==0.3.9 # via # -c .pin/../.pin/constraints-cuda-torch.txt # voir @@ -149,11 +153,7 @@ pygments==2.18.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # rich -pynvml==11.5.3 - # via - # -c .pin/../.pin/constraints-cuda-torch.txt - # voir -pyyaml==6.0.1 +pyyaml==6.0.2 # via # -c .pin/../.pin/constraints-cuda-torch.txt # omegaconf @@ -161,7 +161,7 @@ reactivex==4.0.4 # via # -c .pin/../.pin/constraints-cuda-torch.txt # giving -rich==13.7.1 +rich==13.8.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # voir @@ -169,7 +169,7 @@ six==1.16.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # asttokens -sympy==1.13.1 +sympy==1.13.2 # via # -c .pin/../.pin/constraints-cuda-torch.txt # torch @@ -182,7 +182,7 @@ torchvision==0.19.0+cu121 # via # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/super-slomo/requirements.in -tqdm==4.66.4 +tqdm==4.66.5 # via # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/super-slomo/requirements.in diff --git a/benchmarks/super-slomo/requirements.rocm.txt b/benchmarks/super-slomo/requirements.rocm.txt index 4b1ce5b0c..d85fcf14a 100644 --- a/benchmarks/super-slomo/requirements.rocm.txt +++ b/benchmarks/super-slomo/requirements.rocm.txt @@ -4,10 +4,7 @@ # # pip-compile --output-file=benchmarks/super-slomo/requirements.rocm.txt .pin/tmp-constraints-rocm-super-slomo.txt benchmarks/super-slomo/requirements.in # ---extra-index-url https://pypi.ngc.nvidia.com --extra-index-url https://download.pytorch.org/whl/rocm6.0 ---find-links 
https://storage.googleapis.com/jax-releases/jax_cuda_releases.html ---trusted-host pypi.ngc.nvidia.com antlr4-python3-runtime==4.9.3 # via @@ -30,7 +27,7 @@ filelock==3.15.4 # -c .pin/../.pin/constraints-rocm-torch.txt # pytorch-triton-rocm # torch -fsspec==2024.5.0 +fsspec==2024.6.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torch @@ -77,7 +74,7 @@ opencv-python==4.10.0.84 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/super-slomo/requirements.in -ovld==0.3.5 +ovld==0.3.8 # via # -c .pin/../.pin/constraints-rocm-torch.txt # voir @@ -101,11 +98,11 @@ pynvml==11.5.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # voir -pytorch-triton-rocm==2.3.1 +pytorch-triton-rocm==3.0.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torch -pyyaml==6.0.1 +pyyaml==6.0.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # omegaconf @@ -121,20 +118,20 @@ six==1.16.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # asttokens -sympy==1.13.0 +sympy==1.13.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torch -torch==2.3.1+rocm6.0 +torch==2.4.0+rocm6.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/super-slomo/requirements.in # torchvision -torchvision==0.18.1+rocm6.0 +torchvision==0.19.0+rocm6.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/super-slomo/requirements.in -tqdm==4.66.4 +tqdm==4.66.5 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/super-slomo/requirements.in diff --git a/benchmarks/super-slomo/requirements.xpu.txt b/benchmarks/super-slomo/requirements.xpu.txt index 2053d45b6..1a40b14fb 100644 --- a/benchmarks/super-slomo/requirements.xpu.txt +++ b/benchmarks/super-slomo/requirements.xpu.txt @@ -4,10 +4,8 @@ # # pip-compile --output-file=benchmarks/super-slomo/requirements.xpu.txt .pin/tmp-constraints-xpu-super-slomo.txt benchmarks/super-slomo/requirements.in # ---extra-index-url https://pypi.ngc.nvidia.com +--extra-index-url https://download.pytorch.org/whl/cpu --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ ---find-links https://storage.googleapis.com/jax-releases/jax_cuda_releases.html ---trusted-host pypi.ngc.nvidia.com antlr4-python3-runtime==4.9.3 # via @@ -17,14 +15,6 @@ asttokens==2.4.1 # via # -c .pin/../.pin/constraints-xpu-torch.txt # giving -certifi==2024.6.2 - # via - # -c .pin/../.pin/constraints-xpu-torch.txt - # requests -charset-normalizer==3.3.2 - # via - # -c .pin/../.pin/constraints-xpu-torch.txt - # requests codefind==0.1.6 # via # -c .pin/../.pin/constraints-xpu-torch.txt @@ -46,10 +36,6 @@ giving==0.4.2 # -c .pin/../.pin/constraints-xpu-torch.txt # ptera # voir -idna==3.7 - # via - # -c .pin/../.pin/constraints-xpu-torch.txt - # requests jinja2==3.1.4 # via # -c .pin/../.pin/constraints-xpu-torch.txt @@ -88,7 +74,7 @@ opencv-python==4.10.0.84 # via # -c .pin/../.pin/constraints-xpu-torch.txt # -r benchmarks/super-slomo/requirements.in -ovld==0.3.5 +ovld==0.3.8 # via # -c .pin/../.pin/constraints-xpu-torch.txt # voir @@ -112,7 +98,7 @@ pynvml==11.5.3 # via # -c .pin/../.pin/constraints-xpu-torch.txt # voir -pyyaml==6.0.1 +pyyaml==6.0.2 # via # -c .pin/../.pin/constraints-xpu-torch.txt # omegaconf @@ -120,10 +106,6 @@ reactivex==4.0.4 # via # -c .pin/../.pin/constraints-xpu-torch.txt # giving -requests==2.32.3 - # via - # -c .pin/../.pin/constraints-xpu-torch.txt - # torchvision rich==13.7.1 # via # -c .pin/../.pin/constraints-xpu-torch.txt @@ -132,22 +114,24 @@ six==1.16.0 # via # -c .pin/../.pin/constraints-xpu-torch.txt # asttokens 
-sympy==1.13.0 +sympy==1.13.1 # via # -c .pin/../.pin/constraints-xpu-torch.txt # torch -torch==2.1.0.post2+cxx11.abi +torch==2.4.0+cpu # via # -c .pin/../.pin/constraints-xpu-torch.txt + # -c .pin/../constraints/extra/torch.xpu.txt # -c .pin/../constraints/xpu.txt # -r benchmarks/super-slomo/requirements.in # torchvision -torchvision==0.16.0.post2+cxx11.abi +torchvision==0.19.0+cpu # via # -c .pin/../.pin/constraints-xpu-torch.txt + # -c .pin/../constraints/extra/torch.xpu.txt # -c .pin/../constraints/xpu.txt # -r benchmarks/super-slomo/requirements.in -tqdm==4.66.4 +tqdm==4.66.5 # via # -c .pin/../.pin/constraints-xpu-torch.txt # -r benchmarks/super-slomo/requirements.in @@ -156,10 +140,6 @@ typing-extensions==4.12.2 # -c .pin/../.pin/constraints-xpu-torch.txt # reactivex # torch -urllib3==1.26.19 - # via - # -c .pin/../.pin/constraints-xpu-torch.txt - # requests varname==0.10.0 # via # -c .pin/../.pin/constraints-xpu-torch.txt @@ -167,5 +147,6 @@ varname==0.10.0 voir==0.2.19 # via # -c .pin/../.pin/constraints-xpu-torch.txt + # -c .pin/../constraints/extra/torch.xpu.txt # -c .pin/../constraints/xpu.txt # -r benchmarks/super-slomo/requirements.in diff --git a/benchmarks/timm/benchfile.py b/benchmarks/timm/benchfile.py index 52a31ba1d..df36de07e 100644 --- a/benchmarks/timm/benchfile.py +++ b/benchmarks/timm/benchfile.py @@ -30,9 +30,6 @@ async def install(self): "https://github.com/huggingface/pytorch-image-models", BRANCH ) - # Install TIMM first - # await self.pip_install("-e", str(timm)) - # install the rest, which might override what TIMM specified await super().install() diff --git a/benchmarks/timm/requirements.cuda.txt b/benchmarks/timm/requirements.cuda.txt index 84f2f328d..d7e100fb2 100644 --- a/benchmarks/timm/requirements.cuda.txt +++ b/benchmarks/timm/requirements.cuda.txt @@ -39,7 +39,7 @@ filelock==3.15.4 # huggingface-hub # torch # triton -fsspec==2024.5.0 +fsspec==2024.6.1 # via # -c .pin/../.pin/constraints-cuda-torch.txt # huggingface-hub @@ -49,11 +49,11 @@ giving==0.4.2 # -c .pin/../.pin/constraints-cuda-torch.txt # ptera # voir -huggingface-hub==0.24.5 +huggingface-hub==0.24.6 # via # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/timm/requirements.in -idna==3.7 +idna==3.8 # via # -c .pin/../.pin/constraints-cuda-torch.txt # requests @@ -124,6 +124,10 @@ nvidia-cusparse-cu12==12.1.0.106 # -c .pin/../.pin/constraints-cuda-torch.txt # nvidia-cusolver-cu12 # torch +nvidia-ml-py==12.560.30 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # voir nvidia-nccl-cu12==2.20.5 # via # -c .pin/../.pin/constraints-cuda-torch.txt @@ -141,7 +145,7 @@ omegaconf==2.3.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # voir -ovld==0.3.6 +ovld==0.3.9 # via # -c .pin/../.pin/constraints-cuda-torch.txt # voir @@ -165,11 +169,7 @@ pygments==2.18.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # rich -pynvml==11.5.3 - # via - # -c .pin/../.pin/constraints-cuda-torch.txt - # voir -pyyaml==6.0.1 +pyyaml==6.0.2 # via # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/timm/requirements.in @@ -183,11 +183,11 @@ requests==2.32.3 # via # -c .pin/../.pin/constraints-cuda-torch.txt # huggingface-hub -rich==13.7.1 +rich==13.8.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # voir -safetensors==0.4.3 +safetensors==0.4.4 # via # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/timm/requirements.in @@ -195,7 +195,7 @@ six==1.16.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # asttokens -sympy==1.13.1 +sympy==1.13.2 # via # -c 
.pin/../.pin/constraints-cuda-torch.txt # torch @@ -208,7 +208,7 @@ torchvision==0.19.0+cu121 # via # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/timm/requirements.in -tqdm==4.66.4 +tqdm==4.66.5 # via # -c .pin/../.pin/constraints-cuda-torch.txt # huggingface-hub diff --git a/benchmarks/timm/requirements.rocm.txt b/benchmarks/timm/requirements.rocm.txt index a27a7da2d..8383f9e6b 100644 --- a/benchmarks/timm/requirements.rocm.txt +++ b/benchmarks/timm/requirements.rocm.txt @@ -4,10 +4,7 @@ # # pip-compile --output-file=benchmarks/timm/requirements.rocm.txt .pin/tmp-constraints-rocm-timm.txt benchmarks/timm/requirements.in # ---extra-index-url https://pypi.ngc.nvidia.com --extra-index-url https://download.pytorch.org/whl/rocm6.0 ---find-links https://storage.googleapis.com/jax-releases/jax_cuda_releases.html ---trusted-host pypi.ngc.nvidia.com antlr4-python3-runtime==4.9.3 # via @@ -17,7 +14,7 @@ asttokens==2.4.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # giving -certifi==2024.6.2 +certifi==2024.7.4 # via # -c .pin/../.pin/constraints-rocm-torch.txt # requests @@ -39,7 +36,7 @@ filelock==3.15.4 # huggingface-hub # pytorch-triton-rocm # torch -fsspec==2024.5.0 +fsspec==2024.6.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # huggingface-hub @@ -49,7 +46,7 @@ giving==0.4.2 # -c .pin/../.pin/constraints-rocm-torch.txt # ptera # voir -huggingface-hub==0.23.5 +huggingface-hub==0.24.6 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/timm/requirements.in @@ -89,7 +86,7 @@ omegaconf==2.3.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # voir -ovld==0.3.5 +ovld==0.3.8 # via # -c .pin/../.pin/constraints-rocm-torch.txt # voir @@ -117,11 +114,11 @@ pynvml==11.5.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # voir -pytorch-triton-rocm==2.3.1 +pytorch-triton-rocm==3.0.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torch -pyyaml==6.0.1 +pyyaml==6.0.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/timm/requirements.in @@ -139,7 +136,7 @@ rich==13.7.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # voir -safetensors==0.4.3 +safetensors==0.4.4 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/timm/requirements.in @@ -147,20 +144,20 @@ six==1.16.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # asttokens -sympy==1.13.0 +sympy==1.13.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torch -torch==2.3.1+rocm6.0 +torch==2.4.0+rocm6.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/timm/requirements.in # torchvision -torchvision==0.18.1+rocm6.0 +torchvision==0.19.0+rocm6.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/timm/requirements.in -tqdm==4.66.4 +tqdm==4.66.5 # via # -c .pin/../.pin/constraints-rocm-torch.txt # huggingface-hub @@ -170,7 +167,7 @@ typing-extensions==4.12.2 # huggingface-hub # reactivex # torch -urllib3==1.26.19 +urllib3==2.2.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # requests diff --git a/benchmarks/timm/requirements.xpu.txt b/benchmarks/timm/requirements.xpu.txt index d71eb8433..d7f993372 100644 --- a/benchmarks/timm/requirements.xpu.txt +++ b/benchmarks/timm/requirements.xpu.txt @@ -4,10 +4,9 @@ # # pip-compile --output-file=benchmarks/timm/requirements.xpu.txt .pin/tmp-constraints-xpu-timm.txt benchmarks/timm/requirements.in # ---extra-index-url https://pypi.ngc.nvidia.com +--extra-index-url https://download.pytorch.org/whl/cpu --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ --find-links 
https://storage.googleapis.com/jax-releases/jax_cuda_releases.html
---trusted-host pypi.ngc.nvidia.com
antlr4-python3-runtime==4.9.3
# via
@@ -17,7 +16,7 @@ asttokens==2.4.1
# via
# -c .pin/../.pin/constraints-xpu-torch.txt
# giving
-certifi==2024.6.2
+certifi==2024.7.4
# via
# -c .pin/../.pin/constraints-xpu-torch.txt
# requests
@@ -48,7 +47,7 @@ giving==0.4.2
# -c .pin/../.pin/constraints-xpu-torch.txt
# ptera
# voir
-huggingface-hub==0.24.0
+huggingface-hub==0.24.5
# via
# -c .pin/../.pin/constraints-xpu-torch.txt
# -r benchmarks/timm/requirements.in
@@ -88,7 +87,7 @@ omegaconf==2.3.0
# via
# -c .pin/../.pin/constraints-xpu-torch.txt
# voir
-ovld==0.3.5
+ovld==0.3.8
# via
# -c .pin/../.pin/constraints-xpu-torch.txt
# voir
@@ -116,7 +115,7 @@ pynvml==11.5.3
# via
# -c .pin/../.pin/constraints-xpu-torch.txt
# voir
-pyyaml==6.0.1
+pyyaml==6.0.2
# via
# -c .pin/../.pin/constraints-xpu-torch.txt
# -r benchmarks/timm/requirements.in
@@ -130,12 +129,11 @@ requests==2.32.3
# via
# -c .pin/../.pin/constraints-xpu-torch.txt
# huggingface-hub
- # torchvision
rich==13.7.1
# via
# -c .pin/../.pin/constraints-xpu-torch.txt
# voir
-safetensors==0.4.3
+safetensors==0.4.4
# via
# -c .pin/../.pin/constraints-xpu-torch.txt
# -r benchmarks/timm/requirements.in
@@ -143,22 +141,22 @@ six==1.16.0
# via
# -c .pin/../.pin/constraints-xpu-torch.txt
# asttokens
-sympy==1.13.0
+sympy==1.13.1
# via
# -c .pin/../.pin/constraints-xpu-torch.txt
# torch
-torch==2.1.0.post2+cxx11.abi
+torch==2.4.0+cpu
# via
# -c .pin/../.pin/constraints-xpu-torch.txt
# -c .pin/../constraints/xpu.txt
# -r benchmarks/timm/requirements.in
# torchvision
-torchvision==0.16.0.post2+cxx11.abi
+torchvision==0.19.0+cpu
# via
# -c .pin/../.pin/constraints-xpu-torch.txt
# -c .pin/../constraints/xpu.txt
# -r benchmarks/timm/requirements.in
-tqdm==4.66.4
+tqdm==4.66.5
# via
# -c .pin/../.pin/constraints-xpu-torch.txt
# huggingface-hub
@@ -168,7 +166,7 @@ typing-extensions==4.12.2
# via
# -c .pin/../.pin/constraints-xpu-torch.txt
# huggingface-hub
# reactivex
# torch
-urllib3==1.26.19
+urllib3==2.2.2
# via
# -c .pin/../.pin/constraints-xpu-torch.txt
# requests
diff --git a/benchmarks/torchatari/Makefile b/benchmarks/torchatari/Makefile
new file mode 100644
index 000000000..9eb0a30c5
--- /dev/null
+++ b/benchmarks/torchatari/Makefile
@@ -0,0 +1,30 @@
+# Use global base if possible
+ifndef MILABENCH_BASE
+ MILABENCH_BASE="base"
+endif
+
+export MILABENCH_BASE
+
+BENCH_NAME=torchatari
+MILABENCH_CONFIG=dev.yaml
+MILABENCH_ARGS=--config $(MILABENCH_CONFIG) --base $(MILABENCH_BASE)
+
+all: install prepare single gpus nodes
+
+install:
+ milabench install $(MILABENCH_ARGS) --force
+
+prepare:
+ milabench prepare $(MILABENCH_ARGS)
+
+tests:
+ MILABENCH_CPU_AUTO=1 CUDA_VISIBLE_DEVICES=0,1 milabench run $(MILABENCH_ARGS)
+
+single:
+ MILABENCH_CPU_AUTO=1 milabench run $(MILABENCH_ARGS) --select $(BENCH_NAME)
+
+gpus:
+ milabench run $(MILABENCH_ARGS) --select $(BENCH_NAME)-gpus
+
+nodes:
+ milabench run $(MILABENCH_ARGS) --select $(BENCH_NAME)-nodes
diff --git a/benchmarks/torchatari/README.md b/benchmarks/torchatari/README.md
new file mode 100644
index 000000000..44de20162
--- /dev/null
+++ b/benchmarks/torchatari/README.md
@@ -0,0 +1,4 @@
+
+# Torch_ppo_atari_envpool
+
+CleanRL-style PPO on Atari (Breakout-v5 by default), using EnvPool for vectorized environment stepping. See main.py for the training loop and dev.yaml for the default arguments.
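Before the implementation files, it is worth spelling out the rollout sizing that main.py derives from the dev.yaml arguments (both files follow below). A minimal sketch of the arithmetic, assuming the `--num-envs: auto({cpu_per_gpu}, 128)` entry resolves to 128; the actual value is chosen by milabench at run time:

```python
# Rollout sizing as computed in torchatari/main.py, evaluated for the
# dev.yaml defaults. num_envs = 128 is an assumption for illustration.
num_envs = 128
num_steps = 128
num_minibatches = 16
total_timesteps = 1_000_000

batch_size = num_envs * num_steps               # 16384 transitions per rollout
minibatch_size = batch_size // num_minibatches  # 1024 transitions per gradient step
num_iterations = total_timesteps // batch_size  # 61 policy updates in total

print(batch_size, minibatch_size, num_iterations)
```

Each update therefore re-uses 16384 fresh transitions for 4 epochs (`--update-epochs`) of 16 minibatches each.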
diff --git a/benchmarks/torchatari/benchfile.py b/benchmarks/torchatari/benchfile.py new file mode 100644 index 000000000..1bf4ee785 --- /dev/null +++ b/benchmarks/torchatari/benchfile.py @@ -0,0 +1,31 @@ +from milabench.pack import Package + + +class Torchatari(Package): + # Requirements file installed by install(). It can be empty or absent. + base_requirements = "requirements.in" + + # The preparation script called by prepare(). It must be executable, + # but it can be any type of script. It can be empty or absent. + prepare_script = "prepare.py" + + # The main script called by run(). It must be a Python file. It has to + # be present. + main_script = "main.py" + + # You can remove the functions below if you don't need to modify them. + + def make_env(self): + # Return a dict of environment variables for prepare_script and + # main_script. + return super().make_env() + + async def install(self): + await super().install() # super() call installs the requirements + + async def prepare(self): + await super().prepare() # super() call executes prepare_script + + + +__pack__ = Torchatari diff --git a/benchmarks/torchatari/dev.yaml b/benchmarks/torchatari/dev.yaml new file mode 100644 index 000000000..d0df0df1a --- /dev/null +++ b/benchmarks/torchatari/dev.yaml @@ -0,0 +1,17 @@ + +torchatari: + max_duration: 600 + inherits: _defaults + definition: . + install-variant: unpinned + install_group: torch + plan: + method: per_gpu + + argv: + --num-minibatches: 16 + --update-epochs: 4 + --num-steps: 128 + --num-envs: auto({cpu_per_gpu}, 128) + --total-timesteps: 1000000 + --env-id: Breakout-v5 \ No newline at end of file diff --git a/benchmarks/torchatari/main.py b/benchmarks/torchatari/main.py new file mode 100644 index 000000000..62c9b3a07 --- /dev/null +++ b/benchmarks/torchatari/main.py @@ -0,0 +1,349 @@ +# docs and experiment results can be found at https://docs.cleanrl.dev/rl-algorithms/ppo/#ppo_atari_envpoolpy +import os +import random +import time +from collections import deque +from dataclasses import dataclass + +import envpool +import gym +import numpy as np +import torch +import torch.nn as nn +import torch.optim as optim +import tyro +from torch.distributions.categorical import Categorical +from torch.utils.tensorboard import SummaryWriter +import torchcompat.core as acc + +@dataclass +class Args: + exp_name: str = os.path.basename(__file__)[: -len(".py")] + """the name of this experiment""" + seed: int = 1 + """seed of the experiment""" + torch_deterministic: bool = True + """if toggled, `torch.backends.cudnn.deterministic=False`""" + cuda: bool = True + """if toggled, cuda will be enabled by default""" + track: bool = False + """if toggled, this experiment will be tracked with Weights and Biases""" + wandb_project_name: str = "cleanRL" + """the wandb's project name""" + wandb_entity: str = None + """the entity (team) of wandb's project""" + capture_video: bool = False + """whether to capture videos of the agent performances (check out `videos` folder)""" + + # Algorithm specific arguments + env_id: str = "Breakout-v5" + """the id of the environment""" + total_timesteps: int = 10000000 + """total timesteps of the experiments""" + learning_rate: float = 2.5e-4 + """the learning rate of the optimizer""" + num_envs: int = 128 + """the number of parallel game environments""" + num_steps: int = 128 + """the number of steps to run in each environment per policy rollout""" + anneal_lr: bool = True + """Toggle learning rate annealing for policy and value networks""" + gamma: float = 0.99 + 
"""the discount factor gamma""" + gae_lambda: float = 0.95 + """the lambda for the general advantage estimation""" + num_minibatches: int = 16 + """the number of mini-batches""" + update_epochs: int = 4 + """the K epochs to update the policy""" + norm_adv: bool = True + """Toggles advantages normalization""" + clip_coef: float = 0.1 + """the surrogate clipping coefficient""" + clip_vloss: bool = True + """Toggles whether or not to use a clipped loss for the value function, as per the paper.""" + ent_coef: float = 0.01 + """coefficient of the entropy""" + vf_coef: float = 0.5 + """coefficient of the value function""" + max_grad_norm: float = 0.5 + """the maximum norm for the gradient clipping""" + target_kl: float = None + """the target KL divergence threshold""" + + # to be filled in runtime + batch_size: int = 0 + """the batch size (computed in runtime)""" + minibatch_size: int = 0 + """the mini-batch size (computed in runtime)""" + num_iterations: int = 0 + """the number of iterations (computed in runtime)""" + + +class RecordEpisodeStatistics(gym.Wrapper): + def __init__(self, env, deque_size=100): + super().__init__(env) + self.num_envs = getattr(env, "num_envs", 1) + self.episode_returns = None + self.episode_lengths = None + + def reset(self, **kwargs): + observations = super().reset(**kwargs) + self.episode_returns = np.zeros(self.num_envs, dtype=np.float32) + self.episode_lengths = np.zeros(self.num_envs, dtype=np.int32) + self.lives = np.zeros(self.num_envs, dtype=np.int32) + self.returned_episode_returns = np.zeros(self.num_envs, dtype=np.float32) + self.returned_episode_lengths = np.zeros(self.num_envs, dtype=np.int32) + return observations + + def step(self, action): + observations, rewards, dones, infos = super().step(action) + self.episode_returns += infos["reward"] + self.episode_lengths += 1 + self.returned_episode_returns[:] = self.episode_returns + self.returned_episode_lengths[:] = self.episode_lengths + self.episode_returns *= 1 - infos["terminated"] + self.episode_lengths *= 1 - infos["terminated"] + infos["r"] = self.returned_episode_returns + infos["l"] = self.returned_episode_lengths + return ( + observations, + rewards, + dones, + infos, + ) + + +def layer_init(layer, std=np.sqrt(2), bias_const=0.0): + torch.nn.init.orthogonal_(layer.weight, std) + torch.nn.init.constant_(layer.bias, bias_const) + return layer + + +class Agent(nn.Module): + def __init__(self, envs): + super().__init__() + self.network = nn.Sequential( + layer_init(nn.Conv2d(4, 32, 8, stride=4)), + nn.ReLU(), + layer_init(nn.Conv2d(32, 64, 4, stride=2)), + nn.ReLU(), + layer_init(nn.Conv2d(64, 64, 3, stride=1)), + nn.ReLU(), + nn.Flatten(), + layer_init(nn.Linear(64 * 7 * 7, 512)), + nn.ReLU(), + ) + self.actor = layer_init(nn.Linear(512, envs.single_action_space.n), std=0.01) + self.critic = layer_init(nn.Linear(512, 1), std=1) + + def get_value(self, x): + return self.critic(self.network(x / 255.0)) + + def get_action_and_value(self, x, action=None): + hidden = self.network(x / 255.0) + logits = self.actor(hidden) + probs = Categorical(logits=logits) + if action is None: + action = probs.sample() + return action, probs.log_prob(action), probs.entropy(), self.critic(hidden) + + +def main(): + args = tyro.cli(Args) + args.batch_size = int(args.num_envs * args.num_steps) + args.minibatch_size = int(args.batch_size // args.num_minibatches) + args.num_iterations = args.total_timesteps // args.batch_size + run_name = f"{args.env_id}__{args.exp_name}__{args.seed}__{int(time.time())}" + if args.track: + 
import wandb + + wandb.init( + project=args.wandb_project_name, + entity=args.wandb_entity, + sync_tensorboard=True, + config=vars(args), + name=run_name, + monitor_gym=True, + save_code=True, + ) + writer = SummaryWriter(f"runs/{run_name}") + writer.add_text( + "hyperparameters", + "|param|value|\n|-|-|\n%s" % ("\n".join([f"|{key}|{value}|" for key, value in vars(args).items()])), + ) + + # TRY NOT TO MODIFY: seeding + random.seed(args.seed) + np.random.seed(args.seed) + torch.manual_seed(args.seed) + torch.backends.cudnn.deterministic = args.torch_deterministic + + device = acc.fetch_device(0) + + # env setup + envs = envpool.make( + args.env_id, + env_type="gym", + num_envs=args.num_envs, + episodic_life=True, + reward_clip=True, + seed=args.seed, + ) + envs.num_envs = args.num_envs + envs.single_action_space = envs.action_space + envs.single_observation_space = envs.observation_space + envs = RecordEpisodeStatistics(envs) + assert isinstance(envs.action_space, gym.spaces.Discrete), "only discrete action space is supported" + + agent = Agent(envs).to(device) + optimizer = optim.Adam(agent.parameters(), lr=args.learning_rate, eps=1e-5) + + # ALGO Logic: Storage setup + obs = torch.zeros((args.num_steps, args.num_envs) + envs.single_observation_space.shape).to(device) + actions = torch.zeros((args.num_steps, args.num_envs) + envs.single_action_space.shape).to(device) + logprobs = torch.zeros((args.num_steps, args.num_envs)).to(device) + rewards = torch.zeros((args.num_steps, args.num_envs)).to(device) + dones = torch.zeros((args.num_steps, args.num_envs)).to(device) + values = torch.zeros((args.num_steps, args.num_envs)).to(device) + avg_returns = deque(maxlen=20) + + # TRY NOT TO MODIFY: start the game + global_step = 0 + start_time = time.time() + next_obs = torch.Tensor(envs.reset()).to(device) + next_done = torch.zeros(args.num_envs).to(device) + iterations = range(1, args.num_iterations + 1) + + for iteration in iterations: + # Annealing the rate if instructed to do so. + if args.anneal_lr: + frac = 1.0 - (iteration - 1.0) / args.num_iterations + lrnow = frac * args.learning_rate + optimizer.param_groups[0]["lr"] = lrnow + + for step in range(0, args.num_steps): + global_step += args.num_envs + obs[step] = next_obs + dones[step] = next_done + + # ALGO LOGIC: action logic + with torch.no_grad(): + action, logprob, _, value = agent.get_action_and_value(next_obs) + values[step] = value.flatten() + actions[step] = action + logprobs[step] = logprob + + # TRY NOT TO MODIFY: execute the game and log data. 
+ next_obs, reward, next_done, info = envs.step(action.cpu().numpy()) + rewards[step] = torch.tensor(reward).to(device).view(-1) + next_obs, next_done = torch.Tensor(next_obs).to(device), torch.Tensor(next_done).to(device) + + for idx, d in enumerate(next_done): + if d and info["lives"][idx] == 0: + # print(f"global_step={global_step}, episodic_return={info['r'][idx]}") + avg_returns.append(info["r"][idx]) + writer.add_scalar("charts/avg_episodic_return", np.average(avg_returns), global_step) + writer.add_scalar("charts/episodic_return", info["r"][idx], global_step) + writer.add_scalar("charts/episodic_length", info["l"][idx], global_step) + + # bootstrap value if not done + with torch.no_grad(): + next_value = agent.get_value(next_obs).reshape(1, -1) + advantages = torch.zeros_like(rewards).to(device) + lastgaelam = 0 + for t in reversed(range(args.num_steps)): + if t == args.num_steps - 1: + nextnonterminal = 1.0 - next_done + nextvalues = next_value + else: + nextnonterminal = 1.0 - dones[t + 1] + nextvalues = values[t + 1] + delta = rewards[t] + args.gamma * nextvalues * nextnonterminal - values[t] + advantages[t] = lastgaelam = delta + args.gamma * args.gae_lambda * nextnonterminal * lastgaelam + returns = advantages + values + + # flatten the batch + b_obs = obs.reshape((-1,) + envs.single_observation_space.shape) + b_logprobs = logprobs.reshape(-1) + b_actions = actions.reshape((-1,) + envs.single_action_space.shape) + b_advantages = advantages.reshape(-1) + b_returns = returns.reshape(-1) + b_values = values.reshape(-1) + + # Optimizing the policy and value network + b_inds = np.arange(args.batch_size) + clipfracs = [] + for epoch in range(args.update_epochs): + np.random.shuffle(b_inds) + for start in range(0, args.batch_size, args.minibatch_size): + end = start + args.minibatch_size + mb_inds = b_inds[start:end] + + _, newlogprob, entropy, newvalue = agent.get_action_and_value(b_obs[mb_inds], b_actions.long()[mb_inds]) + logratio = newlogprob - b_logprobs[mb_inds] + ratio = logratio.exp() + + with torch.no_grad(): + # calculate approx_kl http://joschu.net/blog/kl-approx.html + old_approx_kl = (-logratio).mean() + approx_kl = ((ratio - 1) - logratio).mean() + clipfracs += [((ratio - 1.0).abs() > args.clip_coef).float().mean().item()] + + mb_advantages = b_advantages[mb_inds] + if args.norm_adv: + mb_advantages = (mb_advantages - mb_advantages.mean()) / (mb_advantages.std() + 1e-8) + + # Policy loss + pg_loss1 = -mb_advantages * ratio + pg_loss2 = -mb_advantages * torch.clamp(ratio, 1 - args.clip_coef, 1 + args.clip_coef) + pg_loss = torch.max(pg_loss1, pg_loss2).mean() + + # Value loss + newvalue = newvalue.view(-1) + if args.clip_vloss: + v_loss_unclipped = (newvalue - b_returns[mb_inds]) ** 2 + v_clipped = b_values[mb_inds] + torch.clamp( + newvalue - b_values[mb_inds], + -args.clip_coef, + args.clip_coef, + ) + v_loss_clipped = (v_clipped - b_returns[mb_inds]) ** 2 + v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped) + v_loss = 0.5 * v_loss_max.mean() + else: + v_loss = 0.5 * ((newvalue - b_returns[mb_inds]) ** 2).mean() + + entropy_loss = entropy.mean() + loss = pg_loss - args.ent_coef * entropy_loss + v_loss * args.vf_coef + + optimizer.zero_grad() + loss.backward() + nn.utils.clip_grad_norm_(agent.parameters(), args.max_grad_norm) + optimizer.step() + + if args.target_kl is not None and approx_kl > args.target_kl: + break + + y_pred, y_true = b_values.cpu().numpy(), b_returns.cpu().numpy() + var_y = np.var(y_true) + explained_var = np.nan if var_y == 0 else 1 - 
np.var(y_true - y_pred) / var_y + + # TRY NOT TO MODIFY: record rewards for plotting purposes + writer.add_scalar("charts/learning_rate", optimizer.param_groups[0]["lr"], global_step) + writer.add_scalar("losses/value_loss", v_loss.item(), global_step) + writer.add_scalar("losses/policy_loss", pg_loss.item(), global_step) + writer.add_scalar("losses/entropy", entropy_loss.item(), global_step) + writer.add_scalar("losses/old_approx_kl", old_approx_kl.item(), global_step) + writer.add_scalar("losses/approx_kl", approx_kl.item(), global_step) + writer.add_scalar("losses/clipfrac", np.mean(clipfracs), global_step) + writer.add_scalar("losses/explained_variance", explained_var, global_step) + print("SPS:", int(global_step / (time.time() - start_time))) + writer.add_scalar("charts/SPS", int(global_step / (time.time() - start_time)), global_step) + + envs.close() + writer.close() + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/benchmarks/torchatari/prepare.py b/benchmarks/torchatari/prepare.py new file mode 100755 index 000000000..32bd5901d --- /dev/null +++ b/benchmarks/torchatari/prepare.py @@ -0,0 +1,16 @@ +#!/usr/bin/env python + +import os + +if __name__ == "__main__": + # If you need the whole configuration: + # config = json.loads(os.environ["MILABENCH_CONFIG"]) + + data_directory = os.environ["MILABENCH_DIR_DATA"] + + # Download (or generate) the needed dataset(s). You are responsible + # for checking whether it has already been properly downloaded, and + # for doing nothing if it has been. + print("Hello I am doing some data stuff!") + + # If there is nothing to download or generate, just delete this file. diff --git a/benchmarks/torchatari/requirements.cuda.txt b/benchmarks/torchatari/requirements.cuda.txt new file mode 100644 index 000000000..8f15c6635 --- /dev/null +++ b/benchmarks/torchatari/requirements.cuda.txt @@ -0,0 +1,308 @@ +# +# This file is autogenerated by pip-compile with Python 3.10 +# by the following command: +# +# pip-compile --output-file=benchmarks/torchatari/requirements.cuda.txt .pin/tmp-constraints-cuda-torchatari.txt benchmarks/torchatari/requirements.in +# +--extra-index-url https://pypi.ngc.nvidia.com +--extra-index-url https://download.pytorch.org/whl/cu121 +--find-links https://storage.googleapis.com/jax-releases/jax_cuda_releases.html +--trusted-host pypi.ngc.nvidia.com + +absl-py==2.1.0 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # dm-env + # tensorboard +antlr4-python3-runtime==4.9.3 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # omegaconf +appdirs==1.4.4 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # cantilever +asttokens==2.4.1 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # giving +cantilever==0.1.0 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # -r benchmarks/torchatari/requirements.in +cloudpickle==3.0.0 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # gym + # gymnasium +codefind==0.1.6 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # ptera +dm-env==1.6 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # envpool +dm-tree==0.1.8 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # dm-env +docstring-parser==0.16 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # tyro +envpool==0.8.4 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # -r benchmarks/torchatari/requirements.in +executing==1.2.0 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # varname +farama-notifications==0.0.4 + # via + # -c 
.pin/../.pin/constraints-cuda-torch.txt + # gymnasium +filelock==3.15.4 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # torch + # triton +fsspec==2024.6.1 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # torch +giving==0.4.2 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # ptera + # voir +grpcio==1.66.0 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # tensorboard +gym==0.23.1 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # -r benchmarks/torchatari/requirements.in + # envpool +gym-notices==0.0.8 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # gym +gymnasium==0.29.1 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # envpool +importlib-resources==6.4.4 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # cantilever + # torchcompat +jinja2==3.1.4 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # torch +markdown==3.7 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # tensorboard +markdown-it-py==3.0.0 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # rich +markupsafe==2.1.5 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # jinja2 + # werkzeug +mdurl==0.1.2 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # markdown-it-py +mpmath==1.3.0 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # sympy +networkx==3.3 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # torch +numpy==1.26.4 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # -r benchmarks/torchatari/requirements.in + # dm-env + # envpool + # gym + # gymnasium + # tensorboard +nvidia-cublas-cu12==12.1.3.1 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # nvidia-cudnn-cu12 + # nvidia-cusolver-cu12 + # torch +nvidia-cuda-cupti-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # torch +nvidia-cuda-nvrtc-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # torch +nvidia-cuda-runtime-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # torch +nvidia-cudnn-cu12==9.1.0.70 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # torch +nvidia-cufft-cu12==11.0.2.54 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # torch +nvidia-curand-cu12==10.3.2.106 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # torch +nvidia-cusolver-cu12==11.4.5.107 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # torch +nvidia-cusparse-cu12==12.1.0.106 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # nvidia-cusolver-cu12 + # torch +nvidia-ml-py==12.560.30 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # voir +nvidia-nccl-cu12==2.20.5 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # torch +nvidia-nvjitlink-cu12==12.6.20 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # nvidia-cusolver-cu12 + # nvidia-cusparse-cu12 +nvidia-nvtx-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # torch +omegaconf==2.3.0 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # voir +optree==0.12.1 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # envpool +ovld==0.3.9 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # voir +packaging==24.1 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # envpool + # tensorboard +protobuf==5.27.3 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # tensorboard +psutil==5.9.8 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # voir +ptera==1.4.1 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # voir 
+pygments==2.18.0 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # rich +pyyaml==6.0.2 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # omegaconf +reactivex==4.0.4 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # giving +rich==13.8.0 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # tyro + # voir +shtab==1.7.1 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # tyro +six==1.16.0 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # asttokens + # tensorboard +sympy==1.13.2 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # torch +tensorboard==2.17.1 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # -r benchmarks/torchatari/requirements.in +tensorboard-data-server==0.7.2 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # tensorboard +torch==2.4.0+cu121 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # -r benchmarks/torchatari/requirements.in +torchcompat==1.1.4 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # -c .pin/../constraints/cuda.txt + # -r benchmarks/torchatari/requirements.in +triton==3.0.0 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # torch +types-protobuf==5.27.0.20240626 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # envpool +typing-extensions==4.12.2 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # envpool + # gymnasium + # optree + # reactivex + # torch + # tyro +tyro==0.8.10 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # -r benchmarks/torchatari/requirements.in +varname==0.10.0 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # giving +voir==0.2.19 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # -c .pin/../constraints/cuda.txt + # -r benchmarks/torchatari/requirements.in +werkzeug==3.0.4 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # tensorboard + +# The following packages are considered to be unsafe in a requirements file: +# setuptools diff --git a/benchmarks/torchatari/requirements.in b/benchmarks/torchatari/requirements.in new file mode 100644 index 000000000..c264f5563 --- /dev/null +++ b/benchmarks/torchatari/requirements.in @@ -0,0 +1,9 @@ +envpool +gym==0.23.1 +numpy +torch +tyro +voir +tensorboard +torchcompat +cantilever diff --git a/benchmarks/torchatari/requirements.rocm.txt b/benchmarks/torchatari/requirements.rocm.txt new file mode 100644 index 000000000..71fd92e51 --- /dev/null +++ b/benchmarks/torchatari/requirements.rocm.txt @@ -0,0 +1,253 @@ +# +# This file is autogenerated by pip-compile with Python 3.10 +# by the following command: +# +# pip-compile --output-file=benchmarks/torchatari/requirements.rocm.txt .pin/tmp-constraints-rocm-torchatari.txt benchmarks/torchatari/requirements.in +# +--extra-index-url https://download.pytorch.org/whl/rocm6.0 + +absl-py==2.1.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # dm-env + # tensorboard +antlr4-python3-runtime==4.9.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # omegaconf +appdirs==1.4.4 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # cantilever +asttokens==2.4.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # giving +cantilever==0.1.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/torchatari/requirements.in +cloudpickle==3.0.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # gym + # gymnasium +codefind==0.1.6 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # ptera +dm-env==1.6 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # envpool 
+dm-tree==0.1.8 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # dm-env +docstring-parser==0.16 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # tyro +envpool==0.8.4 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/torchatari/requirements.in +executing==1.2.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # varname +farama-notifications==0.0.4 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # gymnasium +filelock==3.15.4 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # pytorch-triton-rocm + # torch +fsspec==2024.6.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # torch +giving==0.4.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # ptera + # voir +grpcio==1.65.5 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # tensorboard +gym==0.23.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/torchatari/requirements.in + # envpool +gym-notices==0.0.8 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # gym +gymnasium==0.29.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # envpool +importlib-resources==6.4.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # cantilever + # torchcompat +jinja2==3.1.4 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # torch +markdown==3.7 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # tensorboard +markdown-it-py==3.0.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # rich +markupsafe==2.1.5 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # jinja2 + # werkzeug +mdurl==0.1.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # markdown-it-py +mpmath==1.3.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # sympy +networkx==3.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # torch +numpy==1.26.4 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/torchatari/requirements.in + # dm-env + # envpool + # gym + # gymnasium + # tensorboard +omegaconf==2.3.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir +optree==0.12.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # envpool +ovld==0.3.8 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir +packaging==24.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # envpool + # tensorboard +protobuf==5.27.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # tensorboard +psutil==5.9.8 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir +ptera==1.4.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir +pygments==2.18.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # rich +pynvml==11.5.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir +pytorch-triton-rocm==3.0.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # torch +pyyaml==6.0.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # omegaconf +reactivex==4.0.4 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # giving +rich==13.7.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # tyro + # voir +shtab==1.7.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # tyro +six==1.16.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # asttokens + # tensorboard +sympy==1.13.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # torch +tensorboard==2.17.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/torchatari/requirements.in +tensorboard-data-server==0.7.2 + # via + # -c 
.pin/../.pin/constraints-rocm-torch.txt + # tensorboard +torch==2.4.0+rocm6.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/torchatari/requirements.in +torchcompat==1.1.4 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -c .pin/../constraints/rocm.txt + # -r benchmarks/torchatari/requirements.in +types-protobuf==5.27.0.20240626 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # envpool +typing-extensions==4.12.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # envpool + # gymnasium + # optree + # reactivex + # torch + # tyro +tyro==0.8.8 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/torchatari/requirements.in +varname==0.10.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # giving +voir==0.2.17 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -c .pin/../constraints/rocm.txt + # -r benchmarks/torchatari/requirements.in +werkzeug==3.0.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # tensorboard + +# The following packages are considered to be unsafe in a requirements file: +# setuptools diff --git a/benchmarks/torchatari/voirfile.py b/benchmarks/torchatari/voirfile.py new file mode 100644 index 000000000..7b8873852 --- /dev/null +++ b/benchmarks/torchatari/voirfile.py @@ -0,0 +1,87 @@ +from dataclasses import dataclass + +from voir import configurable +from voir.phase import StopProgram +from benchmate.observer import BenchObserver +from benchmate.monitor import voirfile_monitor + + +@dataclass +class Config: + """voir configuration""" + + # Whether to display the dash or not + dash: bool = False + + # How often to log the rates + interval: str = "1s" + + # Number of rates to skip before logging + skip: int = 5 + + # Number of rates to log before stopping + stop: int = 20 + + # Number of seconds between each gpu poll + gpu_poll: int = 3 + + +@configurable +def instrument_main(ov, options: Config): + yield ov.phases.init + + # GPU monitor, rate, loss etc... 
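+ # voirfile_monitor starts voir's standard monitoring (GPU polling, rates). + # The ptera probes below (e.g. "//main > args") intercept variables inside + # the benchmark's main() so it can be instrumented without modifying it.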
+ voirfile_monitor(ov, options) + + yield ov.phases.load_script + + step_per_iteration = 0 + + def fetch_args(args): + nonlocal step_per_iteration + step_per_iteration = args.num_envs * args.num_steps + return args + + def batch_size(x): + return step_per_iteration + + observer = BenchObserver( + earlystop=options.stop + options.skip, + batch_size_fn=batch_size, + ) + + probe = ov.probe("//main > args", overridable=True) + probe['args'].override(fetch_args) + + # measure the time it took to execute the body + probe = ov.probe("//main > iterations", overridable=True) + probe['iterations'].override(observer.loader) + + # Too many losses + # probe = ov.probe("//main > loss", overridable=True) + # probe["loss"].override(observer.record_loss) + + def record_starts(writer): + old_add_scalar = writer.add_scalar + + def add_scalar(name, *values): + if name == "losses/value_loss": + observer.record_loss(values[0]) + old_add_scalar(name, *values) + + writer.add_scalar = add_scalar + return writer + + probe = ov.probe("//main > writer", overridable=True) + probe["writer"].override(record_starts) + + probe = ov.probe("//main > optimizer", overridable=True) + probe['optimizer'].override(observer.optimizer) + + # + # Run the benchmark + # + try: + yield ov.phases.run_script + except StopProgram: + print("early stopped") \ No newline at end of file diff --git a/benchmarks/torchvision/requirements.cuda.txt b/benchmarks/torchvision/requirements.cuda.txt index 0b1d03ae1..a24805963 100644 --- a/benchmarks/torchvision/requirements.cuda.txt +++ b/benchmarks/torchvision/requirements.cuda.txt @@ -30,7 +30,7 @@ filelock==3.15.4 # -c .pin/../.pin/constraints-cuda-torch.txt # torch # triton -fsspec==2024.5.0 +fsspec==2024.6.1 # via # -c .pin/../.pin/constraints-cuda-torch.txt # torch @@ -39,7 +39,7 @@ giving==0.4.2 # -c .pin/../.pin/constraints-cuda-torch.txt # ptera # voir -importlib-resources==6.4.0 +importlib-resources==6.4.4 # via # -c .pin/../.pin/constraints-cuda-torch.txt # torchcompat @@ -110,6 +110,10 @@ nvidia-cusparse-cu12==12.1.0.106 # -c .pin/../.pin/constraints-cuda-torch.txt # nvidia-cusolver-cu12 # torch +nvidia-ml-py==12.560.30 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # voir nvidia-nccl-cu12==2.20.5 # via # -c .pin/../.pin/constraints-cuda-torch.txt @@ -127,7 +131,7 @@ omegaconf==2.3.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # voir -ovld==0.3.6 +ovld==0.3.9 # via # -c .pin/../.pin/constraints-cuda-torch.txt # voir @@ -147,11 +151,7 @@ pygments==2.18.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # rich -pynvml==11.5.3 - # via - # -c .pin/../.pin/constraints-cuda-torch.txt - # voir -pyyaml==6.0.1 +pyyaml==6.0.2 # via # -c .pin/../.pin/constraints-cuda-torch.txt # omegaconf @@ -159,7 +159,7 @@ reactivex==4.0.4 # via # -c .pin/../.pin/constraints-cuda-torch.txt # giving -rich==13.7.1 +rich==13.8.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # voir @@ -167,7 +167,7 @@ six==1.16.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # asttokens -sympy==1.13.1 +sympy==1.13.2 # via # -c .pin/../.pin/constraints-cuda-torch.txt # torch @@ -185,7 +185,7 @@ torchvision==0.19.0+cu121 # via # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/torchvision/requirements.in -tqdm==4.66.4 +tqdm==4.66.5 # via # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/torchvision/requirements.in diff --git a/benchmarks/torchvision/requirements.rocm.txt b/benchmarks/torchvision/requirements.rocm.txt index 2f0b78222..094eb29b6 100644 --- 
a/benchmarks/torchvision/requirements.rocm.txt +++ b/benchmarks/torchvision/requirements.rocm.txt @@ -4,10 +4,7 @@ # # pip-compile --output-file=benchmarks/torchvision/requirements.rocm.txt .pin/tmp-constraints-rocm-torchvision.txt benchmarks/torchvision/requirements.in # ---extra-index-url https://pypi.ngc.nvidia.com --extra-index-url https://download.pytorch.org/whl/rocm6.0 ---find-links https://storage.googleapis.com/jax-releases/jax_cuda_releases.html ---trusted-host pypi.ngc.nvidia.com antlr4-python3-runtime==4.9.3 # via @@ -30,7 +27,7 @@ filelock==3.15.4 # -c .pin/../.pin/constraints-rocm-torch.txt # pytorch-triton-rocm # torch -fsspec==2024.5.0 +fsspec==2024.6.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torch @@ -39,7 +36,7 @@ giving==0.4.2 # -c .pin/../.pin/constraints-rocm-torch.txt # ptera # voir -importlib-resources==6.4.0 +importlib-resources==6.4.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torchcompat @@ -75,7 +72,7 @@ omegaconf==2.3.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # voir -ovld==0.3.5 +ovld==0.3.8 # via # -c .pin/../.pin/constraints-rocm-torch.txt # voir @@ -99,11 +96,11 @@ pynvml==11.5.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # voir -pytorch-triton-rocm==2.3.1 +pytorch-triton-rocm==3.0.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torch -pyyaml==6.0.1 +pyyaml==6.0.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # omegaconf @@ -119,11 +116,11 @@ six==1.16.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # asttokens -sympy==1.13.0 +sympy==1.13.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torch -torch==2.3.1+rocm6.0 +torch==2.4.0+rocm6.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/torchvision/requirements.in @@ -133,11 +130,11 @@ torchcompat==1.1.4 # -c .pin/../.pin/constraints-rocm-torch.txt # -c .pin/../constraints/rocm.txt # -r benchmarks/torchvision/requirements.in -torchvision==0.18.1+rocm6.0 +torchvision==0.19.0+rocm6.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/torchvision/requirements.in -tqdm==4.66.4 +tqdm==4.66.5 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/torchvision/requirements.in diff --git a/benchmarks/torchvision/requirements.xpu.txt b/benchmarks/torchvision/requirements.xpu.txt index 6503a0c9e..3cd876972 100644 --- a/benchmarks/torchvision/requirements.xpu.txt +++ b/benchmarks/torchvision/requirements.xpu.txt @@ -4,10 +4,9 @@ # # pip-compile --output-file=benchmarks/torchvision/requirements.xpu.txt .pin/tmp-constraints-xpu-torchvision.txt benchmarks/torchvision/requirements.in # ---extra-index-url https://pypi.ngc.nvidia.com +--extra-index-url https://download.pytorch.org/whl/cpu --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ --find-links https://storage.googleapis.com/jax-releases/jax_cuda_releases.html ---trusted-host pypi.ngc.nvidia.com antlr4-python3-runtime==4.9.3 # via @@ -17,14 +16,6 @@ asttokens==2.4.1 # via # -c .pin/../.pin/constraints-xpu-torch.txt # giving -certifi==2024.6.2 - # via - # -c .pin/../.pin/constraints-xpu-torch.txt - # requests -charset-normalizer==3.3.2 - # via - # -c .pin/../.pin/constraints-xpu-torch.txt - # requests codefind==0.1.6 # via # -c .pin/../.pin/constraints-xpu-torch.txt @@ -46,10 +37,6 @@ giving==0.4.2 # -c .pin/../.pin/constraints-xpu-torch.txt # ptera # voir -idna==3.7 - # via - # -c .pin/../.pin/constraints-xpu-torch.txt - # requests importlib-resources==6.4.0 # via # -c .pin/../.pin/constraints-xpu-torch.txt @@ -86,7 +73,7 @@ 
omegaconf==2.3.0 # via # -c .pin/../.pin/constraints-xpu-torch.txt # voir -ovld==0.3.5 +ovld==0.3.8 # via # -c .pin/../.pin/constraints-xpu-torch.txt # voir @@ -110,7 +97,7 @@ pynvml==11.5.3 # via # -c .pin/../.pin/constraints-xpu-torch.txt # voir -pyyaml==6.0.1 +pyyaml==6.0.2 # via # -c .pin/../.pin/constraints-xpu-torch.txt # omegaconf @@ -118,10 +105,6 @@ reactivex==4.0.4 # via # -c .pin/../.pin/constraints-xpu-torch.txt # giving -requests==2.32.3 - # via - # -c .pin/../.pin/constraints-xpu-torch.txt - # torchvision rich==13.7.1 # via # -c .pin/../.pin/constraints-xpu-torch.txt @@ -130,11 +113,11 @@ six==1.16.0 # via # -c .pin/../.pin/constraints-xpu-torch.txt # asttokens -sympy==1.13.0 +sympy==1.13.1 # via # -c .pin/../.pin/constraints-xpu-torch.txt # torch -torch==2.1.0.post2+cxx11.abi +torch==2.4.0+cpu # via # -c .pin/../.pin/constraints-xpu-torch.txt # -c .pin/../constraints/xpu.txt @@ -145,12 +128,12 @@ torchcompat==1.1.4 # -c .pin/../.pin/constraints-xpu-torch.txt # -c .pin/../constraints/xpu.txt # -r benchmarks/torchvision/requirements.in -torchvision==0.16.0.post2+cxx11.abi +torchvision==0.19.0+cpu # via # -c .pin/../.pin/constraints-xpu-torch.txt # -c .pin/../constraints/xpu.txt # -r benchmarks/torchvision/requirements.in -tqdm==4.66.4 +tqdm==4.66.5 # via # -c .pin/../.pin/constraints-xpu-torch.txt # -r benchmarks/torchvision/requirements.in @@ -159,10 +142,6 @@ typing-extensions==4.12.2 # -c .pin/../.pin/constraints-xpu-torch.txt # reactivex # torch -urllib3==1.26.19 - # via - # -c .pin/../.pin/constraints-xpu-torch.txt - # requests varname==0.10.0 # via # -c .pin/../.pin/constraints-xpu-torch.txt diff --git a/benchmarks/torchvision_ddp/requirements.cuda.txt b/benchmarks/torchvision_ddp/requirements.cuda.txt index 856b6c852..7c1971e1f 100644 --- a/benchmarks/torchvision_ddp/requirements.cuda.txt +++ b/benchmarks/torchvision_ddp/requirements.cuda.txt @@ -30,7 +30,7 @@ filelock==3.15.4 # -c .pin/../.pin/constraints-cuda-torch.txt # torch # triton -fsspec==2024.5.0 +fsspec==2024.6.1 # via # -c .pin/../.pin/constraints-cuda-torch.txt # torch @@ -39,7 +39,7 @@ giving==0.4.2 # -c .pin/../.pin/constraints-cuda-torch.txt # ptera # voir -importlib-resources==6.4.0 +importlib-resources==6.4.4 # via # -c .pin/../.pin/constraints-cuda-torch.txt # torchcompat @@ -110,6 +110,10 @@ nvidia-cusparse-cu12==12.1.0.106 # -c .pin/../.pin/constraints-cuda-torch.txt # nvidia-cusolver-cu12 # torch +nvidia-ml-py==12.560.30 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # voir nvidia-nccl-cu12==2.20.5 # via # -c .pin/../.pin/constraints-cuda-torch.txt @@ -127,7 +131,7 @@ omegaconf==2.3.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # voir -ovld==0.3.6 +ovld==0.3.9 # via # -c .pin/../.pin/constraints-cuda-torch.txt # voir @@ -147,11 +151,7 @@ pygments==2.18.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # rich -pynvml==11.5.3 - # via - # -c .pin/../.pin/constraints-cuda-torch.txt - # voir -pyyaml==6.0.1 +pyyaml==6.0.2 # via # -c .pin/../.pin/constraints-cuda-torch.txt # omegaconf @@ -159,7 +159,7 @@ reactivex==4.0.4 # via # -c .pin/../.pin/constraints-cuda-torch.txt # giving -rich==13.7.1 +rich==13.8.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # voir @@ -167,7 +167,7 @@ six==1.16.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # asttokens -sympy==1.13.1 +sympy==1.13.2 # via # -c .pin/../.pin/constraints-cuda-torch.txt # torch @@ -185,7 +185,7 @@ torchvision==0.19.0+cu121 # via # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/torchvision_ddp/requirements.in 
-tqdm==4.66.4 +tqdm==4.66.5 # via # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/torchvision_ddp/requirements.in diff --git a/benchmarks/torchvision_ddp/requirements.rocm.txt b/benchmarks/torchvision_ddp/requirements.rocm.txt index 105c4a545..d1241db8b 100644 --- a/benchmarks/torchvision_ddp/requirements.rocm.txt +++ b/benchmarks/torchvision_ddp/requirements.rocm.txt @@ -4,10 +4,7 @@ # # pip-compile --output-file=benchmarks/torchvision_ddp/requirements.rocm.txt .pin/tmp-constraints-rocm-torchvision.txt benchmarks/torchvision_ddp/requirements.in # ---extra-index-url https://pypi.ngc.nvidia.com --extra-index-url https://download.pytorch.org/whl/rocm6.0 ---find-links https://storage.googleapis.com/jax-releases/jax_cuda_releases.html ---trusted-host pypi.ngc.nvidia.com antlr4-python3-runtime==4.9.3 # via @@ -30,7 +27,7 @@ filelock==3.15.4 # -c .pin/../.pin/constraints-rocm-torch.txt # pytorch-triton-rocm # torch -fsspec==2024.5.0 +fsspec==2024.6.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torch @@ -39,7 +36,7 @@ giving==0.4.2 # -c .pin/../.pin/constraints-rocm-torch.txt # ptera # voir -importlib-resources==6.4.0 +importlib-resources==6.4.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torchcompat @@ -75,7 +72,7 @@ omegaconf==2.3.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # voir -ovld==0.3.5 +ovld==0.3.8 # via # -c .pin/../.pin/constraints-rocm-torch.txt # voir @@ -99,11 +96,11 @@ pynvml==11.5.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # voir -pytorch-triton-rocm==2.3.1 +pytorch-triton-rocm==3.0.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torch -pyyaml==6.0.1 +pyyaml==6.0.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # omegaconf @@ -119,11 +116,11 @@ six==1.16.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # asttokens -sympy==1.13.0 +sympy==1.13.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torch -torch==2.3.1+rocm6.0 +torch==2.4.0+rocm6.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/torchvision_ddp/requirements.in @@ -133,11 +130,11 @@ torchcompat==1.1.4 # -c .pin/../.pin/constraints-rocm-torch.txt # -c .pin/../constraints/rocm.txt # -r benchmarks/torchvision_ddp/requirements.in -torchvision==0.18.1+rocm6.0 +torchvision==0.19.0+rocm6.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/torchvision_ddp/requirements.in -tqdm==4.66.4 +tqdm==4.66.5 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/torchvision_ddp/requirements.in diff --git a/benchmarks/torchvision_ddp/requirements.xpu.txt b/benchmarks/torchvision_ddp/requirements.xpu.txt index a4a3f6220..b3f732e86 100644 --- a/benchmarks/torchvision_ddp/requirements.xpu.txt +++ b/benchmarks/torchvision_ddp/requirements.xpu.txt @@ -4,10 +4,8 @@ # # pip-compile --output-file=benchmarks/torchvision_ddp/requirements.xpu.txt .pin/tmp-constraints-xpu-torchvision.txt benchmarks/torchvision_ddp/requirements.in # ---extra-index-url https://pypi.ngc.nvidia.com +--extra-index-url https://download.pytorch.org/whl/cpu --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ ---find-links https://storage.googleapis.com/jax-releases/jax_cuda_releases.html ---trusted-host pypi.ngc.nvidia.com antlr4-python3-runtime==4.9.3 # via @@ -17,14 +15,6 @@ asttokens==2.4.1 # via # -c .pin/../.pin/constraints-xpu-torch.txt # giving -certifi==2024.6.2 - # via - # -c .pin/../.pin/constraints-xpu-torch.txt - # requests -charset-normalizer==3.3.2 - # via - # -c .pin/../.pin/constraints-xpu-torch.txt - # requests 
codefind==0.1.6 # via # -c .pin/../.pin/constraints-xpu-torch.txt @@ -46,10 +36,6 @@ giving==0.4.2 # -c .pin/../.pin/constraints-xpu-torch.txt # ptera # voir -idna==3.7 - # via - # -c .pin/../.pin/constraints-xpu-torch.txt - # requests importlib-resources==6.4.0 # via # -c .pin/../.pin/constraints-xpu-torch.txt @@ -86,7 +72,7 @@ omegaconf==2.3.0 # via # -c .pin/../.pin/constraints-xpu-torch.txt # voir -ovld==0.3.5 +ovld==0.3.8 # via # -c .pin/../.pin/constraints-xpu-torch.txt # voir @@ -110,7 +96,7 @@ pynvml==11.5.3 # via # -c .pin/../.pin/constraints-xpu-torch.txt # voir -pyyaml==6.0.1 +pyyaml==6.0.2 # via # -c .pin/../.pin/constraints-xpu-torch.txt # omegaconf @@ -118,10 +104,6 @@ reactivex==4.0.4 # via # -c .pin/../.pin/constraints-xpu-torch.txt # giving -requests==2.32.3 - # via - # -c .pin/../.pin/constraints-xpu-torch.txt - # torchvision rich==13.7.1 # via # -c .pin/../.pin/constraints-xpu-torch.txt @@ -130,11 +112,11 @@ six==1.16.0 # via # -c .pin/../.pin/constraints-xpu-torch.txt # asttokens -sympy==1.13.0 +sympy==1.13.1 # via # -c .pin/../.pin/constraints-xpu-torch.txt # torch -torch==2.1.0.post2+cxx11.abi +torch==2.4.0+cpu # via # -c .pin/../.pin/constraints-xpu-torch.txt # -c .pin/../constraints/xpu.txt @@ -145,12 +127,12 @@ torchcompat==1.1.4 # -c .pin/../.pin/constraints-xpu-torch.txt # -c .pin/../constraints/xpu.txt # -r benchmarks/torchvision_ddp/requirements.in -torchvision==0.16.0.post2+cxx11.abi +torchvision==0.19.0+cpu # via # -c .pin/../.pin/constraints-xpu-torch.txt # -c .pin/../constraints/xpu.txt # -r benchmarks/torchvision_ddp/requirements.in -tqdm==4.66.4 +tqdm==4.66.5 # via # -c .pin/../.pin/constraints-xpu-torch.txt # -r benchmarks/torchvision_ddp/requirements.in @@ -159,10 +141,6 @@ typing-extensions==4.12.2 # -c .pin/../.pin/constraints-xpu-torch.txt # reactivex # torch -urllib3==1.26.19 - # via - # -c .pin/../.pin/constraints-xpu-torch.txt - # requests varname==0.10.0 # via # -c .pin/../.pin/constraints-xpu-torch.txt diff --git a/benchmate/benchmate/observer.py b/benchmate/benchmate/observer.py index 9676e8261..0a40ae11e 100644 --- a/benchmate/benchmate/observer.py +++ b/benchmate/benchmate/observer.py @@ -75,10 +75,13 @@ def iterate(self, iterator, custom_step=False): def step(self): self.instance.step() + def original_dataloader(self): + return self.instance + def loader(self, loader, custom_step=False): """Wrap a dataloader or an iterable which enable accurate measuring of time spent in the loop's body""" - if self.instance: - return self.instance + if self.instance is not None: + return self.instance.loader cls = TimedIterator if custom_step: diff --git a/config/base.yaml b/config/base.yaml index 1d10341bb..3d02f33e6 100644 --- a/config/base.yaml +++ b/config/base.yaml @@ -106,7 +106,7 @@ _timm: argv: --amp: true --amp-dtype: bfloat16 - --device: '{arch}' + --device: '{device_name}' --val-split: '' --data-dir: "{milabench_data}" --dataset: "FakeImageNet" @@ -402,6 +402,7 @@ _diffusion: --num_epochs: 5 --batch_size: 32 --num_workers: "auto({n_worker}, 8)" + --cache: "{milabench_cache}" diffusion-single: inherits: _diffusion @@ -414,6 +415,8 @@ diffusion-gpus: num_machines: 1 diffusion-nodes: + tags: + - multinode inherits: _diffusion num_machines: 2 requires_capabilities: @@ -429,7 +432,7 @@ _lightning: --loader: pytorch --data: "{milabench_data}/FakeImageNet" --model: resnet152 - --batch-size: 16 + --batch-size: 256 lightning: inherits: _lightning @@ -463,7 +466,7 @@ dinov2-giant-single: method: per_gpu argv: - --config-file: 
src/dinov2/configs/train/vitg14.yaml + --config-file: "{benchmark_folder}/src/dinov2/configs/train/vitg14.yaml" # THOSE NEED TO BE LAST train.dataset_path=ImageNet:split=TRAIN:root={milabench_data}/FakeImageNet:extra={milabench_data}/FakeImageNet: true train.batch_size_per_gpu=32: true @@ -473,7 +476,7 @@ dinov2-giant-single: dinov2-giant-gpus: inherits: _dinov2 argv: - --config-file: src/dinov2/configs/train/vitg14.yaml + --config-file: "{benchmark_folder}/src/dinov2/configs/train/vitg14.yaml" # THOSE NEED TO BE LAST train.dataset_path=ImageNet:split=TRAIN:root={milabench_data}/FakeImageNet:extra={milabench_data}/FakeImageNet: true train.batch_size_per_gpu=32: true @@ -481,12 +484,16 @@ dinov2-giant-gpus: train.num_workers=10: true dinov2-giant-nodes: + enabled: false + tags: + - multinode + max_duration: 3600 inherits: _dinov2 argv: - --config-file: src/dinov2/configs/train/vitg14.yaml + --config-file: "{benchmark_folder}/src/dinov2/configs/train/vitg14.yaml" # THOSE NEED TO BE LAST - train.dataset_path=ImageNet:split=TRAIN:root={milabench_data}/FakeImageNet:extra={milabench_data}/FakeImageNet: true - train.batch_size_per_gpu=32: true + train.dataset_path=ImageFolder:root={milabench_data}/FakeImageNet: true + train.batch_size_per_gpu=12: true train.saveckp_freq=100: true train.num_workers=10: true @@ -546,6 +553,9 @@ llm-lora-ddp-gpus: llm-lora-ddp-nodes: + tags: + - multinode + max_duration: 3600 inherits: _llm plan: method: njobs @@ -611,6 +621,9 @@ llm-full-mp-gpus: llm-full-mp-nodes: + tags: + - multinode + max_duration: 3600 inherits: _llm plan: method: njobs @@ -635,3 +648,52 @@ llm-full-mp-nodes: - "len(nodes) >= ${num_machines}" +_geo_gnn: + inherits: _defaults + definition: . + # FIXME: torch cluster is laging behind pytorch + # we are forced to use torch==2.3 instead of torch==2.4 + install_group: gnn + group: geo_gnn + definition: ../benchmarks/geo_gnn + plan: + method: per_gpu + +dimenet: + inherits: _geo_gnn + argv: + --model: 'DimeNet' + --num-samples: 10000 + --use3d: True + + +recursiongfn: + inherits: _defaults + definition: ../benchmarks/recursiongfn + install_group: gnn + group: recursiongfn_gnn + plan: + method: per_gpu + + argv: + --batch_size: 128 + --num_workers: 8 + --num_steps: 100 + --layer_width: 128 + --num_layers: 4 + + +torchatari: + inherits: _defaults + definition: ../benchmarks/torchatari + install_group: torch + plan: + method: per_gpu + + argv: + --num-minibatches: 16 + --update-epochs: 4 + --num-steps: 128 + --num-envs: auto({cpu_per_gpu}, 128) + --total-timesteps: 1000000 + --env-id: Breakout-v5 diff --git a/config/scaling.yaml b/config/scaling.yaml index c6cf1bf6c..0a9907e5a 100644 --- a/config/scaling.yaml +++ b/config/scaling.yaml @@ -2,9 +2,10 @@ bert-fp16: arg: --batch-size model: 1: 4108.75 MiB + 2: 475.0 MiB 4: 1840.375 MiB 8: 8614.75 MiB - 16: 14254.75 MiB + 16: 475.0 MiB 32: 24604.75 MiB 40: 34157.9375 MiB 64: 47212.375 MiB @@ -16,9 +17,10 @@ bert-fp32: arg: --batch-size model: 1: 4206.75 MiB + 2: 475.0 MiB 4: 6652.375 MiB 8: 10240.75 MiB - 16: 17646.75 MiB + 16: 475.0 MiB 24: 28007.9375 MiB 32: 31568.75 MiB 64: 61196.375 MiB @@ -28,9 +30,10 @@ bert-tf32: arg: --batch-size model: 1: 4204.75 MiB + 2: 475.0 MiB 4: 6654.375 MiB 8: 10242.75 MiB - 16: 17648.75 MiB + 16: 475.0 MiB 24: 28009.9375 MiB 32: 31570.75 MiB 64: 61198.375 MiB @@ -40,9 +43,10 @@ bert-tf32-fp16: arg: --batch-size model: 1: 4108.75 MiB + 2: 475.0 MiB 4: 1840.375 MiB 8: 8614.75 MiB - 16: 14254.75 MiB + 16: 475.0 MiB 32: 24604.75 MiB 40: 34157.9375 MiB 64: 47212.375 MiB @@ -75,6 
+79,7 @@ convnext_large-fp32: model: 1: 3268.75 MiB 2: 3480.375 MiB + 4: 2060.75 MiB 8: 5824.75 MiB 16: 8774.75 MiB 32: 14548.75 MiB @@ -91,7 +96,7 @@ convnext_large-tf32: 1: 3268.75 MiB 2: 3480.375 MiB 8: 5824.75 MiB - 16: 8774.75 MiB + 16: 1768.75 MiB 32: 14548.75 MiB 64: 26274.75 MiB 72: 33081.9375 MiB @@ -156,6 +161,37 @@ davit_large-multi: 288: 65910.375 MiB 328: 81742.75 MiB optimized: 128 +diffusion-gpus: + arg: --batch_size + model: + 1: 23082 MiB + 2: 21818.75 MiB + 4: 23478.75 MiB + 8: 26500.75 MiB + 16: 36436.75 MiB + 32: 57808 MiB + 48: 80698 MiB + optimized: 32 +diffusion-nodes: + arg: --batch_size + model: + 1: 21686.75 MiB + 2: 21930.75 MiB + 4: 23510.75 MiB +diffusion-single: + arg: --batch_size + model: + 1: 21654.75 MiB + 2: 21818.75 MiB + 4: 23478.75 MiB +dimenet: {} +dinov2-giant-gpus: + arg: train.batch_size_per_gpu={batch_size} + model: + 32: 69614 MiB + optimized: 32 +dinov2-giant-single: + arg: train.batch_size_per_gpu={batch_size} dlrm: {} focalnet: arg: --batch-size @@ -178,7 +214,37 @@ focalnet: optimized: 128 fp16: {} fp32: {} +lightning: + arg: --batch-size +lightning-gpus: + arg: --batch-size + model: + 1: 4542 MiB + 2: 1158.75 MiB + 4: 1156.75 MiB + 8: 1260.75 MiB + 16: 4150.75 MiB + 128: 15858 MiB + optimized: 16 llama: {} +llm-full-mp-gpus: + arg: batch_size={batch_size} +llm-full-mp-nodes: + arg: batch_size={batch_size} +llm-lora-ddp-gpus: + arg: batch_size={batch_size} + model: + 1: 12418.75 MiB +llm-lora-ddp-nodes: + arg: batch_size={batch_size} +llm-lora-mp-gpus: + arg: batch_size={batch_size} +llm-lora-single: + arg: batch_size={batch_size} + model: + 1: 23196.75 MiB + 2: 27694.75 MiB + 16: 45076.75 MiB opt-1_3b: arg: --per_gpu_batch_size model: @@ -189,19 +255,25 @@ opt-1_3b-multinode: model: 1: 42126 MiB optimized: 1 -opt-6_7b: {} +opt-6_7b: + arg: --per_gpu_batch_size opt-6_7b-multinode: arg: --per_gpu_batch_size model: 1: 55380 MiB optimized: 1 +recursiongfn: + arg: --batch_size + model: + 2: 1134.75 MiB + 4: 1140.75 MiB reformer: arg: --batch-size model: 1: 1916.75 MiB 4: 3004.375 MiB 8: 4512.75 MiB - 16: 7486.75 MiB + 16: 7082.75 MiB 24: 10470.75 MiB 32: 13454.75 MiB 64: 25408.75 MiB @@ -215,9 +287,10 @@ regnet_y_128gf: arg: --batch-size model: 1: 6876.75 MiB + 2: 475.0 MiB 4: 9062.375 MiB 8: 8524.75 MiB - 16: 11426.75 MiB + 16: 1234.75 MiB 24: 18523.9375 MiB 32: 18324.75 MiB 56: 31165.9375 MiB @@ -248,7 +321,14 @@ resnet152: 576: 58588.375 MiB 640: 81354.75 MiB optimized: 128 -resnet152-ddp: {} +resnet152-ddp: + arg: --batch-size +resnet152-ddp-gpus: + arg: --batch-size + model: + 1: 2084.75 MiB + 2: 2122.75 MiB + 4: 2260.75 MiB resnet152-multi: arg: --batch-size model: @@ -294,7 +374,8 @@ resnet50: 1552: 81146.75 MiB 1560: 81590.75 MiB optimized: 64 -resnet50-noio: {} +resnet50-noio: + arg: --batch-size rwkv: arg: --micro_bsz model: @@ -317,9 +398,10 @@ super-slomo: arg: --train_batch_size model: 1: 3016.75 MiB + 2: 3506.75 MiB 4: 5884.375 MiB 8: 10288.75 MiB - 16: 18718.75 MiB + 16: 16914.75 MiB 24: 29777.9375 MiB 32: 33934.375 MiB 56: 61837.9375 MiB @@ -333,11 +415,17 @@ t5: 2: 6384.375 MiB 4: 10620.375 MiB 8: 18684.75 MiB - 16: 35448.75 MiB + 16: 33990.75 MiB 24: 54479.9375 MiB 32: 66760.375 MiB optimized: 128 tf32: {} +torchatari: + arg: --num-steps + model: + 1: 1124.75 MiB + 2: 1138.75 MiB + 4: 1166.75 MiB whisper: arg: --batch-size model: @@ -354,28 +442,3 @@ whisper: 128: 71634.375 MiB 144: 80412.75 MiB optimized: 128 - - -diffusion-gpus: - arg: --batch_size - model: - 1: 23082 MiB - 16: 37778 MiB - 32: 57808 MiB - 48: 80698 MiB - 
optimized: 32 - - -lightning-gpus: - arg: --batch-size - model: - 1: 4542 MiB - 16: 5692 MiB - 128: 15858 MiB - optimized: 16 - -dinov2-giant-gpus: - arg: train.batch_size_per_gpu={batch_size} - model: - 32: 69614 MiB - optimized: 32 diff --git a/constraints/extra/gnn.cuda.txt b/constraints/extra/gnn.cuda.txt new file mode 100644 index 000000000..e5decec56 --- /dev/null +++ b/constraints/extra/gnn.cuda.txt @@ -0,0 +1,4 @@ +--find-links https://data.pyg.org/whl/torch-2.3.0+cu121.html + +torch>=2.3.0,<2.4.0 + diff --git a/constraints/extra/gnn.hpu.txt b/constraints/extra/gnn.hpu.txt new file mode 100644 index 000000000..e69de29bb diff --git a/constraints/extra/gnn.rocm.txt b/constraints/extra/gnn.rocm.txt new file mode 100644 index 000000000..e69de29bb diff --git a/constraints/extra/gnn.xpu.txt b/constraints/extra/gnn.xpu.txt new file mode 100644 index 000000000..e69de29bb diff --git a/constraints/extra/torch.cuda.txt b/constraints/extra/torch.cuda.txt new file mode 100644 index 000000000..aba504237 --- /dev/null +++ b/constraints/extra/torch.cuda.txt @@ -0,0 +1,2 @@ +jax[cuda12] +--find-links https://storage.googleapis.com/jax-releases/jax_cuda_releases.html diff --git a/constraints/extra/torch.hpu.txt b/constraints/extra/torch.hpu.txt new file mode 100644 index 000000000..1d21c1779 --- /dev/null +++ b/constraints/extra/torch.hpu.txt @@ -0,0 +1,5 @@ + +# +# +voir >= 0.2.15 +torchcompat >= 1.0.0 diff --git a/constraints/extra/torch.rocm.txt b/constraints/extra/torch.rocm.txt new file mode 100644 index 000000000..870d923a2 --- /dev/null +++ b/constraints/extra/torch.rocm.txt @@ -0,0 +1 @@ +# No jax only a container for it diff --git a/constraints/extra/torch.xpu.txt b/constraints/extra/torch.xpu.txt new file mode 100644 index 000000000..6b7454cbc --- /dev/null +++ b/constraints/extra/torch.xpu.txt @@ -0,0 +1,20 @@ +--extra-index-url https://download.pytorch.org/whl/cpu +--extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ + +# +# Including a package in a constraints file does not trigger installation of the package. +# +torch +torchvision +torchaudio +intel-extension-for-pytorch +oneccl_bind_pt +intel-extension-for-pytorch-deepspeed + +# for jax as well +intel-extension-for-openxla + +# +# +voir >= 0.2.15 +torchcompat >= 1.0.0 diff --git a/constraints/xpu.txt b/constraints/xpu.txt index 37d21a00a..5aa7739a2 100644 --- a/constraints/xpu.txt +++ b/constraints/xpu.txt @@ -1,16 +1,16 @@ -# --extra-index-url https://download.pytorch.org/whl/cpu +--extra-index-url https://download.pytorch.org/whl/cpu --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ # # Including a package in a constraints file does not trigger installation of the package. # -torch>=2.1.0 -torchvision>=0.16.0a0 -torchaudio>=2.1.0a0 -intel-extension-for-pytorch>=2.1.10+xpu -oneccl_bind_pt==2.1.100+xpu -intel-extension-for-pytorch-deepspeed>=2.1.30 -intel-extension-for-openxla>=0.3.0 +torch +torchvision +torchaudio +intel-extension-for-pytorch +oneccl_bind_pt +intel-extension-for-pytorch-deepspeed +intel-extension-for-openxla # # diff --git a/docs/execution_modes.rst b/docs/execution_modes.rst new file mode 100644 index 000000000..8d40fc44d --- /dev/null +++ b/docs/execution_modes.rst @@ -0,0 +1,93 @@ +Milabench processes overview +============================ + +* milabench main process + * gather metrics from benchmark processes, save them to file + * manages the benchmarks (timeout etc...) 
+ + * if ``per_gpu`` is used, milabench will launch one process per GPU (sets ``CUDA_VISIBLE_DEVICES``) + * each process logs its GPU data + * might spawn a monitor process + * will init pynvml + * dataloader will also spawn process workers + * usually not using GPU + + * if ``njobs`` is used, milabench will launch a single process (torchrun) + * torchrun in turn will spawn one process per GPU + * RANK 0 is used for logging + * RANK 0 might spawn a monitor process + * will init pynvml + * dataloader will also spawn process workers + * usually not using GPU + +Plan +---- + +per_gpu ++++++++ + +``per_gpu``: used for single-GPU benchmarks; spawns one process per GPU, each running the same benchmark + +.. code-block:: yaml + + _torchvision: + inherits: _defaults + definition: ../benchmarks/torchvision + group: torchvision + install_group: torch + plan: + method: per_gpu + +Milabench will essentially execute something akin to the following. + +.. code-block:: bash + + echo "---" + echo "fp16" + echo "====" + time ( + CUDA_VISIBLE_DEVICES=0 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + CUDA_VISIBLE_DEVICES=1 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + CUDA_VISIBLE_DEVICES=2 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + CUDA_VISIBLE_DEVICES=3 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + CUDA_VISIBLE_DEVICES=4 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + CUDA_VISIBLE_DEVICES=5 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + CUDA_VISIBLE_DEVICES=6 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + CUDA_VISIBLE_DEVICES=7 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + wait + ) + +njobs ++++++ + +``njobs``: used to launch a single job that can see all the GPUs. + +.. code-block:: yaml + + _torchvision_ddp: + inherits: _defaults + definition: ../benchmarks/torchvision_ddp + group: torchvision + install_group: torch + plan: + method: njobs + n: 1 + +Milabench will essentially execute something akin to the following. + +.. 
code-block:: bash + + echo "---" + echo "lightning-gpus" + echo "==============" + time ( + $BASE/venv/torch/bin/benchrun --nnodes=1 --rdzv-backend=c10d --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --nproc-per-node=8 --no-python -- python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 16 & + wait + ) + + + + + + + diff --git a/extra/torch_ppo_atari_envpool/mark_torch_ppo_atari_envpool b/extra/torch_ppo_atari_envpool/mark_torch_ppo_atari_envpool new file mode 100644 index 000000000..e69de29bb diff --git a/milabench/_version.py b/milabench/_version.py index d24b7975c..6a9a689f2 100644 --- a/milabench/_version.py +++ b/milabench/_version.py @@ -1,5 +1,5 @@ """This file is generated, do not modify""" -__tag__ = "v0.1.0-34-g93521fd7" -__commit__ = "93521fd70a02719076f64253ac4ae3b4a444c739" -__date__ = "2024-08-22 19:02:01 +0000" +__tag__ = "v0.1.0-51-g3d185d1" +__commit__ = "3d185d15af22876b0dece6f296e179754b316a26" +__date__ = "2024-08-28 11:52:25 -0400" diff --git a/milabench/cli/__init__.py b/milabench/cli/__init__.py index 205942e47..5a1f122c5 100644 --- a/milabench/cli/__init__.py +++ b/milabench/cli/__init__.py @@ -21,7 +21,8 @@ from .summary import cli_summary from .resolve import cli_resolve from .new import cli_new - +from .env import cli_env +from .prepare_run import cli_prepare_run class Main: def new(): @@ -94,6 +95,13 @@ def matrix(): def resolve(): return cli_resolve() + + def env(): + """Print milabench environment variables""" + cli_env() + + def prepare_run(): + cli_prepare_run() def main(argv=None): diff --git a/milabench/cli/dry.py b/milabench/cli/dry.py index 80d55d6ea..010269223 100644 --- a/milabench/cli/dry.py +++ b/milabench/cli/dry.py @@ -169,7 +169,7 @@ def multipack_args(conf: Arguments): "ip": f"192.168.0.{i + 10}" if i != 0 else "127.0.0.1", "user": "username", "main": i == 0, - "port": 22, + "sshport": 22, } for i in range(conf.nnodes) ], diff --git a/milabench/cli/env.py b/milabench/cli/env.py new file mode 100644 index 000000000..3725aa9df --- /dev/null +++ b/milabench/cli/env.py @@ -0,0 +1,27 @@ + + +from milabench.system import _global_options, as_environment_variable, SystemConfig + + +from dataclasses import asdict + + +def cli_env(): + _ = SystemConfig() + + # import yaml + # print(yaml.dump(asdict(_))) + + for k, option in _global_options.items(): + env_name = as_environment_variable(k) + value = option["value"] + default = option["default"] + + if value is None or value == default: + print("# ", end="") + + print(f"export {env_name}={value}") + + +if __name__ == "__main__": + cli_env() diff --git a/milabench/cli/install.py b/milabench/cli/install.py index 00977aea3..10d33a1da 100644 --- a/milabench/cli/install.py +++ b/milabench/cli/install.py @@ -12,8 +12,10 @@ @dataclass class Arguments: force: bool = False + update: bool = False shorttrace: bool = False variant: str = None + # fmt: on @@ -22,13 +24,16 @@ def arguments(): # Force install force: Option & bool = False + # Update package + update: Option & bool = False + # On error show full stacktrace shorttrace: Option & bool = False # Install variant variant: Option & str = None - return Arguments(force, shorttrace, variant) + return Arguments(force, update, shorttrace, variant) @tooled @@ -39,10 +44,13 @@ def cli_install(args=None): overrides = {"*": {"install_variant": args.variant}} if args.variant else {} - if args.force: - mp = 
get_multipack(run_name="install.{time}", overrides=overrides) - for pack in mp.packs.values(): + + mp = get_multipack(run_name="install.{time}", overrides=overrides) + for pack in mp.packs.values(): + if args.force or args.update: pack.install_mark_file.rm() + + if args.force: pack.dirs.venv.rm() mp = get_multipack(run_name="install.{time}", overrides=overrides) diff --git a/milabench/cli/prepare_run.py b/milabench/cli/prepare_run.py new file mode 100644 index 000000000..58b5fe559 --- /dev/null +++ b/milabench/cli/prepare_run.py @@ -0,0 +1,15 @@ +from coleo import tooled + +from .prepare import cli_prepare +from .run import cli_run + +@tooled +def cli_prepare_run(args=None): + """Prepare a benchmark: download datasets, weights etc.""" + + rc = cli_prepare() + + if rc == 0: + rc = cli_run() + + return rc diff --git a/milabench/cli/slurm.py b/milabench/cli/slurm.py index db68dbf0e..35f1fe94e 100644 --- a/milabench/cli/slurm.py +++ b/milabench/cli/slurm.py @@ -1,23 +1,25 @@ import getpass import os - +import socket +import subprocess from coleo import tooled -from ..system import get_gpu_capacity +from ..system import get_gpu_capacity, is_loopback, resolve_hostname, gethostname -@tooled -def cli_slurm_system(): - """Generate a system file based of slurm environment variables""" - node_list = expand_node_list(os.getenv("SLURM_JOB_NODELIST", "")) +def make_node_list_from_slurm(node_list): def make_node(i, ip): + hostname, local = resolve_hostname(ip) + node = { "name": ip, - "ip": ip, + "ip": hostname, + "hostname": gethostname(ip), "user": getpass.getuser(), - "main": i == 0, + "main": local, + "sshport": 22, } if i == 0: @@ -26,9 +28,46 @@ def make_node(i, ip): return node # nvidia-smi --query-gpu=memory.total --format=csv + + nodes = [make_node(i, ip) for i, ip in enumerate(node_list)] + + # ensure there is a main node + # either it is the local node or the first node + for node in nodes: + if node.get("main", False): + break + else: + nodes[0]["main"] = True + + return nodes + + +@tooled +def cli_slurm_system(): + """Generate a system file based on slurm environment variables""" + + node_list = expand_node_list(os.getenv("SLURM_JOB_NODELIST", "")) + + if len(node_list) > 0: + nodes = make_node_list_from_slurm(node_list) + else: + self = socket.gethostname() + nodes = [{ + "name": self, + "ip": self, + "hostname": self, + "user": getpass.getuser(), + "main": True, + "sshport": 22, + }] + + + from milabench.system import resolve_addresses + resolve_addresses(nodes) + system = { "arch": "cuda", - "nodes": [make_node(i, ip) for i, ip in enumerate(node_list)], + "nodes": nodes, } capacity = get_gpu_capacity() diff --git a/milabench/commands/__init__.py b/milabench/commands/__init__.py index bee42baf5..e6d3639d5 100644 --- a/milabench/commands/__init__.py +++ b/milabench/commands/__init__.py @@ -456,7 +456,7 @@ def is_local(self): if localnode is not None: return (False # The ip belongs to the local node - or self.host in localnode["ipaddrlist"] + or self.host in localnode.get("ipaddrlist", []) # The hostname is the local node or self.host == localnode["hostname"] ) @@ -485,7 +485,7 @@ def _argv(self, **kwargs) -> List: argv.append(f"-i{key}") argv.append(host) - return argv + return argv # + ["env", "-i"] class SCPCommand(SSHCommand, CmdCommand): @@ -577,16 +577,24 @@ def _argv(self, **kwargs): return [] +def node_address(node): + """Favour the hostname as it is the most consistent name across machines""" + host = node.get("hostname") + ip = node.get("ip") + return host or ip + + class 
class ForeachNode(ListCommand): def __init__(self, executor: Command, **kwargs) -> None: super().__init__(None, **kwargs) self.options.update(kwargs) self.executor = executor + self.base_tags = self.executor.pack.config["tag"] def make_new_node_pack(self, rank, node, base) -> "BasePackage": """Make a new environment/config for the run""" config = base.pack.config - tags = [*config["tag"], node["name"]] + tags = [*self.base_tags, node["name"]] # Workers do not send training data # tag it as such so validation can ignore this pack @@ -630,10 +638,10 @@ def executors(self): ) worker = SSHCommand( - host=node["ip"], + host=node_address(node), user=node["user"], key=key, - port=node.get("port", 22), + port=node.get("sshport", 22), executor=self.make_new_node_executor(rank, node, self.executor), **options ) @@ -653,31 +661,43 @@ def copy(self, pack): class TorchrunAllNodes(ForeachNode): """executes torchrun on multiple machines""" - def __init__(self, executor: Command, **kwargs) -> None: + @staticmethod + def make_base_executor(cls, executor, *args, **kwargs): config = executor.pack.config max_num = config.get("num_machines", 1) - self.nodes = select_nodes(config["system"]["nodes"], max_num) + nodes = select_nodes(config["system"]["nodes"], max_num) - main = self.nodes[0] + main = nodes[0] # node[port] is for SSH - main_host = main["ip"] + main_host = node_address(main) # add them as option so we could tweak them if necessary main_port = option("torchrun.port", int, default=29400) backend = option("torchrun.backend", str, default="c10d") main_addr = f"{main_host}:{main_port}" + + config = executor.pack.config + + return cls( executor, - f"--nnodes={len(self.nodes)}", + f"--nnodes={len(nodes)}", f"--rdzv-backend={backend}", f"--rdzv-endpoint={main_addr}", - f"--master-addr={main_host}", - f"--master-port={main_port}", + # f"--master-addr={main_host}", + # f"--master-port={main_port}", + *args, **kwargs ) + def __init__(self, executor: Command, *args, **kwargs) -> None: + base_exec = TorchrunAllNodes.make_base_executor( + TorchrunAllGPU, + executor, + *args, + **kwargs ) super().__init__(base_exec) @@ -852,7 +872,7 @@ def __init__(self, pack: pack.BasePackage, **kwargs): super().__init__(pack, **kwargs) def _argv(self, **_) -> List: - return [f"{self.pack.dirs.code / 'activator'}", f"{self.pack.dirs.venv}"] + return [activator_script(), f"{self.pack.dirs.venv}", f"{self.pack.dirs.cache}"] @@ -874,9 +894,10 @@ def make_new_node_executor(self, rank, node, base): config = base.pack.config pack = self.make_new_node_pack(rank, node, base) - + executor = base.copy(pack) + return DockerRunCommand( - AccelerateLaunchCommand(pack, rank=rank), + AccelerateLaunchCommand(executor, rank=rank, **self.options), config["system"].get("docker_image"), ) @@ -948,6 +969,8 @@ def _argv(self, **_) -> List: deepspeed_argv = [] cpu_per_process = self.pack.resolve_argument('--cpus_per_gpu', 4) + main_port = option("torchrun.port", int, default=29400) + return [ # -- Run the command in the right venv # This could be inside the SSH Command @@ -956,6 +979,7 @@ def _argv(self, **_) -> List: # inside a specifc venv activator_script(), f"{self.pack.dirs.venv}", + f"{self.pack.dirs.cache}", # -- "accelerate", "launch", @@ -967,7 +991,7 @@ def _argv(self, **_) -> List: f"--gradient_accumulation_steps={self.pack.config.get('gradient_accumulation_steps', 1)}", f"--num_cpu_threads_per_process={cpu_per_process}", f"--main_process_ip={manager['ip']}", - f"--main_process_port={manager['port']}",
f"--main_process_port={main_port}", f"--num_processes={nproc}", *self.accelerate_argv, ] diff --git a/milabench/common.py b/milabench/common.py index 5849e05fe..135e45545 100644 --- a/milabench/common.py +++ b/milabench/common.py @@ -141,7 +141,7 @@ def get_base_defaults(base, arch="none", run_name="none"): { "name": "local", "ip": "127.0.0.1", - "port": 8123, + "sshport": 22, "user": user, "main": True, } diff --git a/milabench/multi.py b/milabench/multi.py index b09eeecca..f734e40d5 100644 --- a/milabench/multi.py +++ b/milabench/multi.py @@ -83,6 +83,23 @@ def make_execution_plan(pack, step=0, repeat=1): return exec_plan +async def copy_base_to_workers(setup): + # Note: when we use docker we do not need to install + # so this should be ignored + if is_main_local(setup) and is_multinode(setup): + print("Coping main setup from this node to worker") + # copy the main setup to the workers + # so it copies the bench venv already, no need for python + from milabench.remote import copy_folder + from milabench.system import SystemConfig + + # we copy the entire content of base + # FIXME: handle custom (venv, cache, data, etc...) directories + # + copy_plan = copy_folder(setup, SystemConfig().base) + remote_task = asyncio.create_task(copy_plan.execute()) + await asyncio.wait([remote_task]) + class MultiPackage: def __init__(self, packs): @@ -140,6 +157,7 @@ async def do_install(self): remote_task = None if is_remote(setup): + print("Current node is outside of our system") # We are outside system, setup the main node first remote_plan = milabench_remote_install(setup, setup_for="main") remote_task = asyncio.create_task(remote_plan.execute()) @@ -148,15 +166,18 @@ async def do_install(self): # We do not install benchmarks on that node return - elif is_main_local(setup) and is_multinode(setup): - # We are the main node, setup workers - remote_plan = milabench_remote_install(setup, setup_for="worker") - remote_task = asyncio.create_task(remote_plan.execute()) + # elif is_main_local(setup) and is_multinode(setup): + # # this was executing install on the remote node but then it needed python to be available + # # We are the main node, setup workers + # remote_plan = milabench_remote_install(setup, setup_for="worker") + # remote_task = asyncio.create_task(remote_plan.execute()) # do the installation step with phase_lock("install"): await self.do_phase("install", remote_task, "checked_install") + await copy_base_to_workers(setup) + async def do_prepare(self): setup = self.setup_pack() remote_task = None @@ -168,13 +189,17 @@ async def do_prepare(self): return - elif is_main_local(setup) and is_multinode(setup): - remote_plan = milabench_remote_prepare(setup, run_for="worker") - remote_task = asyncio.create_task(remote_plan.execute()) + # elif is_main_local(setup) and is_multinode(setup): + # remote_plan = milabench_remote_prepare(setup, run_for="worker") + # remote_task = asyncio.create_task(remote_plan.execute()) with phase_lock("prepare"): await self.do_phase("prepare", remote_task, "prepare") + # Prepare is done on the main node + # copy the result there + await copy_base_to_workers(setup) + async def do_run(self, repeat=1): setup = self.setup_pack() @@ -207,7 +232,7 @@ async def do_run(self, repeat=1): await pack.message_error(exc) async def do_pin( - self, pip_compile_args, constraints: list = tuple(), from_scratch=False + self, pip_compile_args, constraints: list = tuple(), from_scratch=False, requirements: list = tuple() ): groups = defaultdict(dict) for pack in self.packs.values(): @@ -215,11 
@@ -215,11 +240,13 @@ async def do_pin( igrp = pack.config["install_group"] ivar = pack.config["install_variant"] ivar_constraints: XPath = here.parent / "constraints" / f"{ivar}.txt" + base_reqs = pack.requirements_map().keys() if ivar_constraints.exists(): constraints = {ivar_constraints, *constraints} - groups[igrp].update({req: pack for req in base_reqs}) + groups[igrp].update({req: pack for req in base_reqs}) + for constraint in constraints: print("Using constraint file:", constraint) @@ -231,19 +258,28 @@ async def do_pin( for ig, (reqs, packs) in groups.items(): if len(packs) < len(reqs): if len(set(p.config["group"] for p in packs)) > 1: - raise Exception( - f"Install group '{ig}' contains benchmarks that have more than" + print( + f"WARNING: Install group '{ig}' contains benchmarks that have more than" " one requirements file. Please isolate such benchmarks in their" " own install_group." ) for ig, (reqs, packs) in groups.items(): packs = list(packs) + pack0 = packs[0] + + ivar = pack0.config["install_variant"] + ivar_requirements: XPath = here.parent / "constraints" / "extra" / f"{ig}.{ivar}.txt" + + if ivar_requirements.exists(): + reqs.add(ivar_requirements) + if len(packs) == 1: (pack,) = packs await pack.pin( pip_compile_args=pip_compile_args, constraints=constraints, + requirements=requirements ) else: pack0 = packs[0] @@ -253,7 +289,7 @@ async def do_pin( constraint_path = pindir / "tmp-constraints.txt" constraint_files = make_constraints_file( - constraint_path, constraints, str(here.parent) + constraint_path, constraints, str(here.parent), requirements=requirements ) ig_constraint_path = pindir / f"constraints-{ivar}-{ig}.txt" @@ -278,6 +314,7 @@ async def do_pin( pip_compile_args=pip_compile_args, constraints=new_constraints, working_dir=here.parent, + requirements=requirements ) async def count_runs(self, repeat): diff --git a/milabench/pack.py b/milabench/pack.py index 60a5df2f7..b557b7729 100644 --- a/milabench/pack.py +++ b/milabench/pack.py @@ -398,6 +398,7 @@ async def pin( input_files: Sequence = tuple(), constraints: Sequence = tuple(), working_dir=None, + requirements: Sequence = tuple(), ): """Pin versions to requirements file.
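
The requirements plumbed through do_pin and pin end up in the generated tmp-constraints file next to the constraint entries: "-c" lines for constraint files and "-r" lines for the new extra requirements, both prefixed with ../ because the file lives under ./.pin/. A minimal sketch of that writing step (paths are illustrative; the real helper also relativizes them against the working directory):

    from pathlib import Path

    def write_constraints(pth: Path, constraints, requirements) -> None:
        # mirrors the "-c ../..." / "-r ../..." layout of make_constraints_file
        pth.parent.mkdir(parents=True, exist_ok=True)
        with open(pth, "w") as tfile:
            tfile.write("\n".join(f"-c ../{c}" for c in constraints) + "\n")
            tfile.write("\n".join(f"-r ../{r}" for r in requirements) + "\n")

    write_constraints(
        Path(".pin/tmp-constraints.txt"),
        ["constraints/cuda.txt"],
        ["constraints/extra/gnn.cuda.txt"],
    )
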
@@ -407,6 +408,9 @@ async def pin( input_files: A list of inputs to piptools compile constraint: The constraint file """ + if working_dir is None: + working_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) + ivar = self.config.get("install_variant", None) if ivar == "unpinned": @@ -426,7 +430,10 @@ async def pin( grp = self.config["group"] constraint_path = XPath(".pin") / f"tmp-constraints-{ivar}-{grp}.txt" constraint_files = make_constraints_file( - constraint_path, constraints, working_dir + constraint_path, + constraints, + working_dir, + requirements=requirements, ) current_input_files = constraint_files + (base_reqs, *input_files) diff --git a/milabench/remote.py b/milabench/remote.py index bf5963183..7e1eef85c 100644 --- a/milabench/remote.py +++ b/milabench/remote.py @@ -70,9 +70,48 @@ def milabench_remote_sync(pack, worker): def should_run_for(worker, setup_for): if setup_for == "worker": - return not worker["main"] + return not worker.get("main", False) + + return worker.get("main", False) + + +def worker_commands(pack, worker_plan, setup_for="worker"): + nodes = pack.config["system"]["nodes"] + copy = [] + node_packs = [] + + for node in nodes: + node_pack = None + + if should_run_for(node, setup_for): + node_pack = worker_pack(pack, node) + + cmds = worker_plan(node_pack, node) + + if not isinstance(cmds, list): + cmds = [cmds] + copy.extend(cmds) + + node_packs.append(node_pack) + + return ListCommand(*copy) + + +def sshnode(node, cmd): + host = node["ip"] + user = node["user"] + port = node.get("sshport", 22) + return SSHCommand(cmd, user=user, host=host, port=port) + + +def copy_folder(pack, folder, setup_for="worker"): + def copy_to_worker(nodepack, node): + return [ + sshnode(node, CmdCommand(nodepack, "mkdir", "-p", folder)), + CmdCommand(nodepack, *rsync(node, folder)) + ] + return worker_commands(pack, copy_to_worker, setup_for=setup_for) - return worker["main"] def milabench_remote_setup_plan(pack, setup_for="worker") -> SequenceCommand: @@ -87,22 +126,16 @@ def milabench_remote_setup_plan(pack, setup_for="worker") -> SequenceCommand: copy = [] node_packs = [] - for node in nodes: - node_pack = None - - if should_run_for(node, setup_for): - node_pack = worker_pack(pack, node) - copy.append(CmdCommand(node_pack, *rsync(node, INSTALL_FOLDER))) - - node_packs.append(node_pack) + copy_source = copy_folder(pack, INSTALL_FOLDER, setup_for) install = [] + for i, node in enumerate(nodes): if should_run_for(node, setup_for): install.append(pip_install_milabench(node_packs[i], node, INSTALL_FOLDER)) return SequenceCommand( - ListCommand(*copy), + copy_source, ListCommand(*install), ) @@ -146,7 +179,7 @@ def is_multinode(pack): count = 0 nodes = pack.config["system"]["nodes"] for node in nodes: - if not node["main"]: + if not node.get("main", False): count += 1 return count > 0 @@ -159,12 +192,12 @@ def is_remote(pack): def is_main_local(pack): """Only the local main can send remote commands to remote""" self = pack.config["system"]["self"] - return self is not None and self["local"] and self["main"] + return self is not None and self["local"] and self.get("main", False) def is_worker(pack): self = pack.config["system"]["self"] - return self is not None and (not self["main"]) + return self is not None and (not self.get("main", False)) def _sanity(pack, setup_for):
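
The activator wrapper that these remote commands go through now takes a cache directory as its second argument and exports it as XDG_CACHE_HOME before entering the venv, as the next hunk shows. A rough Python equivalent of the wrapper's behaviour (the real script is bash; the venv activation is approximated here with VIRTUAL_ENV/PATH):

    # Approximate behaviour of milabench/scripts/activator:
    # argv is <venv> <cache> <command...>; the command replaces this process.
    import os

    def activator(argv: list) -> None:
        venv, cache, *cmd = argv
        os.environ["XDG_CACHE_HOME"] = cache
        os.environ["VIRTUAL_ENV"] = venv
        os.environ["PATH"] = f"{venv}/bin:" + os.environ.get("PATH", "")
        os.execvp(cmd[0], cmd)  # like `exec "$@"` after activation
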
+cache="$1" +shift + +echo "$cache" +export XDG_CACHE_HOME=$cache + source "$venv"/bin/activate exec "$@" diff --git a/milabench/sizer.py b/milabench/sizer.py index 2ae877213..b3fa40478 100644 --- a/milabench/sizer.py +++ b/milabench/sizer.py @@ -261,20 +261,37 @@ def on_start(self, entry): self.max_usage = float("-inf") config = self.memory.setdefault(self.benchname, dict()) - scalingarg = config.get("arg", None) + template = config.get("arg", None) - if scalingarg is None: + if template is None: self.benchname = None return + + placeholder = "{batch_size}" + argstart = template.replace(placeholder, "") + is_template = False found = None for i, arg in enumerate(argv): - if arg.endswith(scalingarg): + if arg.endswith(template): + found = i + break + + # + if arg.startswith(argstart): found = i + is_template = True break if found: - self.batch_size = int(argv[found + 1]) + if is_template: + arg = argv[found] + value = arg.replace(argstart, "") + self.batch_size = int(value) + else: + self.batch_size = int(argv[found + 1]) + else: + print("Count not find batch_size argument") def on_data(self, entry): if self.filepath is None: @@ -331,6 +348,23 @@ def report(self, *args): yaml.dump(newdata, file) +def arch_to_device(arch): + device_types = [ + "cpu", + "cuda", + "ipu", + "xpu", + "mkldnn", + "opengl", "opencl", "ideep", "hip", "ve", + "fpga", "maia", "xla", "lazy", "vulkan", "mps", "meta", + "hpu", "mtia", "privateuseone" + ] + arch_to_device = {t:t for t in device_types} + arch_to_device["rocm"] = "cuda" + return arch_to_device.get(arch, "cpu") + + + def new_argument_resolver(pack): system_config = system_global.get() if system_config is None: @@ -339,16 +373,17 @@ def new_argument_resolver(pack): context = deepcopy(system_config) arch = context.get("arch", "cpu") + device_count_used = 1 + device_count_system = len(get_gpu_info()["gpus"]) if hasattr(pack, "config"): - device_count = len(pack.config.get("devices", [0])) - else: - device_count = len(get_gpu_info()["gpus"]) + device_count_used = len(pack.config.get("devices", [0])) + + if device_count_used <= 0: + device_count_used = 1 ccl = {"hpu": "hccl", "cuda": "nccl", "rocm": "rccl", "xpu": "ccl", "cpu": "gloo"} - if device_count <= 0: - device_count = 1 cpu_opt = CPUOptions() def auto(value, default): @@ -363,13 +398,14 @@ def clamp(x, mn=cpu_opt.cpu_min, mx=cpu_opt.cpu_max): total_available = total_cpu - cpu_opt.reserved_cores context["cpu_count"] = total_available - context["cpu_per_gpu"] = total_available // device_count + context["cpu_per_gpu"] = total_available // max(device_count_system, 1) context["n_worker"] = clamp(context["cpu_per_gpu"]) if cpu_opt.n_workers is not None: context["n_worker"] = cpu_opt.n_workers context["arch"] = arch + context["device_name"] = arch_to_device(arch) context["ccl"] = ccl.get(arch, "gloo") context["milabench_base"] = option("base", str, default="") @@ -381,6 +417,7 @@ def clamp(x, mn=cpu_opt.cpu_min, mx=cpu_opt.cpu_max): context["milabench_runs"] = dirs.get('runs', "") context["milabench_cache"] = dirs.get('cache', "") context["milabench_name"] = pack.config.get("name", None) + context["benchmark_folder"] = pack.config.get('definition', None) def auto_eval(arg): newvalue = str(arg).format(**context) diff --git a/milabench/system.py b/milabench/system.py index 7db61e5ea..d29f4cd27 100644 --- a/milabench/system.py +++ b/milabench/system.py @@ -3,7 +3,10 @@ import socket from dataclasses import dataclass, field import sys +import subprocess from contextlib import contextmanager +import ipaddress + 
diff --git a/milabench/system.py b/milabench/system.py index 7db61e5ea..d29f4cd27 100644 --- a/milabench/system.py +++ b/milabench/system.py @@ -3,7 +3,10 @@ import socket from dataclasses import dataclass, field import sys +import subprocess from contextlib import contextmanager +import ipaddress + import psutil import yaml from voir.instruments.gpu import get_gpu_info @@ -14,6 +17,21 @@ system_global = contextvars.ContextVar("system", default=None) +def get_gpu_capacity(strict=False): + try: + capacity = 1e24 + + for k, v in get_gpu_info()["gpus"].items(): + capacity = min(v["memory"]["total"], capacity) + + return int(capacity) + except: + print("GPU not available, defaulting to 0 MiB") + if strict: + raise + return 0 + + def getenv(name, expected_type): value = os.getenv(name) @@ -66,8 +84,6 @@ def option(name, etype, default=None): system = system_global.get() if system: options = system.get("options", dict()) - else: - warn_no_config() frags = name.split(".") env_name = as_environment_variable(name) @@ -124,7 +140,7 @@ class SizerOptions: optimized: bool = defaultfield("sizer.optimized", int) # Set a target VRAM capacity to use - capacity: str = defaultfield("sizer.capacity", str) + capacity: str = defaultfield("sizer.capacity", str, None) # Save the batch size, VRM usage data to a scaling file save: str = defaultfield("sizer.save", str, None) @@ -177,17 +193,17 @@ class Torchrun: @dataclass class Options: - sizer: SizerOptions - cpu: CPUOptions - dataset: DatasetConfig - dirs: Dirs - torchrun: Torchrun + sizer: SizerOptions = SizerOptions() + cpu: CPUOptions = CPUOptions() + dataset: DatasetConfig = DatasetConfig() + dirs: Dirs = Dirs() + torchrun: Torchrun = Torchrun() @dataclass class GPUConfig: arch: str = defaultfield("gpu.arch", str, None) - capacity: str = None + capacity: str = defaultfield("gpu.capacity", str, str(get_gpu_capacity())) @dataclass @@ -204,21 +220,29 @@ class Github: pat: str = defaultfield("github.path", str, None) +def default_device(): + try: + gpu_info = get_gpu_info() + return gpu_info["arch"] + except: + return "cpu" + + @dataclass class SystemConfig: """This is meant to be an exhaustive list of all the environment overrides""" - arch: str = defaultfield("gpu.arch", str, None) - sshkey: str = None + arch: str = defaultfield("gpu.arch", str, default_device()) + sshkey: str = defaultfield("ssh", str, "~/.ssh/id_rsa") docker_image: str = None nodes: list[Nodes] = field(default_factory=list) - gpu: GPUConfig = None - options: Options = None + gpu: GPUConfig = GPUConfig() + options: Options = Options() base: str = defaultfield("base", str, None) config: str = defaultfield("config", str, None) dash: bool = defaultfield("dash", bool, 1) noterm: bool = defaultfield("noterm", bool, 0) - github: Github = None + github: Github = Github() def check_node_config(nodes): @@ -249,6 +273,18 @@ def get_remote_ip(): return set(result) +def is_loopback(address: str) -> bool: + try: + # Create an IP address object + ip = ipaddress.ip_address(address) + # Check if the address is a loopback address + return ip.is_loopback + except ValueError: + # If the address is invalid, return False + return False + + + def _resolve_ip(ip): hostname = ip aliaslist = [] @@ -304,7 +340,7 @@ def enable_offline(enabled): offline = old -def resolve_addresses(nodes): +def _resolve_addresses(nodes): # Note: it is possible for self to be none # if we are running milabench on a node that is not part of the system # in that case it should still work; the local is then going to @@ -327,12 +363,14 @@ or (hostname in ("localhost", socket.gethostname(), "127.0.0.1")) or (socket.gethostname().startswith(hostname)) or len(ip_list.intersection(ipaddrlist)) > 0 + or any([is_loopback(ip) for ip in ipaddrlist]) ) + # cn-g005 cn-g005.server.mila.quebec # print(hostname,
socket.gethostname()) node["local"] = is_local - if is_local: + if is_local and self is None: self = node node["ipaddrlist"] = list(set(list(ip_list) + list(ipaddrlist))) @@ -345,19 +383,64 @@ def resolve_addresses(nodes): return self -def get_gpu_capacity(strict=False): +def gethostname(host): try: - capacity = 0 + # "-oCheckHostIP=no", + # "-oPasswordAuthentication=no", + return subprocess.check_output([ + "ssh", + "-oCheckHostIP=no", + "-oPasswordAuthentication=no", + "-oStrictHostKeyChecking=no", host, "cat", "/etc/hostname"], text=True).strip() + except: + print("Could not resolve hostname") + return host - for k, v in get_gpu_info()["gpus"].items(): - capacity = min(v["memory"]["total"], capacity) - return int(capacity) +def resolve_hostname(ip): + try: + hostname, _, iplist = socket.gethostbyaddr(ip) + + for ip in iplist: + if is_loopback(ip): + return hostname, True + + return hostname, hostname == socket.gethostname() + except: - print("GPU not available, defaulting to 0 MiB") - if strict: - raise - return 0 + if offline: + return ip, False + + raise + +def resolve_node_address(node): + hostname, local = resolve_hostname(node["ip"]) + + node["hostname"] = hostname + node["local"] = local + + if local: + # `gethostbyaddr` returns `cn-d003` but we want `cn-d003.server.mila.quebec` + # else torchrun does not recognize the main node + node["hostname"] = socket.gethostname() + + return local + + +def resolve_addresses(nodes): + if offline: + for n in nodes: + n["hostname"] = n["ip"] + + return nodes[0] + + self = None + + for node in nodes: + if resolve_node_address(node): + self = node + + return self def build_system_config(config_file, defaults=None, gpu=True): diff --git a/milabench/utils.py b/milabench/utils.py index 2e732200d..8495d117e 100644 --- a/milabench/utils.py +++ b/milabench/utils.py @@ -114,7 +114,7 @@ def relativize(pth, working_dir): return pth -def make_constraints_file(pth, constraints, working_dir): +def make_constraints_file(pth, constraints, working_dir, requirements=tuple()): if constraints: constraint_file = XPath(working_dir) / XPath(pth) os.makedirs(constraint_file.parent, exist_ok=True) @@ -122,7 +122,10 @@ def make_constraints_file(pth, constraints, working_dir): # We prefix the constraint with ../ because we are creating a constraint # file in ./.pin/,but containing constraints with paths relative to ./ tfile.write( - "\n".join([f"-c ../{relativize(c, working_dir)}" for c in constraints]) + "\n".join([f"-c ../{relativize(c, working_dir)}" for c in constraints]) + "\n" + ) + tfile.write( + "\n".join([f"-r ../{relativize(r, working_dir)}" for r in requirements]) + "\n" ) return (constraint_file,) else: @@ -231,7 +234,7 @@ def select_nodes(nodes, n): ranked = [] for node in nodes: - if node["main"]: + if node.get("main", False): ranked.insert(0, node) else: ranked.append(node) @@ -242,7 +245,7 @@ def select_nodes(nodes, n): def enumerate_rank(nodes): rank = 1 for node in nodes: - if node["main"]: + if node.get("main", False): yield 0, node else: yield rank, node diff --git a/poetry.lock b/poetry.lock index 037d00a5c..1276e7e8f 100644 --- a/poetry.lock +++ b/poetry.lock @@ -682,17 +682,17 @@ test = ["objgraph", "psutil"] [[package]] name = "hrepr" -version = "0.4.1" +version = "0.7.3" description = "Extensible HTML representation for Python objects." 
optional = false -python-versions = ">=3.6,<4.0" +python-versions = ">=3.9" files = [ - {file = "hrepr-0.4.1-py3-none-any.whl", hash = "sha256:b1a010a8be820cbc2aba41863831985001319961cb303a59134472ec9df5972a"}, - {file = "hrepr-0.4.1.tar.gz", hash = "sha256:52c2d379c08992f236a5004a8cb86716f7ccf8fb367b043af3704ffc97d04bc4"}, + {file = "hrepr-0.7.3-py3-none-any.whl", hash = "sha256:ad6ce531ee97ed280d79a3235a3b67008ecd4cdd921941c097ce1fbb8912ffd1"}, + {file = "hrepr-0.7.3.tar.gz", hash = "sha256:9b0f8480d0bec912dd16b8f06d7008c9bfd9408508df81465703aab4c35024a8"}, ] [package.dependencies] -ovld = ">=0.3.2,<0.4.0" +ovld = ">=0.3.6,<0.4.0" [[package]] name = "idna" @@ -1026,13 +1026,13 @@ PyYAML = ">=5.1.0" [[package]] name = "ovld" -version = "0.3.5" +version = "0.3.9" description = "Overloading Python functions" optional = false -python-versions = "<4.0,>=3.8" +python-versions = ">=3.8" files = [ - {file = "ovld-0.3.5-py3-none-any.whl", hash = "sha256:d36604a9ff7202d5639ebefd6ff97955ce5b04ffff0c7f0ade6ddc3189ca9846"}, - {file = "ovld-0.3.5.tar.gz", hash = "sha256:838358bc800d5bf3a66afcd6d59f0826eda7a598f48f885a9c8662169ef29813"}, + {file = "ovld-0.3.9-py3-none-any.whl", hash = "sha256:41c9c6555dc7749f71a020dcbc335dd834585876bfbb09d27fd9a5be40bb6e57"}, + {file = "ovld-0.3.9.tar.gz", hash = "sha256:ef7eda584f62266fb3260345a91f0d888b938652fc790f3a95b349237e262f0b"}, ] [[package]] @@ -1450,17 +1450,6 @@ snappy = ["python-snappy"] test = ["pytest (>=7)"] zstd = ["zstandard"] -[[package]] -name = "pynvml" -version = "11.5.3" -description = "Python utilities for the NVIDIA Management Library" -optional = false -python-versions = ">=3.6" -files = [ - {file = "pynvml-11.5.3-py3-none-any.whl", hash = "sha256:a5fba3ab14febda50d19dbda012ef62ae0aed45b7ccc07af0bc5be79223e450c"}, - {file = "pynvml-11.5.3.tar.gz", hash = "sha256:183d223ae487e5f00402d8da06c68c978ef8a9295793ee75559839c6ade7b229"}, -] - [[package]] name = "pyproject-hooks" version = "1.1.0" @@ -2201,4 +2190,4 @@ test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools", [metadata] lock-version = "2.0" python-versions = ">=3.10,<4.0" -content-hash = "c7d08e99853b23573817ead28e8e40883529dd95f0646a35ed1eed96daf4e2b9" +content-hash = "b0283769e6ab814b9c62b13d6dc68f01dbc27156b8d0cb0f03f1490aaaf384e6" diff --git a/pyproject.toml b/pyproject.toml index 802ce02ff..0d4a6d62d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,14 +20,11 @@ requests = "^2.26.0" nox = "^2021.10.1" GitPython = "^3.1.24" PyYAML = "^6.0" -ovld = "^0.3.2" -hrepr = "^0.4.0" blessed = "^1.19.1" pathspec = "^0.9.0" cp-template = "^0.3.0" pandas = ">=1.4.2" numpy = ">=1.23.0,<2.0.0" -pynvml = "^11.4.1" tqdm = "^4.64.1" pip-tools = "^7.4.1" rich = "^13.3.2" @@ -39,6 +36,7 @@ py-cpuinfo = "^9.0.0" psutil = "^5.9.5" importlib-resources = "^6.1.0" filelock = "^3.15.3" +hrepr = ">=0.7.0" [tool.poetry.group.dev.dependencies] black = ">=21.10b0" diff --git a/scripts/article/run_cuda.sh b/scripts/article/run_cuda.sh index 405d7e3fd..c8c151d80 100644 --- a/scripts/article/run_cuda.sh +++ b/scripts/article/run_cuda.sh @@ -31,7 +31,7 @@ install_prepare() { if [ -z "${MILABENCH_SOURCE}" ]; then if [ ! 
-d "$MILABENCH_WORDIR/milabench" ]; then - git clone https://github.com/mila-iqia/milabench.git + git clone https://github.com/mila-iqia/milabench.git -b staging fi export MILABENCH_SOURCE="$MILABENCH_WORDIR/milabench" fi @@ -40,10 +40,12 @@ install_prepare() { pip install -e $MILABENCH_SOURCE + milabench slurm_system > $MILABENCH_WORDIR/system.yaml + # # Install milabench's benchmarks in their venv # - milabench install $ARGS + milabench install --system $MILABENCH_WORDIR/system.yaml $ARGS which pip @@ -60,7 +62,7 @@ install_prepare() { # # Generate/download datasets, download models etc... - milabench prepare $ARGS + milabench prepare --system $MILABENCH_WORDIR/system.yaml $ARGS } module load cuda/12.3.2 @@ -78,7 +80,7 @@ if [ "$MILABENCH_PREPARE" -eq 0 ]; then # # Run the benchmakrs - milabench run "$@" + milabench run --system $MILABENCH_WORDIR/system.yaml "$@" # # Display report diff --git a/scripts/article/run_cuda_dev.sh b/scripts/article/run_cuda_dev.sh index 7980d41d4..c21730b7a 100644 --- a/scripts/article/run_cuda_dev.sh +++ b/scripts/article/run_cuda_dev.sh @@ -3,9 +3,16 @@ set -ex # export MILABENCH_SOURCE=$HOME/milabench +# +# # put those on the shared drived +# export MILABENCH_DIRS_DATA=/home/mila/d/delaunap/scratch/milabench/data +# export MILABENCH_DIRS_VENV=/home/mila/d/delaunap/scratch/milabench/venv +# export MILABENCH_DIRS_RUNS=/home/mila/d/delaunap/scratch/milabench/runs +# +# # mkdir /tmp/workspace && cd /tmp/workspace # conda activate py310 -# +# bash $HOME/milabench/scripts/article/run_cuda_dev.sh # export MILABENCH_GPU_ARCH=cuda @@ -14,8 +21,13 @@ export MILABENCH_WORDIR="$(pwd)/$MILABENCH_GPU_ARCH" export MILABENCH_BASE="$MILABENCH_WORDIR/results" export MILABENCH_CONFIG="$MILABENCH_WORDIR/milabench/config/standard.yaml" export MILABENCH_VENV="$MILABENCH_WORDIR/env" -export BENCHMARK_VENV="$MILABENCH_WORDIR/results/venv/torch" +export MILABENCH_SYSTEM="$MILABENCH_WORDIR/system.yaml" +if [ -z "${MILABENCH_DIRS_VENV}" ]; then + export BENCHMARK_VENV="$MILABENCH_WORDIR/results/venv/torch" +else + export BENCHMARK_VENV="$MILABENCH_DIRS_VENV/"'${install_group}' +fi if [ -z "${MILABENCH_PREPARE}" ]; then export MILABENCH_PREPARE=0 @@ -51,20 +63,25 @@ install_prepare() { . $MILABENCH_WORDIR/env/bin/activate pip install -e $MILABENCH_SOURCE - # milabench pin --variant cuda --from-scratch "$@" + # need torch for pinning + pip install torch + milabench pin --variant cuda --from-scratch "$@" + + milabench slurm_system > $MILABENCH_WORDIR/system.yaml # # Install milabench's benchmarks in their venv # - milabench install "$@" + milabench install --system $MILABENCH_WORDIR/system.yaml "$@" which pip # pip install -e $MILABENCH_WORDIR/voir # pip install -e $MILABENCH_WORDIR/torchcompat ( - . $BENCHMARK_VENV/bin/activate - which pip + echo "Pass" + # . $BENCHMARK_VENV/bin/activate + # which pip #pip install -e $MILABENCH_WORDIR/voir # pip install -e $MILABENCH_WORDIR/torchcompat # pip install torch torchvision torchaudio @@ -79,12 +96,12 @@ install_prepare() { # # Generate/download datasets, download models etc... - milabench prepare "$@" + milabench prepare --system $MILABENCH_WORDIR/system.yaml "$@" } module load cuda/12.3.2 -if [ ! -d "$MILABENCH_WORDIR/results/venv/torch" ]; then +if [ ! -d "$MILABENCH_VENV" ]; then install_prepare else echo "Reusing previous install" @@ -92,29 +109,16 @@ else fi -( - . $MILABENCH_WORDIR/env/bin/activate - pip show setuptools - pip show pip - pip install git+https://github.com/Delaunay/voir.git@patch-8 -) - -( - . 
$BENCHMARK_VENV/bin/activate - pip show setuptools - pip show pip - pip install git+https://github.com/Delaunay/voir.git@patch-8 -) - - if [ "$MILABENCH_PREPARE" -eq 0 ]; then cd $MILABENCH_WORDIR + # milabench prepare --system $MILABENCH_WORDIR/system.yaml "$@" + # milabench prepare "$@" # # Run the benchmakrs - milabench run "$@" + milabench run --system $MILABENCH_WORDIR/system.yaml "$@" # # Display report diff --git a/scripts/article/run_rocm.sh b/scripts/article/run_rocm.sh index 79e736c20..b8a15fb76 100644 --- a/scripts/article/run_rocm.sh +++ b/scripts/article/run_rocm.sh @@ -31,7 +31,7 @@ install_prepare() { # Override/add package to milabench venv here # which pip - # pip install ... + pip uninstall pynvml ( . $BENCHMARK_VENV/bin/activate @@ -41,7 +41,24 @@ install_prepare() { # which pip pip uninstall torch torchvision torchaudio - pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.0 + pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.1 + pip uninstall pynvml + + # sudo apt-get install lld + # https://github.com/ROCm/jax/releases/tag/rocm-jaxlib-v0.4.30 + # does not really work + pip install https://github.com/ROCm/jax/releases/download/rocm-jaxlib-v0.4.30/jaxlib-0.4.30+rocm611-cp310-cp310-manylinux2014_x86_64.whl + pip install https://github.com/ROCm/jax/archive/refs/tags/rocm-jaxlib-v0.4.30.tar.gz + + # + FORCE_CUDA=1 pip install -U -v --no-build-isolation git+https://github.com/rusty1s/pytorch_cluster.git + FORCE_CUDA=1 pip install -U -v --no-build-isolation git+https://github.com/rusty1s/pytorch_scatter.git + FORCE_CUDA=1 pip install -U -v --no-build-isolation git+https://github.com/rusty1s/pytorch_sparse.git + + # takes forever to compile + # https://github.com/ROCm/xformers + pip install -v -U --no-build-isolation --no-deps git+https://github.com/ROCm/xformers.git@develop#egg=xformers + pip install -v -U --no-build-isolation --no-deps git+https://github.com/ROCm/flash-attention.git ) # diff --git a/scripts/article/run_update_batch_size.sh b/scripts/article/run_update_batch_size.sh new file mode 100644 index 000000000..f839f952d --- /dev/null +++ b/scripts/article/run_update_batch_size.sh @@ -0,0 +1,33 @@ + + + + +export MILABENCH_SIZER_AUTO=1 +export MILABENCH_SIZER_BATCH_SIZE=1 +FINAL_OUTPUT="$HOME/batch_x_worker" +export MILABENCH_SIZER_SAVE="$FINAL_OUTPUT/scaling.yaml" +milabench run --system $MILABENCH_WORDIR/system.yaml --exclude llama + +export MILABENCH_SIZER_AUTO=1 +export MILABENCH_SIZER_BATCH_SIZE=2 +FINAL_OUTPUT="$HOME/batch_x_worker" +export MILABENCH_SIZER_SAVE="$FINAL_OUTPUT/scaling.yaml" +milabench run --system $MILABENCH_WORDIR/system.yaml --exclude llama + +export MILABENCH_SIZER_AUTO=1 +export MILABENCH_SIZER_BATCH_SIZE=4 +FINAL_OUTPUT="$HOME/batch_x_worker" +export MILABENCH_SIZER_SAVE="$FINAL_OUTPUT/scaling.yaml" +milabench run --system $MILABENCH_WORDIR/system.yaml --exclude llama + +export MILABENCH_SIZER_AUTO=1 +export MILABENCH_SIZER_BATCH_SIZE=8 +FINAL_OUTPUT="$HOME/batch_x_worker" +export MILABENCH_SIZER_SAVE="$FINAL_OUTPUT/scaling.yaml" +milabench run --system $MILABENCH_WORDIR/system.yaml --exclude llama + +export MILABENCH_SIZER_AUTO=1 +export MILABENCH_SIZER_BATCH_SIZE=16 +FINAL_OUTPUT="$HOME/batch_x_worker" +export MILABENCH_SIZER_SAVE="$FINAL_OUTPUT/scaling.yaml" +milabench run --system $MILABENCH_WORDIR/system.yaml --exclude llama \ No newline at end of file diff --git a/tests/test_command_reg/test_command_reg_one_node.txt 
b/tests/test_command_reg/test_command_reg_one_node.txt index 7f1d5dc83..f3ff218ae 100644 --- a/tests/test_command_reg/test_command_reg_one_node.txt +++ b/tests/test_command_reg/test_command_reg_one_node.txt @@ -15,8 +15,8 @@ export MILABENCH_DIR_DATA=$BASE/data export MILABENCH_DIR_RUNS=$BASE/runs export MILABENCH_DIR_EXTRA=$BASE/extra/llm export MILABENCH_DIR_CACHE=$BASE/cache -export OMP_NUM_THREADS=4 -export MILABENCH_CONFIG='{"system": {"arch": "cuda", "sshkey": null, "nodes": [{"ip": "127.0.0.1", "main": true, "name": "0", "port": 22, "user": "username", "hostname": "127.0.0.1", "aliaslist": [], "ipaddrlist": ["127.0.0.1"], "local": true}], "self": {"ip": "127.0.0.1", "main": true, "name": "0", "port": 22, "user": "username", "hostname": "127.0.0.1", "aliaslist": [], "ipaddrlist": ["127.0.0.1"], "local": true}}, "dirs": {"base": "$BASE", "venv": "$BASE/venv/torch", "data": "$BASE/data", "runs": "$BASE/runs", "extra": "$BASE/extra/llm", "cache": "$BASE/cache"}, "group": "llm", "install_group": "torch", "install_variant": "cuda", "run_name": "dev", "enabled": true, "capabilities": {"nodes": 1}, "max_duration": 800, "voir": {"options": {"stop": 30, "interval": "1s"}}, "validation": {"usage": {"gpu_load_threshold": 0.5, "gpu_mem_threshold": 0.5}}, "config_base": "$SRC/milabench/config", "config_file": "$SRC/milabench/config/standard.yaml", "definition": "$SRC/milabench/benchmarks/llama", "tags": ["inference", "llm", "nlp"], "plan": {"method": "per_gpu"}, "weight": 1.0, "name": "llama", "tag": ["llama"]}' +export OMP_NUM_THREADS=0 +export MILABENCH_CONFIG='{"system": {"arch": "cuda", "sshkey": null, "nodes": [{"ip": "127.0.0.1", "main": true, "name": "0", "sshport": 22, "user": "username", "hostname": "127.0.0.1"}], "self": {"ip": "127.0.0.1", "main": true, "name": "0", "sshport": 22, "user": "username", "hostname": "127.0.0.1"}}, "dirs": {"base": "$BASE", "venv": "$BASE/venv/torch", "data": "$BASE/data", "runs": "$BASE/runs", "extra": "$BASE/extra/llm", "cache": "$BASE/cache"}, "group": "llm", "install_group": "torch", "install_variant": "cuda", "run_name": "dev", "enabled": true, "capabilities": {"nodes": 1}, "max_duration": 800, "voir": {"options": {"stop": 30, "interval": "1s"}}, "validation": {"usage": {"gpu_load_threshold": 0.5, "gpu_mem_threshold": 0.5}}, "config_base": "$SRC/milabench/config", "config_file": "$SRC/milabench/config/standard.yaml", "definition": "$SRC/milabench/benchmarks/llama", "tags": ["inference", "llm", "nlp"], "plan": {"method": "per_gpu"}, "weight": 1.0, "name": "llama", "tag": ["llama"]}' echo "---" echo "llama" @@ -37,14 +37,14 @@ echo "---" echo "fp16" echo "====" time ( - CUDA_VISIBLE_DEVICES=0 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & - CUDA_VISIBLE_DEVICES=1 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & - CUDA_VISIBLE_DEVICES=2 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & - CUDA_VISIBLE_DEVICES=3 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & - CUDA_VISIBLE_DEVICES=4 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 
& - CUDA_VISIBLE_DEVICES=5 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & - CUDA_VISIBLE_DEVICES=6 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & - CUDA_VISIBLE_DEVICES=7 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + CUDA_VISIBLE_DEVICES=0 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + CUDA_VISIBLE_DEVICES=1 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + CUDA_VISIBLE_DEVICES=2 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + CUDA_VISIBLE_DEVICES=3 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + CUDA_VISIBLE_DEVICES=4 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + CUDA_VISIBLE_DEVICES=5 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + CUDA_VISIBLE_DEVICES=6 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + CUDA_VISIBLE_DEVICES=7 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & wait ) @@ -52,14 +52,14 @@ echo "---" echo "bf16" echo "====" time ( - CUDA_VISIBLE_DEVICES=0 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & - CUDA_VISIBLE_DEVICES=1 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & - CUDA_VISIBLE_DEVICES=2 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & - CUDA_VISIBLE_DEVICES=3 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & - CUDA_VISIBLE_DEVICES=4 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & - CUDA_VISIBLE_DEVICES=5 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & - CUDA_VISIBLE_DEVICES=6 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & - CUDA_VISIBLE_DEVICES=7 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch 
$SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & + CUDA_VISIBLE_DEVICES=0 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & + CUDA_VISIBLE_DEVICES=1 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & + CUDA_VISIBLE_DEVICES=2 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & + CUDA_VISIBLE_DEVICES=3 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & + CUDA_VISIBLE_DEVICES=4 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & + CUDA_VISIBLE_DEVICES=5 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & + CUDA_VISIBLE_DEVICES=6 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & + CUDA_VISIBLE_DEVICES=7 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & wait ) @@ -67,14 +67,14 @@ echo "---" echo "tf32" echo "====" time ( - CUDA_VISIBLE_DEVICES=0 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & - CUDA_VISIBLE_DEVICES=1 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & - CUDA_VISIBLE_DEVICES=2 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & - CUDA_VISIBLE_DEVICES=3 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & - CUDA_VISIBLE_DEVICES=4 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & - CUDA_VISIBLE_DEVICES=5 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & - CUDA_VISIBLE_DEVICES=6 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & - CUDA_VISIBLE_DEVICES=7 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & + CUDA_VISIBLE_DEVICES=0 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & + CUDA_VISIBLE_DEVICES=1 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache 
$SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & + CUDA_VISIBLE_DEVICES=2 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & + CUDA_VISIBLE_DEVICES=3 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & + CUDA_VISIBLE_DEVICES=4 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & + CUDA_VISIBLE_DEVICES=5 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & + CUDA_VISIBLE_DEVICES=6 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & + CUDA_VISIBLE_DEVICES=7 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & wait ) @@ -82,14 +82,14 @@ echo "---" echo "fp32" echo "====" time ( - CUDA_VISIBLE_DEVICES=0 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & - CUDA_VISIBLE_DEVICES=1 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & - CUDA_VISIBLE_DEVICES=2 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & - CUDA_VISIBLE_DEVICES=3 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & - CUDA_VISIBLE_DEVICES=4 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & - CUDA_VISIBLE_DEVICES=5 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & - CUDA_VISIBLE_DEVICES=6 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & - CUDA_VISIBLE_DEVICES=7 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & + CUDA_VISIBLE_DEVICES=0 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & + CUDA_VISIBLE_DEVICES=1 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & + CUDA_VISIBLE_DEVICES=2 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & + CUDA_VISIBLE_DEVICES=3 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 
10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & + CUDA_VISIBLE_DEVICES=4 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & + CUDA_VISIBLE_DEVICES=5 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & + CUDA_VISIBLE_DEVICES=6 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & + CUDA_VISIBLE_DEVICES=7 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & wait ) @@ -127,7 +127,7 @@ echo "---" echo "resnet152-ddp-gpus" echo "==================" time ( - $SRC/milabench/benchmarks/torchvision_ddp/activator $BASE/venv/torch $SRC/milabench/benchmarks/torchvision_ddp/main.py --epochs 10 --num-workers 8 --loader torch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 & + $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/torchvision_ddp/main.py --epochs 10 --num-workers 8 --loader torch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 & wait ) @@ -353,7 +353,7 @@ echo "---" echo "diffusion-single" echo "================" time ( - $SRC/milabench/milabench/scripts/activator $BASE/venv/torch accelerate launch --mixed_precision=bf16 --dynamo_backend=no --machine_rank=0 --num_machines=1 --multi_gpu --gradient_accumulation_steps=1 --num_cpu_threads_per_process=4 --main_process_ip=127.0.0.1 --main_process_port=22 --num_processes=8 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 & + $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache accelerate launch --mixed_precision=bf16 --dynamo_backend=no --machine_rank=0 --num_machines=1 --multi_gpu --gradient_accumulation_steps=1 --num_cpu_threads_per_process=4 --main_process_ip=127.0.0.1 --main_process_port=29400 --num_processes=8 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache & wait ) @@ -361,7 +361,7 @@ echo "---" echo "diffusion-gpus" echo "==============" time ( - $SRC/milabench/milabench/scripts/activator $BASE/venv/torch accelerate launch --mixed_precision=bf16 --dynamo_backend=no --machine_rank=0 --num_machines=1 --multi_gpu --gradient_accumulation_steps=1 --num_cpu_threads_per_process=4 --main_process_ip=127.0.0.1 --main_process_port=22 --num_processes=8 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 & + $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache accelerate launch --mixed_precision=bf16 --dynamo_backend=no --machine_rank=0 --num_machines=1 --multi_gpu --gradient_accumulation_steps=1 --num_cpu_threads_per_process=4 --main_process_ip=127.0.0.1 --main_process_port=29400 --num_processes=8 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache & wait ) @@ -369,7 +369,7 @@ echo "---" echo "diffusion-nodes" echo "===============" time ( - $SRC/milabench/milabench/scripts/activator $BASE/venv/torch accelerate launch --mixed_precision=bf16 --dynamo_backend=no --machine_rank=0 --num_machines=1 --multi_gpu --gradient_accumulation_steps=1 --num_cpu_threads_per_process=4 
--main_process_ip=127.0.0.1 --main_process_port=22 --num_processes=8 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 &
+    $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache accelerate launch --mixed_precision=bf16 --dynamo_backend=no --machine_rank=0 --num_machines=1 --multi_gpu --gradient_accumulation_steps=1 --num_cpu_threads_per_process=4 --main_process_ip=127.0.0.1 --main_process_port=29400 --num_processes=8 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache &
     wait
 )
@@ -377,14 +377,14 @@
 echo "---"
 echo "lightning"
 echo "========="
 time (
-    CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 16 &
-    CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 16 &
-    CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 16 &
-    CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 16 &
-    CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 16 &
-    CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 16 &
-    CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 16 &
-    CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 16 &
+    CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 &
+    CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 &
+    CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 &
+    CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 &
+    CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 &
+    CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 &
+    CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 &
+    CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 &
     wait
 )
@@ -392,7 +392,7 @@
 echo "---"
 echo "lightning-gpus"
 echo "=============="
 time (
-    $BASE/venv/torch/bin/benchrun --nnodes=1 --rdzv-backend=c10d --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --nproc-per-node=8 --no-python -- python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 16 &
+    $BASE/venv/torch/bin/benchrun --nnodes=1 --rdzv-backend=c10d --rdzv-endpoint=127.0.0.1:29400 --nproc-per-node=8 --no-python -- python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 &
     wait
 )
@@ -400,14 +400,14 @@
 echo "---"
 echo "dinov2-giant-single"
 echo "==================="
 time (
-    CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
-    CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
-    CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
-    CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
-    CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
-    CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
-    CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
-    CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
+    CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
+    CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
+    CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
+    CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
+    CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
+    CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
+    CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
+    CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
     wait
 )
@@ -415,15 +415,7 @@
 echo "---"
 echo "dinov2-giant-gpus"
 echo "================="
 time (
-    $BASE/venv/torch/bin/benchrun --nnodes=1 --rdzv-backend=c10d --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --nproc-per-node=8 --no-python -- python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-gpus/output --no-resume --config-file src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
-    wait
-)
-
-echo "---"
-echo "dinov2-giant-nodes"
-echo "=================="
-time (
-    $BASE/venv/torch/bin/benchrun --nnodes=1 --rdzv-backend=c10d --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --nproc-per-node=8 --no-python -- python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-nodes/output --no-resume --config-file src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
+    $BASE/venv/torch/bin/benchrun --nnodes=1 --rdzv-backend=c10d --rdzv-endpoint=127.0.0.1:29400 --nproc-per-node=8 --no-python -- python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-gpus/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
     wait
 )
@@ -446,7 +438,7 @@
 echo "---"
 echo "llm-lora-ddp-gpus"
 echo "================="
 time (
-    $BASE/venv/torch/bin/tune run --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/lora_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-ddp-gpus/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-ddp-gpus/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 &
+    $BASE/venv/torch/bin/tune run --nnodes=1 --rdzv-backend=c10d --rdzv-endpoint=127.0.0.1:29400 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/lora_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-ddp-gpus/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-ddp-gpus/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 &
     wait
 )
@@ -454,7 +446,7 @@
 echo "---"
 echo "llm-lora-ddp-nodes"
 echo "=================="
 time (
-    $BASE/venv/torch/bin/tune run --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/lora_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-ddp-nodes/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-ddp-nodes/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 &
+    $BASE/venv/torch/bin/tune run --nnodes=1 --rdzv-backend=c10d --rdzv-endpoint=127.0.0.1:29400 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/lora_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-ddp-nodes/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-ddp-nodes/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 &
     wait
 )
@@ -462,7 +454,7 @@
 echo "---"
 echo "llm-lora-mp-gpus"
 echo "================"
 time (
-    $BASE/venv/torch/bin/tune run --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/lora_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_70B_lora.yaml epochs=1 output_dir=$BASE/extra/llm-lora-mp-gpus/output tokenizer.path=$BASE/data/llama3_70B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_70B checkpointer.output_dir=$BASE/data/llama3_70B/ metric_logger.log_dir=$BASE/extra/llm-lora-mp-gpus/metrics repo_id="meta-llama/Meta-Llama-3.1-70B" batch_size=8 gradient_accumulation_steps=1 &
+    $BASE/venv/torch/bin/tune run --nnodes=1 --rdzv-backend=c10d --rdzv-endpoint=127.0.0.1:29400 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/lora_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_70B_lora.yaml epochs=1 output_dir=$BASE/extra/llm-lora-mp-gpus/output tokenizer.path=$BASE/data/llama3_70B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_70B checkpointer.output_dir=$BASE/data/llama3_70B/ metric_logger.log_dir=$BASE/extra/llm-lora-mp-gpus/metrics repo_id="meta-llama/Meta-Llama-3.1-70B" batch_size=8 gradient_accumulation_steps=1 &
     wait
 )
@@ -470,7 +462,7 @@
 echo "---"
 echo "llm-full-mp-gpus"
 echo "================"
 time (
-    $BASE/venv/torch/bin/tune run --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/full_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_70B_full.yaml epochs=1 output_dir=$BASE/extra/llm-full-mp-gpus/output tokenizer.path=$BASE/data/llama3_70B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_70B checkpointer.output_dir=$BASE/data/llama3_70B/ metric_logger.log_dir=$BASE/extra/llm-full-mp-gpus/metrics repo_id="meta-llama/Meta-Llama-3.1-70B" safetensors=true batch_size=2 gradient_accumulation_steps=1 &
+    $BASE/venv/torch/bin/tune run --nnodes=1 --rdzv-backend=c10d --rdzv-endpoint=127.0.0.1:29400 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/full_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_70B_full.yaml epochs=1 output_dir=$BASE/extra/llm-full-mp-gpus/output tokenizer.path=$BASE/data/llama3_70B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_70B checkpointer.output_dir=$BASE/data/llama3_70B/ metric_logger.log_dir=$BASE/extra/llm-full-mp-gpus/metrics repo_id="meta-llama/Meta-Llama-3.1-70B" safetensors=true batch_size=2 gradient_accumulation_steps=1 &
     wait
 )
@@ -478,7 +470,52 @@
 echo "---"
 echo "llm-full-mp-nodes"
 echo "================="
 time (
-    $BASE/venv/torch/bin/tune run --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/full_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_70B_full.yaml epochs=1 output_dir=$BASE/extra/llm-full-mp-nodes/output tokenizer.path=$BASE/data/llama3_70B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_70B checkpointer.output_dir=$BASE/data/llama3_70B/ metric_logger.log_dir=$BASE/extra/llm-full-mp-nodes/metrics repo_id="meta-llama/Meta-Llama-3.1-70B" safetensors=true batch_size=2 gradient_accumulation_steps=1 &
+    $BASE/venv/torch/bin/tune run --nnodes=1 --rdzv-backend=c10d --rdzv-endpoint=127.0.0.1:29400 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/full_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_70B_full.yaml epochs=1 output_dir=$BASE/extra/llm-full-mp-nodes/output tokenizer.path=$BASE/data/llama3_70B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_70B checkpointer.output_dir=$BASE/data/llama3_70B/ metric_logger.log_dir=$BASE/extra/llm-full-mp-nodes/metrics repo_id="meta-llama/Meta-Llama-3.1-70B" safetensors=true batch_size=2 gradient_accumulation_steps=1 &
+    wait
+)
+
+echo "---"
+echo "dimenet"
+echo "======="
+time (
+    CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 10000 --use3d &
+    CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 10000 --use3d &
+    CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 10000 --use3d &
+    CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 10000 --use3d &
+    CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 10000 --use3d &
+    CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 10000 --use3d &
+    CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 10000 --use3d &
+    CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 10000 --use3d &
+    wait
+)
+
+echo "---"
+echo "recursiongfn"
+echo "============"
+time (
+    CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 &
+    CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 &
+    CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 &
+    CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 &
+    CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 &
+    CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 &
+    CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 &
+    CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 &
+    wait
+)
+
+echo "---"
+echo "torchatari"
+echo "=========="
+time (
+    CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 &
+    CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 &
+    CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 &
+    CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 &
+    CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 &
+    CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 &
+    CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 &
+    CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 &
     wait
 )
diff --git a/tests/test_command_reg/test_command_reg_two_nodes.txt b/tests/test_command_reg/test_command_reg_two_nodes.txt
index 479a57859..bda22033e 100644
--- a/tests/test_command_reg/test_command_reg_two_nodes.txt
+++ b/tests/test_command_reg/test_command_reg_two_nodes.txt
@@ -15,8 +15,8 @@
 export MILABENCH_DIR_DATA=$BASE/data
 export MILABENCH_DIR_RUNS=$BASE/runs
 export MILABENCH_DIR_EXTRA=$BASE/extra/llm
 export MILABENCH_DIR_CACHE=$BASE/cache
-export OMP_NUM_THREADS=4
-export MILABENCH_CONFIG='{"system": {"arch": "cuda", "sshkey": null, "nodes": [{"ip": "127.0.0.1", "main": true, "name": "0", "port": 22, "user": "username", "hostname": "127.0.0.1", "aliaslist": [], "ipaddrlist": ["127.0.0.1"], "local": true}, {"ip": "192.168.0.11", "main": false, "name": "1", "port": 22, "user": "username", "hostname": "192.168.0.11", "aliaslist": [], "ipaddrlist": ["192.168.0.11"], "local": false}], "self": {"ip": "127.0.0.1", "main": true, "name": "0", "port": 22, "user": "username", "hostname": "127.0.0.1", "aliaslist": [], "ipaddrlist": ["127.0.0.1"], "local": true}}, "dirs": {"base": "$BASE", "venv": "$BASE/venv/torch", "data": "$BASE/data", "runs": "$BASE/runs", "extra": "$BASE/extra/llm", "cache": "$BASE/cache"}, "group": "llm", "install_group": "torch", "install_variant": "cuda", "run_name": "dev", "enabled": true, "capabilities": {"nodes": 1}, "max_duration": 800, "voir": {"options": {"stop": 30, "interval": "1s"}}, "validation": {"usage": {"gpu_load_threshold": 0.5, "gpu_mem_threshold": 0.5}}, "config_base": "$SRC/milabench/config", "config_file": "$SRC/milabench/config/standard.yaml", "definition": "$SRC/milabench/benchmarks/llama", "tags": ["inference", "llm", "nlp"], "plan": {"method": "per_gpu"}, "weight": 1.0, "name": "llama", "tag": ["llama"]}'
+export OMP_NUM_THREADS=0
+export MILABENCH_CONFIG='{"system": {"arch": "cuda", "sshkey": null, "nodes": [{"ip": "127.0.0.1", "main": true, "name": "0", "sshport": 22, "user": "username", "hostname": "127.0.0.1"}, {"ip": "192.168.0.11", "main": false, "name": "1", "sshport": 22, "user": "username", "hostname": "192.168.0.11"}], "self": {"ip": "127.0.0.1", "main": true, "name": "0", "sshport": 22, "user": "username", "hostname": "127.0.0.1"}}, "dirs": {"base": "$BASE", "venv": "$BASE/venv/torch", "data": "$BASE/data", "runs": "$BASE/runs", "extra": "$BASE/extra/llm", "cache": "$BASE/cache"}, "group": "llm", "install_group": "torch", "install_variant": "cuda", "run_name": "dev", "enabled": true, "capabilities": {"nodes": 1}, "max_duration": 800, "voir": {"options": {"stop": 30, "interval": "1s"}}, "validation": {"usage": {"gpu_load_threshold": 0.5, "gpu_mem_threshold": 0.5}}, "config_base": "$SRC/milabench/config", "config_file": "$SRC/milabench/config/standard.yaml", "definition": "$SRC/milabench/benchmarks/llama", "tags": ["inference", "llm", "nlp"], "plan": {"method": "per_gpu"}, "weight": 1.0, "name": "llama", "tag": ["llama"]}'
 echo "---"
 echo "llama"
@@ -37,14 +37,14 @@
 echo "---"
 echo "fp16"
 echo "===="
 time (
-    CUDA_VISIBLE_DEVICES=0 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 &
-    CUDA_VISIBLE_DEVICES=1 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 &
-    CUDA_VISIBLE_DEVICES=2 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 &
-    CUDA_VISIBLE_DEVICES=3 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 &
-    CUDA_VISIBLE_DEVICES=4 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 &
-    CUDA_VISIBLE_DEVICES=5 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 &
-    CUDA_VISIBLE_DEVICES=6 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 &
-    CUDA_VISIBLE_DEVICES=7 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 &
+    CUDA_VISIBLE_DEVICES=0 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 &
+    CUDA_VISIBLE_DEVICES=1 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 &
+    CUDA_VISIBLE_DEVICES=2 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 &
+    CUDA_VISIBLE_DEVICES=3 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 &
+    CUDA_VISIBLE_DEVICES=4 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 &
+    CUDA_VISIBLE_DEVICES=5 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 &
+    CUDA_VISIBLE_DEVICES=6 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 &
+    CUDA_VISIBLE_DEVICES=7 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 &
     wait
 )
@@ -52,14 +52,14 @@
 echo "---"
 echo "bf16"
 echo "===="
 time (
-    CUDA_VISIBLE_DEVICES=0 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 &
-    CUDA_VISIBLE_DEVICES=1 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 &
-    CUDA_VISIBLE_DEVICES=2 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 &
-    CUDA_VISIBLE_DEVICES=3 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 &
-    CUDA_VISIBLE_DEVICES=4 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 &
-    CUDA_VISIBLE_DEVICES=5 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 &
-    CUDA_VISIBLE_DEVICES=6 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 &
-    CUDA_VISIBLE_DEVICES=7 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 &
+    CUDA_VISIBLE_DEVICES=0 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 &
+    CUDA_VISIBLE_DEVICES=1 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 &
+    CUDA_VISIBLE_DEVICES=2 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 &
+    CUDA_VISIBLE_DEVICES=3 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 &
+    CUDA_VISIBLE_DEVICES=4 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 &
+    CUDA_VISIBLE_DEVICES=5 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 &
+    CUDA_VISIBLE_DEVICES=6 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 &
+    CUDA_VISIBLE_DEVICES=7 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 &
     wait
 )
@@ -67,14 +67,14 @@
 echo "---"
 echo "tf32"
 echo "===="
 time (
-    CUDA_VISIBLE_DEVICES=0 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 &
-    CUDA_VISIBLE_DEVICES=1 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 &
-    CUDA_VISIBLE_DEVICES=2 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 &
-    CUDA_VISIBLE_DEVICES=3 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 &
-    CUDA_VISIBLE_DEVICES=4 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 &
-    CUDA_VISIBLE_DEVICES=5 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 &
-    CUDA_VISIBLE_DEVICES=6 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 &
-    CUDA_VISIBLE_DEVICES=7 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 &
+    CUDA_VISIBLE_DEVICES=0 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 &
+    CUDA_VISIBLE_DEVICES=1 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 &
+    CUDA_VISIBLE_DEVICES=2 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 &
+    CUDA_VISIBLE_DEVICES=3 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 &
+    CUDA_VISIBLE_DEVICES=4 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 &
+    CUDA_VISIBLE_DEVICES=5 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 &
+    CUDA_VISIBLE_DEVICES=6 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 &
+    CUDA_VISIBLE_DEVICES=7 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 &
     wait
 )
@@ -82,14 +82,14 @@
 echo "---"
 echo "fp32"
 echo "===="
 time (
-    CUDA_VISIBLE_DEVICES=0 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 &
-    CUDA_VISIBLE_DEVICES=1 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 &
-    CUDA_VISIBLE_DEVICES=2 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 &
-    CUDA_VISIBLE_DEVICES=3 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 &
-    CUDA_VISIBLE_DEVICES=4 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 &
-    CUDA_VISIBLE_DEVICES=5 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 &
-    CUDA_VISIBLE_DEVICES=6 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 &
-    CUDA_VISIBLE_DEVICES=7 $SRC/milabench/benchmarks/flops/activator $BASE/venv/torch $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 &
+    CUDA_VISIBLE_DEVICES=0 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 &
+    CUDA_VISIBLE_DEVICES=1 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 &
+    CUDA_VISIBLE_DEVICES=2 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 &
+    CUDA_VISIBLE_DEVICES=3 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 &
+    CUDA_VISIBLE_DEVICES=4 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 &
+    CUDA_VISIBLE_DEVICES=5 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 &
+    CUDA_VISIBLE_DEVICES=6 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 &
+    CUDA_VISIBLE_DEVICES=7 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 &
     wait
 )
@@ -127,7 +127,7 @@
 echo "---"
 echo "resnet152-ddp-gpus"
 echo "=================="
 time (
-    $SRC/milabench/benchmarks/torchvision_ddp/activator $BASE/venv/torch $SRC/milabench/benchmarks/torchvision_ddp/main.py --epochs 10 --num-workers 8 --loader torch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 &
+    $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/torchvision_ddp/main.py --epochs 10 --num-workers 8 --loader torch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 &
     wait
 )
@@ -353,7 +353,7 @@
 echo "---"
 echo "diffusion-single"
 echo "================"
 time (
-    $SRC/milabench/milabench/scripts/activator $BASE/venv/torch accelerate launch --mixed_precision=bf16 --dynamo_backend=no --machine_rank=0 --num_machines=1 --multi_gpu --gradient_accumulation_steps=1 --num_cpu_threads_per_process=4 --main_process_ip=127.0.0.1 --main_process_port=22 --num_processes=8 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 &
+    $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache accelerate launch --mixed_precision=bf16 --dynamo_backend=no --machine_rank=0 --num_machines=1 --multi_gpu --gradient_accumulation_steps=1 --num_cpu_threads_per_process=4 --main_process_ip=127.0.0.1 --main_process_port=29400 --num_processes=8 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache &
     wait
 )
@@ -361,7 +361,7 @@
 echo "---"
 echo "diffusion-gpus"
 echo "=============="
 time (
-    $SRC/milabench/milabench/scripts/activator $BASE/venv/torch accelerate launch --mixed_precision=bf16 --dynamo_backend=no --machine_rank=0 --num_machines=1 --multi_gpu --gradient_accumulation_steps=1 --num_cpu_threads_per_process=4 --main_process_ip=127.0.0.1 --main_process_port=22 --num_processes=8 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 &
+    $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache accelerate launch --mixed_precision=bf16 --dynamo_backend=no --machine_rank=0 --num_machines=1 --multi_gpu --gradient_accumulation_steps=1 --num_cpu_threads_per_process=4 --main_process_ip=127.0.0.1 --main_process_port=29400 --num_processes=8 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache &
     wait
 )
@@ -369,8 +369,8 @@
 echo "---"
 echo "diffusion-nodes"
 echo "==============="
 time (
-    $SRC/milabench/milabench/scripts/activator $BASE/venv/torch accelerate launch --mixed_precision=bf16 --dynamo_backend=no --machine_rank=0 --num_machines=2 --multi_gpu --gradient_accumulation_steps=1 --num_cpu_threads_per_process=4 --main_process_ip=127.0.0.1 --main_process_port=22 --num_processes=16 &
-    ssh -oCheckHostIP=no -oStrictHostKeyChecking=no -oPasswordAuthentication=no -oPasswordAuthentication=no -p 22 username@192.168.0.11 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch accelerate launch --mixed_precision=bf16 --dynamo_backend=no --machine_rank=1 --num_machines=2 --multi_gpu --gradient_accumulation_steps=1 --num_cpu_threads_per_process=4 --main_process_ip=127.0.0.1 --main_process_port=22 --num_processes=16 &
+    $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache accelerate launch --mixed_precision=bf16 --dynamo_backend=no --machine_rank=0 --num_machines=2 --multi_gpu --gradient_accumulation_steps=1 --num_cpu_threads_per_process=4 --main_process_ip=127.0.0.1 --main_process_port=29400 --num_processes=16 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache &
+    ssh -oCheckHostIP=no -oStrictHostKeyChecking=no -oPasswordAuthentication=no -oPasswordAuthentication=no -p 22 username@192.168.0.11 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache accelerate launch --mixed_precision=bf16 --dynamo_backend=no --machine_rank=1 --num_machines=2 --multi_gpu --gradient_accumulation_steps=1 --num_cpu_threads_per_process=4 --main_process_ip=127.0.0.1 --main_process_port=29400 --num_processes=16 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache &
     wait
 )
@@ -378,14 +378,14 @@
 echo "---"
 echo "lightning"
 echo "========="
 time (
-    CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 16 &
-    CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 16 &
-    CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 16 &
-    CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 16 &
-    CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 16 &
-    CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 16 &
-    CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 16 &
-    CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 16 &
+    CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 &
+    CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 &
+    CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 &
+    CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 &
+    CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 &
+    CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 &
+    CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 &
+    CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 &
     wait
 )
@@ -393,7 +393,7 @@
 echo "---"
 echo "lightning-gpus"
 echo "=============="
 time (
-    $BASE/venv/torch/bin/benchrun --nnodes=1 --rdzv-backend=c10d --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --nproc-per-node=8 --no-python -- python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 16 &
+    $BASE/venv/torch/bin/benchrun --nnodes=1 --rdzv-backend=c10d --rdzv-endpoint=127.0.0.1:29400 --nproc-per-node=8 --no-python -- python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 &
     wait
 )
@@ -401,14 +401,14 @@
 echo "---"
 echo "dinov2-giant-single"
 echo "==================="
 time (
-    CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
-    CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
-    CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
-    CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
-    CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
-    CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
-    CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
-    CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
+    CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
+    CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
+    CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
+    CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
+    CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
+    CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
+    CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
+    CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
     wait
 )
@@ -416,16 +416,7 @@
 echo "---"
 echo "dinov2-giant-gpus"
 echo "================="
 time (
-    $BASE/venv/torch/bin/benchrun --nnodes=1 --rdzv-backend=c10d --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --nproc-per-node=8 --no-python -- python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-gpus/output --no-resume --config-file src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
-    wait
-)
-
-echo "---"
-echo "dinov2-giant-nodes"
-echo "=================="
-time (
-    $BASE/venv/torch/bin/benchrun --nnodes=2 --rdzv-backend=c10d --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --nproc-per-node=8 --no-python -- python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-nodes/output --no-resume --config-file src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
-    ssh -oCheckHostIP=no -oStrictHostKeyChecking=no -oPasswordAuthentication=no -oPasswordAuthentication=no -p 22 username@192.168.0.11 $BASE/venv/torch/bin/benchrun --nnodes=2 --rdzv-backend=c10d --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --nproc-per-node=8 --no-python -- python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-nodes/output --no-resume --config-file src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
+    $BASE/venv/torch/bin/benchrun --nnodes=1 --rdzv-backend=c10d --rdzv-endpoint=127.0.0.1:29400 --nproc-per-node=8 --no-python -- python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-gpus/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
     wait
 )
@@ -448,7 +439,7 @@
 echo "---"
 echo "llm-lora-ddp-gpus"
 echo "================="
 time (
-    $BASE/venv/torch/bin/tune run --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/lora_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-ddp-gpus/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-ddp-gpus/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 &
+    $BASE/venv/torch/bin/tune run --nnodes=1 --rdzv-backend=c10d --rdzv-endpoint=127.0.0.1:29400 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/lora_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-ddp-gpus/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-ddp-gpus/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 &
     wait
 )
@@ -456,7 +447,8 @@
 echo "---"
 echo "llm-lora-ddp-nodes"
 echo "=================="
 time (
-    $BASE/venv/torch/bin/tune run --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/lora_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-ddp-nodes/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-ddp-nodes/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 &
+    $BASE/venv/torch/bin/tune run --nnodes=2 --rdzv-backend=c10d --rdzv-endpoint=127.0.0.1:29400 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/lora_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-ddp-nodes/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-ddp-nodes/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 &
+    ssh -oCheckHostIP=no -oStrictHostKeyChecking=no -oPasswordAuthentication=no -oPasswordAuthentication=no -p 22 username@192.168.0.11 $BASE/venv/torch/bin/tune run --nnodes=2 --rdzv-backend=c10d --rdzv-endpoint=127.0.0.1:29400 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/lora_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-ddp-nodes/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-ddp-nodes/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 &
     wait
 )
@@ -464,7 +456,7 @@
 echo "---"
 echo "llm-lora-mp-gpus"
 echo "================"
 time (
-    $BASE/venv/torch/bin/tune run --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/lora_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_70B_lora.yaml epochs=1 output_dir=$BASE/extra/llm-lora-mp-gpus/output tokenizer.path=$BASE/data/llama3_70B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_70B checkpointer.output_dir=$BASE/data/llama3_70B/ metric_logger.log_dir=$BASE/extra/llm-lora-mp-gpus/metrics repo_id="meta-llama/Meta-Llama-3.1-70B" batch_size=8 gradient_accumulation_steps=1 &
+    $BASE/venv/torch/bin/tune run --nnodes=1 --rdzv-backend=c10d --rdzv-endpoint=127.0.0.1:29400 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/lora_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_70B_lora.yaml epochs=1 output_dir=$BASE/extra/llm-lora-mp-gpus/output tokenizer.path=$BASE/data/llama3_70B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_70B checkpointer.output_dir=$BASE/data/llama3_70B/ metric_logger.log_dir=$BASE/extra/llm-lora-mp-gpus/metrics repo_id="meta-llama/Meta-Llama-3.1-70B" batch_size=8 gradient_accumulation_steps=1 &
     wait
 )
@@ -472,7 +464,7 @@
 echo "---"
 echo "llm-full-mp-gpus"
 echo "================"
 time (
-    $BASE/venv/torch/bin/tune run --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/full_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_70B_full.yaml epochs=1 output_dir=$BASE/extra/llm-full-mp-gpus/output tokenizer.path=$BASE/data/llama3_70B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_70B checkpointer.output_dir=$BASE/data/llama3_70B/ metric_logger.log_dir=$BASE/extra/llm-full-mp-gpus/metrics repo_id="meta-llama/Meta-Llama-3.1-70B" safetensors=true batch_size=2 gradient_accumulation_steps=1 &
+    $BASE/venv/torch/bin/tune run --nnodes=1 --rdzv-backend=c10d --rdzv-endpoint=127.0.0.1:29400 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/full_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_70B_full.yaml epochs=1 output_dir=$BASE/extra/llm-full-mp-gpus/output tokenizer.path=$BASE/data/llama3_70B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_70B checkpointer.output_dir=$BASE/data/llama3_70B/ metric_logger.log_dir=$BASE/extra/llm-full-mp-gpus/metrics repo_id="meta-llama/Meta-Llama-3.1-70B" safetensors=true batch_size=2 gradient_accumulation_steps=1 &
     wait
 )
@@ -480,7 +472,53 @@
 echo "---"
 echo "llm-full-mp-nodes"
 echo "================="
 time (
-    $BASE/venv/torch/bin/tune run --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/full_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_70B_full.yaml epochs=1 output_dir=$BASE/extra/llm-full-mp-nodes/output tokenizer.path=$BASE/data/llama3_70B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_70B checkpointer.output_dir=$BASE/data/llama3_70B/ metric_logger.log_dir=$BASE/extra/llm-full-mp-nodes/metrics repo_id="meta-llama/Meta-Llama-3.1-70B" safetensors=true batch_size=2 gradient_accumulation_steps=1 &
+    $BASE/venv/torch/bin/tune run --nnodes=2 --rdzv-backend=c10d --rdzv-endpoint=127.0.0.1:29400 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/full_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_70B_full.yaml epochs=1 output_dir=$BASE/extra/llm-full-mp-nodes/output tokenizer.path=$BASE/data/llama3_70B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_70B checkpointer.output_dir=$BASE/data/llama3_70B/ metric_logger.log_dir=$BASE/extra/llm-full-mp-nodes/metrics repo_id="meta-llama/Meta-Llama-3.1-70B" safetensors=true batch_size=2 gradient_accumulation_steps=1 &
+    ssh -oCheckHostIP=no -oStrictHostKeyChecking=no -oPasswordAuthentication=no -oPasswordAuthentication=no -p 22 username@192.168.0.11 $BASE/venv/torch/bin/tune run --nnodes=2 --rdzv-backend=c10d --rdzv-endpoint=127.0.0.1:29400 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/full_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_70B_full.yaml epochs=1 output_dir=$BASE/extra/llm-full-mp-nodes/output tokenizer.path=$BASE/data/llama3_70B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_70B checkpointer.output_dir=$BASE/data/llama3_70B/ metric_logger.log_dir=$BASE/extra/llm-full-mp-nodes/metrics repo_id="meta-llama/Meta-Llama-3.1-70B" safetensors=true batch_size=2 gradient_accumulation_steps=1 &
+    wait
+)
+
+echo "---"
+echo "dimenet"
+echo "======="
+time (
+    CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 10000 --use3d &
+    CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 10000 --use3d &
+    CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 10000 --use3d &
+    CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 10000 --use3d &
+    CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 10000 --use3d &
+    CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 10000 --use3d &
+    CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 10000 --use3d &
+    CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 10000 --use3d &
+    wait
+)
+
+echo "---"
+echo "recursiongfn"
+echo "============"
+time (
+    CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 &
+    CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 &
+    CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 &
+    CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 &
+    CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 &
+    CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 &
+    CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 &
+    CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 &
+    wait
+)
+
+echo "---"
+echo "torchatari"
+echo "=========="
+time (
+    CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 &
+    CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 &
+    CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 &
+    CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 &
+    CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 &
+    CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 &
+    CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 &
+    CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 &
     wait
 )