From 4fb687c744cb55f7f9098fd67a5eecbb7c6fe609 Mon Sep 17 00:00:00 2001
From: Your Name
Date: Wed, 30 Oct 2024 13:31:28 +0000
Subject: [PATCH 01/20] Tweaks

---
 milabench/_version.py       |  6 +++---
 scripts/article/run_cuda.sh | 12 ++++++------
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/milabench/_version.py b/milabench/_version.py
index cdd2418d..e2795a03 100644
--- a/milabench/_version.py
+++ b/milabench/_version.py
@@ -1,5 +1,5 @@
 """This file is generated, do not modify"""

-__tag__ = "v1.0.0_RC1-9-g6d1e1140"
-__commit__ = "6d1e114000cc4200ea307330032234db6696e40d"
-__date__ = "2024-09-30 14:39:43 -0400"
+__tag__ = "v1.0.0_RC1-12-g3b87cb4"
+__commit__ = "3b87cb465e855be452953273c314ab01024e0925"
+__date__ = "2024-10-09 12:04:43 -0400"
diff --git a/scripts/article/run_cuda.sh b/scripts/article/run_cuda.sh
index ba4c1ae3..47b21313 100644
--- a/scripts/article/run_cuda.sh
+++ b/scripts/article/run_cuda.sh
@@ -49,8 +49,8 @@ install_prepare() {
    # Install milabench's benchmarks in their venv
    #
    # pip install torch
-    milabench pin --variant cuda --from-scratch $ARGS
-    milabench install --system $MILABENCH_WORDIR/system.yaml $ARGS
+    # milabench pin --variant cuda --from-scratch $ARGS
+    milabench install # --system $MILABENCH_WORDIR/system.yaml $ARGS

    which pip
@@ -67,10 +67,10 @@ install_prepare() {
    #
    # Generate/download datasets, download models etc...
-    milabench prepare --system $MILABENCH_WORDIR/system.yaml $ARGS
+    milabench prepare # --system $MILABENCH_WORDIR/system.yaml $ARGS
 }

-module load cuda/12.3.2
+# module load cuda/12.3.2

 if [ ! -d "$MILABENCH_WORDIR/results" ]; then
     install_prepare
@@ -89,7 +89,7 @@ if [ "$MILABENCH_PREPARE" -eq 0 ]; then
    # rm -rf $MILABENCH_WORDIR/results/venv/
    # rm -rf $MILABENCH_WORDIR/results/extra
    # milabench install --system $MILABENCH_WORDIR/system.yaml
-    milabench prepare --system $MILABENCH_WORDIR/system.yaml $ARGS
+    milabench prepare # --system $MILABENCH_WORDIR/system.yaml $ARGS

    (
        . $BENCHMARK_VENV/bin/activate
@@ -117,7 +117,7 @@ if [ "$MILABENCH_PREPARE" -eq 0 ]; then
    #     milabench run --run-name "c$CAPACITY.{time}" --system $MILABENCH_WORDIR/system.yaml $ARGS || true
    # done

-    milabench run --system $MILABENCH_WORDIR/system.yaml $ARGS
+    milabench run # --system $MILABENCH_WORDIR/system.yaml $ARGS

    #
    #   Display report

From a849fc2f6b2ed1204d5f3aa1f27640d5fbd99dd9 Mon Sep 17 00:00:00 2001
From: Your Name
Date: Wed, 30 Oct 2024 13:38:08 +0000
Subject: [PATCH 02/20] -

---
 milabench/system.py         |  1 +
 scripts/article/run_cuda.sh | 32 +++++++-------------------------
 2 files changed, 8 insertions(+), 25 deletions(-)

diff --git a/milabench/system.py b/milabench/system.py
index c237baf2..3a50d143 100644
--- a/milabench/system.py
+++ b/milabench/system.py
@@ -406,6 +406,7 @@ def resolve_hostname(ip):
         if is_loopback(ip):
             return hostname, True

+        return socket.gethostname(), hostname.startswith(socket.gethostname())
         return hostname, hostname == socket.gethostname()

     except:
diff --git a/scripts/article/run_cuda.sh b/scripts/article/run_cuda.sh
index 47b21313..e56b2da1 100644
--- a/scripts/article/run_cuda.sh
+++ b/scripts/article/run_cuda.sh
@@ -50,7 +50,7 @@ install_prepare() {
    #
    # pip install torch
    # milabench pin --variant cuda --from-scratch $ARGS
-    milabench install # --system $MILABENCH_WORDIR/system.yaml $ARGS
+    milabench install --system $MILABENCH_WORDIR/system.yaml $ARGS

    which pip
@@ -67,7 +67,7 @@ install_prepare() {
    #
    # Generate/download datasets, download models etc...
-    milabench prepare # --system $MILABENCH_WORDIR/system.yaml $ARGS
+    milabench prepare --system $MILABENCH_WORDIR/system.yaml $ARGS
 }

 # module load cuda/12.3.2
@@ -87,9 +87,10 @@ if [ "$MILABENCH_PREPARE" -eq 0 ]; then
    # pip install torch
    # milabench pin --variant cuda --from-scratch
    # rm -rf $MILABENCH_WORDIR/results/venv/
-    # rm -rf $MILABENCH_WORDIR/results/extra
-    # milabench install --system $MILABENCH_WORDIR/system.yaml
-    milabench prepare # --system $MILABENCH_WORDIR/system.yaml $ARGS
+    rm -rf $MILABENCH_WORDIR/results/extra
+
+    milabench install --system $MILABENCH_WORDIR/system.yaml
+    milabench prepare --system $MILABENCH_WORDIR/system.yaml $ARGS

    (
        . $BENCHMARK_VENV/bin/activate
@@ -98,26 +99,7 @@ if [ "$MILABENCH_PREPARE" -eq 0 ]; then
        # pip install torchao --no-input
    )

-    # pip install torch
-    # milabench pin --variant cuda --from-scratch
-    # milabench install --system $MILABENCH_WORDIR/system.yaml --force $ARGS
-    # milabench prepare --system $MILABENCH_WORDIR/system.yaml $ARGS
-
-    # ARGS="--select resnet50-noio,brax,lightning,dinov2-giant-single,dinov2-giant-gpus,llm-lora-ddp-gpus,llm-lora-ddp-nodes,llm-lora-mp-gpus,llm-full-mp-gpus,llm-full-mp-nodes,dqn,ppo,dimenet,llava-single,rlhf-single,rlhf-gpus,vjepa-single,vjepa-gpus"
-
-    # MEMORY_CAPACITY=("4Go" "8Go" "16Go" "32Go" "64Go" "80Go")
-    # # MEMORY_CAPACITY=("2048" "4096" "8192")
-
-    # # Run the benchmakrs
-    # for CAPACITY in "${MEMORY_CAPACITY[@]}"; do
-    #     export MILABENCH_SIZER_AUTO=1
-    #     export MILABENCH_SIZER_MULTIPLE=8
-    #     export MILABENCH_SIZER_CAPACITY=$CAPACITY
-    #     # export MILABENCH_SIZER_BATCH_SIZE=$CAPACITY
-    #     milabench run --run-name "c$CAPACITY.{time}" --system $MILABENCH_WORDIR/system.yaml $ARGS || true
-    # done
-
-    milabench run # --system $MILABENCH_WORDIR/system.yaml $ARGS
+    milabench run --system $MILABENCH_WORDIR/system.yaml $ARGS

    #
    #   Display report

From dafcbf94c4e5957c2db0cdbd592eb5923d39ad80 Mon Sep 17 00:00:00 2001
From: Your Name
Date: Wed, 30 Oct 2024 14:10:38 +0000
Subject: [PATCH 03/20] Ignore prepare & install runs for reports

---
 milabench/compare.py        | 3 +++
 scripts/article/run_cuda.sh | 9 +++++++--
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/milabench/compare.py b/milabench/compare.py
index cae06820..d4d6299e 100644
--- a/milabench/compare.py
+++ b/milabench/compare.py
@@ -27,6 +27,9 @@ def fetch_runs(folder, filter):
     runs = []
     ignored = 0
     for run in os.listdir(folder):
+        if run.startswith("install") or run.startswith("prepare"):
+            continue
+
         if filter is not None and (not fnmatch.fnmatch(run, filter)):
             ignored += 1
             continue
diff --git a/scripts/article/run_cuda.sh b/scripts/article/run_cuda.sh
index e56b2da1..12ffffe6 100644
--- a/scripts/article/run_cuda.sh
+++ b/scripts/article/run_cuda.sh
@@ -72,7 +72,7 @@ install_prepare() {

 # module load cuda/12.3.2

-if [ ! -d "$MILABENCH_WORDIR/results" ]; then
+if [ ! -d "$MILABENCH_WORDIR/env" ]; then
     install_prepare
 else
     echo "Reusing previous install"
@@ -104,4 +104,9 @@ if [ "$MILABENCH_PREPARE" -eq 0 ]; then
    #
    #   Display report
    milabench report --runs $MILABENCH_WORDIR/results/runs
-fi
\ No newline at end of file
+fi
+
+
+# rsync -av mila@172.29.171.42:~/rocm/results/cache ~/cuda/results/cache
+# rsync -av mila@172.29.171.42:~/rocm/results/data ~/cuda/results/data
+# rsync -av mila@172.29.171.42:~/rocm/results/cache ~/cuda/results/cache
\ No newline at end of file

From 783a13b4bd87db7fad8e848191a18557c40b6eb5 Mon Sep 17 00:00:00 2001
From: Setepenre
Date: Sat, 16 Nov 2024 13:06:45 -0500
Subject: [PATCH 04/20] Update report.py

---
 milabench/report.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/milabench/report.py b/milabench/report.py
index c54ed8dd..da68c80a 100644
--- a/milabench/report.py
+++ b/milabench/report.py
@@ -417,12 +417,17 @@ def _score(column):
         # This computes a weighted geometric mean

         # perf can be object np.float64 !?
-        perf = df[column].astype(float)
+        # success_ratio = 1 - row["fail"] / max(row["n"], 1)
+
+        # score = (acc if acc > 0 else row["perf"]) * success_ratio
+        score = df[column].astype(float)

         weights = df["weight"] * df["enabled"].astype(int)
-        weight_total = np.sum(weights)
+        # if total weight is 0 ?
+        weight_total = np.sum(weights)

-        logscore = np.sum(np.log(perf) * weights) / weight_total
+        # score cannot be 0
+        logscore = np.sum(np.log(score + 1) * weights) / weight_total
         return np.exp(logscore)
     except ZeroDivisionError:
         return 0
"Command": import milabench.commands as cmd - - pack = cmd.PackCommand(self, *self.argv, lazy=True) - # pack = cmd.VoirCommand(pack, cwd=main.parent) - pack = cmd.ActivatorCommand(pack) - return pack.use_stdout() - + + use_stdout = True + + if use_stdout: + main = self.dirs.code / self.main_script + pack = cmd.PackCommand(self, *self.argv, lazy=True) + return pack.use_stdout() + else: + main = self.dirs.code / self.main_script + pack = cmd.PackCommand(self, *self.argv, lazy=True) + pack = cmd.VoirCommand(pack, cwd=main.parent) + return pack __pack__ = FlopsBenchmarch diff --git a/benchmarks/flops/dev.yaml b/benchmarks/flops/dev.yaml new file mode 100644 index 00000000..5106bf40 --- /dev/null +++ b/benchmarks/flops/dev.yaml @@ -0,0 +1,56 @@ + + +_flops: + inherits: _defaults + definition: . + group: flops + install-variant: unpinned + install_group: torch + plan: + method: per_gpu + + tags: + - diagnostic + - flops + - monogpu + - nobatch + + argv: + --number: 30 + --repeat: 90 + + +fp16: + inherits: _flops + + argv: + --number: 30 + --repeat: 10 + --m: 8192 + --n: 8192 + --dtype: fp16 + +bf16: + inherits: _flops + + argv: + --m: 8192 + --n: 8192 + --dtype: bf16 + +tf32: + inherits: _flops + + argv: + --m: 8192 + --n: 8192 + --dtype: fp32 + --tf32: true + +fp32: + inherits: _flops + + argv: + --m: 256 + --n: 256 + --dtype: fp32 diff --git a/benchmarks/flops/dev/extra/flops/mark_torch b/benchmarks/flops/dev/extra/flops/mark_torch new file mode 100644 index 00000000..e69de29b diff --git a/benchmarks/flops/main.py b/benchmarks/flops/main.py index e4f05c17..ba03518a 100755 --- a/benchmarks/flops/main.py +++ b/benchmarks/flops/main.py @@ -109,11 +109,12 @@ def main(): log, monitor = setupvoir() + # FIXME + #with monitor: f(args.number, args.repeat, args.m, args.n, TERA, dtypes[args.dtype], log) monitor.stop() - if __name__ == "__main__": main() print("done") diff --git a/benchmarks/flops/requirements.cpu.txt b/benchmarks/flops/requirements.cpu.txt new file mode 100644 index 00000000..88f8b61e --- /dev/null +++ b/benchmarks/flops/requirements.cpu.txt @@ -0,0 +1,5 @@ +torch +torchvision +torchcompat +tqdm +voir diff --git a/benchmarks/flops/simple.sh b/benchmarks/flops/simple.sh new file mode 100644 index 00000000..3f54d424 --- /dev/null +++ b/benchmarks/flops/simple.sh @@ -0,0 +1,13 @@ + + + + +export MILABENCH_BASE="$(pwd)/dev" +export MILABENCH_CONFIG="$(pwd)/dev.yaml" + + +milabench install --select fp32 + +milabench prepare --select fp32 + +milabench run --select fp32 diff --git a/benchmarks/geo_gnn/modelsize.py b/benchmarks/geo_gnn/modelsize.py new file mode 100644 index 00000000..0b65655a --- /dev/null +++ b/benchmarks/geo_gnn/modelsize.py @@ -0,0 +1,36 @@ +from torch_geometric.nn.models import PNA as _PNA, DimeNet as _DimeNet + +import torch + +from benchmate.models import model_size + + +print(model_size(_DimeNet( + hidden_channels=64, + out_channels=1, + num_blocks=6, + num_bilinear=8, + num_spherical=7, + num_radial=6, + cutoff=10.0, + envelope_exponent=5, + num_before_skip=1, + num_after_skip=2, + num_output_layers=3, +) + +)) + +print(model_size( +_PNA( + # Basic GCNN setup + in_channels=1, + out_channels=1, + hidden_channels=64, + num_layers=64, + # https://pytorch-geometric.readthedocs.io/en/latest/generated/torch_geometric.nn.conv.PNAConv.html + aggregators=['mean', 'min', 'max', 'std'], + scalers=['identity', 'amplification', 'attenuation'], + # Histogram of in-degrees of nodes in the training set, used by scalers to normalize + deg=torch.tensor(4), +))) \ No newline at end of 
file diff --git a/benchmarks/purejaxrl/dqn.py b/benchmarks/purejaxrl/dqn.py index fc0a97b8..85e9e8b5 100644 --- a/benchmarks/purejaxrl/dqn.py +++ b/benchmarks/purejaxrl/dqn.py @@ -98,6 +98,11 @@ def train(rng): init_x = jnp.zeros(env.observation_space(env_params).shape) network_params = network.init(_rng, init_x) + + param_count = sum(x.size for x in jax.tree.leaves(network_params)) + print("PARAM COUNT", param_count) + + def linear_schedule(count): frac = 1.0 - (count / config["NUM_UPDATES"]) return config["LR"] * frac diff --git a/benchmarks/purejaxrl/ppo.py b/benchmarks/purejaxrl/ppo.py index 0cc8896c..a70e195e 100644 --- a/benchmarks/purejaxrl/ppo.py +++ b/benchmarks/purejaxrl/ppo.py @@ -107,9 +107,15 @@ def train(rng): network = ActorCritic( env.action_space(env_params).shape[0], activation=config["ACTIVATION"] ) + + rng, _rng = jax.random.split(rng) init_x = jnp.zeros(env.observation_space(env_params).shape) network_params = network.init(_rng, init_x) + + param_count = sum(x.size for x in jax.tree.leaves(network_params)) + print("PARAM COUNT", param_count) + if config["ANNEAL_LR"]: tx = optax.chain( optax.clip_by_global_norm(config["MAX_GRAD_NORM"]), diff --git a/benchmarks/recursiongfn/main.py b/benchmarks/recursiongfn/main.py index 81d08e8a..7099247d 100644 --- a/benchmarks/recursiongfn/main.py +++ b/benchmarks/recursiongfn/main.py @@ -92,11 +92,13 @@ def __init__( self.num_cond_dim = self.temperature_conditional.encoding_size() def _load_task_models(self): - xdg_cache = os.environ["XDG_CACHE_HOME"] + xdg_cache = os.environ.get("XDG_CACHE_HOME") model = bengio2021flow.load_original_model( cache=True, location=Path(os.path.join(xdg_cache, "bengio2021flow_proxy.pkl.gz")), ) + from benchmate.models import model_size + print(model_size(model)) model.to(get_worker_device()) model = self._wrap_model(model) return {"seh": model} diff --git a/benchmarks/torchatari/main.py b/benchmarks/torchatari/main.py index bf5b7ef6..898a8cab 100644 --- a/benchmarks/torchatari/main.py +++ b/benchmarks/torchatari/main.py @@ -201,7 +201,10 @@ def main(): envs = RecordEpisodeStatistics(envs) assert isinstance(envs.action_space, gym.spaces.Discrete), "only discrete action space is supported" + + from benchmate.models import model_size agent = Agent(envs).to(device) + print(model_size(agent)) optimizer = optim.Adam(agent.parameters(), lr=args.learning_rate, eps=1e-5) # ALGO Logic: Storage setup diff --git a/benchmate/benchmate/models.py b/benchmate/benchmate/models.py new file mode 100644 index 00000000..efd13e2c --- /dev/null +++ b/benchmate/benchmate/models.py @@ -0,0 +1,36 @@ + + +def model_summary(model, input_shape): + try: + from torchsummary import summary + + summary(model, input_shape) + except: + print("Could not print summary") + + +def model_size(model): + param_size = 0 + param_count = 0 + for param in model.parameters(): + param_count += param.nelement() + param_size += param.nelement() * param.element_size() + + buffer_size = 0 + buffer_count = 0 + for buff in model.buffers(): + buffer_count += buff.nelement() + buffer_size += buff.nelement() * buff.element_size() + + return { + "param": { + "count": param_count, + "size": param_size / 1024**2, + "unit": "MB" + }, + "buffer": { + "count": buffer_count, + "size": buffer_size / 1024**2, + "unit": "MB" + } + } diff --git a/benchmate/benchmate/monitor.py b/benchmate/benchmate/monitor.py index 0ad34a3d..5c63796e 100644 --- a/benchmate/benchmate/monitor.py +++ b/benchmate/benchmate/monitor.py @@ -13,7 +13,25 @@ from voir.instruments.io import 
io_monitor from voir.instruments.network import network_monitor from voir.instruments.monitor import monitor +from voir.helpers import current_overseer +from .metrics import sumggle_push, give_push, file_push + + +def auto_push(): + # Milabench managed: we need to push metrics to it + if int(os.getenv("MILABENCH_MANAGED", 0)) == 1: + + # Using voir, DATA_FD is defined as well + ov = current_overseer.get() + if ov is not None: + return ov.give + + # Not using Voir, using structured stdout + return sumggle_push() + + # Not using milabench; using stdout + return file_push() @instrument_definition @@ -41,16 +59,10 @@ def monitor_node(ov, poll_interval=1, arch=None): def _smuggle_monitor(poll_interval=10, worker_init=None, **monitors): - data_file = SmuggleWriter(sys.stdout) + log = auto_push() + def mblog(data): - nonlocal data_file - - if data_file is not None: - try: - print(json.dumps(data), file=data_file) - except ValueError: - pass - # print("Is bench ending?, ignoring ValueError") + log(**data) def get(): t = time.time() diff --git a/milabench/_version.py b/milabench/_version.py index cdd2418d..5f55a16d 100644 --- a/milabench/_version.py +++ b/milabench/_version.py @@ -1,5 +1,5 @@ """This file is generated, do not modify""" -__tag__ = "v1.0.0_RC1-9-g6d1e1140" -__commit__ = "6d1e114000cc4200ea307330032234db6696e40d" -__date__ = "2024-09-30 14:39:43 -0400" +__tag__ = "v0.1.0-129-ga60a3aa" +__commit__ = "a60a3aae21e87e46bcce403620a3f56c12878554" +__date__ = "2024-11-06 22:52:12 -0500" diff --git a/milabench/pack.py b/milabench/pack.py index 1cdde093..20feca39 100644 --- a/milabench/pack.py +++ b/milabench/pack.py @@ -335,6 +335,8 @@ def make_env(self): f"MILABENCH_DIR_{name.upper()}": path for name, path in self.config["dirs"].items() } + + env["MILABENCH_MANAGED"] = "1" env["OMP_NUM_THREADS"] = resolve_placeholder(self, "{cpu_per_gpu}") From 8148e5388cd3ecbc2b6ea8e4c2a4efe6c63333f8 Mon Sep 17 00:00:00 2001 From: Pierre Delaunay Date: Thu, 21 Nov 2024 13:02:00 -0500 Subject: [PATCH 06/20] Tweaks --- benchmarks/flops/benchfile.py | 8 +++----- benchmate/benchmate/monitor.py | 13 +++++++++---- milabench/commands/executors.py | 3 +++ milabench/report.py | 4 ++-- 4 files changed, 17 insertions(+), 11 deletions(-) diff --git a/benchmarks/flops/benchfile.py b/benchmarks/flops/benchfile.py index 9117caf0..3090a3c7 100644 --- a/benchmarks/flops/benchfile.py +++ b/benchmarks/flops/benchfile.py @@ -8,16 +8,14 @@ class FlopsBenchmarch(Package): def build_run_plan(self) -> "Command": import milabench.commands as cmd - + main = self.dirs.code / self.main_script + pack = cmd.PackCommand(self, *self.argv, lazy=True) + use_stdout = True if use_stdout: - main = self.dirs.code / self.main_script - pack = cmd.PackCommand(self, *self.argv, lazy=True) return pack.use_stdout() else: - main = self.dirs.code / self.main_script - pack = cmd.PackCommand(self, *self.argv, lazy=True) pack = cmd.VoirCommand(pack, cwd=main.parent) return pack diff --git a/benchmate/benchmate/monitor.py b/benchmate/benchmate/monitor.py index 5c63796e..294d0c88 100644 --- a/benchmate/benchmate/monitor.py +++ b/benchmate/benchmate/monitor.py @@ -19,16 +19,21 @@ def auto_push(): + # use_stdout = int(os.getenv("MILABENCH_USE_STDOUT", 0)) + mb_managed = int(os.getenv("MILABENCH_MANAGED", 0)) + # Milabench managed: we need to push metrics to it - if int(os.getenv("MILABENCH_MANAGED", 0)) == 1: - + if mb_managed == 1: # Using voir, DATA_FD is defined as well ov = current_overseer.get() if ov is not None: return ov.give - + # Not using Voir, 
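The `benchmate.models.model_size` helper added in this patch returns parameter and buffer counts together with their memory footprint in MB. A hypothetical usage sketch (the `torch.nn.Linear` stand-in is illustrative only, not one of the benchmark models):

```python
import torch.nn as nn

from benchmate.models import model_size

model = nn.Linear(1024, 1024)  # stand-in for a real benchmark model
stats = model_size(model)

# 1024*1024 weights + 1024 biases = 1,049,600 fp32 params, ~4.0 MB
print(stats["param"]["count"], stats["param"]["size"], stats["param"]["unit"])
print(stats["buffer"]["count"])  # Linear has no buffers, so 0
```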
From 8148e5388cd3ecbc2b6ea8e4c2a4efe6c63333f8 Mon Sep 17 00:00:00 2001
From: Pierre Delaunay
Date: Thu, 21 Nov 2024 13:02:00 -0500
Subject: [PATCH 06/20] Tweaks

---
 benchmarks/flops/benchfile.py   |  8 +++-----
 benchmate/benchmate/monitor.py  | 13 +++++++++----
 milabench/commands/executors.py |  3 +++
 milabench/report.py             |  4 ++--
 4 files changed, 17 insertions(+), 11 deletions(-)

diff --git a/benchmarks/flops/benchfile.py b/benchmarks/flops/benchfile.py
index 9117caf0..3090a3c7 100644
--- a/benchmarks/flops/benchfile.py
+++ b/benchmarks/flops/benchfile.py
@@ -8,16 +8,14 @@ class FlopsBenchmarch(Package):
     def build_run_plan(self) -> "Command":
         import milabench.commands as cmd
-
+        main = self.dirs.code / self.main_script
+        pack = cmd.PackCommand(self, *self.argv, lazy=True)
+
         use_stdout = True

         if use_stdout:
-            main = self.dirs.code / self.main_script
-            pack = cmd.PackCommand(self, *self.argv, lazy=True)
             return pack.use_stdout()
         else:
-            main = self.dirs.code / self.main_script
-            pack = cmd.PackCommand(self, *self.argv, lazy=True)
             pack = cmd.VoirCommand(pack, cwd=main.parent)
             return pack
diff --git a/benchmate/benchmate/monitor.py b/benchmate/benchmate/monitor.py
index 5c63796e..294d0c88 100644
--- a/benchmate/benchmate/monitor.py
+++ b/benchmate/benchmate/monitor.py
@@ -19,16 +19,21 @@


 def auto_push():
+    # use_stdout = int(os.getenv("MILABENCH_USE_STDOUT", 0))
+    mb_managed = int(os.getenv("MILABENCH_MANAGED", 0))
+
     # Milabench managed: we need to push metrics to it
-    if int(os.getenv("MILABENCH_MANAGED", 0)) == 1:
-
+    if mb_managed == 1:
         # Using voir, DATA_FD is defined as well
         ov = current_overseer.get()
         if ov is not None:
             return ov.give
-
+
         # Not using Voir, using structured stdout
-        return sumggle_push()
+        if int(os.getenv("MILABENCH_USE_STDOUT", 0)) == 1:
+            return sumggle_push()
+
+        raise RuntimeError("Could not find something to push to")

     # Not using milabench; using stdout
     return file_push()
diff --git a/milabench/commands/executors.py b/milabench/commands/executors.py
index f0402d29..807a261e 100644
--- a/milabench/commands/executors.py
+++ b/milabench/commands/executors.py
@@ -32,6 +32,9 @@ async def execute(pack, *args, cwd=None, env={}, external=False, use_stdout=Fals
     sized_args = scale_argv(pack, args)
     final_args = resolve_argv(pack, sized_args)

+    if use_stdout:
+        exec_env["MILABENCH_USE_STDOUT"] = "1"
+
     return await run(
         final_args,
         **kwargs,
diff --git a/milabench/report.py b/milabench/report.py
index c54ed8dd..bdc4999c 100644
--- a/milabench/report.py
+++ b/milabench/report.py
@@ -525,12 +525,12 @@ def pandas_to_string(df, formatters=_formatters):
     # Compute column size
     col_size = defaultdict(int)
     for index, row in df.iterrows():
-        col_size["bench"] = max(col_size["bench"], len(index))
+        col_size["bench"] = max(col_size["bench"], len(index), len("bench"))
         for col, val in zip(columns, row):
             fmt = formatters.get(col)
             if fmt is not None:
                 val = fmt(val)
-            col_size[col] = max(col_size[col], len(val))
+            col_size[col] = max(col_size[col], len(val), len(col))

     # Generate report
     sep = " | "
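After this patch the metric sink is resolved from two environment variables: `MILABENCH_MANAGED` (set in `milabench/pack.py` by the previous patch) and `MILABENCH_USE_STDOUT` (set in `executors.py` above when a command opts into stdout). A condensed sketch of the dispatch order `auto_push` now follows; the function name and return labels here are illustrative, only the environment variables and the fallback order come from the hunks above:

```python
import os

def choose_sink(overseer=None):
    """Sketch of auto_push()'s dispatch order after this patch."""
    if int(os.getenv("MILABENCH_MANAGED", 0)) == 1:
        if overseer is not None:       # running under voir: push via ov.give
            return "overseer"
        if int(os.getenv("MILABENCH_USE_STDOUT", 0)) == 1:
            return "smuggled-stdout"   # sumggle_push()
        raise RuntimeError("Could not find something to push to")
    return "file"                      # file_push(): not managed by milabench
```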
From 3d2718031897bcc5eac93e1304442ba528f5840c Mon Sep 17 00:00:00 2001
From: Pierre Delaunay
Date: Thu, 21 Nov 2024 13:04:41 -0500
Subject: [PATCH 07/20] Revert bad commit

---
 benchmarks/diffusion/main.py                |  5 --
 benchmarks/flops/benchfile.py               | 14 ++----
 benchmarks/flops/dev.yaml                   | 56 ---------------------
 benchmarks/flops/dev/extra/flops/mark_torch |  0
 benchmarks/flops/main.py                    |  3 +-
 benchmarks/flops/requirements.cpu.txt       |  5 --
 benchmarks/flops/simple.sh                  | 13 -----
 benchmarks/geo_gnn/modelsize.py             | 36 --------------
 benchmarks/purejaxrl/dqn.py                 |  5 --
 benchmarks/purejaxrl/ppo.py                 |  6 ---
 benchmarks/recursiongfn/main.py             |  4 +-
 benchmarks/torchatari/main.py               |  3 --
 benchmate/benchmate/models.py               | 36 --------------
 benchmate/benchmate/monitor.py              | 35 ++++---------
 milabench/_version.py                       |  6 +--
 milabench/commands/executors.py             |  3 ---
 milabench/pack.py                           |  2 -
 milabench/report.py                         |  4 ++--
 18 files changed, 21 insertions(+), 215 deletions(-)
 delete mode 100644 benchmarks/flops/dev.yaml
 delete mode 100644 benchmarks/flops/dev/extra/flops/mark_torch
 delete mode 100644 benchmarks/flops/requirements.cpu.txt
 delete mode 100644 benchmarks/flops/simple.sh
 delete mode 100644 benchmarks/geo_gnn/modelsize.py
 delete mode 100644 benchmate/benchmate/models.py

diff --git a/benchmarks/diffusion/main.py b/benchmarks/diffusion/main.py
index c5b7757e..0bcb67d5 100755
--- a/benchmarks/diffusion/main.py
+++ b/benchmarks/diffusion/main.py
@@ -57,11 +57,6 @@ def models(accelerator, args: Arguments):
     unet = UNet2DConditionModel.from_pretrained(
         args.model, subfolder="unet", revision=args.revision, variant=args.variant
     )
-
-    from benchmate.models import model_size
-    print(model_size(unet))
-    print(model_size(encoder))
-    print(model_size(vae))

     vae.requires_grad_(False)
     encoder.requires_grad_(False)
diff --git a/benchmarks/flops/benchfile.py b/benchmarks/flops/benchfile.py
index 3090a3c7..59c5c4a7 100644
--- a/benchmarks/flops/benchfile.py
+++ b/benchmarks/flops/benchfile.py
@@ -8,16 +8,11 @@ class FlopsBenchmarch(Package):
     def build_run_plan(self) -> "Command":
         import milabench.commands as cmd
-        main = self.dirs.code / self.main_script
-        pack = cmd.PackCommand(self, *self.argv, lazy=True)
-
-        use_stdout = True
-
-        if use_stdout:
-            return pack.use_stdout()
-        else:
-            pack = cmd.VoirCommand(pack, cwd=main.parent)
-            return pack
+
+        pack = cmd.PackCommand(self, *self.argv, lazy=True)
+        # pack = cmd.VoirCommand(pack, cwd=main.parent)
+        pack = cmd.ActivatorCommand(pack)
+        return pack.use_stdout()
+

 __pack__ = FlopsBenchmarch
diff --git a/benchmarks/flops/dev.yaml b/benchmarks/flops/dev.yaml
deleted file mode 100644
index 5106bf40..00000000
--- a/benchmarks/flops/dev.yaml
+++ /dev/null
@@ -1,56 +0,0 @@
-
-
-_flops:
-  inherits: _defaults
-  definition: .
-  group: flops
-  install-variant: unpinned
-  install_group: torch
-  plan:
-    method: per_gpu
-
-  tags:
-    - diagnostic
-    - flops
-    - monogpu
-    - nobatch
-
-  argv:
-    --number: 30
-    --repeat: 90
-
-
-fp16:
-  inherits: _flops
-
-  argv:
-    --number: 30
-    --repeat: 10
-    --m: 8192
-    --n: 8192
-    --dtype: fp16
-
-bf16:
-  inherits: _flops
-
-  argv:
-    --m: 8192
-    --n: 8192
-    --dtype: bf16
-
-tf32:
-  inherits: _flops
-
-  argv:
-    --m: 8192
-    --n: 8192
-    --dtype: fp32
-    --tf32: true
-
-fp32:
-  inherits: _flops
-
-  argv:
-    --m: 256
-    --n: 256
-    --dtype: fp32
diff --git a/benchmarks/flops/dev/extra/flops/mark_torch b/benchmarks/flops/dev/extra/flops/mark_torch
deleted file mode 100644
index e69de29b..00000000
diff --git a/benchmarks/flops/main.py b/benchmarks/flops/main.py
index ba03518a..e4f05c17 100755
--- a/benchmarks/flops/main.py
+++ b/benchmarks/flops/main.py
@@ -109,12 +109,11 @@ def main():

     log, monitor = setupvoir()

-    # FIXME
-    #with monitor:
     f(args.number, args.repeat, args.m, args.n, TERA, dtypes[args.dtype], log)

     monitor.stop()

+
 if __name__ == "__main__":
     main()
     print("done")
diff --git a/benchmarks/flops/requirements.cpu.txt b/benchmarks/flops/requirements.cpu.txt
deleted file mode 100644
index 88f8b61e..00000000
--- a/benchmarks/flops/requirements.cpu.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-torch
-torchvision
-torchcompat
-tqdm
-voir
diff --git a/benchmarks/flops/simple.sh b/benchmarks/flops/simple.sh
deleted file mode 100644
index 3f54d424..00000000
--- a/benchmarks/flops/simple.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-
-
-
-
-export MILABENCH_BASE="$(pwd)/dev"
-export MILABENCH_CONFIG="$(pwd)/dev.yaml"
-
-
-milabench install --select fp32
-
-milabench prepare --select fp32
-
-milabench run --select fp32
diff --git a/benchmarks/geo_gnn/modelsize.py b/benchmarks/geo_gnn/modelsize.py
deleted file mode 100644
index 0b65655a..00000000
--- a/benchmarks/geo_gnn/modelsize.py
+++ /dev/null
@@ -1,36 +0,0 @@
-from torch_geometric.nn.models import PNA as _PNA, DimeNet as _DimeNet
-
-import torch
-
-from benchmate.models import model_size
-
-
-print(model_size(_DimeNet(
-    hidden_channels=64,
-    out_channels=1,
-    num_blocks=6,
-    num_bilinear=8,
-    num_spherical=7,
-    num_radial=6,
-    cutoff=10.0,
-    envelope_exponent=5,
-    num_before_skip=1,
-    num_after_skip=2,
-    num_output_layers=3,
-)
-
-))
-
-print(model_size(
-_PNA(
-    # Basic GCNN setup
-    in_channels=1,
-    out_channels=1,
-    hidden_channels=64,
-    num_layers=64,
-    # https://pytorch-geometric.readthedocs.io/en/latest/generated/torch_geometric.nn.conv.PNAConv.html
-    aggregators=['mean', 'min', 'max', 'std'],
-    scalers=['identity', 'amplification', 'attenuation'],
-    # Histogram of in-degrees of nodes in the training set, used by scalers to normalize
-    deg=torch.tensor(4),
-)))
\ No newline at end of file
diff --git a/benchmarks/purejaxrl/dqn.py b/benchmarks/purejaxrl/dqn.py
index 85e9e8b5..fc0a97b8 100644
--- a/benchmarks/purejaxrl/dqn.py
+++ b/benchmarks/purejaxrl/dqn.py
@@ -98,11 +98,6 @@ def train(rng):
         init_x = jnp.zeros(env.observation_space(env_params).shape)
         network_params = network.init(_rng, init_x)
-
-        param_count = sum(x.size for x in jax.tree.leaves(network_params))
-        print("PARAM COUNT", param_count)
-
-
         def linear_schedule(count):
             frac = 1.0 - (count / config["NUM_UPDATES"])
             return config["LR"] * frac
diff --git a/benchmarks/purejaxrl/ppo.py b/benchmarks/purejaxrl/ppo.py
index a70e195e..0cc8896c 100644
--- a/benchmarks/purejaxrl/ppo.py
+++ b/benchmarks/purejaxrl/ppo.py
@@ -107,15 +107,9 @@ def train(rng):
         network = ActorCritic(
             env.action_space(env_params).shape[0], activation=config["ACTIVATION"]
         )
-
-
         rng, _rng = jax.random.split(rng)
         init_x = jnp.zeros(env.observation_space(env_params).shape)
         network_params = network.init(_rng, init_x)
-
-        param_count = sum(x.size for x in jax.tree.leaves(network_params))
-        print("PARAM COUNT", param_count)
-
         if config["ANNEAL_LR"]:
             tx = optax.chain(
                 optax.clip_by_global_norm(config["MAX_GRAD_NORM"]),
diff --git a/benchmarks/recursiongfn/main.py b/benchmarks/recursiongfn/main.py
index 7099247d..81d08e8a 100644
--- a/benchmarks/recursiongfn/main.py
+++ b/benchmarks/recursiongfn/main.py
@@ -92,13 +92,11 @@ def __init__(
         self.num_cond_dim = self.temperature_conditional.encoding_size()

     def _load_task_models(self):
-        xdg_cache = os.environ.get("XDG_CACHE_HOME")
+        xdg_cache = os.environ["XDG_CACHE_HOME"]
         model = bengio2021flow.load_original_model(
             cache=True,
             location=Path(os.path.join(xdg_cache, "bengio2021flow_proxy.pkl.gz")),
         )
-        from benchmate.models import model_size
-        print(model_size(model))
         model.to(get_worker_device())
         model = self._wrap_model(model)
         return {"seh": model}
diff --git a/benchmarks/torchatari/main.py b/benchmarks/torchatari/main.py
index 898a8cab..bf5b7ef6 100644
--- a/benchmarks/torchatari/main.py
+++ b/benchmarks/torchatari/main.py
@@ -201,10 +201,7 @@ def main():
         envs = RecordEpisodeStatistics(envs)
         assert isinstance(envs.action_space, gym.spaces.Discrete), "only discrete action space is supported"

-
-    from benchmate.models import model_size
     agent = Agent(envs).to(device)
-    print(model_size(agent))
     optimizer = optim.Adam(agent.parameters(), lr=args.learning_rate, eps=1e-5)

     # ALGO Logic: Storage setup
diff --git a/benchmate/benchmate/models.py b/benchmate/benchmate/models.py
deleted file mode 100644
index efd13e2c..00000000
--- a/benchmate/benchmate/models.py
+++ /dev/null
@@ -1,36 +0,0 @@
-
-
-def model_summary(model, input_shape):
-    try:
-        from torchsummary import summary
-
-        summary(model, input_shape)
-    except:
-        print("Could not print summary")
-
-
-def model_size(model):
-    param_size = 0
-    param_count = 0
-    for param in model.parameters():
-        param_count += param.nelement()
-        param_size += param.nelement() * param.element_size()
-
-    buffer_size = 0
-    buffer_count = 0
-    for buff in model.buffers():
-        buffer_count += buff.nelement()
-        buffer_size += buff.nelement() * buff.element_size()
-
-    return {
-        "param": {
-            "count": param_count,
-            "size": param_size / 1024**2,
-            "unit": "MB"
-        },
-        "buffer": {
-            "count": buffer_count,
-            "size": buffer_size / 1024**2,
-            "unit": "MB"
-        }
-    }
diff --git a/benchmate/benchmate/monitor.py b/benchmate/benchmate/monitor.py
index 294d0c88..0ad34a3d 100644
--- a/benchmate/benchmate/monitor.py
+++ b/benchmate/benchmate/monitor.py
@@ -13,30 +13,7 @@
 from voir.instruments.io import io_monitor
 from voir.instruments.network import network_monitor
 from voir.instruments.monitor import monitor
-from voir.helpers import current_overseer
-
-from .metrics import sumggle_push, give_push, file_push
-
-
-def auto_push():
-    # use_stdout = int(os.getenv("MILABENCH_USE_STDOUT", 0))
-    mb_managed = int(os.getenv("MILABENCH_MANAGED", 0))
-
-    # Milabench managed: we need to push metrics to it
-    if mb_managed == 1:
-        # Using voir, DATA_FD is defined as well
-        ov = current_overseer.get()
-        if ov is not None:
-            return ov.give
-
-        # Not using Voir, using structured stdout
-        if int(os.getenv("MILABENCH_USE_STDOUT", 0)) == 1:
-            return sumggle_push()
-
-        raise RuntimeError("Could not find something to push to")
-
-    # Not using milabench; using stdout
-    return file_push()


 @instrument_definition
@@ -64,10 +41,16 @@ def monitor_node(ov, poll_interval=1, arch=None):


 def _smuggle_monitor(poll_interval=10, worker_init=None, **monitors):
-    log = auto_push()
-
+    data_file = SmuggleWriter(sys.stdout)
     def mblog(data):
-        log(**data)
+        nonlocal data_file
+
+        if data_file is not None:
+            try:
+                print(json.dumps(data), file=data_file)
+            except ValueError:
+                pass
+            # print("Is bench ending?, ignoring ValueError")

     def get():
         t = time.time()
diff --git a/milabench/_version.py b/milabench/_version.py
index 5f55a16d..cdd2418d 100644
--- a/milabench/_version.py
+++ b/milabench/_version.py
@@ -1,5 +1,5 @@
 """This file is generated, do not modify"""

-__tag__ = "v0.1.0-129-ga60a3aa"
-__commit__ = "a60a3aae21e87e46bcce403620a3f56c12878554"
-__date__ = "2024-11-06 22:52:12 -0500"
+__tag__ = "v1.0.0_RC1-9-g6d1e1140"
+__commit__ = "6d1e114000cc4200ea307330032234db6696e40d"
+__date__ = "2024-09-30 14:39:43 -0400"
diff --git a/milabench/commands/executors.py b/milabench/commands/executors.py
index 807a261e..f0402d29 100644
--- a/milabench/commands/executors.py
+++ b/milabench/commands/executors.py
@@ -32,9 +32,6 @@ async def execute(pack, *args, cwd=None, env={}, external=False, use_stdout=Fals
     sized_args = scale_argv(pack, args)
     final_args = resolve_argv(pack, sized_args)

-    if use_stdout:
-        exec_env["MILABENCH_USE_STDOUT"] = "1"
-
     return await run(
         final_args,
         **kwargs,
diff --git a/milabench/pack.py b/milabench/pack.py
index 20feca39..1cdde093 100644
--- a/milabench/pack.py
+++ b/milabench/pack.py
@@ -335,8 +335,6 @@ def make_env(self):
             f"MILABENCH_DIR_{name.upper()}": path
             for name, path in self.config["dirs"].items()
         }
-
-        env["MILABENCH_MANAGED"] = "1"

         env["OMP_NUM_THREADS"] = resolve_placeholder(self, "{cpu_per_gpu}")
diff --git a/milabench/report.py b/milabench/report.py
index bdc4999c..c54ed8dd 100644
--- a/milabench/report.py
+++ b/milabench/report.py
@@ -525,12 +525,12 @@ def pandas_to_string(df, formatters=_formatters):
     # Compute column size
     col_size = defaultdict(int)
     for index, row in df.iterrows():
-        col_size["bench"] = max(col_size["bench"], len(index), len("bench"))
+        col_size["bench"] = max(col_size["bench"], len(index))
         for col, val in zip(columns, row):
             fmt = formatters.get(col)
             if fmt is not None:
                 val = fmt(val)
-            col_size[col] = max(col_size[col], len(val), len(col))
+            col_size[col] = max(col_size[col], len(val))

     # Generate report
     sep = " | "

From 3d7d5f108d17a9ee8381b51eb0380964acf1e340 Mon Sep 17 00:00:00 2001
From: Pierre Delaunay
Date: Thu, 21 Nov 2024 13:02:00 -0500
Subject: [PATCH 08/20] Tweaks

---
 benchmarks/flops/benchfile.py               |  8 +++-----
 benchmarks/flops/dev/extra/flops/mark_torch |  0
 benchmate/benchmate/monitor.py              | 13 +++++++++----
 milabench/commands/executors.py             |  3 +++
 milabench/report.py                         |  4 ++--
 5 files changed, 17 insertions(+), 11 deletions(-)
 delete mode 100644 benchmarks/flops/dev/extra/flops/mark_torch

diff --git a/benchmarks/flops/benchfile.py b/benchmarks/flops/benchfile.py
index 9117caf0..3090a3c7 100644
--- a/benchmarks/flops/benchfile.py
+++ b/benchmarks/flops/benchfile.py
@@ -8,16 +8,14 @@ class FlopsBenchmarch(Package):
     def build_run_plan(self) -> "Command":
         import milabench.commands as cmd
-
+        main = self.dirs.code / self.main_script
+        pack = cmd.PackCommand(self, *self.argv, lazy=True)
+
         use_stdout = True

         if use_stdout:
-            main = self.dirs.code / self.main_script
-            pack = cmd.PackCommand(self, *self.argv, lazy=True)
             return pack.use_stdout()
         else:
-            main = self.dirs.code / self.main_script
-            pack = cmd.PackCommand(self, *self.argv, lazy=True)
             pack = cmd.VoirCommand(pack, cwd=main.parent)
             return pack
diff --git a/benchmarks/flops/dev/extra/flops/mark_torch b/benchmarks/flops/dev/extra/flops/mark_torch
deleted file mode 100644
index e69de29b..00000000
diff --git a/benchmate/benchmate/monitor.py b/benchmate/benchmate/monitor.py
index 5c63796e..294d0c88 100644
--- a/benchmate/benchmate/monitor.py
+++ b/benchmate/benchmate/monitor.py
@@ -19,16 +19,21 @@


 def auto_push():
+    # use_stdout = int(os.getenv("MILABENCH_USE_STDOUT", 0))
+    mb_managed = int(os.getenv("MILABENCH_MANAGED", 0))
+
     # Milabench managed: we need to push metrics to it
-    if int(os.getenv("MILABENCH_MANAGED", 0)) == 1:
-
+    if mb_managed == 1:
         # Using voir, DATA_FD is defined as well
         ov = current_overseer.get()
         if ov is not None:
             return ov.give
-
+
         # Not using Voir, using structured stdout
-        return sumggle_push()
+        if int(os.getenv("MILABENCH_USE_STDOUT", 0)) == 1:
+            return sumggle_push()
+
+        raise RuntimeError("Could not find something to push to")

     # Not using milabench; using stdout
     return file_push()
diff --git a/milabench/commands/executors.py b/milabench/commands/executors.py
index f0402d29..807a261e 100644
--- a/milabench/commands/executors.py
+++ b/milabench/commands/executors.py
@@ -32,6 +32,9 @@ async def execute(pack, *args, cwd=None, env={}, external=False, use_stdout=Fals
     sized_args = scale_argv(pack, args)
     final_args = resolve_argv(pack, sized_args)

+    if use_stdout:
+        exec_env["MILABENCH_USE_STDOUT"] = "1"
+
     return await run(
         final_args,
         **kwargs,
diff --git a/milabench/report.py b/milabench/report.py
index c54ed8dd..bdc4999c 100644
--- a/milabench/report.py
+++ b/milabench/report.py
@@ -525,12 +525,12 @@ def pandas_to_string(df, formatters=_formatters):
     # Compute column size
     col_size = defaultdict(int)
     for index, row in df.iterrows():
-        col_size["bench"] = max(col_size["bench"], len(index))
+        col_size["bench"] = max(col_size["bench"], len(index), len("bench"))
         for col, val in zip(columns, row):
             fmt = formatters.get(col)
             if fmt is not None:
                 val = fmt(val)
-            col_size[col] = max(col_size[col], len(val))
+            col_size[col] = max(col_size[col], len(val), len(col))

     # Generate report
     sep = " | "

From 798a186891a466db96047bbbd6ee38076343175c Mon Sep 17 00:00:00 2001
From: Setepenre
Date: Thu, 21 Nov 2024 13:11:12 -0500
Subject: [PATCH 09/20] Hpu (#292)

* HPU changes

* HPU pins

* Use HPU device

* Gaudi Tweaks

* Remove pinnning

* call prepare again

* Add docker image for HPU

* Tweaks

* -
* -
* -
* -
* -
* -
* -
* -
* -
* -
* -
* -
* -
* -
* -
* -
* -
* -
* -
* -
* -
* -
* -
* -
* -
* -
* -
* -
* -
* -
* -

---------

Co-authored-by: Your Name
---
 .pin/constraints-hpu-torch.txt                  | 861 +++++++++++++-----
 benchmarks/brax/requirements.hpu.txt            | 108 +--
 benchmarks/diffusion/requirements.hpu.txt       | 381 ++++++++
 benchmarks/dinov2/requirements.hpu.txt          | 267 ++++++
 benchmarks/flops/requirements.hpu.txt           |  47 +-
 benchmarks/geo_gnn/requirements-pre.hpu.txt     |  99 ++
 benchmarks/geo_gnn/requirements.hpu.txt         | 321 +++++++
 benchmarks/huggingface/requirements.hpu.txt     |  62 +-
 benchmarks/lightning/main.py                    |  22 +-
 benchmarks/lightning/requirements.hpu.txt       | 285 ++++++
 benchmarks/llama/requirements.hpu.txt           |  88 +-
 benchmarks/llava/benchfile.py                   |   4 +-
 benchmarks/llava/main.py                        |   6 +-
 benchmarks/llava/requirements.hpu.txt           | 343 +++++++
 benchmarks/llm/configs/llama3_70B_full.yaml     |   8 +-
 .../llm/recipes/full_finetune_distributed.py    |  44 +-
 .../recipes/full_finetune_single_device.py      |  12 +-
 .../llm/recipes/lora_finetune_distributed.py    |  27 +-
 .../recipes/lora_finetune_single_device.py      |  31 +-
 .../ppo_full_finetune_single_device.py          |  10 +-
 benchmarks/llm/requirements.hpu.txt             | 408 +++++++++
 benchmarks/purejaxrl/requirements.hpu.txt       | 743 +++++++++++++++
 benchmarks/recursiongfn/requirements.hpu.txt    | 493 ++++++++++
 benchmarks/rlhf/main.py                         |  11 +
 benchmarks/rlhf/requirements.hpu.txt            | 362 ++++++++
 benchmarks/timm/requirements.hpu.txt            |  55 +-
 benchmarks/torchatari/requirements.hpu.txt      | 304 +++++++
 benchmarks/torchvision/requirements.hpu.txt     |  47 +-
 benchmarks/torchvision_ddp/requirements.hpu.txt | 205 +++++
 benchmarks/vjepa/benchfile.py                   |   4 +-
 benchmarks/vjepa/main.py                        |  15 +-
 benchmarks/vjepa/requirements.hpu.txt           | 297 ++++++
 config/base.yaml                                |  22 +-
 constraints/extra/torch.hpu.txt                 |   5 -
 constraints/hpu.txt                             |  16 +-
 docker/Dockerfile-hpu                           |  42 +
 docker/Makefile                                 |  17 +
 milabench/_version.py                           |   6 +-
 milabench/remote.py                             |   2 +-
 milabench/system.py                             |   2 +-
 scripts/article/run_hpu.sh                      |  84 +-
 41 files changed, 5584 insertions(+), 582 deletions(-)
 create mode 100644 benchmarks/diffusion/requirements.hpu.txt
 create mode 100644 benchmarks/dinov2/requirements.hpu.txt
 create mode 100644 benchmarks/geo_gnn/requirements-pre.hpu.txt
 create mode 100644 benchmarks/geo_gnn/requirements.hpu.txt
 create mode 100644 benchmarks/lightning/requirements.hpu.txt
 create mode 100644 benchmarks/llava/requirements.hpu.txt
 create mode 100644 benchmarks/llm/requirements.hpu.txt
 create mode 100644 benchmarks/purejaxrl/requirements.hpu.txt
 create mode 100644 benchmarks/recursiongfn/requirements.hpu.txt
 create mode 100644 benchmarks/rlhf/requirements.hpu.txt
 create mode 100644 benchmarks/torchatari/requirements.hpu.txt
 create mode 100644 benchmarks/vjepa/requirements.hpu.txt
 create mode 100644 docker/Dockerfile-hpu
 create mode 100644 docker/Makefile

diff --git a/.pin/constraints-hpu-torch.txt b/.pin/constraints-hpu-torch.txt
index 6481e8c6..92a55858 100644
--- a/.pin/constraints-hpu-torch.txt
+++ b/.pin/constraints-hpu-torch.txt
@@ -2,204 +2,359 @@
 # This file is autogenerated by pip-compile with Python 3.10
 # by the following command:
 #
-#    pip-compile --output-file=.pin/constraints-hpu-torch.txt .pin/tmp-constraints.txt benchmarks/accelerate_opt/requirements.in benchmarks/brax/requirements.in benchmarks/dlrm/requirements.in benchmarks/flops/requirements.in benchmarks/huggingface/requirements.in benchmarks/llama/requirements.in benchmarks/stargan/requirements.in benchmarks/super-slomo/requirements.in benchmarks/timm/requirements.in benchmarks/torchvision/requirements.in benchmarks/torchvision_ddp/requirements.in
+#    pip-compile --output-file=.pin/constraints-hpu-torch.txt .pin/tmp-constraints.txt benchmarks/brax/requirements.in benchmarks/diffusion/requirements.in benchmarks/dinov2/requirements.in benchmarks/flops/requirements.in benchmarks/geo_gnn/requirements-pre.in benchmarks/geo_gnn/requirements.in benchmarks/huggingface/requirements.in benchmarks/lightning/requirements.in benchmarks/llama/requirements.in benchmarks/llava/requirements.in benchmarks/llm/requirements.in benchmarks/purejaxrl/requirements.in benchmarks/recursiongfn/requirements.in benchmarks/rlhf/requirements.in benchmarks/timm/requirements.in benchmarks/torchatari/requirements.in benchmarks/torchvision/requirements.in benchmarks/torchvision_ddp/requirements.in benchmarks/vjepa/requirements.in constraints/extra/torch.hpu.txt
 #
---extra-index-url https://pypi.ngc.nvidia.com
---find-links https://storage.googleapis.com/jax-releases/jax_cuda_releases.html
---trusted-host pypi.ngc.nvidia.com
-
 absl-py==2.1.0
     # via
    #   brax
    #   chex
+    #   distrax
    #   dm-env
    #   ml-collections
    #   mujoco
    #   mujoco-mjx
    #   optax
    #   orbax-checkpoint
+    #   rlax
    #   tensorboard
-accelerate==0.32.1
-    # via -r benchmarks/accelerate_opt/requirements.in
-aiohttp==3.9.5
+    #   tensorflow-probability
+accelerate==0.34.2
+    # via
+    #   -r benchmarks/diffusion/requirements.in
+    #   -r benchmarks/llava/requirements.in
+    #   -r benchmarks/llm/requirements.in
+    #   -r benchmarks/rlhf/requirements.in
+    #   diffusers
+    #   trl
+aiohappyeyeballs==2.4.3
+    # via aiohttp
+aiohttp==3.10.8
     # via
    #   datasets
    #   fsspec
+    #   torch-geometric
 aiosignal==1.3.1
     # via aiohttp
-annotated-types==0.7.0
-    # via pydantic
 antlr4-python3-runtime==4.9.3
     # via omegaconf
+appdirs==1.4.4
+    # via cantilever
+argklass==1.4.4
+    # via
+    #   -r benchmarks/diffusion/requirements.in
+    #   -r benchmarks/llm/requirements.in
+    #   -r benchmarks/purejaxrl/requirements.in
+astroid==3.3.4
+    # via pylint
 asttokens==2.4.1
     # via giving
 async-timeout==4.0.3
     # via aiohttp
-attrs==23.2.0
+attrs==24.2.0
     # via aiohttp
-beautifulsoup4==4.12.3
-    # via gdown
+beartype==0.19.0
+    # via -r benchmarks/vjepa/requirements.in
+black==24.8.0
+    # via navix
 blinker==1.8.2
     # via flask
+blobfile==3.0.0
+    # via
+    #   -r benchmarks/llm/requirements.txt
+    #   torchtune
+blosc2==2.7.1
+    # via tables
+botorch==0.12.0
+    # via -r benchmarks/recursiongfn/requirements.in
+braceexpand==0.1.7
+    # via
+    #   -r benchmarks/vjepa/requirements.in
+    #   webdataset
 brax==0.10.5
-    # via -r benchmarks/brax/requirements.in
-certifi==2024.6.2
-    # via requests
+    # via
+    #   -r benchmarks/brax/requirements.in
+    #   -r benchmarks/purejaxrl/requirements.in
+cantilever==0.1.0
+    # via -r benchmarks/torchatari/requirements.in
+certifi==2024.8.30
+    # via
+    #   requests
+    #   sentry-sdk
 charset-normalizer==3.3.2
     # via requests
-chex==0.1.86
-    # via optax
+chex==0.1.87
+    # via
+    #   distrax
+    #   evosax
+    #   flashbax
+    #   gymnax
+    #   optax
+    #   rlax
 click==8.1.7
-    # via flask
+    # via
+    #   black
+    #   flask
+    #   wandb
 cloudpickle==3.0.0
-    # via gym
-codefind==0.1.6
+    # via
+    #   gym
+    #   gymnasium
+    #   submitit
+    #   tensorflow-probability
+codefind==0.1.7
     # via ptera
 contextlib2==21.6.0
     # via ml-collections
-datasets==2.20.0
-    # via
-    #   -r benchmarks/accelerate_opt/requirements.in
+contourpy==1.3.0
+    # via matplotlib
+cvxopt==1.3.2
+    # via -r benchmarks/recursiongfn/requirements.in
+cycler==0.12.1
+    # via matplotlib
+datasets==3.0.1
+    # via
+    #   -r benchmarks/diffusion/requirements.in
    #   -r benchmarks/llama/requirements.in
-    #   evaluate
-deepspeed==0.14.4
-    # via -r benchmarks/accelerate_opt/requirements.in
+    #   -r benchmarks/llava/requirements.in
+    #   -r benchmarks/rlhf/requirements.in
+    #   torchtune
+    #   trl
+decorator==5.1.1
+    # via tensorflow-probability
+decord==0.6.0
+    # via -r benchmarks/vjepa/requirements.in
+diffusers[torch]==0.30.3
+    # via -r benchmarks/diffusion/requirements.in
 dill==0.3.8
     # via
    #   datasets
-    #   evaluate
    #   multiprocess
+    #   pylint
+distrax==0.1.5
+    # via
+    #   -r benchmarks/purejaxrl/requirements.in
+    #   rlax
 dm-env==1.6
-    # via brax
+    # via
+    #   brax
+    #   envpool
+    #   rlax
 dm-tree==0.1.8
-    # via dm-env
-docker==7.1.0
-    # via torchx
+    # via
+    #   dm-env
+    #   tensorflow-probability
+docker-pycreds==0.4.0
+    # via wandb
 docstring-parser==0.16
-    # via torchx
-etils[epath,epy]==1.7.0
+    # via tyro
+dotmap==1.3.30
+    # via evosax
+einops==0.8.0
+    # via -r benchmarks/vjepa/requirements.in
+envpool==0.8.4
+    # via -r benchmarks/torchatari/requirements.in
+etils[epath,epy]==1.9.4
     # via
    #   brax
    #   mujoco
    #   mujoco-mjx
    #   optax
    #   orbax-checkpoint
-evaluate==0.4.2
-    # via -r benchmarks/accelerate_opt/requirements.in
-executing==1.2.0
+evosax==0.1.6
+    # via -r benchmarks/purejaxrl/requirements.in
+exceptiongroup==1.2.2
+    # via pytest
+executing==2.1.0
     # via varname
 fairscale==0.4.13
-    # via -r benchmarks/llama/requirements.in
-fbgemm-gpu==0.7.0
-    # via torchrec
-filelock==3.15.4
     # via
+    #   -r benchmarks/llama/requirements.in
+    #   -r benchmarks/llm/requirements.in
+    #   -r benchmarks/llm/requirements.txt
+farama-notifications==0.0.4
+    # via gymnasium
+filelock==3.16.1
+    # via
+    #   blobfile
    #   datasets
-    #   gdown
+    #   diffusers
    #   huggingface-hub
    #   torch
-    #   torchx
    #   transformers
    #   triton
-fire==0.6.0
-    # via -r benchmarks/llama/requirements.in
+fire==0.7.0
+    # via
+    #   -r benchmarks/llama/requirements.in
+    #   -r benchmarks/llm/requirements.txt
+flake8==7.1.1
+    # via navix
+flashbax==0.1.2
+    # via -r benchmarks/purejaxrl/requirements.in
 flask==3.0.3
     # via
    #   brax
    #   flask-cors
-flask-cors==4.0.1
-    # via brax
-flax==0.8.5
+flask-cors==5.0.0
     # via brax
+flax==0.9.0
+    # via
+    #   -r benchmarks/purejaxrl/requirements.in
+    #   brax
+    #   evosax
+    #   flashbax
+    #   gymnax
+    #   navix
+fonttools==4.54.1
+    # via matplotlib
 frozenlist==1.4.1
     # via
    #   aiohttp
    #   aiosignal
-fsspec[http]==2024.5.0
+fsspec[http]==2024.6.1
     # via
    #   datasets
    #   etils
-    #   evaluate
    #   huggingface-hub
+    #   lightning
+    #   pytorch-lightning
    #   torch
-    #   torchx
-future==1.0.0
-    # via -r benchmarks/dlrm/requirements.in
-gdown==5.2.0
-    # via -r benchmarks/stargan/requirements.in
-giving==0.4.2
+    #   torch-geometric
+fvcore==0.1.5.post20221221
+    # via -r benchmarks/dinov2/requirements.in
+gast==0.6.0
+    # via tensorflow-probability
+gitdb==4.0.11
+    # via gitpython
+gitpython==3.1.43
+    # via
+    #   -r benchmarks/recursiongfn/requirements.in
+    #   wandb
+giving==0.4.3
     # via
    #   ptera
    #   voir
 glfw==2.7.0
     # via mujoco
-graphviz==0.20.3
-    # via torchviz
-grpcio==1.65.1
+gpytorch==1.13
+    # via
+    #   -r benchmarks/recursiongfn/requirements.in
+    #   botorch
+grpcio==1.66.2
     # via
    #   brax
    #   tensorboard
 gym==0.26.2
-    # via brax
+    # via
+    #   -r benchmarks/torchatari/requirements.in
+    #   brax
+    #   envpool
+    #   gymnax
 gym-notices==0.0.8
     # via gym
+gymnasium==0.29.1
+    # via
+    #   envpool
+    #   gymnax
+gymnax==0.0.8
+    # via
+    #   -c .pin/../constraints/hpu.txt
+    #   -r benchmarks/purejaxrl/requirements.in
 hjson==3.1.0
-    # via deepspeed
-huggingface-hub==0.24.0
+    # via argklass
+huggingface-hub==0.25.1
     # via
    #   -r benchmarks/timm/requirements.in
    #   accelerate
    #   datasets
-    #   evaluate
+    #   diffusers
+    #   timm
    #   tokenizers
+    #   torchtune
    #   transformers
-idna==3.7
+humanize==4.10.0
+    # via orbax-checkpoint
+idna==3.10
     # via
    #   requests
    #   yarl
-importlib-metadata==8.0.0
-    # via torchx
-importlib-resources==6.4.0
+importlib-metadata==8.5.0
+    # via diffusers
+importlib-resources==6.4.5
     # via
+    #   argklass
+    #   cantilever
    #   etils
    #   torchcompat
+iniconfig==2.0.0
+    # via pytest
+iopath==0.1.10
+    # via
+    #   -r benchmarks/dinov2/requirements.in
+    #   fvcore
+isort==5.13.2
+    # via pylint
 itsdangerous==2.2.0
     # via flask
-jax[cuda12]==0.4.28
+jax==0.4.33
     # via
    #   -r benchmarks/brax/requirements.in
+    #   -r benchmarks/purejaxrl/requirements.in
    #   brax
    #   chex
+    #   distrax
+    #   evosax
+    #   flashbax
    #   flax
+    #   gymnax
    #   jaxopt
    #   mujoco-mjx
    #   optax
    #   orbax-checkpoint
-jax-cuda12-pjrt==0.4.28
-    # via jax-cuda12-plugin
-jax-cuda12-plugin==0.4.28
-    # via jax
-jaxlib==0.4.28+cuda12.cudnn89
+    #   rlax
+jaxlib==0.4.33
     # via
    #   brax
    #   chex
+    #   distrax
+    #   evosax
+    #   flashbax
+    #   gymnax
    #   jax
    #   jaxopt
    #   mujoco-mjx
    #   optax
    #   orbax-checkpoint
+    #   rlax
 jaxopt==0.8.3
     # via brax
+jaxtyping==0.2.19
+    # via
+    #   gpytorch
+    #   linear-operator
 jinja2==3.1.4
     # via
    #   brax
    #   flask
    #   torch
+    #   torch-geometric
 joblib==1.4.2
     # via scikit-learn
-lightning-utilities==0.11.5
-    # via torchmetrics
-markdown==3.6
+kiwisolver==1.4.7
+    # via matplotlib
+lightning==2.4.0
+    # via -r benchmarks/lightning/requirements.in
+lightning-utilities==0.11.7
+    # via
+    #   lightning
+    #   pytorch-lightning
+    #   torchmetrics
+linear-operator==0.5.3
+    # via
+    #   botorch
+    #   gpytorch
+lxml==5.3.0
+    # via blobfile
+markdown==3.7
     # via tensorboard
 markdown-it-py==3.0.0
     # via rich
 markupsafe==2.1.5
     # via
    #   jinja2
    #   werkzeug
+matplotlib==3.9.2
+    # via
+    #   evosax
+    #   gymnax
+    #   seaborn
+mccabe==0.7.0
+    # via
+    #   flake8
+    #   pylint
 mdurl==0.1.2
     # via markdown-it-py
 ml-collections==0.1.1
     # via brax
-ml-dtypes==0.4.0
+ml-dtypes==0.5.0
     # via
    #   jax
    #   jaxlib
    #   tensorstore
 mpmath==1.3.0
-    # via sympy
-msgpack==1.0.8
     # via
+    #   botorch
+    #   gpytorch
+    #   linear-operator
+    #   sympy
+msgpack==1.1.0
+    # via
+    #   blosc2
    #   flax
    #   orbax-checkpoint
-mujoco==3.2.0
+mujoco==3.2.3
     # via
    #   brax
    #   mujoco-mjx
-mujoco-mjx==3.2.0
+mujoco-mjx==3.2.3
     # via brax
-multidict==6.0.5
+multidict==6.1.0
     # via
    #   aiohttp
    #   yarl
+multipledispatch==1.0.0
+    # via botorch
 multiprocess==0.70.16
-    # via
-    #   datasets
-    #   evaluate
+    # via datasets
 mypy-extensions==1.0.0
-    # via typing-inspect
+    # via black
+navix==0.7.0
+    # via -r benchmarks/purejaxrl/requirements.in
+ndindex==1.9.2
+    # via blosc2
 nest-asyncio==1.6.0
     # via orbax-checkpoint
 networkx==3.3
-    # via torch
-ninja==1.11.1.1
-    # via deepspeed
+    # via
+    #   -r benchmarks/recursiongfn/requirements.in
+    #   torch
+numexpr==2.10.1
+    # via
+    #   blosc2
+    #   tables
 numpy==1.26.4
     # via
-    #   -r benchmarks/dlrm/requirements.in
-    #   -r benchmarks/stargan/requirements.in
-    #   -r benchmarks/super-slomo/requirements.in
+    #   -r benchmarks/geo_gnn/requirements.in
+    #   -r benchmarks/llava/requirements.in
+    #   -r benchmarks/purejaxrl/requirements.in
+    #   -r benchmarks/torchatari/requirements.in
+    #   -r benchmarks/vjepa/requirements.in
    #   accelerate
+    #   blosc2
    #   brax
    #   chex
+    #   contourpy
    #   datasets
-    #   deepspeed
+    #   decord
+    #   diffusers
+    #   distrax
    #   dm-env
-    #   evaluate
+    #   envpool
+    #   evosax
    #   fairscale
-    #   fbgemm-gpu
-    #   flax
+    #   flashbax
+    #   fvcore
    #   gym
+    #   gymnasium
    #   jax
    #   jaxlib
    #   jaxopt
+    #   jaxtyping
+    #   matplotlib
    #   ml-dtypes
    #   mujoco
-    #   onnx
+    #   navix
+    #   numexpr
    #   opencv-python
-    #   opt-einsum
    #   optax
    #   orbax-checkpoint
    #   pandas
    #   pyarrow
+    #   pyro-ppl
+    #   rdkit
+    #   rlax
    #   scikit-learn
    #   scipy
+    #   seaborn
+    #   tables
    #   tensorboard
    #   tensorboardx
+    #   tensorflow-probability
    #   tensorstore
+    #   torch-geometric
    #   torchmetrics
+    #   torchtune
    #   torchvision
    #   transformers
    #   trimesh
+    #   trl
+    #   webdataset
+    #   xformers
 nvidia-cublas-cu12==12.1.3.1
     # via
-    #   jax
    #   nvidia-cudnn-cu12
    #   nvidia-cusolver-cu12
    #   torch
 nvidia-cuda-cupti-cu12==12.1.105
-    # via
-    #   jax
-    #   torch
-nvidia-cuda-nvcc-cu12==12.5.82
-    # via
-    #   jax
-    #   jax-cuda12-plugin
+    # via torch
 nvidia-cuda-nvrtc-cu12==12.1.105
     # via torch
 nvidia-cuda-runtime-cu12==12.1.105
-    # via
-    #   jax
-    #   torch
-nvidia-cudnn-cu12==8.9.2.26
-    # via
-    #   jax
-    #   torch
+    # via torch
+nvidia-cudnn-cu12==9.1.0.70
+    # via torch
 nvidia-cufft-cu12==11.0.2.54
-    # via
-    #   jax
-    #   torch
+    # via torch
 nvidia-curand-cu12==10.3.2.106
     # via torch
 nvidia-cusolver-cu12==11.4.5.107
-    # via
-    #   jax
-    #   torch
+    # via torch
 nvidia-cusparse-cu12==12.1.0.106
     # via
-    #   jax
    #   nvidia-cusolver-cu12
    #   torch
-nvidia-ml-py==12.555.43
-    # via deepspeed
+nvidia-ml-py==12.560.30
+    # via voir
 nvidia-nccl-cu12==2.20.5
+    # via torch
+nvidia-nvjitlink-cu12==12.6.77
     # via
-    #   jax
-    #   torch
-nvidia-nvjitlink-cu12==12.5.82
-    # via
-    #   jax
    #   nvidia-cusolver-cu12
    #   nvidia-cusparse-cu12
 nvidia-nvtx-cu12==12.1.105
     # via torch
 omegaconf==2.3.0
-    # via voir
-onnx==1.16.1
-    # via -r benchmarks/dlrm/requirements.in
+    # via
+    #   -r benchmarks/dinov2/requirements.in
+    #   -r benchmarks/recursiongfn/requirements.in
+    #   torchtune
+    #   voir
 opencv-python==4.10.0.84
-    # via -r benchmarks/super-slomo/requirements.in
-opt-einsum==3.3.0
-    # via jax
+    # via -r benchmarks/vjepa/requirements.in
+opt-einsum==3.4.0
+    # via
+    #   jax
+    #   pyro-ppl
 optax==0.2.3
     # via
+    #   -r benchmarks/purejaxrl/requirements.in
    #   brax
    #   flax
-orbax-checkpoint==0.5.21
+optree==0.13.0
+    # via envpool
+orbax-checkpoint==0.6.4
     # via
    #   brax
    #   flax
-ovld==0.3.5
+ovld==0.3.9
     # via voir
 packaging==24.1
     # via
    #   accelerate
+    #   black
    #   datasets
-    #   deepspeed
-    #   evaluate
+    #   envpool
    #   huggingface-hub
+    #   lightning
    #   lightning-utilities
+    #   matplotlib
+    #   pytest
+    #   pytorch-lightning
+    #   setuptools-scm
+    #   tables
+    #   tensorboard
    #   tensorboardx
    #   torchmetrics
    #   transformers
-pandas==2.2.2
+pandas==2.2.3
     # via
+    #   -r benchmarks/geo_gnn/requirements.in
+    #   -r benchmarks/recursiongfn/requirements.in
+    #   -r benchmarks/vjepa/requirements.in
    #   datasets
-    #   evaluate
+    #   seaborn
+pathspec==0.12.1
+    # via black
 pillow==10.4.0
     # via
+    #   -r benchmarks/huggingface/requirements.in
+    #   -r benchmarks/llava/requirements.in
    #   brax
+    #   diffusers
+    #   fvcore
+    #   matplotlib
+    #   navix
+    #   rdkit
    #   torchvision
-protobuf==4.25.3
+platformdirs==4.3.6
+    # via
+    #   black
+    #   pylint
+    #   wandb
+pluggy==1.5.0
+    # via pytest
+portalocker==2.10.1
+    # via iopath
+protobuf==5.28.2
     # via
-    #   onnx
    #   orbax-checkpoint
    #   tensorboard
    #   tensorboardx
+    #   wandb
 psutil==5.9.8
     # via
    #   accelerate
-    #   deepspeed
+    #   torch-geometric
    #   voir
+    #   wandb
 ptera==1.4.1
     # via voir
 py-cpuinfo==9.0.0
-    # via deepspeed
+    # via
+    #   blosc2
+    #   tables
 pyarrow==17.0.0
-    # via datasets
-pyarrow-hotfix==0.6
-    # via datasets
-pydantic==2.7.4
-    # via deepspeed
-pydantic-core==2.18.4
-    # via pydantic
-pydot==3.0.1
-    # via -r benchmarks/dlrm/requirements.in
+    # via
+    #   -r benchmarks/recursiongfn/requirements.in
+    #   datasets
+pycodestyle==2.12.1
+    # via flake8
+pycryptodomex==3.21.0
+    # via blobfile
+pyflakes==3.2.0
+    # via flake8
 pygments==2.18.0
     # via rich
-pynvml==11.5.3
-    # via voir
+pylint==3.3.1
+    # via navix
 pyopengl==3.1.7
     # via mujoco
-pyparsing==3.1.2
-    # via pydot
-pyre-extensions==0.0.30
-    # via torchx
-pysocks==1.7.1
-    # via requests
+pyparsing==3.1.4
+    # via
+    #   matplotlib
+    #   torch-geometric
+pyro-api==0.1.2
+    # via pyro-ppl
+pyro-ppl==1.9.1
+    # via
+    #   -r benchmarks/recursiongfn/requirements.in
+    #   botorch
+pytest==8.3.3
+    # via navix
 python-dateutil==2.9.0.post0
-    # via pandas
+    # via
+    #   matplotlib
+    #   pandas
 pytinyrenderer==0.0.14
     # via brax
-pytz==2024.1
+pytorch-lightning==2.4.0
+    # via lightning
+pytz==2024.2
     # via pandas
-pyyaml==6.0.1
+pyyaml==6.0.2
     # via
+    #   -r benchmarks/llm/requirements.in
    #   -r benchmarks/timm/requirements.in
+    #   -r benchmarks/vjepa/requirements.in
    #   accelerate
    #   datasets
+    #   evosax
    #   flax
+    #   fvcore
+    #   gymnax
    #   huggingface-hub
+    #   lightning
    #   ml-collections
    #   omegaconf
    #   orbax-checkpoint
-    #   torchx
+    #   pytorch-lightning
+    #   timm
    #   transformers
+    #   wandb
+    #   webdataset
+    #   yacs
+rdkit==2024.3.5
+    # via
+    #   -r benchmarks/geo_gnn/requirements.in
+    #   -r benchmarks/recursiongfn/requirements.in
 reactivex==4.0.4
     # via giving
-regex==2024.5.15
-    # via transformers
-requests[socks]==2.32.3
+regex==2024.9.11
+    # via
+    #   diffusers
+    #   tiktoken
+    #   transformers
+requests==2.32.3
     # via
    #   datasets
-    #   docker
-    #   evaluate
-    #   gdown
+    #   diffusers
    #   huggingface-hub
+    #   tiktoken
+    #   torch-geometric
    #   transformers
-rich==13.7.1
+    #   wandb
+rich==13.9.1
     # via
-    #   -r benchmarks/accelerate_opt/requirements.in
    #   flax
+    #   tyro
    #   voir
-safetensors==0.4.3
+rlax==0.1.6
+    # via navix
+safetensors==0.4.5
     # via
    #   -r benchmarks/timm/requirements.in
    #   accelerate
+    #   diffusers
+    #   timm
+    #   torchtune
    #   transformers
-scikit-learn==1.5.1
-    # via -r benchmarks/dlrm/requirements.in
-scipy==1.14.0
+scikit-learn==1.5.2
+    # via gpytorch
+scipy==1.14.1
     # via
+    #   -r benchmarks/dinov2/requirements.in
+    #   -r benchmarks/recursiongfn/requirements.in
+    #   botorch
    #   brax
+    #   gpytorch
    #   jax
    #   jaxlib
    #   jaxopt
+    #   linear-operator
    #   mujoco-mjx
    #   scikit-learn
+    #   torch-cluster
+    #   torch-sparse
+seaborn==0.13.2
+    # via gymnax
 sentencepiece==0.2.0
-    # via -r benchmarks/llama/requirements.in
+    # via
+    #   -r benchmarks/llama/requirements.in
+    #   torchtune
+sentry-sdk==2.15.0
+    # via wandb
+setproctitle==1.3.3
+    # via wandb
+setuptools-scm==8.1.0
+    # via navix
+shtab==1.7.1
+    # via tyro
 six==1.16.0
     # via
    #   asttokens
-    #   fire
+    #   docker-pycreds
    #   ml-collections
    #   python-dateutil
    #   tensorboard
-soupsieve==2.5
-    # via beautifulsoup4
-sympy==1.13.0
+    #   tensorflow-probability
+smmap==5.0.1
+    # via gitdb
+submitit==1.5.2
+    # via
+    #   -r benchmarks/dinov2/requirements.in
+    #   -r benchmarks/vjepa/requirements.in
+sympy==1.13.3
     # via torch
+tables==3.10.1
+    # via -r benchmarks/recursiongfn/requirements.in
 tabulate==0.9.0
-    # via torchx
-tensorboard==2.17.0
-    # via -r benchmarks/dlrm/requirements.in
+    # via fvcore
+tensorboard==2.18.0
+    # via
+    #   -r benchmarks/recursiongfn/requirements.in
+    #   -r benchmarks/torchatari/requirements.in
 tensorboard-data-server==0.7.2
     # via tensorboard
 tensorboardx==2.6.2.2
     # via brax
-tensorstore==0.1.63
+tensorflow-probability==0.24.0
+    # via distrax
+tensorstore==0.1.66
     # via
+    #   flashbax
    #   flax
    #   orbax-checkpoint
 termcolor==2.4.0
-    # via fire
+    # via
+    #   fire
+    #   fvcore
 threadpoolctl==3.5.0
     # via scikit-learn
+tiktoken==0.7.0
+    # via torchtune
+timm==1.0.9
+    # via -r benchmarks/vjepa/requirements.in
 tokenizers==0.19.1
     # via transformers
+tomli==2.0.2
+    # via
+    #   black
+    #   pylint
+    #   pytest
+    #   setuptools-scm
+tomlkit==0.13.2
+    # via pylint
 toolz==0.12.1
     # via chex
-torch==2.3.1
+torch==2.4.1
     # via
-    #   -r benchmarks/accelerate_opt/requirements.in
    #   -r benchmarks/brax/requirements.in
-    #   -r benchmarks/dlrm/requirements.in
+    #   -r benchmarks/dinov2/requirements.in
    #   -r benchmarks/flops/requirements.in
+    #   -r benchmarks/geo_gnn/requirements-pre.in
    #   -r benchmarks/huggingface/requirements.in
+    #   -r benchmarks/lightning/requirements.in
    #   -r benchmarks/llama/requirements.in
-    #   -r benchmarks/stargan/requirements.in
-    #   -r benchmarks/super-slomo/requirements.in
+    #   -r benchmarks/llava/requirements.in
+    #   -r benchmarks/llm/requirements.in
+    #   -r benchmarks/llm/requirements.txt
+    #   -r benchmarks/purejaxrl/requirements.in
+    #   -r benchmarks/recursiongfn/requirements.in
+    #   -r benchmarks/rlhf/requirements.in
    #   -r benchmarks/timm/requirements.in
+    #   -r benchmarks/torchatari/requirements.in
    #   -r benchmarks/torchvision/requirements.in
    #   -r benchmarks/torchvision_ddp/requirements.in
+    #   -r benchmarks/vjepa/requirements.in
    #   accelerate
-    #   deepspeed
+    #   botorch
+    #   diffusers
    #   fairscale
-    #   torchaudio
+    #   lightning
+    #   linear-operator
+    #   pyro-ppl
+    #   pytorch-lightning
+    #   timm
    #   torchmetrics
    #   torchvision
-    #   torchviz
-torchaudio==2.3.1
-    # via -r benchmarks/accelerate_opt/requirements.in
+    #   trl
+    #   xformers
+torch-cluster==1.6.3
+    # via
+    #   -r benchmarks/geo_gnn/requirements.in
+    #   -r benchmarks/recursiongfn/requirements.in
+torch-geometric==2.6.1
+    # via
+    #   -r benchmarks/geo_gnn/requirements.in
+    #   -r benchmarks/recursiongfn/requirements.in
+torch-scatter==2.1.2
+    # via
+    #   -r benchmarks/geo_gnn/requirements.in
+    #   -r benchmarks/recursiongfn/requirements.in
+torch-sparse==0.6.18
+    # via
+    #   -r benchmarks/geo_gnn/requirements.in
+    #   -r benchmarks/recursiongfn/requirements.in
+torchao==0.3.1
+    # via
+    #   -c .pin/../constraints/hpu.txt
+    #   -r benchmarks/llm/requirements.in
+    #   torchtune
 torchcompat==1.1.4
     # via
    #   -c .pin/../constraints/hpu.txt
    #   -r benchmarks/flops/requirements.in
+    #   -r benchmarks/lightning/requirements.in
+    #   -r benchmarks/torchatari/requirements.in
    #   -r benchmarks/torchvision/requirements.in
    #   -r benchmarks/torchvision_ddp/requirements.in
-torchmetrics==1.0.3
-    # via torchrec
-torchrec==0.7.0
-    # via -r benchmarks/dlrm/requirements.in
-torchvision==0.18.1
+torchmetrics==1.4.2
+    # via
+    #   -r benchmarks/dinov2/requirements.in
+    #   lightning
+    #   pytorch-lightning
+torchtune==0.2.1
+    # via
+    #   -c .pin/../constraints/hpu.txt
+    #   -r benchmarks/llm/requirements.in
+torchvision==0.19.1
     # via
-    #   -r benchmarks/accelerate_opt/requirements.in
+    #   -r benchmarks/diffusion/requirements.in
+    #   -r benchmarks/dinov2/requirements.in
    #   -r benchmarks/flops/requirements.in
-    #   -r benchmarks/stargan/requirements.in
-    #   -r benchmarks/super-slomo/requirements.in
+    #   -r benchmarks/lightning/requirements.in
    #   -r benchmarks/timm/requirements.in
    #   -r benchmarks/torchvision/requirements.in
    #   -r benchmarks/torchvision_ddp/requirements.in
-torchviz==0.0.2
-    # via -r benchmarks/dlrm/requirements.in
-torchx==0.7.0
-    # via -r benchmarks/dlrm/requirements.in
-tqdm==4.66.4
+    #   -r benchmarks/vjepa/requirements.in
+    #   timm
+tqdm==4.66.5
     # via
-    #   -r benchmarks/dlrm/requirements.in
+    #   -r benchmarks/diffusion/requirements.in
    #   -r benchmarks/flops/requirements.in
-    #   -r benchmarks/super-slomo/requirements.in
    #   -r benchmarks/torchvision/requirements.in
    #   -r benchmarks/torchvision_ddp/requirements.in
    #   datasets
-    #   deepspeed
-    #   evaluate
-    #   gdown
+    #   fvcore
    #   huggingface-hub
-    #   torchrec
+    #   iopath
+    #   lightning
+    #   pyro-ppl
+    #   pytorch-lightning
+    #   torch-geometric
+    #   torchtune
    #   transformers
-transformers==4.42.4
+transformers==4.44.2
     # via
-    #   -r benchmarks/accelerate_opt/requirements.in
+    #   -c .pin/../constraints/hpu.txt
+    #   -r benchmarks/diffusion/requirements.in
    #   -r benchmarks/huggingface/requirements.in
    #   -r benchmarks/llama/requirements.in
-trimesh==4.4.3
+    #   -r benchmarks/llava/requirements.in
+    #   -r benchmarks/llm/requirements.in
+    #   -r benchmarks/rlhf/requirements.in
+    #   trl
+trimesh==4.4.9
     # via
    #   brax
    #   mujoco-mjx
-triton==2.3.1
+triton==3.0.0
     # via torch
+trl==0.10.1
+    # via
+    #   -c .pin/../constraints/hpu.txt
+    #   -r benchmarks/rlhf/requirements.in
+typeguard==4.3.0
+    # via jaxtyping
+types-protobuf==5.28.0.20240924
+    # via envpool
 typing-extensions==4.12.2
     # via
+    #   astroid
+    #   black
+    #   botorch
    #   brax
    #   chex
+    #   envpool
    #   etils
+    #   flashbax
    #   flax
+    #   gymnasium
    #   huggingface-hub
+    #   iopath
+    #   jaxtyping
+    #   lightning
    #   lightning-utilities
+    #   multidict
+    #   navix
+    #   optree
    #   orbax-checkpoint
-    #   pydantic
-    #   pydantic-core
-    #   pyre-extensions
+    #   pytorch-lightning
    #   reactivex
+    #   rich
+    #   submitit
+    #   tables
    #   torch
-    #   typing-inspect
-typing-inspect==0.9.0
-    # via pyre-extensions
-tzdata==2024.1
+    #   typeguard
+    #   tyro
+tyro==0.8.11
+    # via
+    #   -r benchmarks/torchatari/requirements.in
+    #   navix
+    #   trl
+tzdata==2024.2
     # via pandas
-urllib3==1.26.19
+urllib3==2.2.3
     # via
-    #   docker
+    #   blobfile
    #   requests
-    #   torchx
-varname==0.10.0
+    #   sentry-sdk
+varname==0.13.3
     # via giving
 voir==0.2.19
     # via
    #   -c .pin/../constraints/hpu.txt
-    #   -r benchmarks/accelerate_opt/requirements.in
    #   -r benchmarks/brax/requirements.in
-    #   -r benchmarks/dlrm/requirements.in
+    #   -r benchmarks/diffusion/requirements.in
+    #   -r benchmarks/dinov2/requirements.in
    #   -r benchmarks/flops/requirements.in
+    #   -r benchmarks/geo_gnn/requirements.in
    #   -r benchmarks/huggingface/requirements.in
+    #   -r benchmarks/lightning/requirements.in
    #   -r benchmarks/llama/requirements.in
-    #   -r benchmarks/stargan/requirements.in
-    #   -r benchmarks/super-slomo/requirements.in
+    #   -r benchmarks/llava/requirements.in
+    #   -r benchmarks/llm/requirements.in
+    #   -r benchmarks/purejaxrl/requirements.in
+    #   -r benchmarks/recursiongfn/requirements.in
+    #   -r benchmarks/rlhf/requirements.in
    #   -r benchmarks/timm/requirements.in
+    #   -r benchmarks/torchatari/requirements.in
    #   -r benchmarks/torchvision/requirements.in
    #   -r benchmarks/torchvision_ddp/requirements.in
-werkzeug==3.0.3
+    #   -r benchmarks/vjepa/requirements.in
+wandb==0.18.3
+    # via
+    #   -r benchmarks/recursiongfn/requirements.in
+    #   navix
+webdataset==0.2.100
+    # via -r benchmarks/vjepa/requirements.in
+werkzeug==3.0.4
     # via
    #   flask
    #   tensorboard
-xxhash==3.4.1
-    # via
-    #   datasets
-    #   evaluate
-yarl==1.9.4
+xformers==0.0.28.post1
+    # via -r benchmarks/dinov2/requirements.in
+xxhash==3.5.0
+    # via datasets
+yacs==0.1.8
+    # via fvcore
+yarl==1.13.1
     # via aiohttp
-zipp==3.19.2
+zipp==3.20.2
     # via
    #   etils
    #   importlib-metadata
diff --git a/benchmarks/brax/requirements.hpu.txt b/benchmarks/brax/requirements.hpu.txt
index cae1147c..b02ff745 100644
--- a/benchmarks/brax/requirements.hpu.txt
+++ b/benchmarks/brax/requirements.hpu.txt
@@ -4,10 +4,6 @@
 #
 #    pip-compile --output-file=benchmarks/brax/requirements.hpu.txt .pin/tmp-constraints-hpu-brax.txt benchmarks/brax/requirements.in
 #
---extra-index-url https://pypi.ngc.nvidia.com
---find-links https://storage.googleapis.com/jax-releases/jax_cuda_releases.html
---trusted-host pypi.ngc.nvidia.com
-
 absl-py==2.1.0
     # via
    #   -c .pin/../.pin/constraints-hpu-torch.txt
    #   brax
    #   chex
    #   dm-env
    #   ml-collections
    #   mujoco
    #   mujoco-mjx
    #   optax
    #   orbax-checkpoint
    #   tensorboard
 brax==0.10.5
     # via
    #   -c .pin/../.pin/constraints-hpu-torch.txt
    #   -r benchmarks/brax/requirements.in
-chex==0.1.86
+chex==0.1.87
     # via
    #   -c .pin/../.pin/constraints-hpu-torch.txt
    #   optax
 cloudpickle==3.0.0
     # via
    #   -c .pin/../.pin/constraints-hpu-torch.txt
    #   gym
-codefind==0.1.6
+codefind==0.1.7
     # via
    #   -c .pin/../.pin/constraints-hpu-torch.txt
    #   ptera
 dm-env==1.6
     # via
    #   -c .pin/../.pin/constraints-hpu-torch.txt
    #   brax
 dm-tree==0.1.8
     # via
    #   -c .pin/../.pin/constraints-hpu-torch.txt
    #   dm-env
-etils[epath,epy]==1.7.0
+etils[epath,epy]==1.9.4
     # via
    #   -c .pin/../.pin/constraints-hpu-torch.txt
    #   brax
    #   mujoco
    #   mujoco-mjx
    #   optax
    #   orbax-checkpoint
-executing==1.2.0
     # via
    #   -c
.pin/../.pin/constraints-hpu-torch.txt # varname -filelock==3.15.4 +filelock==3.16.1 # via # -c .pin/../.pin/constraints-hpu-torch.txt # torch @@ -85,20 +81,20 @@ flask==3.0.3 # -c .pin/../.pin/constraints-hpu-torch.txt # brax # flask-cors -flask-cors==4.0.1 +flask-cors==5.0.0 # via # -c .pin/../.pin/constraints-hpu-torch.txt # brax -flax==0.8.5 +flax==0.9.0 # via # -c .pin/../.pin/constraints-hpu-torch.txt # brax -fsspec==2024.5.0 +fsspec==2024.6.1 # via # -c .pin/../.pin/constraints-hpu-torch.txt # etils # torch -giving==0.4.2 +giving==0.4.3 # via # -c .pin/../.pin/constraints-hpu-torch.txt # ptera @@ -107,7 +103,7 @@ glfw==2.7.0 # via # -c .pin/../.pin/constraints-hpu-torch.txt # mujoco -grpcio==1.65.1 +grpcio==1.66.2 # via # -c .pin/../.pin/constraints-hpu-torch.txt # brax @@ -119,7 +115,11 @@ gym-notices==0.0.8 # via # -c .pin/../.pin/constraints-hpu-torch.txt # gym -importlib-resources==6.4.0 +humanize==4.10.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # orbax-checkpoint +importlib-resources==6.4.5 # via # -c .pin/../.pin/constraints-hpu-torch.txt # etils @@ -127,7 +127,7 @@ itsdangerous==2.2.0 # via # -c .pin/../.pin/constraints-hpu-torch.txt # flask -jax[cuda12]==0.4.28 +jax==0.4.33 # via # -c .pin/../.pin/constraints-hpu-torch.txt # -r benchmarks/brax/requirements.in @@ -138,15 +138,7 @@ jax[cuda12]==0.4.28 # mujoco-mjx # optax # orbax-checkpoint -jax-cuda12-pjrt==0.4.28 - # via - # -c .pin/../.pin/constraints-hpu-torch.txt - # jax-cuda12-plugin -jax-cuda12-plugin==0.4.28 - # via - # -c .pin/../.pin/constraints-hpu-torch.txt - # jax -jaxlib==0.4.28+cuda12.cudnn89 +jaxlib==0.4.33 # via # -c .pin/../.pin/constraints-hpu-torch.txt # brax @@ -183,7 +175,7 @@ ml-collections==0.1.1 # via # -c .pin/../.pin/constraints-hpu-torch.txt # brax -ml-dtypes==0.4.0 +ml-dtypes==0.5.0 # via # -c .pin/../.pin/constraints-hpu-torch.txt # jax @@ -193,17 +185,17 @@ mpmath==1.3.0 # via # -c .pin/../.pin/constraints-hpu-torch.txt # sympy -msgpack==1.0.8 +msgpack==1.1.0 # via # -c .pin/../.pin/constraints-hpu-torch.txt # flax # orbax-checkpoint -mujoco==3.2.0 +mujoco==3.2.3 # via # -c .pin/../.pin/constraints-hpu-torch.txt # brax # mujoco-mjx -mujoco-mjx==3.2.0 +mujoco-mjx==3.2.3 # via # -c .pin/../.pin/constraints-hpu-torch.txt # brax @@ -221,14 +213,12 @@ numpy==1.26.4 # brax # chex # dm-env - # flax # gym # jax # jaxlib # jaxopt # ml-dtypes # mujoco - # opt-einsum # optax # orbax-checkpoint # scipy @@ -238,19 +228,13 @@ numpy==1.26.4 nvidia-cublas-cu12==12.1.3.1 # via # -c .pin/../.pin/constraints-hpu-torch.txt - # jax # nvidia-cudnn-cu12 # nvidia-cusolver-cu12 # torch nvidia-cuda-cupti-cu12==12.1.105 # via # -c .pin/../.pin/constraints-hpu-torch.txt - # jax # torch -nvidia-cuda-nvcc-cu12==12.5.82 - # via - # -c .pin/../.pin/constraints-hpu-torch.txt - # jax nvidia-cuda-nvrtc-cu12==12.1.105 # via # -c .pin/../.pin/constraints-hpu-torch.txt @@ -258,17 +242,14 @@ nvidia-cuda-nvrtc-cu12==12.1.105 nvidia-cuda-runtime-cu12==12.1.105 # via # -c .pin/../.pin/constraints-hpu-torch.txt - # jax # torch -nvidia-cudnn-cu12==8.9.2.26 +nvidia-cudnn-cu12==9.1.0.70 # via # -c .pin/../.pin/constraints-hpu-torch.txt - # jax # torch nvidia-cufft-cu12==11.0.2.54 # via # -c .pin/../.pin/constraints-hpu-torch.txt - # jax # torch nvidia-curand-cu12==10.3.2.106 # via @@ -277,23 +258,23 @@ nvidia-curand-cu12==10.3.2.106 nvidia-cusolver-cu12==11.4.5.107 # via # -c .pin/../.pin/constraints-hpu-torch.txt - # jax # torch nvidia-cusparse-cu12==12.1.0.106 # via # -c .pin/../.pin/constraints-hpu-torch.txt - # jax # 
nvidia-cusolver-cu12 # torch +nvidia-ml-py==12.560.30 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir nvidia-nccl-cu12==2.20.5 # via # -c .pin/../.pin/constraints-hpu-torch.txt - # jax # torch -nvidia-nvjitlink-cu12==12.5.82 +nvidia-nvjitlink-cu12==12.6.77 # via # -c .pin/../.pin/constraints-hpu-torch.txt - # jax # nvidia-cusolver-cu12 # nvidia-cusparse-cu12 nvidia-nvtx-cu12==12.1.105 @@ -304,7 +285,7 @@ omegaconf==2.3.0 # via # -c .pin/../.pin/constraints-hpu-torch.txt # voir -opt-einsum==3.3.0 +opt-einsum==3.4.0 # via # -c .pin/../.pin/constraints-hpu-torch.txt # jax @@ -313,12 +294,12 @@ optax==0.2.3 # -c .pin/../.pin/constraints-hpu-torch.txt # brax # flax -orbax-checkpoint==0.5.21 +orbax-checkpoint==0.6.4 # via # -c .pin/../.pin/constraints-hpu-torch.txt # brax # flax -ovld==0.3.5 +ovld==0.3.9 # via # -c .pin/../.pin/constraints-hpu-torch.txt # voir @@ -330,7 +311,7 @@ pillow==10.4.0 # via # -c .pin/../.pin/constraints-hpu-torch.txt # brax -protobuf==4.25.3 +protobuf==5.28.2 # via # -c .pin/../.pin/constraints-hpu-torch.txt # orbax-checkpoint @@ -347,10 +328,6 @@ pygments==2.18.0 # via # -c .pin/../.pin/constraints-hpu-torch.txt # rich -pynvml==11.5.3 - # via - # -c .pin/../.pin/constraints-hpu-torch.txt - # voir pyopengl==3.1.7 # via # -c .pin/../.pin/constraints-hpu-torch.txt @@ -359,7 +336,7 @@ pytinyrenderer==0.0.14 # via # -c .pin/../.pin/constraints-hpu-torch.txt # brax -pyyaml==6.0.1 +pyyaml==6.0.2 # via # -c .pin/../.pin/constraints-hpu-torch.txt # flax @@ -370,12 +347,12 @@ reactivex==4.0.4 # via # -c .pin/../.pin/constraints-hpu-torch.txt # giving -rich==13.7.1 +rich==13.9.1 # via # -c .pin/../.pin/constraints-hpu-torch.txt # flax # voir -scipy==1.14.0 +scipy==1.14.1 # via # -c .pin/../.pin/constraints-hpu-torch.txt # brax @@ -388,7 +365,7 @@ six==1.16.0 # -c .pin/../.pin/constraints-hpu-torch.txt # asttokens # ml-collections -sympy==1.13.0 +sympy==1.13.3 # via # -c .pin/../.pin/constraints-hpu-torch.txt # torch @@ -396,7 +373,7 @@ tensorboardx==2.6.2.2 # via # -c .pin/../.pin/constraints-hpu-torch.txt # brax -tensorstore==0.1.63 +tensorstore==0.1.66 # via # -c .pin/../.pin/constraints-hpu-torch.txt # flax @@ -405,16 +382,16 @@ toolz==0.12.1 # via # -c .pin/../.pin/constraints-hpu-torch.txt # chex -torch==2.3.1 +torch==2.4.1 # via # -c .pin/../.pin/constraints-hpu-torch.txt # -r benchmarks/brax/requirements.in -trimesh==4.4.3 +trimesh==4.4.9 # via # -c .pin/../.pin/constraints-hpu-torch.txt # brax # mujoco-mjx -triton==2.3.1 +triton==3.0.0 # via # -c .pin/../.pin/constraints-hpu-torch.txt # torch @@ -427,8 +404,9 @@ typing-extensions==4.12.2 # flax # orbax-checkpoint # reactivex + # rich # torch -varname==0.10.0 +varname==0.13.3 # via # -c .pin/../.pin/constraints-hpu-torch.txt # giving @@ -437,11 +415,11 @@ voir==0.2.19 # -c .pin/../.pin/constraints-hpu-torch.txt # -c .pin/../constraints/hpu.txt # -r benchmarks/brax/requirements.in -werkzeug==3.0.3 +werkzeug==3.0.4 # via # -c .pin/../.pin/constraints-hpu-torch.txt # flask -zipp==3.19.2 +zipp==3.20.2 # via # -c .pin/../.pin/constraints-hpu-torch.txt # etils diff --git a/benchmarks/diffusion/requirements.hpu.txt b/benchmarks/diffusion/requirements.hpu.txt new file mode 100644 index 00000000..88ccd569 --- /dev/null +++ b/benchmarks/diffusion/requirements.hpu.txt @@ -0,0 +1,381 @@ +# +# This file is autogenerated by pip-compile with Python 3.10 +# by the following command: +# +# pip-compile --output-file=benchmarks/diffusion/requirements.hpu.txt .pin/tmp-constraints-hpu-diffusion-nodes.txt 
benchmarks/diffusion/requirements.in +# +accelerate==0.34.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/diffusion/requirements.in + # diffusers +aiohappyeyeballs==2.4.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # aiohttp +aiohttp==3.10.8 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # datasets + # fsspec +aiosignal==1.3.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # aiohttp +antlr4-python3-runtime==4.9.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # omegaconf +argklass==1.4.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/diffusion/requirements.in +asttokens==2.4.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # giving +async-timeout==4.0.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # aiohttp +attrs==24.2.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # aiohttp +certifi==2024.8.30 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # requests +charset-normalizer==3.3.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # requests +codefind==0.1.7 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # ptera +datasets==3.0.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/diffusion/requirements.in +diffusers[torch]==0.30.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/diffusion/requirements.in +dill==0.3.8 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # datasets + # multiprocess +executing==2.1.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # varname +filelock==3.16.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # datasets + # diffusers + # huggingface-hub + # torch + # transformers + # triton +frozenlist==1.4.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # aiohttp + # aiosignal +fsspec[http]==2024.6.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # datasets + # huggingface-hub + # torch +giving==0.4.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # ptera + # voir +hjson==3.1.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # argklass +huggingface-hub==0.25.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # accelerate + # datasets + # diffusers + # tokenizers + # transformers +idna==3.10 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # requests + # yarl +importlib-metadata==8.5.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # diffusers +importlib-resources==6.4.5 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # argklass +jinja2==3.1.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +markdown-it-py==3.0.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # rich +markupsafe==2.1.5 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # jinja2 +mdurl==0.1.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # markdown-it-py +mpmath==1.3.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # sympy +multidict==6.1.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # aiohttp + # yarl +multiprocess==0.70.16 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # datasets +networkx==3.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +numpy==1.26.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # accelerate + # datasets + # diffusers + # pandas + # pyarrow + # torchvision + # transformers +nvidia-cublas-cu12==12.1.3.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # nvidia-cudnn-cu12 + # nvidia-cusolver-cu12 + # torch 
+nvidia-cuda-cupti-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cuda-nvrtc-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cuda-runtime-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cudnn-cu12==9.1.0.70 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cufft-cu12==11.0.2.54 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-curand-cu12==10.3.2.106 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cusolver-cu12==11.4.5.107 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cusparse-cu12==12.1.0.106 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # nvidia-cusolver-cu12 + # torch +nvidia-ml-py==12.560.30 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +nvidia-nccl-cu12==2.20.5 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-nvjitlink-cu12==12.6.77 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # nvidia-cusolver-cu12 + # nvidia-cusparse-cu12 +nvidia-nvtx-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +omegaconf==2.3.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +ovld==0.3.9 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +packaging==24.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # accelerate + # datasets + # huggingface-hub + # transformers +pandas==2.2.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # datasets +pillow==10.4.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # diffusers + # torchvision +psutil==5.9.8 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # accelerate + # voir +ptera==1.4.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +pyarrow==17.0.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # datasets +pygments==2.18.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # rich +python-dateutil==2.9.0.post0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # pandas +pytz==2024.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # pandas +pyyaml==6.0.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # accelerate + # datasets + # huggingface-hub + # omegaconf + # transformers +reactivex==4.0.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # giving +regex==2024.9.11 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # diffusers + # transformers +requests==2.32.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # datasets + # diffusers + # huggingface-hub + # transformers +rich==13.9.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +safetensors==0.4.5 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # accelerate + # diffusers + # transformers +six==1.16.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # asttokens + # python-dateutil +sympy==1.13.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +tokenizers==0.19.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # transformers +torch==2.4.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # accelerate + # diffusers + # torchvision +torchvision==0.19.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/diffusion/requirements.in +tqdm==4.66.5 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/diffusion/requirements.in + # datasets + # huggingface-hub + # transformers +transformers==4.44.2 + # via + 
# -c .pin/../.pin/constraints-hpu-torch.txt + # -c .pin/../constraints/hpu.txt + # -r benchmarks/diffusion/requirements.in +triton==3.0.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +typing-extensions==4.12.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # huggingface-hub + # multidict + # reactivex + # rich + # torch +tzdata==2024.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # pandas +urllib3==2.2.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # requests +varname==0.13.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # giving +voir==0.2.19 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -c .pin/../constraints/hpu.txt + # -r benchmarks/diffusion/requirements.in +xxhash==3.5.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # datasets +yarl==1.13.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # aiohttp +zipp==3.20.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # importlib-metadata diff --git a/benchmarks/dinov2/requirements.hpu.txt b/benchmarks/dinov2/requirements.hpu.txt new file mode 100644 index 00000000..4a11ccfb --- /dev/null +++ b/benchmarks/dinov2/requirements.hpu.txt @@ -0,0 +1,267 @@ +# +# This file is autogenerated by pip-compile with Python 3.10 +# by the following command: +# +# pip-compile --output-file=benchmarks/dinov2/requirements.hpu.txt .pin/tmp-constraints-hpu-dinov2-giant-gpus.txt benchmarks/dinov2/requirements.in +# +antlr4-python3-runtime==4.9.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # omegaconf +asttokens==2.4.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # giving +cloudpickle==3.0.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # submitit +codefind==0.1.7 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # ptera +executing==2.1.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # varname +filelock==3.16.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch + # triton +fsspec==2024.6.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +fvcore==0.1.5.post20221221 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/dinov2/requirements.in +giving==0.4.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # ptera + # voir +iopath==0.1.10 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/dinov2/requirements.in + # fvcore +jinja2==3.1.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +lightning-utilities==0.11.7 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torchmetrics +markdown-it-py==3.0.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # rich +markupsafe==2.1.5 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # jinja2 +mdurl==0.1.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # markdown-it-py +mpmath==1.3.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # sympy +networkx==3.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +numpy==1.26.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # fvcore + # scipy + # torchmetrics + # torchvision + # xformers +nvidia-cublas-cu12==12.1.3.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # nvidia-cudnn-cu12 + # nvidia-cusolver-cu12 + # torch +nvidia-cuda-cupti-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cuda-nvrtc-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cuda-runtime-cu12==12.1.105 + # via + # -c 
.pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cudnn-cu12==9.1.0.70 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cufft-cu12==11.0.2.54 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-curand-cu12==10.3.2.106 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cusolver-cu12==11.4.5.107 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cusparse-cu12==12.1.0.106 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # nvidia-cusolver-cu12 + # torch +nvidia-ml-py==12.560.30 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +nvidia-nccl-cu12==2.20.5 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-nvjitlink-cu12==12.6.77 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # nvidia-cusolver-cu12 + # nvidia-cusparse-cu12 +nvidia-nvtx-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +omegaconf==2.3.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/dinov2/requirements.in + # voir +ovld==0.3.9 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +packaging==24.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # lightning-utilities + # torchmetrics +pillow==10.4.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # fvcore + # torchvision +portalocker==2.10.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # iopath +psutil==5.9.8 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +ptera==1.4.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +pygments==2.18.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # rich +pyyaml==6.0.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # fvcore + # omegaconf + # yacs +reactivex==4.0.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # giving +rich==13.9.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +scipy==1.14.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/dinov2/requirements.in +six==1.16.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # asttokens +submitit==1.5.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/dinov2/requirements.in +sympy==1.13.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +tabulate==0.9.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # fvcore +termcolor==2.4.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # fvcore +torch==2.4.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/dinov2/requirements.in + # torchmetrics + # torchvision + # xformers +torchmetrics==1.4.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/dinov2/requirements.in +torchvision==0.19.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/dinov2/requirements.in +tqdm==4.66.5 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # fvcore + # iopath +triton==3.0.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +typing-extensions==4.12.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # iopath + # lightning-utilities + # reactivex + # rich + # submitit + # torch +varname==0.13.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # giving +voir==0.2.19 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -c .pin/../constraints/hpu.txt + # -r benchmarks/dinov2/requirements.in +xformers==0.0.28.post1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r 
benchmarks/dinov2/requirements.in +yacs==0.1.8 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # fvcore + +# The following packages are considered to be unsafe in a requirements file: +# setuptools diff --git a/benchmarks/flops/requirements.hpu.txt b/benchmarks/flops/requirements.hpu.txt index 77595d5f..91e5677f 100644 --- a/benchmarks/flops/requirements.hpu.txt +++ b/benchmarks/flops/requirements.hpu.txt @@ -4,10 +4,6 @@ # # pip-compile --output-file=benchmarks/flops/requirements.hpu.txt .pin/tmp-constraints-hpu-flops.txt benchmarks/flops/requirements.in # ---extra-index-url https://pypi.ngc.nvidia.com ---find-links https://storage.googleapis.com/jax-releases/jax_cuda_releases.html ---trusted-host pypi.ngc.nvidia.com - antlr4-python3-runtime==4.9.3 # via # -c .pin/../.pin/constraints-hpu-torch.txt @@ -16,29 +12,29 @@ asttokens==2.4.1 # via # -c .pin/../.pin/constraints-hpu-torch.txt # giving -codefind==0.1.6 +codefind==0.1.7 # via # -c .pin/../.pin/constraints-hpu-torch.txt # ptera -executing==1.2.0 +executing==2.1.0 # via # -c .pin/../.pin/constraints-hpu-torch.txt # varname -filelock==3.15.4 +filelock==3.16.1 # via # -c .pin/../.pin/constraints-hpu-torch.txt # torch # triton -fsspec==2024.5.0 +fsspec==2024.6.1 # via # -c .pin/../.pin/constraints-hpu-torch.txt # torch -giving==0.4.2 +giving==0.4.3 # via # -c .pin/../.pin/constraints-hpu-torch.txt # ptera # voir -importlib-resources==6.4.0 +importlib-resources==6.4.5 # via # -c .pin/../.pin/constraints-hpu-torch.txt # torchcompat @@ -88,7 +84,7 @@ nvidia-cuda-runtime-cu12==12.1.105 # via # -c .pin/../.pin/constraints-hpu-torch.txt # torch -nvidia-cudnn-cu12==8.9.2.26 +nvidia-cudnn-cu12==9.1.0.70 # via # -c .pin/../.pin/constraints-hpu-torch.txt # torch @@ -109,11 +105,15 @@ nvidia-cusparse-cu12==12.1.0.106 # -c .pin/../.pin/constraints-hpu-torch.txt # nvidia-cusolver-cu12 # torch +nvidia-ml-py==12.560.30 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir nvidia-nccl-cu12==2.20.5 # via # -c .pin/../.pin/constraints-hpu-torch.txt # torch -nvidia-nvjitlink-cu12==12.5.82 +nvidia-nvjitlink-cu12==12.6.77 # via # -c .pin/../.pin/constraints-hpu-torch.txt # nvidia-cusolver-cu12 @@ -126,7 +126,7 @@ omegaconf==2.3.0 # via # -c .pin/../.pin/constraints-hpu-torch.txt # voir -ovld==0.3.5 +ovld==0.3.9 # via # -c .pin/../.pin/constraints-hpu-torch.txt # voir @@ -146,11 +146,7 @@ pygments==2.18.0 # via # -c .pin/../.pin/constraints-hpu-torch.txt # rich -pynvml==11.5.3 - # via - # -c .pin/../.pin/constraints-hpu-torch.txt - # voir -pyyaml==6.0.1 +pyyaml==6.0.2 # via # -c .pin/../.pin/constraints-hpu-torch.txt # omegaconf @@ -158,7 +154,7 @@ reactivex==4.0.4 # via # -c .pin/../.pin/constraints-hpu-torch.txt # giving -rich==13.7.1 +rich==13.9.1 # via # -c .pin/../.pin/constraints-hpu-torch.txt # voir @@ -166,11 +162,11 @@ six==1.16.0 # via # -c .pin/../.pin/constraints-hpu-torch.txt # asttokens -sympy==1.13.0 +sympy==1.13.3 # via # -c .pin/../.pin/constraints-hpu-torch.txt # torch -torch==2.3.1 +torch==2.4.1 # via # -c .pin/../.pin/constraints-hpu-torch.txt # -r benchmarks/flops/requirements.in @@ -180,15 +176,15 @@ torchcompat==1.1.4 # -c .pin/../.pin/constraints-hpu-torch.txt # -c .pin/../constraints/hpu.txt # -r benchmarks/flops/requirements.in -torchvision==0.18.1 +torchvision==0.19.1 # via # -c .pin/../.pin/constraints-hpu-torch.txt # -r benchmarks/flops/requirements.in -tqdm==4.66.4 +tqdm==4.66.5 # via # -c .pin/../.pin/constraints-hpu-torch.txt # -r benchmarks/flops/requirements.in -triton==2.3.1 +triton==3.0.0 # via # -c 
.pin/../.pin/constraints-hpu-torch.txt # torch @@ -196,8 +192,9 @@ typing-extensions==4.12.2 # via # -c .pin/../.pin/constraints-hpu-torch.txt # reactivex + # rich # torch -varname==0.10.0 +varname==0.13.3 # via # -c .pin/../.pin/constraints-hpu-torch.txt # giving diff --git a/benchmarks/geo_gnn/requirements-pre.hpu.txt b/benchmarks/geo_gnn/requirements-pre.hpu.txt new file mode 100644 index 00000000..db910c1a --- /dev/null +++ b/benchmarks/geo_gnn/requirements-pre.hpu.txt @@ -0,0 +1,99 @@ +# +# This file is autogenerated by pip-compile with Python 3.10 +# by the following command: +# +# pip-compile --output-file=benchmarks/geo_gnn/requirements-pre.hpu.txt .pin/tmp-constraints-hpu-dimenet.txt benchmarks/geo_gnn/requirements-pre.in +# +filelock==3.16.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch + # triton +fsspec==2024.6.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +jinja2==3.1.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +markupsafe==2.1.5 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # jinja2 +mpmath==1.3.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # sympy +networkx==3.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cublas-cu12==12.1.3.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # nvidia-cudnn-cu12 + # nvidia-cusolver-cu12 + # torch +nvidia-cuda-cupti-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cuda-nvrtc-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cuda-runtime-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cudnn-cu12==9.1.0.70 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cufft-cu12==11.0.2.54 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-curand-cu12==10.3.2.106 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cusolver-cu12==11.4.5.107 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cusparse-cu12==12.1.0.106 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # nvidia-cusolver-cu12 + # torch +nvidia-nccl-cu12==2.20.5 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-nvjitlink-cu12==12.6.77 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # nvidia-cusolver-cu12 + # nvidia-cusparse-cu12 +nvidia-nvtx-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +sympy==1.13.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +torch==2.4.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/geo_gnn/requirements-pre.in +triton==3.0.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +typing-extensions==4.12.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch diff --git a/benchmarks/geo_gnn/requirements.hpu.txt b/benchmarks/geo_gnn/requirements.hpu.txt new file mode 100644 index 00000000..9c6bb6d6 --- /dev/null +++ b/benchmarks/geo_gnn/requirements.hpu.txt @@ -0,0 +1,321 @@ +# +# This file is autogenerated by pip-compile with Python 3.10 +# by the following command: +# +# pip-compile --output-file=benchmarks/geo_gnn/requirements.hpu.txt .pin/tmp-constraints-hpu-dimenet.txt benchmarks/geo_gnn/requirements-pre.hpu.txt benchmarks/geo_gnn/requirements.in +# +aiohappyeyeballs==2.4.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # aiohttp +aiohttp==3.10.8 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch-geometric 
+aiosignal==1.3.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # aiohttp +antlr4-python3-runtime==4.9.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # omegaconf +asttokens==2.4.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # giving +async-timeout==4.0.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # aiohttp +attrs==24.2.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # aiohttp +certifi==2024.8.30 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # requests +charset-normalizer==3.3.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # requests +codefind==0.1.7 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # ptera +executing==2.1.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # varname +filelock==3.16.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/geo_gnn/requirements-pre.hpu.txt + # torch + # triton +frozenlist==1.4.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # aiohttp + # aiosignal +fsspec==2024.6.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/geo_gnn/requirements-pre.hpu.txt + # torch + # torch-geometric +giving==0.4.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # ptera + # voir +idna==3.10 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # requests + # yarl +jinja2==3.1.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/geo_gnn/requirements-pre.hpu.txt + # torch + # torch-geometric +markdown-it-py==3.0.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # rich +markupsafe==2.1.5 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/geo_gnn/requirements-pre.hpu.txt + # jinja2 +mdurl==0.1.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # markdown-it-py +mpmath==1.3.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/geo_gnn/requirements-pre.hpu.txt + # sympy +multidict==6.1.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # aiohttp + # yarl +networkx==3.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/geo_gnn/requirements-pre.hpu.txt + # torch +numpy==1.26.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/geo_gnn/requirements.in + # pandas + # rdkit + # scipy + # torch-geometric +nvidia-cublas-cu12==12.1.3.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/geo_gnn/requirements-pre.hpu.txt + # nvidia-cudnn-cu12 + # nvidia-cusolver-cu12 + # torch +nvidia-cuda-cupti-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/geo_gnn/requirements-pre.hpu.txt + # torch +nvidia-cuda-nvrtc-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/geo_gnn/requirements-pre.hpu.txt + # torch +nvidia-cuda-runtime-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/geo_gnn/requirements-pre.hpu.txt + # torch +nvidia-cudnn-cu12==9.1.0.70 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/geo_gnn/requirements-pre.hpu.txt + # torch +nvidia-cufft-cu12==11.0.2.54 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/geo_gnn/requirements-pre.hpu.txt + # torch +nvidia-curand-cu12==10.3.2.106 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/geo_gnn/requirements-pre.hpu.txt + # torch +nvidia-cusolver-cu12==11.4.5.107 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/geo_gnn/requirements-pre.hpu.txt + # torch 
+nvidia-cusparse-cu12==12.1.0.106 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/geo_gnn/requirements-pre.hpu.txt + # nvidia-cusolver-cu12 + # torch +nvidia-ml-py==12.560.30 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +nvidia-nccl-cu12==2.20.5 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/geo_gnn/requirements-pre.hpu.txt + # torch +nvidia-nvjitlink-cu12==12.6.77 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/geo_gnn/requirements-pre.hpu.txt + # nvidia-cusolver-cu12 + # nvidia-cusparse-cu12 +nvidia-nvtx-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/geo_gnn/requirements-pre.hpu.txt + # torch +omegaconf==2.3.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +ovld==0.3.9 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +pandas==2.2.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/geo_gnn/requirements.in +pillow==10.4.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # rdkit +psutil==5.9.8 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch-geometric + # voir +ptera==1.4.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +pygments==2.18.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # rich +pyparsing==3.1.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch-geometric +python-dateutil==2.9.0.post0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # pandas +pytz==2024.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # pandas +pyyaml==6.0.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # omegaconf +rdkit==2024.3.5 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/geo_gnn/requirements.in +reactivex==4.0.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # giving +requests==2.32.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch-geometric +rich==13.9.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +scipy==1.14.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch-cluster + # torch-sparse +six==1.16.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # asttokens + # python-dateutil +sympy==1.13.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/geo_gnn/requirements-pre.hpu.txt + # torch +torch==2.4.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/geo_gnn/requirements-pre.hpu.txt +torch-cluster==1.6.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/geo_gnn/requirements.in +torch-geometric==2.6.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/geo_gnn/requirements.in +torch-scatter==2.1.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/geo_gnn/requirements.in +torch-sparse==0.6.18 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/geo_gnn/requirements.in +tqdm==4.66.5 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch-geometric +triton==3.0.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/geo_gnn/requirements-pre.hpu.txt + # torch +typing-extensions==4.12.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/geo_gnn/requirements-pre.hpu.txt + # multidict + # reactivex + # rich + # torch +tzdata==2024.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # pandas +urllib3==2.2.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # requests 
+varname==0.13.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # giving +voir==0.2.19 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -c .pin/../constraints/hpu.txt + # -r benchmarks/geo_gnn/requirements.in +yarl==1.13.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # aiohttp diff --git a/benchmarks/huggingface/requirements.hpu.txt b/benchmarks/huggingface/requirements.hpu.txt index a504cba1..b5e21d99 100644 --- a/benchmarks/huggingface/requirements.hpu.txt +++ b/benchmarks/huggingface/requirements.hpu.txt @@ -4,10 +4,6 @@ # # pip-compile --output-file=benchmarks/huggingface/requirements.hpu.txt .pin/tmp-constraints-hpu-hf.txt benchmarks/huggingface/requirements.in # ---extra-index-url https://pypi.ngc.nvidia.com ---find-links https://storage.googleapis.com/jax-releases/jax_cuda_releases.html ---trusted-host pypi.ngc.nvidia.com - antlr4-python3-runtime==4.9.3 # via # -c .pin/../.pin/constraints-hpu-torch.txt @@ -16,7 +12,7 @@ asttokens==2.4.1 # via # -c .pin/../.pin/constraints-hpu-torch.txt # giving -certifi==2024.6.2 +certifi==2024.8.30 # via # -c .pin/../.pin/constraints-hpu-torch.txt # requests @@ -24,37 +20,37 @@ charset-normalizer==3.3.2 # via # -c .pin/../.pin/constraints-hpu-torch.txt # requests -codefind==0.1.6 +codefind==0.1.7 # via # -c .pin/../.pin/constraints-hpu-torch.txt # ptera -executing==1.2.0 +executing==2.1.0 # via # -c .pin/../.pin/constraints-hpu-torch.txt # varname -filelock==3.15.4 +filelock==3.16.1 # via # -c .pin/../.pin/constraints-hpu-torch.txt # huggingface-hub # torch # transformers # triton -fsspec==2024.5.0 +fsspec==2024.6.1 # via # -c .pin/../.pin/constraints-hpu-torch.txt # huggingface-hub # torch -giving==0.4.2 +giving==0.4.3 # via # -c .pin/../.pin/constraints-hpu-torch.txt # ptera # voir -huggingface-hub==0.24.0 +huggingface-hub==0.25.1 # via # -c .pin/../.pin/constraints-hpu-torch.txt # tokenizers # transformers -idna==3.7 +idna==3.10 # via # -c .pin/../.pin/constraints-hpu-torch.txt # requests @@ -104,7 +100,7 @@ nvidia-cuda-runtime-cu12==12.1.105 # via # -c .pin/../.pin/constraints-hpu-torch.txt # torch -nvidia-cudnn-cu12==8.9.2.26 +nvidia-cudnn-cu12==9.1.0.70 # via # -c .pin/../.pin/constraints-hpu-torch.txt # torch @@ -125,11 +121,15 @@ nvidia-cusparse-cu12==12.1.0.106 # -c .pin/../.pin/constraints-hpu-torch.txt # nvidia-cusolver-cu12 # torch +nvidia-ml-py==12.560.30 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir nvidia-nccl-cu12==2.20.5 # via # -c .pin/../.pin/constraints-hpu-torch.txt # torch -nvidia-nvjitlink-cu12==12.5.82 +nvidia-nvjitlink-cu12==12.6.77 # via # -c .pin/../.pin/constraints-hpu-torch.txt # nvidia-cusolver-cu12 @@ -142,7 +142,7 @@ omegaconf==2.3.0 # via # -c .pin/../.pin/constraints-hpu-torch.txt # voir -ovld==0.3.5 +ovld==0.3.9 # via # -c .pin/../.pin/constraints-hpu-torch.txt # voir @@ -151,6 +151,10 @@ packaging==24.1 # -c .pin/../.pin/constraints-hpu-torch.txt # huggingface-hub # transformers +pillow==10.4.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/huggingface/requirements.in psutil==5.9.8 # via # -c .pin/../.pin/constraints-hpu-torch.txt @@ -163,11 +167,7 @@ pygments==2.18.0 # via # -c .pin/../.pin/constraints-hpu-torch.txt # rich -pynvml==11.5.3 - # via - # -c .pin/../.pin/constraints-hpu-torch.txt - # voir -pyyaml==6.0.1 +pyyaml==6.0.2 # via # -c .pin/../.pin/constraints-hpu-torch.txt # huggingface-hub @@ -177,7 +177,7 @@ reactivex==4.0.4 # via # -c .pin/../.pin/constraints-hpu-torch.txt # giving -regex==2024.5.15 +regex==2024.9.11 # via # -c 
.pin/../.pin/constraints-hpu-torch.txt
     #   transformers
@@ -186,11 +186,11 @@ requests==2.32.3
     # via
     #   -c .pin/../.pin/constraints-hpu-torch.txt
     #   huggingface-hub
     #   transformers
-rich==13.7.1
+rich==13.9.1
     # via
     #   -c .pin/../.pin/constraints-hpu-torch.txt
     #   voir
-safetensors==0.4.3
+safetensors==0.4.5
     # via
     #   -c .pin/../.pin/constraints-hpu-torch.txt
     #   transformers
@@ -198,7 +198,7 @@ six==1.16.0
     # via
     #   -c .pin/../.pin/constraints-hpu-torch.txt
     #   asttokens
-sympy==1.13.0
+sympy==1.13.3
     # via
     #   -c .pin/../.pin/constraints-hpu-torch.txt
     #   torch
@@ -206,20 +206,21 @@ tokenizers==0.19.1
     # via
     #   -c .pin/../.pin/constraints-hpu-torch.txt
     #   transformers
-torch==2.3.1
+torch==2.4.1
     # via
     #   -c .pin/../.pin/constraints-hpu-torch.txt
     #   -r benchmarks/huggingface/requirements.in
-tqdm==4.66.4
+tqdm==4.66.5
     # via
     #   -c .pin/../.pin/constraints-hpu-torch.txt
     #   huggingface-hub
     #   transformers
-transformers==4.42.4
+transformers==4.44.2
     # via
     #   -c .pin/../.pin/constraints-hpu-torch.txt
+    #   -c .pin/../constraints/hpu.txt
     #   -r benchmarks/huggingface/requirements.in
-triton==2.3.1
+triton==3.0.0
     # via
     #   -c .pin/../.pin/constraints-hpu-torch.txt
     #   torch
@@ -228,12 +229,13 @@ typing-extensions==4.12.2
     # via
     #   -c .pin/../.pin/constraints-hpu-torch.txt
     #   huggingface-hub
     #   reactivex
+    #   rich
     #   torch
-urllib3==1.26.19
+urllib3==2.2.3
     # via
     #   -c .pin/../.pin/constraints-hpu-torch.txt
     #   requests
-varname==0.10.0
+varname==0.13.3
     # via
     #   -c .pin/../.pin/constraints-hpu-torch.txt
     #   giving
diff --git a/benchmarks/lightning/main.py b/benchmarks/lightning/main.py
index aca89ee4..4c3d1206 100644
--- a/benchmarks/lightning/main.py
+++ b/benchmarks/lightning/main.py
@@ -1,14 +1,17 @@
 #!/usr/bin/env python
+
 import argparse
 import os
 
+# FIXME: HPU only; lazy mode (1) for single-process runs, eager (0) when a distributed launcher sets WORLD_SIZE
+os.environ["PT_HPU_LAZY_MODE"] = str(int(int(os.getenv("WORLD_SIZE", -1)) <= 0))
+
 import torch
 import torch.nn.functional as F
 import lightning as L
 import torchvision.models as torchvision_models
-import torchcompat.core as accelerator
 
 from benchmate.dataloader import imagenet_dataloader, dataloader_arguments
 
@@ -37,7 +40,7 @@ def configure_optimizers(self):
 def prepare_voir():
     from benchmate.observer import BenchObserver
     from benchmate.monitor import bench_monitor
-    
+    import torchcompat.core as accelerator
     observer = BenchObserver(
         accelerator.Event,
         earlystop=100,
@@ -49,6 +52,10 @@ def prepare_voir():
     return observer, bench_monitor
 
 def main():
+    rank = int(os.getenv("RANK", 0))
+    world_size = int(os.getenv("WORLD_SIZE", 1))
+    local_world_size = int(os.getenv("LOCAL_WORLD_SIZE", 1))
+
     parser = argparse.ArgumentParser(description='simple distributed training job')
     parser.add_argument(
         "--epochs",
@@ -64,11 +71,10 @@ def main():
     args = parser.parse_args()
 
     model = getattr(torchvision_models, args.model)()
-    rank = int(os.getenv("RANK", 0))
-    world_size = int(os.getenv("WORLD_SIZE", 1))
-    local_world_size = int(os.getenv("LOCAL_WORLD_SIZE", 1))
-
+    import torchcompat.core as accelerator
+
+    # the launcher-provided LOCAL_WORLD_SIZE is authoritative here, not accelerator.device_count()
+    n = local_world_size
     nnodes = world_size // local_world_size
 
     model = TorchvisionLightning(model)
@@ -83,9 +89,9 @@ def main():
         accelerator="auto",
         devices=n,
         num_nodes=nnodes,
-        strategy="ddp",
+        strategy="auto",
         max_epochs=args.epochs,
-        precision="16-mixed",
+        precision="bf16-mixed",
         enable_checkpointing=False,
         enable_progress_bar=False,
         reload_dataloaders_every_n_epochs=1,
diff --git a/benchmarks/lightning/requirements.hpu.txt b/benchmarks/lightning/requirements.hpu.txt
new file mode 100644
index 00000000..f86fb064
--- /dev/null
+++ 
b/benchmarks/lightning/requirements.hpu.txt @@ -0,0 +1,285 @@ +# +# This file is autogenerated by pip-compile with Python 3.10 +# by the following command: +# +# pip-compile --output-file=benchmarks/lightning/requirements.hpu.txt .pin/tmp-constraints-hpu-lightning-gpus.txt benchmarks/lightning/requirements.in +# +aiohappyeyeballs==2.4.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # aiohttp +aiohttp==3.10.8 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # fsspec +aiosignal==1.3.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # aiohttp +antlr4-python3-runtime==4.9.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # omegaconf +asttokens==2.4.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # giving +async-timeout==4.0.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # aiohttp +attrs==24.2.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # aiohttp +codefind==0.1.7 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # ptera +executing==2.1.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # varname +filelock==3.16.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch + # triton +frozenlist==1.4.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # aiohttp + # aiosignal +fsspec[http]==2024.6.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # lightning + # pytorch-lightning + # torch +giving==0.4.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # ptera + # voir +idna==3.10 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # yarl +importlib-resources==6.4.5 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torchcompat +jinja2==3.1.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +lightning==2.4.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/lightning/requirements.in +lightning-utilities==0.11.7 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # lightning + # pytorch-lightning + # torchmetrics +markdown-it-py==3.0.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # rich +markupsafe==2.1.5 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # jinja2 +mdurl==0.1.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # markdown-it-py +mpmath==1.3.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # sympy +multidict==6.1.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # aiohttp + # yarl +networkx==3.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +numpy==1.26.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torchmetrics + # torchvision +nvidia-cublas-cu12==12.1.3.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # nvidia-cudnn-cu12 + # nvidia-cusolver-cu12 + # torch +nvidia-cuda-cupti-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cuda-nvrtc-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cuda-runtime-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cudnn-cu12==9.1.0.70 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cufft-cu12==11.0.2.54 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-curand-cu12==10.3.2.106 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cusolver-cu12==11.4.5.107 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cusparse-cu12==12.1.0.106 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # nvidia-cusolver-cu12 + # torch 
+nvidia-ml-py==12.560.30 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +nvidia-nccl-cu12==2.20.5 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-nvjitlink-cu12==12.6.77 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # nvidia-cusolver-cu12 + # nvidia-cusparse-cu12 +nvidia-nvtx-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +omegaconf==2.3.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +ovld==0.3.9 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +packaging==24.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # lightning + # lightning-utilities + # pytorch-lightning + # torchmetrics +pillow==10.4.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torchvision +psutil==5.9.8 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +ptera==1.4.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +pygments==2.18.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # rich +pytorch-lightning==2.4.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # lightning +pyyaml==6.0.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # lightning + # omegaconf + # pytorch-lightning +reactivex==4.0.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # giving +rich==13.9.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +six==1.16.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # asttokens +sympy==1.13.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +torch==2.4.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/lightning/requirements.in + # lightning + # pytorch-lightning + # torchmetrics + # torchvision +torchcompat==1.1.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -c .pin/../constraints/hpu.txt + # -r benchmarks/lightning/requirements.in +torchmetrics==1.4.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # lightning + # pytorch-lightning +torchvision==0.19.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/lightning/requirements.in +tqdm==4.66.5 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # lightning + # pytorch-lightning +triton==3.0.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +typing-extensions==4.12.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # lightning + # lightning-utilities + # multidict + # pytorch-lightning + # reactivex + # rich + # torch +varname==0.13.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # giving +voir==0.2.19 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -c .pin/../constraints/hpu.txt + # -r benchmarks/lightning/requirements.in +yarl==1.13.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # aiohttp + +# The following packages are considered to be unsafe in a requirements file: +# setuptools diff --git a/benchmarks/llama/requirements.hpu.txt b/benchmarks/llama/requirements.hpu.txt index 2368c150..9c01a4dd 100644 --- a/benchmarks/llama/requirements.hpu.txt +++ b/benchmarks/llama/requirements.hpu.txt @@ -4,11 +4,11 @@ # # pip-compile --output-file=benchmarks/llama/requirements.hpu.txt .pin/tmp-constraints-hpu-llm.txt benchmarks/llama/requirements.in # ---extra-index-url https://pypi.ngc.nvidia.com ---find-links https://storage.googleapis.com/jax-releases/jax_cuda_releases.html ---trusted-host pypi.ngc.nvidia.com - -aiohttp==3.9.5 +aiohappyeyeballs==2.4.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # aiohttp 
+aiohttp==3.10.8 # via # -c .pin/../.pin/constraints-hpu-torch.txt # datasets @@ -29,11 +29,11 @@ async-timeout==4.0.3 # via # -c .pin/../.pin/constraints-hpu-torch.txt # aiohttp -attrs==23.2.0 +attrs==24.2.0 # via # -c .pin/../.pin/constraints-hpu-torch.txt # aiohttp -certifi==2024.6.2 +certifi==2024.8.30 # via # -c .pin/../.pin/constraints-hpu-torch.txt # requests @@ -41,11 +41,11 @@ charset-normalizer==3.3.2 # via # -c .pin/../.pin/constraints-hpu-torch.txt # requests -codefind==0.1.6 +codefind==0.1.7 # via # -c .pin/../.pin/constraints-hpu-torch.txt # ptera -datasets==2.20.0 +datasets==3.0.1 # via # -c .pin/../.pin/constraints-hpu-torch.txt # -r benchmarks/llama/requirements.in @@ -54,7 +54,7 @@ dill==0.3.8 # -c .pin/../.pin/constraints-hpu-torch.txt # datasets # multiprocess -executing==1.2.0 +executing==2.1.0 # via # -c .pin/../.pin/constraints-hpu-torch.txt # varname @@ -62,7 +62,7 @@ fairscale==0.4.13 # via # -c .pin/../.pin/constraints-hpu-torch.txt # -r benchmarks/llama/requirements.in -filelock==3.15.4 +filelock==3.16.1 # via # -c .pin/../.pin/constraints-hpu-torch.txt # datasets @@ -70,7 +70,7 @@ filelock==3.15.4 # torch # transformers # triton -fire==0.6.0 +fire==0.7.0 # via # -c .pin/../.pin/constraints-hpu-torch.txt # -r benchmarks/llama/requirements.in @@ -79,24 +79,24 @@ frozenlist==1.4.1 # -c .pin/../.pin/constraints-hpu-torch.txt # aiohttp # aiosignal -fsspec[http]==2024.5.0 +fsspec[http]==2024.6.1 # via # -c .pin/../.pin/constraints-hpu-torch.txt # datasets # huggingface-hub # torch -giving==0.4.2 +giving==0.4.3 # via # -c .pin/../.pin/constraints-hpu-torch.txt # ptera # voir -huggingface-hub==0.24.0 +huggingface-hub==0.25.1 # via # -c .pin/../.pin/constraints-hpu-torch.txt # datasets # tokenizers # transformers -idna==3.7 +idna==3.10 # via # -c .pin/../.pin/constraints-hpu-torch.txt # requests @@ -121,7 +121,7 @@ mpmath==1.3.0 # via # -c .pin/../.pin/constraints-hpu-torch.txt # sympy -multidict==6.0.5 +multidict==6.1.0 # via # -c .pin/../.pin/constraints-hpu-torch.txt # aiohttp @@ -160,7 +160,7 @@ nvidia-cuda-runtime-cu12==12.1.105 # via # -c .pin/../.pin/constraints-hpu-torch.txt # torch -nvidia-cudnn-cu12==8.9.2.26 +nvidia-cudnn-cu12==9.1.0.70 # via # -c .pin/../.pin/constraints-hpu-torch.txt # torch @@ -181,11 +181,15 @@ nvidia-cusparse-cu12==12.1.0.106 # -c .pin/../.pin/constraints-hpu-torch.txt # nvidia-cusolver-cu12 # torch +nvidia-ml-py==12.560.30 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir nvidia-nccl-cu12==2.20.5 # via # -c .pin/../.pin/constraints-hpu-torch.txt # torch -nvidia-nvjitlink-cu12==12.5.82 +nvidia-nvjitlink-cu12==12.6.77 # via # -c .pin/../.pin/constraints-hpu-torch.txt # nvidia-cusolver-cu12 @@ -198,7 +202,7 @@ omegaconf==2.3.0 # via # -c .pin/../.pin/constraints-hpu-torch.txt # voir -ovld==0.3.5 +ovld==0.3.9 # via # -c .pin/../.pin/constraints-hpu-torch.txt # voir @@ -208,7 +212,7 @@ packaging==24.1 # datasets # huggingface-hub # transformers -pandas==2.2.2 +pandas==2.2.3 # via # -c .pin/../.pin/constraints-hpu-torch.txt # datasets @@ -224,27 +228,19 @@ pyarrow==17.0.0 # via # -c .pin/../.pin/constraints-hpu-torch.txt # datasets -pyarrow-hotfix==0.6 - # via - # -c .pin/../.pin/constraints-hpu-torch.txt - # datasets pygments==2.18.0 # via # -c .pin/../.pin/constraints-hpu-torch.txt # rich -pynvml==11.5.3 - # via - # -c .pin/../.pin/constraints-hpu-torch.txt - # voir python-dateutil==2.9.0.post0 # via # -c .pin/../.pin/constraints-hpu-torch.txt # pandas -pytz==2024.1 +pytz==2024.2 # via # -c .pin/../.pin/constraints-hpu-torch.txt 
# pandas -pyyaml==6.0.1 +pyyaml==6.0.2 # via # -c .pin/../.pin/constraints-hpu-torch.txt # datasets @@ -255,7 +251,7 @@ reactivex==4.0.4 # via # -c .pin/../.pin/constraints-hpu-torch.txt # giving -regex==2024.5.15 +regex==2024.9.11 # via # -c .pin/../.pin/constraints-hpu-torch.txt # transformers @@ -265,11 +261,11 @@ requests==2.32.3 # datasets # huggingface-hub # transformers -rich==13.7.1 +rich==13.9.1 # via # -c .pin/../.pin/constraints-hpu-torch.txt # voir -safetensors==0.4.3 +safetensors==0.4.5 # via # -c .pin/../.pin/constraints-hpu-torch.txt # transformers @@ -281,9 +277,8 @@ six==1.16.0 # via # -c .pin/../.pin/constraints-hpu-torch.txt # asttokens - # fire # python-dateutil -sympy==1.13.0 +sympy==1.13.3 # via # -c .pin/../.pin/constraints-hpu-torch.txt # torch @@ -295,22 +290,23 @@ tokenizers==0.19.1 # via # -c .pin/../.pin/constraints-hpu-torch.txt # transformers -torch==2.3.1 +torch==2.4.1 # via # -c .pin/../.pin/constraints-hpu-torch.txt # -r benchmarks/llama/requirements.in # fairscale -tqdm==4.66.4 +tqdm==4.66.5 # via # -c .pin/../.pin/constraints-hpu-torch.txt # datasets # huggingface-hub # transformers -transformers==4.42.4 +transformers==4.44.2 # via # -c .pin/../.pin/constraints-hpu-torch.txt + # -c .pin/../constraints/hpu.txt # -r benchmarks/llama/requirements.in -triton==2.3.1 +triton==3.0.0 # via # -c .pin/../.pin/constraints-hpu-torch.txt # torch @@ -318,17 +314,19 @@ typing-extensions==4.12.2 # via # -c .pin/../.pin/constraints-hpu-torch.txt # huggingface-hub + # multidict # reactivex + # rich # torch -tzdata==2024.1 +tzdata==2024.2 # via # -c .pin/../.pin/constraints-hpu-torch.txt # pandas -urllib3==1.26.19 +urllib3==2.2.3 # via # -c .pin/../.pin/constraints-hpu-torch.txt # requests -varname==0.10.0 +varname==0.13.3 # via # -c .pin/../.pin/constraints-hpu-torch.txt # giving @@ -337,11 +335,11 @@ voir==0.2.19 # -c .pin/../.pin/constraints-hpu-torch.txt # -c .pin/../constraints/hpu.txt # -r benchmarks/llama/requirements.in -xxhash==3.4.1 +xxhash==3.5.0 # via # -c .pin/../.pin/constraints-hpu-torch.txt # datasets -yarl==1.9.4 +yarl==1.13.1 # via # -c .pin/../.pin/constraints-hpu-torch.txt # aiohttp diff --git a/benchmarks/llava/benchfile.py b/benchmarks/llava/benchfile.py index 3bc06eaa..d6d40d6e 100644 --- a/benchmarks/llava/benchfile.py +++ b/benchmarks/llava/benchfile.py @@ -19,7 +19,9 @@ class Llava(Package): def make_env(self): # Return a dict of environment variables for prepare_script and # main_script. 
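# A sketch of what the make_env override below accomplishes (a hedged
# illustration, not part of the upstream hunk; it assumes the documented Gaudi
# semantics where PT_HPU_LAZY_MODE="1", the default, selects lazy graph mode
# and "0" selects eager execution, and that the Habana PyTorch bridge reads
# the variable at import time, so it must be in the child's environment
# before launch):
import os
import subprocess

env = dict(os.environ)
env["PT_HPU_LAZY_MODE"] = "0"  # eager execution; "1" would keep lazy mode
subprocess.run(["python", "benchmarks/llava/main.py"], env=env, check=True)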
-        return super().make_env()
+        env = super().make_env()
+        env["PT_HPU_LAZY_MODE"] = "0"
+        return env
 
     async def install(self):
         await super().install()  # super() call installs the requirements
diff --git a/benchmarks/llava/main.py b/benchmarks/llava/main.py
index 879baca0..233ae2eb 100755
--- a/benchmarks/llava/main.py
+++ b/benchmarks/llava/main.py
@@ -1,7 +1,6 @@
 #!/usr/bin/env python
 
 from dataclasses import dataclass
-
 import torch
 from accelerate import Accelerator
 from accelerate.utils import set_seed
@@ -90,8 +89,10 @@ def batch_size_fn(batch):
     optimizer = observer.optimizer(torch.optim.AdamW(model.parameters(), lr=5e-5))
     model, optimizer, dataloader = accelerator.prepare(model, optimizer, dataloader)
 
+    # model = torch.compile(model, backend="hpu_backend")
+
     for epoch in range(args.epochs):
         for i, batch in enumerate(observer.iterate(dataloader)):
             images = batch["images"][0]  # Access the first item in the list of images
             texts = batch["texts"]
             prompt = apply_chat_template(texts)
@@ -124,7 +125,9 @@ def batch_size_fn(batch):
 
             if accelerator.sync_gradients:
                 accelerator.clip_grad_norm_(model.parameters(), 1.0)
+            compat.mark_step()
             optimizer.step()
+            compat.mark_step()
             optimizer.zero_grad()
 
             observer.record_loss(loss)
diff --git a/benchmarks/llava/requirements.hpu.txt b/benchmarks/llava/requirements.hpu.txt
new file mode 100644
index 00000000..3bd40dff
--- /dev/null
+++ b/benchmarks/llava/requirements.hpu.txt
@@ -0,0 +1,343 @@
+#
+# This file is autogenerated by pip-compile with Python 3.10
+# by the following command:
+#
+#    pip-compile --output-file=benchmarks/llava/requirements.hpu.txt .pin/tmp-constraints-hpu-llava-single.txt benchmarks/llava/requirements.in
+#
+accelerate==0.34.2
+    # via
+    #   -c .pin/../.pin/constraints-hpu-torch.txt
+    #   -r benchmarks/llava/requirements.in
+aiohappyeyeballs==2.4.3
+    # via
+    #   -c .pin/../.pin/constraints-hpu-torch.txt
+    #   aiohttp
+aiohttp==3.10.8
+    # via
+    #   -c .pin/../.pin/constraints-hpu-torch.txt
+    #   datasets
+    #   fsspec
+aiosignal==1.3.1
+    # via
+    #   -c .pin/../.pin/constraints-hpu-torch.txt
+    #   aiohttp
+antlr4-python3-runtime==4.9.3
+    # via
+    #   -c .pin/../.pin/constraints-hpu-torch.txt
+    #   omegaconf
+asttokens==2.4.1
+    # via
+    #   -c .pin/../.pin/constraints-hpu-torch.txt
+    #   giving
+async-timeout==4.0.3
+    # via
+    #   -c .pin/../.pin/constraints-hpu-torch.txt
+    #   aiohttp
+attrs==24.2.0
+    # via
+    #   -c .pin/../.pin/constraints-hpu-torch.txt
+    #   aiohttp
+certifi==2024.8.30
+    # via
+    #   -c .pin/../.pin/constraints-hpu-torch.txt
+    #   requests
+charset-normalizer==3.3.2
+    # via
+    #   -c .pin/../.pin/constraints-hpu-torch.txt
+    #   requests
+codefind==0.1.7
+    # via
+    #   -c .pin/../.pin/constraints-hpu-torch.txt
+    #   ptera
+datasets==3.0.1
+    # via
+    #   -c .pin/../.pin/constraints-hpu-torch.txt
+    #   -r benchmarks/llava/requirements.in
+dill==0.3.8
+    # via
+    #   -c .pin/../.pin/constraints-hpu-torch.txt
+    #   datasets
+    #   multiprocess
+executing==2.1.0
+    # via
+    #   -c .pin/../.pin/constraints-hpu-torch.txt
+    #   varname
+filelock==3.16.1
+    # via
+    #   -c .pin/../.pin/constraints-hpu-torch.txt
+    #   datasets
+    #   huggingface-hub
+    #   torch
+    #   transformers
+    #   triton
+frozenlist==1.4.1
+    # via
+    #   -c .pin/../.pin/constraints-hpu-torch.txt
+    #   aiohttp
+    #   aiosignal
+fsspec[http]==2024.6.1
+    # via
+    #   -c .pin/../.pin/constraints-hpu-torch.txt
+    #   datasets
+    #   huggingface-hub
+    #   torch
+giving==0.4.3
+    # via
+    #   -c .pin/../.pin/constraints-hpu-torch.txt
+    #   ptera
+    #   voir
+huggingface-hub==0.25.1
+    # via
+    #   -c .pin/../.pin/constraints-hpu-torch.txt
+    #   accelerate
+    #   datasets
+    #   
tokenizers + # transformers +idna==3.10 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # requests + # yarl +jinja2==3.1.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +markdown-it-py==3.0.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # rich +markupsafe==2.1.5 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # jinja2 +mdurl==0.1.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # markdown-it-py +mpmath==1.3.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # sympy +multidict==6.1.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # aiohttp + # yarl +multiprocess==0.70.16 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # datasets +networkx==3.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +numpy==1.26.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/llava/requirements.in + # accelerate + # datasets + # pandas + # pyarrow + # transformers +nvidia-cublas-cu12==12.1.3.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # nvidia-cudnn-cu12 + # nvidia-cusolver-cu12 + # torch +nvidia-cuda-cupti-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cuda-nvrtc-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cuda-runtime-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cudnn-cu12==9.1.0.70 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cufft-cu12==11.0.2.54 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-curand-cu12==10.3.2.106 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cusolver-cu12==11.4.5.107 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cusparse-cu12==12.1.0.106 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # nvidia-cusolver-cu12 + # torch +nvidia-ml-py==12.560.30 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +nvidia-nccl-cu12==2.20.5 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-nvjitlink-cu12==12.6.77 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # nvidia-cusolver-cu12 + # nvidia-cusparse-cu12 +nvidia-nvtx-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +omegaconf==2.3.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +ovld==0.3.9 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +packaging==24.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # accelerate + # datasets + # huggingface-hub + # transformers +pandas==2.2.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # datasets +pillow==10.4.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/llava/requirements.in +psutil==5.9.8 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # accelerate + # voir +ptera==1.4.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +pyarrow==17.0.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # datasets +pygments==2.18.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # rich +python-dateutil==2.9.0.post0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # pandas +pytz==2024.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # pandas +pyyaml==6.0.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # accelerate + # datasets + # huggingface-hub + # omegaconf + # transformers +reactivex==4.0.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # giving 
+regex==2024.9.11 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # transformers +requests==2.32.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # datasets + # huggingface-hub + # transformers +rich==13.9.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +safetensors==0.4.5 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # accelerate + # transformers +six==1.16.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # asttokens + # python-dateutil +sympy==1.13.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +tokenizers==0.19.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # transformers +torch==2.4.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/llava/requirements.in + # accelerate +tqdm==4.66.5 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # datasets + # huggingface-hub + # transformers +transformers==4.44.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -c .pin/../constraints/hpu.txt + # -r benchmarks/llava/requirements.in +triton==3.0.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +typing-extensions==4.12.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # huggingface-hub + # multidict + # reactivex + # rich + # torch +tzdata==2024.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # pandas +urllib3==2.2.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # requests +varname==0.13.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # giving +voir==0.2.19 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -c .pin/../constraints/hpu.txt + # -r benchmarks/llava/requirements.in +xxhash==3.5.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # datasets +yarl==1.13.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # aiohttp diff --git a/benchmarks/llm/configs/llama3_70B_full.yaml b/benchmarks/llm/configs/llama3_70B_full.yaml index ae5cf2af..22b52b79 100644 --- a/benchmarks/llm/configs/llama3_70B_full.yaml +++ b/benchmarks/llm/configs/llama3_70B_full.yaml @@ -82,7 +82,7 @@ optimizer: foreach: False # Note: highly recommended to use fused=True optimizer flag # with CPU offload for faster optimizer step. 
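# A sketch of the reasoning behind the flag flip in the hunk below (the patch
# itself does not state it; presumably the fused AdamW kernel only exists for
# some backends, so a device-agnostic benchmark cannot rely on it). Every name
# here is illustrative; the probe falls back to the unfused path on failure:
import torch

params = [torch.nn.Parameter(torch.zeros(8))]
for p in params:
    p.grad = torch.zeros_like(p)
try:
    optim = torch.optim.AdamW(params, lr=2e-5, foreach=False, fused=True)
    optim.step()  # unsupported devices raise at construction or on first step
except RuntimeError:
    optim = torch.optim.AdamW(params, lr=2e-5, foreach=False, fused=False)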
-  fused: True
+  fused: False
 
 loss:
   _component_: torch.nn.CrossEntropyLoss
@@ -94,9 +94,9 @@ gradient_accumulation_steps: 1
 device: cuda
 
 # Memory management
-enable_activation_checkpointing: True
-memory_efficient_fsdp_wrap: True
-fsdp_cpu_offload: True
+enable_activation_checkpointing: False
+memory_efficient_fsdp_wrap: False
+fsdp_cpu_offload: False
 
 # Reduced precision
 dtype: bf16
diff --git a/benchmarks/llm/recipes/full_finetune_distributed.py b/benchmarks/llm/recipes/full_finetune_distributed.py
index 3a51842d..19556ec7 100755
--- a/benchmarks/llm/recipes/full_finetune_distributed.py
+++ b/benchmarks/llm/recipes/full_finetune_distributed.py
@@ -16,6 +16,7 @@
 
 import torch
 from omegaconf import DictConfig, ListConfig
+import torchcompat.core as acc
 from torch import nn
 from torch.distributed import init_process_group
 from torch.distributed.fsdp import (
@@ -38,6 +39,8 @@
 
 log = utils.get_logger("DEBUG")
 
+HPU_UNSUPPORTED = False
+
 
 class FullFinetuneRecipeDistributed(FTRecipeInterface):
     """
@@ -98,7 +101,8 @@ class FullFinetuneRecipeDistributed(FTRecipeInterface):
 
     def __init__(self, cfg: DictConfig) -> None:
 
-        self._device = utils.get_device(device=cfg.device)
+        import os
+        self._device = acc.fetch_device(int(os.getenv("LOCAL_RANK", "0")))
         self._dtype = utils.get_dtype(cfg.dtype, device=self._device)
 
         if self._dtype == torch.float16:
@@ -131,7 +135,10 @@ def __init__(self, cfg: DictConfig) -> None:
 
         # These are public properties which are updated by the checkpoint loader
        # when ``resume_from_checkpoint`` is `True` or validated in tests
-        self.seed = utils.set_seed(seed=cfg.seed)
+        if HPU_UNSUPPORTED:
+            self.seed = utils.set_seed(seed=cfg.seed)
+        else:
+            self.seed = 1
         self.epochs_run = 0
         self.total_epochs = cfg.epochs
         self.max_steps_per_epoch = cfg.max_steps_per_epoch
@@ -351,8 +358,10 @@ def _setup_model(
         )
 
         if self._is_rank_zero:
-            memory_stats = utils.get_memory_stats(device=self._device)
-            utils.log_memory_stats(memory_stats)
+            if HPU_UNSUPPORTED:
+                pass
+                # memory_stats = utils.get_memory_stats(device=self._device)
+                # utils.log_memory_stats(memory_stats)
 
         # synchronize before training begins
         torch.distributed.barrier()
@@ -413,6 +422,7 @@ def _setup_data(
             dataset=ds,
             batch_size=batch_size,
             sampler=sampler,
+            # persistent_workers=True,
             collate_fn=partial(
                 utils.padded_collate,
                 padding_idx=self._tokenizer.pad_id,
@@ -543,31 +553,13 @@ def train(self) -> None:
                         f"{curr_epoch+1}|{self.global_step}|Loss: {loss_to_log}"
                     )
 
-                    # Log per-step metrics
-                    if (
-                        self.global_step % self._log_every_n_steps == 0
-                        and self._is_rank_zero
-                    ):
-                        time_per_step = time.perf_counter() - t0
-                        log_dict = {
-                            "loss": loss_to_log,
-                            "lr": self._optimizer.param_groups[0]["lr"],
-                            "tokens_per_second_per_gpu": num_tokens / time_per_step,
-                        }
-                        if self._log_peak_memory_stats:
-                            log_dict.update(utils.get_memory_stats(device=self._device))
-                        self._metric_logger.log_dict(
-                            log_dict,
-                            step=self.global_step,
-                        )
-
                     # Reset running stats for the next step
                     running_loss = 0
                     num_tokens = 0
                     t0 = time.perf_counter()
 
             self.epochs_run += 1
-            self.save_checkpoint(epoch=curr_epoch)
+            # self.save_checkpoint(epoch=curr_epoch)
 
     def cleanup(self) -> None:
         if self._is_rank_zero:
@@ -618,7 +610,8 @@ def recipe_main(cfg: DictConfig) -> None:
             "If using tune CLI, please specify --nnodes 1 and --nproc_per_node [num_gpus]"
         )
 
-    init_process_group(backend="gloo" if cfg.device == "cpu" else "nccl")
+    acc.init_process_group()
+
     if cfg.get("fsdp_cpu_offload", False):
         # Utilize all available CPU cores for intra-op parallelism. This provides ~2x
         # speed up when benchmarking fused AdamW on CPU
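# The recipe hunks above route device selection and process-group setup
# through torchcompat instead of hard-coding CUDA. Only fetch_device and
# init_process_group are taken from the patch; the rest of this sketch is
# assumed launch boilerplate:
import os

import torch
import torchcompat.core as acc

local_rank = int(os.getenv("LOCAL_RANK", "0"))  # set per worker by torchrun
device = acc.fetch_device(local_rank)           # e.g. cuda:N or hpu:N
acc.init_process_group()                        # backend matching the device
x = torch.ones(8, device=device)                # tensors/collectives use it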
diff --git a/benchmarks/llm/recipes/full_finetune_single_device.py b/benchmarks/llm/recipes/full_finetune_single_device.py
index 98322579..629b0e9a 100755
--- a/benchmarks/llm/recipes/full_finetune_single_device.py
+++ b/benchmarks/llm/recipes/full_finetune_single_device.py
@@ -97,7 +97,8 @@ class FullFinetuneRecipeSingleDevice(FTRecipeInterface):
     """
 
     def __init__(self, cfg: DictConfig) -> None:
-        self._device = utils.get_device(device=cfg.device)
+        import torchcompat.core as accelerator
+        self._device = accelerator.fetch_device(int(os.getenv("HABANA_VISIBLE_MODULES", "0").split(",")[0]))
         self._dtype = utils.get_dtype(cfg.dtype, device=self._device)
         # Disable for fp16, as we haven't validated "full" fp16 with this recipe, nor
         # enabled necessary features such as gradient scaling.
@@ -279,9 +280,9 @@ def _setup_model(
             log.info("Compiling model with torch.compile...")
             backend = os.environ.get("TORCH_COMPILE_BACKEND", "inductor")
             model.compile(backend=backend)
-        if self._device.type == "cuda":
-            memory_stats = utils.get_memory_stats(device=self._device)
-            utils.log_memory_stats(memory_stats)
+        # if self._device.type == "cuda":
+        #     memory_stats = utils.get_memory_stats(device=self._device)
+        #     utils.log_memory_stats(memory_stats)
 
         return model
 
@@ -487,8 +488,8 @@ def train(self) -> None:
                         ),
                         "tokens_per_second_per_gpu": num_tokens / time_per_step,
                     }
-                    if self._device.type == "cuda" and self._log_peak_memory_stats:
-                        log_dict.update(utils.get_memory_stats(device=self._device))
+                    # if self._device.type == "cuda" and self._log_peak_memory_stats:
+                    #     log_dict.update(utils.get_memory_stats(device=self._device))
                     self._metric_logger.log_dict(
                         log_dict,
                         step=self.global_step,
diff --git a/benchmarks/llm/recipes/lora_finetune_distributed.py b/benchmarks/llm/recipes/lora_finetune_distributed.py
index 18b736fb..ae7c5b40 100755
--- a/benchmarks/llm/recipes/lora_finetune_distributed.py
+++ b/benchmarks/llm/recipes/lora_finetune_distributed.py
@@ -16,6 +16,7 @@
 
 import torch
 from omegaconf import DictConfig, ListConfig
+import torchcompat.core as acc
 from torch import nn
 from torch.distributed import destroy_process_group, init_process_group
 
@@ -44,6 +45,9 @@
 
 log = utils.get_logger("DEBUG")
 
+HPU_UNSUPPORTED = False
+
+
 class LoRAFinetuneRecipeDistributed(FTRecipeInterface):
     """
     Distributed LoRA finetuning recipe for dense transformer-based LLMs such as Llama2.
This recipe supports @@ -108,7 +112,7 @@ class LoRAFinetuneRecipeDistributed(FTRecipeInterface): """ def __init__(self, cfg: DictConfig) -> None: - self._device = utils.get_device(device=cfg.device) + self._device = acc.fetch_device(int(os.getenv("LOCAL_RANK", "0"))) self._dtype = utils.get_dtype(cfg.dtype, device=self._device) if self._dtype == torch.float16: @@ -132,7 +136,11 @@ def __init__(self, cfg: DictConfig) -> None: # These attributes constitute the recipe state and are updated by ``load_checkpoint`` # when ``resume_from_checkpoint`` is ``True`` - self.seed = utils.set_seed(seed=cfg.seed) + if HPU_UNSUPPORTED: + self.seed = utils.set_seed(seed=cfg.seed) + else: + self.seed = 1 + self.epochs_run = 0 self.total_epochs = cfg.epochs self.max_steps_per_epoch = cfg.max_steps_per_epoch @@ -428,7 +436,7 @@ def _setup_model( # Initialize empty modules on all non-zero ranks param_init_fn=( lambda module: module.to_empty( - device=torch.device("cuda"), recurse=False + device=self._device, recurse=False ) if not self._is_rank_zero else None @@ -443,8 +451,10 @@ def _setup_model( model, auto_wrap_policy={modules.TransformerDecoderLayer} ) if self._is_rank_zero: - memory_stats = utils.get_memory_stats(device=self._device) - utils.log_memory_stats(memory_stats) + if HPU_UNSUPPORTED: + pass + # memory_stats = utils.get_memory_stats(device=self._device) + # utils.log_memory_stats(memory_stats) # synchronize before training begins torch.distributed.barrier() @@ -703,8 +713,9 @@ def train(self) -> None: "lr": self._optimizer.param_groups[0]["lr"], "tokens_per_second_per_gpu": num_tokens / time_per_step, } - if self._log_peak_memory_stats: - log_dict.update(utils.get_memory_stats(device=self._device)) + # if self._log_peak_memory_stats: + # if HPU_UNSUPPORTED: + # log_dict.update(utils.get_memory_stats(device=self._device)) self._metric_logger.log_dict( log_dict, step=self.global_step, @@ -773,7 +784,7 @@ def recipe_main(cfg: DictConfig) -> None: "If using tune CLI, please specify --nnodes 1 and --nproc_per_node [num_gpus]" ) os.environ["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1" - init_process_group(backend="gloo" if cfg.device == "cpu" else "nccl") + acc.init_process_group() config.log_config(recipe_name="LoRAFinetuneRecipeDistributed", cfg=cfg) diff --git a/benchmarks/llm/recipes/lora_finetune_single_device.py b/benchmarks/llm/recipes/lora_finetune_single_device.py index cf5256ea..9060d203 100755 --- a/benchmarks/llm/recipes/lora_finetune_single_device.py +++ b/benchmarks/llm/recipes/lora_finetune_single_device.py @@ -101,8 +101,9 @@ class LoRAFinetuneRecipeSingleDevice(FTRecipeInterface): """ def __init__(self, cfg: DictConfig) -> None: - - self._device = utils.get_device(device=cfg.device) + import torchcompat.core as accelerator + + self._device = accelerator.fetch_device(int(os.getenv("HABANA_VISIBLE_MODULES", "0").split(",")[0])) # Reduced precision logic self._dtype = utils.get_dtype(cfg.dtype, device=self._device) # fp16 precision is explicitly disabled as it is not supported in this @@ -388,9 +389,9 @@ def _setup_model( log.info("Compiling model with torch.compile...") backend = os.environ.get("TORCH_COMPILE_BACKEND", "inductor") model.compile(backend=backend) - if self._device.type == "cuda": - memory_stats = utils.get_memory_stats(device=self._device) - utils.log_memory_stats(memory_stats) + # if self._device.type == "cuda": + # memory_stats = utils.get_memory_stats(device=self._device) + # utils.log_memory_stats(memory_stats) return model def _setup_optimizer( @@ -528,7 +529,8 @@ def 
train(self) -> None: """ The core training loop. """ - + import torchcompat.core as accelerator + if self._model_compile: log.info( "NOTE: torch.compile is enabled and model is compiled in first forward. Expect a relatively slow first iteration." @@ -579,10 +581,13 @@ def train(self) -> None: loss = self._loss_fn(logits, labels) / self._gradient_accumulation_steps running_loss += loss loss.backward() + accelerator.mark_step() # Step with optimizer if (idx + 1) % self._gradient_accumulation_steps == 0: self._optimizer.step() + accelerator.mark_step() + self._optimizer.zero_grad(set_to_none=True) self._lr_scheduler.step() # Update the number of steps when the weights are updated @@ -603,13 +608,13 @@ def train(self) -> None: "lr": self._optimizer.param_groups[0]["lr"], "tokens_per_second_per_gpu": num_tokens / time_per_step, } - if ( - self._device.type == "cuda" - and self._log_peak_memory_stats - ): - log_dict.update( - utils.get_memory_stats(device=self._device) - ) + # if ( + # self._device.type == "cuda" + # and self._log_peak_memory_stats + # ): + # log_dict.update( + # utils.get_memory_stats(device=self._device) + # ) self._metric_logger.log_dict( log_dict, step=self.global_step, diff --git a/benchmarks/llm/recipes/ppo_full_finetune_single_device.py b/benchmarks/llm/recipes/ppo_full_finetune_single_device.py index 8ee77c06..fbf8630a 100644 --- a/benchmarks/llm/recipes/ppo_full_finetune_single_device.py +++ b/benchmarks/llm/recipes/ppo_full_finetune_single_device.py @@ -496,9 +496,9 @@ def _setup_model( ref_policy_model.compile(backend=backend) value_model.compile(backend=backend) - if self._device.type == "cuda": - memory_stats = utils.get_memory_stats(device=self._device) - utils.log_memory_stats(memory_stats) + # if self._device.type == "cuda": + # memory_stats = utils.get_memory_stats(device=self._device) + # utils.log_memory_stats(memory_stats) return policy_model, value_model, reward_model, ref_policy_model @@ -1031,8 +1031,8 @@ def log_metrics( "approx_policy_kl": ppo_stats.approx_policy_kls.mean(), "response_lengths": trajectory.seq_lens.float().mean(), } - if self._device.type == "cuda" and self._log_peak_memory_stats: - log_dict.update(utils.get_memory_stats(device=self._device)) + # if self._device.type == "cuda" and self._log_peak_memory_stats: + # log_dict.update(utils.get_memory_stats(device=self._device)) self._metric_logger.log_dict(log_dict, step=self.global_step) diff --git a/benchmarks/llm/requirements.hpu.txt b/benchmarks/llm/requirements.hpu.txt new file mode 100644 index 00000000..9b88be53 --- /dev/null +++ b/benchmarks/llm/requirements.hpu.txt @@ -0,0 +1,408 @@ +# +# This file is autogenerated by pip-compile with Python 3.10 +# by the following command: +# +# pip-compile --output-file=benchmarks/llm/requirements.hpu.txt .pin/tmp-constraints-hpu-llm-full-mp-nodes.txt benchmarks/llm/requirements.in +# +accelerate==0.34.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/llm/requirements.in +aiohappyeyeballs==2.4.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # aiohttp +aiohttp==3.10.8 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # datasets + # fsspec +aiosignal==1.3.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # aiohttp +antlr4-python3-runtime==4.9.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # omegaconf +argklass==1.4.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/llm/requirements.in +asttokens==2.4.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # giving 
+async-timeout==4.0.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # aiohttp +attrs==24.2.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # aiohttp +blobfile==3.0.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/llm/requirements.txt + # torchtune +certifi==2024.8.30 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # requests +charset-normalizer==3.3.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # requests +codefind==0.1.7 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # ptera +datasets==3.0.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torchtune +dill==0.3.8 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # datasets + # multiprocess +executing==2.1.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # varname +fairscale==0.4.13 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/llm/requirements.in + # -r benchmarks/llm/requirements.txt +filelock==3.16.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # blobfile + # datasets + # huggingface-hub + # torch + # transformers + # triton +fire==0.7.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/llm/requirements.txt +frozenlist==1.4.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # aiohttp + # aiosignal +fsspec[http]==2024.6.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # datasets + # huggingface-hub + # torch +giving==0.4.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # ptera + # voir +hjson==3.1.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # argklass +huggingface-hub==0.25.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # accelerate + # datasets + # tokenizers + # torchtune + # transformers +idna==3.10 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # requests + # yarl +importlib-resources==6.4.5 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # argklass +jinja2==3.1.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +lxml==5.3.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # blobfile +markdown-it-py==3.0.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # rich +markupsafe==2.1.5 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # jinja2 +mdurl==0.1.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # markdown-it-py +mpmath==1.3.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # sympy +multidict==6.1.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # aiohttp + # yarl +multiprocess==0.70.16 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # datasets +networkx==3.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +numpy==1.26.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # accelerate + # datasets + # fairscale + # pandas + # pyarrow + # torchtune + # transformers +nvidia-cublas-cu12==12.1.3.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # nvidia-cudnn-cu12 + # nvidia-cusolver-cu12 + # torch +nvidia-cuda-cupti-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cuda-nvrtc-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cuda-runtime-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cudnn-cu12==9.1.0.70 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cufft-cu12==11.0.2.54 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-curand-cu12==10.3.2.106 + # via + # -c 
.pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cusolver-cu12==11.4.5.107 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cusparse-cu12==12.1.0.106 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # nvidia-cusolver-cu12 + # torch +nvidia-ml-py==12.560.30 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +nvidia-nccl-cu12==2.20.5 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-nvjitlink-cu12==12.6.77 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # nvidia-cusolver-cu12 + # nvidia-cusparse-cu12 +nvidia-nvtx-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +omegaconf==2.3.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torchtune + # voir +ovld==0.3.9 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +packaging==24.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # accelerate + # datasets + # huggingface-hub + # transformers +pandas==2.2.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # datasets +psutil==5.9.8 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # accelerate + # voir +ptera==1.4.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +pyarrow==17.0.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # datasets +pycryptodomex==3.21.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # blobfile +pygments==2.18.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # rich +python-dateutil==2.9.0.post0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # pandas +pytz==2024.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # pandas +pyyaml==6.0.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/llm/requirements.in + # accelerate + # datasets + # huggingface-hub + # omegaconf + # transformers +reactivex==4.0.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # giving +regex==2024.9.11 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # tiktoken + # transformers +requests==2.32.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # datasets + # huggingface-hub + # tiktoken + # transformers +rich==13.9.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +safetensors==0.4.5 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # accelerate + # torchtune + # transformers +sentencepiece==0.2.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torchtune +six==1.16.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # asttokens + # python-dateutil +sympy==1.13.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +termcolor==2.4.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # fire +tiktoken==0.7.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torchtune +tokenizers==0.19.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # transformers +torch==2.4.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/llm/requirements.in + # -r benchmarks/llm/requirements.txt + # accelerate + # fairscale +torchao==0.3.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -c .pin/../constraints/hpu.txt + # -r benchmarks/llm/requirements.in + # torchtune +torchtune==0.2.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -c .pin/../constraints/hpu.txt + # -r benchmarks/llm/requirements.in +tqdm==4.66.5 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # datasets + # huggingface-hub + # torchtune + # transformers +transformers==4.44.2 + # via + # -c 
.pin/../.pin/constraints-hpu-torch.txt + # -c .pin/../constraints/hpu.txt + # -r benchmarks/llm/requirements.in +triton==3.0.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +typing-extensions==4.12.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # huggingface-hub + # multidict + # reactivex + # rich + # torch +tzdata==2024.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # pandas +urllib3==2.2.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # blobfile + # requests +varname==0.13.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # giving +voir==0.2.19 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -c .pin/../constraints/hpu.txt + # -r benchmarks/llm/requirements.in +xxhash==3.5.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # datasets +yarl==1.13.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # aiohttp diff --git a/benchmarks/purejaxrl/requirements.hpu.txt b/benchmarks/purejaxrl/requirements.hpu.txt new file mode 100644 index 00000000..aeb2b110 --- /dev/null +++ b/benchmarks/purejaxrl/requirements.hpu.txt @@ -0,0 +1,743 @@ +# +# This file is autogenerated by pip-compile with Python 3.10 +# by the following command: +# +# pip-compile --output-file=benchmarks/purejaxrl/requirements.hpu.txt .pin/tmp-constraints-hpu-ppo.txt benchmarks/purejaxrl/requirements.in +# +absl-py==2.1.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # brax + # chex + # distrax + # dm-env + # ml-collections + # mujoco + # mujoco-mjx + # optax + # orbax-checkpoint + # rlax + # tensorflow-probability +antlr4-python3-runtime==4.9.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # omegaconf +argklass==1.4.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/purejaxrl/requirements.in +astroid==3.3.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # pylint +asttokens==2.4.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # giving +black==24.8.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # navix +blinker==1.8.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # flask +brax==0.10.5 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/purejaxrl/requirements.in +certifi==2024.8.30 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # requests + # sentry-sdk +charset-normalizer==3.3.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # requests +chex==0.1.87 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # distrax + # evosax + # flashbax + # gymnax + # optax + # rlax +click==8.1.7 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # black + # flask + # wandb +cloudpickle==3.0.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # gym + # gymnasium + # tensorflow-probability +codefind==0.1.7 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # ptera +contextlib2==21.6.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # ml-collections +contourpy==1.3.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # matplotlib +cycler==0.12.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # matplotlib +decorator==5.1.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # tensorflow-probability +dill==0.3.8 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # pylint +distrax==0.1.5 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/purejaxrl/requirements.in + # rlax +dm-env==1.6 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # brax + # rlax +dm-tree==0.1.8 + # via + # -c 
.pin/../.pin/constraints-hpu-torch.txt + # dm-env + # tensorflow-probability +docker-pycreds==0.4.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # wandb +docstring-parser==0.16 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # tyro +dotmap==1.3.30 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # evosax +etils[epath,epy]==1.9.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # brax + # mujoco + # mujoco-mjx + # optax + # orbax-checkpoint +evosax==0.1.6 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/purejaxrl/requirements.in +exceptiongroup==1.2.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # pytest +executing==2.1.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # varname +farama-notifications==0.0.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # gymnasium +filelock==3.16.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch + # triton +flake8==7.1.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # navix +flashbax==0.1.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/purejaxrl/requirements.in +flask==3.0.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # brax + # flask-cors +flask-cors==5.0.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # brax +flax==0.9.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/purejaxrl/requirements.in + # brax + # evosax + # flashbax + # gymnax + # navix +fonttools==4.54.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # matplotlib +fsspec==2024.6.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # etils + # torch +gast==0.6.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # tensorflow-probability +gitdb==4.0.11 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # gitpython +gitpython==3.1.43 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # wandb +giving==0.4.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # ptera + # voir +glfw==2.7.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # mujoco +grpcio==1.66.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # brax +gym==0.26.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # brax + # gymnax +gym-notices==0.0.8 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # gym +gymnasium==0.29.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # gymnax +gymnax==0.0.8 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -c .pin/../constraints/hpu.txt + # -r benchmarks/purejaxrl/requirements.in +hjson==3.1.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # argklass +humanize==4.10.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # orbax-checkpoint +idna==3.10 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # requests +importlib-resources==6.4.5 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # argklass + # etils +iniconfig==2.0.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # pytest +isort==5.13.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # pylint +itsdangerous==2.2.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # flask +jax==0.4.33 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/purejaxrl/requirements.in + # brax + # chex + # distrax + # evosax + # flashbax + # flax + # gymnax + # jaxopt + # mujoco-mjx + # optax + # orbax-checkpoint + # rlax +jaxlib==0.4.33 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # brax + # chex + # distrax + # evosax + # flashbax + 
# gymnax + # jax + # jaxopt + # mujoco-mjx + # optax + # orbax-checkpoint + # rlax +jaxopt==0.8.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # brax +jinja2==3.1.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # brax + # flask + # torch +kiwisolver==1.4.7 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # matplotlib +markdown-it-py==3.0.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # rich +markupsafe==2.1.5 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # jinja2 + # werkzeug +matplotlib==3.9.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # evosax + # gymnax + # seaborn +mccabe==0.7.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # flake8 + # pylint +mdurl==0.1.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # markdown-it-py +ml-collections==0.1.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # brax +ml-dtypes==0.5.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # jax + # jaxlib + # tensorstore +mpmath==1.3.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # sympy +msgpack==1.1.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # flax + # orbax-checkpoint +mujoco==3.2.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # brax + # mujoco-mjx +mujoco-mjx==3.2.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # brax +mypy-extensions==1.0.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # black +navix==0.7.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/purejaxrl/requirements.in +nest-asyncio==1.6.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # orbax-checkpoint +networkx==3.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +numpy==1.26.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/purejaxrl/requirements.in + # brax + # chex + # contourpy + # distrax + # dm-env + # evosax + # flashbax + # gym + # gymnasium + # jax + # jaxlib + # jaxopt + # matplotlib + # ml-dtypes + # mujoco + # navix + # optax + # orbax-checkpoint + # pandas + # rlax + # scipy + # seaborn + # tensorboardx + # tensorflow-probability + # tensorstore + # trimesh +nvidia-cublas-cu12==12.1.3.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # nvidia-cudnn-cu12 + # nvidia-cusolver-cu12 + # torch +nvidia-cuda-cupti-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cuda-nvrtc-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cuda-runtime-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cudnn-cu12==9.1.0.70 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cufft-cu12==11.0.2.54 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-curand-cu12==10.3.2.106 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cusolver-cu12==11.4.5.107 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cusparse-cu12==12.1.0.106 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # nvidia-cusolver-cu12 + # torch +nvidia-ml-py==12.560.30 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +nvidia-nccl-cu12==2.20.5 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-nvjitlink-cu12==12.6.77 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # nvidia-cusolver-cu12 + # nvidia-cusparse-cu12 +nvidia-nvtx-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +omegaconf==2.3.0 + # via + # 
-c .pin/../.pin/constraints-hpu-torch.txt + # voir +opt-einsum==3.4.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # jax +optax==0.2.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/purejaxrl/requirements.in + # brax + # flax +orbax-checkpoint==0.6.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # brax + # flax +ovld==0.3.9 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +packaging==24.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # black + # matplotlib + # pytest + # setuptools-scm + # tensorboardx +pandas==2.2.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # seaborn +pathspec==0.12.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # black +pillow==10.4.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # brax + # matplotlib + # navix +platformdirs==4.3.6 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # black + # pylint + # wandb +pluggy==1.5.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # pytest +protobuf==5.28.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # orbax-checkpoint + # tensorboardx + # wandb +psutil==5.9.8 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir + # wandb +ptera==1.4.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +pycodestyle==2.12.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # flake8 +pyflakes==3.2.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # flake8 +pygments==2.18.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # rich +pylint==3.3.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # navix +pyopengl==3.1.7 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # mujoco +pyparsing==3.1.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # matplotlib +pytest==8.3.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # navix +python-dateutil==2.9.0.post0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # matplotlib + # pandas +pytinyrenderer==0.0.14 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # brax +pytz==2024.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # pandas +pyyaml==6.0.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # evosax + # flax + # gymnax + # ml-collections + # omegaconf + # orbax-checkpoint + # wandb +reactivex==4.0.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # giving +requests==2.32.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # wandb +rich==13.9.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # flax + # tyro + # voir +rlax==0.1.6 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # navix +scipy==1.14.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # brax + # jax + # jaxlib + # jaxopt + # mujoco-mjx +seaborn==0.13.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # gymnax +sentry-sdk==2.15.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # wandb +setproctitle==1.3.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # wandb +setuptools-scm==8.1.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # navix +shtab==1.7.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # tyro +six==1.16.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # asttokens + # docker-pycreds + # ml-collections + # python-dateutil + # tensorflow-probability +smmap==5.0.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # gitdb +sympy==1.13.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +tensorboardx==2.6.2.2 + 
# via + # -c .pin/../.pin/constraints-hpu-torch.txt + # brax +tensorflow-probability==0.24.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # distrax +tensorstore==0.1.66 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # flashbax + # flax + # orbax-checkpoint +tomli==2.0.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # black + # pylint + # pytest + # setuptools-scm +tomlkit==0.13.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # pylint +toolz==0.12.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # chex +torch==2.4.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/purejaxrl/requirements.in +trimesh==4.4.9 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # brax + # mujoco-mjx +triton==3.0.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +typing-extensions==4.12.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # astroid + # black + # brax + # chex + # etils + # flashbax + # flax + # gymnasium + # navix + # orbax-checkpoint + # reactivex + # rich + # torch + # tyro +tyro==0.8.11 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # navix +tzdata==2024.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # pandas +urllib3==2.2.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # requests + # sentry-sdk +varname==0.13.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # giving +voir==0.2.19 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -c .pin/../constraints/hpu.txt + # -r benchmarks/purejaxrl/requirements.in +wandb==0.18.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # navix +werkzeug==3.0.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # flask +zipp==3.20.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # etils + +# The following packages are considered to be unsafe in a requirements file: +# setuptools diff --git a/benchmarks/recursiongfn/requirements.hpu.txt b/benchmarks/recursiongfn/requirements.hpu.txt new file mode 100644 index 00000000..4e362ae6 --- /dev/null +++ b/benchmarks/recursiongfn/requirements.hpu.txt @@ -0,0 +1,493 @@ +# +# This file is autogenerated by pip-compile with Python 3.10 +# by the following command: +# +# pip-compile --output-file=benchmarks/recursiongfn/requirements.hpu.txt .pin/tmp-constraints-hpu-recursiongfn.txt benchmarks/recursiongfn/requirements.in +# +absl-py==2.1.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # tensorboard +aiohappyeyeballs==2.4.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # aiohttp +aiohttp==3.10.8 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch-geometric +aiosignal==1.3.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # aiohttp +antlr4-python3-runtime==4.9.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # omegaconf +asttokens==2.4.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # giving +async-timeout==4.0.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # aiohttp +attrs==24.2.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # aiohttp +blosc2==2.7.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # tables +botorch==0.12.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/recursiongfn/requirements.in +certifi==2024.8.30 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # requests + # sentry-sdk +charset-normalizer==3.3.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # requests +click==8.1.7 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt 
+ # wandb +codefind==0.1.7 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # ptera +cvxopt==1.3.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/recursiongfn/requirements.in +docker-pycreds==0.4.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # wandb +executing==2.1.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # varname +filelock==3.16.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch + # triton +frozenlist==1.4.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # aiohttp + # aiosignal +fsspec==2024.6.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch + # torch-geometric +gitdb==4.0.11 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # gitpython +gitpython==3.1.43 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/recursiongfn/requirements.in + # wandb +giving==0.4.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # ptera + # voir +gpytorch==1.13 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/recursiongfn/requirements.in + # botorch +grpcio==1.66.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # tensorboard +idna==3.10 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # requests + # yarl +jaxtyping==0.2.19 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # gpytorch + # linear-operator +jinja2==3.1.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch + # torch-geometric +joblib==1.4.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # scikit-learn +linear-operator==0.5.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # botorch + # gpytorch +markdown==3.7 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # tensorboard +markdown-it-py==3.0.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # rich +markupsafe==2.1.5 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # jinja2 + # werkzeug +mdurl==0.1.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # markdown-it-py +mpmath==1.3.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # botorch + # gpytorch + # linear-operator + # sympy +msgpack==1.1.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # blosc2 +multidict==6.1.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # aiohttp + # yarl +multipledispatch==1.0.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # botorch +ndindex==1.9.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # blosc2 +networkx==3.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/recursiongfn/requirements.in + # torch +numexpr==2.10.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # blosc2 + # tables +numpy==1.26.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # blosc2 + # jaxtyping + # numexpr + # pandas + # pyarrow + # pyro-ppl + # rdkit + # scikit-learn + # scipy + # tables + # tensorboard + # torch-geometric +nvidia-cublas-cu12==12.1.3.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # nvidia-cudnn-cu12 + # nvidia-cusolver-cu12 + # torch +nvidia-cuda-cupti-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cuda-nvrtc-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cuda-runtime-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cudnn-cu12==9.1.0.70 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cufft-cu12==11.0.2.54 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt 
+ # torch +nvidia-curand-cu12==10.3.2.106 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cusolver-cu12==11.4.5.107 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cusparse-cu12==12.1.0.106 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # nvidia-cusolver-cu12 + # torch +nvidia-ml-py==12.560.30 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +nvidia-nccl-cu12==2.20.5 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-nvjitlink-cu12==12.6.77 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # nvidia-cusolver-cu12 + # nvidia-cusparse-cu12 +nvidia-nvtx-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +omegaconf==2.3.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/recursiongfn/requirements.in + # voir +opt-einsum==3.4.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # pyro-ppl +ovld==0.3.9 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +packaging==24.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # tables + # tensorboard +pandas==2.2.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/recursiongfn/requirements.in +pillow==10.4.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # rdkit +platformdirs==4.3.6 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # wandb +protobuf==5.28.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # tensorboard + # wandb +psutil==5.9.8 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch-geometric + # voir + # wandb +ptera==1.4.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +py-cpuinfo==9.0.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # blosc2 + # tables +pyarrow==17.0.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/recursiongfn/requirements.in +pygments==2.18.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # rich +pyparsing==3.1.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch-geometric +pyro-api==0.1.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # pyro-ppl +pyro-ppl==1.9.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/recursiongfn/requirements.in + # botorch +python-dateutil==2.9.0.post0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # pandas +pytz==2024.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # pandas +pyyaml==6.0.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # omegaconf + # wandb +rdkit==2024.3.5 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/recursiongfn/requirements.in +reactivex==4.0.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # giving +requests==2.32.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch-geometric + # wandb +rich==13.9.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +scikit-learn==1.5.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # gpytorch +scipy==1.14.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/recursiongfn/requirements.in + # botorch + # gpytorch + # linear-operator + # scikit-learn + # torch-cluster + # torch-sparse +sentry-sdk==2.15.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # wandb +setproctitle==1.3.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # wandb +six==1.16.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # asttokens + # docker-pycreds + # python-dateutil + # tensorboard +smmap==5.0.1 + 
# via + # -c .pin/../.pin/constraints-hpu-torch.txt + # gitdb +sympy==1.13.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +tables==3.10.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/recursiongfn/requirements.in +tensorboard==2.18.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/recursiongfn/requirements.in +tensorboard-data-server==0.7.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # tensorboard +threadpoolctl==3.5.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # scikit-learn +torch==2.4.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/recursiongfn/requirements.in + # botorch + # linear-operator + # pyro-ppl +torch-cluster==1.6.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/recursiongfn/requirements.in +torch-geometric==2.6.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/recursiongfn/requirements.in +torch-scatter==2.1.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/recursiongfn/requirements.in +torch-sparse==0.6.18 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/recursiongfn/requirements.in +tqdm==4.66.5 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # pyro-ppl + # torch-geometric +triton==3.0.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +typeguard==4.3.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # jaxtyping +typing-extensions==4.12.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # botorch + # jaxtyping + # multidict + # reactivex + # rich + # tables + # torch + # typeguard +tzdata==2024.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # pandas +urllib3==2.2.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # requests + # sentry-sdk +varname==0.13.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # giving +voir==0.2.19 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -c .pin/../constraints/hpu.txt + # -r benchmarks/recursiongfn/requirements.in +wandb==0.18.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/recursiongfn/requirements.in +werkzeug==3.0.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # tensorboard +yarl==1.13.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # aiohttp + +# The following packages are considered to be unsafe in a requirements file: +# setuptools diff --git a/benchmarks/rlhf/main.py b/benchmarks/rlhf/main.py index 0be12d28..3a5f1dda 100755 --- a/benchmarks/rlhf/main.py +++ b/benchmarks/rlhf/main.py @@ -2,6 +2,7 @@ import shutil +import accelerate from accelerate import PartialState from datasets import load_dataset from transformers import ( @@ -15,10 +16,16 @@ from trl.trainer.ppov2_trainer import PPOv2Config, PPOv2Trainer from trl.trainer.utils import SIMPLE_QUERY_CHAT_TEMPLATE +import torchcompat.core as compat + class PPOv2TrainerIntrumented(PPOv2Trainer): def __init__(self, config: PPOv2Config, *args, **kwargs): config.report_to = [] + + # FIXME: better way to monkeypatch this ? 
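# A sketch of the monkey-patch applied just below (standard Python attribute
# rebinding; only compat.accelerate.Accelerator comes from the patch). The
# swap must happen before super().__init__ constructs the trainer, and it
# only affects call sites that resolve accelerate.Accelerator at call time;
# code that did `from accelerate import Accelerator` keeps the original class.
import accelerate
import torchcompat.core as compat

_original = accelerate.Accelerator
accelerate.Accelerator = compat.accelerate.Accelerator  # patch in place
try:
    pass  # construct the PPOv2 trainer here; trl picks up the patched class
finally:
    accelerate.Accelerator = _original  # optional: restore once constructed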
+ # Use the compatibility accelerator class + accelerate.Accelerator = compat.accelerate.Accelerator super().__init__(config, *args, **kwargs) def batch_size_fn(batch): @@ -46,9 +53,13 @@ def save_model(self, *args, **kwargs): def main(): + parser = HfArgumentParser((PPOv2Config, ModelConfig)) config, model_config = parser.parse_args_into_dataclasses() + + import torchcompat.core + # remove output_dir if exists shutil.rmtree(config.output_dir, ignore_errors=True) diff --git a/benchmarks/rlhf/requirements.hpu.txt b/benchmarks/rlhf/requirements.hpu.txt new file mode 100644 index 00000000..a6c12765 --- /dev/null +++ b/benchmarks/rlhf/requirements.hpu.txt @@ -0,0 +1,362 @@ +# +# This file is autogenerated by pip-compile with Python 3.10 +# by the following command: +# +# pip-compile --output-file=benchmarks/rlhf/requirements.hpu.txt .pin/tmp-constraints-hpu-rlhf-gpus.txt benchmarks/rlhf/requirements.in +# +accelerate==0.34.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/rlhf/requirements.in + # trl +aiohappyeyeballs==2.4.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # aiohttp +aiohttp==3.10.8 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # datasets + # fsspec +aiosignal==1.3.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # aiohttp +antlr4-python3-runtime==4.9.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # omegaconf +asttokens==2.4.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # giving +async-timeout==4.0.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # aiohttp +attrs==24.2.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # aiohttp +certifi==2024.8.30 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # requests +charset-normalizer==3.3.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # requests +codefind==0.1.7 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # ptera +datasets==3.0.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/rlhf/requirements.in + # trl +dill==0.3.8 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # datasets + # multiprocess +docstring-parser==0.16 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # tyro +executing==2.1.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # varname +filelock==3.16.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # datasets + # huggingface-hub + # torch + # transformers + # triton +frozenlist==1.4.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # aiohttp + # aiosignal +fsspec[http]==2024.6.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # datasets + # huggingface-hub + # torch +giving==0.4.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # ptera + # voir +huggingface-hub==0.25.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # accelerate + # datasets + # tokenizers + # transformers +idna==3.10 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # requests + # yarl +jinja2==3.1.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +markdown-it-py==3.0.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # rich +markupsafe==2.1.5 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # jinja2 +mdurl==0.1.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # markdown-it-py +mpmath==1.3.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # sympy +multidict==6.1.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # aiohttp + # yarl +multiprocess==0.70.16 + # via + # -c 
.pin/../.pin/constraints-hpu-torch.txt + # datasets +networkx==3.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +numpy==1.26.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # accelerate + # datasets + # pandas + # pyarrow + # transformers + # trl +nvidia-cublas-cu12==12.1.3.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # nvidia-cudnn-cu12 + # nvidia-cusolver-cu12 + # torch +nvidia-cuda-cupti-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cuda-nvrtc-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cuda-runtime-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cudnn-cu12==9.1.0.70 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cufft-cu12==11.0.2.54 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-curand-cu12==10.3.2.106 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cusolver-cu12==11.4.5.107 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cusparse-cu12==12.1.0.106 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # nvidia-cusolver-cu12 + # torch +nvidia-ml-py==12.560.30 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +nvidia-nccl-cu12==2.20.5 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-nvjitlink-cu12==12.6.77 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # nvidia-cusolver-cu12 + # nvidia-cusparse-cu12 +nvidia-nvtx-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +omegaconf==2.3.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +ovld==0.3.9 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +packaging==24.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # accelerate + # datasets + # huggingface-hub + # transformers +pandas==2.2.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # datasets +psutil==5.9.8 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # accelerate + # voir +ptera==1.4.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +pyarrow==17.0.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # datasets +pygments==2.18.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # rich +python-dateutil==2.9.0.post0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # pandas +pytz==2024.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # pandas +pyyaml==6.0.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # accelerate + # datasets + # huggingface-hub + # omegaconf + # transformers +reactivex==4.0.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # giving +regex==2024.9.11 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # transformers +requests==2.32.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # datasets + # huggingface-hub + # transformers +rich==13.9.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # tyro + # voir +safetensors==0.4.5 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # accelerate + # transformers +shtab==1.7.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # tyro +six==1.16.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # asttokens + # python-dateutil +sympy==1.13.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +tokenizers==0.19.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # transformers +torch==2.4.1 + # via + # -c 
.pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/rlhf/requirements.in + # accelerate + # trl +tqdm==4.66.5 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # datasets + # huggingface-hub + # transformers +transformers==4.44.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -c .pin/../constraints/hpu.txt + # -r benchmarks/rlhf/requirements.in + # trl +triton==3.0.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +trl==0.10.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -c .pin/../constraints/hpu.txt + # -r benchmarks/rlhf/requirements.in +typing-extensions==4.12.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # huggingface-hub + # multidict + # reactivex + # rich + # torch + # tyro +tyro==0.8.11 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # trl +tzdata==2024.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # pandas +urllib3==2.2.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # requests +varname==0.13.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # giving +voir==0.2.19 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -c .pin/../constraints/hpu.txt + # -r benchmarks/rlhf/requirements.in +xxhash==3.5.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # datasets +yarl==1.13.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # aiohttp diff --git a/benchmarks/timm/requirements.hpu.txt b/benchmarks/timm/requirements.hpu.txt index 432c91bc..e626bd1f 100644 --- a/benchmarks/timm/requirements.hpu.txt +++ b/benchmarks/timm/requirements.hpu.txt @@ -4,10 +4,6 @@ # # pip-compile --output-file=benchmarks/timm/requirements.hpu.txt .pin/tmp-constraints-hpu-timm.txt benchmarks/timm/requirements.in # ---extra-index-url https://pypi.ngc.nvidia.com ---find-links https://storage.googleapis.com/jax-releases/jax_cuda_releases.html ---trusted-host pypi.ngc.nvidia.com - antlr4-python3-runtime==4.9.3 # via # -c .pin/../.pin/constraints-hpu-torch.txt @@ -16,7 +12,7 @@ asttokens==2.4.1 # via # -c .pin/../.pin/constraints-hpu-torch.txt # giving -certifi==2024.6.2 +certifi==2024.8.30 # via # -c .pin/../.pin/constraints-hpu-torch.txt # requests @@ -24,35 +20,35 @@ charset-normalizer==3.3.2 # via # -c .pin/../.pin/constraints-hpu-torch.txt # requests -codefind==0.1.6 +codefind==0.1.7 # via # -c .pin/../.pin/constraints-hpu-torch.txt # ptera -executing==1.2.0 +executing==2.1.0 # via # -c .pin/../.pin/constraints-hpu-torch.txt # varname -filelock==3.15.4 +filelock==3.16.1 # via # -c .pin/../.pin/constraints-hpu-torch.txt # huggingface-hub # torch # triton -fsspec==2024.5.0 +fsspec==2024.6.1 # via # -c .pin/../.pin/constraints-hpu-torch.txt # huggingface-hub # torch -giving==0.4.2 +giving==0.4.3 # via # -c .pin/../.pin/constraints-hpu-torch.txt # ptera # voir -huggingface-hub==0.24.0 +huggingface-hub==0.25.1 # via # -c .pin/../.pin/constraints-hpu-torch.txt # -r benchmarks/timm/requirements.in -idna==3.7 +idna==3.10 # via # -c .pin/../.pin/constraints-hpu-torch.txt # requests @@ -102,7 +98,7 @@ nvidia-cuda-runtime-cu12==12.1.105 # via # -c .pin/../.pin/constraints-hpu-torch.txt # torch -nvidia-cudnn-cu12==8.9.2.26 +nvidia-cudnn-cu12==9.1.0.70 # via # -c .pin/../.pin/constraints-hpu-torch.txt # torch @@ -123,11 +119,15 @@ nvidia-cusparse-cu12==12.1.0.106 # -c .pin/../.pin/constraints-hpu-torch.txt # nvidia-cusolver-cu12 # torch +nvidia-ml-py==12.560.30 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir nvidia-nccl-cu12==2.20.5 # via # -c .pin/../.pin/constraints-hpu-torch.txt # torch 
-nvidia-nvjitlink-cu12==12.5.82 +nvidia-nvjitlink-cu12==12.6.77 # via # -c .pin/../.pin/constraints-hpu-torch.txt # nvidia-cusolver-cu12 @@ -140,7 +140,7 @@ omegaconf==2.3.0 # via # -c .pin/../.pin/constraints-hpu-torch.txt # voir -ovld==0.3.5 +ovld==0.3.9 # via # -c .pin/../.pin/constraints-hpu-torch.txt # voir @@ -164,11 +164,7 @@ pygments==2.18.0 # via # -c .pin/../.pin/constraints-hpu-torch.txt # rich -pynvml==11.5.3 - # via - # -c .pin/../.pin/constraints-hpu-torch.txt - # voir -pyyaml==6.0.1 +pyyaml==6.0.2 # via # -c .pin/../.pin/constraints-hpu-torch.txt # -r benchmarks/timm/requirements.in @@ -182,11 +178,11 @@ requests==2.32.3 # via # -c .pin/../.pin/constraints-hpu-torch.txt # huggingface-hub -rich==13.7.1 +rich==13.9.1 # via # -c .pin/../.pin/constraints-hpu-torch.txt # voir -safetensors==0.4.3 +safetensors==0.4.5 # via # -c .pin/../.pin/constraints-hpu-torch.txt # -r benchmarks/timm/requirements.in @@ -194,24 +190,24 @@ six==1.16.0 # via # -c .pin/../.pin/constraints-hpu-torch.txt # asttokens -sympy==1.13.0 +sympy==1.13.3 # via # -c .pin/../.pin/constraints-hpu-torch.txt # torch -torch==2.3.1 +torch==2.4.1 # via # -c .pin/../.pin/constraints-hpu-torch.txt # -r benchmarks/timm/requirements.in # torchvision -torchvision==0.18.1 +torchvision==0.19.1 # via # -c .pin/../.pin/constraints-hpu-torch.txt # -r benchmarks/timm/requirements.in -tqdm==4.66.4 +tqdm==4.66.5 # via # -c .pin/../.pin/constraints-hpu-torch.txt # huggingface-hub -triton==2.3.1 +triton==3.0.0 # via # -c .pin/../.pin/constraints-hpu-torch.txt # torch @@ -220,12 +216,13 @@ typing-extensions==4.12.2 # -c .pin/../.pin/constraints-hpu-torch.txt # huggingface-hub # reactivex + # rich # torch -urllib3==1.26.19 +urllib3==2.2.3 # via # -c .pin/../.pin/constraints-hpu-torch.txt # requests -varname==0.10.0 +varname==0.13.3 # via # -c .pin/../.pin/constraints-hpu-torch.txt # giving diff --git a/benchmarks/torchatari/requirements.hpu.txt b/benchmarks/torchatari/requirements.hpu.txt new file mode 100644 index 00000000..6d7369df --- /dev/null +++ b/benchmarks/torchatari/requirements.hpu.txt @@ -0,0 +1,304 @@ +# +# This file is autogenerated by pip-compile with Python 3.10 +# by the following command: +# +# pip-compile --output-file=benchmarks/torchatari/requirements.hpu.txt .pin/tmp-constraints-hpu-torchatari.txt benchmarks/torchatari/requirements.in +# +absl-py==2.1.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # dm-env + # tensorboard +antlr4-python3-runtime==4.9.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # omegaconf +appdirs==1.4.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # cantilever +asttokens==2.4.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # giving +cantilever==0.1.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/torchatari/requirements.in +cloudpickle==3.0.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # gym + # gymnasium +codefind==0.1.7 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # ptera +dm-env==1.6 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # envpool +dm-tree==0.1.8 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # dm-env +docstring-parser==0.16 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # tyro +envpool==0.8.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/torchatari/requirements.in +executing==2.1.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # varname +farama-notifications==0.0.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # 
gymnasium +filelock==3.16.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch + # triton +fsspec==2024.6.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +giving==0.4.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # ptera + # voir +grpcio==1.66.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # tensorboard +gym==0.26.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/torchatari/requirements.in + # envpool +gym-notices==0.0.8 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # gym +gymnasium==0.29.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # envpool +importlib-resources==6.4.5 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # cantilever + # torchcompat +jinja2==3.1.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +markdown==3.7 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # tensorboard +markdown-it-py==3.0.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # rich +markupsafe==2.1.5 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # jinja2 + # werkzeug +mdurl==0.1.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # markdown-it-py +mpmath==1.3.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # sympy +networkx==3.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +numpy==1.26.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/torchatari/requirements.in + # dm-env + # envpool + # gym + # gymnasium + # tensorboard +nvidia-cublas-cu12==12.1.3.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # nvidia-cudnn-cu12 + # nvidia-cusolver-cu12 + # torch +nvidia-cuda-cupti-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cuda-nvrtc-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cuda-runtime-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cudnn-cu12==9.1.0.70 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cufft-cu12==11.0.2.54 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-curand-cu12==10.3.2.106 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cusolver-cu12==11.4.5.107 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cusparse-cu12==12.1.0.106 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # nvidia-cusolver-cu12 + # torch +nvidia-ml-py==12.560.30 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +nvidia-nccl-cu12==2.20.5 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-nvjitlink-cu12==12.6.77 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # nvidia-cusolver-cu12 + # nvidia-cusparse-cu12 +nvidia-nvtx-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +omegaconf==2.3.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +optree==0.13.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # envpool +ovld==0.3.9 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +packaging==24.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # envpool + # tensorboard +protobuf==5.28.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # tensorboard +psutil==5.9.8 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +ptera==1.4.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +pygments==2.18.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # rich +pyyaml==6.0.2 
+ # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # omegaconf +reactivex==4.0.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # giving +rich==13.9.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # tyro + # voir +shtab==1.7.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # tyro +six==1.16.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # asttokens + # tensorboard +sympy==1.13.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +tensorboard==2.18.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/torchatari/requirements.in +tensorboard-data-server==0.7.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # tensorboard +torch==2.4.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/torchatari/requirements.in +torchcompat==1.1.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -c .pin/../constraints/hpu.txt + # -r benchmarks/torchatari/requirements.in +triton==3.0.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +types-protobuf==5.28.0.20240924 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # envpool +typing-extensions==4.12.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # envpool + # gymnasium + # optree + # reactivex + # rich + # torch + # tyro +tyro==0.8.11 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/torchatari/requirements.in +varname==0.13.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # giving +voir==0.2.19 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -c .pin/../constraints/hpu.txt + # -r benchmarks/torchatari/requirements.in +werkzeug==3.0.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # tensorboard + +# The following packages are considered to be unsafe in a requirements file: +# setuptools diff --git a/benchmarks/torchvision/requirements.hpu.txt b/benchmarks/torchvision/requirements.hpu.txt index 369a1753..f0b47e91 100644 --- a/benchmarks/torchvision/requirements.hpu.txt +++ b/benchmarks/torchvision/requirements.hpu.txt @@ -4,10 +4,6 @@ # # pip-compile --output-file=benchmarks/torchvision/requirements.hpu.txt .pin/tmp-constraints-hpu-torchvision.txt benchmarks/torchvision/requirements.in # ---extra-index-url https://pypi.ngc.nvidia.com ---find-links https://storage.googleapis.com/jax-releases/jax_cuda_releases.html ---trusted-host pypi.ngc.nvidia.com - antlr4-python3-runtime==4.9.3 # via # -c .pin/../.pin/constraints-hpu-torch.txt @@ -16,29 +12,29 @@ asttokens==2.4.1 # via # -c .pin/../.pin/constraints-hpu-torch.txt # giving -codefind==0.1.6 +codefind==0.1.7 # via # -c .pin/../.pin/constraints-hpu-torch.txt # ptera -executing==1.2.0 +executing==2.1.0 # via # -c .pin/../.pin/constraints-hpu-torch.txt # varname -filelock==3.15.4 +filelock==3.16.1 # via # -c .pin/../.pin/constraints-hpu-torch.txt # torch # triton -fsspec==2024.5.0 +fsspec==2024.6.1 # via # -c .pin/../.pin/constraints-hpu-torch.txt # torch -giving==0.4.2 +giving==0.4.3 # via # -c .pin/../.pin/constraints-hpu-torch.txt # ptera # voir -importlib-resources==6.4.0 +importlib-resources==6.4.5 # via # -c .pin/../.pin/constraints-hpu-torch.txt # torchcompat @@ -88,7 +84,7 @@ nvidia-cuda-runtime-cu12==12.1.105 # via # -c .pin/../.pin/constraints-hpu-torch.txt # torch -nvidia-cudnn-cu12==8.9.2.26 +nvidia-cudnn-cu12==9.1.0.70 # via # -c .pin/../.pin/constraints-hpu-torch.txt # torch @@ -109,11 +105,15 @@ nvidia-cusparse-cu12==12.1.0.106 # -c .pin/../.pin/constraints-hpu-torch.txt # nvidia-cusolver-cu12 # torch 
+nvidia-ml-py==12.560.30 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir nvidia-nccl-cu12==2.20.5 # via # -c .pin/../.pin/constraints-hpu-torch.txt # torch -nvidia-nvjitlink-cu12==12.5.82 +nvidia-nvjitlink-cu12==12.6.77 # via # -c .pin/../.pin/constraints-hpu-torch.txt # nvidia-cusolver-cu12 @@ -126,7 +126,7 @@ omegaconf==2.3.0 # via # -c .pin/../.pin/constraints-hpu-torch.txt # voir -ovld==0.3.5 +ovld==0.3.9 # via # -c .pin/../.pin/constraints-hpu-torch.txt # voir @@ -146,11 +146,7 @@ pygments==2.18.0 # via # -c .pin/../.pin/constraints-hpu-torch.txt # rich -pynvml==11.5.3 - # via - # -c .pin/../.pin/constraints-hpu-torch.txt - # voir -pyyaml==6.0.1 +pyyaml==6.0.2 # via # -c .pin/../.pin/constraints-hpu-torch.txt # omegaconf @@ -158,7 +154,7 @@ reactivex==4.0.4 # via # -c .pin/../.pin/constraints-hpu-torch.txt # giving -rich==13.7.1 +rich==13.9.1 # via # -c .pin/../.pin/constraints-hpu-torch.txt # voir @@ -166,11 +162,11 @@ six==1.16.0 # via # -c .pin/../.pin/constraints-hpu-torch.txt # asttokens -sympy==1.13.0 +sympy==1.13.3 # via # -c .pin/../.pin/constraints-hpu-torch.txt # torch -torch==2.3.1 +torch==2.4.1 # via # -c .pin/../.pin/constraints-hpu-torch.txt # -r benchmarks/torchvision/requirements.in @@ -180,15 +176,15 @@ torchcompat==1.1.4 # -c .pin/../.pin/constraints-hpu-torch.txt # -c .pin/../constraints/hpu.txt # -r benchmarks/torchvision/requirements.in -torchvision==0.18.1 +torchvision==0.19.1 # via # -c .pin/../.pin/constraints-hpu-torch.txt # -r benchmarks/torchvision/requirements.in -tqdm==4.66.4 +tqdm==4.66.5 # via # -c .pin/../.pin/constraints-hpu-torch.txt # -r benchmarks/torchvision/requirements.in -triton==2.3.1 +triton==3.0.0 # via # -c .pin/../.pin/constraints-hpu-torch.txt # torch @@ -196,8 +192,9 @@ typing-extensions==4.12.2 # via # -c .pin/../.pin/constraints-hpu-torch.txt # reactivex + # rich # torch -varname==0.10.0 +varname==0.13.3 # via # -c .pin/../.pin/constraints-hpu-torch.txt # giving diff --git a/benchmarks/torchvision_ddp/requirements.hpu.txt b/benchmarks/torchvision_ddp/requirements.hpu.txt index e69de29b..a4174e7b 100644 --- a/benchmarks/torchvision_ddp/requirements.hpu.txt +++ b/benchmarks/torchvision_ddp/requirements.hpu.txt @@ -0,0 +1,205 @@ +# +# This file is autogenerated by pip-compile with Python 3.10 +# by the following command: +# +# pip-compile --output-file=benchmarks/torchvision_ddp/requirements.hpu.txt .pin/tmp-constraints-hpu-torchvision.txt benchmarks/torchvision_ddp/requirements.in +# +antlr4-python3-runtime==4.9.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # omegaconf +asttokens==2.4.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # giving +codefind==0.1.7 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # ptera +executing==2.1.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # varname +filelock==3.16.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch + # triton +fsspec==2024.6.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +giving==0.4.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # ptera + # voir +importlib-resources==6.4.5 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torchcompat +jinja2==3.1.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +markdown-it-py==3.0.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # rich +markupsafe==2.1.5 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # jinja2 +mdurl==0.1.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # markdown-it-py +mpmath==1.3.0 + 
# via + # -c .pin/../.pin/constraints-hpu-torch.txt + # sympy +networkx==3.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +numpy==1.26.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torchvision +nvidia-cublas-cu12==12.1.3.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # nvidia-cudnn-cu12 + # nvidia-cusolver-cu12 + # torch +nvidia-cuda-cupti-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cuda-nvrtc-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cuda-runtime-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cudnn-cu12==9.1.0.70 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cufft-cu12==11.0.2.54 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-curand-cu12==10.3.2.106 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cusolver-cu12==11.4.5.107 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cusparse-cu12==12.1.0.106 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # nvidia-cusolver-cu12 + # torch +nvidia-ml-py==12.560.30 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +nvidia-nccl-cu12==2.20.5 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-nvjitlink-cu12==12.6.77 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # nvidia-cusolver-cu12 + # nvidia-cusparse-cu12 +nvidia-nvtx-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +omegaconf==2.3.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +ovld==0.3.9 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +pillow==10.4.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torchvision +psutil==5.9.8 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +ptera==1.4.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +pygments==2.18.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # rich +pyyaml==6.0.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # omegaconf +reactivex==4.0.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # giving +rich==13.9.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +six==1.16.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # asttokens +sympy==1.13.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +torch==2.4.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/torchvision_ddp/requirements.in + # torchvision +torchcompat==1.1.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -c .pin/../constraints/hpu.txt + # -r benchmarks/torchvision_ddp/requirements.in +torchvision==0.19.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/torchvision_ddp/requirements.in +tqdm==4.66.5 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/torchvision_ddp/requirements.in +triton==3.0.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +typing-extensions==4.12.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # reactivex + # rich + # torch +varname==0.13.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # giving +voir==0.2.19 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -c .pin/../constraints/hpu.txt + # -r benchmarks/torchvision_ddp/requirements.in diff --git a/benchmarks/vjepa/benchfile.py b/benchmarks/vjepa/benchfile.py index d25b47b5..228023ce 100644 --- 
a/benchmarks/vjepa/benchfile.py +++ b/benchmarks/vjepa/benchfile.py @@ -23,7 +23,9 @@ class Vjepa(Package): def make_env(self): # Return a dict of environment variables for prepare_script and # main_script. - return super().make_env() + env = super().make_env() + env["PT_HPU_LAZY_MODE"] = "0" + return env async def install(self): vjepa = self.dirs.code / "jepa" diff --git a/benchmarks/vjepa/main.py b/benchmarks/vjepa/main.py index 18377b92..55981859 100644 --- a/benchmarks/vjepa/main.py +++ b/benchmarks/vjepa/main.py @@ -475,14 +475,19 @@ def reg_fn(z): scaler.unscale_(optimizer) else: loss.backward() + if (epoch > warmup) and (clip_grad is not None): _enc_norm = torch.nn.utils.clip_grad_norm_(encoder.parameters(), clip_grad) _pred_norm = torch.nn.utils.clip_grad_norm_(predictor.parameters(), clip_grad) + + acc.mark_step() if mixed_precision: scaler.step(optimizer) scaler.update() else: optimizer.step() + acc.mark_step() + grad_stats = grad_logger(encoder.named_parameters()) grad_stats.global_norm = float(_enc_norm) grad_stats_pred = grad_logger(predictor.named_parameters()) @@ -506,7 +511,8 @@ def reg_fn(z): grad_stats_pred, optim_stats, ) - (loss, loss_jepa, loss_reg, _new_lr, _new_wd, grad_stats, grad_stats_pred, optim_stats,), gpu_etime_ms = gpu_timer(train_step) + loss, loss_jepa, loss_reg, _new_lr, _new_wd, grad_stats, grad_stats_pred, optim_stats = train_step() + iter_elapsed_time_ms = (time.time() - itr_start_time) * 1000. loss_meter.update(loss) input_var = float(AllReduce.apply(clips.view(clips.shape[0], -1).var(dim=1).mean(dim=0))) @@ -515,7 +521,7 @@ def reg_fn(z): input_var_min_meter.update(input_var_min) jepa_loss_meter.update(loss_jepa) reg_loss_meter.update(loss_reg) - gpu_time_meter.update(gpu_etime_ms) + # gpu_time_meter.update(gpu_etime_ms) wall_time_meter.update(iter_elapsed_time_ms) observer.record_loss(loss) @@ -530,7 +536,6 @@ def log_stats(): loss_reg, grad_stats.global_norm, grad_stats_pred.global_norm, - gpu_etime_ms, iter_elapsed_time_ms) if (itr % log_freq == 0) or np.isnan(loss) or np.isinf(loss): logger.info( @@ -637,7 +642,11 @@ def main(): params["nodes"] = nnodes params["tasks_per_node"] = gpu_per_nodes + print("HERE", os.getenv("RANK", -1) ) if os.getenv("RANK", -1) != -1: + print("INIT PROCESS GROUP HERE") + print(acc) + print(acc.init_process_group) acc.init_process_group() try: diff --git a/benchmarks/vjepa/requirements.hpu.txt b/benchmarks/vjepa/requirements.hpu.txt new file mode 100644 index 00000000..b1c986ec --- /dev/null +++ b/benchmarks/vjepa/requirements.hpu.txt @@ -0,0 +1,297 @@ +# +# This file is autogenerated by pip-compile with Python 3.10 +# by the following command: +# +# pip-compile --output-file=benchmarks/vjepa/requirements.hpu.txt .pin/tmp-constraints-hpu-vjepa-gpus.txt benchmarks/vjepa/requirements.in +# +antlr4-python3-runtime==4.9.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # omegaconf +asttokens==2.4.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # giving +beartype==0.19.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/vjepa/requirements.in +braceexpand==0.1.7 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/vjepa/requirements.in + # webdataset +certifi==2024.8.30 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # requests +charset-normalizer==3.3.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # requests +cloudpickle==3.0.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # submitit +codefind==0.1.7 + # via + # -c 
.pin/../.pin/constraints-hpu-torch.txt + # ptera +decord==0.6.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/vjepa/requirements.in +einops==0.8.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/vjepa/requirements.in +executing==2.1.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # varname +filelock==3.16.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # huggingface-hub + # torch + # triton +fsspec==2024.6.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # huggingface-hub + # torch +giving==0.4.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # ptera + # voir +huggingface-hub==0.25.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # timm +idna==3.10 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # requests +jinja2==3.1.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +markdown-it-py==3.0.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # rich +markupsafe==2.1.5 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # jinja2 +mdurl==0.1.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # markdown-it-py +mpmath==1.3.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # sympy +networkx==3.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +numpy==1.26.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/vjepa/requirements.in + # decord + # opencv-python + # pandas + # torchvision + # webdataset +nvidia-cublas-cu12==12.1.3.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # nvidia-cudnn-cu12 + # nvidia-cusolver-cu12 + # torch +nvidia-cuda-cupti-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cuda-nvrtc-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cuda-runtime-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cudnn-cu12==9.1.0.70 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cufft-cu12==11.0.2.54 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-curand-cu12==10.3.2.106 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cusolver-cu12==11.4.5.107 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-cusparse-cu12==12.1.0.106 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # nvidia-cusolver-cu12 + # torch +nvidia-ml-py==12.560.30 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +nvidia-nccl-cu12==2.20.5 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +nvidia-nvjitlink-cu12==12.6.77 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # nvidia-cusolver-cu12 + # nvidia-cusparse-cu12 +nvidia-nvtx-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +omegaconf==2.3.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +opencv-python==4.10.0.84 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/vjepa/requirements.in +ovld==0.3.9 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +packaging==24.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # huggingface-hub +pandas==2.2.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/vjepa/requirements.in +pillow==10.4.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torchvision +psutil==5.9.8 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +ptera==1.4.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # 
voir +pygments==2.18.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # rich +python-dateutil==2.9.0.post0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # pandas +pytz==2024.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # pandas +pyyaml==6.0.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/vjepa/requirements.in + # huggingface-hub + # omegaconf + # timm + # webdataset +reactivex==4.0.4 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # giving +requests==2.32.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # huggingface-hub +rich==13.9.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # voir +safetensors==0.4.5 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # timm +six==1.16.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # asttokens + # python-dateutil +submitit==1.5.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/vjepa/requirements.in +sympy==1.13.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +timm==1.0.9 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/vjepa/requirements.in +torch==2.4.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/vjepa/requirements.in + # timm + # torchvision +torchvision==0.19.1 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/vjepa/requirements.in + # timm +tqdm==4.66.5 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # huggingface-hub +triton==3.0.0 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # torch +typing-extensions==4.12.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # huggingface-hub + # reactivex + # rich + # submitit + # torch +tzdata==2024.2 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # pandas +urllib3==2.2.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # requests +varname==0.13.3 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # giving +voir==0.2.19 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -c .pin/../constraints/hpu.txt + # -r benchmarks/vjepa/requirements.in +webdataset==0.2.100 + # via + # -c .pin/../.pin/constraints-hpu-torch.txt + # -r benchmarks/vjepa/requirements.in diff --git a/config/base.yaml b/config/base.yaml index d7926799..1a64d550 100644 --- a/config/base.yaml +++ b/config/base.yaml @@ -348,7 +348,7 @@ reformer: - monogpu argv: --model: "Reformer" - --batch-size: 64 + --batch-size: 32 whisper: inherits: _hf @@ -541,7 +541,7 @@ _llm: tags: - nlp - llm - max_duration: 1200 + max_duration: 3600 num_machines: 1 inherits: _defaults definition: ../benchmarks/llm @@ -566,6 +566,7 @@ llm-lora-single: repo_id="meta-llama/Meta-Llama-3.1-8B": true batch_size=8: true gradient_accumulation_steps=8: true + device={device_name}: true llm-lora-ddp-gpus: @@ -587,7 +588,7 @@ llm-lora-ddp-gpus: repo_id="meta-llama/Meta-Llama-3.1-8B": true batch_size=8: true gradient_accumulation_steps=8: true - + device={device_name}: true llm-lora-ddp-nodes: tags: @@ -610,7 +611,7 @@ llm-lora-ddp-nodes: repo_id="meta-llama/Meta-Llama-3.1-8B": true batch_size=8: true gradient_accumulation_steps=8: true - + device={device_name}: true num_machines: 2 requires_capabilities: - "len(nodes) >= ${num_machines}" @@ -636,8 +637,12 @@ llm-lora-mp-gpus: repo_id="meta-llama/Meta-Llama-3.1-70B": true batch_size=8: true gradient_accumulation_steps=1: true - + device={device_name}: true + llm-full-mp-gpus: + voir: + options: + stop: 30 inherits: _llm tags: - multigpu @@ -658,7 +663,8 @@ 
llm-full-mp-gpus:
     safetensors=true: true
     batch_size=2: true
     gradient_accumulation_steps=1: true
-
+    device={device_name}: true
+
 llm-full-mp-nodes:
   tags:
     - multinode
@@ -681,7 +687,8 @@ llm-full-mp-nodes:
     safetensors=true: true
     batch_size=2: true
     gradient_accumulation_steps=1: true
-
+    device={device_name}: true
+
   num_machines: 2
   requires_capabilities:
     - "len(nodes) >= ${num_machines}"
@@ -781,6 +788,7 @@ torchatari:
     --env-id: Breakout-v5
 
 _llava:
+  max_duration: 3600
   inherits: _defaults
   definition: ../benchmarks/llava
   install_group: torch
diff --git a/constraints/extra/torch.hpu.txt b/constraints/extra/torch.hpu.txt
index 1d21c177..e69de29b 100644
--- a/constraints/extra/torch.hpu.txt
+++ b/constraints/extra/torch.hpu.txt
@@ -1,5 +0,0 @@
-
-#
-#
-voir >= 0.2.15
-torchcompat >= 1.0.0
diff --git a/constraints/hpu.txt b/constraints/hpu.txt
index 23a110bd..9f6fe957 100644
--- a/constraints/hpu.txt
+++ b/constraints/hpu.txt
@@ -1,8 +1,16 @@
-# FIXME
-# Add
-
 #
 #
 voir >= 0.2.19
 torchcompat >= 1.0.0
-gymnax >= 0.0.8
\ No newline at end of file
+gymnax >= 0.0.8
+trl<0.11.0
+
+# latest torchtune is slower than before and causes failures;
+# the next version of pytorch seems to work better,
+# so pending a new release of pytorch this is what we get
+torchtune<0.3.0
+
+# transformers added torchao support recently,
+# but only in its most recent version, which we do not support yet
+transformers<4.45.0
+torchvision
\ No newline at end of file
diff --git a/docker/Dockerfile-hpu b/docker/Dockerfile-hpu
new file mode 100644
index 00000000..932959cd
--- /dev/null
+++ b/docker/Dockerfile-hpu
@@ -0,0 +1,42 @@
+# FROM artifactory-kfs.habana-labs.com/docker-local/1.17.0/ubuntu22.04/habanalabs/pytorch-installer-2.3.1:1.17.0-462
+
+FROM vault.habana.ai/gaudi-docker/1.17.0/ubuntu22.04/habanalabs/pytorch-installer-2.3.1:latest
+
+ENV MILABENCH_GPU_ARCH=hpu
+
+WORKDIR /workspace
+
+ENV MILABENCH_CONFIG="/workspace/milabench/config/standard.yaml"
+
+ENV MILABENCH_WORDIR="/workspace/${MILABENCH_GPU_ARCH}"
+ENV MILABENCH_BASE="${MILABENCH_WORDIR}/results"
+ENV MILABENCH_VENV="${MILABENCH_WORDIR}/env"
+ENV BENCHMARK_VENV="${MILABENCH_WORDIR}/results/venv/torch"
+
+ARG BENCH=lightning
+
+RUN mkdir -p ${MILABENCH_WORDIR}
+RUN pip install virtualenv
+RUN virtualenv --system-site-packages $MILABENCH_VENV
+
+ARG CACHEBUST=1
+RUN echo "$CACHEBUST"
+RUN git clone https://github.com/mila-iqia/milabench.git -b $MILABENCH_GPU_ARCH
+RUN $MILABENCH_VENV/bin/pip install -e milabench
+
+RUN . $MILABENCH_VENV/bin/activate && milabench install --use-current-env --select "${BENCH}"
+
+RUN $MILABENCH_VENV/bin/pip uninstall torch torchvision torchaudio -y
+RUN sed -i 's/pic.numpy(force=True)/pic.numpy()/' /usr/local/lib/python3.10/dist-packages/torchvision/transforms/functional.py
+
+# This does not work
+# RUN . $MILABENCH_VENV/bin/activate && milabench prepare --use-current-env --select "${BENCH}"
+
+
+
+# RUN . $MILABENCH_VENV/bin/activate && milabench run --use-current-env --select $BENCH
+# RUN huggingface-cli login --token $MILABENCH_HF_TOKEN
+
+# docker build --build-arg CACHEBUST=`git rev-parse hpu` -f Dockerfile-hpu -t dockerfile-hpu .
+# docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --shm-size 50G --cap-add=sys_nice --net=host dockerfile-hpu:latest bash
+# . $MILABENCH_VENV/bin/activate && milabench prepare --use-current-env --select lightning && milabench run --use-current-env --select lightning
diff --git a/docker/Makefile b/docker/Makefile
new file mode 100644
index 00000000..93a40270
--- /dev/null
+++ b/docker/Makefile
@@ -0,0 +1,17 @@
+
+
+
+bench = rlhf-gpus
+# bench = "lightning"
+lazy = 0
+
+hpu:
+	git add --all
+	git commit -m "-" || true
+	git push origin hpu
+	docker rmi -f $$(docker images --filter "dangling=true" -q --no-trunc) || true
+	# docker system prune -a -f
+	# docker image prune -a -f
+	docker build --build-arg BENCH=$(bench) --build-arg CACHEBUST=`git rev-parse hpu` -f Dockerfile-hpu -t dockerfile-hpu .
+	docker run --rm -it --runtime=habana -e PT_HPU_LAZY_MODE=$(lazy) -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --shm-size 50G --cap-add=sys_nice --net=host dockerfile-hpu:latest bash -c '. $$MILABENCH_VENV/bin/activate && milabench install --use-current-env --select $(bench) && pip uninstall torch torchvision torchaudio -y && milabench prepare --use-current-env --select $(bench) && milabench run --use-current-env $(args) --select $(bench)'
+
diff --git a/milabench/_version.py b/milabench/_version.py
index e2795a03..a5982314 100644
--- a/milabench/_version.py
+++ b/milabench/_version.py
@@ -1,5 +1,5 @@
 """This file is generated, do not modify"""
 
-__tag__ = "v1.0.0_RC1-12-g3b87cb4"
-__commit__ = "3b87cb465e855be452953273c314ab01024e0925"
-__date__ = "2024-10-09 12:04:43 -0400"
+__tag__ = "v1.0.0_RC1-50-gd2c8ba2"
+__commit__ = "d2c8ba2c67e19026293381bdbddeb3f30ba0ee64"
+__date__ = "2024-11-11 18:51:45 +0000"
diff --git a/milabench/remote.py b/milabench/remote.py
index 7e1eef85..cbe9696b 100644
--- a/milabench/remote.py
+++ b/milabench/remote.py
@@ -100,7 +100,7 @@ def worker_commands(pack, worker_plan, setup_for="worker"):
 def sshnode(node, cmd):
     host = node["ip"]
     user = node["user"]
-    port = node["sshport"]
+    port = node.get("sshport", 22)
 
     return SSHCommand(cmd, user=user, host=host, port=port)
 
diff --git a/milabench/system.py b/milabench/system.py
index 3a50d143..2d5a6ca8 100644
--- a/milabench/system.py
+++ b/milabench/system.py
@@ -328,7 +328,7 @@ def _fix_weird(hostname):
 
 # If true that means we cannot resolve the ip addresses
 # so we ignore errors
-offline = False
+offline = True
 
 
 @contextmanager
diff --git a/scripts/article/run_hpu.sh b/scripts/article/run_hpu.sh
index 5d875ca1..8f6126d2 100644
--- a/scripts/article/run_hpu.sh
+++ b/scripts/article/run_hpu.sh
@@ -9,68 +9,84 @@ set -ex
 export MILABENCH_GPU_ARCH=hpu
 export MILABENCH_WORDIR="$(pwd)/$MILABENCH_GPU_ARCH"
 export MILABENCH_BASE="$MILABENCH_WORDIR/results"
-export MILABENCH_CONFIG="$MILABENCH_WORDIR/milabench/config/standard.yaml"
 export MILABENCH_VENV="$MILABENCH_WORDIR/env"
 export BENCHMARK_VENV="$MILABENCH_WORDIR/results/venv/torch"
 
+export PT_HPU_LAZY_MODE=0
+
+if [ -z "${MILABENCH_SOURCE}" ]; then
+    export MILABENCH_CONFIG="$MILABENCH_WORDIR/milabench/config/standard.yaml"
+else
+    export MILABENCH_CONFIG="$MILABENCH_SOURCE/config/standard.yaml"
+fi
+
 if [ -z "${MILABENCH_PREPARE}" ]; then
     export MILABENCH_PREPARE=0
 fi
 
+ARGS="$@"
+
 install_prepare() {
     mkdir -p $MILABENCH_WORDIR
     cd $MILABENCH_WORDIR
 
    virtualenv $MILABENCH_WORDIR/env
 
-    git clone https://github.com/mila-iqia/milabench.git
-    git clone https://github.com/huggingface/optimum-habana.git
+    if [ -z "${MILABENCH_SOURCE}" ]; then
+        if [ ! -d "$MILABENCH_WORDIR/milabench" ]; then
+            git clone https://github.com/mila-iqia/milabench.git
+        fi
+        export MILABENCH_SOURCE="$MILABENCH_WORDIR/milabench"
+    fi
+
+    git clone https://github.com/huggingface/optimum-habana.git -b v1.13.2
 
     # wget -nv https://vault.habana.ai/artifactory/gaudi-installer/1.15.1/habanalabs-installer.sh
-    wget -nv https://vault.habana.ai/artifactory/gaudi-installer/1.16.1/habanalabs-installer.sh
+    # wget -nv https://vault.habana.ai/artifactory/gaudi-installer/1.16.1/habanalabs-installer.sh
+    wget -nv https://vault.habana.ai/artifactory/gaudi-installer/1.17.1/habanalabs-installer.sh
     chmod +x habanalabs-installer.sh
 
     . $MILABENCH_WORDIR/env/bin/activate
-    pip install -e $MILABENCH_WORDIR/milabench
-
-
-    #
-    # Install milabench's benchmarks in their venv
-    #
-    milabench install
+    pip install -e $MILABENCH_SOURCE
 
     which pip
 
     # Override dependencies for HPU
     # milabench needs pyhlml
     export HABANALABS_VIRTUAL_DIR=$MILABENCH_VENV
-    ./habanalabs-installer.sh install -t dependencies --venv -y
-    ./habanalabs-installer.sh install -t pytorch --venv -y
+    ./habanalabs-installer.sh install -t dependencies --venv -y || true
+    ./habanalabs-installer.sh install -t pytorch --venv -y || true
+
+    #
+    # Install milabench's benchmarks in their venv
+    #
+    # milabench pin --variant hpu --from-scratch $ARGS
+    milabench install $ARGS
 
     (
         . $BENCHMARK_VENV/bin/activate
         which pip
-        pip install -e $MILABENCH_WORDIR/optimum-habana
-
-        (
-            cd $MILABENCH_WORDIR/milabench/benchmarks/dlrm/dlrm;
-            git remote add me https://github.com/Delaunay/dlrm.git
-            git fetch me
-            git checkout me/main
-        )
+        pip install --no-deps -e $MILABENCH_WORDIR/optimum-habana
 
         # Override dependencies for HPU
         # benchmarks need pytorch
-        pip uninstall torch torchvision torchaudio
+        pip uninstall torch torchvision torchaudio -y
         export HABANALABS_VIRTUAL_DIR=$BENCHMARK_VENV
-        ./habanalabs-installer.sh install -t dependencies --venv -y
-        ./habanalabs-installer.sh install -t pytorch --venv -y
+        ./habanalabs-installer.sh install -t dependencies --venv -y || true
+        ./habanalabs-installer.sh install -t pytorch --venv -y || true
+
+        if [ -z "${MILABENCH_HF_TOKEN}" ]; then
+            echo "Missing token"
+        else
+            huggingface-cli login --token $MILABENCH_HF_TOKEN
+        fi
     )
 
     #
     # Generate/download datasets, download models etc...
     #
-    milabench prepare
+    # sed -i 's/pic.numpy(force=True)/pic.numpy()/' $BENCHMARK_VENV/lib/python3.10/dist-packages/torchvision/transforms/functional.py
+    # sed -i 's/range(hpu.device_count())/range(len(available_modules))/' $BENCHMARK_VENV/lib/site-packages/habana_frameworks/torch/hpu/_utils.py
+    milabench prepare $ARGS
 }
 
 if [ ! -d "$MILABENCH_WORDIR" ]; then
@@ -81,12 +97,28 @@ else
 fi
 
+(
+    . $BENCHMARK_VENV/bin/activate
+    pip install lightning-habana
+    pip install habana-media-loader
+    # git clone https://github.com/Delaunay/torchcompat.git
+    # git clone https://github.com/Delaunay/voir.git -b hpu
+    pip uninstall torchcompat voir -y
+    pip install -e $MILABENCH_WORDIR/torchcompat
+    pip install -e $MILABENCH_WORDIR/voir
+    pip install -e $MILABENCH_WORDIR/optimum-habana
+    # pip install habana_dataloader
+)
+
 if [ "$MILABENCH_PREPARE" -eq 0 ]; then
     cd $MILABENCH_WORDIR
 
+    # python -c "import torch; print(torch.__version__)"
+    milabench prepare $ARGS --system $MILABENCH_WORDIR/system.yaml
+
     #
     # Run the benchmakrs
-    milabench run "$@"
+    milabench run $ARGS --system $MILABENCH_WORDIR/system.yaml
 
     #
     # Display report

From 0b93d59cd198efaf6eacdf774b749a07c9bb1b37 Mon Sep 17 00:00:00 2001
From: Setepenre
Date: Thu, 21 Nov 2024 13:12:35 -0500
Subject: [PATCH 10/20] Rocm (#293)

* ROCm changes

* Update ping

* -

* Cleanup the rocm script

* use rocm branch

* -

* Update run_rocm.sh

---------

Co-authored-by: Your Name
---
 .pin/constraints-rocm-torch.txt                  | 557 +++++++++++---
 benchmarks/brax/benchfile.py                     |   6 +-
 benchmarks/brax/requirements.rocm.txt            |  73 +-
 benchmarks/diffusion/requirements.rocm.txt       |  75 +-
 benchmarks/dinov2/requirements.rocm.txt          |  43 +-
 benchmarks/flops/requirements.rocm.txt           |  33 +-
 benchmarks/geo_gnn/requirements-pre.rocm.txt     |  30 +-
 benchmarks/geo_gnn/requirements.rocm.txt         | 185 +++--
 benchmarks/huggingface/requirements.rocm.txt     |  44 +-
 benchmarks/lightning/requirements.rocm.txt       |  50 +-
 benchmarks/llama/requirements.rocm.txt           |  64 +-
 benchmarks/llava/requirements.rocm.txt           | 293 ++++
 benchmarks/llm/requirements.rocm.txt             | 118 ++-
 benchmarks/purejaxrl/benchfile.py                |   4 +-
 benchmarks/purejaxrl/main.py                     |   1 +
 benchmarks/purejaxrl/requirements.rocm.txt       | 693 ++++++++++++++++++
 benchmarks/recursiongfn/requirements.rocm.txt    | 328 ++++-----
 benchmarks/rlhf/requirements.in                  |   1 +
 benchmarks/rlhf/requirements.rocm.txt            | 313 ++++
 benchmarks/timm/requirements.rocm.txt            |  41 +-
 benchmarks/torchatari/requirements.rocm.txt      |  49 +-
 benchmarks/torchvision/requirements.rocm.txt     |  33 +-
 .../torchvision_ddp/requirements.rocm.txt        |  33 +-
 benchmarks/vjepa/requirements.rocm.txt           | 247 +++++++
 config/base.yaml                                 |   4 +-
 constraints/rocm.txt                             |  15 +-
 milabench/_version.py                            |   6 +-
 scripts/article/run_cuda.sh                      |   4 +-
 scripts/article/run_rocm.sh                      |  76 +-
 29 files changed, 2723 insertions(+), 696 deletions(-)
 create mode 100644 benchmarks/llava/requirements.rocm.txt
 create mode 100644 benchmarks/purejaxrl/requirements.rocm.txt
 create mode 100644 benchmarks/rlhf/requirements.rocm.txt
 create mode 100644 benchmarks/vjepa/requirements.rocm.txt

diff --git a/.pin/constraints-rocm-torch.txt b/.pin/constraints-rocm-torch.txt
index 4fe6ae9d..ecc49d51 100644
--- a/.pin/constraints-rocm-torch.txt
+++ b/.pin/constraints-rocm-torch.txt
@@ -2,31 +2,39 @@
 # This file is autogenerated by pip-compile with Python 3.10
 # by the following command:
 #
-# pip-compile
--output-file=.pin/constraints-rocm-torch.txt .pin/tmp-constraints.txt benchmarks/brax/requirements.in benchmarks/diffusion/requirements.in benchmarks/dinov2/requirements.in benchmarks/flops/requirements.in benchmarks/geo_gnn/requirements-pre.in benchmarks/geo_gnn/requirements.in benchmarks/huggingface/requirements.in benchmarks/lightning/requirements.in benchmarks/llama/requirements.in benchmarks/llava/requirements.in benchmarks/llm/requirements.in benchmarks/purejaxrl/requirements.in benchmarks/recursiongfn/requirements.in benchmarks/rlhf/requirements.in benchmarks/timm/requirements.in benchmarks/torchatari/requirements.in benchmarks/torchvision/requirements.in benchmarks/torchvision_ddp/requirements.in benchmarks/vjepa/requirements.in constraints/extra/torch.rocm.txt # ---extra-index-url https://download.pytorch.org/whl/rocm6.0 +--extra-index-url https://download.pytorch.org/whl/rocm6.1 absl-py==2.1.0 # via # brax # chex + # distrax # dm-env # ml-collections # mujoco # mujoco-mjx # optax # orbax-checkpoint + # rlax # tensorboard -accelerate==0.33.0 + # tensorflow-probability +accelerate==0.34.2 # via # -r benchmarks/diffusion/requirements.in + # -r benchmarks/llava/requirements.in + # -r benchmarks/llm/requirements.in + # -r benchmarks/rlhf/requirements.in # diffusers -aiohappyeyeballs==2.4.0 + # trl +aiohappyeyeballs==2.4.3 # via aiohttp -aiohttp==3.10.5 +aiohttp==3.10.8 # via # datasets # fsspec + # torch-geometric aiosignal==1.3.1 # via aiohttp antlr4-python3-runtime==4.9.3 @@ -37,72 +45,137 @@ argklass==1.4.4 # via # -r benchmarks/diffusion/requirements.in # -r benchmarks/llm/requirements.in + # -r benchmarks/purejaxrl/requirements.in +astroid==3.3.4 + # via pylint asttokens==2.4.1 # via giving async-timeout==4.0.3 # via aiohttp attrs==24.2.0 # via aiohttp +beartype==0.19.0 + # via -r benchmarks/vjepa/requirements.in +black==24.8.0 + # via navix blinker==1.8.2 # via flask -blobfile==2.1.1 - # via torchtune +blobfile==3.0.0 + # via + # -r benchmarks/llm/requirements.txt + # torchtune +blosc2==2.7.1 + # via tables +botorch==0.12.0 + # via -r benchmarks/recursiongfn/requirements.in +braceexpand==0.1.7 + # via + # -r benchmarks/vjepa/requirements.in + # webdataset brax==0.10.5 - # via -r benchmarks/brax/requirements.in + # via + # -r benchmarks/brax/requirements.in + # -r benchmarks/purejaxrl/requirements.in cantilever==0.1.0 # via -r benchmarks/torchatari/requirements.in -certifi==2024.7.4 - # via requests +certifi==2024.8.30 + # via + # requests + # sentry-sdk charset-normalizer==3.3.2 # via requests -chex==0.1.86 - # via optax +chex==0.1.87 + # via + # distrax + # evosax + # flashbax + # gymnax + # optax + # rlax click==8.1.7 - # via flask + # via + # black + # flask + # wandb cloudpickle==3.0.0 # via # gym # gymnasium # submitit -codefind==0.1.6 + # tensorflow-probability +codefind==0.1.7 # via ptera contextlib2==21.6.0 # via ml-collections -datasets==2.21.0 +contourpy==1.3.0 + # via matplotlib +cvxopt==1.3.2 + # via -r benchmarks/recursiongfn/requirements.in +cycler==0.12.1 + # via matplotlib +datasets==3.0.1 # via # -r benchmarks/diffusion/requirements.in # -r benchmarks/llama/requirements.in + # -r benchmarks/llava/requirements.in + # -r benchmarks/rlhf/requirements.in # torchtune -diffusers[torch]==0.30.0 + # trl +decorator==5.1.1 + # via tensorflow-probability +decord==0.6.0 + # via -r benchmarks/vjepa/requirements.in +diffusers[torch]==0.30.3 # via -r benchmarks/diffusion/requirements.in dill==0.3.8 # via # datasets # multiprocess + # pylint +distrax==0.1.5 + # via + # -r 
benchmarks/purejaxrl/requirements.in + # rlax dm-env==1.6 # via # brax # envpool + # rlax dm-tree==0.1.8 - # via dm-env + # via + # dm-env + # tensorflow-probability +docker-pycreds==0.4.0 + # via wandb docstring-parser==0.16 # via tyro +dotmap==1.3.30 + # via evosax +einops==0.8.0 + # via -r benchmarks/vjepa/requirements.in envpool==0.8.4 # via -r benchmarks/torchatari/requirements.in -etils[epath,epy]==1.7.0 +etils[epath,epy]==1.9.4 # via # brax # mujoco # mujoco-mjx # optax # orbax-checkpoint -executing==1.2.0 +evosax==0.1.6 + # via -r benchmarks/purejaxrl/requirements.in +exceptiongroup==1.2.2 + # via pytest +executing==2.1.0 # via varname fairscale==0.4.13 - # via -r benchmarks/llama/requirements.in + # via + # -r benchmarks/llama/requirements.in + # -r benchmarks/llm/requirements.in + # -r benchmarks/llm/requirements.txt farama-notifications==0.0.4 # via gymnasium -filelock==3.15.4 +filelock==3.16.1 # via # blobfile # datasets @@ -111,16 +184,30 @@ filelock==3.15.4 # pytorch-triton-rocm # torch # transformers -fire==0.6.0 - # via -r benchmarks/llama/requirements.in +fire==0.7.0 + # via + # -r benchmarks/llama/requirements.in + # -r benchmarks/llm/requirements.txt +flake8==7.1.1 + # via navix +flashbax==0.1.2 + # via -r benchmarks/purejaxrl/requirements.in flask==3.0.3 # via # brax # flask-cors -flask-cors==4.0.1 - # via brax -flax==0.8.5 +flask-cors==5.0.0 # via brax +flax==0.9.0 + # via + # -r benchmarks/purejaxrl/requirements.in + # brax + # evosax + # flashbax + # gymnax + # navix +fonttools==4.54.1 + # via matplotlib frozenlist==1.4.1 # via # aiohttp @@ -133,92 +220,141 @@ fsspec[http]==2024.6.1 # lightning # pytorch-lightning # torch + # torch-geometric fvcore==0.1.5.post20221221 # via -r benchmarks/dinov2/requirements.in -giving==0.4.2 +gast==0.6.0 + # via tensorflow-probability +gitdb==4.0.11 + # via gitpython +gitpython==3.1.43 + # via + # -r benchmarks/recursiongfn/requirements.in + # wandb +giving==0.4.3 # via # ptera # voir glfw==2.7.0 # via mujoco -grpcio==1.65.5 +gpytorch==1.13 + # via + # -r benchmarks/recursiongfn/requirements.in + # botorch +grpcio==1.66.2 # via # brax # tensorboard -gym==0.23.1 +gym==0.26.2 # via # -r benchmarks/torchatari/requirements.in # brax # envpool + # gymnax gym-notices==0.0.8 # via gym gymnasium==0.29.1 - # via envpool + # via + # envpool + # gymnax +gymnax==0.0.8 + # via + # -c .pin/../constraints/rocm.txt + # -r benchmarks/purejaxrl/requirements.in hjson==3.1.0 # via argklass -huggingface-hub==0.24.6 +huggingface-hub==0.25.1 # via # -r benchmarks/timm/requirements.in # accelerate # datasets # diffusers + # timm # tokenizers # torchtune # transformers humanize==4.10.0 # via orbax-checkpoint -idna==3.7 +idna==3.10 # via # requests # yarl -importlib-metadata==8.4.0 +importlib-metadata==8.5.0 # via diffusers -importlib-resources==6.4.3 +importlib-resources==6.4.5 # via # argklass # cantilever # etils # torchcompat +iniconfig==2.0.0 + # via pytest iopath==0.1.10 # via # -r benchmarks/dinov2/requirements.in # fvcore +isort==5.13.2 + # via pylint itsdangerous==2.2.0 # via flask -jax==0.4.31 +jax==0.4.33 # via # -r benchmarks/brax/requirements.in + # -r benchmarks/purejaxrl/requirements.in # brax # chex + # distrax + # evosax + # flashbax # flax + # gymnax # jaxopt # mujoco-mjx # optax # orbax-checkpoint -jaxlib==0.4.31 + # rlax +jaxlib==0.4.33 # via # brax # chex + # distrax + # evosax + # flashbax + # gymnax # jax # jaxopt # mujoco-mjx # optax # orbax-checkpoint + # rlax jaxopt==0.8.3 # via brax +jaxtyping==0.2.19 + # via + # gpytorch + # 
linear-operator jinja2==3.1.4 # via # brax # flask # torch + # torch-geometric +joblib==1.4.2 + # via scikit-learn +kiwisolver==1.4.7 + # via matplotlib lightning==2.4.0 # via -r benchmarks/lightning/requirements.in -lightning-utilities==0.11.6 +lightning-utilities==0.11.7 # via # lightning # pytorch-lightning # torchmetrics -lxml==4.9.4 +linear-operator==0.5.3 + # via + # botorch + # gpytorch +lxml==5.3.0 # via blobfile markdown==3.7 # via tensorboard @@ -228,169 +364,284 @@ markupsafe==2.1.5 # via # jinja2 # werkzeug +matplotlib==3.9.2 + # via + # evosax + # gymnax + # seaborn +mccabe==0.7.0 + # via + # flake8 + # pylint mdurl==0.1.2 # via markdown-it-py ml-collections==0.1.1 # via brax -ml-dtypes==0.4.0 +ml-dtypes==0.5.0 # via # jax # jaxlib # tensorstore mpmath==1.3.0 - # via sympy -msgpack==1.0.8 # via + # botorch + # gpytorch + # linear-operator + # sympy +msgpack==1.1.0 + # via + # blosc2 # flax # orbax-checkpoint -mujoco==3.2.2 +mujoco==3.2.3 # via # brax # mujoco-mjx -mujoco-mjx==3.2.2 +mujoco-mjx==3.2.3 # via brax -multidict==6.0.5 +multidict==6.1.0 # via # aiohttp # yarl +multipledispatch==1.0.0 + # via botorch multiprocess==0.70.16 # via datasets +mypy-extensions==1.0.0 + # via black +navix==0.7.0 + # via -r benchmarks/purejaxrl/requirements.in +ndindex==1.9.2 + # via blosc2 nest-asyncio==1.6.0 # via orbax-checkpoint networkx==3.3 - # via torch + # via + # -r benchmarks/recursiongfn/requirements.in + # torch +numexpr==2.10.1 + # via + # blosc2 + # tables numpy==1.26.4 # via - # -r benchmarks/super-slomo/requirements.in + # -r benchmarks/geo_gnn/requirements.in + # -r benchmarks/llava/requirements.in + # -r benchmarks/purejaxrl/requirements.in # -r benchmarks/torchatari/requirements.in + # -r benchmarks/vjepa/requirements.in # accelerate + # blosc2 # brax # chex + # contourpy # datasets + # decord # diffusers + # distrax # dm-env # envpool + # evosax # fairscale - # flax + # flashbax # fvcore # gym # gymnasium # jax # jaxlib # jaxopt + # jaxtyping + # matplotlib # ml-dtypes # mujoco + # navix + # numexpr # opencv-python - # opt-einsum # optax # orbax-checkpoint # pandas # pyarrow + # pyro-ppl + # rdkit + # rlax + # scikit-learn # scipy + # seaborn + # tables # tensorboard # tensorboardx + # tensorflow-probability # tensorstore + # torch-geometric # torchmetrics # torchtune # torchvision # transformers # trimesh + # trl + # webdataset # xformers +nvidia-ml-py==12.560.30 + # via voir omegaconf==2.3.0 # via # -r benchmarks/dinov2/requirements.in + # -r benchmarks/recursiongfn/requirements.in # torchtune # voir opencv-python==4.10.0.84 - # via -r benchmarks/super-slomo/requirements.in -opt-einsum==3.3.0 - # via jax + # via -r benchmarks/vjepa/requirements.in +opt-einsum==3.4.0 + # via + # jax + # pyro-ppl optax==0.2.3 # via + # -r benchmarks/purejaxrl/requirements.in # brax # flax -optree==0.12.1 +optree==0.13.0 # via envpool -orbax-checkpoint==0.6.0 +orbax-checkpoint==0.6.4 # via # brax # flax -ovld==0.3.8 +ovld==0.3.9 # via voir packaging==24.1 # via # accelerate + # black # datasets # envpool # huggingface-hub # lightning # lightning-utilities + # matplotlib + # pytest # pytorch-lightning + # setuptools-scm + # tables # tensorboard # tensorboardx # torchmetrics # transformers -pandas==2.2.2 - # via datasets +pandas==2.2.3 + # via + # -r benchmarks/geo_gnn/requirements.in + # -r benchmarks/recursiongfn/requirements.in + # -r benchmarks/vjepa/requirements.in + # datasets + # seaborn +pathspec==0.12.1 + # via black pillow==10.4.0 # via # -r benchmarks/huggingface/requirements.in + # 
-r benchmarks/llava/requirements.in # brax # diffusers # fvcore + # matplotlib + # navix + # rdkit # torchvision +platformdirs==4.3.6 + # via + # black + # pylint + # wandb +pluggy==1.5.0 + # via pytest portalocker==2.10.1 # via iopath -protobuf==5.27.3 +protobuf==5.28.2 # via # orbax-checkpoint # tensorboard # tensorboardx + # wandb psutil==5.9.8 # via # accelerate + # torch-geometric # voir + # wandb ptera==1.4.1 # via voir +py-cpuinfo==9.0.0 + # via + # blosc2 + # tables pyarrow==17.0.0 - # via datasets -pycryptodomex==3.20.0 + # via + # -r benchmarks/recursiongfn/requirements.in + # datasets +pycodestyle==2.12.1 + # via flake8 +pycryptodomex==3.21.0 # via blobfile +pyflakes==3.2.0 + # via flake8 pygments==2.18.0 # via rich -pynvml==11.5.3 - # via voir +pylint==3.3.1 + # via navix pyopengl==3.1.7 # via mujoco +pyparsing==3.1.4 + # via + # matplotlib + # torch-geometric +pyro-api==0.1.2 + # via pyro-ppl +pyro-ppl==1.9.1 + # via + # -r benchmarks/recursiongfn/requirements.in + # botorch +pytest==8.3.3 + # via navix python-dateutil==2.9.0.post0 - # via pandas + # via + # matplotlib + # pandas pytinyrenderer==0.0.14 # via brax pytorch-lightning==2.4.0 # via lightning pytorch-triton-rocm==3.0.0 # via torch -pytz==2024.1 +pytz==2024.2 # via pandas pyyaml==6.0.2 # via # -r benchmarks/llm/requirements.in # -r benchmarks/timm/requirements.in + # -r benchmarks/vjepa/requirements.in # accelerate # datasets + # evosax # flax # fvcore + # gymnax # huggingface-hub # lightning # ml-collections # omegaconf # orbax-checkpoint # pytorch-lightning + # timm # transformers + # wandb + # webdataset # yacs +rdkit==2024.3.5 + # via + # -r benchmarks/geo_gnn/requirements.in + # -r benchmarks/recursiongfn/requirements.in reactivex==4.0.4 # via giving -regex==2024.7.24 +regex==2024.9.11 # via # diffusers # tiktoken @@ -401,90 +652,166 @@ requests==2.32.3 # diffusers # huggingface-hub # tiktoken + # torch-geometric # transformers -rich==13.7.1 + # wandb +rich==13.9.1 # via # flax # tyro # voir -safetensors==0.4.4 +rlax==0.1.6 + # via navix +safetensors==0.4.5 # via # -r benchmarks/timm/requirements.in # accelerate # diffusers + # timm # torchtune # transformers -scipy==1.14.0 +scikit-learn==1.5.2 + # via gpytorch +scipy==1.14.1 # via # -r benchmarks/dinov2/requirements.in + # -r benchmarks/recursiongfn/requirements.in + # botorch # brax + # gpytorch # jax # jaxlib # jaxopt + # linear-operator # mujoco-mjx + # scikit-learn + # torch-cluster + # torch-sparse +seaborn==0.13.2 + # via gymnax sentencepiece==0.2.0 # via # -r benchmarks/llama/requirements.in # torchtune +sentry-sdk==2.15.0 + # via wandb +setproctitle==1.3.3 + # via wandb +setuptools-scm==8.1.0 + # via navix shtab==1.7.1 # via tyro six==1.16.0 # via # asttokens - # fire + # docker-pycreds # ml-collections # python-dateutil # tensorboard -submitit==1.5.1 - # via -r benchmarks/dinov2/requirements.in -sympy==1.13.2 + # tensorflow-probability +smmap==5.0.1 + # via gitdb +submitit==1.5.2 + # via + # -r benchmarks/dinov2/requirements.in + # -r benchmarks/vjepa/requirements.in +sympy==1.13.3 # via torch +tables==3.10.1 + # via -r benchmarks/recursiongfn/requirements.in tabulate==0.9.0 # via fvcore -tensorboard==2.17.1 - # via -r benchmarks/torchatari/requirements.in +tensorboard==2.18.0 + # via + # -r benchmarks/recursiongfn/requirements.in + # -r benchmarks/torchatari/requirements.in tensorboard-data-server==0.7.2 # via tensorboard tensorboardx==2.6.2.2 # via brax -tensorstore==0.1.64 +tensorflow-probability==0.24.0 + # via distrax +tensorstore==0.1.66 # via + 
# flashbax # flax # orbax-checkpoint termcolor==2.4.0 # via # fire # fvcore +threadpoolctl==3.5.0 + # via scikit-learn tiktoken==0.7.0 # via torchtune +timm==1.0.9 + # via -r benchmarks/vjepa/requirements.in tokenizers==0.19.1 # via transformers +tomli==2.0.2 + # via + # black + # pylint + # pytest + # setuptools-scm +tomlkit==0.13.2 + # via pylint toolz==0.12.1 # via chex -torch==2.4.0+rocm6.0 +torch==2.4.1+rocm6.1 # via # -r benchmarks/brax/requirements.in # -r benchmarks/dinov2/requirements.in # -r benchmarks/flops/requirements.in + # -r benchmarks/geo_gnn/requirements-pre.in # -r benchmarks/huggingface/requirements.in # -r benchmarks/lightning/requirements.in # -r benchmarks/llama/requirements.in + # -r benchmarks/llava/requirements.in # -r benchmarks/llm/requirements.in - # -r benchmarks/super-slomo/requirements.in + # -r benchmarks/llm/requirements.txt + # -r benchmarks/purejaxrl/requirements.in + # -r benchmarks/recursiongfn/requirements.in + # -r benchmarks/rlhf/requirements.in # -r benchmarks/timm/requirements.in # -r benchmarks/torchatari/requirements.in # -r benchmarks/torchvision/requirements.in # -r benchmarks/torchvision_ddp/requirements.in + # -r benchmarks/vjepa/requirements.in # accelerate + # botorch # diffusers # fairscale # lightning + # linear-operator + # pyro-ppl # pytorch-lightning + # timm # torchmetrics # torchvision + # trl # xformers +torch-cluster==1.6.3 + # via + # -r benchmarks/geo_gnn/requirements.in + # -r benchmarks/recursiongfn/requirements.in +torch-geometric==2.6.1 + # via + # -r benchmarks/geo_gnn/requirements.in + # -r benchmarks/recursiongfn/requirements.in +torch-scatter==2.1.2 + # via + # -r benchmarks/geo_gnn/requirements.in + # -r benchmarks/recursiongfn/requirements.in +torch-sparse==0.6.18 + # via + # -r benchmarks/geo_gnn/requirements.in + # -r benchmarks/recursiongfn/requirements.in torchao==0.3.1 - # via torchtune + # via + # -c .pin/../constraints/rocm.txt + # -r benchmarks/llm/requirements.in + # torchtune torchcompat==1.1.4 # via # -c .pin/../constraints/rocm.txt @@ -493,28 +820,30 @@ torchcompat==1.1.4 # -r benchmarks/torchatari/requirements.in # -r benchmarks/torchvision/requirements.in # -r benchmarks/torchvision_ddp/requirements.in -torchmetrics==1.4.1 +torchmetrics==1.4.2 # via # -r benchmarks/dinov2/requirements.in # lightning # pytorch-lightning torchtune==0.2.1 - # via -r benchmarks/llm/requirements.in -torchvision==0.19.0+rocm6.0 + # via + # -c .pin/../constraints/rocm.txt + # -r benchmarks/llm/requirements.in +torchvision==0.19.1+rocm6.1 # via # -r benchmarks/diffusion/requirements.in # -r benchmarks/dinov2/requirements.in # -r benchmarks/flops/requirements.in # -r benchmarks/lightning/requirements.in - # -r benchmarks/super-slomo/requirements.in # -r benchmarks/timm/requirements.in # -r benchmarks/torchvision/requirements.in # -r benchmarks/torchvision_ddp/requirements.in + # -r benchmarks/vjepa/requirements.in + # timm tqdm==4.66.5 # via # -r benchmarks/diffusion/requirements.in # -r benchmarks/flops/requirements.in - # -r benchmarks/super-slomo/requirements.in # -r benchmarks/torchvision/requirements.in # -r benchmarks/torchvision_ddp/requirements.in # datasets @@ -522,48 +851,75 @@ tqdm==4.66.5 # huggingface-hub # iopath # lightning + # pyro-ppl # pytorch-lightning + # torch-geometric # torchtune # transformers -transformers==4.44.1 +transformers==4.44.2 # via + # -c .pin/../constraints/rocm.txt # -r benchmarks/diffusion/requirements.in # -r benchmarks/huggingface/requirements.in # -r benchmarks/llama/requirements.in 
-trimesh==4.4.7 + # -r benchmarks/llava/requirements.in + # -r benchmarks/llm/requirements.in + # -r benchmarks/rlhf/requirements.in + # trl +trimesh==4.4.9 # via # brax # mujoco-mjx -types-protobuf==5.27.0.20240626 +trl==0.10.1 + # via + # -c .pin/../constraints/rocm.txt + # -r benchmarks/rlhf/requirements.in +typeguard==4.3.0 + # via jaxtyping +types-protobuf==5.28.0.20240924 # via envpool typing-extensions==4.12.2 # via + # astroid + # black + # botorch # brax # chex # envpool # etils + # flashbax # flax # gymnasium # huggingface-hub # iopath + # jaxtyping # lightning # lightning-utilities + # multidict + # navix # optree # orbax-checkpoint # pytorch-lightning # reactivex + # rich # submitit + # tables # torch + # typeguard # tyro -tyro==0.8.8 - # via -r benchmarks/torchatari/requirements.in -tzdata==2024.1 +tyro==0.8.11 + # via + # -r benchmarks/torchatari/requirements.in + # navix + # trl +tzdata==2024.2 # via pandas -urllib3==2.2.2 +urllib3==2.2.3 # via # blobfile # requests -varname==0.10.0 + # sentry-sdk +varname==0.13.3 # via giving voir==0.2.19 # via @@ -572,28 +928,39 @@ voir==0.2.19 # -r benchmarks/diffusion/requirements.in # -r benchmarks/dinov2/requirements.in # -r benchmarks/flops/requirements.in + # -r benchmarks/geo_gnn/requirements.in # -r benchmarks/huggingface/requirements.in # -r benchmarks/lightning/requirements.in # -r benchmarks/llama/requirements.in + # -r benchmarks/llava/requirements.in # -r benchmarks/llm/requirements.in - # -r benchmarks/super-slomo/requirements.in + # -r benchmarks/purejaxrl/requirements.in + # -r benchmarks/recursiongfn/requirements.in + # -r benchmarks/rlhf/requirements.in # -r benchmarks/timm/requirements.in # -r benchmarks/torchatari/requirements.in # -r benchmarks/torchvision/requirements.in # -r benchmarks/torchvision_ddp/requirements.in -werkzeug==3.0.3 + # -r benchmarks/vjepa/requirements.in +wandb==0.18.3 + # via + # -r benchmarks/recursiongfn/requirements.in + # navix +webdataset==0.2.100 + # via -r benchmarks/vjepa/requirements.in +werkzeug==3.0.4 # via # flask # tensorboard -xformers==0.0.27.post2 +xformers==0.0.28.post1 # via -r benchmarks/dinov2/requirements.in xxhash==3.5.0 # via datasets yacs==0.1.8 # via fvcore -yarl==1.9.4 +yarl==1.13.1 # via aiohttp -zipp==3.20.0 +zipp==3.20.2 # via # etils # importlib-metadata diff --git a/benchmarks/brax/benchfile.py b/benchmarks/brax/benchfile.py index 0388956d..c3312813 100644 --- a/benchmarks/brax/benchfile.py +++ b/benchmarks/brax/benchfile.py @@ -5,5 +5,9 @@ class BraxBenchmark(Package): base_requirements = "requirements.in" main_script = "main.py" - + def make_env(self): + env = super().make_env() + env["XLA_PYTHON_CLIENT_PREALLOCATE"] = "False" + return env + __pack__ = BraxBenchmark diff --git a/benchmarks/brax/requirements.rocm.txt b/benchmarks/brax/requirements.rocm.txt index 0c14e04d..a1923520 100644 --- a/benchmarks/brax/requirements.rocm.txt +++ b/benchmarks/brax/requirements.rocm.txt @@ -4,7 +4,7 @@ # # pip-compile --output-file=benchmarks/brax/requirements.rocm.txt .pin/tmp-constraints-rocm-brax.txt benchmarks/brax/requirements.in # ---extra-index-url https://download.pytorch.org/whl/rocm6.0 +--extra-index-url https://download.pytorch.org/whl/rocm6.1 absl-py==2.1.0 # via @@ -33,7 +33,7 @@ brax==0.10.5 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/brax/requirements.in -chex==0.1.86 +chex==0.1.87 # via # -c .pin/../.pin/constraints-rocm-torch.txt # optax @@ -45,7 +45,7 @@ cloudpickle==3.0.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # gym 
-codefind==0.1.6 +codefind==0.1.7 # via # -c .pin/../.pin/constraints-rocm-torch.txt # ptera @@ -61,7 +61,7 @@ dm-tree==0.1.8 # via # -c .pin/../.pin/constraints-rocm-torch.txt # dm-env -etils[epath,epy]==1.7.0 +etils[epath,epy]==1.9.4 # via # -c .pin/../.pin/constraints-rocm-torch.txt # brax @@ -69,11 +69,11 @@ etils[epath,epy]==1.7.0 # mujoco-mjx # optax # orbax-checkpoint -executing==1.2.0 +executing==2.1.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # varname -filelock==3.15.4 +filelock==3.16.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # pytorch-triton-rocm @@ -83,11 +83,11 @@ flask==3.0.3 # -c .pin/../.pin/constraints-rocm-torch.txt # brax # flask-cors -flask-cors==4.0.1 +flask-cors==5.0.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # brax -flax==0.8.5 +flax==0.9.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # brax @@ -96,7 +96,7 @@ fsspec==2024.6.1 # -c .pin/../.pin/constraints-rocm-torch.txt # etils # torch -giving==0.4.2 +giving==0.4.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # ptera @@ -105,11 +105,11 @@ glfw==2.7.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # mujoco -grpcio==1.65.5 +grpcio==1.66.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # brax -gym==0.23.1 +gym==0.26.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # brax @@ -121,7 +121,7 @@ humanize==4.10.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # orbax-checkpoint -importlib-resources==6.4.3 +importlib-resources==6.4.5 # via # -c .pin/../.pin/constraints-rocm-torch.txt # etils @@ -129,7 +129,7 @@ itsdangerous==2.2.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # flask -jax==0.4.31 +jax==0.4.33 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/brax/requirements.in @@ -140,7 +140,7 @@ jax==0.4.31 # mujoco-mjx # optax # orbax-checkpoint -jaxlib==0.4.31 +jaxlib==0.4.33 # via # -c .pin/../.pin/constraints-rocm-torch.txt # brax @@ -177,7 +177,7 @@ ml-collections==0.1.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # brax -ml-dtypes==0.4.0 +ml-dtypes==0.5.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # jax @@ -187,17 +187,17 @@ mpmath==1.3.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # sympy -msgpack==1.0.8 +msgpack==1.1.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # flax # orbax-checkpoint -mujoco==3.2.2 +mujoco==3.2.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # brax # mujoco-mjx -mujoco-mjx==3.2.2 +mujoco-mjx==3.2.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # brax @@ -215,25 +215,27 @@ numpy==1.26.4 # brax # chex # dm-env - # flax # gym # jax # jaxlib # jaxopt # ml-dtypes # mujoco - # opt-einsum # optax # orbax-checkpoint # scipy # tensorboardx # tensorstore # trimesh +nvidia-ml-py==12.560.30 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir omegaconf==2.3.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # voir -opt-einsum==3.3.0 +opt-einsum==3.4.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # jax @@ -242,12 +244,12 @@ optax==0.2.3 # -c .pin/../.pin/constraints-rocm-torch.txt # brax # flax -orbax-checkpoint==0.6.0 +orbax-checkpoint==0.6.4 # via # -c .pin/../.pin/constraints-rocm-torch.txt # brax # flax -ovld==0.3.8 +ovld==0.3.9 # via # -c .pin/../.pin/constraints-rocm-torch.txt # voir @@ -259,7 +261,7 @@ pillow==10.4.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # brax -protobuf==5.27.3 +protobuf==5.28.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # orbax-checkpoint @@ -276,10 +278,6 @@ pygments==2.18.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt 
# rich -pynvml==11.5.3 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # voir pyopengl==3.1.7 # via # -c .pin/../.pin/constraints-rocm-torch.txt @@ -303,12 +301,12 @@ reactivex==4.0.4 # via # -c .pin/../.pin/constraints-rocm-torch.txt # giving -rich==13.7.1 +rich==13.9.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # flax # voir -scipy==1.14.0 +scipy==1.14.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # brax @@ -321,7 +319,7 @@ six==1.16.0 # -c .pin/../.pin/constraints-rocm-torch.txt # asttokens # ml-collections -sympy==1.13.2 +sympy==1.13.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torch @@ -329,7 +327,7 @@ tensorboardx==2.6.2.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # brax -tensorstore==0.1.64 +tensorstore==0.1.66 # via # -c .pin/../.pin/constraints-rocm-torch.txt # flax @@ -338,11 +336,11 @@ toolz==0.12.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # chex -torch==2.4.0+rocm6.0 +torch==2.4.1+rocm6.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/brax/requirements.in -trimesh==4.4.7 +trimesh==4.4.9 # via # -c .pin/../.pin/constraints-rocm-torch.txt # brax @@ -356,8 +354,9 @@ typing-extensions==4.12.2 # flax # orbax-checkpoint # reactivex + # rich # torch -varname==0.10.0 +varname==0.13.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # giving @@ -366,11 +365,11 @@ voir==0.2.19 # -c .pin/../.pin/constraints-rocm-torch.txt # -c .pin/../constraints/rocm.txt # -r benchmarks/brax/requirements.in -werkzeug==3.0.3 +werkzeug==3.0.4 # via # -c .pin/../.pin/constraints-rocm-torch.txt # flask -zipp==3.20.0 +zipp==3.20.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # etils diff --git a/benchmarks/diffusion/requirements.rocm.txt b/benchmarks/diffusion/requirements.rocm.txt index 5d0fd6e3..ecedcbb4 100644 --- a/benchmarks/diffusion/requirements.rocm.txt +++ b/benchmarks/diffusion/requirements.rocm.txt @@ -4,18 +4,18 @@ # # pip-compile --output-file=benchmarks/diffusion/requirements.rocm.txt .pin/tmp-constraints-rocm-diffusion-nodes.txt benchmarks/diffusion/requirements.in # ---extra-index-url https://download.pytorch.org/whl/rocm6.0 +--extra-index-url https://download.pytorch.org/whl/rocm6.1 -accelerate==0.33.0 +accelerate==0.34.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/diffusion/requirements.in # diffusers -aiohappyeyeballs==2.4.0 +aiohappyeyeballs==2.4.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # aiohttp -aiohttp==3.10.5 +aiohttp==3.10.8 # via # -c .pin/../.pin/constraints-rocm-torch.txt # datasets @@ -44,7 +44,7 @@ attrs==24.2.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # aiohttp -certifi==2024.7.4 +certifi==2024.8.30 # via # -c .pin/../.pin/constraints-rocm-torch.txt # requests @@ -52,15 +52,15 @@ charset-normalizer==3.3.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # requests -codefind==0.1.6 +codefind==0.1.7 # via # -c .pin/../.pin/constraints-rocm-torch.txt # ptera -datasets==2.21.0 +datasets==3.0.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/diffusion/requirements.in -diffusers[torch]==0.30.0 +diffusers[torch]==0.30.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/diffusion/requirements.in @@ -69,11 +69,11 @@ dill==0.3.8 # -c .pin/../.pin/constraints-rocm-torch.txt # datasets # multiprocess -executing==1.2.0 +executing==2.1.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # varname -filelock==3.15.4 +filelock==3.16.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # datasets @@ -93,7 +93,7 @@ 
fsspec[http]==2024.6.1 # datasets # huggingface-hub # torch -giving==0.4.2 +giving==0.4.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # ptera @@ -102,7 +102,7 @@ hjson==3.1.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # argklass -huggingface-hub==0.24.6 +huggingface-hub==0.25.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # accelerate @@ -110,16 +110,16 @@ huggingface-hub==0.24.6 # diffusers # tokenizers # transformers -idna==3.7 +idna==3.10 # via # -c .pin/../.pin/constraints-rocm-torch.txt # requests # yarl -importlib-metadata==8.4.0 +importlib-metadata==8.5.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # diffusers -importlib-resources==6.4.3 +importlib-resources==6.4.5 # via # -c .pin/../.pin/constraints-rocm-torch.txt # argklass @@ -143,7 +143,7 @@ mpmath==1.3.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # sympy -multidict==6.0.5 +multidict==6.1.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # aiohttp @@ -166,11 +166,15 @@ numpy==1.26.4 # pyarrow # torchvision # transformers +nvidia-ml-py==12.560.30 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir omegaconf==2.3.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # voir -ovld==0.3.8 +ovld==0.3.9 # via # -c .pin/../.pin/constraints-rocm-torch.txt # voir @@ -181,7 +185,7 @@ packaging==24.1 # datasets # huggingface-hub # transformers -pandas==2.2.2 +pandas==2.2.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # datasets @@ -207,10 +211,6 @@ pygments==2.18.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # rich -pynvml==11.5.3 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # voir python-dateutil==2.9.0.post0 # via # -c .pin/../.pin/constraints-rocm-torch.txt @@ -219,7 +219,7 @@ pytorch-triton-rocm==3.0.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torch -pytz==2024.1 +pytz==2024.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # pandas @@ -235,7 +235,7 @@ reactivex==4.0.4 # via # -c .pin/../.pin/constraints-rocm-torch.txt # giving -regex==2024.7.24 +regex==2024.9.11 # via # -c .pin/../.pin/constraints-rocm-torch.txt # diffusers @@ -247,11 +247,11 @@ requests==2.32.3 # diffusers # huggingface-hub # transformers -rich==13.7.1 +rich==13.9.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # voir -safetensors==0.4.4 +safetensors==0.4.5 # via # -c .pin/../.pin/constraints-rocm-torch.txt # accelerate @@ -262,7 +262,7 @@ six==1.16.0 # -c .pin/../.pin/constraints-rocm-torch.txt # asttokens # python-dateutil -sympy==1.13.2 +sympy==1.13.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torch @@ -270,13 +270,13 @@ tokenizers==0.19.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # transformers -torch==2.4.0+rocm6.0 +torch==2.4.1+rocm6.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # accelerate # diffusers # torchvision -torchvision==0.19.0+rocm6.0 +torchvision==0.19.1+rocm6.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/diffusion/requirements.in @@ -287,29 +287,32 @@ tqdm==4.66.5 # datasets # huggingface-hub # transformers -transformers==4.44.1 +transformers==4.44.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt + # -c .pin/../constraints/rocm.txt # -r benchmarks/diffusion/requirements.in typing-extensions==4.12.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # huggingface-hub + # multidict # reactivex + # rich # torch -tzdata==2024.1 +tzdata==2024.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # pandas -urllib3==2.2.2 +urllib3==2.2.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # requests 
-varname==0.10.0 +varname==0.13.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # giving -voir==0.2.17 +voir==0.2.19 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -c .pin/../constraints/rocm.txt @@ -318,11 +321,11 @@ xxhash==3.5.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # datasets -yarl==1.9.4 +yarl==1.13.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # aiohttp -zipp==3.20.0 +zipp==3.20.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # importlib-metadata diff --git a/benchmarks/dinov2/requirements.rocm.txt b/benchmarks/dinov2/requirements.rocm.txt index c46ba981..f8b7f43e 100644 --- a/benchmarks/dinov2/requirements.rocm.txt +++ b/benchmarks/dinov2/requirements.rocm.txt @@ -4,7 +4,7 @@ # # pip-compile --output-file=benchmarks/dinov2/requirements.rocm.txt .pin/tmp-constraints-rocm-dinov2-giant-gpus.txt benchmarks/dinov2/requirements.in # ---extra-index-url https://download.pytorch.org/whl/rocm6.0 +--extra-index-url https://download.pytorch.org/whl/rocm6.1 antlr4-python3-runtime==4.9.3 # via @@ -18,15 +18,15 @@ cloudpickle==3.0.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # submitit -codefind==0.1.6 +codefind==0.1.7 # via # -c .pin/../.pin/constraints-rocm-torch.txt # ptera -executing==1.2.0 +executing==2.1.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # varname -filelock==3.15.4 +filelock==3.16.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # pytorch-triton-rocm @@ -39,7 +39,7 @@ fvcore==0.1.5.post20221221 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/dinov2/requirements.in -giving==0.4.2 +giving==0.4.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # ptera @@ -53,7 +53,7 @@ jinja2==3.1.4 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torch -lightning-utilities==0.11.6 +lightning-utilities==0.11.7 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torchmetrics @@ -85,12 +85,16 @@ numpy==1.26.4 # torchmetrics # torchvision # xformers +nvidia-ml-py==12.560.30 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir omegaconf==2.3.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/dinov2/requirements.in # voir -ovld==0.3.8 +ovld==0.3.9 # via # -c .pin/../.pin/constraints-rocm-torch.txt # voir @@ -120,10 +124,6 @@ pygments==2.18.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # rich -pynvml==11.5.3 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # voir pytorch-triton-rocm==3.0.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt @@ -138,11 +138,11 @@ reactivex==4.0.4 # via # -c .pin/../.pin/constraints-rocm-torch.txt # giving -rich==13.7.1 +rich==13.9.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # voir -scipy==1.14.0 +scipy==1.14.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/dinov2/requirements.in @@ -150,11 +150,11 @@ six==1.16.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # asttokens -submitit==1.5.1 +submitit==1.5.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/dinov2/requirements.in -sympy==1.13.2 +sympy==1.13.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torch @@ -166,18 +166,18 @@ termcolor==2.4.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # fvcore -torch==2.4.0+rocm6.0 +torch==2.4.1+rocm6.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/dinov2/requirements.in # torchmetrics # torchvision # xformers -torchmetrics==1.4.1 +torchmetrics==1.4.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/dinov2/requirements.in -torchvision==0.19.0+rocm6.0 
+torchvision==0.19.1+rocm6.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/dinov2/requirements.in @@ -192,18 +192,19 @@ typing-extensions==4.12.2 # iopath # lightning-utilities # reactivex + # rich # submitit # torch -varname==0.10.0 +varname==0.13.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # giving -voir==0.2.17 +voir==0.2.19 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -c .pin/../constraints/rocm.txt # -r benchmarks/dinov2/requirements.in -xformers==0.0.27.post2 +xformers==0.0.28.post1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/dinov2/requirements.in diff --git a/benchmarks/flops/requirements.rocm.txt b/benchmarks/flops/requirements.rocm.txt index d9ac15eb..fbc8952d 100644 --- a/benchmarks/flops/requirements.rocm.txt +++ b/benchmarks/flops/requirements.rocm.txt @@ -4,7 +4,7 @@ # # pip-compile --output-file=benchmarks/flops/requirements.rocm.txt .pin/tmp-constraints-rocm-flops.txt benchmarks/flops/requirements.in # ---extra-index-url https://download.pytorch.org/whl/rocm6.0 +--extra-index-url https://download.pytorch.org/whl/rocm6.1 antlr4-python3-runtime==4.9.3 # via @@ -14,15 +14,15 @@ asttokens==2.4.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # giving -codefind==0.1.6 +codefind==0.1.7 # via # -c .pin/../.pin/constraints-rocm-torch.txt # ptera -executing==1.2.0 +executing==2.1.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # varname -filelock==3.15.4 +filelock==3.16.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # pytorch-triton-rocm @@ -31,12 +31,12 @@ fsspec==2024.6.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torch -giving==0.4.2 +giving==0.4.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # ptera # voir -importlib-resources==6.4.3 +importlib-resources==6.4.5 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torchcompat @@ -68,11 +68,15 @@ numpy==1.26.4 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torchvision +nvidia-ml-py==12.560.30 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir omegaconf==2.3.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # voir -ovld==0.3.8 +ovld==0.3.9 # via # -c .pin/../.pin/constraints-rocm-torch.txt # voir @@ -92,10 +96,6 @@ pygments==2.18.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # rich -pynvml==11.5.3 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # voir pytorch-triton-rocm==3.0.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt @@ -108,7 +108,7 @@ reactivex==4.0.4 # via # -c .pin/../.pin/constraints-rocm-torch.txt # giving -rich==13.7.1 +rich==13.9.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # voir @@ -116,11 +116,11 @@ six==1.16.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # asttokens -sympy==1.13.2 +sympy==1.13.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torch -torch==2.4.0+rocm6.0 +torch==2.4.1+rocm6.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/flops/requirements.in @@ -130,7 +130,7 @@ torchcompat==1.1.4 # -c .pin/../.pin/constraints-rocm-torch.txt # -c .pin/../constraints/rocm.txt # -r benchmarks/flops/requirements.in -torchvision==0.19.0+rocm6.0 +torchvision==0.19.1+rocm6.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/flops/requirements.in @@ -142,8 +142,9 @@ typing-extensions==4.12.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # reactivex + # rich # torch -varname==0.10.0 +varname==0.13.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # giving diff --git a/benchmarks/geo_gnn/requirements-pre.rocm.txt 
b/benchmarks/geo_gnn/requirements-pre.rocm.txt index 3aded346..9b4cf02f 100644 --- a/benchmarks/geo_gnn/requirements-pre.rocm.txt +++ b/benchmarks/geo_gnn/requirements-pre.rocm.txt @@ -2,48 +2,48 @@ # This file is autogenerated by pip-compile with Python 3.10 # by the following command: # -# pip-compile --output-file=benchmarks/geo_gnn/requirements-pre.rocm.txt .pin/tmp-constraints-rocm-geo_gnn.txt benchmarks/geo_gnn/requirements-pre.in +# pip-compile --output-file=benchmarks/geo_gnn/requirements-pre.rocm.txt .pin/tmp-constraints-rocm-dimenet.txt benchmarks/geo_gnn/requirements-pre.in # ---extra-index-url https://download.pytorch.org/whl/rocm6.0 +--extra-index-url https://download.pytorch.org/whl/rocm6.1 -filelock==3.15.4 +filelock==3.16.1 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # pytorch-triton-rocm # torch fsspec==2024.6.1 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # torch jinja2==3.1.4 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # torch markupsafe==2.1.5 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # jinja2 mpmath==1.3.0 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # sympy networkx==3.3 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # torch pytorch-triton-rocm==3.0.0 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # torch -sympy==1.13.2 +sympy==1.13.3 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # torch -torch==2.4.0+rocm6.0 +torch==2.4.1+rocm6.1 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/geo_gnn/requirements-pre.in typing-extensions==4.12.2 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # torch diff --git a/benchmarks/geo_gnn/requirements.rocm.txt b/benchmarks/geo_gnn/requirements.rocm.txt index 60246f79..8dfacfe7 100644 --- a/benchmarks/geo_gnn/requirements.rocm.txt +++ b/benchmarks/geo_gnn/requirements.rocm.txt @@ -2,271 +2,258 @@ # This file is autogenerated by pip-compile with Python 3.10 # by the following command: # -# pip-compile --output-file=benchmarks/geo_gnn/requirements.rocm.txt .pin/tmp-constraints-rocm-geo_gnn.txt benchmarks/geo_gnn/requirements-pre.rocm.txt benchmarks/geo_gnn/requirements.in +# pip-compile --output-file=benchmarks/geo_gnn/requirements.rocm.txt .pin/tmp-constraints-rocm-dimenet.txt benchmarks/geo_gnn/requirements-pre.rocm.txt benchmarks/geo_gnn/requirements.in # ---extra-index-url https://download.pytorch.org/whl/rocm6.0 +--extra-index-url https://download.pytorch.org/whl/rocm6.1 -aiohappyeyeballs==2.4.0 +aiohappyeyeballs==2.4.3 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # aiohttp -aiohttp==3.10.5 +aiohttp==3.10.8 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # torch-geometric aiosignal==1.3.1 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # aiohttp antlr4-python3-runtime==4.9.3 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # omegaconf asttokens==2.4.1 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c 
.pin/../.pin/constraints-rocm-torch.txt # giving async-timeout==4.0.3 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # aiohttp attrs==24.2.0 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # aiohttp -certifi==2024.7.4 +certifi==2024.8.30 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # requests charset-normalizer==3.3.2 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # requests -codefind==0.1.6 +codefind==0.1.7 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # ptera -executing==1.2.0 +executing==2.1.0 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # varname -filelock==3.15.4 +filelock==3.16.1 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/geo_gnn/requirements-pre.rocm.txt # pytorch-triton-rocm # torch frozenlist==1.4.1 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # aiohttp # aiosignal fsspec==2024.6.1 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/geo_gnn/requirements-pre.rocm.txt # torch # torch-geometric -giving==0.4.2 +giving==0.4.3 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # ptera # voir -idna==3.7 +idna==3.10 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # requests # yarl jinja2==3.1.4 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/geo_gnn/requirements-pre.rocm.txt # torch # torch-geometric -joblib==1.4.2 - # via - # -c .pin/../.pin/constraints-rocm-gnn.txt - # scikit-learn markdown-it-py==3.0.0 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # rich markupsafe==2.1.5 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/geo_gnn/requirements-pre.rocm.txt # jinja2 mdurl==0.1.2 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # markdown-it-py mpmath==1.3.0 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/geo_gnn/requirements-pre.rocm.txt # sympy -multidict==6.0.5 +multidict==6.1.0 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # aiohttp # yarl networkx==3.3 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/geo_gnn/requirements-pre.rocm.txt # torch numpy==1.26.4 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/geo_gnn/requirements.in # pandas # rdkit - # scikit-learn # scipy # torch-geometric +nvidia-ml-py==12.560.30 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir omegaconf==2.3.0 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # voir -ovld==0.3.8 +ovld==0.3.9 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # voir -pandas==2.2.2 +pandas==2.2.3 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c 
.pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/geo_gnn/requirements.in pillow==10.4.0 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # rdkit psutil==5.9.8 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # torch-geometric # voir ptera==1.4.1 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # voir pygments==2.18.0 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # rich -pynvml==11.5.3 - # via - # -c .pin/../.pin/constraints-rocm-gnn.txt - # voir -pyparsing==3.1.2 +pyparsing==3.1.4 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # torch-geometric python-dateutil==2.9.0.post0 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # pandas pytorch-triton-rocm==3.0.0 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/geo_gnn/requirements-pre.rocm.txt # torch -pytz==2024.1 +pytz==2024.2 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # pandas pyyaml==6.0.2 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # omegaconf rdkit==2024.3.5 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/geo_gnn/requirements.in reactivex==4.0.4 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # giving requests==2.32.3 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # torch-geometric -rich==13.7.1 +rich==13.9.1 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # voir -scikit-learn==1.5.1 +scipy==1.14.1 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt - # torch-geometric -scipy==1.14.0 - # via - # -c .pin/../.pin/constraints-rocm-gnn.txt - # scikit-learn + # -c .pin/../.pin/constraints-rocm-torch.txt # torch-cluster - # torch-geometric # torch-sparse six==1.16.0 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # asttokens # python-dateutil -sympy==1.13.2 +sympy==1.13.3 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/geo_gnn/requirements-pre.rocm.txt # torch -threadpoolctl==3.5.0 +torch==2.4.1+rocm6.1 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt - # scikit-learn -torch==2.4.0+rocm6.0 - # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/geo_gnn/requirements-pre.rocm.txt torch-cluster==1.6.3 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/geo_gnn/requirements.in -torch-geometric==2.5.3 +torch-geometric==2.6.1 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/geo_gnn/requirements.in torch-scatter==2.1.2 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/geo_gnn/requirements.in torch-sparse==0.6.18 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/geo_gnn/requirements.in tqdm==4.66.5 # via - # -c 
.pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # torch-geometric typing-extensions==4.12.2 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/geo_gnn/requirements-pre.rocm.txt + # multidict # reactivex + # rich # torch -tzdata==2024.1 +tzdata==2024.2 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # pandas -urllib3==2.2.2 +urllib3==2.2.3 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # requests -varname==0.10.0 +varname==0.13.3 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # giving -voir==0.2.17 +voir==0.2.19 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # -c .pin/../constraints/rocm.txt # -r benchmarks/geo_gnn/requirements.in -yarl==1.9.4 +yarl==1.13.1 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # aiohttp diff --git a/benchmarks/huggingface/requirements.rocm.txt b/benchmarks/huggingface/requirements.rocm.txt index 1f54d841..653d2c59 100644 --- a/benchmarks/huggingface/requirements.rocm.txt +++ b/benchmarks/huggingface/requirements.rocm.txt @@ -4,7 +4,7 @@ # # pip-compile --output-file=benchmarks/huggingface/requirements.rocm.txt .pin/tmp-constraints-rocm-hf.txt benchmarks/huggingface/requirements.in # ---extra-index-url https://download.pytorch.org/whl/rocm6.0 +--extra-index-url https://download.pytorch.org/whl/rocm6.1 antlr4-python3-runtime==4.9.3 # via @@ -14,7 +14,7 @@ asttokens==2.4.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # giving -certifi==2024.7.4 +certifi==2024.8.30 # via # -c .pin/../.pin/constraints-rocm-torch.txt # requests @@ -22,15 +22,15 @@ charset-normalizer==3.3.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # requests -codefind==0.1.6 +codefind==0.1.7 # via # -c .pin/../.pin/constraints-rocm-torch.txt # ptera -executing==1.2.0 +executing==2.1.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # varname -filelock==3.15.4 +filelock==3.16.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # huggingface-hub @@ -42,17 +42,17 @@ fsspec==2024.6.1 # -c .pin/../.pin/constraints-rocm-torch.txt # huggingface-hub # torch -giving==0.4.2 +giving==0.4.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # ptera # voir -huggingface-hub==0.24.6 +huggingface-hub==0.25.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # tokenizers # transformers -idna==3.7 +idna==3.10 # via # -c .pin/../.pin/constraints-rocm-torch.txt # requests @@ -84,11 +84,15 @@ numpy==1.26.4 # via # -c .pin/../.pin/constraints-rocm-torch.txt # transformers +nvidia-ml-py==12.560.30 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir omegaconf==2.3.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # voir -ovld==0.3.8 +ovld==0.3.9 # via # -c .pin/../.pin/constraints-rocm-torch.txt # voir @@ -113,10 +117,6 @@ pygments==2.18.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # rich -pynvml==11.5.3 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # voir pytorch-triton-rocm==3.0.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt @@ -131,7 +131,7 @@ reactivex==4.0.4 # via # -c .pin/../.pin/constraints-rocm-torch.txt # giving -regex==2024.7.24 +regex==2024.9.11 # via # -c .pin/../.pin/constraints-rocm-torch.txt # transformers @@ -140,11 +140,11 @@ requests==2.32.3 # -c .pin/../.pin/constraints-rocm-torch.txt # 
huggingface-hub # transformers -rich==13.7.1 +rich==13.9.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # voir -safetensors==0.4.4 +safetensors==0.4.5 # via # -c .pin/../.pin/constraints-rocm-torch.txt # transformers @@ -152,7 +152,7 @@ six==1.16.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # asttokens -sympy==1.13.2 +sympy==1.13.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torch @@ -160,7 +160,7 @@ tokenizers==0.19.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # transformers -torch==2.4.0+rocm6.0 +torch==2.4.1+rocm6.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/huggingface/requirements.in @@ -169,21 +169,23 @@ tqdm==4.66.5 # -c .pin/../.pin/constraints-rocm-torch.txt # huggingface-hub # transformers -transformers==4.44.1 +transformers==4.44.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt + # -c .pin/../constraints/rocm.txt # -r benchmarks/huggingface/requirements.in typing-extensions==4.12.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # huggingface-hub # reactivex + # rich # torch -urllib3==2.2.2 +urllib3==2.2.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # requests -varname==0.10.0 +varname==0.13.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # giving diff --git a/benchmarks/lightning/requirements.rocm.txt b/benchmarks/lightning/requirements.rocm.txt index 26fdcedf..aee2b1ba 100644 --- a/benchmarks/lightning/requirements.rocm.txt +++ b/benchmarks/lightning/requirements.rocm.txt @@ -4,13 +4,13 @@ # # pip-compile --output-file=benchmarks/lightning/requirements.rocm.txt .pin/tmp-constraints-rocm-lightning-gpus.txt benchmarks/lightning/requirements.in # ---extra-index-url https://download.pytorch.org/whl/rocm6.0 +--extra-index-url https://download.pytorch.org/whl/rocm6.1 -aiohappyeyeballs==2.4.0 +aiohappyeyeballs==2.4.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # aiohttp -aiohttp==3.10.5 +aiohttp==3.10.8 # via # -c .pin/../.pin/constraints-rocm-torch.txt # fsspec @@ -34,15 +34,15 @@ attrs==24.2.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # aiohttp -codefind==0.1.6 +codefind==0.1.7 # via # -c .pin/../.pin/constraints-rocm-torch.txt # ptera -executing==1.2.0 +executing==2.1.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # varname -filelock==3.15.4 +filelock==3.16.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # pytorch-triton-rocm @@ -58,16 +58,16 @@ fsspec[http]==2024.6.1 # lightning # pytorch-lightning # torch -giving==0.4.2 +giving==0.4.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # ptera # voir -idna==3.7 +idna==3.10 # via # -c .pin/../.pin/constraints-rocm-torch.txt # yarl -importlib-resources==6.4.3 +importlib-resources==6.4.5 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torchcompat @@ -79,7 +79,7 @@ lightning==2.4.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/lightning/requirements.in -lightning-utilities==0.11.6 +lightning-utilities==0.11.7 # via # -c .pin/../.pin/constraints-rocm-torch.txt # lightning @@ -101,7 +101,7 @@ mpmath==1.3.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # sympy -multidict==6.0.5 +multidict==6.1.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # aiohttp @@ -115,11 +115,15 @@ numpy==1.26.4 # -c .pin/../.pin/constraints-rocm-torch.txt # torchmetrics # torchvision +nvidia-ml-py==12.560.30 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir omegaconf==2.3.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # voir -ovld==0.3.8 +ovld==0.3.9 # via # -c .pin/../.pin/constraints-rocm-torch.txt # voir 
@@ -146,10 +150,6 @@ pygments==2.18.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # rich -pynvml==11.5.3 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # voir pytorch-lightning==2.4.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt @@ -168,7 +168,7 @@ reactivex==4.0.4 # via # -c .pin/../.pin/constraints-rocm-torch.txt # giving -rich==13.7.1 +rich==13.9.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # voir @@ -176,11 +176,11 @@ six==1.16.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # asttokens -sympy==1.13.2 +sympy==1.13.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torch -torch==2.4.0+rocm6.0 +torch==2.4.1+rocm6.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/lightning/requirements.in @@ -193,12 +193,12 @@ torchcompat==1.1.4 # -c .pin/../.pin/constraints-rocm-torch.txt # -c .pin/../constraints/rocm.txt # -r benchmarks/lightning/requirements.in -torchmetrics==1.4.1 +torchmetrics==1.4.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # lightning # pytorch-lightning -torchvision==0.19.0+rocm6.0 +torchvision==0.19.1+rocm6.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/lightning/requirements.in @@ -212,19 +212,21 @@ typing-extensions==4.12.2 # -c .pin/../.pin/constraints-rocm-torch.txt # lightning # lightning-utilities + # multidict # pytorch-lightning # reactivex + # rich # torch -varname==0.10.0 +varname==0.13.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # giving -voir==0.2.17 +voir==0.2.19 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -c .pin/../constraints/rocm.txt # -r benchmarks/lightning/requirements.in -yarl==1.9.4 +yarl==1.13.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # aiohttp diff --git a/benchmarks/llama/requirements.rocm.txt b/benchmarks/llama/requirements.rocm.txt index 97c44bb0..41a93e55 100644 --- a/benchmarks/llama/requirements.rocm.txt +++ b/benchmarks/llama/requirements.rocm.txt @@ -4,13 +4,13 @@ # # pip-compile --output-file=benchmarks/llama/requirements.rocm.txt .pin/tmp-constraints-rocm-llm.txt benchmarks/llama/requirements.in # ---extra-index-url https://download.pytorch.org/whl/rocm6.0 +--extra-index-url https://download.pytorch.org/whl/rocm6.1 -aiohappyeyeballs==2.4.0 +aiohappyeyeballs==2.4.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # aiohttp -aiohttp==3.10.5 +aiohttp==3.10.8 # via # -c .pin/../.pin/constraints-rocm-torch.txt # datasets @@ -35,7 +35,7 @@ attrs==24.2.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # aiohttp -certifi==2024.7.4 +certifi==2024.8.30 # via # -c .pin/../.pin/constraints-rocm-torch.txt # requests @@ -43,11 +43,11 @@ charset-normalizer==3.3.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # requests -codefind==0.1.6 +codefind==0.1.7 # via # -c .pin/../.pin/constraints-rocm-torch.txt # ptera -datasets==2.21.0 +datasets==3.0.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/llama/requirements.in @@ -56,7 +56,7 @@ dill==0.3.8 # -c .pin/../.pin/constraints-rocm-torch.txt # datasets # multiprocess -executing==1.2.0 +executing==2.1.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # varname @@ -64,7 +64,7 @@ fairscale==0.4.13 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/llama/requirements.in -filelock==3.15.4 +filelock==3.16.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # datasets @@ -72,7 +72,7 @@ filelock==3.15.4 # pytorch-triton-rocm # torch # transformers -fire==0.6.0 +fire==0.7.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r 
benchmarks/llama/requirements.in @@ -87,18 +87,18 @@ fsspec[http]==2024.6.1 # datasets # huggingface-hub # torch -giving==0.4.2 +giving==0.4.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # ptera # voir -huggingface-hub==0.24.6 +huggingface-hub==0.25.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # datasets # tokenizers # transformers -idna==3.7 +idna==3.10 # via # -c .pin/../.pin/constraints-rocm-torch.txt # requests @@ -123,7 +123,7 @@ mpmath==1.3.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # sympy -multidict==6.0.5 +multidict==6.1.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # aiohttp @@ -144,11 +144,15 @@ numpy==1.26.4 # pandas # pyarrow # transformers +nvidia-ml-py==12.560.30 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir omegaconf==2.3.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # voir -ovld==0.3.8 +ovld==0.3.9 # via # -c .pin/../.pin/constraints-rocm-torch.txt # voir @@ -158,7 +162,7 @@ packaging==24.1 # datasets # huggingface-hub # transformers -pandas==2.2.2 +pandas==2.2.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # datasets @@ -178,10 +182,6 @@ pygments==2.18.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # rich -pynvml==11.5.3 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # voir python-dateutil==2.9.0.post0 # via # -c .pin/../.pin/constraints-rocm-torch.txt @@ -190,7 +190,7 @@ pytorch-triton-rocm==3.0.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torch -pytz==2024.1 +pytz==2024.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # pandas @@ -205,7 +205,7 @@ reactivex==4.0.4 # via # -c .pin/../.pin/constraints-rocm-torch.txt # giving -regex==2024.7.24 +regex==2024.9.11 # via # -c .pin/../.pin/constraints-rocm-torch.txt # transformers @@ -215,11 +215,11 @@ requests==2.32.3 # datasets # huggingface-hub # transformers -rich==13.7.1 +rich==13.9.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # voir -safetensors==0.4.4 +safetensors==0.4.5 # via # -c .pin/../.pin/constraints-rocm-torch.txt # transformers @@ -231,9 +231,8 @@ six==1.16.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # asttokens - # fire # python-dateutil -sympy==1.13.2 +sympy==1.13.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torch @@ -245,7 +244,7 @@ tokenizers==0.19.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # transformers -torch==2.4.0+rocm6.0 +torch==2.4.1+rocm6.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/llama/requirements.in @@ -256,25 +255,28 @@ tqdm==4.66.5 # datasets # huggingface-hub # transformers -transformers==4.44.1 +transformers==4.44.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt + # -c .pin/../constraints/rocm.txt # -r benchmarks/llama/requirements.in typing-extensions==4.12.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # huggingface-hub + # multidict # reactivex + # rich # torch -tzdata==2024.1 +tzdata==2024.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # pandas -urllib3==2.2.2 +urllib3==2.2.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # requests -varname==0.10.0 +varname==0.13.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # giving @@ -287,7 +289,7 @@ xxhash==3.5.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # datasets -yarl==1.9.4 +yarl==1.13.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # aiohttp diff --git a/benchmarks/llava/requirements.rocm.txt b/benchmarks/llava/requirements.rocm.txt new file mode 100644 index 00000000..fe11f280 --- /dev/null +++ b/benchmarks/llava/requirements.rocm.txt @@ -0,0 +1,293 @@ 
+# +# This file is autogenerated by pip-compile with Python 3.10 +# by the following command: +# +# pip-compile --output-file=benchmarks/llava/requirements.rocm.txt .pin/tmp-constraints-rocm-llava-single.txt benchmarks/llava/requirements.in +# +--extra-index-url https://download.pytorch.org/whl/rocm6.1 + +accelerate==0.34.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/llava/requirements.in +aiohappyeyeballs==2.4.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # aiohttp +aiohttp==3.10.8 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # datasets + # fsspec +aiosignal==1.3.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # aiohttp +antlr4-python3-runtime==4.9.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # omegaconf +asttokens==2.4.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # giving +async-timeout==4.0.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # aiohttp +attrs==24.2.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # aiohttp +certifi==2024.8.30 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # requests +charset-normalizer==3.3.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # requests +codefind==0.1.7 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # ptera +datasets==3.0.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/llava/requirements.in +dill==0.3.8 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # datasets + # multiprocess +executing==2.1.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # varname +filelock==3.16.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # datasets + # huggingface-hub + # pytorch-triton-rocm + # torch + # transformers +frozenlist==1.4.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # aiohttp + # aiosignal +fsspec[http]==2024.6.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # datasets + # huggingface-hub + # torch +giving==0.4.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # ptera + # voir +huggingface-hub==0.25.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # accelerate + # datasets + # tokenizers + # transformers +idna==3.10 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # requests + # yarl +jinja2==3.1.4 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # torch +markdown-it-py==3.0.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # rich +markupsafe==2.1.5 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # jinja2 +mdurl==0.1.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # markdown-it-py +mpmath==1.3.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # sympy +multidict==6.1.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # aiohttp + # yarl +multiprocess==0.70.16 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # datasets +networkx==3.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # torch +numpy==1.26.4 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/llava/requirements.in + # accelerate + # datasets + # pandas + # pyarrow + # transformers +nvidia-ml-py==12.560.30 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir +omegaconf==2.3.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir +ovld==0.3.9 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir +packaging==24.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # accelerate + # datasets + # huggingface-hub + # transformers 
+pandas==2.2.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # datasets +pillow==10.4.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/llava/requirements.in +psutil==5.9.8 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # accelerate + # voir +ptera==1.4.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir +pyarrow==17.0.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # datasets +pygments==2.18.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # rich +python-dateutil==2.9.0.post0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # pandas +pytorch-triton-rocm==3.0.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # torch +pytz==2024.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # pandas +pyyaml==6.0.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # accelerate + # datasets + # huggingface-hub + # omegaconf + # transformers +reactivex==4.0.4 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # giving +regex==2024.9.11 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # transformers +requests==2.32.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # datasets + # huggingface-hub + # transformers +rich==13.9.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir +safetensors==0.4.5 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # accelerate + # transformers +six==1.16.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # asttokens + # python-dateutil +sympy==1.13.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # torch +tokenizers==0.19.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # transformers +torch==2.4.1+rocm6.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/llava/requirements.in + # accelerate +tqdm==4.66.5 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # datasets + # huggingface-hub + # transformers +transformers==4.44.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -c .pin/../constraints/rocm.txt + # -r benchmarks/llava/requirements.in +typing-extensions==4.12.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # huggingface-hub + # multidict + # reactivex + # rich + # torch +tzdata==2024.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # pandas +urllib3==2.2.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # requests +varname==0.13.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # giving +voir==0.2.19 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -c .pin/../constraints/rocm.txt + # -r benchmarks/llava/requirements.in +xxhash==3.5.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # datasets +yarl==1.13.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # aiohttp diff --git a/benchmarks/llm/requirements.rocm.txt b/benchmarks/llm/requirements.rocm.txt index ab5098d0..055089f0 100644 --- a/benchmarks/llm/requirements.rocm.txt +++ b/benchmarks/llm/requirements.rocm.txt @@ -4,13 +4,17 @@ # # pip-compile --output-file=benchmarks/llm/requirements.rocm.txt .pin/tmp-constraints-rocm-llm-full-mp-nodes.txt benchmarks/llm/requirements.in # ---extra-index-url https://download.pytorch.org/whl/rocm6.0 +--extra-index-url https://download.pytorch.org/whl/rocm6.1 -aiohappyeyeballs==2.4.0 +accelerate==0.34.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/llm/requirements.in +aiohappyeyeballs==2.4.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # aiohttp -aiohttp==3.10.5 
+aiohttp==3.10.8 # via # -c .pin/../.pin/constraints-rocm-torch.txt # datasets @@ -39,11 +43,12 @@ attrs==24.2.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # aiohttp -blobfile==2.1.1 +blobfile==3.0.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/llm/requirements.txt # torchtune -certifi==2024.7.4 +certifi==2024.8.30 # via # -c .pin/../.pin/constraints-rocm-torch.txt # requests @@ -51,11 +56,11 @@ charset-normalizer==3.3.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # requests -codefind==0.1.6 +codefind==0.1.7 # via # -c .pin/../.pin/constraints-rocm-torch.txt # ptera -datasets==2.21.0 +datasets==3.0.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torchtune @@ -64,11 +69,16 @@ dill==0.3.8 # -c .pin/../.pin/constraints-rocm-torch.txt # datasets # multiprocess -executing==1.2.0 +executing==2.1.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # varname -filelock==3.15.4 +fairscale==0.4.13 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/llm/requirements.in + # -r benchmarks/llm/requirements.txt +filelock==3.16.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # blobfile @@ -76,6 +86,11 @@ filelock==3.15.4 # huggingface-hub # pytorch-triton-rocm # torch + # transformers +fire==0.7.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/llm/requirements.txt frozenlist==1.4.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt @@ -87,7 +102,7 @@ fsspec[http]==2024.6.1 # datasets # huggingface-hub # torch -giving==0.4.2 +giving==0.4.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # ptera @@ -96,17 +111,20 @@ hjson==3.1.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # argklass -huggingface-hub==0.24.6 +huggingface-hub==0.25.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt + # accelerate # datasets + # tokenizers # torchtune -idna==3.7 + # transformers +idna==3.10 # via # -c .pin/../.pin/constraints-rocm-torch.txt # requests # yarl -importlib-resources==6.4.3 +importlib-resources==6.4.5 # via # -c .pin/../.pin/constraints-rocm-torch.txt # argklass @@ -114,7 +132,7 @@ jinja2==3.1.4 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torch -lxml==4.9.4 +lxml==5.3.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # blobfile @@ -134,7 +152,7 @@ mpmath==1.3.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # sympy -multidict==6.0.5 +multidict==6.1.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # aiohttp @@ -150,31 +168,41 @@ networkx==3.3 numpy==1.26.4 # via # -c .pin/../.pin/constraints-rocm-torch.txt + # accelerate # datasets + # fairscale # pandas # pyarrow # torchtune + # transformers +nvidia-ml-py==12.560.30 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir omegaconf==2.3.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torchtune # voir -ovld==0.3.8 +ovld==0.3.9 # via # -c .pin/../.pin/constraints-rocm-torch.txt # voir packaging==24.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt + # accelerate # datasets # huggingface-hub -pandas==2.2.2 + # transformers +pandas==2.2.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # datasets psutil==5.9.8 # via # -c .pin/../.pin/constraints-rocm-torch.txt + # accelerate # voir ptera==1.4.1 # via @@ -184,7 +212,7 @@ pyarrow==17.0.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # datasets -pycryptodomex==3.20.0 +pycryptodomex==3.21.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # blobfile @@ -192,10 +220,6 @@ pygments==2.18.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # rich -pynvml==11.5.3 - # via - 
# -c .pin/../.pin/constraints-rocm-torch.txt - # voir python-dateutil==2.9.0.post0 # via # -c .pin/../.pin/constraints-rocm-torch.txt @@ -204,7 +228,7 @@ pytorch-triton-rocm==3.0.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torch -pytz==2024.1 +pytz==2024.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # pandas @@ -212,31 +236,37 @@ pyyaml==6.0.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/llm/requirements.in + # accelerate # datasets # huggingface-hub # omegaconf + # transformers reactivex==4.0.4 # via # -c .pin/../.pin/constraints-rocm-torch.txt # giving -regex==2024.7.24 +regex==2024.9.11 # via # -c .pin/../.pin/constraints-rocm-torch.txt # tiktoken + # transformers requests==2.32.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # datasets # huggingface-hub # tiktoken -rich==13.7.1 + # transformers +rich==13.9.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # voir -safetensors==0.4.4 +safetensors==0.4.5 # via # -c .pin/../.pin/constraints-rocm-torch.txt + # accelerate # torchtune + # transformers sentencepiece==0.2.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt @@ -246,25 +276,39 @@ six==1.16.0 # -c .pin/../.pin/constraints-rocm-torch.txt # asttokens # python-dateutil -sympy==1.13.2 +sympy==1.13.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torch +termcolor==2.4.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # fire tiktoken==0.7.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torchtune -torch==2.4.0+rocm6.0 +tokenizers==0.19.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # transformers +torch==2.4.1+rocm6.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/llm/requirements.in + # -r benchmarks/llm/requirements.txt + # accelerate + # fairscale torchao==0.3.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt + # -c .pin/../constraints/rocm.txt + # -r benchmarks/llm/requirements.in # torchtune torchtune==0.2.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt + # -c .pin/../constraints/rocm.txt # -r benchmarks/llm/requirements.in tqdm==4.66.5 # via @@ -272,26 +316,34 @@ tqdm==4.66.5 # datasets # huggingface-hub # torchtune + # transformers +transformers==4.44.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -c .pin/../constraints/rocm.txt + # -r benchmarks/llm/requirements.in typing-extensions==4.12.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # huggingface-hub + # multidict # reactivex + # rich # torch -tzdata==2024.1 +tzdata==2024.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # pandas -urllib3==2.2.2 +urllib3==2.2.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # blobfile # requests -varname==0.10.0 +varname==0.13.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # giving -voir==0.2.17 +voir==0.2.19 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -c .pin/../constraints/rocm.txt @@ -300,7 +352,7 @@ xxhash==3.5.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # datasets -yarl==1.9.4 +yarl==1.13.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # aiohttp diff --git a/benchmarks/purejaxrl/benchfile.py b/benchmarks/purejaxrl/benchfile.py index 08a51cef..ab1c0ee7 100644 --- a/benchmarks/purejaxrl/benchfile.py +++ b/benchmarks/purejaxrl/benchfile.py @@ -18,7 +18,9 @@ class Template(Package): def make_env(self): # Return a dict of environment variables for prepare_script and # main_script. 
-        return super().make_env()
+        env = super().make_env()
+        env["XLA_PYTHON_CLIENT_PREALLOCATE"] = "False"
+        return env
 
     async def install(self):
         await super().install()  # super() call installs the requirements
diff --git a/benchmarks/purejaxrl/main.py b/benchmarks/purejaxrl/main.py
index f37c45e0..c3a3630d 100644
--- a/benchmarks/purejaxrl/main.py
+++ b/benchmarks/purejaxrl/main.py
@@ -6,6 +6,7 @@
 
 import argklass
+import torch # This is a bit of a trick to make jax use torch's packaged libs
 
 from dqn import add_dqn_command, main as dqn_main
 from ppo import add_ppo_command, main as ppo_main
 
diff --git a/benchmarks/purejaxrl/requirements.rocm.txt b/benchmarks/purejaxrl/requirements.rocm.txt
new file mode 100644
index 00000000..226415e0
--- /dev/null
+++ b/benchmarks/purejaxrl/requirements.rocm.txt
@@ -0,0 +1,693 @@
+#
+# This file is autogenerated by pip-compile with Python 3.10
+# by the following command:
+#
+#    pip-compile --output-file=benchmarks/purejaxrl/requirements.rocm.txt .pin/tmp-constraints-rocm-ppo.txt benchmarks/purejaxrl/requirements.in
+#
+--extra-index-url https://download.pytorch.org/whl/rocm6.1
+
+absl-py==2.1.0
+    # via
+    #   -c .pin/../.pin/constraints-rocm-torch.txt
+    #   brax
+    #   chex
+    #   distrax
+    #   dm-env
+    #   ml-collections
+    #   mujoco
+    #   mujoco-mjx
+    #   optax
+    #   orbax-checkpoint
+    #   rlax
+    #   tensorflow-probability
+antlr4-python3-runtime==4.9.3
+    # via
+    #   -c .pin/../.pin/constraints-rocm-torch.txt
+    #   omegaconf
+argklass==1.4.4
+    # via
+    #   -c .pin/../.pin/constraints-rocm-torch.txt
+    #   -r benchmarks/purejaxrl/requirements.in
+astroid==3.3.4
+    # via
+    #   -c .pin/../.pin/constraints-rocm-torch.txt
+    #   pylint
+asttokens==2.4.1
+    # via
+    #   -c .pin/../.pin/constraints-rocm-torch.txt
+    #   giving
+black==24.8.0
+    # via
+    #   -c .pin/../.pin/constraints-rocm-torch.txt
+    #   navix
+blinker==1.8.2
+    # via
+    #   -c .pin/../.pin/constraints-rocm-torch.txt
+    #   flask
+brax==0.10.5
+    # via
+    #   -c .pin/../.pin/constraints-rocm-torch.txt
+    #   -r benchmarks/purejaxrl/requirements.in
+certifi==2024.8.30
+    # via
+    #   -c .pin/../.pin/constraints-rocm-torch.txt
+    #   requests
+    #   sentry-sdk
+charset-normalizer==3.3.2
+    # via
+    #   -c .pin/../.pin/constraints-rocm-torch.txt
+    #   requests
+chex==0.1.87
+    # via
+    #   -c .pin/../.pin/constraints-rocm-torch.txt
+    #   distrax
+    #   evosax
+    #   flashbax
+    #   gymnax
+    #   optax
+    #   rlax
+click==8.1.7
+    # via
+    #   -c .pin/../.pin/constraints-rocm-torch.txt
+    #   black
+    #   flask
+    #   wandb
+cloudpickle==3.0.0
+    # via
+    #   -c .pin/../.pin/constraints-rocm-torch.txt
+    #   gym
+    #   gymnasium
+    #   tensorflow-probability
+codefind==0.1.7
+    # via
+    #   -c .pin/../.pin/constraints-rocm-torch.txt
+    #   ptera
+contextlib2==21.6.0
+    # via
+    #   -c .pin/../.pin/constraints-rocm-torch.txt
+    #   ml-collections
+contourpy==1.3.0
+    # via
+    #   -c .pin/../.pin/constraints-rocm-torch.txt
+    #   matplotlib
+cycler==0.12.1
+    # via
+    #   -c .pin/../.pin/constraints-rocm-torch.txt
+    #   matplotlib
+decorator==5.1.1
+    # via
+    #   -c .pin/../.pin/constraints-rocm-torch.txt
+    #   tensorflow-probability
+dill==0.3.8
+    # via
+    #   -c .pin/../.pin/constraints-rocm-torch.txt
+    #   pylint
+distrax==0.1.5
+    # via
+    #   -c .pin/../.pin/constraints-rocm-torch.txt
+    #   -r benchmarks/purejaxrl/requirements.in
+    #   rlax
+dm-env==1.6
+    # via
+    #   -c .pin/../.pin/constraints-rocm-torch.txt
+    #   brax
+    #   rlax
+dm-tree==0.1.8
+    # via
+    #   -c .pin/../.pin/constraints-rocm-torch.txt
+    #   dm-env
+    #   tensorflow-probability
+docker-pycreds==0.4.0
+    # via
+    #   -c .pin/../.pin/constraints-rocm-torch.txt
+    #   wandb
+docstring-parser==0.16
+    # via
+    #   -c
.pin/../.pin/constraints-rocm-torch.txt + # tyro +dotmap==1.3.30 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # evosax +etils[epath,epy]==1.9.4 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # brax + # mujoco + # mujoco-mjx + # optax + # orbax-checkpoint +evosax==0.1.6 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/purejaxrl/requirements.in +exceptiongroup==1.2.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # pytest +executing==2.1.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # varname +farama-notifications==0.0.4 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # gymnasium +filelock==3.16.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # pytorch-triton-rocm + # torch +flake8==7.1.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # navix +flashbax==0.1.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/purejaxrl/requirements.in +flask==3.0.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # brax + # flask-cors +flask-cors==5.0.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # brax +flax==0.9.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/purejaxrl/requirements.in + # brax + # evosax + # flashbax + # gymnax + # navix +fonttools==4.54.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # matplotlib +fsspec==2024.6.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # etils + # torch +gast==0.6.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # tensorflow-probability +gitdb==4.0.11 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # gitpython +gitpython==3.1.43 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # wandb +giving==0.4.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # ptera + # voir +glfw==2.7.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # mujoco +grpcio==1.66.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # brax +gym==0.26.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # brax + # gymnax +gym-notices==0.0.8 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # gym +gymnasium==0.29.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # gymnax +gymnax==0.0.8 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -c .pin/../constraints/rocm.txt + # -r benchmarks/purejaxrl/requirements.in +hjson==3.1.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # argklass +humanize==4.10.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # orbax-checkpoint +idna==3.10 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # requests +importlib-resources==6.4.5 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # argklass + # etils +iniconfig==2.0.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # pytest +isort==5.13.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # pylint +itsdangerous==2.2.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # flask +jax==0.4.33 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/purejaxrl/requirements.in + # brax + # chex + # distrax + # evosax + # flashbax + # flax + # gymnax + # jaxopt + # mujoco-mjx + # optax + # orbax-checkpoint + # rlax +jaxlib==0.4.33 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # brax + # chex + # distrax + # evosax + # flashbax + # gymnax + # jax + # jaxopt + # mujoco-mjx + # optax + # orbax-checkpoint + # rlax +jaxopt==0.8.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + 
# brax +jinja2==3.1.4 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # brax + # flask + # torch +kiwisolver==1.4.7 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # matplotlib +markdown-it-py==3.0.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # rich +markupsafe==2.1.5 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # jinja2 + # werkzeug +matplotlib==3.9.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # evosax + # gymnax + # seaborn +mccabe==0.7.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # flake8 + # pylint +mdurl==0.1.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # markdown-it-py +ml-collections==0.1.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # brax +ml-dtypes==0.5.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # jax + # jaxlib + # tensorstore +mpmath==1.3.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # sympy +msgpack==1.1.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # flax + # orbax-checkpoint +mujoco==3.2.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # brax + # mujoco-mjx +mujoco-mjx==3.2.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # brax +mypy-extensions==1.0.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # black +navix==0.7.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/purejaxrl/requirements.in +nest-asyncio==1.6.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # orbax-checkpoint +networkx==3.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # torch +numpy==1.26.4 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/purejaxrl/requirements.in + # brax + # chex + # contourpy + # distrax + # dm-env + # evosax + # flashbax + # gym + # gymnasium + # jax + # jaxlib + # jaxopt + # matplotlib + # ml-dtypes + # mujoco + # navix + # optax + # orbax-checkpoint + # pandas + # rlax + # scipy + # seaborn + # tensorboardx + # tensorflow-probability + # tensorstore + # trimesh +nvidia-ml-py==12.560.30 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir +omegaconf==2.3.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir +opt-einsum==3.4.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # jax +optax==0.2.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/purejaxrl/requirements.in + # brax + # flax +orbax-checkpoint==0.6.4 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # brax + # flax +ovld==0.3.9 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir +packaging==24.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # black + # matplotlib + # pytest + # setuptools-scm + # tensorboardx +pandas==2.2.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # seaborn +pathspec==0.12.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # black +pillow==10.4.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # brax + # matplotlib + # navix +platformdirs==4.3.6 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # black + # pylint + # wandb +pluggy==1.5.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # pytest +protobuf==5.28.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # orbax-checkpoint + # tensorboardx + # wandb +psutil==5.9.8 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir + # wandb +ptera==1.4.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir +pycodestyle==2.12.1 + # via + # -c 
.pin/../.pin/constraints-rocm-torch.txt + # flake8 +pyflakes==3.2.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # flake8 +pygments==2.18.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # rich +pylint==3.3.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # navix +pyopengl==3.1.7 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # mujoco +pyparsing==3.1.4 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # matplotlib +pytest==8.3.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # navix +python-dateutil==2.9.0.post0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # matplotlib + # pandas +pytinyrenderer==0.0.14 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # brax +pytorch-triton-rocm==3.0.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # torch +pytz==2024.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # pandas +pyyaml==6.0.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # evosax + # flax + # gymnax + # ml-collections + # omegaconf + # orbax-checkpoint + # wandb +reactivex==4.0.4 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # giving +requests==2.32.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # wandb +rich==13.9.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # flax + # tyro + # voir +rlax==0.1.6 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # navix +scipy==1.14.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # brax + # jax + # jaxlib + # jaxopt + # mujoco-mjx +seaborn==0.13.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # gymnax +sentry-sdk==2.15.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # wandb +setproctitle==1.3.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # wandb +setuptools-scm==8.1.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # navix +shtab==1.7.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # tyro +six==1.16.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # asttokens + # docker-pycreds + # ml-collections + # python-dateutil + # tensorflow-probability +smmap==5.0.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # gitdb +sympy==1.13.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # torch +tensorboardx==2.6.2.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # brax +tensorflow-probability==0.24.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # distrax +tensorstore==0.1.66 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # flashbax + # flax + # orbax-checkpoint +tomli==2.0.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # black + # pylint + # pytest + # setuptools-scm +tomlkit==0.13.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # pylint +toolz==0.12.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # chex +torch==2.4.1+rocm6.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/purejaxrl/requirements.in +trimesh==4.4.9 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # brax + # mujoco-mjx +typing-extensions==4.12.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # astroid + # black + # brax + # chex + # etils + # flashbax + # flax + # gymnasium + # navix + # orbax-checkpoint + # reactivex + # rich + # torch + # tyro +tyro==0.8.11 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # navix +tzdata==2024.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # pandas +urllib3==2.2.3 + # via + # -c 
.pin/../.pin/constraints-rocm-torch.txt + # requests + # sentry-sdk +varname==0.13.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # giving +voir==0.2.19 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -c .pin/../constraints/rocm.txt + # -r benchmarks/purejaxrl/requirements.in +wandb==0.18.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # navix +werkzeug==3.0.4 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # flask +zipp==3.20.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # etils + +# The following packages are considered to be unsafe in a requirements file: +# setuptools diff --git a/benchmarks/recursiongfn/requirements.rocm.txt b/benchmarks/recursiongfn/requirements.rocm.txt index 1bc73f14..bcb64cdb 100644 --- a/benchmarks/recursiongfn/requirements.rocm.txt +++ b/benchmarks/recursiongfn/requirements.rocm.txt @@ -2,201 +2,198 @@ # This file is autogenerated by pip-compile with Python 3.10 # by the following command: # -# pip-compile --output-file=benchmarks/recursiongfn/requirements.rocm.txt .pin/tmp-constraints-rocm-recursiongfn_gnn.txt benchmarks/recursiongfn/requirements.in +# pip-compile --output-file=benchmarks/recursiongfn/requirements.rocm.txt .pin/tmp-constraints-rocm-recursiongfn.txt benchmarks/recursiongfn/requirements.in # ---extra-index-url https://download.pytorch.org/whl/rocm6.0 +--extra-index-url https://download.pytorch.org/whl/rocm6.1 absl-py==2.1.0 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # tensorboard -aiohappyeyeballs==2.4.0 +aiohappyeyeballs==2.4.3 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # aiohttp -aiohttp==3.10.5 +aiohttp==3.10.8 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # torch-geometric aiosignal==1.3.1 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # aiohttp antlr4-python3-runtime==4.9.3 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # omegaconf asttokens==2.4.1 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # giving async-timeout==4.0.3 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # aiohttp attrs==24.2.0 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # aiohttp blosc2==2.7.1 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # tables -botorch==0.11.3 +botorch==0.12.0 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt - # gflownet -certifi==2024.7.4 + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/recursiongfn/requirements.in +certifi==2024.8.30 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # requests # sentry-sdk charset-normalizer==3.3.2 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # requests click==8.1.7 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # wandb -codefind==0.1.6 +codefind==0.1.7 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # ptera cvxopt==1.3.2 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt - # gflownet + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/recursiongfn/requirements.in 
docker-pycreds==0.4.0 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # wandb -executing==1.2.0 +executing==2.1.0 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # varname -filelock==3.15.4 +filelock==3.16.1 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # pytorch-triton-rocm # torch frozenlist==1.4.1 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # aiohttp # aiosignal fsspec==2024.6.1 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # torch # torch-geometric -gflownet @ git+https://github.com/Delaunay/gflownet@milabench - # via - # -c .pin/../.pin/constraints-rocm-gnn.txt - # -r benchmarks/recursiongfn/requirements.in gitdb==4.0.11 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # gitpython gitpython==3.1.43 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt - # gflownet + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/recursiongfn/requirements.in # wandb -giving==0.4.2 +giving==0.4.3 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # ptera # voir -gpytorch==1.12 +gpytorch==1.13 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/recursiongfn/requirements.in # botorch - # gflownet -grpcio==1.65.5 +grpcio==1.66.2 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # tensorboard -idna==3.7 +idna==3.10 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # requests # yarl -jaxtyping==0.2.33 +jaxtyping==0.2.19 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt + # gpytorch # linear-operator jinja2==3.1.4 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # torch # torch-geometric joblib==1.4.2 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # scikit-learn -linear-operator==0.5.2 +linear-operator==0.5.3 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # botorch # gpytorch markdown==3.7 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # tensorboard markdown-it-py==3.0.0 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # rich markupsafe==2.1.5 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # jinja2 # werkzeug mdurl==0.1.2 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # markdown-it-py mpmath==1.3.0 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # botorch # gpytorch + # linear-operator # sympy -msgpack==1.0.8 +msgpack==1.1.0 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # blosc2 -multidict==6.0.5 +multidict==6.1.0 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # aiohttp # yarl multipledispatch==1.0.0 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # botorch -ndindex==1.8 +ndindex==1.9.2 # via - # -c 
.pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # blosc2 networkx==3.3 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt - # gflownet + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/recursiongfn/requirements.in # torch numexpr==2.10.1 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # blosc2 # tables numpy==1.26.4 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # blosc2 - # botorch + # jaxtyping # numexpr - # opt-einsum # pandas # pyarrow # pyro-ppl @@ -206,239 +203,240 @@ numpy==1.26.4 # tables # tensorboard # torch-geometric +nvidia-ml-py==12.560.30 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir omegaconf==2.3.0 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt - # gflownet + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/recursiongfn/requirements.in # voir -opt-einsum==3.3.0 +opt-einsum==3.4.0 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # pyro-ppl -ovld==0.3.8 +ovld==0.3.9 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # voir packaging==24.1 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # tables # tensorboard -pandas==2.2.2 +pandas==2.2.3 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt - # gflownet + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/recursiongfn/requirements.in pillow==10.4.0 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # rdkit -platformdirs==4.2.2 +platformdirs==4.3.6 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # wandb -protobuf==5.27.3 +protobuf==5.28.2 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # tensorboard # wandb psutil==5.9.8 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # torch-geometric # voir # wandb ptera==1.4.1 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # voir py-cpuinfo==9.0.0 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # blosc2 # tables pyarrow==17.0.0 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt - # gflownet + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/recursiongfn/requirements.in pygments==2.18.0 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # rich -pynvml==11.5.3 - # via - # -c .pin/../.pin/constraints-rocm-gnn.txt - # voir -pyparsing==3.1.2 +pyparsing==3.1.4 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # torch-geometric pyro-api==0.1.2 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # pyro-ppl pyro-ppl==1.9.1 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/recursiongfn/requirements.in # botorch - # gflownet python-dateutil==2.9.0.post0 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # pandas pytorch-triton-rocm==3.0.0 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # torch -pytz==2024.1 +pytz==2024.2 # via - # -c 
.pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # pandas pyyaml==6.0.2 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # omegaconf # wandb rdkit==2024.3.5 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt - # gflownet + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/recursiongfn/requirements.in reactivex==4.0.4 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # giving requests==2.32.3 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # torch-geometric # wandb -rich==13.7.1 +rich==13.9.1 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # voir -scikit-learn==1.5.1 +scikit-learn==1.5.2 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # gpytorch - # torch-geometric -scipy==1.14.0 +scipy==1.14.1 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/recursiongfn/requirements.in # botorch - # gflownet # gpytorch # linear-operator # scikit-learn # torch-cluster - # torch-geometric # torch-sparse -sentry-sdk==2.13.0 +sentry-sdk==2.15.0 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # wandb setproctitle==1.3.3 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # wandb six==1.16.0 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # asttokens # docker-pycreds # python-dateutil # tensorboard smmap==5.0.1 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # gitdb -sympy==1.13.2 +sympy==1.13.3 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # torch tables==3.10.1 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt - # gflownet -tensorboard==2.17.1 + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/recursiongfn/requirements.in +tensorboard==2.18.0 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt - # gflownet + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/recursiongfn/requirements.in tensorboard-data-server==0.7.2 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # tensorboard threadpoolctl==3.5.0 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # scikit-learn -torch==2.4.0+rocm6.0 +torch==2.4.1+rocm6.1 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/recursiongfn/requirements.in # botorch - # gflownet # linear-operator # pyro-ppl torch-cluster==1.6.3 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt - # gflownet -torch-geometric==2.5.3 + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/recursiongfn/requirements.in +torch-geometric==2.6.1 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt - # gflownet + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/recursiongfn/requirements.in torch-scatter==2.1.2 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt - # gflownet + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/recursiongfn/requirements.in torch-sparse==0.6.18 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt - # gflownet + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r 
benchmarks/recursiongfn/requirements.in tqdm==4.66.5 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # pyro-ppl # torch-geometric -typeguard==2.13.3 +typeguard==4.3.0 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # jaxtyping - # linear-operator typing-extensions==4.12.2 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt + # botorch + # jaxtyping + # multidict # reactivex + # rich # tables # torch -tzdata==2024.1 + # typeguard +tzdata==2024.2 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # pandas -urllib3==2.2.2 +urllib3==2.2.3 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # requests # sentry-sdk -varname==0.10.0 +varname==0.13.3 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # giving -voir==0.2.17 +voir==0.2.19 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # -c .pin/../constraints/rocm.txt # -r benchmarks/recursiongfn/requirements.in -wandb==0.17.7 +wandb==0.18.3 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt - # gflownet -werkzeug==3.0.3 + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/recursiongfn/requirements.in +werkzeug==3.0.4 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # tensorboard -yarl==1.9.4 +yarl==1.13.1 # via - # -c .pin/../.pin/constraints-rocm-gnn.txt + # -c .pin/../.pin/constraints-rocm-torch.txt # aiohttp # The following packages are considered to be unsafe in a requirements file: diff --git a/benchmarks/rlhf/requirements.in b/benchmarks/rlhf/requirements.in index 045bca09..1cb6cd24 100644 --- a/benchmarks/rlhf/requirements.in +++ b/benchmarks/rlhf/requirements.in @@ -4,3 +4,4 @@ trl accelerate transformers datasets +einops \ No newline at end of file diff --git a/benchmarks/rlhf/requirements.rocm.txt b/benchmarks/rlhf/requirements.rocm.txt new file mode 100644 index 00000000..5b7f2726 --- /dev/null +++ b/benchmarks/rlhf/requirements.rocm.txt @@ -0,0 +1,313 @@ +# +# This file is autogenerated by pip-compile with Python 3.10 +# by the following command: +# +# pip-compile --output-file=benchmarks/rlhf/requirements.rocm.txt .pin/tmp-constraints-rocm-rlhf-gpus.txt benchmarks/rlhf/requirements.in +# +--extra-index-url https://download.pytorch.org/whl/rocm6.1 + +accelerate==0.34.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/rlhf/requirements.in + # trl +aiohappyeyeballs==2.4.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # aiohttp +aiohttp==3.10.8 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # datasets + # fsspec +aiosignal==1.3.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # aiohttp +antlr4-python3-runtime==4.9.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # omegaconf +asttokens==2.4.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # giving +async-timeout==4.0.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # aiohttp +attrs==24.2.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # aiohttp +certifi==2024.8.30 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # requests +charset-normalizer==3.3.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # requests +codefind==0.1.7 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # ptera 
+datasets==3.0.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/rlhf/requirements.in + # trl +dill==0.3.8 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # datasets + # multiprocess +docstring-parser==0.16 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # tyro +executing==2.1.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # varname +filelock==3.16.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # datasets + # huggingface-hub + # pytorch-triton-rocm + # torch + # transformers +frozenlist==1.4.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # aiohttp + # aiosignal +fsspec[http]==2024.6.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # datasets + # huggingface-hub + # torch +giving==0.4.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # ptera + # voir +huggingface-hub==0.25.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # accelerate + # datasets + # tokenizers + # transformers +idna==3.10 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # requests + # yarl +jinja2==3.1.4 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # torch +markdown-it-py==3.0.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # rich +markupsafe==2.1.5 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # jinja2 +mdurl==0.1.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # markdown-it-py +mpmath==1.3.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # sympy +multidict==6.1.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # aiohttp + # yarl +multiprocess==0.70.16 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # datasets +networkx==3.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # torch +numpy==1.26.4 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # accelerate + # datasets + # pandas + # pyarrow + # transformers + # trl +nvidia-ml-py==12.560.30 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir +omegaconf==2.3.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir +ovld==0.3.9 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir +packaging==24.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # accelerate + # datasets + # huggingface-hub + # transformers +pandas==2.2.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # datasets +psutil==5.9.8 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # accelerate + # voir +ptera==1.4.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir +pyarrow==17.0.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # datasets +pygments==2.18.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # rich +python-dateutil==2.9.0.post0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # pandas +pytorch-triton-rocm==3.0.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # torch +pytz==2024.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # pandas +pyyaml==6.0.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # accelerate + # datasets + # huggingface-hub + # omegaconf + # transformers +reactivex==4.0.4 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # giving +regex==2024.9.11 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # transformers +requests==2.32.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # datasets + # huggingface-hub + # transformers +rich==13.9.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # tyro + # voir +safetensors==0.4.5 + # via 
+ # -c .pin/../.pin/constraints-rocm-torch.txt + # accelerate + # transformers +shtab==1.7.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # tyro +six==1.16.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # asttokens + # python-dateutil +sympy==1.13.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # torch +tokenizers==0.19.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # transformers +torch==2.4.1+rocm6.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/rlhf/requirements.in + # accelerate + # trl +tqdm==4.66.5 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # datasets + # huggingface-hub + # transformers +transformers==4.44.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -c .pin/../constraints/rocm.txt + # -r benchmarks/rlhf/requirements.in + # trl +trl==0.10.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -c .pin/../constraints/rocm.txt + # -r benchmarks/rlhf/requirements.in +typing-extensions==4.12.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # huggingface-hub + # multidict + # reactivex + # rich + # torch + # tyro +tyro==0.8.11 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # trl +tzdata==2024.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # pandas +urllib3==2.2.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # requests +varname==0.13.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # giving +voir==0.2.19 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -c .pin/../constraints/rocm.txt + # -r benchmarks/rlhf/requirements.in +xxhash==3.5.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # datasets +yarl==1.13.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # aiohttp +einops \ No newline at end of file diff --git a/benchmarks/timm/requirements.rocm.txt b/benchmarks/timm/requirements.rocm.txt index 8383f9e6..18e83d95 100644 --- a/benchmarks/timm/requirements.rocm.txt +++ b/benchmarks/timm/requirements.rocm.txt @@ -4,7 +4,7 @@ # # pip-compile --output-file=benchmarks/timm/requirements.rocm.txt .pin/tmp-constraints-rocm-timm.txt benchmarks/timm/requirements.in # ---extra-index-url https://download.pytorch.org/whl/rocm6.0 +--extra-index-url https://download.pytorch.org/whl/rocm6.1 antlr4-python3-runtime==4.9.3 # via @@ -14,7 +14,7 @@ asttokens==2.4.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # giving -certifi==2024.7.4 +certifi==2024.8.30 # via # -c .pin/../.pin/constraints-rocm-torch.txt # requests @@ -22,15 +22,15 @@ charset-normalizer==3.3.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # requests -codefind==0.1.6 +codefind==0.1.7 # via # -c .pin/../.pin/constraints-rocm-torch.txt # ptera -executing==1.2.0 +executing==2.1.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # varname -filelock==3.15.4 +filelock==3.16.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # huggingface-hub @@ -41,16 +41,16 @@ fsspec==2024.6.1 # -c .pin/../.pin/constraints-rocm-torch.txt # huggingface-hub # torch -giving==0.4.2 +giving==0.4.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # ptera # voir -huggingface-hub==0.24.6 +huggingface-hub==0.25.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/timm/requirements.in -idna==3.7 +idna==3.10 # via # -c .pin/../.pin/constraints-rocm-torch.txt # requests @@ -82,11 +82,15 @@ numpy==1.26.4 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torchvision +nvidia-ml-py==12.560.30 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir 
omegaconf==2.3.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # voir -ovld==0.3.8 +ovld==0.3.9 # via # -c .pin/../.pin/constraints-rocm-torch.txt # voir @@ -110,10 +114,6 @@ pygments==2.18.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # rich -pynvml==11.5.3 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # voir pytorch-triton-rocm==3.0.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt @@ -132,11 +132,11 @@ requests==2.32.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # huggingface-hub -rich==13.7.1 +rich==13.9.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # voir -safetensors==0.4.4 +safetensors==0.4.5 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/timm/requirements.in @@ -144,16 +144,16 @@ six==1.16.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # asttokens -sympy==1.13.2 +sympy==1.13.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torch -torch==2.4.0+rocm6.0 +torch==2.4.1+rocm6.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/timm/requirements.in # torchvision -torchvision==0.19.0+rocm6.0 +torchvision==0.19.1+rocm6.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/timm/requirements.in @@ -166,12 +166,13 @@ typing-extensions==4.12.2 # -c .pin/../.pin/constraints-rocm-torch.txt # huggingface-hub # reactivex + # rich # torch -urllib3==2.2.2 +urllib3==2.2.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # requests -varname==0.10.0 +varname==0.13.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # giving diff --git a/benchmarks/torchatari/requirements.rocm.txt b/benchmarks/torchatari/requirements.rocm.txt index 71fd92e5..76fa829c 100644 --- a/benchmarks/torchatari/requirements.rocm.txt +++ b/benchmarks/torchatari/requirements.rocm.txt @@ -4,7 +4,7 @@ # # pip-compile --output-file=benchmarks/torchatari/requirements.rocm.txt .pin/tmp-constraints-rocm-torchatari.txt benchmarks/torchatari/requirements.in # ---extra-index-url https://download.pytorch.org/whl/rocm6.0 +--extra-index-url https://download.pytorch.org/whl/rocm6.1 absl-py==2.1.0 # via @@ -32,7 +32,7 @@ cloudpickle==3.0.0 # -c .pin/../.pin/constraints-rocm-torch.txt # gym # gymnasium -codefind==0.1.6 +codefind==0.1.7 # via # -c .pin/../.pin/constraints-rocm-torch.txt # ptera @@ -52,7 +52,7 @@ envpool==0.8.4 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/torchatari/requirements.in -executing==1.2.0 +executing==2.1.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # varname @@ -60,7 +60,7 @@ farama-notifications==0.0.4 # via # -c .pin/../.pin/constraints-rocm-torch.txt # gymnasium -filelock==3.15.4 +filelock==3.16.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # pytorch-triton-rocm @@ -69,16 +69,16 @@ fsspec==2024.6.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torch -giving==0.4.2 +giving==0.4.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # ptera # voir -grpcio==1.65.5 +grpcio==1.66.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # tensorboard -gym==0.23.1 +gym==0.26.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/torchatari/requirements.in @@ -91,7 +91,7 @@ gymnasium==0.29.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # envpool -importlib-resources==6.4.3 +importlib-resources==6.4.5 # via # -c .pin/../.pin/constraints-rocm-torch.txt # cantilever @@ -134,15 +134,19 @@ numpy==1.26.4 # gym # gymnasium # tensorboard +nvidia-ml-py==12.560.30 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir omegaconf==2.3.0 # via # -c 
.pin/../.pin/constraints-rocm-torch.txt # voir -optree==0.12.1 +optree==0.13.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # envpool -ovld==0.3.8 +ovld==0.3.9 # via # -c .pin/../.pin/constraints-rocm-torch.txt # voir @@ -151,7 +155,7 @@ packaging==24.1 # -c .pin/../.pin/constraints-rocm-torch.txt # envpool # tensorboard -protobuf==5.27.3 +protobuf==5.28.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # tensorboard @@ -167,10 +171,6 @@ pygments==2.18.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # rich -pynvml==11.5.3 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # voir pytorch-triton-rocm==3.0.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt @@ -183,7 +183,7 @@ reactivex==4.0.4 # via # -c .pin/../.pin/constraints-rocm-torch.txt # giving -rich==13.7.1 +rich==13.9.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # tyro @@ -197,11 +197,11 @@ six==1.16.0 # -c .pin/../.pin/constraints-rocm-torch.txt # asttokens # tensorboard -sympy==1.13.2 +sympy==1.13.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torch -tensorboard==2.17.1 +tensorboard==2.18.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/torchatari/requirements.in @@ -209,7 +209,7 @@ tensorboard-data-server==0.7.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # tensorboard -torch==2.4.0+rocm6.0 +torch==2.4.1+rocm6.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/torchatari/requirements.in @@ -218,7 +218,7 @@ torchcompat==1.1.4 # -c .pin/../.pin/constraints-rocm-torch.txt # -c .pin/../constraints/rocm.txt # -r benchmarks/torchatari/requirements.in -types-protobuf==5.27.0.20240626 +types-protobuf==5.28.0.20240924 # via # -c .pin/../.pin/constraints-rocm-torch.txt # envpool @@ -229,22 +229,23 @@ typing-extensions==4.12.2 # gymnasium # optree # reactivex + # rich # torch # tyro -tyro==0.8.8 +tyro==0.8.11 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/torchatari/requirements.in -varname==0.10.0 +varname==0.13.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # giving -voir==0.2.17 +voir==0.2.19 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -c .pin/../constraints/rocm.txt # -r benchmarks/torchatari/requirements.in -werkzeug==3.0.3 +werkzeug==3.0.4 # via # -c .pin/../.pin/constraints-rocm-torch.txt # tensorboard diff --git a/benchmarks/torchvision/requirements.rocm.txt b/benchmarks/torchvision/requirements.rocm.txt index 094eb29b..08dfdebf 100644 --- a/benchmarks/torchvision/requirements.rocm.txt +++ b/benchmarks/torchvision/requirements.rocm.txt @@ -4,7 +4,7 @@ # # pip-compile --output-file=benchmarks/torchvision/requirements.rocm.txt .pin/tmp-constraints-rocm-torchvision.txt benchmarks/torchvision/requirements.in # ---extra-index-url https://download.pytorch.org/whl/rocm6.0 +--extra-index-url https://download.pytorch.org/whl/rocm6.1 antlr4-python3-runtime==4.9.3 # via @@ -14,15 +14,15 @@ asttokens==2.4.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # giving -codefind==0.1.6 +codefind==0.1.7 # via # -c .pin/../.pin/constraints-rocm-torch.txt # ptera -executing==1.2.0 +executing==2.1.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # varname -filelock==3.15.4 +filelock==3.16.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # pytorch-triton-rocm @@ -31,12 +31,12 @@ fsspec==2024.6.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torch -giving==0.4.2 +giving==0.4.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # ptera # voir -importlib-resources==6.4.3 +importlib-resources==6.4.5 # via # -c 
.pin/../.pin/constraints-rocm-torch.txt # torchcompat @@ -68,11 +68,15 @@ numpy==1.26.4 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torchvision +nvidia-ml-py==12.560.30 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir omegaconf==2.3.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # voir -ovld==0.3.8 +ovld==0.3.9 # via # -c .pin/../.pin/constraints-rocm-torch.txt # voir @@ -92,10 +96,6 @@ pygments==2.18.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # rich -pynvml==11.5.3 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # voir pytorch-triton-rocm==3.0.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt @@ -108,7 +108,7 @@ reactivex==4.0.4 # via # -c .pin/../.pin/constraints-rocm-torch.txt # giving -rich==13.7.1 +rich==13.9.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # voir @@ -116,11 +116,11 @@ six==1.16.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # asttokens -sympy==1.13.2 +sympy==1.13.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torch -torch==2.4.0+rocm6.0 +torch==2.4.1+rocm6.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/torchvision/requirements.in @@ -130,7 +130,7 @@ torchcompat==1.1.4 # -c .pin/../.pin/constraints-rocm-torch.txt # -c .pin/../constraints/rocm.txt # -r benchmarks/torchvision/requirements.in -torchvision==0.19.0+rocm6.0 +torchvision==0.19.1+rocm6.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/torchvision/requirements.in @@ -142,8 +142,9 @@ typing-extensions==4.12.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # reactivex + # rich # torch -varname==0.10.0 +varname==0.13.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # giving diff --git a/benchmarks/torchvision_ddp/requirements.rocm.txt b/benchmarks/torchvision_ddp/requirements.rocm.txt index d1241db8..9eed9442 100644 --- a/benchmarks/torchvision_ddp/requirements.rocm.txt +++ b/benchmarks/torchvision_ddp/requirements.rocm.txt @@ -4,7 +4,7 @@ # # pip-compile --output-file=benchmarks/torchvision_ddp/requirements.rocm.txt .pin/tmp-constraints-rocm-torchvision.txt benchmarks/torchvision_ddp/requirements.in # ---extra-index-url https://download.pytorch.org/whl/rocm6.0 +--extra-index-url https://download.pytorch.org/whl/rocm6.1 antlr4-python3-runtime==4.9.3 # via @@ -14,15 +14,15 @@ asttokens==2.4.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # giving -codefind==0.1.6 +codefind==0.1.7 # via # -c .pin/../.pin/constraints-rocm-torch.txt # ptera -executing==1.2.0 +executing==2.1.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # varname -filelock==3.15.4 +filelock==3.16.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # pytorch-triton-rocm @@ -31,12 +31,12 @@ fsspec==2024.6.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torch -giving==0.4.2 +giving==0.4.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # ptera # voir -importlib-resources==6.4.3 +importlib-resources==6.4.5 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torchcompat @@ -68,11 +68,15 @@ numpy==1.26.4 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torchvision +nvidia-ml-py==12.560.30 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir omegaconf==2.3.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # voir -ovld==0.3.8 +ovld==0.3.9 # via # -c .pin/../.pin/constraints-rocm-torch.txt # voir @@ -92,10 +96,6 @@ pygments==2.18.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # rich -pynvml==11.5.3 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # voir pytorch-triton-rocm==3.0.0 # via # 
-c .pin/../.pin/constraints-rocm-torch.txt @@ -108,7 +108,7 @@ reactivex==4.0.4 # via # -c .pin/../.pin/constraints-rocm-torch.txt # giving -rich==13.7.1 +rich==13.9.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # voir @@ -116,11 +116,11 @@ six==1.16.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # asttokens -sympy==1.13.2 +sympy==1.13.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torch -torch==2.4.0+rocm6.0 +torch==2.4.1+rocm6.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/torchvision_ddp/requirements.in @@ -130,7 +130,7 @@ torchcompat==1.1.4 # -c .pin/../.pin/constraints-rocm-torch.txt # -c .pin/../constraints/rocm.txt # -r benchmarks/torchvision_ddp/requirements.in -torchvision==0.19.0+rocm6.0 +torchvision==0.19.1+rocm6.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/torchvision_ddp/requirements.in @@ -142,8 +142,9 @@ typing-extensions==4.12.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # reactivex + # rich # torch -varname==0.10.0 +varname==0.13.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # giving diff --git a/benchmarks/vjepa/requirements.rocm.txt b/benchmarks/vjepa/requirements.rocm.txt new file mode 100644 index 00000000..a473fac7 --- /dev/null +++ b/benchmarks/vjepa/requirements.rocm.txt @@ -0,0 +1,247 @@ +# +# This file is autogenerated by pip-compile with Python 3.10 +# by the following command: +# +# pip-compile --output-file=benchmarks/vjepa/requirements.rocm.txt .pin/tmp-constraints-rocm-vjepa-gpus.txt benchmarks/vjepa/requirements.in +# +--extra-index-url https://download.pytorch.org/whl/rocm6.1 + +antlr4-python3-runtime==4.9.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # omegaconf +asttokens==2.4.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # giving +beartype==0.19.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/vjepa/requirements.in +braceexpand==0.1.7 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/vjepa/requirements.in + # webdataset +certifi==2024.8.30 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # requests +charset-normalizer==3.3.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # requests +cloudpickle==3.0.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # submitit +codefind==0.1.7 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # ptera +decord==0.6.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/vjepa/requirements.in +einops==0.8.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/vjepa/requirements.in +executing==2.1.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # varname +filelock==3.16.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # huggingface-hub + # pytorch-triton-rocm + # torch +fsspec==2024.6.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # huggingface-hub + # torch +giving==0.4.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # ptera + # voir +huggingface-hub==0.25.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # timm +idna==3.10 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # requests +jinja2==3.1.4 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # torch +markdown-it-py==3.0.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # rich +markupsafe==2.1.5 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # jinja2 +mdurl==0.1.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # markdown-it-py +mpmath==1.3.0 + # via 
+ # -c .pin/../.pin/constraints-rocm-torch.txt + # sympy +networkx==3.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # torch +numpy==1.26.4 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/vjepa/requirements.in + # decord + # opencv-python + # pandas + # torchvision + # webdataset +nvidia-ml-py==12.560.30 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir +omegaconf==2.3.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir +opencv-python==4.10.0.84 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/vjepa/requirements.in +ovld==0.3.9 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir +packaging==24.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # huggingface-hub +pandas==2.2.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/vjepa/requirements.in +pillow==10.4.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # torchvision +psutil==5.9.8 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir +ptera==1.4.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir +pygments==2.18.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # rich +python-dateutil==2.9.0.post0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # pandas +pytorch-triton-rocm==3.0.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # torch +pytz==2024.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # pandas +pyyaml==6.0.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/vjepa/requirements.in + # huggingface-hub + # omegaconf + # timm + # webdataset +reactivex==4.0.4 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # giving +requests==2.32.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # huggingface-hub +rich==13.9.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # voir +safetensors==0.4.5 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # timm +six==1.16.0 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # asttokens + # python-dateutil +submitit==1.5.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/vjepa/requirements.in +sympy==1.13.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # torch +timm==1.0.9 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/vjepa/requirements.in +torch==2.4.1+rocm6.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/vjepa/requirements.in + # timm + # torchvision +torchvision==0.19.1+rocm6.1 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/vjepa/requirements.in + # timm +tqdm==4.66.5 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # huggingface-hub +typing-extensions==4.12.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # huggingface-hub + # reactivex + # rich + # submitit + # torch +tzdata==2024.2 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # pandas +urllib3==2.2.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # requests +varname==0.13.3 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # giving +voir==0.2.19 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -c .pin/../constraints/rocm.txt + # -r benchmarks/vjepa/requirements.in +webdataset==0.2.100 + # via + # -c .pin/../.pin/constraints-rocm-torch.txt + # -r benchmarks/vjepa/requirements.in diff --git a/config/base.yaml b/config/base.yaml index 1a64d550..b9b104d7 100644 --- a/config/base.yaml +++ b/config/base.yaml @@ -66,7 
+66,7 @@ llama:
   definition: ../benchmarks/llama
   group: llm
   install_group: torch
-  max_duration: 800
+  max_duration: 3600
   tags:
     - nlp
     - llm
@@ -700,6 +700,7 @@ _purejaxrl:
     - monogpu
     - gym
     - rl
+    - jax
   definition: ../benchmarks/purejaxrl
   plan:
     method: per_gpu
@@ -891,6 +892,7 @@ cleanrljax:
   definition: ../benchmarks/cleanrl_jax
   tags:
     - monogpu
+    - jax
   plan:
     method: per_gpu
diff --git a/constraints/rocm.txt b/constraints/rocm.txt
index b86ce00d..cc158557 100644
--- a/constraints/rocm.txt
+++ b/constraints/rocm.txt
@@ -1,7 +1,20 @@
---extra-index-url https://download.pytorch.org/whl/rocm6.0
+--extra-index-url https://download.pytorch.org/whl/rocm6.1
 #
 #
 voir >= 0.2.19
 torchcompat >= 1.0.0
 gymnax >= 0.0.8
+
+
+trl<0.11.0
+
+# the latest torchtune is slower than before and causes failures;
+# the next version of pytorch seems to work better,
+# so pending a new pytorch release this is what we get
+torchtune<0.3.0
+
+# transformers added torchao support recently,
+# but only in its most recent version, which we do not support yet
+transformers<4.45.0
+torchao
\ No newline at end of file
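A constraints file like the one above only caps versions during dependency resolution; it never installs anything by itself, which is why every generated requirements file in this series points back at it through `-c .pin/../constraints/rocm.txt` lines. A minimal sketch of what a pin such as `transformers<4.45.0` means, using the `packaging` library (an assumption for illustration, not code from this repository):

    from packaging.specifiers import SpecifierSet
    from packaging.version import Version

    # the pin excludes the newest release, whose torchao support is not handled yet
    spec = SpecifierSet("<4.45.0")
    print(Version("4.44.2") in spec)  # True  -> still resolvable
    print(Version("4.45.0") in spec)  # False -> rejected by the constraint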
-d "$MILABENCH_WORDIR/milabench" ]; then + git clone https://github.com/mila-iqia/milabench.git -b rocm + fi + export MILABENCH_SOURCE="$MILABENCH_WORDIR/milabench" + fi . $MILABENCH_WORDIR/env/bin/activate - pip install -e $MILABENCH_WORDIR/milabench + pip install -e $MILABENCH_SOURCE + # # Install milabench's benchmarks in their venv # - milabench install + # pip install torch --index-url https://download.pytorch.org/whl/rocm6.1 + # milabench pin --variant rocm --from-scratch $ARGS + milabench install $ARGS # # Override/add package to milabench venv here @@ -36,35 +60,48 @@ install_prepare() { ( . $BENCHMARK_VENV/bin/activate + pip install ninja + + if [ -z "${MILABENCH_HF_TOKEN}" ]; then + echo "Missing token" + else + huggingface-cli login --token $MILABENCH_HF_TOKEN + fi + # # Override/add package to the benchmark venv here # which pip - pip uninstall torch torchvision torchaudio - pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.1 - pip uninstall pynvml - # sudo apt-get install lld # https://github.com/ROCm/jax/releases/tag/rocm-jaxlib-v0.4.30 - # does not really work pip install https://github.com/ROCm/jax/releases/download/rocm-jaxlib-v0.4.30/jaxlib-0.4.30+rocm611-cp310-cp310-manylinux2014_x86_64.whl pip install https://github.com/ROCm/jax/archive/refs/tags/rocm-jaxlib-v0.4.30.tar.gz - # - FORCE_CUDA=1 pip install -U -v --no-build-isolation git+https://github.com/rusty1s/pytorch_cluster.git - FORCE_CUDA=1 pip install -U -v --no-build-isolation git+https://github.com/rusty1s/pytorch_scatter.git - FORCE_CUDA=1 pip install -U -v --no-build-isolation git+https://github.com/rusty1s/pytorch_sparse.git + pip uninstall torch_cluster torch_scatter torch_sparse -y + FORCE_ONLY_CUDA=1 pip install -U -v --use-pep517 --no-build-isolation git+https://github.com/rusty1s/pytorch_cluster.git + FORCE_ONLY_CUDA=1 pip install -U -v --use-pep517 --no-build-isolation git+https://github.com/rusty1s/pytorch_scatter.git + FORCE_ONLY_CUDA=1 pip install -U -v --use-pep517 --no-build-isolation git+https://github.com/rusty1s/pytorch_sparse.git # takes forever to compile # https://github.com/ROCm/xformers - pip install -v -U --no-build-isolation --no-deps git+https://github.com/ROCm/xformers.git@develop#egg=xformers - pip install -v -U --no-build-isolation --no-deps git+https://github.com/ROCm/flash-attention.git + pip uninstall xformers + pip install xformers --index-url https://download.pytorch.org/whl/rocm6.1 + # pip install -v -U --no-build-isolation --no-deps git+https://github.com/ROCm/xformers.git@develop#egg=xformers + # pip install -v -U --no-build-isolation --no-deps git+https://github.com/facebookresearch/xformers.git + # pip install xformers -U --index-url https://download.pytorch.org/whl/rocm6.1 + + pip uninstall flash-attention + pip install -v -U --no-build-isolation --use-pep517 --no-deps git+https://github.com/ROCm/flash-attention.git + pip uninstall pynvml nvidia-ml-py -y + + pip install einops ) + pip uninstall pynvml nvidia-ml-py -y # # Generate/download datasets, download models etc... # - milabench prepare + milabench prepare $ARGS } if [ ! -d "$MILABENCH_WORDIR" ]; then @@ -74,11 +111,12 @@ else . 
 
-cd $MILABENCH_WORDIR
+
+milabench prepare $ARGS
 
 #
 # Run the benchmarks
-milabench run "$@"
+milabench run $ARGS
 
 #
 # Display report

From 491505f0c72248824a26926df05e22f4c764ecbe Mon Sep 17 00:00:00 2001
From: Setepenre
Date: Thu, 21 Nov 2024 13:15:47 -0500
Subject: [PATCH 11/20] Multirun system (#308)

* ROCm changes
* Update ping
* -
* Cleanup the rocm script
* use rocm branch
* -
* New multi run system
* multinode tweaks
* make sure system config is applied before running
* Update matrix run
* Tweaks

---------

Co-authored-by: Your Name
---
 .../llm/recipes/full_finetune_distributed.py |  1 -
 config/base.yaml                             |  2 +-
 config/examples/system.yaml                  | 30 +++++++++
 config/scaling.yaml                          |  2 +
 milabench/_version.py                        |  7 ++-
 milabench/alt_async.py                       |  2 +
 milabench/cli/run.py                         | 42 +++++++++----
 milabench/commands/__init__.py               |  8 ++-
 milabench/compare.py                         |  2 +
 milabench/config.py                          | 10 ++-
 milabench/remote.py                          |  4 +-
 milabench/sizer.py                           | 25 +++++---
 milabench/system.py                          | 63 ++++++++++++++++++-
 scripts/article/run_rocm.sh                  | 25 +++++++-
 tests/test_system_matrix.py                  | 40 ++++++++++++
 15 files changed, 226 insertions(+), 37 deletions(-)
 create mode 100644 tests/test_system_matrix.py

diff --git a/benchmarks/llm/recipes/full_finetune_distributed.py b/benchmarks/llm/recipes/full_finetune_distributed.py
index 19556ec7..f8d58e2f 100755
--- a/benchmarks/llm/recipes/full_finetune_distributed.py
+++ b/benchmarks/llm/recipes/full_finetune_distributed.py
@@ -100,7 +100,6 @@ class FullFinetuneRecipeDistributed(FTRecipeInterface):
     """
 
     def __init__(self, cfg: DictConfig) -> None:
-        import os
 
         self._device = acc.fetch_device(int(os.getenv("LOCAL_RANK", "0")))
         self._dtype = utils.get_dtype(cfg.dtype, device=self._device)
diff --git a/config/base.yaml b/config/base.yaml
index b9b104d7..38dfc4d3 100644
--- a/config/base.yaml
+++ b/config/base.yaml
@@ -208,7 +208,7 @@ resnet50-noio:
   inherits: _torchvision
   voir:
     options:
-      stop: 1000
+      stop: 500
       interval: "1s"
 
   tags:
diff --git a/config/examples/system.yaml b/config/examples/system.yaml
index 7b84c48d..78cf3957 100644
--- a/config/examples/system.yaml
+++ b/config/examples/system.yaml
@@ -26,3 +26,33 @@ system:
       ip: 192.168.11.13
       main: false
       user: username
+
+
+
+
+multirun:
+  runs:
+    # Force batch size to populate the sizing model
+    - name: "bs{sizer.batch_size}"
+      matrix:
+        sizer.auto: 1
+        sizer.batch_size: [1, 2, 4, 8, 16, 32, 64, 128]
+        sizer.save: ["scaling.yaml"]
+
+    # Matrix run
+    - name: "c{sizer.capacity}_m{sizer.multiple}_w{cpu.n_workers}"
+      matrix:
+        cpu.auto: 1
+        cpu.n_workers: [2, 4, 8, 16, 32]
+        sizer.auto: 1
+        sizer.capacity: [4Go, 8Go, 16Go, 32Go, 64Go, All]
+        sizer.multiple: 8
+        sizer.save: ["scaling.yaml"]
+
+    # Auto run
+    - name: "auto"
+      matrix:
+        cpu.auto: 1
+        sizer.auto: 1
+        sizer.multiple: 8
+        sizer.save: ["scaling.yaml"]
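The semantics of a `matrix` entry can be read off the implementation later in this patch: list values combine as a cartesian product, scalars apply to every run, and `name` is a template rendered from the flattened keys. Counting the three entries above gives 8 + 30 + 1 = 39 runs, which is exactly what the new test at the end of this patch asserts. A rough standalone sketch in plain Python, not code from the patch:

    from itertools import product

    matrix = {
        "cpu.n_workers": [2, 4, 8, 16, 32],
        "sizer.capacity": ["4Go", "8Go", "16Go", "32Go", "64Go", "All"],
        "sizer.multiple": 8,  # scalar: broadcast to every run
    }

    # wrap scalars so every key combines uniformly
    values = [v if isinstance(v, list) else [v] for v in matrix.values()]
    for combo in product(*values):
        run = dict(zip(matrix, combo))
        print("c{}_m{}_w{}".format(
            run["sizer.capacity"], run["sizer.multiple"], run["cpu.n_workers"]))
    # -> c4Go_m8_w2, c8Go_m8_w2, ... 30 names for this entry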
"784b38e77b90116047e3de893c22c2f7d3225179" +__date__ = "2024-10-18 15:58:46 +0000" + diff --git a/milabench/alt_async.py b/milabench/alt_async.py index 8608196d..6fc9f64c 100644 --- a/milabench/alt_async.py +++ b/milabench/alt_async.py @@ -190,6 +190,8 @@ def run(argv, setsid=None, process_accumulator=None, info={}, **kwargs): destroy(*mx.processes) yield entry + # mx.close() + def proceed(coro): loop = FeedbackEventLoop() diff --git a/milabench/cli/run.py b/milabench/cli/run.py index f5e75b70..f04427af 100644 --- a/milabench/cli/run.py +++ b/milabench/cli/run.py @@ -23,6 +23,7 @@ from ..report import make_report from ..sizer import MemoryUsageExtractor from ..summary import make_summary +from ..system import multirun, apply_system, SizerOptions, option # fmt: off @@ -72,12 +73,7 @@ def _fetch_arch(mp): return None -@tooled -def cli_run(args=None): - """Run the benchmarks.""" - if args is None: - args = arguments() - +def run(mp, args, name): layers = validation_names(args.validations) dash_class = { @@ -85,13 +81,7 @@ def cli_run(args=None): "long": LongDashFormatter, "no": None, }.get(args.dash, None) - - mp = get_multipack(run_name=args.run_name) - arch = _fetch_arch(mp) - - # Initialize the backend here so we can retrieve GPU stats - init_arch(arch) - + success = run_with_loggers( mp.do_run(repeat=args.repeat), loggers=[ @@ -136,3 +126,29 @@ def cli_run(args=None): ) return success + + +@tooled +def cli_run(args=None): + """Run the benchmarks.""" + if args is None: + args = arguments() + + # Load the configuration and system + mp = get_multipack(run_name=args.run_name) + arch = _fetch_arch(mp) + + # Initialize the backend here so we can retrieve GPU stats + init_arch(arch) + + success = 0 + for name, conf in multirun(): + run_name = name or args.run_name + + # Note that this function overrides the system config + mp = get_multipack(run_name=run_name) + + with apply_system(conf): + success += run(mp, args, run_name) + + return success diff --git a/milabench/commands/__init__.py b/milabench/commands/__init__.py index e97ac4e5..4a8f1e90 100644 --- a/milabench/commands/__init__.py +++ b/milabench/commands/__init__.py @@ -451,6 +451,11 @@ def _find_node_config(self) -> Dict: return {} def is_local(self): + local = self._is_local() + print("is_local", self.host, local) + return local + + def _is_local(self): localnode = self.pack.config["system"]["self"] if localnode is not None: @@ -581,7 +586,7 @@ def node_address(node): """Favour Hostname as it is the most consistent name across machines""" host = node.get("hostname") ip = node.get("ip") - return host or ip + return ip or hostname class ForeachNode(ListCommand): @@ -637,6 +642,7 @@ def executors(self): **self.options ) + print(rank, node, node_address(node)) worker = SSHCommand( host=node_address(node), user=node["user"], diff --git a/milabench/compare.py b/milabench/compare.py index d4d6299e..32f95c64 100644 --- a/milabench/compare.py +++ b/milabench/compare.py @@ -26,6 +26,7 @@ def fetch_runs(folder, filter): runs = [] ignored = 0 + for run in os.listdir(folder): if run.startswith("install") or run.startswith("prepare"): continue @@ -43,6 +44,7 @@ def fetch_runs(folder, filter): date = retrieve_datetime_from_name(date) else: name = run + date = None if date is None: date = datetime.fromtimestamp(os.path.getmtime(pth)) diff --git a/milabench/config.py b/milabench/config.py index 039a85cc..9a2d519c 100644 --- a/milabench/config.py +++ b/milabench/config.py @@ -100,11 +100,15 @@ def combine_args(args, kwargs): yield kwargs else: key, 
diff --git a/milabench/config.py b/milabench/config.py
index 039a85cc..9a2d519c 100644
--- a/milabench/config.py
+++ b/milabench/config.py
@@ -100,11 +100,15 @@ def combine_args(args, kwargs):
         yield kwargs
     else:
         key, values = args.popitem()
-        for value in values:
-            kwargs[key] = value
+
+        try:
+            for value in values:
+                kwargs[key] = value
+                yield from combine_args(deepcopy(args), kwargs)
+        except TypeError:
+            # a scalar (non-iterable) value is used as-is
+            kwargs[key] = values
             yield from combine_args(deepcopy(args), kwargs)
 
-
 def expand_matrix(name, bench_config):
     if "matrix" not in bench_config:
         return [(name, bench_config)]
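One subtlety in the new `combine_args` above, as a hedged reading of the code rather than documented behaviour: scalars are detected by letting iteration fail, and since strings iterate without raising, a bare string value would be expanded character by character. That is presumably why the multirun example wraps lone strings in lists, as in `sizer.save: ["scaling.yaml"]`. A standalone illustration:

    # iteration-based scalar detection, outside of milabench
    for values in ([1, 2], 8, "4Go"):
        try:
            items = [v for v in values]  # lists iterate; ints raise TypeError
        except TypeError:
            items = [values]
        print(items)  # [1, 2], then [8], then ['4', 'G', 'o'] -- chars!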
diff --git a/milabench/remote.py b/milabench/remote.py
index cbe9696b..c92166fd 100644
--- a/milabench/remote.py
+++ b/milabench/remote.py
@@ -124,7 +124,6 @@ def milabench_remote_setup_plan(pack, setup_for="worker") -> SequenceCommand:
     nodes = pack.config["system"]["nodes"]
 
     copy = []
-    node_packs = []
 
     copy_source = copy_folder(pack, INSTALL_FOLDER, setup_for)
 
     for i, node in enumerate(nodes):
         if should_run_for(node, setup_for):
-            install.append(pip_install_milabench(node_packs[i], node, INSTALL_FOLDER))
+            node_pack = worker_pack(pack, node)
+            install.append(pip_install_milabench(node_pack, node, INSTALL_FOLDER))
 
     return SequenceCommand(
         copy_source,
diff --git a/milabench/sizer.py b/milabench/sizer.py
index 75002edb..00d6d2b6 100644
--- a/milabench/sizer.py
+++ b/milabench/sizer.py
@@ -53,8 +53,7 @@ def to_octet(value: str) -> float:
 class Sizer:
     """Automatically scale the batch size to match GPU spec"""
 
-    def __init__(self, options=SizerOptions(), scaling_config=None):
-        self.options = options
+    def __init__(self, scaling_config=None):
         self.path = scaling_config
 
         if scaling_config is None:
@@ -62,6 +61,10 @@ def __init__(self, options=SizerOptions(), scaling_config=None):
 
         with open(scaling_config, "r") as sconf:
             self.scaling_config = yaml.safe_load(sconf)
+
+    @property
+    def options(self):
+        return SizerOptions()
 
     def benchscaling(self, benchmark):
         # key
@@ -165,6 +168,10 @@ def find_batch_size(self, benchmark, event):
         return -1
 
     def argv(self, benchmark, capacity, argv):
+        newargv = self._argv(benchmark, capacity, argv)
+        return newargv
+
+    def _argv(self, benchmark, capacity, argv):
         """Find the batch size and override it with a new value"""
 
         config = self.benchscaling(benchmark)
@@ -214,11 +221,12 @@ def argv(self, benchmark, capacity, argv):
 
 
 def batch_sizer() -> Sizer:
-    sizer = sizer_global.get()
-    if sizer is None:
-        sizer_global.set(Sizer())
-        return batch_sizer()
-    return sizer
+    return Sizer()
+    # sizer = sizer_global.get()
+    # if sizer is None:
+    #     sizer_global.set(Sizer())
+    #     return batch_sizer()
+    # return sizer
 
 def get_batch_size(config, start_event):
@@ -242,8 +250,9 @@ class MemoryUsageExtractor(ValidationLayer):
     """Extract max memory usage per benchmark to populate the memory model"""
 
     def __init__(self):
+
+        self.filepath = option("sizer.save", str, None)
         sizer = batch_sizer()
-        self.filepath = sizer.options.save
         self.memory = deepcopy(sizer.scaling_config)
         self.scaling = None
         self.benchname = None
diff --git a/milabench/system.py b/milabench/system.py
index 2d5a6ca8..9aa49975 100644
--- a/milabench/system.py
+++ b/milabench/system.py
@@ -1,4 +1,5 @@
 import contextvars
+from copy import deepcopy
 import ipaddress
 import os
 import socket
@@ -15,7 +16,7 @@ from .merge import merge
 
 system_global = contextvars.ContextVar("system", default=None)
-
+multirun_global = contextvars.ContextVar("multirun", default=None)
 
 def get_gpu_capacity(strict=False):
     try:
@@ -79,6 +80,60 @@ def as_environment_variable(name):
     return "MILABENCH_" + "_".join(map(str.upper, frags))
 
 
+def multirun():
+    multirun = multirun_global.get()
+
+    if multirun is None or len(multirun) == 0:
+        yield None, dict()
+        return
+
+    runs = multirun.get("runs", dict())
+
+    from .config import combine_args
+    import time
+    from types import SimpleNamespace
+
+    def unflatten(dct):
+        result = {}
+        for k, v in dct.items():
+            l = result
+            frags = k.split(".")
+            for frag in frags[:-1]:
+                l = l.setdefault(frag, SimpleNamespace())
+            setattr(l, frags[-1], v)
+
+        return result
+
+    for run_matrix in runs:
+        arguments = run_matrix["matrix"]
+
+        for run in combine_args(arguments, dict()):
+            template_name = run_matrix["name"]
+
+            ctx = unflatten(run)
+            ctx['time'] = int(time.time())
+            run_name = template_name.format(**ctx)
+
+            yield run_name, run
+
+
+@contextmanager
+def apply_system(config: dict):
+    system = system_global.get()
+    old = deepcopy(system)
+
+    for k, v in config.items():
+        frags = k.split(".")
+
+        lookup = system.setdefault("options", {})
+        for f in frags[:-1]:
+            lookup = lookup.setdefault(f, {})
+        lookup[frags[-1]] = v
+
+
+    yield
+    system_global.set(old)
+
+
 def option(name, etype, default=None):
     options = dict()
     system = system_global.get()
@@ -401,11 +456,12 @@ def gethostname(host):
 def resolve_hostname(ip):
     try:
         hostname, _, iplist = socket.gethostbyaddr(ip)
-
+
         for ip in iplist:
             if is_loopback(ip):
                 return hostname, True
 
+        # FIXME
         return socket.gethostname(), hostname.startswith(socket.gethostname())
         return hostname, hostname == socket.gethostname()
 
     except:
@@ -465,6 +521,9 @@
     config = merge(defaults, config)
 
     system = config.get("system", {})
+    multirun = config.get("multirun", {})
+
+    multirun_global.set(multirun)
     system_global.set(system)
 
     # capacity is only required if batch resizer is enabled
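A short note on why `unflatten` above builds `SimpleNamespace` objects rather than nested dicts: `str.format` resolves dotted fields through attribute access, so a run-name template such as `bs{sizer.batch_size}` needs a real attribute to read. A two-line sketch, not code from this patch:

    from types import SimpleNamespace

    ctx = {"sizer": SimpleNamespace(batch_size=64), "time": 1732000000}
    print("bs{sizer.batch_size}.{time}".format(**ctx))  # -> bs64.1732000000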
diff --git a/scripts/article/run_rocm.sh b/scripts/article/run_rocm.sh
index fbb9da83..0fc2bf16 100644
--- a/scripts/article/run_rocm.sh
+++ b/scripts/article/run_rocm.sh
@@ -11,6 +11,7 @@
 export ROCM_PATH="/opt/rocm"
 export MILABENCH_BASE="$MILABENCH_WORDIR/results"
 export MILABENCH_VENV="$MILABENCH_WORDIR/env"
 export BENCHMARK_VENV="$MILABENCH_WORDIR/results/venv/torch"
+export MILABENCH_SIZER_SAVE="$MILABENCH_WORDIR/scaling.yaml"
 
 if [ -z "${MILABENCH_SOURCE}" ]; then
     export MILABENCH_CONFIG="$MILABENCH_WORDIR/milabench/config/standard.yaml"
@@ -24,6 +25,17 @@
 export TORCH_ROCM_ARCH_LIST="$GPU"
 export ROCM_TARGETS="$GPU"
 export PYTORCH_ROCM_ARCH="$GPU"
 
+if [ -z "${MILABENCH_SOURCE}" ]; then
+    export MILABENCH_CONFIG="$MILABENCH_WORDIR/milabench/config/standard.yaml"
+else
+    export MILABENCH_CONFIG="$MILABENCH_SOURCE/config/standard.yaml"
+fi
+
+
+export GPU="$(/opt/rocm/lib/llvm/bin/amdgpu-arch | head -n 1)"
+export TORCH_ROCM_ARCH_LIST="$GPU"
+export ROCM_TARGETS="$GPU"
+export PYTORCH_ROCM_ARCH="$GPU"
 
 ARGS="$@"
 
 install_prepare() {
     mkdir -p $MILABENCH_WORDIR
@@ -75,7 +87,7 @@ install_prepare() {
 
         # https://github.com/ROCm/jax/releases/tag/rocm-jaxlib-v0.4.30
         pip install https://github.com/ROCm/jax/releases/download/rocm-jaxlib-v0.4.30/jaxlib-0.4.30+rocm611-cp310-cp310-manylinux2014_x86_64.whl
-        pip install https://github.com/ROCm/jax/archive/refs/tags/rocm-jaxlib-v0.4.30.tar.gz
+        pip install https://github.com/ROCm/jax/archive/refs/tags/rocm-jaxlib-v0.4.30.tar.gz
 
         pip uninstall torch_cluster torch_scatter torch_sparse -y
         FORCE_ONLY_CUDA=1 pip install -U -v --use-pep517 --no-build-isolation git+https://github.com/rusty1s/pytorch_cluster.git
@@ -111,12 +123,19 @@ else
     . $MILABENCH_WORDIR/env/bin/activate
 fi
 
+(
+    . $BENCHMARK_VENV/bin/activate
+    pip install xformers --index-url https://download.pytorch.org/whl/rocm6.1
+)
+
+# milabench install $ARGS --system $MILABENCH_WORDIR/system.yaml
 
-milabench prepare $ARGS
+# milabench prepare $ARGS --system $MILABENCH_WORDIR/system.yaml
 
 #
 # Run the benchmarks
-milabench run $ARGS
+milabench run $ARGS --system $MILABENCH_WORDIR/system.yaml
+
 
 #
 # Display report
diff --git a/tests/test_system_matrix.py b/tests/test_system_matrix.py
new file mode 100644
index 00000000..ed537881
--- /dev/null
+++ b/tests/test_system_matrix.py
@@ -0,0 +1,40 @@
+
+
+
+
+from milabench.system import multirun, build_system_config, enable_offline, option, apply_system, SizerOptions
+
+from milabench.testing import official_config
+
+
+def test_system_matrix():
+    with enable_offline(True):
+        sys = build_system_config(official_config("examples/system"))
+
+        n = 0
+        for name, conf in multirun():
+            print(name, conf)
+            n += 1
+
+        assert n == 39
+
+
+def test_apply_system_matrix():
+    with enable_offline(True):
+        sys = build_system_config(official_config("examples/system"))
+
+        for name, conf in multirun():
+            with apply_system(conf):
+
+                # Apply system worked and changed the config
+                for k, v in conf.items():
+                    assert option(k, lambda x: x) == v
+
+
+                assert SizerOptions().save == option("sizer.save", lambda x: x)
+
+
+
+if __name__ == "__main__":
+    test_apply_system_matrix()

From deb271ab4ad39f67938a824d00df717b3029d7f0 Mon Sep 17 00:00:00 2001
From: Pierre Delaunay
Date: Thu, 21 Nov 2024 14:19:22 -0500
Subject: [PATCH 12/20] Add monitor tag to templates

---
 benchmarks/_templates/simple/dev.yaml | 2 ++
 benchmarks/_templates/stdout/dev.yaml | 3 ++-
 benchmarks/_templates/voir/dev.yaml   | 2 ++
 3 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/benchmarks/_templates/simple/dev.yaml b/benchmarks/_templates/simple/dev.yaml
index e3aa9467..affcc977 100644
--- a/benchmarks/_templates/simple/dev.yaml
+++ b/benchmarks/_templates/simple/dev.yaml
@@ -6,3 +6,5 @@ template:
   install_group: torch
   plan:
     method: per_gpu
+  tags:
+    - monogpu
diff --git a/benchmarks/_templates/stdout/dev.yaml b/benchmarks/_templates/stdout/dev.yaml
index 2b7e75a3..24c7b813 100644
--- a/benchmarks/_templates/stdout/dev.yaml
+++ b/benchmarks/_templates/stdout/dev.yaml
@@ -3,7 +3,8 @@ _template:
   definition: .
install-variant: unpinned install_group: torch - + tags: + - monogpu #argv: # --train_batch_size: 32 # --num_epochs: 5 diff --git a/benchmarks/_templates/voir/dev.yaml b/benchmarks/_templates/voir/dev.yaml index e3aa9467..affcc977 100644 --- a/benchmarks/_templates/voir/dev.yaml +++ b/benchmarks/_templates/voir/dev.yaml @@ -6,3 +6,5 @@ template: install_group: torch plan: method: per_gpu + tags: + - monogpu From a8415d3da9f91aa1ac23d932dff2c70fe580e556 Mon Sep 17 00:00:00 2001 From: Pierre Delaunay Date: Thu, 21 Nov 2024 14:35:55 -0500 Subject: [PATCH 13/20] Do not rely on DNS to resolve nodes --- milabench/remote.py | 2 +- milabench/system.py | 71 ++++++++------------------------------------- 2 files changed, 13 insertions(+), 60 deletions(-) diff --git a/milabench/remote.py b/milabench/remote.py index c92166fd..27660f75 100644 --- a/milabench/remote.py +++ b/milabench/remote.py @@ -192,7 +192,7 @@ def is_remote(pack): def is_main_local(pack): """Only the local main can send remote commands to remote""" self = pack.config["system"]["self"] - return self is not None and self["local"] and self.get("main", False) + return self is not None and self.get("local", True) and self.get("main", False) def is_worker(pack): diff --git a/milabench/system.py b/milabench/system.py index 9aa49975..bd8298c6 100644 --- a/milabench/system.py +++ b/milabench/system.py @@ -323,6 +323,7 @@ def get_remote_ip(): for interface, address_list in addresses.items(): for address in address_list: + # if address.family in (socket.AF_INET, socket.AF_INET6): if interface in stats and getattr(stats[interface], "isup"): result.append(address.address) @@ -341,46 +342,6 @@ def is_loopback(address: str) -> bool: -def _resolve_ip(ip): - hostname = ip - aliaslist = [] - ipaddrlist = [ip] - lazy_raise = None - - if not offline: - # Resolve the IP - try: - hostname, aliaslist, ipaddrlist = socket.gethostbyaddr(ip) - lazy_raise = None - - except socket.herror as err: - lazy_raise = err - - except socket.gaierror as err: - # Get Addr Info (GAI) Error - # - # When we are connecting to a node through a ssh proxy jump - # the node IPs/Hostnames are not available until we reach - # the first node inside the cluster - # - lazy_raise = err - - return hostname, aliaslist, ipaddrlist, lazy_raise - - -def _fix_weird(hostname): - if hostname.endswith(".server.mila.quebec.server.mila.quebec"): - print() - print("Hostname was extra long for no reason") - print(hostname, socket.gethostname()) - print() - - # why is this happening - hostname = hostname[: -len(".server.mila.quebec")] - - return hostname - - # If true that means we cannot resolve the ip addresses # so we ignore errors offline = True @@ -406,29 +367,21 @@ def _resolve_addresses(nodes): ip_list = get_remote_ip() for node in nodes: - hostname, aliaslist, ipaddrlist, lazy_raise = _resolve_ip(node["ip"]) - - hostname = _fix_weird(hostname) - - node["hostname"] = hostname - node["aliaslist"] = aliaslist - node["ipaddrlist"] = ipaddrlist - - is_local = ( - ("127.0.0.1" in ipaddrlist) - or (hostname in ("localhost", socket.gethostname(), "127.0.0.1")) - or (socket.gethostname().startswith(hostname)) - or len(ip_list.intersection(ipaddrlist)) > 0 - or any([is_loopback(ip) for ip in ipaddrlist]) - ) - - # cn-g005 cn-g005.server.mila.quebec - # print(hostname, socket.gethostname()) + ip = node["ip"] + + is_local = is_loopback(ip) + + if ip in ip_list: + is_local = True + node["local"] = is_local + + if is_local: + node["hostname"] = socket.gethostname() if is_local and self is None: self = 
node - node["ipaddrlist"] = list(set(list(ip_list) + list(ipaddrlist))) + node["ipaddrlist"] = list(set(list(ip_list))) # if self is node we might be outisde the cluster # which explains why we could not resolve the IP of the nodes From d5cbbf50dfffd05ccda970f6c3e12cd2599bf339 Mon Sep 17 00:00:00 2001 From: Pierre Delaunay Date: Thu, 21 Nov 2024 17:07:45 -0500 Subject: [PATCH 14/20] Update README --- README.md | 143 +++-- milabench/_version.py | 6 +- scripts/article/run_cuda.sh | 9 +- .../test_command_reg_one_node.txt | 604 ----------------- .../test_command_reg_two_nodes.txt | 607 ------------------ .../test_capabilities.py | 0 6 files changed, 90 insertions(+), 1279 deletions(-) delete mode 100644 tests/test_command_reg/test_command_reg_one_node.txt delete mode 100644 tests/test_command_reg/test_command_reg_two_nodes.txt rename tests/{ => test_validation}/test_capabilities.py (100%) diff --git a/README.md b/README.md index 52639893..163906d0 100644 --- a/README.md +++ b/README.md @@ -20,62 +20,23 @@ evaluating current and future hardware in a research environment. * Focussed on training * Ease of use * Pytorch focused -* ROCm & NVIDIA +* ROCm, NVIDIA, Intel OneAPI, Habana Gaudi (Synapse) * Independent ## Getting Started -The easiest way to run milabbench is to run it with one of its docker image. -It will include all of the necessary data - - - # Choose the image you want to use - export MILABENCH_IMAGE=ghcr.io/mila-iqia/milabench:cuda-nightly - - # Pull the image we are going to run - docker pull $MILABENCH_IMAGE - - # Run milabench - docker run -it --rm --ipc=host --gpus=all \ - -v $(pwd)/results:/milabench/envs/runs \ - $MILABENCH_IMAGE \ - bash -c "milabench prepare && milabench run" - - ================= - Benchmark results - ================= - fail n perf sem% std% peak_memory score weight - bert-fp16 0 8 155.08 0.3% 4.3% 24552 1241.260310 0.00 - bert-fp32 0 8 29.52 0.0% 0.5% 31524 236.337218 0.00 - bert-tf32 0 8 120.46 0.4% 6.1% 31524 964.713297 0.00 - bert-tf32-fp16 0 8 154.76 0.3% 4.1% 24552 1238.477257 3.00 - convnext_large-fp16 0 8 337.48 0.9% 14.0% 27658 2741.604444 0.00 - convnext_large-fp32 0 8 44.61 0.8% 12.6% 49786 354.207225 0.00 - convnext_large-tf32 0 8 135.99 0.7% 11.2% 49786 1089.394916 0.00 - convnext_large-tf32-fp16 0 8 338.58 0.8% 13.0% 27658 2744.325170 3.00 - davit_large 0 8 312.79 0.3% 6.7% 35058 2515.326450 1.00 - davit_large-multi 0 1 2401.65 1.0% 7.7% 42232 2401.651720 5.00 - dlrm 0 1 188777.20 1.8% 14.0% 3194 188777.203190 1.00 - focalnet 0 8 400.47 0.2% 5.4% 26604 3215.431924 2.00 - opt-1_3b 0 1 26.71 0.1% 0.4% 44116 26.714365 5.00 - opt-1_3b-multinode 0 2 34.62 0.2% 1.0% 43552 34.618292 10.00 - opt-6_7b 0 1 14.32 0.0% 0.1% 55750 14.319587 5.00 - opt-6_7b-multinode 0 2 10.79 0.1% 0.7% 49380 10.792595 10.00 - reformer 0 8 61.70 0.0% 0.9% 25376 494.110834 1.00 - regnet_y_128gf 0 8 99.96 0.2% 5.0% 31840 803.012507 2.00 - resnet152 0 8 710.18 0.3% 6.2% 36732 5710.828608 1.00 - resnet152-multi 0 1 5367.34 1.0% 8.1% 38638 5367.338469 5.00 - resnet50 0 8 984.43 0.9% 19.1% 5026 7927.257351 1.00 - rwkv 0 8 428.65 0.2% 3.8% 5546 3435.097716 1.00 - stargan 0 8 51.32 1.8% 40.8% 37848 413.238870 1.00 - super-slomo 0 8 41.63 0.1% 2.3% 34082 332.395065 1.00 - t5 0 8 48.05 0.2% 3.9% 35466 384.317023 2.00 - whisper 0 8 248.16 0.0% 0.6% 37006 1985.861017 1.00 - - Scores - ------ - Failure rate: 0.00% (PASS) - Score: 219.06 + + git clone https://github.com/mila-iqia/milabench.git + + pip install -e milabench + + export MILABENCH_GPU_ARCH=cuda + + milabench install --base 
workspace --config milabench/config/standard.yaml --select fp32 + + milabench prepare --base workspace --config milabench/config/standard.yaml --select fp32 + + milabench run --base workspace --config milabench/config/standard.yaml --select fp32 ## Details @@ -84,13 +45,77 @@ The benchmark suite has been validated on the following configurations: | Python version | GPU | Configuration file | | - | - | - | -| 3.10 (conda) | 2 node x 8xNVIDIA A100 80GB | config/standard.yaml | -| 3.9.12 (conda) | 8x NVIDIA RTX8000 48GB | config/standard.yaml | -| 3.9.16 (conda) | 2x NVIDIA K80 | config/ci.yaml | -| 3.9.16 (conda) | 2x AMD MI100 | config/ci.yaml | -| 3.9.16 (conda) | 4x AMD MI250 | config/standard.yaml | +| 3.10 | 2 node x 8xNVIDIA A100 80GB | config/standard.yaml | +| 3.10 | 2 node x 8xMI300X | config/standard.yaml | +| 3.10 | 1 node x 8xGaudi2 | config/standard.yaml | We are working on validating it on more configurations and will update the above table as we do. - - +## Report + + ================= + Benchmark results + ================= + + System + ------ + cpu: AMD EPYC 7742 64-Core Processor + n_cpu: 128 + product: NVIDIA A100-SXM4-80GB + n_gpu: 8 + memory: 81920.0 + + Breakdown + --------- + bench | fail | n | ngpu | perf | sem% | std% | peak_memory | score | weight + brax | 0 | 1 | 8 | 730035.71 | 0.1% | 0.4% | 2670 | 730035.71 | 1.00 + diffusion-gpus | 0 | 1 | 8 | 117.67 | 1.5% | 11.7% | 59944 | 117.67 | 1.00 + diffusion-single | 0 | 8 | 1 | 25.02 | 0.8% | 17.9% | 53994 | 202.10 | 1.00 + dimenet | 0 | 8 | 1 | 366.85 | 0.7% | 16.2% | 2302 | 2973.32 | 1.00 + dinov2-giant-gpus | 0 | 1 | 8 | 445.68 | 0.4% | 3.0% | 69614 | 445.68 | 1.00 + dinov2-giant-single | 0 | 8 | 1 | 53.54 | 0.4% | 9.5% | 74646 | 432.65 | 1.00 + dqn | 0 | 8 | 1 | 23089954554.91 | 1.1% | 89.9% | 62106 | 184480810548.20 | 1.00 + bf16 | 0 | 8 | 1 | 293.43 | 0.2% | 6.3% | 1788 | 2361.16 | 0.00 + fp16 | 0 | 8 | 1 | 289.26 | 0.1% | 3.6% | 1788 | 2321.65 | 0.00 + fp32 | 0 | 8 | 1 | 19.14 | 0.0% | 0.7% | 2166 | 153.21 | 0.00 + tf32 | 0 | 8 | 1 | 146.63 | 0.1% | 3.6% | 2166 | 1177.04 | 0.00 + bert-fp16 | 0 | 8 | 1 | 263.73 | 1.1% | 16.7% | nan | 2165.37 | 0.00 + bert-fp32 | 0 | 8 | 1 | 44.84 | 0.6% | 9.6% | 21170 | 364.52 | 0.00 + bert-tf32 | 0 | 8 | 1 | 141.95 | 0.9% | 14.1% | 1764 | 1162.94 | 0.00 + bert-tf32-fp16 | 0 | 8 | 1 | 265.04 | 1.0% | 15.6% | nan | 2175.59 | 3.00 + reformer | 0 | 8 | 1 | 62.29 | 0.3% | 6.0% | 25404 | 501.89 | 1.00 + t5 | 0 | 8 | 1 | 51.40 | 0.5% | 9.9% | 34390 | 416.14 | 2.00 + whisper | 0 | 8 | 1 | 481.95 | 1.0% | 21.4% | 8520 | 3897.53 | 1.00 + lightning | 0 | 8 | 1 | 680.22 | 1.0% | 22.7% | 27360 | 5506.90 | 1.00 + lightning-gpus | 0 | 1 | 8 | 3504.74 | 7.9% | 62.9% | 28184 | 3504.74 | 1.00 + llava-single | 1 | 8 | 1 | 2.28 | 0.4% | 9.6% | 72556 | 14.12 | 1.00 + llama | 0 | 8 | 1 | 484.86 | 4.4% | 80.0% | 27820 | 3680.86 | 1.00 + llm-full-mp-gpus | 0 | 1 | 8 | 193.92 | 3.1% | 16.2% | 48470 | 193.92 | 1.00 + llm-lora-ddp-gpus | 0 | 1 | 8 | 16738.58 | 0.4% | 2.0% | 36988 | 16738.58 | 1.00 + llm-lora-mp-gpus | 0 | 1 | 8 | 1980.63 | 2.2% | 11.8% | 55972 | 1980.63 | 1.00 + llm-lora-single | 0 | 8 | 1 | 2724.95 | 0.2% | 3.0% | 49926 | 21861.99 | 1.00 + ppo | 0 | 8 | 1 | 3114264.32 | 1.6% | 57.2% | 62206 | 24915954.98 | 1.00 + recursiongfn | 0 | 8 | 1 | 7080.67 | 1.2% | 27.1% | 10292 | 57038.34 | 1.00 + rlhf-gpus | 0 | 1 | 8 | 6314.94 | 2.1% | 11.2% | 21730 | 6314.94 | 1.00 + rlhf-single | 0 | 8 | 1 | 1143.72 | 0.4% | 8.4% | 19566 | 9174.52 | 1.00 + focalnet | 0 | 8 | 1 | 375.07 | 0.7% | 14.9% | 23536 
| 3038.83 | 2.00 + torchatari | 0 | 8 | 1 | 5848.88 | 0.6% | 12.7% | 3834 | 46613.34 | 1.00 + convnext_large-fp16 | 0 | 8 | 1 | 330.93 | 1.5% | 22.9% | 27376 | 2711.46 | 0.00 + convnext_large-fp32 | 0 | 8 | 1 | 59.49 | 0.6% | 9.8% | 55950 | 483.84 | 0.00 + convnext_large-tf32 | 0 | 8 | 1 | 155.41 | 0.9% | 14.3% | 49650 | 1273.31 | 0.00 + convnext_large-tf32-fp16 | 0 | 8 | 1 | 322.28 | 1.6% | 24.5% | 27376 | 2637.88 | 3.00 + regnet_y_128gf | 0 | 8 | 1 | 119.46 | 0.5% | 10.0% | 29762 | 966.96 | 2.00 + resnet152-ddp-gpus | 0 | 1 | 8 | 3843.06 | 5.2% | 39.3% | 27980 | 3843.06 | 0.00 + resnet50 | 0 | 8 | 1 | 932.95 | 2.4% | 52.2% | 14848 | 7524.25 | 1.00 + resnet50-noio | 0 | 8 | 1 | 1163.88 | 0.3% | 6.7% | 27480 | 9385.35 | 0.00 + vjepa-gpus | 0 | 1 | 8 | 130.13 | 5.9% | 46.8% | 64244 | 130.13 | 1.00 + vjepa-single | 0 | 8 | 1 | 21.29 | 1.0% | 22.4% | 58552 | 172.11 | 1.00 + + Scores + ------ + Failure rate: 0.38% (PASS) + Score: 4175.57 + + Errors + ------ + 1 errors, details in HTML report. \ No newline at end of file diff --git a/milabench/_version.py b/milabench/_version.py index a3f4e1b4..281e1d0a 100644 --- a/milabench/_version.py +++ b/milabench/_version.py @@ -1,5 +1,5 @@ """This file is generated, do not modify""" -__tag__ = "v1.0.0_RC1-18-g784b38e" -__commit__ = "784b38e77b90116047e3de893c22c2f7d3225179" -__date__ = "2024-10-18 15:58:46 +0000" +__tag__ = "v0.1.0-146-ga8415d3" +__commit__ = "a8415d3da9f91aa1ac23d932dff2c70fe580e556" +__date__ = "2024-11-21 14:35:55 -0500" diff --git a/scripts/article/run_cuda.sh b/scripts/article/run_cuda.sh index 0c2c1dae..9ef13b7d 100644 --- a/scripts/article/run_cuda.sh +++ b/scripts/article/run_cuda.sh @@ -84,15 +84,12 @@ if [ "$MILABENCH_PREPARE" -eq 0 ]; then . $MILABENCH_WORDIR/env/bin/activate - - # pip install torch # milabench pin --variant cuda --from-scratch # rm -rf $MILABENCH_WORDIR/results/venv/ - rm -rf $MILABENCH_WORDIR/results/extra - - milabench install --system $MILABENCH_WORDIR/system.yaml - milabench prepare --system $MILABENCH_WORDIR/system.yaml $ARGS + # rm -rf $MILABENCH_WORDIR/results/extra + # milabench install --system $MILABENCH_WORDIR/system.yaml + # milabench prepare --system $MILABENCH_WORDIR/system.yaml $ARGS ( . 
$BENCHMARK_VENV/bin/activate diff --git a/tests/test_command_reg/test_command_reg_one_node.txt b/tests/test_command_reg/test_command_reg_one_node.txt deleted file mode 100644 index 3a511bb6..00000000 --- a/tests/test_command_reg/test_command_reg_one_node.txt +++ /dev/null @@ -1,604 +0,0 @@ -#!/bin/sh - -echo "---" -echo "Virtual Env" -echo "===========" -export VIRTUAL_ENV=$BASE/venv/torch - -source $VIRTUAL_ENV/bin/activate -echo "---" -echo "Milabench" -echo "=========" -export MILABENCH_DIR_BASE=$BASE -export MILABENCH_DIR_VENV=$BASE/venv/torch -export MILABENCH_DIR_DATA=$BASE/data -export MILABENCH_DIR_RUNS=$BASE/runs -export MILABENCH_DIR_EXTRA=$BASE/extra/llm -export MILABENCH_DIR_CACHE=$BASE/cache -export OMP_NUM_THREADS=0 -export MILABENCH_CONFIG='{"system": {"arch": "cuda", "sshkey": null, "nodes": [{"ip": "127.0.0.1", "main": true, "name": "0", "sshport": 22, "user": "username", "hostname": "127.0.0.1"}], "self": {"ip": "127.0.0.1", "main": true, "name": "0", "sshport": 22, "user": "username", "hostname": "127.0.0.1"}}, "dirs": {"base": "$BASE", "venv": "$BASE/venv/torch", "data": "$BASE/data", "runs": "$BASE/runs", "extra": "$BASE/extra/llm", "cache": "$BASE/cache"}, "group": "llm", "install_group": "torch", "install_variant": "cuda", "run_name": "dev", "enabled": true, "capabilities": {"nodes": 1}, "max_duration": 800, "voir": {"options": {"stop": 30, "interval": "1s"}}, "validation": {"usage": {"gpu_load_threshold": 0.5, "gpu_mem_threshold": 0.5}}, "config_base": "$SRC/milabench/config", "config_file": "$SRC/milabench/config/standard.yaml", "definition": "$SRC/milabench/benchmarks/llama", "tags": ["inference", "llm", "monogpu", "nlp", "nobatch"], "plan": {"method": "per_gpu"}, "weight": 1.0, "name": "llama", "tag": ["llama"]}' - -echo "---" -echo "llama" -echo "=====" -time ( - CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache & - CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache & - CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache & - CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache & - CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache & - CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache & - CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache & - CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache & - wait -) - -echo "---" -echo "fp16" -echo "====" -time ( - CUDA_VISIBLE_DEVICES=0 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & - CUDA_VISIBLE_DEVICES=1 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & - CUDA_VISIBLE_DEVICES=2 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & - CUDA_VISIBLE_DEVICES=3 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & - CUDA_VISIBLE_DEVICES=4 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 30 
--repeat 90 --m 8192 --n 8192 --dtype fp16 & - CUDA_VISIBLE_DEVICES=5 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & - CUDA_VISIBLE_DEVICES=6 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & - CUDA_VISIBLE_DEVICES=7 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & - wait -) - -echo "---" -echo "bf16" -echo "====" -time ( - CUDA_VISIBLE_DEVICES=0 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & - CUDA_VISIBLE_DEVICES=1 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & - CUDA_VISIBLE_DEVICES=2 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & - CUDA_VISIBLE_DEVICES=3 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & - CUDA_VISIBLE_DEVICES=4 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & - CUDA_VISIBLE_DEVICES=5 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & - CUDA_VISIBLE_DEVICES=6 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & - CUDA_VISIBLE_DEVICES=7 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & - wait -) - -echo "---" -echo "tf32" -echo "====" -time ( - CUDA_VISIBLE_DEVICES=0 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & - CUDA_VISIBLE_DEVICES=1 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & - CUDA_VISIBLE_DEVICES=2 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & - CUDA_VISIBLE_DEVICES=3 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & - CUDA_VISIBLE_DEVICES=4 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & - CUDA_VISIBLE_DEVICES=5 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & - CUDA_VISIBLE_DEVICES=6 
$SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 &
-    CUDA_VISIBLE_DEVICES=7 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 &
-    wait
-)
-
-echo "---"
-echo "fp32"
-echo "===="
-time (
-    CUDA_VISIBLE_DEVICES=0 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 &
-    CUDA_VISIBLE_DEVICES=1 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 &
-    CUDA_VISIBLE_DEVICES=2 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 &
-    CUDA_VISIBLE_DEVICES=3 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 &
-    CUDA_VISIBLE_DEVICES=4 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 &
-    CUDA_VISIBLE_DEVICES=5 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 &
-    CUDA_VISIBLE_DEVICES=6 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 &
-    CUDA_VISIBLE_DEVICES=7 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 &
-    wait
-)
-
-echo "---"
-echo "resnet50"
-echo "========"
-time (
-    CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 &
-    CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 &
-    CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 &
-    CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 &
-    CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 &
-    CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 &
-    CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 &
-    CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 &
-    wait
-)
-
-echo "---"
-echo "resnet50-noio"
-echo "============="
-time (
-    CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 &
-    CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 &
-    CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 &
-    CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 &
-    CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 &
-    CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 &
-    CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 &
-    CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 &
-    wait
-)
-
-echo "---"
-echo "resnet152-ddp-gpus"
-echo "=================="
-time (
-    $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/torchvision_ddp/main.py --epochs 10 --num-workers 8 --loader torch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 &
-    wait
-)
-
-echo "---"
-echo "convnext_large-fp32"
-echo "==================="
-time (
-    CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 &
-    CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 &
-    CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 &
-    CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 &
-    CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 &
-    CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 &
-    CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 &
-    CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 &
-    wait
-)
-
-echo "---"
-echo "convnext_large-fp16"
-echo "==================="
-time (
-    CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 &
-    CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 &
-    CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 &
-    CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 &
-    CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 &
-    CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 &
-    CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 &
-    CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 &
-    wait
-)
-
-echo "---"
-echo "convnext_large-tf32"
-echo "==================="
-time (
-    CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 &
-    CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 &
-    CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 &
-    CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 &
-    CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 &
-    CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 &
-    CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 &
-    CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 &
-    wait
-)
-
-echo "---"
-echo "convnext_large-tf32-fp16"
-echo "========================"
-time (
-    CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 &
-    CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 &
-    CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 &
-    CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 &
-    CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 &
-    CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 &
-    CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 &
-    CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 &
-    wait
-)
-
-echo "---"
-echo "regnet_y_128gf"
-echo "=============="
-time (
-    CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 &
-    CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 &
-    CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 &
-    CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 &
-    CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 &
-    CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 &
-    CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 &
-    CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 &
-    wait
-)
-
-echo "---"
-echo "bert-fp32"
-echo "========="
-time (
-    CUDA_VISIBLE_DEVICES=0 python -m bench --precision fp32 --num-workers 8 --model Bert --batch-size 32 &
-    CUDA_VISIBLE_DEVICES=1 python -m bench --precision fp32 --num-workers 8 --model Bert --batch-size 32 &
-    CUDA_VISIBLE_DEVICES=2 python -m bench --precision fp32 --num-workers 8 --model Bert --batch-size 32 &
-    CUDA_VISIBLE_DEVICES=3 python -m bench --precision fp32 --num-workers 8 --model Bert --batch-size 32 &
-    CUDA_VISIBLE_DEVICES=4 python -m bench --precision fp32 --num-workers 8 --model Bert --batch-size 32 &
-    CUDA_VISIBLE_DEVICES=5 python -m bench --precision fp32 --num-workers 8 --model Bert --batch-size 32 &
-    CUDA_VISIBLE_DEVICES=6 python -m bench --precision fp32 --num-workers 8 --model Bert --batch-size 32 &
-    CUDA_VISIBLE_DEVICES=7 python -m bench --precision fp32 --num-workers 8 --model Bert --batch-size 32 &
-    wait
-)
-
-echo "---"
-echo "bert-fp16"
-echo "========="
-time (
-    CUDA_VISIBLE_DEVICES=0 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 &
-    CUDA_VISIBLE_DEVICES=1 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 &
-    CUDA_VISIBLE_DEVICES=2 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 &
-    CUDA_VISIBLE_DEVICES=3 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 &
-    CUDA_VISIBLE_DEVICES=4 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 &
-    CUDA_VISIBLE_DEVICES=5 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 &
-    CUDA_VISIBLE_DEVICES=6 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 &
-    CUDA_VISIBLE_DEVICES=7 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 &
-    wait
-)
-
-echo "---"
-echo "bert-tf32"
-echo "========="
-time (
-    CUDA_VISIBLE_DEVICES=0 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 &
-    CUDA_VISIBLE_DEVICES=1 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 &
-    CUDA_VISIBLE_DEVICES=2 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 &
-    CUDA_VISIBLE_DEVICES=3 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 &
-    CUDA_VISIBLE_DEVICES=4 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 &
-    CUDA_VISIBLE_DEVICES=5 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 &
-    CUDA_VISIBLE_DEVICES=6 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 &
-    CUDA_VISIBLE_DEVICES=7 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 &
-    wait
-)
-
-echo "---"
-echo "bert-tf32-fp16"
-echo "=============="
-time (
-    CUDA_VISIBLE_DEVICES=0 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 &
-    CUDA_VISIBLE_DEVICES=1 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 &
-    CUDA_VISIBLE_DEVICES=2 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 &
-    CUDA_VISIBLE_DEVICES=3 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 &
-    CUDA_VISIBLE_DEVICES=4 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 &
-    CUDA_VISIBLE_DEVICES=5 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 &
-    CUDA_VISIBLE_DEVICES=6 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 &
-    CUDA_VISIBLE_DEVICES=7 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 &
-    wait
-)
-
-echo "---"
-echo "t5"
-echo "=="
-time (
-    CUDA_VISIBLE_DEVICES=0 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 &
-    CUDA_VISIBLE_DEVICES=1 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 &
-    CUDA_VISIBLE_DEVICES=2 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 &
-    CUDA_VISIBLE_DEVICES=3 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 &
-    CUDA_VISIBLE_DEVICES=4 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 &
-    CUDA_VISIBLE_DEVICES=5 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 &
-    CUDA_VISIBLE_DEVICES=6 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 &
-    CUDA_VISIBLE_DEVICES=7 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 &
-    wait
-)
-
-echo "---"
-echo "reformer"
-echo "========"
-time (
-    CUDA_VISIBLE_DEVICES=0 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 64 &
-    CUDA_VISIBLE_DEVICES=1 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 64 &
-    CUDA_VISIBLE_DEVICES=2 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 64 &
-    CUDA_VISIBLE_DEVICES=3 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 64 &
-    CUDA_VISIBLE_DEVICES=4 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 64 &
-    CUDA_VISIBLE_DEVICES=5 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 64 &
-    CUDA_VISIBLE_DEVICES=6 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 64 &
-    CUDA_VISIBLE_DEVICES=7 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 64 &
-    wait
-)
-
-echo "---"
-echo "whisper"
-echo "======="
-time (
-    CUDA_VISIBLE_DEVICES=0 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 &
-    CUDA_VISIBLE_DEVICES=1 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 &
-    CUDA_VISIBLE_DEVICES=2 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 &
-    CUDA_VISIBLE_DEVICES=3 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 &
-    CUDA_VISIBLE_DEVICES=4 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 &
-    CUDA_VISIBLE_DEVICES=5 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 &
-    CUDA_VISIBLE_DEVICES=6 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 &
-    CUDA_VISIBLE_DEVICES=7 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 &
-    wait
-)
-
-echo "---"
-echo "focalnet"
-echo "========"
-time (
-    CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D0 --checkpoint-hist 1 &
-    CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D1 --checkpoint-hist 1 &
-    CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D2 --checkpoint-hist 1 &
-    CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D3 --checkpoint-hist 1 &
-    CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D4 --checkpoint-hist 1 &
-    CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D5 --checkpoint-hist 1 &
-    CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D6 --checkpoint-hist 1 &
-    CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D7 --checkpoint-hist 1 &
-    wait
-)
-
-echo "---"
-echo "brax"
-echo "===="
-time (
-    python $SRC/milabench/benchmarks/brax/main.py --episode-length 20 --batch-size 1024 --num-minibatches 32 --num-envs 8192 &
-    wait
-)
-
-echo "---"
-echo "diffusion-single"
-echo "================"
-time (
-    CUDA_VISIBLE_DEVICES=0 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache &
-    CUDA_VISIBLE_DEVICES=1 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache &
-    CUDA_VISIBLE_DEVICES=2 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache &
-    CUDA_VISIBLE_DEVICES=3 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache &
-    CUDA_VISIBLE_DEVICES=4 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache &
-    CUDA_VISIBLE_DEVICES=5 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache &
-    CUDA_VISIBLE_DEVICES=6 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache &
-    CUDA_VISIBLE_DEVICES=7 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache &
-    wait
-)
-
-echo "---"
-echo "diffusion-gpus"
-echo "=============="
-time (
-    $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache accelerate launch --mixed_precision=bf16 --dynamo_backend=no --machine_rank=0 --num_machines=1 --multi_gpu --gradient_accumulation_steps=1 --num_cpu_threads_per_process=4 --main_process_ip=127.0.0.1 --main_process_port=29400 --num_processes=8 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache &
-    wait
-)
-
-echo "---"
-echo "diffusion-nodes"
-echo "==============="
-time (
-    $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache accelerate launch --mixed_precision=bf16 --dynamo_backend=no --machine_rank=0 --num_machines=1 --multi_gpu --gradient_accumulation_steps=1 --num_cpu_threads_per_process=4 --main_process_ip=127.0.0.1 --main_process_port=29400 --num_processes=8 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache &
-    wait
-)
-
-echo "---"
-echo "lightning"
-echo "========="
-time (
-    CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 &
-    CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 &
-    CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 &
-    CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 &
-    CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 &
-    CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 &
-    CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 &
-    CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 &
-    wait
-)
-
-echo "---"
-echo "lightning-gpus"
-echo "=============="
-time (
-    $BASE/venv/torch/bin/benchrun --nnodes=1 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --nproc-per-node=8 --no-python -- python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 &
-    wait
-)
-
-echo "---"
-echo "dinov2-giant-single"
-echo "==================="
-time (
-    CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
-    CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
-    CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
-    CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
-    CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
-    CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
-    CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
-    CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
-    wait
-)
-
-echo "---"
-echo "dinov2-giant-gpus"
-echo "================="
-time (
-    $BASE/venv/torch/bin/benchrun --nnodes=1 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --nproc-per-node=8 --no-python -- python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-gpus/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
-    wait
-)
-
-echo "---"
-echo "llm-lora-single"
-echo "==============="
-time (
-    CUDA_VISIBLE_DEVICES=0 $SRC/milabench/benchmarks/llm/recipes/lora_finetune_single_device.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-single/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-single/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 &
-    CUDA_VISIBLE_DEVICES=1 $SRC/milabench/benchmarks/llm/recipes/lora_finetune_single_device.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-single/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-single/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 &
-    CUDA_VISIBLE_DEVICES=2 $SRC/milabench/benchmarks/llm/recipes/lora_finetune_single_device.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-single/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-single/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 &
-    CUDA_VISIBLE_DEVICES=3 $SRC/milabench/benchmarks/llm/recipes/lora_finetune_single_device.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-single/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-single/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 &
-    CUDA_VISIBLE_DEVICES=4 $SRC/milabench/benchmarks/llm/recipes/lora_finetune_single_device.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-single/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-single/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 &
-    CUDA_VISIBLE_DEVICES=5 $SRC/milabench/benchmarks/llm/recipes/lora_finetune_single_device.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-single/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-single/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 &
-    CUDA_VISIBLE_DEVICES=6 $SRC/milabench/benchmarks/llm/recipes/lora_finetune_single_device.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-single/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-single/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 &
-    CUDA_VISIBLE_DEVICES=7 $SRC/milabench/benchmarks/llm/recipes/lora_finetune_single_device.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-single/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-single/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 &
-    wait
-)
-
-echo "---"
-echo "llm-lora-ddp-gpus"
-echo "================="
-time (
-    $BASE/venv/torch/bin/tune run --nnodes=1 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/lora_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-ddp-gpus/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-ddp-gpus/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 &
-    wait
-)
-
-echo "---"
-echo "llm-lora-ddp-nodes"
-echo "=================="
-time (
-    $BASE/venv/torch/bin/tune run --nnodes=1 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/lora_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-ddp-nodes/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-ddp-nodes/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 &
-    wait
-)
-
-echo "---"
-echo "llm-lora-mp-gpus"
-echo "================"
-time (
-    $BASE/venv/torch/bin/tune run --nnodes=1 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/lora_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_70B_lora.yaml epochs=1 output_dir=$BASE/extra/llm-lora-mp-gpus/output tokenizer.path=$BASE/data/llama3_70B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_70B checkpointer.output_dir=$BASE/data/llama3_70B/ safetensors=true metric_logger.log_dir=$BASE/extra/llm-lora-mp-gpus/metrics repo_id="meta-llama/Meta-Llama-3.1-70B" batch_size=8 gradient_accumulation_steps=1 &
-    wait
-)
-
-echo "---"
-echo "llm-full-mp-gpus"
-echo "================"
-time (
-    $BASE/venv/torch/bin/tune run --nnodes=1 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/full_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_70B_full.yaml epochs=1 output_dir=$BASE/extra/llm-full-mp-gpus/output tokenizer.path=$BASE/data/llama3_70B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_70B checkpointer.output_dir=$BASE/data/llama3_70B/ metric_logger.log_dir=$BASE/extra/llm-full-mp-gpus/metrics repo_id="meta-llama/Meta-Llama-3.1-70B" safetensors=true batch_size=2 gradient_accumulation_steps=1 &
-    wait
-)
-
-echo "---"
-echo "llm-full-mp-nodes"
-echo "================="
-time (
-    $BASE/venv/torch/bin/tune run --nnodes=1 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/full_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_70B_full.yaml epochs=1 output_dir=$BASE/extra/llm-full-mp-nodes/output tokenizer.path=$BASE/data/llama3_70B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_70B checkpointer.output_dir=$BASE/data/llama3_70B/ metric_logger.log_dir=$BASE/extra/llm-full-mp-nodes/metrics repo_id="meta-llama/Meta-Llama-3.1-70B" safetensors=true batch_size=2 gradient_accumulation_steps=1 &
-    wait
-)
-
-echo "---"
-echo "dqn"
-echo "==="
-time (
-    CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/purejaxrl/main.py dqn --num_envs 128 --buffer_batch_size 128 --env_name CartPole-v1 --training_interval 10 &
-    CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/purejaxrl/main.py dqn --num_envs 128 --buffer_batch_size 128 --env_name CartPole-v1 --training_interval 10 &
-    CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/purejaxrl/main.py dqn --num_envs 128 --buffer_batch_size 128 --env_name CartPole-v1 --training_interval 10 &
-    CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/purejaxrl/main.py dqn --num_envs 128 --buffer_batch_size 128 --env_name CartPole-v1 --training_interval 10 &
-    CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/purejaxrl/main.py dqn --num_envs 128 --buffer_batch_size 128 --env_name CartPole-v1 --training_interval 10 &
-    CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/purejaxrl/main.py dqn --num_envs 128 --buffer_batch_size 128 --env_name CartPole-v1 --training_interval 10 &
-    CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/purejaxrl/main.py dqn --num_envs 128 --buffer_batch_size 128 --env_name CartPole-v1 --training_interval 10 &
-    CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/purejaxrl/main.py dqn --num_envs 128 --buffer_batch_size 128 --env_name CartPole-v1 --training_interval 10 &
-    wait
-)
-
-echo "---"
-echo "ppo"
-echo "==="
-time (
-    CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/purejaxrl/main.py ppo --num_envs 128 --num_steps 10 --num_minibatches 32 --update_epochs 4 --env_name hopper --total_timesteps 200000 &
-    CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/purejaxrl/main.py ppo --num_envs 128 --num_steps 10 --num_minibatches 32 --update_epochs 4 --env_name hopper --total_timesteps 200000 &
-    CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/purejaxrl/main.py ppo --num_envs 128 --num_steps 10 --num_minibatches 32 --update_epochs 4 --env_name hopper --total_timesteps 200000 &
-    CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/purejaxrl/main.py ppo --num_envs 128 --num_steps 10 --num_minibatches 32 --update_epochs 4 --env_name hopper --total_timesteps 200000 &
-    CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/purejaxrl/main.py ppo --num_envs 128 --num_steps 10 --num_minibatches 32 --update_epochs 4 --env_name hopper --total_timesteps 200000 &
-    CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/purejaxrl/main.py ppo --num_envs 128 --num_steps 10 --num_minibatches 32 --update_epochs 4 --env_name hopper --total_timesteps 200000 &
-    CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/purejaxrl/main.py ppo --num_envs 128 --num_steps 10 --num_minibatches 32 --update_epochs 4 --env_name hopper --total_timesteps 200000 &
-    CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/purejaxrl/main.py ppo --num_envs 128 --num_steps 10 --num_minibatches 32 --update_epochs 4 --env_name hopper --total_timesteps 200000 &
-    wait
-)
-
-echo "---"
-echo "dimenet"
-echo "======="
-time (
-    CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 10000 --use3d &
-    CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 10000 --use3d &
-    CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 10000 --use3d &
-    CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 10000 --use3d &
-    CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 10000 --use3d &
-    CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 10000 --use3d &
-    CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 10000 --use3d &
-    CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 10000 --use3d &
-    wait
-)
-
-echo "---"
-echo "recursiongfn"
-echo "============"
-time (
-    CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 &
-    CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 &
-    CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 &
-    CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 &
-    CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 &
-    CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 &
-    CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 &
-    CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 &
-    wait
-)
-
-echo "---"
-echo "torchatari"
-echo "=========="
-time (
-    CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 &
-    CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 &
-    CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 &
-    CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 &
-    CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 &
-    CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 &
-    CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 &
-    CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 &
-    wait
-)
-
-echo "---"
-echo "llava-single"
-echo "============"
-time (
-    CUDA_VISIBLE_DEVICES=0 $SRC/milabench/benchmarks/llava/main.py --batch_size 1 --num_workers 4 --gradient_accumulation_steps 1 &
-    CUDA_VISIBLE_DEVICES=1 $SRC/milabench/benchmarks/llava/main.py --batch_size 1 --num_workers 4 --gradient_accumulation_steps 1 &
-    CUDA_VISIBLE_DEVICES=2 $SRC/milabench/benchmarks/llava/main.py --batch_size 1 --num_workers 4 --gradient_accumulation_steps 1 &
-    CUDA_VISIBLE_DEVICES=3 $SRC/milabench/benchmarks/llava/main.py --batch_size 1 --num_workers 4 --gradient_accumulation_steps 1 &
-    CUDA_VISIBLE_DEVICES=4 $SRC/milabench/benchmarks/llava/main.py --batch_size 1 --num_workers 4 --gradient_accumulation_steps 1 &
-    CUDA_VISIBLE_DEVICES=5 $SRC/milabench/benchmarks/llava/main.py --batch_size 1 --num_workers 4 --gradient_accumulation_steps 1 &
-    CUDA_VISIBLE_DEVICES=6 $SRC/milabench/benchmarks/llava/main.py --batch_size 1 --num_workers 4 --gradient_accumulation_steps 1 &
-    CUDA_VISIBLE_DEVICES=7 $SRC/milabench/benchmarks/llava/main.py --batch_size 1 --num_workers 4 --gradient_accumulation_steps 1 &
-    wait
-)
-
-echo "---"
-echo "rlhf-single"
-echo "==========="
-time (
-    CUDA_VISIBLE_DEVICES=0 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf-single/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 &
-    CUDA_VISIBLE_DEVICES=1 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf-single/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 &
-    CUDA_VISIBLE_DEVICES=2 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf-single/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 &
-    CUDA_VISIBLE_DEVICES=3 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf-single/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 &
-    CUDA_VISIBLE_DEVICES=4 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf-single/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 &
-    CUDA_VISIBLE_DEVICES=5 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf-single/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 &
-    CUDA_VISIBLE_DEVICES=6 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf-single/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 &
-    CUDA_VISIBLE_DEVICES=7 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf-single/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 &
-    wait
-)
-
-echo "---"
-echo "rlhf-gpus"
-echo "========="
-time (
-    $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache accelerate launch --mixed_precision=bf16 --dynamo_backend=no --machine_rank=0 --num_machines=1 --multi_gpu --gradient_accumulation_steps=1 --num_cpu_threads_per_process=4 --main_process_ip=127.0.0.1 --main_process_port=29400 --num_processes=8 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf-gpus/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 &
-    wait
-)
-
-echo "---"
-echo "vjepa-single"
-echo "============"
-time (
-    CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single &
-    CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single &
-    CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single &
-    CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single &
-    CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single &
-    CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single &
-    CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single &
-    CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single &
-    wait
-)
-
-echo "---"
-echo "vjepa-gpus"
-echo "=========="
-time (
-    $BASE/venv/torch/bin/benchrun --nnodes=1 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --nproc-per-node=8 --no-python -- python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-gpus &
-    wait
-)
-
diff --git a/tests/test_command_reg/test_command_reg_two_nodes.txt b/tests/test_command_reg/test_command_reg_two_nodes.txt
deleted file mode 100644
index 3004505d..00000000
--- a/tests/test_command_reg/test_command_reg_two_nodes.txt
+++ /dev/null
@@ -1,607 +0,0 @@
-#!/bin/sh
-
-echo "---"
-echo "Virtual Env"
-echo "==========="
-export VIRTUAL_ENV=$BASE/venv/torch
-
-source $VIRTUAL_ENV/bin/activate
-echo "---"
-echo "Milabench"
-echo "========="
-export MILABENCH_DIR_BASE=$BASE
-export MILABENCH_DIR_VENV=$BASE/venv/torch
-export MILABENCH_DIR_DATA=$BASE/data
-export MILABENCH_DIR_RUNS=$BASE/runs
-export MILABENCH_DIR_EXTRA=$BASE/extra/llm
-export MILABENCH_DIR_CACHE=$BASE/cache
-export OMP_NUM_THREADS=0
-export MILABENCH_CONFIG='{"system": {"arch": "cuda", "sshkey": null, "nodes": [{"ip": "127.0.0.1", "main": true, "name": "0", "sshport": 22, "user": "username", "hostname": "127.0.0.1"}, {"ip": "192.168.0.11", "main": false, "name": "1", "sshport": 22, "user": "username", "hostname": "192.168.0.11"}], "self": {"ip": "127.0.0.1", "main": true, "name": "0", "sshport": 22, "user": "username", "hostname": "127.0.0.1"}}, "dirs": {"base": "$BASE", "venv": "$BASE/venv/torch", "data": "$BASE/data", "runs": "$BASE/runs", "extra": "$BASE/extra/llm", "cache": "$BASE/cache"}, "group": "llm", "install_group": "torch", "install_variant": "cuda", "run_name": "dev", "enabled": true, "capabilities": {"nodes": 1}, "max_duration": 800, "voir": {"options": {"stop": 30, "interval": "1s"}}, "validation": {"usage": {"gpu_load_threshold": 0.5, "gpu_mem_threshold": 0.5}}, "config_base": "$SRC/milabench/config", "config_file": "$SRC/milabench/config/standard.yaml", "definition": "$SRC/milabench/benchmarks/llama", "tags": ["inference", "llm", "monogpu", "nlp", "nobatch"], "plan": {"method": "per_gpu"}, "weight": 1.0, "name": "llama", "tag": ["llama"]}'
-
-echo "---"
-echo "llama"
-echo "====="
-time (
-    CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache &
-    CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache &
-    CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache &
-    CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache &
-    CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache &
-    CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache &
-    CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache &
-    CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache &
-    wait
-)
-
-echo "---"
-echo "fp16"
-echo "===="
-time (
-    CUDA_VISIBLE_DEVICES=0 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 &
-    CUDA_VISIBLE_DEVICES=1 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 &
-    CUDA_VISIBLE_DEVICES=2 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 &
-    CUDA_VISIBLE_DEVICES=3 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 &
-    CUDA_VISIBLE_DEVICES=4 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 &
-    CUDA_VISIBLE_DEVICES=5 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 &
-    CUDA_VISIBLE_DEVICES=6 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 &
-    CUDA_VISIBLE_DEVICES=7 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 &
-    wait
-)
-
-echo "---"
-echo "bf16"
-echo "===="
-time (
-    CUDA_VISIBLE_DEVICES=0 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 &
-    CUDA_VISIBLE_DEVICES=1 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 &
-    CUDA_VISIBLE_DEVICES=2 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 &
-    CUDA_VISIBLE_DEVICES=3 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 &
-    CUDA_VISIBLE_DEVICES=4 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 &
-    CUDA_VISIBLE_DEVICES=5 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 &
-    CUDA_VISIBLE_DEVICES=6 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 &
-    CUDA_VISIBLE_DEVICES=7 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 &
-    wait
-)
-
-echo "---"
-echo "tf32"
-echo "===="
-time (
-    CUDA_VISIBLE_DEVICES=0 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 &
-    CUDA_VISIBLE_DEVICES=1 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 &
-    CUDA_VISIBLE_DEVICES=2 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 &
-    CUDA_VISIBLE_DEVICES=3 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 &
-    CUDA_VISIBLE_DEVICES=4 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 &
-    CUDA_VISIBLE_DEVICES=5 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 &
-    CUDA_VISIBLE_DEVICES=6 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 &
-    CUDA_VISIBLE_DEVICES=7 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 &
-    wait
-)
-
-echo "---"
-echo "fp32"
-echo "===="
-time (
-    CUDA_VISIBLE_DEVICES=0 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 &
-    CUDA_VISIBLE_DEVICES=1 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 &
-    CUDA_VISIBLE_DEVICES=2 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 &
-    CUDA_VISIBLE_DEVICES=3 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 &
-    CUDA_VISIBLE_DEVICES=4 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 &
-    CUDA_VISIBLE_DEVICES=5 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 &
-    CUDA_VISIBLE_DEVICES=6 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 &
-    CUDA_VISIBLE_DEVICES=7 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 &
-    wait
-)
-
-echo "---"
-echo "resnet50"
-echo "========"
-time (
-    CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 &
-    CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 &
-    CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 &
-    CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 &
-    CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 &
-    CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 &
-    CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 &
-    CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 &
-    wait
-)
-
-echo "---"
-echo "resnet50-noio"
-echo "============="
-time (
-    CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 &
-    CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 &
-    CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 &
-    CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 &
-    CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 &
-    CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 &
-    CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 &
-    CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 &
-    wait
-)
-
-echo "---"
-echo "resnet152-ddp-gpus"
-echo "=================="
-time (
-    $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/torchvision_ddp/main.py --epochs 10 --num-workers 8 --loader torch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 &
-    wait
-)
-
-echo "---"
-echo "convnext_large-fp32"
-echo "==================="
-time (
-    CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 &
-    CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 &
-    CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 &
-    CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 &
-    CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 &
-    CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 &
-    CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 &
-    CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 &
-    wait
-)
-
-echo "---"
-echo "convnext_large-fp16"
-echo "==================="
-time (
-    CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 &
-    CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 &
-    CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 &
-    CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128
& - CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & - CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & - CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & - CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & - wait -) - -echo "---" -echo "convnext_large-tf32" -echo "===================" -time ( - CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & - CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & - CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & - CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & - CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & - CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & - CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & - CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & - wait -) - -echo "---" -echo "convnext_large-tf32-fp16" -echo "========================" -time ( - CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & - CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & - CUDA_VISIBLE_DEVICES=2 python 
$SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & - CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & - CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & - CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & - CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & - CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & - wait -) - -echo "---" -echo "regnet_y_128gf" -echo "==============" -time ( - CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 & - CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 & - CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 & - CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 & - CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 & - CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 & - CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 & - CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 & - wait -) - -echo "---" -echo "bert-fp32" -echo "=========" -time ( - CUDA_VISIBLE_DEVICES=0 python -m bench --precision fp32 --num-workers 8 --model Bert 
--batch-size 32 & - CUDA_VISIBLE_DEVICES=1 python -m bench --precision fp32 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=2 python -m bench --precision fp32 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=3 python -m bench --precision fp32 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=4 python -m bench --precision fp32 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=5 python -m bench --precision fp32 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=6 python -m bench --precision fp32 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=7 python -m bench --precision fp32 --num-workers 8 --model Bert --batch-size 32 & - wait -) - -echo "---" -echo "bert-fp16" -echo "=========" -time ( - CUDA_VISIBLE_DEVICES=0 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=1 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=2 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=3 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=4 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=5 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=6 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=7 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 & - wait -) - -echo "---" -echo "bert-tf32" -echo "=========" -time ( - CUDA_VISIBLE_DEVICES=0 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=1 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=2 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=3 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=4 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=5 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=6 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=7 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 & - wait -) - -echo "---" -echo "bert-tf32-fp16" -echo "==============" -time ( - CUDA_VISIBLE_DEVICES=0 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=1 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=2 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=3 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=4 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=5 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=6 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 & - CUDA_VISIBLE_DEVICES=7 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 & - wait -) - -echo "---" 
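Every benchmark block in this removed script follows the same fan-out pattern: pin one copy of the workload to each GPU via CUDA_VISIBLE_DEVICES, launch it in the background, then wait for the slowest process before the timing ends. As a minimal sketch (assuming an 8-GPU node, and reusing the same invocation as the unrolled t5 block below), the pattern is:

    # One benchmark process per GPU; `wait` blocks until all of them exit,
    # so `time` reports the duration of the slowest device.
    time (
        for gpu in 0 1 2 3 4 5 6 7; do
            CUDA_VISIBLE_DEVICES=$gpu python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 &
        done
        wait
    )

The script unrolls this loop for every entry, which keeps each device's command editable in isolation at the cost of repetition.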
-echo "t5" -echo "==" -time ( - CUDA_VISIBLE_DEVICES=0 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 & - CUDA_VISIBLE_DEVICES=1 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 & - CUDA_VISIBLE_DEVICES=2 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 & - CUDA_VISIBLE_DEVICES=3 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 & - CUDA_VISIBLE_DEVICES=4 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 & - CUDA_VISIBLE_DEVICES=5 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 & - CUDA_VISIBLE_DEVICES=6 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 & - CUDA_VISIBLE_DEVICES=7 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 & - wait -) - -echo "---" -echo "reformer" -echo "========" -time ( - CUDA_VISIBLE_DEVICES=0 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 64 & - CUDA_VISIBLE_DEVICES=1 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 64 & - CUDA_VISIBLE_DEVICES=2 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 64 & - CUDA_VISIBLE_DEVICES=3 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 64 & - CUDA_VISIBLE_DEVICES=4 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 64 & - CUDA_VISIBLE_DEVICES=5 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 64 & - CUDA_VISIBLE_DEVICES=6 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 64 & - CUDA_VISIBLE_DEVICES=7 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 64 & - wait -) - -echo "---" -echo "whisper" -echo "=======" -time ( - CUDA_VISIBLE_DEVICES=0 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 & - CUDA_VISIBLE_DEVICES=1 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 & - CUDA_VISIBLE_DEVICES=2 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 & - CUDA_VISIBLE_DEVICES=3 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 & - CUDA_VISIBLE_DEVICES=4 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 & - CUDA_VISIBLE_DEVICES=5 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 & - CUDA_VISIBLE_DEVICES=6 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 & - CUDA_VISIBLE_DEVICES=7 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 & - wait -) - -echo "---" -echo "focalnet" -echo "========" -time ( - CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D0 --checkpoint-hist 1 & - CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D1 --checkpoint-hist 1 & - CUDA_VISIBLE_DEVICES=2 python 
$SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D2 --checkpoint-hist 1 & - CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D3 --checkpoint-hist 1 & - CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D4 --checkpoint-hist 1 & - CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D5 --checkpoint-hist 1 & - CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D6 --checkpoint-hist 1 & - CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D7 --checkpoint-hist 1 & - wait -) - -echo "---" -echo "brax" -echo "====" -time ( - python $SRC/milabench/benchmarks/brax/main.py --episode-length 20 --batch-size 1024 --num-minibatches 32 --num-envs 8192 & - wait -) - -echo "---" -echo "diffusion-single" -echo "================" -time ( - CUDA_VISIBLE_DEVICES=0 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache & - CUDA_VISIBLE_DEVICES=1 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache & - CUDA_VISIBLE_DEVICES=2 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache & - CUDA_VISIBLE_DEVICES=3 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache & - CUDA_VISIBLE_DEVICES=4 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache & - CUDA_VISIBLE_DEVICES=5 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache & - CUDA_VISIBLE_DEVICES=6 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache & - CUDA_VISIBLE_DEVICES=7 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache & - wait -) - -echo "---" -echo "diffusion-gpus" -echo "==============" -time ( - $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache accelerate launch --mixed_precision=bf16 --dynamo_backend=no --machine_rank=0 --num_machines=1 --multi_gpu --gradient_accumulation_steps=1 --num_cpu_threads_per_process=4 --main_process_ip=127.0.0.1 --main_process_port=29400 --num_processes=8 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 
--batch_size 32 --num_workers 8 --cache $BASE/cache & - wait -) - -echo "---" -echo "diffusion-nodes" -echo "===============" -time ( - $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache accelerate launch --mixed_precision=bf16 --dynamo_backend=no --machine_rank=0 --num_machines=2 --multi_gpu --gradient_accumulation_steps=1 --num_cpu_threads_per_process=4 --main_process_ip=127.0.0.1 --main_process_port=29400 --num_processes=16 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache & - ssh -oCheckHostIP=no -oStrictHostKeyChecking=no -oPasswordAuthentication=no -oPasswordAuthentication=no -p 22 username@192.168.0.11 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache accelerate launch --mixed_precision=bf16 --dynamo_backend=no --machine_rank=1 --num_machines=2 --multi_gpu --gradient_accumulation_steps=1 --num_cpu_threads_per_process=4 --main_process_ip=127.0.0.1 --main_process_port=29400 --num_processes=16 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache & - wait -) - -echo "---" -echo "lightning" -echo "=========" -time ( - CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 & - CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 & - CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 & - CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 & - CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 & - CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 & - CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 & - CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 & - wait -) - -echo "---" -echo "lightning-gpus" -echo "==============" -time ( - $BASE/venv/torch/bin/benchrun --nnodes=1 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --nproc-per-node=8 --no-python -- python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 & - wait -) - -echo "---" -echo "dinov2-giant-single" -echo "===================" -time ( - CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml 
train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 & - CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 & - CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 & - CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 & - CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 & - CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 & - CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 & - CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 & - wait -) - -echo "---" -echo "dinov2-giant-gpus" -echo "=================" -time ( - $BASE/venv/torch/bin/benchrun --nnodes=1 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --nproc-per-node=8 --no-python -- python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-gpus/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 
train.saveckp_freq=100 train.num_workers=10 & - wait -) - -echo "---" -echo "llm-lora-single" -echo "===============" -time ( - CUDA_VISIBLE_DEVICES=0 $SRC/milabench/benchmarks/llm/recipes/lora_finetune_single_device.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-single/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-single/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 & - CUDA_VISIBLE_DEVICES=1 $SRC/milabench/benchmarks/llm/recipes/lora_finetune_single_device.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-single/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-single/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 & - CUDA_VISIBLE_DEVICES=2 $SRC/milabench/benchmarks/llm/recipes/lora_finetune_single_device.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-single/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-single/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 & - CUDA_VISIBLE_DEVICES=3 $SRC/milabench/benchmarks/llm/recipes/lora_finetune_single_device.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-single/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-single/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 & - CUDA_VISIBLE_DEVICES=4 $SRC/milabench/benchmarks/llm/recipes/lora_finetune_single_device.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-single/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-single/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 & - CUDA_VISIBLE_DEVICES=5 $SRC/milabench/benchmarks/llm/recipes/lora_finetune_single_device.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-single/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-single/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 & - CUDA_VISIBLE_DEVICES=6 $SRC/milabench/benchmarks/llm/recipes/lora_finetune_single_device.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 
output_dir=$BASE/extra/llm-lora-single/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-single/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 & - CUDA_VISIBLE_DEVICES=7 $SRC/milabench/benchmarks/llm/recipes/lora_finetune_single_device.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-single/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-single/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 & - wait -) - -echo "---" -echo "llm-lora-ddp-gpus" -echo "=================" -time ( - $BASE/venv/torch/bin/tune run --nnodes=1 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/lora_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-ddp-gpus/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-ddp-gpus/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 & - wait -) - -echo "---" -echo "llm-lora-ddp-nodes" -echo "==================" -time ( - $BASE/venv/torch/bin/tune run --nnodes=2 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --node-rank=0 --local-addr=127.0.0.1 --rdzv-conf=rank=0 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/lora_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-ddp-nodes/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-ddp-nodes/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 & - ssh -oCheckHostIP=no -oStrictHostKeyChecking=no -oPasswordAuthentication=no -oPasswordAuthentication=no -p 22 username@192.168.0.11 $BASE/venv/torch/bin/tune run --nnodes=2 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --node-rank=1 --local-addr=192.168.0.11 --rdzv-conf=rank=1 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/lora_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-ddp-nodes/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-ddp-nodes/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 & - wait -) - -echo "---" -echo "llm-lora-mp-gpus" -echo "================" -time ( - $BASE/venv/torch/bin/tune run --nnodes=1 
--rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/lora_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_70B_lora.yaml epochs=1 output_dir=$BASE/extra/llm-lora-mp-gpus/output tokenizer.path=$BASE/data/llama3_70B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_70B checkpointer.output_dir=$BASE/data/llama3_70B/ safetensors=true metric_logger.log_dir=$BASE/extra/llm-lora-mp-gpus/metrics repo_id="meta-llama/Meta-Llama-3.1-70B" batch_size=8 gradient_accumulation_steps=1 & - wait -) - -echo "---" -echo "llm-full-mp-gpus" -echo "================" -time ( - $BASE/venv/torch/bin/tune run --nnodes=1 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/full_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_70B_full.yaml epochs=1 output_dir=$BASE/extra/llm-full-mp-gpus/output tokenizer.path=$BASE/data/llama3_70B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_70B checkpointer.output_dir=$BASE/data/llama3_70B/ metric_logger.log_dir=$BASE/extra/llm-full-mp-gpus/metrics repo_id="meta-llama/Meta-Llama-3.1-70B" safetensors=true batch_size=2 gradient_accumulation_steps=1 & - wait -) - -echo "---" -echo "llm-full-mp-nodes" -echo "=================" -time ( - $BASE/venv/torch/bin/tune run --nnodes=2 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --node-rank=0 --local-addr=127.0.0.1 --rdzv-conf=rank=0 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/full_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_70B_full.yaml epochs=1 output_dir=$BASE/extra/llm-full-mp-nodes/output tokenizer.path=$BASE/data/llama3_70B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_70B checkpointer.output_dir=$BASE/data/llama3_70B/ metric_logger.log_dir=$BASE/extra/llm-full-mp-nodes/metrics repo_id="meta-llama/Meta-Llama-3.1-70B" safetensors=true batch_size=2 gradient_accumulation_steps=1 & - ssh -oCheckHostIP=no -oStrictHostKeyChecking=no -oPasswordAuthentication=no -oPasswordAuthentication=no -p 22 username@192.168.0.11 $BASE/venv/torch/bin/tune run --nnodes=2 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --node-rank=1 --local-addr=192.168.0.11 --rdzv-conf=rank=1 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/full_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_70B_full.yaml epochs=1 output_dir=$BASE/extra/llm-full-mp-nodes/output tokenizer.path=$BASE/data/llama3_70B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_70B checkpointer.output_dir=$BASE/data/llama3_70B/ metric_logger.log_dir=$BASE/extra/llm-full-mp-nodes/metrics repo_id="meta-llama/Meta-Llama-3.1-70B" safetensors=true batch_size=2 gradient_accumulation_steps=1 & - wait -) - -echo "---" -echo "dqn" -echo "===" -time ( - CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/purejaxrl/main.py dqn --num_envs 128 --buffer_batch_size 128 --env_name CartPole-v1 --training_interval 10 & - CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/purejaxrl/main.py dqn --num_envs 128 --buffer_batch_size 128 --env_name CartPole-v1 --training_interval 10 & - 
CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/purejaxrl/main.py dqn --num_envs 128 --buffer_batch_size 128 --env_name CartPole-v1 --training_interval 10 & - CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/purejaxrl/main.py dqn --num_envs 128 --buffer_batch_size 128 --env_name CartPole-v1 --training_interval 10 & - CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/purejaxrl/main.py dqn --num_envs 128 --buffer_batch_size 128 --env_name CartPole-v1 --training_interval 10 & - CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/purejaxrl/main.py dqn --num_envs 128 --buffer_batch_size 128 --env_name CartPole-v1 --training_interval 10 & - CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/purejaxrl/main.py dqn --num_envs 128 --buffer_batch_size 128 --env_name CartPole-v1 --training_interval 10 & - CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/purejaxrl/main.py dqn --num_envs 128 --buffer_batch_size 128 --env_name CartPole-v1 --training_interval 10 & - wait -) - -echo "---" -echo "ppo" -echo "===" -time ( - CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/purejaxrl/main.py ppo --num_envs 128 --num_steps 10 --num_minibatches 32 --update_epochs 4 --env_name hopper --total_timesteps 200000 & - CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/purejaxrl/main.py ppo --num_envs 128 --num_steps 10 --num_minibatches 32 --update_epochs 4 --env_name hopper --total_timesteps 200000 & - CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/purejaxrl/main.py ppo --num_envs 128 --num_steps 10 --num_minibatches 32 --update_epochs 4 --env_name hopper --total_timesteps 200000 & - CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/purejaxrl/main.py ppo --num_envs 128 --num_steps 10 --num_minibatches 32 --update_epochs 4 --env_name hopper --total_timesteps 200000 & - CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/purejaxrl/main.py ppo --num_envs 128 --num_steps 10 --num_minibatches 32 --update_epochs 4 --env_name hopper --total_timesteps 200000 & - CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/purejaxrl/main.py ppo --num_envs 128 --num_steps 10 --num_minibatches 32 --update_epochs 4 --env_name hopper --total_timesteps 200000 & - CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/purejaxrl/main.py ppo --num_envs 128 --num_steps 10 --num_minibatches 32 --update_epochs 4 --env_name hopper --total_timesteps 200000 & - CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/purejaxrl/main.py ppo --num_envs 128 --num_steps 10 --num_minibatches 32 --update_epochs 4 --env_name hopper --total_timesteps 200000 & - wait -) - -echo "---" -echo "dimenet" -echo "=======" -time ( - CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 10000 --use3d & - CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 10000 --use3d & - CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 10000 --use3d & - CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 10000 --use3d & - CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 10000 --use3d & - CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 10000 --use3d & - CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 10000 --use3d & - CUDA_VISIBLE_DEVICES=7 python 
$SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 10000 --use3d & - wait -) - -echo "---" -echo "recursiongfn" -echo "============" -time ( - CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 & - CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 & - CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 & - CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 & - CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 & - CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 & - CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 & - CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 & - wait -) - -echo "---" -echo "torchatari" -echo "==========" -time ( - CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 & - CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 & - CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 & - CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 & - CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 & - CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 & - CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 & - CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 & - wait -) - -echo "---" -echo "llava-single" -echo "============" -time ( - CUDA_VISIBLE_DEVICES=0 $SRC/milabench/benchmarks/llava/main.py --batch_size 1 --num_workers 4 --gradient_accumulation_steps 1 & - CUDA_VISIBLE_DEVICES=1 $SRC/milabench/benchmarks/llava/main.py --batch_size 1 --num_workers 4 --gradient_accumulation_steps 1 & - CUDA_VISIBLE_DEVICES=2 $SRC/milabench/benchmarks/llava/main.py --batch_size 1 --num_workers 4 
--gradient_accumulation_steps 1 & - CUDA_VISIBLE_DEVICES=3 $SRC/milabench/benchmarks/llava/main.py --batch_size 1 --num_workers 4 --gradient_accumulation_steps 1 & - CUDA_VISIBLE_DEVICES=4 $SRC/milabench/benchmarks/llava/main.py --batch_size 1 --num_workers 4 --gradient_accumulation_steps 1 & - CUDA_VISIBLE_DEVICES=5 $SRC/milabench/benchmarks/llava/main.py --batch_size 1 --num_workers 4 --gradient_accumulation_steps 1 & - CUDA_VISIBLE_DEVICES=6 $SRC/milabench/benchmarks/llava/main.py --batch_size 1 --num_workers 4 --gradient_accumulation_steps 1 & - CUDA_VISIBLE_DEVICES=7 $SRC/milabench/benchmarks/llava/main.py --batch_size 1 --num_workers 4 --gradient_accumulation_steps 1 & - wait -) - -echo "---" -echo "rlhf-single" -echo "===========" -time ( - CUDA_VISIBLE_DEVICES=0 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf-single/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 & - CUDA_VISIBLE_DEVICES=1 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf-single/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 & - CUDA_VISIBLE_DEVICES=2 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf-single/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 & - CUDA_VISIBLE_DEVICES=3 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf-single/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 & - CUDA_VISIBLE_DEVICES=4 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf-single/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 & - CUDA_VISIBLE_DEVICES=5 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf-single/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 & - CUDA_VISIBLE_DEVICES=6 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf-single/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 & - CUDA_VISIBLE_DEVICES=7 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf-single/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 & - wait -) - -echo "---" -echo "rlhf-gpus" -echo "=========" -time ( - $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache accelerate launch --mixed_precision=bf16 --dynamo_backend=no --machine_rank=0 --num_machines=1 --multi_gpu --gradient_accumulation_steps=1 --num_cpu_threads_per_process=4 --main_process_ip=127.0.0.1 --main_process_port=29400 --num_processes=8 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf-gpus/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 & - wait -) - -echo "---" -echo "vjepa-single" -echo "============" -time ( - CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output 
$BASE/extra/vjepa-single & - CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single & - CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single & - CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single & - CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single & - CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single & - CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single & - CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single & - wait -) - -echo "---" -echo "vjepa-gpus" -echo "==========" -time ( - $BASE/venv/torch/bin/benchrun --nnodes=1 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --nproc-per-node=8 --no-python -- python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-gpus & - wait -) - diff --git a/tests/test_capabilities.py b/tests/test_validation/test_capabilities.py similarity index 100% rename from tests/test_capabilities.py rename to tests/test_validation/test_capabilities.py From 9a29f06d689482cb3c1b72d044403d0866f31e7f Mon Sep 17 00:00:00 2001 From: Pierre Delaunay Date: Thu, 21 Nov 2024 17:12:15 -0500 Subject: [PATCH 15/20] undo gaudi2 config for llm --- benchmarks/llm/configs/llama3_70B_full.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/benchmarks/llm/configs/llama3_70B_full.yaml b/benchmarks/llm/configs/llama3_70B_full.yaml index 22b52b79..703eb876 100644 --- a/benchmarks/llm/configs/llama3_70B_full.yaml +++ b/benchmarks/llm/configs/llama3_70B_full.yaml @@ -82,7 +82,7 @@ optimizer: foreach: False # Note: highly recommended to use fused=True optimizer flag # with CPU offload for faster optimizer step. - fused: False + fused: true loss: _component_: torch.nn.CrossEntropyLoss @@ -94,9 +94,9 @@ gradient_accumulation_steps: 1 device: cuda # Memory management -enable_activation_checkpointing: false -memory_efficient_fsdp_wrap: false -fsdp_cpu_offload: false +enable_activation_checkpointing: true +memory_efficient_fsdp_wrap: true +fsdp_cpu_offload: true # Reduced precision dtype: bf16 From 06fa1be26201f04bb2281ecd86e70da7e3522623 Mon Sep 17 00:00:00 2001 From: Setepenre Date: Thu, 21 Nov 2024 19:44:48 -0500 Subject: [PATCH 16/20] Update README.md --- README.md | 152 +++++++++++++++++++++++++++--------------------------- 1 file changed, 76 insertions(+), 76 deletions(-) diff --git a/README.md b/README.md index 163906d0..a2f8ce50 100644 --- a/README.md +++ b/README.md @@ -25,18 +25,18 @@ evaluating current and future hardware in a research environment. 
## Getting Started - - git clone https://github.com/mila-iqia/milabench.git - pip install -e milabench - - export MILABENCH_GPU_ARCH=cuda - - milabench install --base workspace --config milabench/config/standard.yaml --select fp32 + git clone https://github.com/mila-iqia/milabench.git + + pip install -e milabench - milabench prepare --base workspace --config milabench/config/standard.yaml --select fp32 + export MILABENCH_GPU_ARCH=cuda - milabench run --base workspace --config milabench/config/standard.yaml --select fp32 + milabench install --base workspace --config milabench/config/standard.yaml --select fp32 + + milabench prepare --base workspace --config milabench/config/standard.yaml --select fp32 + + milabench run --base workspace --config milabench/config/standard.yaml --select fp32 ## Details @@ -52,70 +52,70 @@ The benchmark suite has been validated on the following configurations: We are working on validating it on more configurations and will update the above table as we do. ## Report - - ================= - Benchmark results - ================= - - System - ------ - cpu: AMD EPYC 7742 64-Core Processor - n_cpu: 128 - product: NVIDIA A100-SXM4-80GB - n_gpu: 8 - memory: 81920.0 - - Breakdown - --------- - bench | fail | n | ngpu | perf | sem% | std% | peak_memory | score | weight - brax | 0 | 1 | 8 | 730035.71 | 0.1% | 0.4% | 2670 | 730035.71 | 1.00 - diffusion-gpus | 0 | 1 | 8 | 117.67 | 1.5% | 11.7% | 59944 | 117.67 | 1.00 - diffusion-single | 0 | 8 | 1 | 25.02 | 0.8% | 17.9% | 53994 | 202.10 | 1.00 - dimenet | 0 | 8 | 1 | 366.85 | 0.7% | 16.2% | 2302 | 2973.32 | 1.00 - dinov2-giant-gpus | 0 | 1 | 8 | 445.68 | 0.4% | 3.0% | 69614 | 445.68 | 1.00 - dinov2-giant-single | 0 | 8 | 1 | 53.54 | 0.4% | 9.5% | 74646 | 432.65 | 1.00 - dqn | 0 | 8 | 1 | 23089954554.91 | 1.1% | 89.9% | 62106 | 184480810548.20 | 1.00 - bf16 | 0 | 8 | 1 | 293.43 | 0.2% | 6.3% | 1788 | 2361.16 | 0.00 - fp16 | 0 | 8 | 1 | 289.26 | 0.1% | 3.6% | 1788 | 2321.65 | 0.00 - fp32 | 0 | 8 | 1 | 19.14 | 0.0% | 0.7% | 2166 | 153.21 | 0.00 - tf32 | 0 | 8 | 1 | 146.63 | 0.1% | 3.6% | 2166 | 1177.04 | 0.00 - bert-fp16 | 0 | 8 | 1 | 263.73 | 1.1% | 16.7% | nan | 2165.37 | 0.00 - bert-fp32 | 0 | 8 | 1 | 44.84 | 0.6% | 9.6% | 21170 | 364.52 | 0.00 - bert-tf32 | 0 | 8 | 1 | 141.95 | 0.9% | 14.1% | 1764 | 1162.94 | 0.00 - bert-tf32-fp16 | 0 | 8 | 1 | 265.04 | 1.0% | 15.6% | nan | 2175.59 | 3.00 - reformer | 0 | 8 | 1 | 62.29 | 0.3% | 6.0% | 25404 | 501.89 | 1.00 - t5 | 0 | 8 | 1 | 51.40 | 0.5% | 9.9% | 34390 | 416.14 | 2.00 - whisper | 0 | 8 | 1 | 481.95 | 1.0% | 21.4% | 8520 | 3897.53 | 1.00 - lightning | 0 | 8 | 1 | 680.22 | 1.0% | 22.7% | 27360 | 5506.90 | 1.00 - lightning-gpus | 0 | 1 | 8 | 3504.74 | 7.9% | 62.9% | 28184 | 3504.74 | 1.00 - llava-single | 1 | 8 | 1 | 2.28 | 0.4% | 9.6% | 72556 | 14.12 | 1.00 - llama | 0 | 8 | 1 | 484.86 | 4.4% | 80.0% | 27820 | 3680.86 | 1.00 - llm-full-mp-gpus | 0 | 1 | 8 | 193.92 | 3.1% | 16.2% | 48470 | 193.92 | 1.00 - llm-lora-ddp-gpus | 0 | 1 | 8 | 16738.58 | 0.4% | 2.0% | 36988 | 16738.58 | 1.00 - llm-lora-mp-gpus | 0 | 1 | 8 | 1980.63 | 2.2% | 11.8% | 55972 | 1980.63 | 1.00 - llm-lora-single | 0 | 8 | 1 | 2724.95 | 0.2% | 3.0% | 49926 | 21861.99 | 1.00 - ppo | 0 | 8 | 1 | 3114264.32 | 1.6% | 57.2% | 62206 | 24915954.98 | 1.00 - recursiongfn | 0 | 8 | 1 | 7080.67 | 1.2% | 27.1% | 10292 | 57038.34 | 1.00 - rlhf-gpus | 0 | 1 | 8 | 6314.94 | 2.1% | 11.2% | 21730 | 6314.94 | 1.00 - rlhf-single | 0 | 8 | 1 | 1143.72 | 0.4% | 8.4% | 19566 | 9174.52 | 1.00 - focalnet | 0 | 8 | 1 | 375.07 | 
0.7% | 14.9% | 23536 | 3038.83 | 2.00
- torchatari | 0 | 8 | 1 | 5848.88 | 0.6% | 12.7% | 3834 | 46613.34 | 1.00
- convnext_large-fp16 | 0 | 8 | 1 | 330.93 | 1.5% | 22.9% | 27376 | 2711.46 | 0.00
- convnext_large-fp32 | 0 | 8 | 1 | 59.49 | 0.6% | 9.8% | 55950 | 483.84 | 0.00
- convnext_large-tf32 | 0 | 8 | 1 | 155.41 | 0.9% | 14.3% | 49650 | 1273.31 | 0.00
- convnext_large-tf32-fp16 | 0 | 8 | 1 | 322.28 | 1.6% | 24.5% | 27376 | 2637.88 | 3.00
- regnet_y_128gf | 0 | 8 | 1 | 119.46 | 0.5% | 10.0% | 29762 | 966.96 | 2.00
- resnet152-ddp-gpus | 0 | 1 | 8 | 3843.06 | 5.2% | 39.3% | 27980 | 3843.06 | 0.00
- resnet50 | 0 | 8 | 1 | 932.95 | 2.4% | 52.2% | 14848 | 7524.25 | 1.00
- resnet50-noio | 0 | 8 | 1 | 1163.88 | 0.3% | 6.7% | 27480 | 9385.35 | 0.00
- vjepa-gpus | 0 | 1 | 8 | 130.13 | 5.9% | 46.8% | 64244 | 130.13 | 1.00
- vjepa-single | 0 | 8 | 1 | 21.29 | 1.0% | 22.4% | 58552 | 172.11 | 1.00
-
- Scores
- ------
- Failure rate: 0.38% (PASS)
- Score: 4175.57
-
- Errors
- ------
- 1 errors, details in HTML report.
\ No newline at end of file
+
+ =================
+ Benchmark results
+ =================
+
+ System
+ ------
+ cpu: AMD EPYC 7742 64-Core Processor
+ n_cpu: 128
+ product: NVIDIA A100-SXM4-80GB
+ n_gpu: 8
+ memory: 81920.0
+
+ Breakdown
+ ---------
+ bench | fail | n | ngpu | perf | sem% | std% | peak_memory | score | weight
+ brax | 0 | 1 | 8 | 730035.71 | 0.1% | 0.4% | 2670 | 730035.71 | 1.00
+ diffusion-gpus | 0 | 1 | 8 | 117.67 | 1.5% | 11.7% | 59944 | 117.67 | 1.00
+ diffusion-single | 0 | 8 | 1 | 25.02 | 0.8% | 17.9% | 53994 | 202.10 | 1.00
+ dimenet | 0 | 8 | 1 | 366.85 | 0.7% | 16.2% | 2302 | 2973.32 | 1.00
+ dinov2-giant-gpus | 0 | 1 | 8 | 445.68 | 0.4% | 3.0% | 69614 | 445.68 | 1.00
+ dinov2-giant-single | 0 | 8 | 1 | 53.54 | 0.4% | 9.5% | 74646 | 432.65 | 1.00
+ dqn | 0 | 8 | 1 | 23089954554.91 | 1.1% | 89.9% | 62106 | 184480810548.20 | 1.00
+ bf16 | 0 | 8 | 1 | 293.43 | 0.2% | 6.3% | 1788 | 2361.16 | 0.00
+ fp16 | 0 | 8 | 1 | 289.26 | 0.1% | 3.6% | 1788 | 2321.65 | 0.00
+ fp32 | 0 | 8 | 1 | 19.14 | 0.0% | 0.7% | 2166 | 153.21 | 0.00
+ tf32 | 0 | 8 | 1 | 146.63 | 0.1% | 3.6% | 2166 | 1177.04 | 0.00
+ bert-fp16 | 0 | 8 | 1 | 263.73 | 1.1% | 16.7% | nan | 2165.37 | 0.00
+ bert-fp32 | 0 | 8 | 1 | 44.84 | 0.6% | 9.6% | 21170 | 364.52 | 0.00
+ bert-tf32 | 0 | 8 | 1 | 141.95 | 0.9% | 14.1% | 1764 | 1162.94 | 0.00
+ bert-tf32-fp16 | 0 | 8 | 1 | 265.04 | 1.0% | 15.6% | nan | 2175.59 | 3.00
+ reformer | 0 | 8 | 1 | 62.29 | 0.3% | 6.0% | 25404 | 501.89 | 1.00
+ t5 | 0 | 8 | 1 | 51.40 | 0.5% | 9.9% | 34390 | 416.14 | 2.00
+ whisper | 0 | 8 | 1 | 481.95 | 1.0% | 21.4% | 8520 | 3897.53 | 1.00
+ lightning | 0 | 8 | 1 | 680.22 | 1.0% | 22.7% | 27360 | 5506.90 | 1.00
+ lightning-gpus | 0 | 1 | 8 | 3504.74 | 7.9% | 62.9% | 28184 | 3504.74 | 1.00
+ llava-single | 1 | 8 | 1 | 2.28 | 0.4% | 9.6% | 72556 | 14.12 | 1.00
+ llama | 0 | 8 | 1 | 484.86 | 4.4% | 80.0% | 27820 | 3680.86 | 1.00
+ llm-full-mp-gpus | 0 | 1 | 8 | 193.92 | 3.1% | 16.2% | 48470 | 193.92 | 1.00
+ llm-lora-ddp-gpus | 0 | 1 | 8 | 16738.58 | 0.4% | 2.0% | 36988 | 16738.58 | 1.00
+ llm-lora-mp-gpus | 0 | 1 | 8 | 1980.63 | 2.2% | 11.8% | 55972 | 1980.63 | 1.00
+ llm-lora-single | 0 | 8 | 1 | 2724.95 | 0.2% | 3.0% | 49926 | 21861.99 | 1.00
+ ppo | 0 | 8 | 1 | 3114264.32 | 1.6% | 57.2% | 62206 | 24915954.98 | 1.00
+ recursiongfn | 0 | 8 | 1 | 7080.67 | 1.2% | 27.1% | 10292 | 57038.34 | 1.00
+ rlhf-gpus | 0 | 1 | 8 | 6314.94 | 2.1% | 11.2% | 21730 | 6314.94 | 1.00
+ rlhf-single | 0 | 8 | 1 | 1143.72 | 0.4% | 8.4% | 19566 | 9174.52 | 1.00
+ focalnet | 0 | 8 | 1 | 375.07 | 0.7% | 14.9% | 23536 | 3038.83 | 2.00
+ torchatari | 0 | 8 | 1 | 5848.88 | 0.6% | 12.7% | 3834 | 46613.34 | 1.00
+ convnext_large-fp16 | 0 | 8 | 1 | 330.93 | 1.5% | 22.9% | 27376 | 2711.46 | 0.00
+ convnext_large-fp32 | 0 | 8 | 1 | 59.49 | 0.6% | 9.8% | 55950 | 483.84 | 0.00
+ convnext_large-tf32 | 0 | 8 | 1 | 155.41 | 0.9% | 14.3% | 49650 | 1273.31 | 0.00
+ convnext_large-tf32-fp16 | 0 | 8 | 1 | 322.28 | 1.6% | 24.5% | 27376 | 2637.88 | 3.00
+ regnet_y_128gf | 0 | 8 | 1 | 119.46 | 0.5% | 10.0% | 29762 | 966.96 | 2.00
+ resnet152-ddp-gpus | 0 | 1 | 8 | 3843.06 | 5.2% | 39.3% | 27980 | 3843.06 | 0.00
+ resnet50 | 0 | 8 | 1 | 932.95 | 2.4% | 52.2% | 14848 | 7524.25 | 1.00
+ resnet50-noio | 0 | 8 | 1 | 1163.88 | 0.3% | 6.7% | 27480 | 9385.35 | 0.00
+ vjepa-gpus | 0 | 1 | 8 | 130.13 | 5.9% | 46.8% | 64244 | 130.13 | 1.00
+ vjepa-single | 0 | 8 | 1 | 21.29 | 1.0% | 22.4% | 58552 | 172.11 | 1.00
+
+ Scores
+ ------
+ Failure rate: 0.38% (PASS)
+ Score: 4175.57
+
+ Errors
+ ------
+ 1 errors, details in HTML report.

From ab6b4129e3b957e51de5dbf6647b2f8d10cfb5d7 Mon Sep 17 00:00:00 2001
From: Setepenre
Date: Thu, 21 Nov 2024 20:59:32 -0500
Subject: [PATCH 17/20] Add missing tags to tests config (#312)

* Add missing tags to tests config
* Add revision
* Add regression files
* update test_memory_tracking

---------

Co-authored-by: Pierre Delaunay
---
 benchmarks/llava/main.py | 6 +-
 benchmarks/llava/prepare.py | 6 +-
 milabench/sizer.py | 9 +-
 milabench/system.py | 5 +
 tests/config/argerror.yaml | 2 +
 tests/config/benchio.yaml | 4 +-
 tests/config/benchio_bad.yaml | 3 +-
 tests/config/scaling.yaml | 2 +
 .../test_command_reg_one_node.txt | 619 ++++++++++++++++
 .../test_command_reg_two_nodes.txt | 676 ++++++++++++++++++
 tests/test_scaler.py | 41 +-
 tests/test_summary/test_compare.txt | 4 +-
 tests/test_summary/test_report.txt | 2 +-
 .../test_report_folder_does_average.txt | 2 +-
 tests/test_validation.py | 71 +-
 15 files changed, 1388 insertions(+), 64 deletions(-)
 create mode 100644 tests/test_command_reg/test_command_reg_one_node.txt
 create mode 100644 tests/test_command_reg/test_command_reg_two_nodes.txt

diff --git a/benchmarks/llava/main.py b/benchmarks/llava/main.py
index 233ae2eb..6c49b04a 100755
--- a/benchmarks/llava/main.py
+++ b/benchmarks/llava/main.py
@@ -62,8 +62,12 @@ def main():
         "llava-hf/llava-1.5-7b-hf",
         torch_dtype=torch.bfloat16,
         device_map=compat.device_type,
+        revision="a272c74b2481d8aff3aa6fc2c4bf891fe57334fb"
+    )
+    processor = AutoProcessor.from_pretrained(
+        "llava-hf/llava-1.5-7b-hf",
+        revision="a272c74b2481d8aff3aa6fc2c4bf891fe57334fb"
     )
-    processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf")

     # Load dataset and create DataLoader
     dataset = load_dataset("HuggingFaceM4/the_cauldron", "aokvqa")["train"]
diff --git a/benchmarks/llava/prepare.py b/benchmarks/llava/prepare.py
index afa480b8..5e8b018f 100755
--- a/benchmarks/llava/prepare.py
+++ b/benchmarks/llava/prepare.py
@@ -11,8 +11,12 @@ def main():
         "llava-hf/llava-1.5-7b-hf",
         torch_dtype=torch.float32, # Change to float32
         device_map="auto",
+        revision="a272c74b2481d8aff3aa6fc2c4bf891fe57334fb"
+    )
+    _ = AutoProcessor.from_pretrained(
+        "llava-hf/llava-1.5-7b-hf",
+        revision="a272c74b2481d8aff3aa6fc2c4bf891fe57334fb"
     )
-    _ = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf")

     # Load dataset and create DataLoader
     _ = load_dataset("HuggingFaceM4/the_cauldron", "aokvqa")["train"]
diff --git a/milabench/sizer.py b/milabench/sizer.py
index 00d6d2b6..4bd62bc7 100644
--- a/milabench/sizer.py
+++ b/milabench/sizer.py
@@ -53,9 +53,10 @@ def to_octet(value: str) -> float:
 class Sizer:
     """Automatically scale the batch size to match GPU spec"""

-    def __init__(self, scaling_config=None):
+    def __init__(self, sizer=None, scaling_config=option("sizer.config", etype=str)):
         self.path = scaling_config
-
+        self.sizer_override = sizer
+
         if scaling_config is None:
             scaling_config = default_scaling_config

@@ -64,6 +65,8 @@ def __init__(self, scaling_config=None):

     @property
     def options(self):
+        if self.sizer_override:
+            return self.sizer_override
         return SizerOptions()

     def benchscaling(self, benchmark):
@@ -252,7 +255,7 @@ class MemoryUsageExtractor(ValidationLayer):
     def __init__(self):
         self.filepath = option("sizer.save", str, None)

-        sizer = batch_sizer()
+        sizer = Sizer()
         self.memory = deepcopy(sizer.scaling_config)
         self.scaling = None
         self.benchname = None
diff --git a/milabench/system.py b/milabench/system.py
index bd8298c6..691d06bd 100644
--- a/milabench/system.py
+++ b/milabench/system.py
@@ -121,6 +121,11 @@ def apply_system(config: dict):
     system = system_global.get()
     old = deepcopy(system)

+    if system is None:
+        system = dict()
+        system_global.set(system)
+        system = system_global.get()
+
     for k, v in config.items():
         frags = k.split(".")

diff --git a/tests/config/argerror.yaml b/tests/config/argerror.yaml
index 49ad733c..59041b72 100644
--- a/tests/config/argerror.yaml
+++ b/tests/config/argerror.yaml
@@ -9,3 +9,5 @@ benchio:
     n: 1
   argv:
     --start: 0
+  tags:
+    - monogpu
\ No newline at end of file
diff --git a/tests/config/benchio.yaml b/tests/config/benchio.yaml
index f2c694e2..50c352ca 100644
--- a/tests/config/benchio.yaml
+++ b/tests/config/benchio.yaml
@@ -4,4 +4,6 @@ benchio:
   weight: 2
   plan:
     method: njobs
-    n: 2
\ No newline at end of file
+    n: 2
+  tags:
+    - monogpu
\ No newline at end of file
diff --git a/tests/config/benchio_bad.yaml b/tests/config/benchio_bad.yaml
index ac0b2f82..51b15ac4 100644
--- a/tests/config/benchio_bad.yaml
+++ b/tests/config/benchio_bad.yaml
@@ -8,4 +8,5 @@ benchio:
   argv:
     --bad: true

-
+  tags:
+    - monogpu
\ No newline at end of file
diff --git a/tests/config/scaling.yaml b/tests/config/scaling.yaml
index 664996f7..3f3b032e 100644
--- a/tests/config/scaling.yaml
+++ b/tests/config/scaling.yaml
@@ -5,3 +5,5 @@ benchio:
     64: 12Go
     128: 24Go
     256: 48Go
+  tags:
+    - monogpu
\ No newline at end of file
diff --git a/tests/test_command_reg/test_command_reg_one_node.txt b/tests/test_command_reg/test_command_reg_one_node.txt
new file mode 100644
index 00000000..af21f4cd
--- /dev/null
+++ b/tests/test_command_reg/test_command_reg_one_node.txt
@@ -0,0 +1,619 @@
+#!/bin/sh
+
+echo "---"
+echo "Virtual Env"
+echo "==========="
+export VIRTUAL_ENV=$BASE/venv/torch
+
+source $VIRTUAL_ENV/bin/activate
+echo "---"
+echo "Milabench"
+echo "========="
+export MILABENCH_DIR_BASE=$BASE
+export MILABENCH_DIR_VENV=$BASE/venv/torch
+export MILABENCH_DIR_DATA=$BASE/data
+export MILABENCH_DIR_RUNS=$BASE/runs
+export MILABENCH_DIR_EXTRA=$BASE/extra/llm
+export MILABENCH_DIR_CACHE=$BASE/cache
+export OMP_NUM_THREADS=0
+export MILABENCH_CONFIG='{"system": {"arch": "cuda", "sshkey": null, "nodes": [{"ip": "127.0.0.1", "main": true, "name": "0", "sshport": 22, "user": "username", "hostname": "127.0.0.1"}], "self": {"ip": "127.0.0.1", "main": true, "name": "0", "sshport": 22, "user": "username", "hostname": "127.0.0.1"}}, "dirs": {"base": "$BASE", "venv": "$BASE/venv/torch", "data": "$BASE/data", "runs": "$BASE/runs", "extra": "$BASE/extra/llm", "cache": "$BASE/cache"},
"group": "llm", "install_group": "torch", "install_variant": "cuda", "run_name": "dev", "enabled": true, "capabilities": {"nodes": 1}, "max_duration": 3600, "voir": {"options": {"stop": 30, "interval": "1s"}}, "validation": {"usage": {"gpu_load_threshold": 0.5, "gpu_mem_threshold": 0.5}}, "config_base": "$SRC/milabench/config", "config_file": "$SRC/milabench/config/standard.yaml", "definition": "$SRC/milabench/benchmarks/llama", "tags": ["inference", "llm", "monogpu", "nlp", "nobatch"], "plan": {"method": "per_gpu"}, "weight": 1.0, "name": "llama", "tag": ["llama"]}' + +echo "---" +echo "llama" +echo "=====" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache & + wait +) + +echo "---" +echo "fp16" +echo "====" +time ( + CUDA_VISIBLE_DEVICES=0 $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + CUDA_VISIBLE_DEVICES=1 $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + CUDA_VISIBLE_DEVICES=2 $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + CUDA_VISIBLE_DEVICES=3 $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + CUDA_VISIBLE_DEVICES=4 $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + CUDA_VISIBLE_DEVICES=5 $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + CUDA_VISIBLE_DEVICES=6 $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + CUDA_VISIBLE_DEVICES=7 $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + wait +) + +echo "---" +echo "bf16" +echo "====" +time ( + CUDA_VISIBLE_DEVICES=0 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & + CUDA_VISIBLE_DEVICES=1 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & + CUDA_VISIBLE_DEVICES=2 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & + CUDA_VISIBLE_DEVICES=3 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & + CUDA_VISIBLE_DEVICES=4 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & + CUDA_VISIBLE_DEVICES=5 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & + CUDA_VISIBLE_DEVICES=6 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & + CUDA_VISIBLE_DEVICES=7 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & + wait +) + +echo "---" +echo "tf32" +echo "====" +time ( + CUDA_VISIBLE_DEVICES=0 
$SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & + CUDA_VISIBLE_DEVICES=1 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & + CUDA_VISIBLE_DEVICES=2 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & + CUDA_VISIBLE_DEVICES=3 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & + CUDA_VISIBLE_DEVICES=4 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & + CUDA_VISIBLE_DEVICES=5 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & + CUDA_VISIBLE_DEVICES=6 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & + CUDA_VISIBLE_DEVICES=7 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & + wait +) + +echo "---" +echo "fp32" +echo "====" +time ( + CUDA_VISIBLE_DEVICES=0 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & + CUDA_VISIBLE_DEVICES=1 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & + CUDA_VISIBLE_DEVICES=2 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & + CUDA_VISIBLE_DEVICES=3 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & + CUDA_VISIBLE_DEVICES=4 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & + CUDA_VISIBLE_DEVICES=5 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & + CUDA_VISIBLE_DEVICES=6 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & + CUDA_VISIBLE_DEVICES=7 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & + wait +) + +echo "---" +echo "resnet50" +echo "========" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=6 python 
$SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + wait +) + +echo "---" +echo "resnet50-noio" +echo "=============" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + wait +) + +echo "---" +echo "resnet152-ddp-gpus" +echo "==================" +time ( + $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/torchvision_ddp/main.py --epochs 10 --num-workers 8 --loader torch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 & + wait +) + +echo "---" +echo "convnext_large-fp32" +echo "===================" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large 
--batch-size 128 & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + wait +) + +echo "---" +echo "convnext_large-fp16" +echo "===================" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + wait +) + +echo "---" +echo "convnext_large-tf32" +echo "===================" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=1 python 
$SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + wait +) + +echo "---" +echo "convnext_large-tf32-fp16" +echo "========================" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader 
pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + wait +) + +echo "---" +echo "regnet_y_128gf" +echo "==============" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 & + wait +) + +echo "---" +echo "bert-fp32" +echo "=========" +time ( + CUDA_VISIBLE_DEVICES=0 python -m bench --precision fp32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=1 python -m bench --precision fp32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=2 python -m bench --precision fp32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=3 python -m bench --precision fp32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=4 python -m bench --precision fp32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=5 python -m bench --precision fp32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=6 python -m bench --precision fp32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=7 python -m bench --precision fp32 --num-workers 8 --model Bert --batch-size 32 & + wait +) + +echo "---" +echo "bert-fp16" +echo "=========" +time ( + CUDA_VISIBLE_DEVICES=0 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=1 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=2 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=3 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=4 python -m bench --precision fp16 --num-workers 8 --model Bert 
--batch-size 32 & + CUDA_VISIBLE_DEVICES=5 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=6 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=7 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 & + wait +) + +echo "---" +echo "bert-tf32" +echo "=========" +time ( + CUDA_VISIBLE_DEVICES=0 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=1 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=2 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=3 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=4 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=5 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=6 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=7 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 & + wait +) + +echo "---" +echo "bert-tf32-fp16" +echo "==============" +time ( + CUDA_VISIBLE_DEVICES=0 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=1 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=2 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=3 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=4 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=5 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=6 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=7 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 & + wait +) + +echo "---" +echo "t5" +echo "==" +time ( + CUDA_VISIBLE_DEVICES=0 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 & + CUDA_VISIBLE_DEVICES=1 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 & + CUDA_VISIBLE_DEVICES=2 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 & + CUDA_VISIBLE_DEVICES=3 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 & + CUDA_VISIBLE_DEVICES=4 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 & + CUDA_VISIBLE_DEVICES=5 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 & + CUDA_VISIBLE_DEVICES=6 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 & + CUDA_VISIBLE_DEVICES=7 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 & + wait +) + +echo "---" +echo "reformer" +echo "========" +time ( + CUDA_VISIBLE_DEVICES=0 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 32 & + CUDA_VISIBLE_DEVICES=1 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 32 & + CUDA_VISIBLE_DEVICES=2 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 32 & + CUDA_VISIBLE_DEVICES=3 
python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 32 & + CUDA_VISIBLE_DEVICES=4 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 32 & + CUDA_VISIBLE_DEVICES=5 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 32 & + CUDA_VISIBLE_DEVICES=6 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 32 & + CUDA_VISIBLE_DEVICES=7 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 32 & + wait +) + +echo "---" +echo "whisper" +echo "=======" +time ( + CUDA_VISIBLE_DEVICES=0 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 & + CUDA_VISIBLE_DEVICES=1 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 & + CUDA_VISIBLE_DEVICES=2 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 & + CUDA_VISIBLE_DEVICES=3 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 & + CUDA_VISIBLE_DEVICES=4 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 & + CUDA_VISIBLE_DEVICES=5 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 & + CUDA_VISIBLE_DEVICES=6 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 & + CUDA_VISIBLE_DEVICES=7 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 & + wait +) + +echo "---" +echo "focalnet" +echo "========" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D0 --checkpoint-hist 1 & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D1 --checkpoint-hist 1 & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D2 --checkpoint-hist 1 & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D3 --checkpoint-hist 1 & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D4 --checkpoint-hist 1 & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D5 --checkpoint-hist 1 & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model 
focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D6 --checkpoint-hist 1 & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D7 --checkpoint-hist 1 & + wait +) + +echo "---" +echo "brax" +echo "====" +time ( + python $SRC/milabench/benchmarks/brax/main.py --episode-length 20 --batch-size 1024 --num-minibatches 32 --num-envs 8192 & + wait +) + +echo "---" +echo "diffusion-single" +echo "================" +time ( + CUDA_VISIBLE_DEVICES=0 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache & + CUDA_VISIBLE_DEVICES=1 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache & + CUDA_VISIBLE_DEVICES=2 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache & + CUDA_VISIBLE_DEVICES=3 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache & + CUDA_VISIBLE_DEVICES=4 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache & + CUDA_VISIBLE_DEVICES=5 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache & + CUDA_VISIBLE_DEVICES=6 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache & + CUDA_VISIBLE_DEVICES=7 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache & + wait +) + +echo "---" +echo "diffusion-gpus" +echo "==============" +time ( + $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache accelerate launch --mixed_precision=bf16 --dynamo_backend=no --machine_rank=0 --num_machines=1 --multi_gpu --gradient_accumulation_steps=1 --num_cpu_threads_per_process=4 --main_process_ip=127.0.0.1 --main_process_port=29400 --num_processes=8 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache & + wait +) + +echo "---" +echo "diffusion-nodes" +echo "===============" +time ( + $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache accelerate launch --mixed_precision=bf16 --dynamo_backend=no --machine_rank=0 --num_machines=1 --multi_gpu --gradient_accumulation_steps=1 --num_cpu_threads_per_process=4 --main_process_ip=127.0.0.1 --main_process_port=29400 --num_processes=8 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache & + wait +) + +echo "---" +echo "lightning" +echo "=========" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet 
--model resnet152 --batch-size 256 & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 & + wait +) + +echo "---" +echo "lightning-gpus" +echo "==============" +time ( + $BASE/venv/torch/bin/benchrun --nnodes=1 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --nproc-per-node=8 --no-python -- python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 & + wait +) + +echo "---" +echo "dinov2-giant-single" +echo "===================" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml 
train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 & + wait +) + +echo "---" +echo "dinov2-giant-gpus" +echo "=================" +time ( + $BASE/venv/torch/bin/benchrun --nnodes=1 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --nproc-per-node=8 --no-python -- python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-gpus/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 & + wait +) + +echo "---" +echo "llm-lora-single" +echo "===============" +time ( + CUDA_VISIBLE_DEVICES=0 $SRC/milabench/benchmarks/llm/recipes/lora_finetune_single_device.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-single/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-single/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 device=cuda & + CUDA_VISIBLE_DEVICES=1 $SRC/milabench/benchmarks/llm/recipes/lora_finetune_single_device.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-single/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-single/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 device=cuda & + CUDA_VISIBLE_DEVICES=2 $SRC/milabench/benchmarks/llm/recipes/lora_finetune_single_device.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-single/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-single/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 device=cuda & + CUDA_VISIBLE_DEVICES=3 $SRC/milabench/benchmarks/llm/recipes/lora_finetune_single_device.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 
output_dir=$BASE/extra/llm-lora-single/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-single/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 device=cuda & + CUDA_VISIBLE_DEVICES=4 $SRC/milabench/benchmarks/llm/recipes/lora_finetune_single_device.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-single/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-single/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 device=cuda & + CUDA_VISIBLE_DEVICES=5 $SRC/milabench/benchmarks/llm/recipes/lora_finetune_single_device.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-single/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-single/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 device=cuda & + CUDA_VISIBLE_DEVICES=6 $SRC/milabench/benchmarks/llm/recipes/lora_finetune_single_device.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-single/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-single/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 device=cuda & + CUDA_VISIBLE_DEVICES=7 $SRC/milabench/benchmarks/llm/recipes/lora_finetune_single_device.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-single/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-single/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 device=cuda & + wait +) + +echo "---" +echo "llm-lora-ddp-gpus" +echo "=================" +time ( + $BASE/venv/torch/bin/tune run --nnodes=1 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/lora_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-ddp-gpus/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-ddp-gpus/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 device=cuda & + wait +) + +echo "---" +echo "llm-lora-ddp-nodes" +echo "==================" +time ( + $BASE/venv/torch/bin/tune run --nnodes=1 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 
--master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/lora_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-ddp-nodes/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-ddp-nodes/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 device=cuda & + wait +) + +echo "---" +echo "llm-lora-mp-gpus" +echo "================" +time ( + $BASE/venv/torch/bin/tune run --nnodes=1 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/lora_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_70B_lora.yaml epochs=1 output_dir=$BASE/extra/llm-lora-mp-gpus/output tokenizer.path=$BASE/data/llama3_70B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_70B checkpointer.output_dir=$BASE/data/llama3_70B/ safetensors=true metric_logger.log_dir=$BASE/extra/llm-lora-mp-gpus/metrics repo_id="meta-llama/Meta-Llama-3.1-70B" batch_size=8 gradient_accumulation_steps=1 device=cuda & + wait +) + +echo "---" +echo "llm-full-mp-gpus" +echo "================" +time ( + $BASE/venv/torch/bin/tune run --nnodes=1 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/full_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_70B_full.yaml epochs=1 output_dir=$BASE/extra/llm-full-mp-gpus/output tokenizer.path=$BASE/data/llama3_70B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_70B checkpointer.output_dir=$BASE/data/llama3_70B/ metric_logger.log_dir=$BASE/extra/llm-full-mp-gpus/metrics repo_id="meta-llama/Meta-Llama-3.1-70B" safetensors=true batch_size=2 gradient_accumulation_steps=1 device=cuda & + wait +) + +echo "---" +echo "llm-full-mp-nodes" +echo "=================" +time ( + $BASE/venv/torch/bin/tune run --nnodes=1 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/full_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_70B_full.yaml epochs=1 output_dir=$BASE/extra/llm-full-mp-nodes/output tokenizer.path=$BASE/data/llama3_70B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_70B checkpointer.output_dir=$BASE/data/llama3_70B/ metric_logger.log_dir=$BASE/extra/llm-full-mp-nodes/metrics repo_id="meta-llama/Meta-Llama-3.1-70B" safetensors=true batch_size=2 gradient_accumulation_steps=1 device=cuda & + wait +) + +echo "---" +echo "dqn" +echo "===" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/purejaxrl/main.py dqn --num_envs 128 --buffer_size 131072 --buffer_batch_size 65536 --env_name CartPole-v1 --training_interval 10 & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/purejaxrl/main.py dqn --num_envs 128 --buffer_size 131072 --buffer_batch_size 65536 --env_name CartPole-v1 --training_interval 10 & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/purejaxrl/main.py dqn --num_envs 128 --buffer_size 131072 
--buffer_batch_size 65536 --env_name CartPole-v1 --training_interval 10 &
+ CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/purejaxrl/main.py dqn --num_envs 128 --buffer_size 131072 --buffer_batch_size 65536 --env_name CartPole-v1 --training_interval 10 &
+ CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/purejaxrl/main.py dqn --num_envs 128 --buffer_size 131072 --buffer_batch_size 65536 --env_name CartPole-v1 --training_interval 10 &
+ CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/purejaxrl/main.py dqn --num_envs 128 --buffer_size 131072 --buffer_batch_size 65536 --env_name CartPole-v1 --training_interval 10 &
+ CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/purejaxrl/main.py dqn --num_envs 128 --buffer_size 131072 --buffer_batch_size 65536 --env_name CartPole-v1 --training_interval 10 &
+ CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/purejaxrl/main.py dqn --num_envs 128 --buffer_size 131072 --buffer_batch_size 65536 --env_name CartPole-v1 --training_interval 10 &
+ wait
+)
+
+echo "---"
+echo "ppo"
+echo "==="
+time (
+ CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/purejaxrl/main.py ppo --num_envs 128 --num_steps 10 --num_minibatches 32 --update_epochs 4 --env_name hopper --total_timesteps 2000000 &
+ CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/purejaxrl/main.py ppo --num_envs 128 --num_steps 10 --num_minibatches 32 --update_epochs 4 --env_name hopper --total_timesteps 2000000 &
+ CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/purejaxrl/main.py ppo --num_envs 128 --num_steps 10 --num_minibatches 32 --update_epochs 4 --env_name hopper --total_timesteps 2000000 &
+ CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/purejaxrl/main.py ppo --num_envs 128 --num_steps 10 --num_minibatches 32 --update_epochs 4 --env_name hopper --total_timesteps 2000000 &
+ CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/purejaxrl/main.py ppo --num_envs 128 --num_steps 10 --num_minibatches 32 --update_epochs 4 --env_name hopper --total_timesteps 2000000 &
+ CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/purejaxrl/main.py ppo --num_envs 128 --num_steps 10 --num_minibatches 32 --update_epochs 4 --env_name hopper --total_timesteps 2000000 &
+ CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/purejaxrl/main.py ppo --num_envs 128 --num_steps 10 --num_minibatches 32 --update_epochs 4 --env_name hopper --total_timesteps 2000000 &
+ CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/purejaxrl/main.py ppo --num_envs 128 --num_steps 10 --num_minibatches 32 --update_epochs 4 --env_name hopper --total_timesteps 2000000 &
+ wait
+)
+
+echo "---"
+echo "pna"
+echo "==="
+time (
+ CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/geo_gnn/main.py --model PNA --num-samples 100000 --batch-size 4096 --num-workers 0 &
+ CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/geo_gnn/main.py --model PNA --num-samples 100000 --batch-size 4096 --num-workers 0 &
+ CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/geo_gnn/main.py --model PNA --num-samples 100000 --batch-size 4096 --num-workers 0 &
+ CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/geo_gnn/main.py --model PNA --num-samples 100000 --batch-size 4096 --num-workers 0 &
+ CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/geo_gnn/main.py --model PNA --num-samples 100000 --batch-size 4096 --num-workers 0 &
+ CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/geo_gnn/main.py --model PNA --num-samples 100000 --batch-size 4096 --num-workers 0 &
+ CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/geo_gnn/main.py --model PNA --num-samples 100000 --batch-size 4096 --num-workers 0 &
+ CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/geo_gnn/main.py --model PNA --num-samples 100000 --batch-size 4096 --num-workers 0 &
+ wait
+)
+
+echo "---"
+echo "dimenet"
+echo "======="
+time (
+ CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 100000 --use3d --batch-size 16 --num-workers 0 &
+ CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 100000 --use3d --batch-size 16 --num-workers 0 &
+ CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 100000 --use3d --batch-size 16 --num-workers 0 &
+ CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 100000 --use3d --batch-size 16 --num-workers 0 &
+ CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 100000 --use3d --batch-size 16 --num-workers 0 &
+ CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 100000 --use3d --batch-size 16 --num-workers 0 &
+ CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 100000 --use3d --batch-size 16 --num-workers 0 &
+ CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 100000 --use3d --batch-size 16 --num-workers 0 &
+ wait
+)
+
+echo "---"
+echo "recursiongfn"
+echo "============"
+time (
+ CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 &
+ CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 &
+ CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 &
+ CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 &
+ CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 &
+ CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 &
+ CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 &
+ CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 &
+ wait
+)
+
+echo "---"
+echo "torchatari"
+echo "=========="
+time (
+ CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 &
+ CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 &
+ CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 &
+ CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 &
+ CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 &
+ CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 &
+ CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 &
+ CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 &
+ wait
+)
+
+echo "---"
+echo "llava-single"
+echo "============"
+time (
+ CUDA_VISIBLE_DEVICES=0 $SRC/milabench/benchmarks/llava/main.py --batch_size 1 --num_workers 4 --gradient_accumulation_steps 1 &
+ CUDA_VISIBLE_DEVICES=1 $SRC/milabench/benchmarks/llava/main.py --batch_size 1 --num_workers 4 --gradient_accumulation_steps 1 &
+ CUDA_VISIBLE_DEVICES=2 $SRC/milabench/benchmarks/llava/main.py --batch_size 1 --num_workers 4 --gradient_accumulation_steps 1 &
+ CUDA_VISIBLE_DEVICES=3 $SRC/milabench/benchmarks/llava/main.py --batch_size 1 --num_workers 4 --gradient_accumulation_steps 1 &
+ CUDA_VISIBLE_DEVICES=4 $SRC/milabench/benchmarks/llava/main.py --batch_size 1 --num_workers 4 --gradient_accumulation_steps 1 &
+ CUDA_VISIBLE_DEVICES=5 $SRC/milabench/benchmarks/llava/main.py --batch_size 1 --num_workers 4 --gradient_accumulation_steps 1 &
+ CUDA_VISIBLE_DEVICES=6 $SRC/milabench/benchmarks/llava/main.py --batch_size 1 --num_workers 4 --gradient_accumulation_steps 1 &
+ CUDA_VISIBLE_DEVICES=7 $SRC/milabench/benchmarks/llava/main.py --batch_size 1 --num_workers 4 --gradient_accumulation_steps 1 &
+ wait
+)
+
+echo "---"
+echo "rlhf-single"
+echo "==========="
+time (
+ CUDA_VISIBLE_DEVICES=0 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf-single/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 &
+ CUDA_VISIBLE_DEVICES=1 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf-single/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 &
+ CUDA_VISIBLE_DEVICES=2 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf-single/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 &
+ CUDA_VISIBLE_DEVICES=3 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf-single/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 &
+ CUDA_VISIBLE_DEVICES=4 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf-single/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 &
+ CUDA_VISIBLE_DEVICES=5 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf-single/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 &
+ CUDA_VISIBLE_DEVICES=6 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf-single/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 &
+ CUDA_VISIBLE_DEVICES=7 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf-single/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 &
+ wait
+)
+
+echo "---"
+echo "rlhf-gpus"
+echo "========="
+time (
+ $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache accelerate launch --mixed_precision=bf16 --dynamo_backend=no --machine_rank=0 --num_machines=1 --multi_gpu --gradient_accumulation_steps=1 --num_cpu_threads_per_process=4 --main_process_ip=127.0.0.1 --main_process_port=29400 --num_processes=8 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf-gpus/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 &
+ wait
+)
+
+echo "---"
+echo "vjepa-single"
+echo "============"
+time (
+ CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single &
+ CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single &
+ CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single &
+ CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single &
+ CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single &
+ CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single &
+ CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single &
+ CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single &
+ wait
+)
+
+echo "---"
+echo "vjepa-gpus"
+echo "=========="
+time (
+ $BASE/venv/torch/bin/benchrun --nnodes=1 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --nproc-per-node=8 --no-python -- python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-gpus &
+ wait
+)
+
diff --git a/tests/test_command_reg/test_command_reg_two_nodes.txt b/tests/test_command_reg/test_command_reg_two_nodes.txt
new file mode 100644
index 00000000..5e516e3f
--- /dev/null
+++ b/tests/test_command_reg/test_command_reg_two_nodes.txt
@@ -0,0 +1,676 @@
+#!/bin/sh
+
+echo "---"
+echo "Virtual Env" +echo "===========" +export VIRTUAL_ENV=$BASE/venv/torch + +source $VIRTUAL_ENV/bin/activate +echo "---" +echo "Milabench" +echo "=========" +export MILABENCH_DIR_BASE=$BASE +export MILABENCH_DIR_VENV=$BASE/venv/torch +export MILABENCH_DIR_DATA=$BASE/data +export MILABENCH_DIR_RUNS=$BASE/runs +export MILABENCH_DIR_EXTRA=$BASE/extra/llm +export MILABENCH_DIR_CACHE=$BASE/cache +export OMP_NUM_THREADS=0 +export MILABENCH_CONFIG='{"system": {"arch": "cuda", "sshkey": null, "nodes": [{"ip": "127.0.0.1", "main": true, "name": "0", "sshport": 22, "user": "username", "hostname": "127.0.0.1"}, {"ip": "192.168.0.11", "main": false, "name": "1", "sshport": 22, "user": "username", "hostname": "192.168.0.11"}], "self": {"ip": "127.0.0.1", "main": true, "name": "0", "sshport": 22, "user": "username", "hostname": "127.0.0.1"}}, "dirs": {"base": "$BASE", "venv": "$BASE/venv/torch", "data": "$BASE/data", "runs": "$BASE/runs", "extra": "$BASE/extra/llm", "cache": "$BASE/cache"}, "group": "llm", "install_group": "torch", "install_variant": "cuda", "run_name": "dev", "enabled": true, "capabilities": {"nodes": 1}, "max_duration": 3600, "voir": {"options": {"stop": 30, "interval": "1s"}}, "validation": {"usage": {"gpu_load_threshold": 0.5, "gpu_mem_threshold": 0.5}}, "config_base": "$SRC/milabench/config", "config_file": "$SRC/milabench/config/standard.yaml", "definition": "$SRC/milabench/benchmarks/llama", "tags": ["inference", "llm", "monogpu", "nlp", "nobatch"], "plan": {"method": "per_gpu"}, "weight": 1.0, "name": "llama", "tag": ["llama"]}' + +echo "---" +echo "llama" +echo "=====" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/llama/main.py --cache $BASE/cache & + wait +) + +echo "---" +echo "fp16" +echo "====" +time ( + CUDA_VISIBLE_DEVICES=0 $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + CUDA_VISIBLE_DEVICES=1 $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + CUDA_VISIBLE_DEVICES=2 $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + CUDA_VISIBLE_DEVICES=3 $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + CUDA_VISIBLE_DEVICES=4 $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + CUDA_VISIBLE_DEVICES=5 $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + CUDA_VISIBLE_DEVICES=6 $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + CUDA_VISIBLE_DEVICES=7 $SRC/milabench/benchmarks/flops/main.py --number 30 --repeat 90 --m 8192 --n 8192 --dtype fp16 & + wait +) + +echo "---" +echo "bf16" +echo "====" +time ( + CUDA_VISIBLE_DEVICES=0 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 
--m 8192 --n 8192 --dtype bf16 & + CUDA_VISIBLE_DEVICES=1 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & + CUDA_VISIBLE_DEVICES=2 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & + CUDA_VISIBLE_DEVICES=3 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & + CUDA_VISIBLE_DEVICES=4 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & + CUDA_VISIBLE_DEVICES=5 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & + CUDA_VISIBLE_DEVICES=6 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & + CUDA_VISIBLE_DEVICES=7 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype bf16 & + wait +) + +echo "---" +echo "tf32" +echo "====" +time ( + CUDA_VISIBLE_DEVICES=0 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & + CUDA_VISIBLE_DEVICES=1 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & + CUDA_VISIBLE_DEVICES=2 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & + CUDA_VISIBLE_DEVICES=3 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & + CUDA_VISIBLE_DEVICES=4 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & + CUDA_VISIBLE_DEVICES=5 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & + CUDA_VISIBLE_DEVICES=6 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & + CUDA_VISIBLE_DEVICES=7 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 --tf32 & + wait +) + +echo "---" +echo "fp32" +echo "====" +time ( + CUDA_VISIBLE_DEVICES=0 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & + CUDA_VISIBLE_DEVICES=1 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & + CUDA_VISIBLE_DEVICES=2 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & + CUDA_VISIBLE_DEVICES=3 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & + CUDA_VISIBLE_DEVICES=4 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & + CUDA_VISIBLE_DEVICES=5 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & + CUDA_VISIBLE_DEVICES=6 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & + CUDA_VISIBLE_DEVICES=7 $SRC/milabench/benchmarks/flops/main.py --number 10 --repeat 90 --m 8192 --n 8192 --dtype fp32 & + wait +) + +echo "---" +echo "resnet50" +echo "========" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + 
CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + wait +) + +echo "---" +echo "resnet50-noio" +echo "=============" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader synthetic_fixed --data $BASE/data/FakeImageNet --model resnet50 --batch-size 256 & + wait +) + +echo "---" +echo "resnet152-ddp-gpus" +echo "==================" +time ( + $SRC/milabench/milabench/scripts/activator 
$BASE/venv/torch $BASE/cache $SRC/milabench/benchmarks/torchvision_ddp/main.py --epochs 10 --num-workers 8 --loader torch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 & + wait +) + +echo "---" +echo "convnext_large-fp32" +echo "===================" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + wait +) + +echo "---" +echo "convnext_large-fp16" +echo "===================" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 
--num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchvision/main.py --precision fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + wait +) + +echo "---" +echo "convnext_large-tf32" +echo "===================" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + wait +) + +echo "---" +echo "convnext_large-tf32-fp16" +echo "========================" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data 
$BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model convnext_large --batch-size 128 & + wait +) + +echo "---" +echo "regnet_y_128gf" +echo "==============" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchvision/main.py --precision tf32-fp16 --lr 0.01 --no-stdout --epochs 50 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model regnet_y_128gf --batch-size 64 & + wait +) + +echo "---" +echo "bert-fp32" +echo "=========" +time ( + CUDA_VISIBLE_DEVICES=0 python -m bench --precision fp32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=1 python -m bench --precision fp32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=2 python -m bench --precision fp32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=3 python -m bench --precision fp32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=4 python -m bench 
--precision fp32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=5 python -m bench --precision fp32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=6 python -m bench --precision fp32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=7 python -m bench --precision fp32 --num-workers 8 --model Bert --batch-size 32 & + wait +) + +echo "---" +echo "bert-fp16" +echo "=========" +time ( + CUDA_VISIBLE_DEVICES=0 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=1 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=2 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=3 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=4 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=5 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=6 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=7 python -m bench --precision fp16 --num-workers 8 --model Bert --batch-size 32 & + wait +) + +echo "---" +echo "bert-tf32" +echo "=========" +time ( + CUDA_VISIBLE_DEVICES=0 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=1 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=2 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=3 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=4 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=5 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=6 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=7 python -m bench --precision tf32 --num-workers 8 --model Bert --batch-size 32 & + wait +) + +echo "---" +echo "bert-tf32-fp16" +echo "==============" +time ( + CUDA_VISIBLE_DEVICES=0 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=1 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=2 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=3 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=4 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=5 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=6 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 & + CUDA_VISIBLE_DEVICES=7 python -m bench --precision tf32-fp16 --num-workers 8 --model Bert --batch-size 32 & + wait +) + +echo "---" +echo "t5" +echo "==" +time ( + CUDA_VISIBLE_DEVICES=0 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 & + CUDA_VISIBLE_DEVICES=1 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 & + CUDA_VISIBLE_DEVICES=2 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 & + 
CUDA_VISIBLE_DEVICES=3 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 & + CUDA_VISIBLE_DEVICES=4 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 & + CUDA_VISIBLE_DEVICES=5 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 & + CUDA_VISIBLE_DEVICES=6 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 & + CUDA_VISIBLE_DEVICES=7 python -m bench --precision tf32-fp16 --num-workers 8 --model T5 --batch-size 16 & + wait +) + +echo "---" +echo "reformer" +echo "========" +time ( + CUDA_VISIBLE_DEVICES=0 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 32 & + CUDA_VISIBLE_DEVICES=1 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 32 & + CUDA_VISIBLE_DEVICES=2 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 32 & + CUDA_VISIBLE_DEVICES=3 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 32 & + CUDA_VISIBLE_DEVICES=4 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 32 & + CUDA_VISIBLE_DEVICES=5 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 32 & + CUDA_VISIBLE_DEVICES=6 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 32 & + CUDA_VISIBLE_DEVICES=7 python -m bench --precision tf32-fp16 --num-workers 8 --model Reformer --batch-size 32 & + wait +) + +echo "---" +echo "whisper" +echo "=======" +time ( + CUDA_VISIBLE_DEVICES=0 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 & + CUDA_VISIBLE_DEVICES=1 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 & + CUDA_VISIBLE_DEVICES=2 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 & + CUDA_VISIBLE_DEVICES=3 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 & + CUDA_VISIBLE_DEVICES=4 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 & + CUDA_VISIBLE_DEVICES=5 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 & + CUDA_VISIBLE_DEVICES=6 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 & + CUDA_VISIBLE_DEVICES=7 python -m bench --precision tf32-fp16 --num-workers 8 --model Whisper --batch-size 64 & + wait +) + +echo "---" +echo "focalnet" +echo "========" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D0 --checkpoint-hist 1 & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D1 --checkpoint-hist 1 & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D2 --checkpoint-hist 1 & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 
--device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D3 --checkpoint-hist 1 &
+ CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D4 --checkpoint-hist 1 &
+ CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D5 --checkpoint-hist 1 &
+ CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D6 --checkpoint-hist 1 &
+ CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/timm/pytorch-image-models/train.py --amp --amp-dtype bfloat16 --device cuda --val-split --data-dir $BASE/data --dataset FakeImageNet --workers 8 --model focalnet_base_lrf --output $BASE/extra/timm/dev/focalnet.D7 --checkpoint-hist 1 &
+ wait
+)
+
+echo "---"
+echo "brax"
+echo "===="
+time (
+ python $SRC/milabench/benchmarks/brax/main.py --episode-length 20 --batch-size 1024 --num-minibatches 32 --num-envs 8192 &
+ wait
+)
+
+echo "---"
+echo "diffusion-single"
+echo "================"
+time (
+ CUDA_VISIBLE_DEVICES=0 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache &
+ CUDA_VISIBLE_DEVICES=1 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache &
+ CUDA_VISIBLE_DEVICES=2 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache &
+ CUDA_VISIBLE_DEVICES=3 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache &
+ CUDA_VISIBLE_DEVICES=4 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache &
+ CUDA_VISIBLE_DEVICES=5 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache &
+ CUDA_VISIBLE_DEVICES=6 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache &
+ CUDA_VISIBLE_DEVICES=7 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache &
+ wait
+)
+
+echo "---"
+echo "diffusion-gpus"
+echo "=============="
+time (
+ $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache accelerate launch --mixed_precision=bf16 --dynamo_backend=no --machine_rank=0 --num_machines=1 --multi_gpu --gradient_accumulation_steps=1 --num_cpu_threads_per_process=4 --main_process_ip=127.0.0.1 --main_process_port=29400 --num_processes=8 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache &
+ wait
+)
+
+echo "---"
+echo "diffusion-nodes"
+echo "==============="
+time (
+ $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache accelerate launch --mixed_precision=bf16 --dynamo_backend=no --machine_rank=0 --num_machines=2 --multi_gpu --gradient_accumulation_steps=1 --num_cpu_threads_per_process=4 --main_process_ip=127.0.0.1 --main_process_port=29400 --num_processes=16 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache &
+ ssh -oCheckHostIP=no -oStrictHostKeyChecking=no -oPasswordAuthentication=no -p 22 username@192.168.0.11 $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache accelerate launch --mixed_precision=bf16 --dynamo_backend=no --machine_rank=1 --num_machines=2 --multi_gpu --gradient_accumulation_steps=1 --num_cpu_threads_per_process=4 --main_process_ip=127.0.0.1 --main_process_port=29400 --num_processes=16 $SRC/milabench/benchmarks/diffusion/main.py --num_epochs 5 --batch_size 32 --num_workers 8 --cache $BASE/cache &
+ wait
+)
+
+echo "---"
+echo "lightning"
+echo "========="
+time (
+ CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 &
+ CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 &
+ CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 &
+ CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 &
+ CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 &
+ CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 &
+ CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 &
+ CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data
$BASE/data/FakeImageNet --model resnet152 --batch-size 256 &
+ wait
+)
+
+echo "---"
+echo "lightning-gpus"
+echo "=============="
+time (
+ $BASE/venv/torch/bin/benchrun --nnodes=1 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --nproc-per-node=8 --no-python -- python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 &
+ wait
+)
+
+echo "---"
+echo "dinov2-giant-single"
+echo "==================="
+time (
+ CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
+ CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
+ CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
+ CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
+ CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
+ CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
+ CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
+ CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-single/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
+ wait
+)
+
+echo "---"
+echo "dinov2-giant-gpus"
+echo "================="
+time (
+ $BASE/venv/torch/bin/benchrun --nnodes=1 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --nproc-per-node=8 --no-python -- python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-gpus/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 &
+ wait
+)
+
+echo "---"
+echo "llm-lora-single"
+echo "==============="
+time (
+ CUDA_VISIBLE_DEVICES=0 $SRC/milabench/benchmarks/llm/recipes/lora_finetune_single_device.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-single/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-single/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 device=cuda &
+ CUDA_VISIBLE_DEVICES=1 $SRC/milabench/benchmarks/llm/recipes/lora_finetune_single_device.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-single/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-single/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 device=cuda &
+ CUDA_VISIBLE_DEVICES=2 $SRC/milabench/benchmarks/llm/recipes/lora_finetune_single_device.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-single/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-single/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 device=cuda &
+ CUDA_VISIBLE_DEVICES=3 $SRC/milabench/benchmarks/llm/recipes/lora_finetune_single_device.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-single/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-single/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 device=cuda &
+ CUDA_VISIBLE_DEVICES=4 $SRC/milabench/benchmarks/llm/recipes/lora_finetune_single_device.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-single/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original
checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-single/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 device=cuda &
+ CUDA_VISIBLE_DEVICES=5 $SRC/milabench/benchmarks/llm/recipes/lora_finetune_single_device.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-single/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-single/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 device=cuda &
+ CUDA_VISIBLE_DEVICES=6 $SRC/milabench/benchmarks/llm/recipes/lora_finetune_single_device.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-single/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-single/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 device=cuda &
+ CUDA_VISIBLE_DEVICES=7 $SRC/milabench/benchmarks/llm/recipes/lora_finetune_single_device.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-single/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-single/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 device=cuda &
+ wait
+)
+
+echo "---"
+echo "llm-lora-ddp-gpus"
+echo "================="
+time (
+ $BASE/venv/torch/bin/tune run --nnodes=1 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/lora_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-ddp-gpus/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-ddp-gpus/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 device=cuda &
+ wait
+)
+
+echo "---"
+echo "llm-lora-ddp-nodes"
+echo "=================="
+time (
+ $BASE/venv/torch/bin/tune run --nnodes=2 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --node-rank=0 --local-addr=127.0.0.1 --rdzv-conf=rank=0 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/lora_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-ddp-nodes/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-ddp-nodes/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 device=cuda &
+ ssh -oCheckHostIP=no -oStrictHostKeyChecking=no -oPasswordAuthentication=no -p 22 username@192.168.0.11 $BASE/venv/torch/bin/tune run --nnodes=2 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --node-rank=1 --local-addr=192.168.0.11 --rdzv-conf=rank=1 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/lora_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-ddp-nodes/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-ddp-nodes/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 device=cuda &
+ wait
+)
+
+echo "---"
+echo "llm-lora-mp-gpus"
+echo "================"
+time (
+ $BASE/venv/torch/bin/tune run --nnodes=1 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/lora_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_70B_lora.yaml epochs=1 output_dir=$BASE/extra/llm-lora-mp-gpus/output tokenizer.path=$BASE/data/llama3_70B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_70B checkpointer.output_dir=$BASE/data/llama3_70B/ safetensors=true metric_logger.log_dir=$BASE/extra/llm-lora-mp-gpus/metrics repo_id="meta-llama/Meta-Llama-3.1-70B" batch_size=8 gradient_accumulation_steps=1 device=cuda &
+ wait
+)
+
+echo "---"
+echo "llm-full-mp-gpus"
+echo "================"
+time (
+ $BASE/venv/torch/bin/tune run --nnodes=1 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/full_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_70B_full.yaml epochs=1 output_dir=$BASE/extra/llm-full-mp-gpus/output tokenizer.path=$BASE/data/llama3_70B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_70B checkpointer.output_dir=$BASE/data/llama3_70B/
metric_logger.log_dir=$BASE/extra/llm-full-mp-gpus/metrics repo_id="meta-llama/Meta-Llama-3.1-70B" safetensors=true batch_size=2 gradient_accumulation_steps=1 device=cuda &
+ wait
+)
+
+echo "---"
+echo "llm-full-mp-nodes"
+echo "================="
+time (
+ $BASE/venv/torch/bin/tune run --nnodes=2 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --node-rank=0 --local-addr=127.0.0.1 --rdzv-conf=rank=0 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/full_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_70B_full.yaml epochs=1 output_dir=$BASE/extra/llm-full-mp-nodes/output tokenizer.path=$BASE/data/llama3_70B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_70B checkpointer.output_dir=$BASE/data/llama3_70B/ metric_logger.log_dir=$BASE/extra/llm-full-mp-nodes/metrics repo_id="meta-llama/Meta-Llama-3.1-70B" safetensors=true batch_size=2 gradient_accumulation_steps=1 device=cuda &
+ ssh -oCheckHostIP=no -oStrictHostKeyChecking=no -oPasswordAuthentication=no -p 22 username@192.168.0.11 $BASE/venv/torch/bin/tune run --nnodes=2 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --node-rank=1 --local-addr=192.168.0.11 --rdzv-conf=rank=1 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/full_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_70B_full.yaml epochs=1 output_dir=$BASE/extra/llm-full-mp-nodes/output tokenizer.path=$BASE/data/llama3_70B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_70B checkpointer.output_dir=$BASE/data/llama3_70B/ metric_logger.log_dir=$BASE/extra/llm-full-mp-nodes/metrics repo_id="meta-llama/Meta-Llama-3.1-70B" safetensors=true batch_size=2 gradient_accumulation_steps=1 device=cuda &
+ wait
+)
+
+echo "---"
+echo "dqn"
+echo "==="
+time (
+ CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/purejaxrl/main.py dqn --num_envs 128 --buffer_size 131072 --buffer_batch_size 65536 --env_name CartPole-v1 --training_interval 10 &
+ CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/purejaxrl/main.py dqn --num_envs 128 --buffer_size
131072 --buffer_batch_size 65536 --env_name CartPole-v1 --training_interval 10 & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/purejaxrl/main.py dqn --num_envs 128 --buffer_size 131072 --buffer_batch_size 65536 --env_name CartPole-v1 --training_interval 10 & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/purejaxrl/main.py dqn --num_envs 128 --buffer_size 131072 --buffer_batch_size 65536 --env_name CartPole-v1 --training_interval 10 & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/purejaxrl/main.py dqn --num_envs 128 --buffer_size 131072 --buffer_batch_size 65536 --env_name CartPole-v1 --training_interval 10 & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/purejaxrl/main.py dqn --num_envs 128 --buffer_size 131072 --buffer_batch_size 65536 --env_name CartPole-v1 --training_interval 10 & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/purejaxrl/main.py dqn --num_envs 128 --buffer_size 131072 --buffer_batch_size 65536 --env_name CartPole-v1 --training_interval 10 & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/purejaxrl/main.py dqn --num_envs 128 --buffer_size 131072 --buffer_batch_size 65536 --env_name CartPole-v1 --training_interval 10 & + wait +) + +echo "---" +echo "ppo" +echo "===" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/purejaxrl/main.py ppo --num_envs 128 --num_steps 10 --num_minibatches 32 --update_epochs 4 --env_name hopper --total_timesteps 2000000 & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/purejaxrl/main.py ppo --num_envs 128 --num_steps 10 --num_minibatches 32 --update_epochs 4 --env_name hopper --total_timesteps 2000000 & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/purejaxrl/main.py ppo --num_envs 128 --num_steps 10 --num_minibatches 32 --update_epochs 4 --env_name hopper --total_timesteps 2000000 & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/purejaxrl/main.py ppo --num_envs 128 --num_steps 10 --num_minibatches 32 --update_epochs 4 --env_name hopper --total_timesteps 2000000 & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/purejaxrl/main.py ppo --num_envs 128 --num_steps 10 --num_minibatches 32 --update_epochs 4 --env_name hopper --total_timesteps 2000000 & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/purejaxrl/main.py ppo --num_envs 128 --num_steps 10 --num_minibatches 32 --update_epochs 4 --env_name hopper --total_timesteps 2000000 & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/purejaxrl/main.py ppo --num_envs 128 --num_steps 10 --num_minibatches 32 --update_epochs 4 --env_name hopper --total_timesteps 2000000 & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/purejaxrl/main.py ppo --num_envs 128 --num_steps 10 --num_minibatches 32 --update_epochs 4 --env_name hopper --total_timesteps 2000000 & + wait +) + +echo "---" +echo "pna" +echo "===" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/geo_gnn/main.py --model PNA --num-samples 100000 --batch-size 4096 --num-workers 0 & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/geo_gnn/main.py --model PNA --num-samples 100000 --batch-size 4096 --num-workers 0 & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/geo_gnn/main.py --model PNA --num-samples 100000 --batch-size 4096 --num-workers 0 & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/geo_gnn/main.py --model PNA --num-samples 100000 --batch-size 4096 --num-workers 0 & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/geo_gnn/main.py --model PNA 
--num-samples 100000 --batch-size 4096 --num-workers 0 & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/geo_gnn/main.py --model PNA --num-samples 100000 --batch-size 4096 --num-workers 0 & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/geo_gnn/main.py --model PNA --num-samples 100000 --batch-size 4096 --num-workers 0 & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/geo_gnn/main.py --model PNA --num-samples 100000 --batch-size 4096 --num-workers 0 & + wait +) + +echo "---" +echo "dimenet" +echo "=======" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 100000 --use3d --batch-size 16 --num-workers 0 & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 100000 --use3d --batch-size 16 --num-workers 0 & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 100000 --use3d --batch-size 16 --num-workers 0 & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 100000 --use3d --batch-size 16 --num-workers 0 & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 100000 --use3d --batch-size 16 --num-workers 0 & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 100000 --use3d --batch-size 16 --num-workers 0 & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 100000 --use3d --batch-size 16 --num-workers 0 & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/geo_gnn/main.py --model DimeNet --num-samples 100000 --use3d --batch-size 16 --num-workers 0 & + wait +) + +echo "---" +echo "recursiongfn" +echo "============" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/recursiongfn/main.py --batch_size 128 --num_workers 8 --num_steps 100 --layer_width 128 --num_layers 4 & + wait +) + +echo "---" +echo "torchatari" +echo "==========" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 
--num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/torchatari/main.py --num-minibatches 16 --update-epochs 4 --num-steps 128 --num-envs 128 --total-timesteps 1000000 --env-id Breakout-v5 & + wait +) + +echo "---" +echo "llava-single" +echo "============" +time ( + CUDA_VISIBLE_DEVICES=0 $SRC/milabench/benchmarks/llava/main.py --batch_size 1 --num_workers 4 --gradient_accumulation_steps 1 & + CUDA_VISIBLE_DEVICES=1 $SRC/milabench/benchmarks/llava/main.py --batch_size 1 --num_workers 4 --gradient_accumulation_steps 1 & + CUDA_VISIBLE_DEVICES=2 $SRC/milabench/benchmarks/llava/main.py --batch_size 1 --num_workers 4 --gradient_accumulation_steps 1 & + CUDA_VISIBLE_DEVICES=3 $SRC/milabench/benchmarks/llava/main.py --batch_size 1 --num_workers 4 --gradient_accumulation_steps 1 & + CUDA_VISIBLE_DEVICES=4 $SRC/milabench/benchmarks/llava/main.py --batch_size 1 --num_workers 4 --gradient_accumulation_steps 1 & + CUDA_VISIBLE_DEVICES=5 $SRC/milabench/benchmarks/llava/main.py --batch_size 1 --num_workers 4 --gradient_accumulation_steps 1 & + CUDA_VISIBLE_DEVICES=6 $SRC/milabench/benchmarks/llava/main.py --batch_size 1 --num_workers 4 --gradient_accumulation_steps 1 & + CUDA_VISIBLE_DEVICES=7 $SRC/milabench/benchmarks/llava/main.py --batch_size 1 --num_workers 4 --gradient_accumulation_steps 1 & + wait +) + +echo "---" +echo "rlhf-single" +echo "===========" +time ( + CUDA_VISIBLE_DEVICES=0 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf-single/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 & + CUDA_VISIBLE_DEVICES=1 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf-single/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 & + CUDA_VISIBLE_DEVICES=2 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf-single/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 & + CUDA_VISIBLE_DEVICES=3 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf-single/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 & + CUDA_VISIBLE_DEVICES=4 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf-single/output --model_name_or_path 
EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 & + CUDA_VISIBLE_DEVICES=5 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf-single/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 & + CUDA_VISIBLE_DEVICES=6 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf-single/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 & + CUDA_VISIBLE_DEVICES=7 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf-single/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 & + wait +) + +echo "---" +echo "rlhf-gpus" +echo "=========" +time ( + $SRC/milabench/milabench/scripts/activator $BASE/venv/torch $BASE/cache accelerate launch --mixed_precision=bf16 --dynamo_backend=no --machine_rank=0 --num_machines=1 --multi_gpu --gradient_accumulation_steps=1 --num_cpu_threads_per_process=4 --main_process_ip=127.0.0.1 --main_process_port=29400 --num_processes=8 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf-gpus/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 & + wait +) + +echo "---" +echo "vjepa-single" +echo "============" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single & + wait +) + +echo "---" +echo "vjepa-gpus" +echo "==========" +time ( + $BASE/venv/torch/bin/benchrun --nnodes=1 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --nproc-per-node=8 --no-python -- python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-gpus & + wait +) + diff --git a/tests/test_scaler.py b/tests/test_scaler.py index 
f00a8979..07cdb2ed 100644 --- a/tests/test_scaler.py +++ b/tests/test_scaler.py @@ -76,28 +76,27 @@ def fakeexec(pack): def test_scaler_enabled(multipack, config): from milabench.system import system_global - import contextvars - - ctx = contextvars.copy_context() - - def update_ctx(): - sizer = Sizer( - SizerOptions( - size=None, - autoscale=True, - multiple=8, - ), - config("scaling"), - ) - sizer_global.set(sizer) - system = system_global.get() - gpu = system.setdefault("gpu", dict()) - gpu["capacity"] = "41920 MiB" - - ctx.run(update_ctx) + from milabench.system import apply_system + + conf = { + "gpu": { + "capacity": "41920 MiB" + }, + "options": { + "sizer": { + "multiple": 8 + } + } + } for k, pack in multipack.packs.items(): - assert ctx.run(lambda: fakeexec(pack)) == ["--batch_size", "232"] + # Sizer is only enabled when config is applied + assert fakeexec(pack) == [] + + with apply_system(conf): + for k, pack in multipack.packs.items(): + fakeexec(pack) == ["--batch_size", "232"] - # Sizer is only enabled inside the context + for k, pack in multipack.packs.items(): + # Sizer is only enabled when config is applied assert fakeexec(pack) == [] diff --git a/tests/test_summary/test_compare.txt b/tests/test_summary/test_compare.txt index c4dd7f6d..c3bb5bf0 100644 --- a/tests/test_summary/test_compare.txt +++ b/tests/test_summary/test_compare.txt @@ -1,5 +1,5 @@ | rijubigo | sedumoje - | 2023-03-24 | 2023-03-24 -bench | metric | 13:45:27 | 13:57:35 + | 2024-08-23 | 2024-08-23 +bench | metric | 09:22:03 | 09:22:03 ---------------------------------------------------------------- benchio | train_rate | 8780.41 | 8286.03 diff --git a/tests/test_summary/test_report.txt b/tests/test_summary/test_report.txt index b9f6ce02..2f4d3fe4 100644 --- a/tests/test_summary/test_report.txt +++ b/tests/test_summary/test_report.txt @@ -11,4 +11,4 @@ benchio | 0 | 4 | 0 | 7979.82 | 2.9% | 17.2% | nan | 79 Scores ------ Failure rate: 0.00% (PASS) -Score: 7979.82 +Score: 7980.82 diff --git a/tests/test_summary/test_report_folder_does_average.txt b/tests/test_summary/test_report_folder_does_average.txt index 9fda7a9c..8884a73a 100644 --- a/tests/test_summary/test_report_folder_does_average.txt +++ b/tests/test_summary/test_report_folder_does_average.txt @@ -11,4 +11,4 @@ benchio | 0 | 6 | 0 | 7878.45 | 2.5% | 18.0% | 24456 | 78 Scores ------ Failure rate: 0.00% (PASS) -Score: 7878.45 +Score: 7879.45 diff --git a/tests/test_validation.py b/tests/test_validation.py index d5f1007b..9ed9000a 100644 --- a/tests/test_validation.py +++ b/tests/test_validation.py @@ -76,39 +76,46 @@ def test_planning_layer_per_gpu_bad(replayfolder, monkeypatch): def test_memory_tracking(replayfolder, config, tmp_path): import contextvars - - from milabench.sizer import ( - MemoryUsageExtractor, - Sizer, - SizerOptions, - sizer_global, - system_global, - ) - - ctx = contextvars.copy_context() - - def update_ctx(): - sizer = Sizer( - SizerOptions( - size=None, - autoscale=True, - multiple=8, - ), - config("scaling"), + import yaml + from milabench.system import apply_system, option + + conf = { + "gpu": { + "capacity": "41920 MiB" + }, + "options": { + "sizer": { + "multiple": 8, + "autoscale": 1 + } + } + } + + with apply_system(conf): + from milabench.sizer import ( + MemoryUsageExtractor, + Sizer, + SizerOptions, + sizer_global, + system_global, ) - sizer_global.set(sizer) - system_global.set({"gpu": {"capacity": "41920 MiB"}}) - - ctx.run(update_ctx) - layer = ctx.run(lambda: MemoryUsageExtractor()) - - layer.filepath = 
f"{tmp_path}/dummy" - - assert 123 not in layer.memory["benchio"]["model"] - - ctx.run(lambda: replay_validation_scenario(replayfolder, layer, filename="usage")) - - assert 123 in layer.memory["benchio"]["model"] + + layer = MemoryUsageExtractor() + with open(config("scaling"), "r") as sconf: + layer.memory = yaml.safe_load(sconf) + + layer.filepath = f"{tmp_path}/dummy" + + print(system_global.get()) + # print(option("sizer.multiple", etype=int)) + # print(option("sizer.config", etype=str)) + # print(Sizer().scaling_config) + assert 123 not in layer.memory["benchio"]["model"] + + replay_validation_scenario(replayfolder, layer, filename="usage") + + # print(layer.memory) + assert 123 in layer.memory["benchio"]["model"] def test_exception_tracking(replayfolder, file_regression, capsys): From b0cfe430222d68b482c192c35e032668cbbbc8c3 Mon Sep 17 00:00:00 2001 From: Setepenre Date: Thu, 21 Nov 2024 21:16:28 -0500 Subject: [PATCH 18/20] Update Dockerfile-rocm --- docker/Dockerfile-rocm | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/docker/Dockerfile-rocm b/docker/Dockerfile-rocm index 50466084..2290c40e 100644 --- a/docker/Dockerfile-rocm +++ b/docker/Dockerfile-rocm @@ -1,3 +1,4 @@ + FROM ubuntu:22.04 # Arguments @@ -10,16 +11,17 @@ ARG CONFIG=standard.yaml ENV MILABENCH_CONFIG_NAME=$CONFIG ENV MILABENCH_DOCKER=1 +ARG PYTHON="3.10" + # Paths # ----- ENV MILABENCH_CONFIG=/milabench/milabench/config/$MILABENCH_CONFIG_NAME ENV MILABENCH_BASE=/milabench/envs -ENV MILABENCH_OUTPUT=/milabench/results/ ENV MILABENCH_ARGS="" -ENV CONDA_PATH=/opt/anaconda - +ENV MILABENCH_OUTPUT="$MILABENCH_BASE/runs" +ENV BENCHMARK_VENV="$MILABENCH_BASE/venv" # Copy milabench # -------------- @@ -37,22 +39,13 @@ COPY . /milabench/milabench/ # build-essential: for rust RUN apt-get update &&\ - apt-get install -y git build-essential curl &&\ + apt-get install -y git build-essential curl python3.10 python-is-python3 python3-pip &&\ apt-get clean &&\ rm -rf /var/lib/apt/lists/* RUN curl https://sh.rustup.rs -sSf | sh -s -- -y ENV PATH="/root/.cargo/bin:${PATH}" -# Install Python -# -------------- - -# Install anaconda because milabench will need it later anyway -RUN curl https://repo.anaconda.com/miniconda/Miniconda3-py39_23.1.0-1-Linux-x86_64.sh -o ~/miniconda.sh && \ - /bin/bash ~/miniconda.sh -b -p $CONDA_PATH && rm ~/miniconda.sh -ENV PATH=$CONDA_PATH/bin:$PATH - - # Install Milabench # ----------------- From 14d628a49f276921508b2f8a8a87a1d5521d8ed9 Mon Sep 17 00:00:00 2001 From: Setepenre Date: Thu, 21 Nov 2024 21:23:26 -0500 Subject: [PATCH 19/20] Update README.md --- README.md | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/README.md b/README.md index a2f8ce50..6731e52f 100644 --- a/README.md +++ b/README.md @@ -115,7 +115,4 @@ We are working on validating it on more configurations and will update the above ------ Failure rate: 0.38% (PASS) Score: 4175.57 - - Errors - ------ - 1 errors, details in HTML report. 
+ From 665390198fb59c19803e49cedb8c883191ff1e8b Mon Sep 17 00:00:00 2001 From: "pierre.delaunay" Date: Thu, 21 Nov 2024 22:23:29 -0500 Subject: [PATCH 20/20] Revert HPU changes --- benchmarks/llm/configs/llama3_70B_full.yaml | 8 ++-- .../llm/recipes/full_finetune_distributed.py | 45 +++++++++++-------- 2 files changed, 30 insertions(+), 23 deletions(-) diff --git a/benchmarks/llm/configs/llama3_70B_full.yaml b/benchmarks/llm/configs/llama3_70B_full.yaml index 703eb876..ae5cf2af 100644 --- a/benchmarks/llm/configs/llama3_70B_full.yaml +++ b/benchmarks/llm/configs/llama3_70B_full.yaml @@ -82,7 +82,7 @@ optimizer: foreach: False # Note: highly recommended to use fused=True optimizer flag # with CPU offload for faster optimizer step. - fused: true + fused: True loss: _component_: torch.nn.CrossEntropyLoss @@ -94,9 +94,9 @@ gradient_accumulation_steps: 1 device: cuda # Memory management -enable_activation_checkpointing: true -memory_efficient_fsdp_wrap: true -fsdp_cpu_offload: true +enable_activation_checkpointing: True +memory_efficient_fsdp_wrap: True +fsdp_cpu_offload: True # Reduced precision dtype: bf16 diff --git a/benchmarks/llm/recipes/full_finetune_distributed.py b/benchmarks/llm/recipes/full_finetune_distributed.py index f8d58e2f..3a51842d 100755 --- a/benchmarks/llm/recipes/full_finetune_distributed.py +++ b/benchmarks/llm/recipes/full_finetune_distributed.py @@ -16,7 +16,6 @@ import torch from omegaconf import DictConfig, ListConfig -import torchcompat.core as acc from torch import nn from torch.distributed import init_process_group from torch.distributed.fsdp import ( @@ -39,8 +38,6 @@ log = utils.get_logger("DEBUG") -HPU_UNSUPPORTED = False - class FullFinetuneRecipeDistributed(FTRecipeInterface): """ @@ -100,8 +97,8 @@ class FullFinetuneRecipeDistributed(FTRecipeInterface): """ def __init__(self, cfg: DictConfig) -> None: - import os - self._device = acc.fetch_device(int(os.getenv("LOCAL_RANK", "0"))) + + self._device = utils.get_device(device=cfg.device) self._dtype = utils.get_dtype(cfg.dtype, device=self._device) if self._dtype == torch.float16: @@ -134,10 +131,7 @@ def __init__(self, cfg: DictConfig) -> None: # These are public properties which are updated by the checkpoint loader # when ``resume_from_checkpoint`` is `True` or validated in tests - if HPU_UNSUPPORTED: - self.seed = utils.set_seed(seed=cfg.seed) - else: - self.seed = 1 + self.seed = utils.set_seed(seed=cfg.seed) self.epochs_run = 0 self.total_epochs = cfg.epochs self.max_steps_per_epoch = cfg.max_steps_per_epoch @@ -357,10 +351,8 @@ def _setup_model( ) if self._is_rank_zero: - if HPU_UNSUPPORTED: - pass - #memory_stats = utils.get_memory_stats(device=self._device) - #utils.log_memory_stats(memory_stats) + memory_stats = utils.get_memory_stats(device=self._device) + utils.log_memory_stats(memory_stats) # synchronize before training begins torch.distributed.barrier() @@ -421,7 +413,6 @@ def _setup_data( dataset=ds, batch_size=batch_size, sampler=sampler, - # persistent_workers=True, collate_fn=partial( utils.padded_collate, padding_idx=self._tokenizer.pad_id, @@ -552,14 +543,31 @@ def train(self) -> None: f"{curr_epoch+1}|{self.global_step}|Loss: {loss_to_log}" ) + # Log per-step metrics + if ( + self.global_step % self._log_every_n_steps == 0 + and self._is_rank_zero + ): + time_per_step = time.perf_counter() - t0 + log_dict = { + "loss": loss_to_log, + "lr": self._optimizer.param_groups[0]["lr"], + "tokens_per_second_per_gpu": num_tokens / time_per_step, + } + if self._log_peak_memory_stats: + 
log_dict.update(utils.get_memory_stats(device=self._device)) + self._metric_logger.log_dict( + log_dict, + step=self.global_step, + ) + # Reset running stats for the next step running_loss = 0 num_tokens = 0 t0 = time.perf_counter() - - print("HERE") + self.epochs_run += 1 - # self.save_checkpoint(epoch=curr_epoch) + self.save_checkpoint(epoch=curr_epoch) def cleanup(self) -> None: if self._is_rank_zero: @@ -610,8 +618,7 @@ def recipe_main(cfg: DictConfig) -> None: "If using tune CLI, please specify --nnodes 1 and --nproc_per_node [num_gpus]" ) - acc.init_process_group() - + init_process_group(backend="gloo" if cfg.device == "cpu" else "nccl") if cfg.get("fsdp_cpu_offload", False): # Utilize all available CPU cores for intra-op parallelism. This provides ~2x # speed up when benchmarking fused AdamW on CPU