From 3d8e9f5b25206b42fac1c2030a0f56a4b6dac114 Mon Sep 17 00:00:00 2001 From: "pierre.delaunay" Date: Mon, 5 Aug 2024 15:22:11 -0400 Subject: [PATCH 1/6] Multi node tweaks --- benchmarks/llm/benchfile.py | 17 ++++- benchmarks/llm/main.py | 0 benchmarks/rlhf/Makefile | 31 ++++++++ benchmarks/rlhf/README.md | 4 + benchmarks/rlhf/benchfile.py | 31 ++++++++ benchmarks/rlhf/dev.yaml | 53 ++++++++++++++ benchmarks/rlhf/main.py | 126 ++++++++++++++++++++++++++++++++ benchmarks/rlhf/prepare.py | 16 ++++ benchmarks/rlhf/requirements.in | 5 ++ benchmarks/rlhf/voirfile.py | 38 ++++++++++ config/base.yaml | 4 +- milabench/_version.py | 6 +- milabench/cli/dry.py | 2 +- milabench/cli/prepare_run.py | 15 ++++ milabench/cli/slurm.py | 37 +++++++++- milabench/commands/__init__.py | 42 ++++++++--- milabench/common.py | 2 +- milabench/multi.py | 39 ++++++++-- milabench/remote.py | 50 ++++++++++--- scripts/article/run_cuda_dev.sh | 11 ++- 20 files changed, 487 insertions(+), 42 deletions(-) delete mode 100644 benchmarks/llm/main.py create mode 100644 benchmarks/rlhf/Makefile create mode 100644 benchmarks/rlhf/README.md create mode 100644 benchmarks/rlhf/benchfile.py create mode 100644 benchmarks/rlhf/dev.yaml create mode 100644 benchmarks/rlhf/main.py create mode 100755 benchmarks/rlhf/prepare.py create mode 100644 benchmarks/rlhf/requirements.in create mode 100644 benchmarks/rlhf/voirfile.py create mode 100644 milabench/cli/prepare_run.py diff --git a/benchmarks/llm/benchfile.py b/benchmarks/llm/benchfile.py index 1537ad556..6f8cadeee 100644 --- a/benchmarks/llm/benchfile.py +++ b/benchmarks/llm/benchfile.py @@ -1,7 +1,7 @@ from milabench.pack import Package -from milabench.commands import TorchrunAllGPU +from milabench.commands import TorchrunAllGPU, TorchrunAllNodes, ForeachNode from milabench.pack import BasePackage from milabench.commands import SimpleCommand @@ -15,7 +15,18 @@ def executable(self): # return True def __init__(self, pack: BasePackage, *torchrun_args, **kwargs): - super().__init__(pack, *torchrun_args, module=False, **kwargs) + super().__init__(pack, "run", *torchrun_args, module=False, **kwargs) + + +class TorchtuneAllNodes(TorchrunAllNodes): + def __init__(self, executor, *args, **kwargs) -> None: + base_exec = TorchrunAllNodes.make_base_executor( + Torchtune, + executor, + *args, + **kwargs + ) + ForeachNode.__init__(self, base_exec) class Llm(Package): @@ -31,7 +42,7 @@ async def install(self): def build_run_plan(self): exec = SimpleCommand(self) - return Torchtune(exec, "run").use_stdout() + return TorchtuneAllNodes(exec).use_stdout() __pack__ = Llm diff --git a/benchmarks/llm/main.py b/benchmarks/llm/main.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/benchmarks/rlhf/Makefile b/benchmarks/rlhf/Makefile new file mode 100644 index 000000000..97b871cdc --- /dev/null +++ b/benchmarks/rlhf/Makefile @@ -0,0 +1,31 @@ +# Use global base if possible +ifndef MILABENCH_BASE + MILABENCH_BASE="base" +endif + +export MILABENCH_BASE + +BENCH_NAME=rlhf +MILABENCH_CONFIG=dev.yaml +MILABENCH_ARGS=--config $(MILABENCH_CONFIG) --base $(MILABENCH_BASE) + +all: + install prepare single gpus nodes + +install: + milabench install $(MILABENCH_ARGS) --force + +prepare: + milabench prepare $(MILABENCH_ARGS) + +tests: + CUDA_VISIBLE_DEVICES=0 milabench run $(MILABENCH_ARGS) + +single: + milabench run $(MILABENCH_ARGS) --select $(BENCH_NAME) + +gpus: + milabench run $(MILABENCH_ARGS) --select $(BENCH_NAME)-gpus + +nodes: + milabench run $(MILABENCH_ARGS) --select $(BENCH_NAME)-nodes diff --git a/benchmarks/rlhf/README.md b/benchmarks/rlhf/README.md new file mode 100644 index 000000000..9c22d45ca --- /dev/null +++ b/benchmarks/rlhf/README.md @@ -0,0 +1,4 @@ + +# Rlhf + +Rewrite this README to explain what the benchmark is! diff --git a/benchmarks/rlhf/benchfile.py b/benchmarks/rlhf/benchfile.py new file mode 100644 index 000000000..a568f6690 --- /dev/null +++ b/benchmarks/rlhf/benchfile.py @@ -0,0 +1,31 @@ +from milabench.pack import Package + + +class Rlhf(Package): + # Requirements file installed by install(). It can be empty or absent. + base_requirements = "requirements.in" + + # The preparation script called by prepare(). It must be executable, + # but it can be any type of script. It can be empty or absent. + prepare_script = "prepare.py" + + # The main script called by run(). It must be a Python file. It has to + # be present. + main_script = "main.py" + + # You can remove the functions below if you don't need to modify them. + + def make_env(self): + # Return a dict of environment variables for prepare_script and + # main_script. + return super().make_env() + + async def install(self): + await super().install() # super() call installs the requirements + + async def prepare(self): + await super().prepare() # super() call executes prepare_script + + + +__pack__ = Rlhf diff --git a/benchmarks/rlhf/dev.yaml b/benchmarks/rlhf/dev.yaml new file mode 100644 index 000000000..99ab9b21e --- /dev/null +++ b/benchmarks/rlhf/dev.yaml @@ -0,0 +1,53 @@ + +rlhf: + inherits: _defaults + definition: . + install-variant: unpinned + install_group: torch + plan: + method: per_gpu + + argv: + --output_dir: models/minimal/ppo + --per_device_train_batch_size: 1 + --gradient_accumulation_steps: 1 + --total_episodes: 30000 + --model_name_or_path: meta-llama/Llama-2-7b-chat-hf + --sft_model_path: meta-llama/Llama-2-7b-chat-hf + --reward_model_path: cleanrl/EleutherAI_pythia-1b-deduped__reward__tldr + --non_eos_penalty: true + --stop_token: eos + --response_length: 53 + --sanity_check: true + + + +# """ +# python examples/scripts/ppo/ppo_tldr.py \ +# --learning_rate 3e-6 \ +# --output_dir models/minimal/ppo \ +# --per_device_train_batch_size 1 \ +# --gradient_accumulation_steps 64 \ +# --total_episodes 30000 \ +# --model_name_or_path EleutherAI/pythia-1b-deduped \ +# --sft_model_path cleanrl/EleutherAI_pythia-1b-deduped__sft__tldr \ +# --reward_model_path cleanrl/EleutherAI_pythia-1b-deduped__reward__tldr \ +# --non_eos_penalty \ +# --stop_token eos \ +# --response_length 53 \ +# --sanity_check + +# accelerate launch --config_file examples/accelerate_configs/deepspeed_zero2.yaml \ +# examples/scripts/ppo/ppo_tldr.py \ +# --output_dir models/minimal/ppo_tldr \ +# --learning_rate 3e-6 \ +# --per_device_train_batch_size 16 \ +# --gradient_accumulation_steps 4 \ +# --total_episodes 1000000 \ +# --model_name_or_path EleutherAI/pythia-1b-deduped \ +# --sft_model_path cleanrl/EleutherAI_pythia-1b-deduped__sft__tldr \ +# --reward_model_path cleanrl/EleutherAI_pythia-1b-deduped__reward__tldr \ +# --local_rollout_forward_batch_size 16 \ +# --non_eos_penalty \ +# --stop_token eos +# """ \ No newline at end of file diff --git a/benchmarks/rlhf/main.py b/benchmarks/rlhf/main.py new file mode 100644 index 000000000..7ab48e1d7 --- /dev/null +++ b/benchmarks/rlhf/main.py @@ -0,0 +1,126 @@ +import multiprocessing +import shutil + +from datasets import load_dataset +from transformers import ( + AutoModelForCausalLM, + AutoModelForSequenceClassification, + AutoTokenizer, + HfArgumentParser, +) + +from trl import ModelConfig +from trl.trainer.ppov2_trainer import PPOv2Config, PPOv2Trainer +from trl.trainer.utils import SIMPLE_QUERY_CHAT_TEMPLATE + + +def main(): + parser = HfArgumentParser((PPOv2Config, ModelConfig)) + config, model_config = parser.parse_args_into_dataclasses() + # remove output_dir if exists + shutil.rmtree(config.output_dir, ignore_errors=True) + + ################ + # Model & Tokenizer + ################ + tokenizer = AutoTokenizer.from_pretrained( + model_config.model_name_or_path, + padding_side="left", + trust_remote_code=model_config.trust_remote_code, + ) + tokenizer.add_special_tokens({"pad_token": "[PAD]"}) + + if tokenizer.chat_template is None: + tokenizer.chat_template = SIMPLE_QUERY_CHAT_TEMPLATE + value_model = AutoModelForSequenceClassification.from_pretrained( + config.reward_model_path, trust_remote_code=model_config.trust_remote_code, num_labels=1 + ) + reward_model = AutoModelForSequenceClassification.from_pretrained( + config.reward_model_path, trust_remote_code=model_config.trust_remote_code, num_labels=1 + ) + import torch + ref_policy = AutoModelForCausalLM.from_pretrained( + config.sft_model_path, + trust_remote_code=model_config.trust_remote_code, + low_cpu_mem_usage=True, + ) + policy = AutoModelForCausalLM.from_pretrained( + config.sft_model_path, + trust_remote_code=model_config.trust_remote_code, + low_cpu_mem_usage=True, + ) + + from peft import prepare_model_for_kbit_training + from peft import LoraConfig + from peft import get_peft_model + + ref_policy = prepare_model_for_kbit_training(ref_policy) + policy = prepare_model_for_kbit_training(policy) + + lora_config = LoraConfig( + r=16, + lora_alpha=32, + lora_dropout=0.05, + bias="none", + task_type="CAUSAL_LM", + ) + ref_policy = get_peft_model(ref_policy, lora_config) + policy = get_peft_model(policy, lora_config) + + ################ + # Dataset + ################ + raw_datasets = load_dataset("trl-internal-testing/tldr-preference-sft-trl-style") + if config.sanity_check: + for key in raw_datasets: + raw_datasets[key] = raw_datasets[key].select(range(1000)) + train_dataset = raw_datasets["train"] + eval_dataset = raw_datasets["validation"] + + def prepare_dataset(dataset, tokenizer): + """pre-tokenize the dataset before training; only collate during training""" + + def tokenize(element): + input_ids = tokenizer.apply_chat_template( + element["messages"][:1], + padding=False, + add_generation_prompt=True, + ) + return {"input_ids": input_ids, "lengths": len(input_ids)} + + return dataset.map( + tokenize, + remove_columns=dataset.column_names, + num_proc=1 if config.sanity_check else multiprocessing.cpu_count(), + load_from_cache_file=not config.sanity_check, + ) + + train_dataset = prepare_dataset(train_dataset, tokenizer) + eval_dataset = prepare_dataset(eval_dataset, tokenizer) + # filtering + train_dataset = train_dataset.filter(lambda x: x["lengths"] <= 512) + eval_dataset = eval_dataset.filter(lambda x: x["lengths"] <= 512) + + assert train_dataset[0]["input_ids"][-1] != tokenizer.eos_token_id, "The last token should not be an EOS token" + ################ + # Training + ################ + print("DONE") + trainer = PPOv2Trainer( + config=config, + tokenizer=tokenizer, + policy=policy, + ref_policy=ref_policy, + reward_model=reward_model, + value_model=value_model, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + ) + trainer.train() + + # trainer.save_model(config.output_dir) + # trainer.generate_completions() + + +if __name__ == "__main__": + main() diff --git a/benchmarks/rlhf/prepare.py b/benchmarks/rlhf/prepare.py new file mode 100755 index 000000000..32bd5901d --- /dev/null +++ b/benchmarks/rlhf/prepare.py @@ -0,0 +1,16 @@ +#!/usr/bin/env python + +import os + +if __name__ == "__main__": + # If you need the whole configuration: + # config = json.loads(os.environ["MILABENCH_CONFIG"]) + + data_directory = os.environ["MILABENCH_DIR_DATA"] + + # Download (or generate) the needed dataset(s). You are responsible + # to check if it has already been properly downloaded or not, and to + # do nothing if it has been. + print("Hello I am doing some data stuff!") + + # If there is nothing to download or generate, just delete this file. diff --git a/benchmarks/rlhf/requirements.in b/benchmarks/rlhf/requirements.in new file mode 100644 index 000000000..d0faef03e --- /dev/null +++ b/benchmarks/rlhf/requirements.in @@ -0,0 +1,5 @@ +voir>=0.2.17,<0.3 +torch +trl +bitsandbytes +peft \ No newline at end of file diff --git a/benchmarks/rlhf/voirfile.py b/benchmarks/rlhf/voirfile.py new file mode 100644 index 000000000..d93f886cd --- /dev/null +++ b/benchmarks/rlhf/voirfile.py @@ -0,0 +1,38 @@ +from dataclasses import dataclass + +from voir import configurable +from voir.instruments import dash, early_stop, log, rate +from benchmate.monitor import monitor_monogpu + +@dataclass +class Config: + """voir configuration""" + + # Whether to display the dash or not + dash: bool = False + + # How often to log the rates + interval: str = "1s" + + # Number of rates to skip before logging + skip: int = 5 + + # Number of rates to log before stopping + stop: int = 20 + + # Number of seconds between each gpu poll + gpu_poll: int = 3 + + +@configurable +def instrument_main(ov, options: Config): + yield ov.phases.init + + if options.dash: + ov.require(dash) + + ov.require( + log("value", "progress", "rate", "units", "loss", "gpudata", context="task"), + early_stop(n=options.stop, key="rate", task="train"), + monitor_monogpu(poll_interval=options.gpu_poll), + ) diff --git a/config/base.yaml b/config/base.yaml index 1d10341bb..b9d0fdbdf 100644 --- a/config/base.yaml +++ b/config/base.yaml @@ -429,7 +429,7 @@ _lightning: --loader: pytorch --data: "{milabench_data}/FakeImageNet" --model: resnet152 - --batch-size: 16 + --batch-size: 256 lightning: inherits: _lightning @@ -546,6 +546,7 @@ llm-lora-ddp-gpus: llm-lora-ddp-nodes: + max_duration: 3600 inherits: _llm plan: method: njobs @@ -611,6 +612,7 @@ llm-full-mp-gpus: llm-full-mp-nodes: + max_duration: 3600 inherits: _llm plan: method: njobs diff --git a/milabench/_version.py b/milabench/_version.py index 23cf810bc..ad08908db 100644 --- a/milabench/_version.py +++ b/milabench/_version.py @@ -1,5 +1,5 @@ """This file is generated, do not modify""" -__tag__ = "v0.1.0-38-gfb01d691" -__commit__ = "fb01d691aa0d88717dcb3fea8852f61e111cc75f" -__date__ = "2024-08-01 18:59:13 -0400" +__tag__ = "v0.1.0-29-g56152dc0" +__commit__ = "56152dc0f5938bfdf261798ad8a8df4e42ac3045" +__date__ = "2024-08-01 22:22:13 -0400" diff --git a/milabench/cli/dry.py b/milabench/cli/dry.py index 80d55d6ea..010269223 100644 --- a/milabench/cli/dry.py +++ b/milabench/cli/dry.py @@ -169,7 +169,7 @@ def multipack_args(conf: Arguments): "ip": f"192.168.0.{i + 10}" if i != 0 else "127.0.0.1", "user": "username", "main": i == 0, - "port": 22, + "sshport": 22, } for i in range(conf.nnodes) ], diff --git a/milabench/cli/prepare_run.py b/milabench/cli/prepare_run.py new file mode 100644 index 000000000..58b5fe559 --- /dev/null +++ b/milabench/cli/prepare_run.py @@ -0,0 +1,15 @@ +from coleo import tooled + +from .prepare import cli_prepare +from .run import cli_run + +@tooled +def cli_prepare_run(args=None): + """Prepare a benchmark: download datasets, weights etc.""" + + rc = cli_prepare() + + if rc == 0: + rc = cli_run() + + return rc diff --git a/milabench/cli/slurm.py b/milabench/cli/slurm.py index db68dbf0e..3d00b5312 100644 --- a/milabench/cli/slurm.py +++ b/milabench/cli/slurm.py @@ -1,11 +1,42 @@ import getpass import os - +import socket +import subprocess from coleo import tooled from ..system import get_gpu_capacity +def gethostname(host): + try: + return subprocess.check_output(["ssh", host, "cat", "/etc/hostname"], text=True).strip() + except: + print("Could not resolve hostname") + return host + + +def getip(ip): + # This does get a good IP for everything except the local node + + hostname, _, iplist = socket.gethostbyaddr(ip) + if len(iplist) > 1: + print("Multiple IP found") + + + from milabench.system import get_remote_ip + + resolved = iplist[0] + if resolved.startswith("127.0"): + ips = get_remote_ip() + for ip in ips: + if "." in ip and not ip.startswith("127.0"): + return ip + + return resolved + + return resolved + + @tooled def cli_slurm_system(): """Generate a system file based of slurm environment variables""" @@ -15,9 +46,11 @@ def cli_slurm_system(): def make_node(i, ip): node = { "name": ip, - "ip": ip, + "ip": getip(ip), + "hostname": gethostname(ip), "user": getpass.getuser(), "main": i == 0, + "sshport": 22, } if i == 0: diff --git a/milabench/commands/__init__.py b/milabench/commands/__init__.py index bee42baf5..51b239e07 100644 --- a/milabench/commands/__init__.py +++ b/milabench/commands/__init__.py @@ -577,16 +577,24 @@ def _argv(self, **kwargs): return [] +def node_address(node): + """Favour Hostname as it is the most consistent name across machines""" + host = node.get("hostname") + ip = node.get("ip") + return host or ip + + class ForeachNode(ListCommand): def __init__(self, executor: Command, **kwargs) -> None: super().__init__(None, **kwargs) self.options.update(kwargs) self.executor = executor + self.base_tags = self.executor.pack.config["tag"] def make_new_node_pack(self, rank, node, base) -> "BasePackage": """Make a new environment/config for the run""" config = base.pack.config - tags = [*config["tag"], node["name"]] + tags = [*self.base_tags, node["name"]] # Workers do not send training data # tag it as such so validation can ignore this pack @@ -630,10 +638,10 @@ def executors(self): ) worker = SSHCommand( - host=node["ip"], + host=node_address(node), user=node["user"], key=key, - port=node.get("port", 22), + port=node.get("sshport", 22), executor=self.make_new_node_executor(rank, node, self.executor), **options ) @@ -653,31 +661,43 @@ def copy(self, pack): class TorchrunAllNodes(ForeachNode): """executes torchrun on multiple machines""" - def __init__(self, executor: Command, **kwargs) -> None: + @staticmethod + def make_base_executor(cls, executor, *args, **kwargs): config = executor.pack.config max_num = config.get("num_machines", 1) - self.nodes = select_nodes(config["system"]["nodes"], max_num) + nodes = select_nodes(config["system"]["nodes"], max_num) - main = self.nodes[0] + main = nodes[0] # node[port] is for SSH - main_host = main["ip"] + main_host = node_address(main) # add them as option so we could tweak them if necessary main_port = option("torchrun.port", int, default=29400) backend = option("torchrun.backend", str, default="c10d") main_addr = f"{main_host}:{main_port}" - base_exec = TorchrunAllGPU( + + config = executor.pack.config + + return cls( executor, - f"--nnodes={len(self.nodes)}", + f"--nnodes={len(nodes)}", f"--rdzv-backend={backend}", f"--rdzv-endpoint={main_addr}", - f"--master-addr={main_host}", - f"--master-port={main_port}", + # f"--master-addr={main_host}", + # f"--master-port={main_port}", + *args, **kwargs ) + def __init__(self, executor: Command, *args, **kwargs) -> None: + base_exec = TorchrunAllNodes.make_base_executor( + TorchrunAllGPU, + executor, + *args, + **kwargs + ) super().__init__(base_exec) diff --git a/milabench/common.py b/milabench/common.py index 5849e05fe..135e45545 100644 --- a/milabench/common.py +++ b/milabench/common.py @@ -141,7 +141,7 @@ def get_base_defaults(base, arch="none", run_name="none"): { "name": "local", "ip": "127.0.0.1", - "port": 8123, + "sshport": 22, "user": user, "main": True, } diff --git a/milabench/multi.py b/milabench/multi.py index b09eeecca..06aacae43 100644 --- a/milabench/multi.py +++ b/milabench/multi.py @@ -83,6 +83,23 @@ def make_execution_plan(pack, step=0, repeat=1): return exec_plan +async def copy_base_to_workers(setup): + # Note: when we use docker we do not need to install + # so this should be ignored + if is_main_local(setup) and is_multinode(setup): + print("Coping main setup from this node to worker") + # copy the main setup to the workers + # so it copies the bench venv already, no need for python + from milabench.remote import copy_folder + from milabench.system import SystemConfig + + # we copy the entire content of base + # FIXME: handle custom (venv, cache, data, etc...) directories + # + copy_plan = copy_folder(setup, SystemConfig().base) + remote_task = asyncio.create_task(copy_plan.execute()) + await asyncio.wait([remote_task]) + class MultiPackage: def __init__(self, packs): @@ -140,6 +157,7 @@ async def do_install(self): remote_task = None if is_remote(setup): + print("Current node is outside of our system") # We are outside system, setup the main node first remote_plan = milabench_remote_install(setup, setup_for="main") remote_task = asyncio.create_task(remote_plan.execute()) @@ -148,15 +166,18 @@ async def do_install(self): # We do not install benchmarks on that node return - elif is_main_local(setup) and is_multinode(setup): - # We are the main node, setup workers - remote_plan = milabench_remote_install(setup, setup_for="worker") - remote_task = asyncio.create_task(remote_plan.execute()) + # elif is_main_local(setup) and is_multinode(setup): + # # this was executing install on the remote node but then it needed python to be available + # # We are the main node, setup workers + # remote_plan = milabench_remote_install(setup, setup_for="worker") + # remote_task = asyncio.create_task(remote_plan.execute()) # do the installation step with phase_lock("install"): await self.do_phase("install", remote_task, "checked_install") + await copy_base_to_workers(setup) + async def do_prepare(self): setup = self.setup_pack() remote_task = None @@ -168,13 +189,17 @@ async def do_prepare(self): return - elif is_main_local(setup) and is_multinode(setup): - remote_plan = milabench_remote_prepare(setup, run_for="worker") - remote_task = asyncio.create_task(remote_plan.execute()) + # elif is_main_local(setup) and is_multinode(setup): + # remote_plan = milabench_remote_prepare(setup, run_for="worker") + # remote_task = asyncio.create_task(remote_plan.execute()) with phase_lock("prepare"): await self.do_phase("prepare", remote_task, "prepare") + # Prepare is done on the main node + # copy the result there + await copy_base_to_workers(setup) + async def do_run(self, repeat=1): setup = self.setup_pack() diff --git a/milabench/remote.py b/milabench/remote.py index bf5963183..725c57b45 100644 --- a/milabench/remote.py +++ b/milabench/remote.py @@ -75,6 +75,45 @@ def should_run_for(worker, setup_for): return worker["main"] +def worker_commands(pack, worker_plan, setup_for="worker"): + nodes = pack.config["system"]["nodes"] + copy = [] + node_packs = [] + + for node in nodes: + node_pack = None + + if should_run_for(node, setup_for): + node_pack = worker_pack(pack, node) + + cmds = worker_plan(node_pack, node) + + if not isinstance(cmds, list): + cmds = [cmds] + copy.extend(cmds) + + node_packs.append(node_pack) + + return ListCommand(*copy) + + +def sshnode(node, cmd): + host = node["ip"] + user = node["user"] + port = node["sshport"] + return SSHCommand(cmd, user=user, host=host, port=port) + + +def copy_folder(pack, folder, setup_for="worker"): + def copy_to_worker(nodepack, node): + return [ + sshnode(node, CmdCommand(nodepack, "mkdir", "-p", folder)), + CmdCommand(nodepack, *rsync(node, folder)) + ] + return worker_commands(pack, copy_to_worker, setup_for=setup_for) + + + def milabench_remote_setup_plan(pack, setup_for="worker") -> SequenceCommand: """Copy milabench source files to remote @@ -87,14 +126,7 @@ def milabench_remote_setup_plan(pack, setup_for="worker") -> SequenceCommand: copy = [] node_packs = [] - for node in nodes: - node_pack = None - - if should_run_for(node, setup_for): - node_pack = worker_pack(pack, node) - copy.append(CmdCommand(node_pack, *rsync(node, INSTALL_FOLDER))) - - node_packs.append(node_pack) + copy_source = copy_folder (INSTALL_FOLDER) install = [] for i, node in enumerate(nodes): @@ -102,7 +134,7 @@ def milabench_remote_setup_plan(pack, setup_for="worker") -> SequenceCommand: install.append(pip_install_milabench(node_packs[i], node, INSTALL_FOLDER)) return SequenceCommand( - ListCommand(*copy), + copy_source, ListCommand(*install), ) diff --git a/scripts/article/run_cuda_dev.sh b/scripts/article/run_cuda_dev.sh index 7980d41d4..df62775a0 100644 --- a/scripts/article/run_cuda_dev.sh +++ b/scripts/article/run_cuda_dev.sh @@ -15,7 +15,7 @@ export MILABENCH_BASE="$MILABENCH_WORDIR/results" export MILABENCH_CONFIG="$MILABENCH_WORDIR/milabench/config/standard.yaml" export MILABENCH_VENV="$MILABENCH_WORDIR/env" export BENCHMARK_VENV="$MILABENCH_WORDIR/results/venv/torch" - +export MILABENCH_SYSTEM="$MILABENCH_WORDIR/system.yaml" if [ -z "${MILABENCH_PREPARE}" ]; then export MILABENCH_PREPARE=0 @@ -53,10 +53,13 @@ install_prepare() { # milabench pin --variant cuda --from-scratch "$@" + + milabench slurm_system > $MILABENCH_WORDIR/system.yaml + # # Install milabench's benchmarks in their venv # - milabench install "$@" + milabench install --system $MILABENCH_WORDIR/system.yaml "$@" which pip # pip install -e $MILABENCH_WORDIR/voir @@ -79,7 +82,7 @@ install_prepare() { # # Generate/download datasets, download models etc... - milabench prepare "$@" + milabench prepare --system $MILABENCH_WORDIR/system.yaml "$@" } module load cuda/12.3.2 @@ -114,7 +117,7 @@ if [ "$MILABENCH_PREPARE" -eq 0 ]; then # milabench prepare "$@" # # Run the benchmakrs - milabench run "$@" + milabench run --system $MILABENCH_WORDIR/system.yaml "$@" # # Display report From 917196defcec45c2893549aa5ef627e12f4ae938 Mon Sep 17 00:00:00 2001 From: "pierre.delaunay" Date: Tue, 6 Aug 2024 13:52:12 -0400 Subject: [PATCH 2/6] Identify loopback more reliably --- milabench/_version.py | 6 ++--- milabench/cli/slurm.py | 44 +++++++++++++++++---------------- milabench/system.py | 19 ++++++++++++++ milabench/utils.py | 2 +- scripts/article/run_cuda_dev.sh | 15 ----------- 5 files changed, 46 insertions(+), 40 deletions(-) diff --git a/milabench/_version.py b/milabench/_version.py index ad08908db..0640b0ea1 100644 --- a/milabench/_version.py +++ b/milabench/_version.py @@ -1,5 +1,5 @@ """This file is generated, do not modify""" -__tag__ = "v0.1.0-29-g56152dc0" -__commit__ = "56152dc0f5938bfdf261798ad8a8df4e42ac3045" -__date__ = "2024-08-01 22:22:13 -0400" +__tag__ = "v0.1.0-30-g3d8e9f5b" +__commit__ = "3d8e9f5b25206b42fac1c2030a0f56a4b6dac114" +__date__ = "2024-08-05 15:22:11 -0400" diff --git a/milabench/cli/slurm.py b/milabench/cli/slurm.py index 3d00b5312..9f245e415 100644 --- a/milabench/cli/slurm.py +++ b/milabench/cli/slurm.py @@ -4,7 +4,7 @@ import subprocess from coleo import tooled -from ..system import get_gpu_capacity +from ..system import get_gpu_capacity, is_loopback def gethostname(host): @@ -15,26 +15,14 @@ def gethostname(host): return host -def getip(ip): - # This does get a good IP for everything except the local node - +def resolve_hostname(ip): hostname, _, iplist = socket.gethostbyaddr(ip) - if len(iplist) > 1: - print("Multiple IP found") + for ip in iplist: + if is_loopback(ip): + return hostname, True - from milabench.system import get_remote_ip - - resolved = iplist[0] - if resolved.startswith("127.0"): - ips = get_remote_ip() - for ip in ips: - if "." in ip and not ip.startswith("127.0"): - return ip - - return resolved - - return resolved + return hostname, False @tooled @@ -42,14 +30,17 @@ def cli_slurm_system(): """Generate a system file based of slurm environment variables""" node_list = expand_node_list(os.getenv("SLURM_JOB_NODELIST", "")) + def make_node(i, ip): + hostname, local = resolve_hostname(ip) + node = { "name": ip, - "ip": getip(ip), + "ip": hostname, "hostname": gethostname(ip), "user": getpass.getuser(), - "main": i == 0, + "main": local, "sshport": 22, } @@ -59,9 +50,20 @@ def make_node(i, ip): return node # nvidia-smi --query-gpu=memory.total --format=csv + + nodes = [make_node(i, ip) for i, ip in enumerate(node_list)] + + # ensure there is a main + # either it is the local node or first node + for node in nodes: + if node.get("main", False): + break + else: + nodes[0]["main"] = True + system = { "arch": "cuda", - "nodes": [make_node(i, ip) for i, ip in enumerate(node_list)], + "nodes": nodes, } capacity = get_gpu_capacity() diff --git a/milabench/system.py b/milabench/system.py index 7db61e5ea..6f7a9cf32 100644 --- a/milabench/system.py +++ b/milabench/system.py @@ -4,6 +4,8 @@ from dataclasses import dataclass, field import sys from contextlib import contextmanager +import ipaddress + import psutil import yaml from voir.instruments.gpu import get_gpu_info @@ -249,6 +251,21 @@ def get_remote_ip(): return set(result) + + + +def is_loopback(address: str) -> bool: + try: + # Create an IP address object + ip = ipaddress.ip_address(address) + # Check if the address is a loopback address + return ip.is_loopback + except ValueError: + # If the address is invalid, return False + return False + + + def _resolve_ip(ip): hostname = ip aliaslist = [] @@ -327,7 +344,9 @@ def resolve_addresses(nodes): or (hostname in ("localhost", socket.gethostname(), "127.0.0.1")) or (socket.gethostname().startswith(hostname)) or len(ip_list.intersection(ipaddrlist)) > 0 + or any([is_loopback(ip) for ip in ipaddrlist]) ) + # cn-g005 cn-g005.server.mila.quebec # print(hostname, socket.gethostname()) node["local"] = is_local diff --git a/milabench/utils.py b/milabench/utils.py index 2e732200d..d92442814 100644 --- a/milabench/utils.py +++ b/milabench/utils.py @@ -231,7 +231,7 @@ def select_nodes(nodes, n): ranked = [] for node in nodes: - if node["main"]: + if node.get("main", False): ranked.insert(0, node) else: ranked.append(node) diff --git a/scripts/article/run_cuda_dev.sh b/scripts/article/run_cuda_dev.sh index df62775a0..37729f4b7 100644 --- a/scripts/article/run_cuda_dev.sh +++ b/scripts/article/run_cuda_dev.sh @@ -95,21 +95,6 @@ else fi -( - . $MILABENCH_WORDIR/env/bin/activate - pip show setuptools - pip show pip - pip install git+https://github.com/Delaunay/voir.git@patch-8 -) - -( - . $BENCHMARK_VENV/bin/activate - pip show setuptools - pip show pip - pip install git+https://github.com/Delaunay/voir.git@patch-8 -) - - if [ "$MILABENCH_PREPARE" -eq 0 ]; then cd $MILABENCH_WORDIR From daf3612e0efdcc0bb9fb041129d660fc12f06914 Mon Sep 17 00:00:00 2001 From: "pierre.delaunay" Date: Tue, 6 Aug 2024 15:04:06 -0400 Subject: [PATCH 3/6] update main attr lookup --- milabench/remote.py | 10 +++++----- milabench/utils.py | 2 +- scripts/article/run_cuda_dev.sh | 2 ++ 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/milabench/remote.py b/milabench/remote.py index 725c57b45..4cebb2919 100644 --- a/milabench/remote.py +++ b/milabench/remote.py @@ -70,9 +70,9 @@ def milabench_remote_sync(pack, worker): def should_run_for(worker, setup_for): if setup_for == "worker": - return not worker["main"] + return not worker.get("main", False) - return worker["main"] + return worker.get("main", False) def worker_commands(pack, worker_plan, setup_for="worker"): @@ -178,7 +178,7 @@ def is_multinode(pack): count = 0 nodes = pack.config["system"]["nodes"] for node in nodes: - if not node["main"]: + if not node.get("main", False): count += 1 return count > 0 @@ -191,12 +191,12 @@ def is_remote(pack): def is_main_local(pack): """Only the local main can send remote commands to remote""" self = pack.config["system"]["self"] - return self is not None and self["local"] and self["main"] + return self is not None and self["local"] and self.get("main", False) def is_worker(pack): self = pack.config["system"]["self"] - return self is not None and (not self["main"]) + return self is not None and (not self.get("main", False)) def _sanity(pack, setup_for): diff --git a/milabench/utils.py b/milabench/utils.py index d92442814..bb99fdd75 100644 --- a/milabench/utils.py +++ b/milabench/utils.py @@ -242,7 +242,7 @@ def select_nodes(nodes, n): def enumerate_rank(nodes): rank = 1 for node in nodes: - if node["main"]: + if node.get("main", False): yield 0, node else: yield rank, node diff --git a/scripts/article/run_cuda_dev.sh b/scripts/article/run_cuda_dev.sh index 37729f4b7..0f52a1724 100644 --- a/scripts/article/run_cuda_dev.sh +++ b/scripts/article/run_cuda_dev.sh @@ -99,6 +99,8 @@ if [ "$MILABENCH_PREPARE" -eq 0 ]; then cd $MILABENCH_WORDIR + milabench prepare --system $MILABENCH_WORDIR/system.yaml "$@" + # milabench prepare "$@" # # Run the benchmakrs From da4f8bef902a3a5ecbdc912b9521b89bbe8b15da Mon Sep 17 00:00:00 2001 From: "pierre.delaunay" Date: Fri, 9 Aug 2024 13:37:06 -0400 Subject: [PATCH 4/6] diffusion multinode working --- .pin/constraints-rocm-torch.txt | 362 ++++++++------- .pin/constraints-xpu-torch.txt | 399 ++++++++-------- benchmarks/brax/requirements.in | 3 +- benchmarks/brax/requirements.rocm.txt | 429 ------------------ benchmarks/brax/requirements.xpu.txt | 88 +--- benchmarks/diffusion/main.py | 4 +- benchmarks/diffusion/prepare.py | 15 +- benchmarks/diffusion/requirements.xpu.txt | 333 ++++++++++++++ benchmarks/dinov2/benchfile.py | 7 +- benchmarks/dinov2/requirements.xpu.txt | 217 +++++++++ benchmarks/dinov2/voirfile.py | 21 +- benchmarks/flops/requirements.xpu.txt | 40 +- benchmarks/huggingface/requirements.xpu.txt | 32 +- benchmarks/lightning/requirements.xpu.txt | 235 ++++++++++ benchmarks/llama/requirements.xpu.txt | 36 +- benchmarks/super-slomo/requirements.rocm.txt | 16 +- benchmarks/super-slomo/requirements.xpu.txt | 39 +- benchmarks/timm/requirements.xpu.txt | 24 +- benchmarks/torchvision/requirements.xpu.txt | 35 +- .../torchvision_ddp/requirements.xpu.txt | 36 +- config/base.yaml | 20 +- constraints/extra/torch.cuda.txt | 2 + constraints/extra/torch.hpu.txt | 18 + constraints/extra/torch.rocm.txt | 1 + constraints/extra/torch.xpu.txt | 20 + constraints/xpu.txt | 16 +- milabench/_version.py | 6 +- milabench/cli/__init__.py | 5 + milabench/cli/env.py | 27 ++ milabench/cli/slurm.py | 31 +- milabench/commands/__init__.py | 14 +- milabench/scripts/activator | 6 + milabench/sizer.py | 1 + milabench/system.py | 62 +-- scripts/article/run_cuda_dev.sh | 20 +- 35 files changed, 1504 insertions(+), 1116 deletions(-) delete mode 100644 benchmarks/brax/requirements.rocm.txt create mode 100644 benchmarks/diffusion/requirements.xpu.txt create mode 100644 benchmarks/dinov2/requirements.xpu.txt create mode 100644 benchmarks/lightning/requirements.xpu.txt create mode 100644 constraints/extra/torch.cuda.txt create mode 100644 constraints/extra/torch.hpu.txt create mode 100644 constraints/extra/torch.rocm.txt create mode 100644 constraints/extra/torch.xpu.txt create mode 100644 milabench/cli/env.py diff --git a/.pin/constraints-rocm-torch.txt b/.pin/constraints-rocm-torch.txt index 09a8c47f7..53ae14621 100644 --- a/.pin/constraints-rocm-torch.txt +++ b/.pin/constraints-rocm-torch.txt @@ -2,12 +2,10 @@ # This file is autogenerated by pip-compile with Python 3.10 # by the following command: # -# pip-compile --output-file=.pin/constraints-rocm-torch.txt .pin/tmp-constraints.txt benchmarks/accelerate_opt/requirements.in benchmarks/brax/requirements.in benchmarks/dlrm/requirements.in benchmarks/flops/requirements.in benchmarks/huggingface/requirements.in benchmarks/llama/requirements.in benchmarks/stargan/requirements.in benchmarks/super-slomo/requirements.in benchmarks/timm/requirements.in benchmarks/torchvision/requirements.in benchmarks/torchvision_ddp/requirements.in +# pip-compile --output-file=.pin/constraints-rocm-torch.txt .pin/tmp-constraints.txt benchmarks/brax/requirements.in benchmarks/diffusion/requirements.in benchmarks/dinov2/requirements.in benchmarks/flops/requirements.in benchmarks/huggingface/requirements.in benchmarks/lightning/requirements.in benchmarks/llama/requirements.in benchmarks/llm/requirements.in benchmarks/super-slomo/requirements.in benchmarks/timm/requirements.in benchmarks/torchvision/requirements.in benchmarks/torchvision_ddp/requirements.in # ---extra-index-url https://pypi.ngc.nvidia.com --extra-index-url https://download.pytorch.org/whl/rocm6.0 --find-links https://storage.googleapis.com/jax-releases/jax_cuda_releases.html ---trusted-host pypi.ngc.nvidia.com absl-py==2.1.0 # via @@ -19,32 +17,37 @@ absl-py==2.1.0 # mujoco-mjx # optax # orbax-checkpoint - # tensorboard -accelerate==0.32.1 - # via -r benchmarks/accelerate_opt/requirements.in -aiohttp==3.9.5 +accelerate==0.33.0 + # via + # -r benchmarks/diffusion/requirements.in + # diffusers +aiohappyeyeballs==2.3.5 + # via aiohttp +aiohttp==3.10.2 # via # datasets # fsspec aiosignal==1.3.1 # via aiohttp -annotated-types==0.7.0 - # via pydantic antlr4-python3-runtime==4.9.3 # via omegaconf +argklass==1.4.4 + # via + # -r benchmarks/diffusion/requirements.in + # -r benchmarks/llm/requirements.in asttokens==2.4.1 # via giving async-timeout==4.0.3 # via aiohttp -attrs==23.2.0 +attrs==24.2.0 # via aiohttp -beautifulsoup4==4.12.3 - # via gdown blinker==1.8.2 # via flask +blobfile==2.1.1 + # via torchtune brax==0.10.5 # via -r benchmarks/brax/requirements.in -certifi==2024.6.2 +certifi==2024.7.4 # via requests charset-normalizer==3.3.2 # via requests @@ -53,31 +56,28 @@ chex==0.1.86 click==8.1.7 # via flask cloudpickle==3.0.0 - # via gym + # via + # gym + # submitit codefind==0.1.6 # via ptera contextlib2==21.6.0 # via ml-collections datasets==2.20.0 # via - # -r benchmarks/accelerate_opt/requirements.in + # -r benchmarks/diffusion/requirements.in # -r benchmarks/llama/requirements.in - # evaluate -deepspeed==0.14.4 - # via -r benchmarks/accelerate_opt/requirements.in + # torchtune +diffusers[torch]==0.30.0 + # via -r benchmarks/diffusion/requirements.in dill==0.3.8 # via # datasets - # evaluate # multiprocess dm-env==1.6 # via brax dm-tree==0.1.8 # via dm-env -docker==7.1.0 - # via torchx -docstring-parser==0.16 - # via torchx etils[epath,epy]==1.7.0 # via # brax @@ -85,22 +85,18 @@ etils[epath,epy]==1.7.0 # mujoco-mjx # optax # orbax-checkpoint -evaluate==0.4.2 - # via -r benchmarks/accelerate_opt/requirements.in executing==1.2.0 # via varname fairscale==0.4.13 # via -r benchmarks/llama/requirements.in -fbgemm-gpu==0.7.0+rocm6.0 - # via torchrec filelock==3.15.4 # via + # blobfile # datasets - # gdown + # diffusers # huggingface-hub # pytorch-triton-rocm # torch - # torchx # transformers fire==0.6.0 # via -r benchmarks/llama/requirements.in @@ -120,53 +116,53 @@ fsspec[http]==2024.5.0 # via # datasets # etils - # evaluate # huggingface-hub + # lightning + # pytorch-lightning # torch - # torchx -future==1.0.0 - # via -r benchmarks/dlrm/requirements.in -gdown==5.2.0 - # via -r benchmarks/stargan/requirements.in +fvcore==0.1.5.post20221221 + # via -r benchmarks/dinov2/requirements.in giving==0.4.2 # via # ptera # voir glfw==2.7.0 # via mujoco -graphviz==0.20.3 - # via torchviz -grpcio==1.65.1 - # via - # brax - # tensorboard +grpcio==1.65.4 + # via brax gym==0.26.2 # via brax gym-notices==0.0.8 # via gym hjson==3.1.0 - # via deepspeed -huggingface-hub==0.23.5 + # via argklass +huggingface-hub==0.24.5 # via # -r benchmarks/timm/requirements.in # accelerate # datasets - # evaluate + # diffusers # tokenizers + # torchtune # transformers idna==3.7 # via # requests # yarl -importlib-metadata==8.0.0 - # via torchx +importlib-metadata==8.2.0 + # via diffusers importlib-resources==6.4.0 # via + # argklass # etils # torchcompat +iopath==0.1.10 + # via + # -r benchmarks/dinov2/requirements.in + # fvcore itsdangerous==2.2.0 # via flask -jax[cuda12]==0.4.30 +jax[cuda12]==0.4.31 # via # -r benchmarks/brax/requirements.in # brax @@ -176,11 +172,11 @@ jax[cuda12]==0.4.30 # mujoco-mjx # optax # orbax-checkpoint -jax-cuda12-pjrt==0.4.30 +jax-cuda12-pjrt==0.4.31 # via jax-cuda12-plugin -jax-cuda12-plugin[with-cuda]==0.4.30 +jax-cuda12-plugin[with-cuda]==0.4.31 # via jax -jaxlib==0.4.30 +jaxlib==0.4.31 # via # brax # chex @@ -196,12 +192,15 @@ jinja2==3.1.4 # brax # flask # torch -joblib==1.4.2 - # via scikit-learn -lightning-utilities==0.11.5 - # via torchmetrics -markdown==3.6 - # via tensorboard +lightning==2.4.0 + # via -r benchmarks/lightning/requirements.in +lightning-utilities==0.11.6 + # via + # lightning + # pytorch-lightning + # torchmetrics +lxml==4.9.4 + # via blobfile markdown-it-py==3.0.0 # via rich markupsafe==2.1.5 @@ -223,100 +222,89 @@ msgpack==1.0.8 # via # flax # orbax-checkpoint -mujoco==3.2.0 +mujoco==3.2.2 # via # brax # mujoco-mjx -mujoco-mjx==3.2.0 +mujoco-mjx==3.2.2 # via brax multidict==6.0.5 # via # aiohttp # yarl multiprocess==0.70.16 - # via - # datasets - # evaluate -mypy-extensions==1.0.0 - # via typing-inspect + # via datasets nest-asyncio==1.6.0 # via orbax-checkpoint networkx==3.3 # via torch -ninja==1.11.1.1 - # via deepspeed numpy==1.26.4 # via - # -r benchmarks/dlrm/requirements.in - # -r benchmarks/stargan/requirements.in # -r benchmarks/super-slomo/requirements.in # accelerate # brax # chex # datasets - # deepspeed + # diffusers # dm-env - # evaluate # fairscale - # fbgemm-gpu # flax + # fvcore # gym # jax # jaxlib # jaxopt # ml-dtypes # mujoco - # onnx # opencv-python # opt-einsum # optax # orbax-checkpoint # pandas # pyarrow - # scikit-learn # scipy - # tensorboard # tensorboardx # tensorstore # torchmetrics + # torchtune # torchvision # transformers # trimesh -nvidia-cublas-cu12==12.5.3.2 + # xformers +nvidia-cublas-cu12==12.6.0.22 # via # jax-cuda12-plugin # nvidia-cudnn-cu12 # nvidia-cusolver-cu12 -nvidia-cuda-cupti-cu12==12.5.82 +nvidia-cuda-cupti-cu12==12.6.37 # via jax-cuda12-plugin -nvidia-cuda-nvcc-cu12==12.5.82 +nvidia-cuda-nvcc-cu12==12.6.20 # via jax-cuda12-plugin -nvidia-cuda-runtime-cu12==12.5.82 +nvidia-cuda-runtime-cu12==12.6.37 # via jax-cuda12-plugin -nvidia-cudnn-cu12==9.2.1.18 +nvidia-cudnn-cu12==9.3.0.75 # via jax-cuda12-plugin -nvidia-cufft-cu12==11.2.3.61 +nvidia-cufft-cu12==11.2.6.28 # via jax-cuda12-plugin -nvidia-cusolver-cu12==11.6.3.83 +nvidia-cusolver-cu12==11.6.4.38 # via jax-cuda12-plugin -nvidia-cusparse-cu12==12.5.1.3 +nvidia-cusparse-cu12==12.5.2.23 # via # jax-cuda12-plugin # nvidia-cusolver-cu12 -nvidia-ml-py==12.555.43 - # via deepspeed nvidia-nccl-cu12==2.22.3 # via jax-cuda12-plugin -nvidia-nvjitlink-cu12==12.5.82 +nvidia-nvjitlink-cu12==12.6.20 # via # jax-cuda12-plugin # nvidia-cufft-cu12 # nvidia-cusolver-cu12 # nvidia-cusparse-cu12 omegaconf==2.3.0 - # via voir -onnx==1.16.1 - # via -r benchmarks/dlrm/requirements.in + # via + # -r benchmarks/dinov2/requirements.in + # torchtune + # voir opencv-python==4.10.0.84 # via -r benchmarks/super-slomo/requirements.in opt-einsum==3.3.0 @@ -325,173 +313,215 @@ optax==0.2.3 # via # brax # flax -orbax-checkpoint==0.5.21 +orbax-checkpoint==0.5.23 # via # brax # flax -ovld==0.3.5 +ovld==0.3.8 # via voir packaging==24.1 # via # accelerate # datasets - # deepspeed - # evaluate # huggingface-hub + # lightning # lightning-utilities + # pytorch-lightning # tensorboardx # torchmetrics # transformers pandas==2.2.2 - # via - # datasets - # evaluate + # via datasets pillow==10.4.0 # via + # -r benchmarks/huggingface/requirements.in # brax + # diffusers + # fvcore # torchvision -protobuf==4.25.3 +portalocker==2.10.1 + # via iopath +protobuf==5.27.3 # via - # onnx # orbax-checkpoint - # tensorboard # tensorboardx psutil==5.9.8 # via # accelerate - # deepspeed # voir ptera==1.4.1 # via voir -py-cpuinfo==9.0.0 - # via deepspeed pyarrow==17.0.0 # via datasets pyarrow-hotfix==0.6 # via datasets -pydantic==2.7.4 - # via deepspeed -pydantic-core==2.18.4 - # via pydantic -pydot==3.0.1 - # via -r benchmarks/dlrm/requirements.in +pycryptodomex==3.20.0 + # via blobfile pygments==2.18.0 # via rich pynvml==11.5.3 # via voir pyopengl==3.1.7 # via mujoco -pyparsing==3.1.2 - # via pydot -pyre-extensions==0.0.30 - # via torchx -pysocks==1.7.1 - # via requests python-dateutil==2.9.0.post0 # via pandas pytinyrenderer==0.0.14 # via brax -pytorch-triton-rocm==2.3.1 +pytorch-lightning==2.4.0 + # via lightning +pytorch-triton-rocm==3.0.0 # via torch pytz==2024.1 # via pandas -pyyaml==6.0.1 +pyyaml==6.0.2 # via + # -r benchmarks/llm/requirements.in # -r benchmarks/timm/requirements.in # accelerate # datasets # flax + # fvcore # huggingface-hub + # lightning # ml-collections # omegaconf # orbax-checkpoint - # torchx + # pytorch-lightning # transformers + # yacs reactivex==4.0.4 # via giving -regex==2024.5.15 - # via transformers -requests[socks]==2.32.3 +regex==2024.7.24 + # via + # diffusers + # tiktoken + # transformers +requests==2.32.3 # via # datasets - # docker - # evaluate - # gdown + # diffusers # huggingface-hub + # tiktoken # transformers rich==13.7.1 # via - # -r benchmarks/accelerate_opt/requirements.in # flax # voir -safetensors==0.4.3 +safetensors==0.4.4 # via # -r benchmarks/timm/requirements.in # accelerate + # diffusers + # torchtune # transformers -scikit-learn==1.5.1 - # via -r benchmarks/dlrm/requirements.in scipy==1.14.0 # via + # -r benchmarks/dinov2/requirements.in # brax # jax # jaxlib # jaxopt # mujoco-mjx - # scikit-learn sentencepiece==0.2.0 - # via -r benchmarks/llama/requirements.in + # via + # -r benchmarks/llama/requirements.in + # torchtune six==1.16.0 # via # asttokens # fire # ml-collections # python-dateutil - # tensorboard -soupsieve==2.5 - # via beautifulsoup4 -sympy==1.13.0 +submitit==1.5.1 + # via -r benchmarks/dinov2/requirements.in +sympy==1.13.1 # via torch tabulate==0.9.0 - # via torchx -tensorboard==2.17.0 - # via -r benchmarks/dlrm/requirements.in -tensorboard-data-server==0.7.2 - # via tensorboard + # via fvcore tensorboardx==2.6.2.2 # via brax -tensorstore==0.1.63 +tensorstore==0.1.64 # via # flax # orbax-checkpoint termcolor==2.4.0 - # via fire -threadpoolctl==3.5.0 - # via scikit-learn + # via + # fire + # fvcore +tiktoken==0.7.0 + # via torchtune tokenizers==0.19.1 # via transformers toolz==0.12.1 # via chex -tqdm==4.66.4 +torch==2.4.0+rocm6.0 # via - # -r benchmarks/dlrm/requirements.in + # -r benchmarks/brax/requirements.in + # -r benchmarks/dinov2/requirements.in + # -r benchmarks/flops/requirements.in + # -r benchmarks/huggingface/requirements.in + # -r benchmarks/lightning/requirements.in + # -r benchmarks/llama/requirements.in + # -r benchmarks/llm/requirements.in + # -r benchmarks/super-slomo/requirements.in + # -r benchmarks/timm/requirements.in + # -r benchmarks/torchvision/requirements.in + # -r benchmarks/torchvision_ddp/requirements.in + # accelerate + # diffusers + # fairscale + # lightning + # pytorch-lightning + # torchmetrics + # torchvision + # xformers +torchao==0.3.1 + # via torchtune +torchcompat==1.1.4 + # via + # -c .pin/../constraints/rocm.txt + # -r benchmarks/flops/requirements.in + # -r benchmarks/lightning/requirements.in + # -r benchmarks/torchvision/requirements.in + # -r benchmarks/torchvision_ddp/requirements.in +torchmetrics==1.4.1 + # via + # -r benchmarks/dinov2/requirements.in + # lightning + # pytorch-lightning +torchtune==0.2.1 + # via -r benchmarks/llm/requirements.in +torchvision==0.19.0+rocm6.0 + # via + # -r benchmarks/diffusion/requirements.in + # -r benchmarks/dinov2/requirements.in + # -r benchmarks/flops/requirements.in + # -r benchmarks/lightning/requirements.in + # -r benchmarks/super-slomo/requirements.in + # -r benchmarks/timm/requirements.in + # -r benchmarks/torchvision/requirements.in + # -r benchmarks/torchvision_ddp/requirements.in +tqdm==4.66.5 + # via + # -r benchmarks/diffusion/requirements.in # -r benchmarks/flops/requirements.in # -r benchmarks/super-slomo/requirements.in # -r benchmarks/torchvision/requirements.in # -r benchmarks/torchvision_ddp/requirements.in # datasets - # deepspeed - # evaluate - # gdown + # fvcore # huggingface-hub - # torchrec + # iopath + # lightning + # pytorch-lightning + # torchtune # transformers -transformers==4.42.4 +transformers==4.44.0 # via - # -r benchmarks/accelerate_opt/requirements.in + # -r benchmarks/diffusion/requirements.in # -r benchmarks/huggingface/requirements.in # -r benchmarks/llama/requirements.in -trimesh==4.4.3 +trimesh==4.4.4 # via # brax # mujoco-mjx @@ -502,47 +532,45 @@ typing-extensions==4.12.2 # etils # flax # huggingface-hub + # iopath + # lightning # lightning-utilities # orbax-checkpoint - # pydantic - # pydantic-core - # pyre-extensions + # pytorch-lightning # reactivex + # submitit # torch - # typing-inspect -typing-inspect==0.9.0 - # via pyre-extensions tzdata==2024.1 # via pandas -urllib3==1.26.19 +urllib3==2.2.2 # via - # docker + # blobfile # requests - # torchx varname==0.10.0 # via giving voir==0.2.17 # via # -c .pin/../constraints/rocm.txt - # -r benchmarks/accelerate_opt/requirements.in # -r benchmarks/brax/requirements.in - # -r benchmarks/dlrm/requirements.in + # -r benchmarks/diffusion/requirements.in + # -r benchmarks/dinov2/requirements.in # -r benchmarks/flops/requirements.in # -r benchmarks/huggingface/requirements.in + # -r benchmarks/lightning/requirements.in # -r benchmarks/llama/requirements.in - # -r benchmarks/stargan/requirements.in + # -r benchmarks/llm/requirements.in # -r benchmarks/super-slomo/requirements.in # -r benchmarks/timm/requirements.in # -r benchmarks/torchvision/requirements.in # -r benchmarks/torchvision_ddp/requirements.in werkzeug==3.0.3 - # via - # flask - # tensorboard + # via flask +xformers==0.0.27.post2 + # via -r benchmarks/dinov2/requirements.in xxhash==3.4.1 - # via - # datasets - # evaluate + # via datasets +yacs==0.1.8 + # via fvcore yarl==1.9.4 # via aiohttp zipp==3.19.2 diff --git a/.pin/constraints-xpu-torch.txt b/.pin/constraints-xpu-torch.txt index 91cf0dceb..c92fb1476 100644 --- a/.pin/constraints-xpu-torch.txt +++ b/.pin/constraints-xpu-torch.txt @@ -2,12 +2,10 @@ # This file is autogenerated by pip-compile with Python 3.10 # by the following command: # -# pip-compile --output-file=.pin/constraints-xpu-torch.txt .pin/tmp-constraints.txt benchmarks/accelerate_opt/requirements.in benchmarks/brax/requirements.in benchmarks/dlrm/requirements.in benchmarks/flops/requirements.in benchmarks/huggingface/requirements.in benchmarks/llama/requirements.in benchmarks/stargan/requirements.in benchmarks/super-slomo/requirements.in benchmarks/timm/requirements.in benchmarks/torchvision/requirements.in benchmarks/torchvision_ddp/requirements.in +# pip-compile --output-file=.pin/constraints-xpu-torch.txt .pin/tmp-constraints.txt benchmarks/brax/requirements.in benchmarks/diffusion/requirements.in benchmarks/dinov2/requirements.in benchmarks/flops/requirements.in benchmarks/huggingface/requirements.in benchmarks/lightning/requirements.in benchmarks/llama/requirements.in benchmarks/llm/requirements.in benchmarks/super-slomo/requirements.in benchmarks/timm/requirements.in benchmarks/torchvision/requirements.in benchmarks/torchvision_ddp/requirements.in constraints/extra/torch.xpu.txt # ---extra-index-url https://pypi.ngc.nvidia.com +--extra-index-url https://download.pytorch.org/whl/cpu --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ ---find-links https://storage.googleapis.com/jax-releases/jax_cuda_releases.html ---trusted-host pypi.ngc.nvidia.com absl-py==2.1.0 # via @@ -19,32 +17,37 @@ absl-py==2.1.0 # mujoco-mjx # optax # orbax-checkpoint - # tensorboard -accelerate==0.32.1 - # via -r benchmarks/accelerate_opt/requirements.in -aiohttp==3.9.5 +accelerate==0.33.0 + # via + # -r benchmarks/diffusion/requirements.in + # diffusers +aiohappyeyeballs==2.3.5 + # via aiohttp +aiohttp==3.10.2 # via # datasets # fsspec aiosignal==1.3.1 # via aiohttp -annotated-types==0.7.0 - # via pydantic antlr4-python3-runtime==4.9.3 # via omegaconf +argklass==1.4.4 + # via + # -r benchmarks/diffusion/requirements.in + # -r benchmarks/llm/requirements.in asttokens==2.4.1 # via giving async-timeout==4.0.3 # via aiohttp -attrs==23.2.0 +attrs==24.2.0 # via aiohttp -beautifulsoup4==4.12.3 - # via gdown blinker==1.8.2 # via flask +blobfile==2.1.1 + # via torchtune brax==0.10.5 # via -r benchmarks/brax/requirements.in -certifi==2024.6.2 +certifi==2024.7.4 # via requests charset-normalizer==3.3.2 # via requests @@ -53,31 +56,28 @@ chex==0.1.86 click==8.1.7 # via flask cloudpickle==3.0.0 - # via gym + # via + # gym + # submitit codefind==0.1.6 # via ptera contextlib2==21.6.0 # via ml-collections datasets==2.20.0 # via - # -r benchmarks/accelerate_opt/requirements.in + # -r benchmarks/diffusion/requirements.in # -r benchmarks/llama/requirements.in - # evaluate -deepspeed==0.14.4 - # via -r benchmarks/accelerate_opt/requirements.in + # torchtune +diffusers[torch]==0.30.0 + # via -r benchmarks/diffusion/requirements.in dill==0.3.8 # via # datasets - # evaluate # multiprocess dm-env==1.6 # via brax dm-tree==0.1.8 # via dm-env -docker==7.1.0 - # via torchx -docstring-parser==0.16 - # via torchx etils[epath,epy]==1.7.0 # via # brax @@ -85,21 +85,17 @@ etils[epath,epy]==1.7.0 # mujoco-mjx # optax # orbax-checkpoint -evaluate==0.4.2 - # via -r benchmarks/accelerate_opt/requirements.in executing==1.2.0 # via varname fairscale==0.4.13 # via -r benchmarks/llama/requirements.in -fbgemm-gpu==0.7.0 - # via torchrec filelock==3.15.4 # via + # blobfile # datasets - # gdown + # diffusers # huggingface-hub # torch - # torchx # transformers fire==0.6.0 # via -r benchmarks/llama/requirements.in @@ -119,53 +115,65 @@ fsspec[http]==2024.5.0 # via # datasets # etils - # evaluate # huggingface-hub + # lightning + # pytorch-lightning # torch - # torchx -future==1.0.0 - # via -r benchmarks/dlrm/requirements.in -gdown==5.2.0 - # via -r benchmarks/stargan/requirements.in +fvcore==0.1.5.post20221221 + # via -r benchmarks/dinov2/requirements.in giving==0.4.2 # via # ptera # voir glfw==2.7.0 # via mujoco -graphviz==0.20.3 - # via torchviz -grpcio==1.65.1 - # via - # brax - # tensorboard +grpcio==1.65.4 + # via brax gym==0.26.2 # via brax gym-notices==0.0.8 # via gym hjson==3.1.0 - # via deepspeed -huggingface-hub==0.24.0 + # via argklass +huggingface-hub==0.24.5 # via # -r benchmarks/timm/requirements.in # accelerate # datasets - # evaluate + # diffusers # tokenizers + # torchtune # transformers idna==3.7 # via # requests # yarl -importlib-metadata==8.0.0 - # via torchx +importlib-metadata==8.2.0 + # via diffusers importlib-resources==6.4.0 # via + # argklass # etils # torchcompat +intel-extension-for-openxla==0.3.0 + # via + # -c .pin/../constraints/xpu.txt + # -r constraints/extra/torch.xpu.txt +intel-extension-for-pytorch==2.3.100 + # via + # -c .pin/../constraints/xpu.txt + # -r constraints/extra/torch.xpu.txt +intel-extension-for-pytorch-deepspeed==2.1.40 + # via + # -c .pin/../constraints/xpu.txt + # -r constraints/extra/torch.xpu.txt +iopath==0.1.10 + # via + # -r benchmarks/dinov2/requirements.in + # fvcore itsdangerous==2.2.0 # via flask -jax[cuda12]==0.4.30 +jax==0.4.31 # via # -r benchmarks/brax/requirements.in # brax @@ -175,11 +183,7 @@ jax[cuda12]==0.4.30 # mujoco-mjx # optax # orbax-checkpoint -jax-cuda12-pjrt==0.4.30 - # via jax-cuda12-plugin -jax-cuda12-plugin[with-cuda]==0.4.30 - # via jax -jaxlib==0.4.30 +jaxlib==0.4.31 # via # brax # chex @@ -195,12 +199,15 @@ jinja2==3.1.4 # brax # flask # torch -joblib==1.4.2 - # via scikit-learn -lightning-utilities==0.11.5 - # via torchmetrics -markdown==3.6 - # via tensorboard +lightning==2.4.0 + # via -r benchmarks/lightning/requirements.in +lightning-utilities==0.11.6 + # via + # lightning + # pytorch-lightning + # torchmetrics +lxml==4.9.4 + # via blobfile markdown-it-py==3.0.0 # via rich markupsafe==2.1.5 @@ -222,100 +229,66 @@ msgpack==1.0.8 # via # flax # orbax-checkpoint -mujoco==3.2.0 +mujoco==3.2.2 # via # brax # mujoco-mjx -mujoco-mjx==3.2.0 +mujoco-mjx==3.2.2 # via brax multidict==6.0.5 # via # aiohttp # yarl multiprocess==0.70.16 - # via - # datasets - # evaluate -mypy-extensions==1.0.0 - # via typing-inspect + # via datasets nest-asyncio==1.6.0 # via orbax-checkpoint networkx==3.3 # via torch -ninja==1.11.1.1 - # via deepspeed numpy==1.26.4 # via - # -r benchmarks/dlrm/requirements.in - # -r benchmarks/stargan/requirements.in # -r benchmarks/super-slomo/requirements.in # accelerate # brax # chex # datasets - # deepspeed + # diffusers # dm-env - # evaluate # fairscale - # fbgemm-gpu # flax + # fvcore # gym + # intel-extension-for-openxla + # intel-extension-for-pytorch # jax # jaxlib # jaxopt # ml-dtypes # mujoco - # onnx # opencv-python # opt-einsum # optax # orbax-checkpoint # pandas # pyarrow - # scikit-learn # scipy - # tensorboard # tensorboardx # tensorstore # torchmetrics + # torchtune # torchvision # transformers # trimesh -nvidia-cublas-cu12==12.5.3.2 - # via - # jax-cuda12-plugin - # nvidia-cudnn-cu12 - # nvidia-cusolver-cu12 -nvidia-cuda-cupti-cu12==12.5.82 - # via jax-cuda12-plugin -nvidia-cuda-nvcc-cu12==12.5.82 - # via jax-cuda12-plugin -nvidia-cuda-runtime-cu12==12.5.82 - # via jax-cuda12-plugin -nvidia-cudnn-cu12==9.2.1.18 - # via jax-cuda12-plugin -nvidia-cufft-cu12==11.2.3.61 - # via jax-cuda12-plugin -nvidia-cusolver-cu12==11.6.3.83 - # via jax-cuda12-plugin -nvidia-cusparse-cu12==12.5.1.3 - # via - # jax-cuda12-plugin - # nvidia-cusolver-cu12 -nvidia-ml-py==12.555.43 - # via deepspeed -nvidia-nccl-cu12==2.22.3 - # via jax-cuda12-plugin -nvidia-nvjitlink-cu12==12.5.82 - # via - # jax-cuda12-plugin - # nvidia-cufft-cu12 - # nvidia-cusolver-cu12 - # nvidia-cusparse-cu12 + # xformers omegaconf==2.3.0 - # via voir -onnx==1.16.1 - # via -r benchmarks/dlrm/requirements.in + # via + # -r benchmarks/dinov2/requirements.in + # torchtune + # voir +oneccl-bind-pt==2.1.400+xpu + # via + # -c .pin/../constraints/xpu.txt + # -r constraints/extra/torch.xpu.txt opencv-python==4.10.0.84 # via -r benchmarks/super-slomo/requirements.in opt-einsum==3.3.0 @@ -324,221 +297,226 @@ optax==0.2.3 # via # brax # flax -orbax-checkpoint==0.5.21 +orbax-checkpoint==0.5.23 # via # brax # flax -ovld==0.3.5 +ovld==0.3.8 # via voir packaging==24.1 # via # accelerate # datasets - # deepspeed - # evaluate # huggingface-hub + # intel-extension-for-pytorch + # lightning # lightning-utilities + # pytorch-lightning # tensorboardx # torchmetrics # transformers pandas==2.2.2 - # via - # datasets - # evaluate + # via datasets pillow==10.4.0 # via + # -r benchmarks/huggingface/requirements.in # brax + # diffusers + # fvcore # torchvision -protobuf==4.25.3 +portalocker==2.10.1 + # via iopath +protobuf==5.27.3 # via - # onnx # orbax-checkpoint - # tensorboard # tensorboardx psutil==5.9.8 # via # accelerate - # deepspeed + # intel-extension-for-pytorch # voir ptera==1.4.1 # via voir -py-cpuinfo==9.0.0 - # via deepspeed pyarrow==17.0.0 # via datasets pyarrow-hotfix==0.6 # via datasets -pydantic==2.7.4 - # via deepspeed -pydantic-core==2.18.4 - # via pydantic -pydot==3.0.1 - # via -r benchmarks/dlrm/requirements.in +pycryptodomex==3.20.0 + # via blobfile pygments==2.18.0 # via rich pynvml==11.5.3 # via voir pyopengl==3.1.7 # via mujoco -pyparsing==3.1.2 - # via pydot -pyre-extensions==0.0.30 - # via torchx -pysocks==1.7.1 - # via requests python-dateutil==2.9.0.post0 # via pandas pytinyrenderer==0.0.14 # via brax +pytorch-lightning==2.4.0 + # via lightning pytz==2024.1 # via pandas -pyyaml==6.0.1 +pyyaml==6.0.2 # via + # -r benchmarks/llm/requirements.in # -r benchmarks/timm/requirements.in # accelerate # datasets # flax + # fvcore # huggingface-hub + # lightning # ml-collections # omegaconf # orbax-checkpoint - # torchx + # pytorch-lightning # transformers + # yacs reactivex==4.0.4 # via giving -regex==2024.5.15 - # via transformers -requests[socks]==2.32.3 +regex==2024.7.24 + # via + # diffusers + # tiktoken + # transformers +requests==2.32.3 # via # datasets - # docker - # evaluate - # gdown + # diffusers # huggingface-hub - # torchvision + # tiktoken # transformers rich==13.7.1 # via - # -r benchmarks/accelerate_opt/requirements.in # flax # voir -safetensors==0.4.3 +safetensors==0.4.4 # via # -r benchmarks/timm/requirements.in # accelerate + # diffusers + # torchtune # transformers -scikit-learn==1.5.1 - # via -r benchmarks/dlrm/requirements.in -scipy==1.14.0 +scipy==1.11.4 # via + # -r benchmarks/dinov2/requirements.in # brax + # intel-extension-for-openxla # jax # jaxlib # jaxopt # mujoco-mjx - # scikit-learn sentencepiece==0.2.0 - # via -r benchmarks/llama/requirements.in + # via + # -r benchmarks/llama/requirements.in + # torchtune six==1.16.0 # via # asttokens # fire # ml-collections # python-dateutil - # tensorboard -soupsieve==2.5 - # via beautifulsoup4 -sympy==1.13.0 +submitit==1.5.1 + # via -r benchmarks/dinov2/requirements.in +sympy==1.13.1 # via torch tabulate==0.9.0 - # via torchx -tensorboard==2.17.0 - # via -r benchmarks/dlrm/requirements.in -tensorboard-data-server==0.7.2 - # via tensorboard + # via fvcore tensorboardx==2.6.2.2 # via brax -tensorstore==0.1.63 +tensorstore==0.1.64 # via # flax # orbax-checkpoint termcolor==2.4.0 - # via fire -threadpoolctl==3.5.0 - # via scikit-learn + # via + # fire + # fvcore +tiktoken==0.7.0 + # via torchtune tokenizers==0.19.1 # via transformers toolz==0.12.1 # via chex -torch==2.1.0.post2+cxx11.abi +torch==2.4.0+cpu # via # -c .pin/../constraints/xpu.txt - # -r benchmarks/accelerate_opt/requirements.in # -r benchmarks/brax/requirements.in - # -r benchmarks/dlrm/requirements.in + # -r benchmarks/dinov2/requirements.in # -r benchmarks/flops/requirements.in # -r benchmarks/huggingface/requirements.in + # -r benchmarks/lightning/requirements.in # -r benchmarks/llama/requirements.in - # -r benchmarks/stargan/requirements.in + # -r benchmarks/llm/requirements.in # -r benchmarks/super-slomo/requirements.in # -r benchmarks/timm/requirements.in # -r benchmarks/torchvision/requirements.in # -r benchmarks/torchvision_ddp/requirements.in + # -r constraints/extra/torch.xpu.txt # accelerate - # deepspeed + # diffusers # fairscale + # lightning + # pytorch-lightning # torchaudio # torchmetrics # torchvision - # torchviz -torchaudio==2.1.0.post2+cxx11.abi + # xformers +torchao==0.3.1+cpu + # via torchtune +torchaudio==2.4.0+cpu # via # -c .pin/../constraints/xpu.txt - # -r benchmarks/accelerate_opt/requirements.in + # -r constraints/extra/torch.xpu.txt torchcompat==1.1.4 # via # -c .pin/../constraints/xpu.txt # -r benchmarks/flops/requirements.in + # -r benchmarks/lightning/requirements.in # -r benchmarks/torchvision/requirements.in # -r benchmarks/torchvision_ddp/requirements.in -torchmetrics==1.0.3 - # via torchrec -torchrec==0.7.0 - # via -r benchmarks/dlrm/requirements.in -torchvision==0.16.0.post2+cxx11.abi + # -r constraints/extra/torch.xpu.txt +torchmetrics==1.4.1 + # via + # -r benchmarks/dinov2/requirements.in + # lightning + # pytorch-lightning +torchtune==0.2.1+cpu + # via -r benchmarks/llm/requirements.in +torchvision==0.19.0+cpu # via # -c .pin/../constraints/xpu.txt - # -r benchmarks/accelerate_opt/requirements.in + # -r benchmarks/diffusion/requirements.in + # -r benchmarks/dinov2/requirements.in # -r benchmarks/flops/requirements.in - # -r benchmarks/stargan/requirements.in + # -r benchmarks/lightning/requirements.in # -r benchmarks/super-slomo/requirements.in # -r benchmarks/timm/requirements.in # -r benchmarks/torchvision/requirements.in # -r benchmarks/torchvision_ddp/requirements.in -torchviz==0.0.2 - # via -r benchmarks/dlrm/requirements.in -torchx==0.7.0 - # via -r benchmarks/dlrm/requirements.in -tqdm==4.66.4 + # -r constraints/extra/torch.xpu.txt +tqdm==4.66.5 # via - # -r benchmarks/dlrm/requirements.in + # -r benchmarks/diffusion/requirements.in # -r benchmarks/flops/requirements.in # -r benchmarks/super-slomo/requirements.in # -r benchmarks/torchvision/requirements.in # -r benchmarks/torchvision_ddp/requirements.in # datasets - # deepspeed - # evaluate - # gdown + # fvcore # huggingface-hub - # torchrec + # iopath + # lightning + # pytorch-lightning + # torchtune # transformers -transformers==4.42.4 +transformers==4.44.0 # via - # -r benchmarks/accelerate_opt/requirements.in + # -r benchmarks/diffusion/requirements.in # -r benchmarks/huggingface/requirements.in # -r benchmarks/llama/requirements.in -trimesh==4.4.3 +trimesh==4.4.4 # via # brax # mujoco-mjx @@ -549,47 +527,48 @@ typing-extensions==4.12.2 # etils # flax # huggingface-hub + # iopath + # lightning # lightning-utilities # orbax-checkpoint - # pydantic - # pydantic-core - # pyre-extensions + # pytorch-lightning # reactivex + # submitit # torch - # typing-inspect -typing-inspect==0.9.0 - # via pyre-extensions tzdata==2024.1 # via pandas -urllib3==1.26.19 +urllib3==2.2.2 # via - # docker + # blobfile # requests - # torchx varname==0.10.0 # via giving voir==0.2.17 # via # -c .pin/../constraints/xpu.txt - # -r benchmarks/accelerate_opt/requirements.in # -r benchmarks/brax/requirements.in - # -r benchmarks/dlrm/requirements.in + # -r benchmarks/diffusion/requirements.in + # -r benchmarks/dinov2/requirements.in # -r benchmarks/flops/requirements.in # -r benchmarks/huggingface/requirements.in + # -r benchmarks/lightning/requirements.in # -r benchmarks/llama/requirements.in - # -r benchmarks/stargan/requirements.in + # -r benchmarks/llm/requirements.in # -r benchmarks/super-slomo/requirements.in # -r benchmarks/timm/requirements.in # -r benchmarks/torchvision/requirements.in # -r benchmarks/torchvision_ddp/requirements.in + # -r constraints/extra/torch.xpu.txt werkzeug==3.0.3 - # via - # flask - # tensorboard + # via flask +wheel==0.44.0 + # via intel-extension-for-openxla +xformers==0.0.27.post2 + # via -r benchmarks/dinov2/requirements.in xxhash==3.4.1 - # via - # datasets - # evaluate + # via datasets +yacs==0.1.8 + # via fvcore yarl==1.9.4 # via aiohttp zipp==3.19.2 diff --git a/benchmarks/brax/requirements.in b/benchmarks/brax/requirements.in index 9db61b9e8..026047c3c 100644 --- a/benchmarks/brax/requirements.in +++ b/benchmarks/brax/requirements.in @@ -1,5 +1,4 @@ -jax[cuda12] ---find-links https://storage.googleapis.com/jax-releases/jax_cuda_releases.html +jax torch brax voir>=0.2.10,<0.3 diff --git a/benchmarks/brax/requirements.rocm.txt b/benchmarks/brax/requirements.rocm.txt deleted file mode 100644 index ea219cfb4..000000000 --- a/benchmarks/brax/requirements.rocm.txt +++ /dev/null @@ -1,429 +0,0 @@ -# -# This file is autogenerated by pip-compile with Python 3.10 -# by the following command: -# -# pip-compile --output-file=benchmarks/brax/requirements.rocm.txt .pin/tmp-constraints-rocm-brax.txt benchmarks/brax/requirements.in -# ---extra-index-url https://pypi.ngc.nvidia.com ---extra-index-url https://download.pytorch.org/whl/rocm6.0 ---find-links https://storage.googleapis.com/jax-releases/jax_cuda_releases.html ---trusted-host pypi.ngc.nvidia.com - -absl-py==2.1.0 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # brax - # chex - # dm-env - # ml-collections - # mujoco - # mujoco-mjx - # optax - # orbax-checkpoint -antlr4-python3-runtime==4.9.3 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # omegaconf -asttokens==2.4.1 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # giving -blinker==1.8.2 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # flask -brax==0.10.5 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # -r benchmarks/brax/requirements.in -chex==0.1.86 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # optax -click==8.1.7 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # flask -cloudpickle==3.0.0 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # gym -codefind==0.1.6 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # ptera -contextlib2==21.6.0 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # ml-collections -dm-env==1.6 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # brax -dm-tree==0.1.8 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # dm-env -etils[epath,epy]==1.7.0 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # brax - # mujoco - # mujoco-mjx - # optax - # orbax-checkpoint -executing==1.2.0 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # varname -filelock==3.15.4 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # pytorch-triton-rocm - # torch -flask==3.0.3 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # brax - # flask-cors -flask-cors==4.0.1 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # brax -flax==0.8.5 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # brax -fsspec==2024.5.0 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # etils - # torch -giving==0.4.2 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # ptera - # voir -glfw==2.7.0 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # mujoco -grpcio==1.65.1 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # brax -gym==0.26.2 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # brax -gym-notices==0.0.8 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # gym -importlib-resources==6.4.0 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # etils -itsdangerous==2.2.0 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # flask -jax[cuda12]==0.4.30 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # -r benchmarks/brax/requirements.in - # brax - # chex - # flax - # jaxopt - # mujoco-mjx - # optax - # orbax-checkpoint -jax-cuda12-pjrt==0.4.30 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # jax-cuda12-plugin -jax-cuda12-plugin[with-cuda]==0.4.30 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # jax -jaxlib==0.4.30 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # brax - # chex - # jax - # jaxopt - # mujoco-mjx - # optax - # orbax-checkpoint -jaxopt==0.8.3 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # brax -jinja2==3.1.4 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # brax - # flask - # torch -markdown-it-py==3.0.0 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # rich -markupsafe==2.1.5 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # jinja2 - # werkzeug -mdurl==0.1.2 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # markdown-it-py -ml-collections==0.1.1 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # brax -ml-dtypes==0.4.0 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # jax - # jaxlib - # tensorstore -mpmath==1.3.0 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # sympy -msgpack==1.0.8 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # flax - # orbax-checkpoint -mujoco==3.2.0 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # brax - # mujoco-mjx -mujoco-mjx==3.2.0 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # brax -nest-asyncio==1.6.0 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # orbax-checkpoint -networkx==3.3 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # torch -numpy==1.26.4 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # brax - # chex - # dm-env - # flax - # gym - # jax - # jaxlib - # jaxopt - # ml-dtypes - # mujoco - # opt-einsum - # optax - # orbax-checkpoint - # scipy - # tensorboardx - # tensorstore - # trimesh -nvidia-cublas-cu12==12.5.3.2 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # jax-cuda12-plugin - # nvidia-cudnn-cu12 - # nvidia-cusolver-cu12 -nvidia-cuda-cupti-cu12==12.5.82 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # jax-cuda12-plugin -nvidia-cuda-nvcc-cu12==12.5.82 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # jax-cuda12-plugin -nvidia-cuda-runtime-cu12==12.5.82 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # jax-cuda12-plugin -nvidia-cudnn-cu12==9.2.1.18 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # jax-cuda12-plugin -nvidia-cufft-cu12==11.2.3.61 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # jax-cuda12-plugin -nvidia-cusolver-cu12==11.6.3.83 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # jax-cuda12-plugin -nvidia-cusparse-cu12==12.5.1.3 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # jax-cuda12-plugin - # nvidia-cusolver-cu12 -nvidia-nccl-cu12==2.22.3 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # jax-cuda12-plugin -nvidia-nvjitlink-cu12==12.5.82 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # jax-cuda12-plugin - # nvidia-cufft-cu12 - # nvidia-cusolver-cu12 - # nvidia-cusparse-cu12 -omegaconf==2.3.0 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # voir -opt-einsum==3.3.0 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # jax -optax==0.2.3 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # brax - # flax -orbax-checkpoint==0.5.21 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # brax - # flax -ovld==0.3.5 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # voir -packaging==24.1 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # tensorboardx -pillow==10.4.0 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # brax -protobuf==4.25.3 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # orbax-checkpoint - # tensorboardx -psutil==5.9.8 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # voir -ptera==1.4.1 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # voir -pygments==2.18.0 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # rich -pynvml==11.5.3 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # voir -pyopengl==3.1.7 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # mujoco -pytinyrenderer==0.0.14 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # brax -pytorch-triton-rocm==2.3.1 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # torch -pyyaml==6.0.1 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # flax - # ml-collections - # omegaconf - # orbax-checkpoint -reactivex==4.0.4 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # giving -rich==13.7.1 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # flax - # voir -scipy==1.14.0 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # brax - # jax - # jaxlib - # jaxopt - # mujoco-mjx -six==1.16.0 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # asttokens - # ml-collections -sympy==1.13.0 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # torch -tensorboardx==2.6.2.2 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # brax -tensorstore==0.1.63 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # flax - # orbax-checkpoint -toolz==0.12.1 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # chex -torch==2.3.1+rocm6.0 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # -r benchmarks/brax/requirements.in -trimesh==4.4.3 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # brax - # mujoco-mjx -typing-extensions==4.12.2 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # brax - # chex - # etils - # flax - # orbax-checkpoint - # reactivex - # torch -varname==0.10.0 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # giving -voir==0.2.17 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # -c .pin/../constraints/rocm.txt - # -r benchmarks/brax/requirements.in -werkzeug==3.0.3 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # flask -zipp==3.19.2 - # via - # -c .pin/../.pin/constraints-rocm-torch.txt - # etils diff --git a/benchmarks/brax/requirements.xpu.txt b/benchmarks/brax/requirements.xpu.txt index c08c7bdaa..5b934c9ba 100644 --- a/benchmarks/brax/requirements.xpu.txt +++ b/benchmarks/brax/requirements.xpu.txt @@ -4,10 +4,8 @@ # # pip-compile --output-file=benchmarks/brax/requirements.xpu.txt .pin/tmp-constraints-xpu-brax.txt benchmarks/brax/requirements.in # ---extra-index-url https://pypi.ngc.nvidia.com +--extra-index-url https://download.pytorch.org/whl/cpu --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ ---find-links https://storage.googleapis.com/jax-releases/jax_cuda_releases.html ---trusted-host pypi.ngc.nvidia.com absl-py==2.1.0 # via @@ -107,7 +105,7 @@ glfw==2.7.0 # via # -c .pin/../.pin/constraints-xpu-torch.txt # mujoco -grpcio==1.65.1 +grpcio==1.65.4 # via # -c .pin/../.pin/constraints-xpu-torch.txt # brax @@ -127,7 +125,7 @@ itsdangerous==2.2.0 # via # -c .pin/../.pin/constraints-xpu-torch.txt # flask -jax[cuda12]==0.4.30 +jax==0.4.31 # via # -c .pin/../.pin/constraints-xpu-torch.txt # -r benchmarks/brax/requirements.in @@ -138,15 +136,7 @@ jax[cuda12]==0.4.30 # mujoco-mjx # optax # orbax-checkpoint -jax-cuda12-pjrt==0.4.30 - # via - # -c .pin/../.pin/constraints-xpu-torch.txt - # jax-cuda12-plugin -jax-cuda12-plugin[with-cuda]==0.4.30 - # via - # -c .pin/../.pin/constraints-xpu-torch.txt - # jax -jaxlib==0.4.30 +jaxlib==0.4.31 # via # -c .pin/../.pin/constraints-xpu-torch.txt # brax @@ -198,12 +188,12 @@ msgpack==1.0.8 # -c .pin/../.pin/constraints-xpu-torch.txt # flax # orbax-checkpoint -mujoco==3.2.0 +mujoco==3.2.2 # via # -c .pin/../.pin/constraints-xpu-torch.txt # brax # mujoco-mjx -mujoco-mjx==3.2.0 +mujoco-mjx==3.2.2 # via # -c .pin/../.pin/constraints-xpu-torch.txt # brax @@ -235,52 +225,6 @@ numpy==1.26.4 # tensorboardx # tensorstore # trimesh -nvidia-cublas-cu12==12.5.3.2 - # via - # -c .pin/../.pin/constraints-xpu-torch.txt - # jax-cuda12-plugin - # nvidia-cudnn-cu12 - # nvidia-cusolver-cu12 -nvidia-cuda-cupti-cu12==12.5.82 - # via - # -c .pin/../.pin/constraints-xpu-torch.txt - # jax-cuda12-plugin -nvidia-cuda-nvcc-cu12==12.5.82 - # via - # -c .pin/../.pin/constraints-xpu-torch.txt - # jax-cuda12-plugin -nvidia-cuda-runtime-cu12==12.5.82 - # via - # -c .pin/../.pin/constraints-xpu-torch.txt - # jax-cuda12-plugin -nvidia-cudnn-cu12==9.2.1.18 - # via - # -c .pin/../.pin/constraints-xpu-torch.txt - # jax-cuda12-plugin -nvidia-cufft-cu12==11.2.3.61 - # via - # -c .pin/../.pin/constraints-xpu-torch.txt - # jax-cuda12-plugin -nvidia-cusolver-cu12==11.6.3.83 - # via - # -c .pin/../.pin/constraints-xpu-torch.txt - # jax-cuda12-plugin -nvidia-cusparse-cu12==12.5.1.3 - # via - # -c .pin/../.pin/constraints-xpu-torch.txt - # jax-cuda12-plugin - # nvidia-cusolver-cu12 -nvidia-nccl-cu12==2.22.3 - # via - # -c .pin/../.pin/constraints-xpu-torch.txt - # jax-cuda12-plugin -nvidia-nvjitlink-cu12==12.5.82 - # via - # -c .pin/../.pin/constraints-xpu-torch.txt - # jax-cuda12-plugin - # nvidia-cufft-cu12 - # nvidia-cusolver-cu12 - # nvidia-cusparse-cu12 omegaconf==2.3.0 # via # -c .pin/../.pin/constraints-xpu-torch.txt @@ -294,12 +238,12 @@ optax==0.2.3 # -c .pin/../.pin/constraints-xpu-torch.txt # brax # flax -orbax-checkpoint==0.5.21 +orbax-checkpoint==0.5.23 # via # -c .pin/../.pin/constraints-xpu-torch.txt # brax # flax -ovld==0.3.5 +ovld==0.3.8 # via # -c .pin/../.pin/constraints-xpu-torch.txt # voir @@ -311,7 +255,7 @@ pillow==10.4.0 # via # -c .pin/../.pin/constraints-xpu-torch.txt # brax -protobuf==4.25.3 +protobuf==5.27.3 # via # -c .pin/../.pin/constraints-xpu-torch.txt # orbax-checkpoint @@ -340,7 +284,7 @@ pytinyrenderer==0.0.14 # via # -c .pin/../.pin/constraints-xpu-torch.txt # brax -pyyaml==6.0.1 +pyyaml==6.0.2 # via # -c .pin/../.pin/constraints-xpu-torch.txt # flax @@ -356,7 +300,7 @@ rich==13.7.1 # -c .pin/../.pin/constraints-xpu-torch.txt # flax # voir -scipy==1.14.0 +scipy==1.11.4 # via # -c .pin/../.pin/constraints-xpu-torch.txt # brax @@ -369,7 +313,7 @@ six==1.16.0 # -c .pin/../.pin/constraints-xpu-torch.txt # asttokens # ml-collections -sympy==1.13.0 +sympy==1.13.1 # via # -c .pin/../.pin/constraints-xpu-torch.txt # torch @@ -377,7 +321,7 @@ tensorboardx==2.6.2.2 # via # -c .pin/../.pin/constraints-xpu-torch.txt # brax -tensorstore==0.1.63 +tensorstore==0.1.64 # via # -c .pin/../.pin/constraints-xpu-torch.txt # flax @@ -386,12 +330,13 @@ toolz==0.12.1 # via # -c .pin/../.pin/constraints-xpu-torch.txt # chex -torch==2.1.0.post2+cxx11.abi +torch==2.4.0+cpu # via # -c .pin/../.pin/constraints-xpu-torch.txt + # -c .pin/../constraints/extra/torch.xpu.txt # -c .pin/../constraints/xpu.txt # -r benchmarks/brax/requirements.in -trimesh==4.4.3 +trimesh==4.4.4 # via # -c .pin/../.pin/constraints-xpu-torch.txt # brax @@ -413,6 +358,7 @@ varname==0.10.0 voir==0.2.17 # via # -c .pin/../.pin/constraints-xpu-torch.txt + # -c .pin/../constraints/extra/torch.xpu.txt # -c .pin/../constraints/xpu.txt # -r benchmarks/brax/requirements.in werkzeug==3.0.3 diff --git a/benchmarks/diffusion/main.py b/benchmarks/diffusion/main.py index bd6668dab..2b4fe9bfd 100644 --- a/benchmarks/diffusion/main.py +++ b/benchmarks/diffusion/main.py @@ -4,8 +4,6 @@ import math import random -from contextlib import nullcontext -from pathlib import Path import numpy as np import torch @@ -14,7 +12,6 @@ from accelerate import Accelerator from datasets import load_dataset from torchvision import transforms -from tqdm.auto import tqdm from transformers import CLIPTextModel, CLIPTokenizer from diffusers import AutoencoderKL, DDPMScheduler, UNet2DConditionModel @@ -44,6 +41,7 @@ class Arguments: lr_scheduler: str = "constant" lr_warmup_steps: int = 500 epochs: int = 10 + cache: str = None def models(accelerator, args: Arguments): diff --git a/benchmarks/diffusion/prepare.py b/benchmarks/diffusion/prepare.py index be7de0312..ed9e3f333 100755 --- a/benchmarks/diffusion/prepare.py +++ b/benchmarks/diffusion/prepare.py @@ -2,10 +2,6 @@ from dataclasses import dataclass import os -from transformers import CLIPTextModel, CLIPTokenizer - -from diffusers import AutoencoderKL, UNet2DConditionModel, DDPMScheduler -from datasets import load_dataset @dataclass @@ -14,6 +10,7 @@ class TrainingConfig: dataset: str = "lambdalabs/naruto-blip-captions" revision: str = None variant: str = None + cache: str = None def main(): @@ -22,6 +19,16 @@ def main(): parser = ArgumentParser() parser.add_arguments(TrainingConfig) args, _ = parser.parse_known_args() + # -- + + if args.cache: + os.environ["XDG_CACHE_HOME"] = str(args.cache) + + # -- + from transformers import CLIPTextModel, CLIPTokenizer + from diffusers import AutoencoderKL, UNet2DConditionModel, DDPMScheduler + from datasets import load_dataset + _ = load_dataset(args.dataset) diff --git a/benchmarks/diffusion/requirements.xpu.txt b/benchmarks/diffusion/requirements.xpu.txt new file mode 100644 index 000000000..62a1aba1e --- /dev/null +++ b/benchmarks/diffusion/requirements.xpu.txt @@ -0,0 +1,333 @@ +# +# This file is autogenerated by pip-compile with Python 3.10 +# by the following command: +# +# pip-compile --output-file=benchmarks/diffusion/requirements.xpu.txt .pin/tmp-constraints-xpu-diffusion-nodes.txt benchmarks/diffusion/requirements.in +# +--extra-index-url https://download.pytorch.org/whl/cpu +--extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ + +accelerate==0.33.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # -r benchmarks/diffusion/requirements.in + # diffusers +aiohappyeyeballs==2.3.5 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # aiohttp +aiohttp==3.10.2 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # datasets + # fsspec +aiosignal==1.3.1 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # aiohttp +antlr4-python3-runtime==4.9.3 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # omegaconf +argklass==1.4.4 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # -r benchmarks/diffusion/requirements.in +asttokens==2.4.1 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # giving +async-timeout==4.0.3 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # aiohttp +attrs==24.2.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # aiohttp +certifi==2024.7.4 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # requests +charset-normalizer==3.3.2 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # requests +codefind==0.1.6 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # ptera +datasets==2.20.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # -r benchmarks/diffusion/requirements.in +diffusers[torch]==0.30.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # -r benchmarks/diffusion/requirements.in +dill==0.3.8 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # datasets + # multiprocess +executing==1.2.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # varname +filelock==3.15.4 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # datasets + # diffusers + # huggingface-hub + # torch + # transformers +frozenlist==1.4.1 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # aiohttp + # aiosignal +fsspec[http]==2024.5.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # datasets + # huggingface-hub + # torch +giving==0.4.2 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # ptera + # voir +hjson==3.1.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # argklass +huggingface-hub==0.24.5 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # accelerate + # datasets + # diffusers + # tokenizers + # transformers +idna==3.7 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # requests + # yarl +importlib-metadata==8.2.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # diffusers +importlib-resources==6.4.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # argklass +jinja2==3.1.4 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # torch +markdown-it-py==3.0.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # rich +markupsafe==2.1.5 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # jinja2 +mdurl==0.1.2 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # markdown-it-py +mpmath==1.3.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # sympy +multidict==6.0.5 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # aiohttp + # yarl +multiprocess==0.70.16 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # datasets +networkx==3.3 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # torch +numpy==1.26.4 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # accelerate + # datasets + # diffusers + # pandas + # pyarrow + # torchvision + # transformers +omegaconf==2.3.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # voir +ovld==0.3.8 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # voir +packaging==24.1 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # accelerate + # datasets + # huggingface-hub + # transformers +pandas==2.2.2 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # datasets +pillow==10.4.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # diffusers + # torchvision +psutil==5.9.8 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # accelerate + # voir +ptera==1.4.1 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # voir +pyarrow==17.0.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # datasets +pyarrow-hotfix==0.6 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # datasets +pygments==2.18.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # rich +pynvml==11.5.3 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # voir +python-dateutil==2.9.0.post0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # pandas +pytz==2024.1 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # pandas +pyyaml==6.0.2 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # accelerate + # datasets + # huggingface-hub + # omegaconf + # transformers +reactivex==4.0.4 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # giving +regex==2024.7.24 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # diffusers + # transformers +requests==2.32.3 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # datasets + # diffusers + # huggingface-hub + # transformers +rich==13.7.1 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # voir +safetensors==0.4.4 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # accelerate + # diffusers + # transformers +six==1.16.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # asttokens + # python-dateutil +sympy==1.13.1 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # torch +tokenizers==0.19.1 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # transformers +torch==2.4.0+cpu + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # -c .pin/../constraints/extra/torch.xpu.txt + # -c .pin/../constraints/xpu.txt + # accelerate + # diffusers + # torchvision +torchvision==0.19.0+cpu + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # -c .pin/../constraints/extra/torch.xpu.txt + # -c .pin/../constraints/xpu.txt + # -r benchmarks/diffusion/requirements.in +tqdm==4.66.5 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # -r benchmarks/diffusion/requirements.in + # datasets + # huggingface-hub + # transformers +transformers==4.44.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # -r benchmarks/diffusion/requirements.in +typing-extensions==4.12.2 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # huggingface-hub + # reactivex + # torch +tzdata==2024.1 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # pandas +urllib3==2.2.2 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # requests +varname==0.10.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # giving +voir==0.2.17 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # -c .pin/../constraints/extra/torch.xpu.txt + # -c .pin/../constraints/xpu.txt + # -r benchmarks/diffusion/requirements.in +xxhash==3.4.1 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # datasets +yarl==1.9.4 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # aiohttp +zipp==3.19.2 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # importlib-metadata diff --git a/benchmarks/dinov2/benchfile.py b/benchmarks/dinov2/benchfile.py index ddfc4bc06..214a013f8 100644 --- a/benchmarks/dinov2/benchfile.py +++ b/benchmarks/dinov2/benchfile.py @@ -3,8 +3,8 @@ SOURCE_DIR = "src" -REPO_URL = "https://github.com/facebookresearch/dinov2" -BRANCH = "e1277af2ba9496fbadf7aec6eba56e8d882d1e35" +REPO_URL = "https://github.com/Delaunay/dinov2" +BRANCH = "451bc15a084f42cc97c21e3bc0be9e9158f9049c" class Dinov2(Package): @@ -28,7 +28,8 @@ def working_directory(self): def make_env(self): # Return a dict of environment variables for prepare_script and # main_script. - return super().make_env() + env = super().make_env() + return env async def install(self): await super().install() diff --git a/benchmarks/dinov2/requirements.xpu.txt b/benchmarks/dinov2/requirements.xpu.txt new file mode 100644 index 000000000..032296c6f --- /dev/null +++ b/benchmarks/dinov2/requirements.xpu.txt @@ -0,0 +1,217 @@ +# +# This file is autogenerated by pip-compile with Python 3.10 +# by the following command: +# +# pip-compile --output-file=benchmarks/dinov2/requirements.xpu.txt .pin/tmp-constraints-xpu-dinov2-giant-nodes.txt benchmarks/dinov2/requirements.in +# +--extra-index-url https://download.pytorch.org/whl/cpu +--extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ + +antlr4-python3-runtime==4.9.3 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # omegaconf +asttokens==2.4.1 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # giving +cloudpickle==3.0.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # submitit +codefind==0.1.6 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # ptera +executing==1.2.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # varname +filelock==3.15.4 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # torch +fsspec==2024.5.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # torch +fvcore==0.1.5.post20221221 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # -r benchmarks/dinov2/requirements.in +giving==0.4.2 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # ptera + # voir +iopath==0.1.10 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # -r benchmarks/dinov2/requirements.in + # fvcore +jinja2==3.1.4 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # torch +lightning-utilities==0.11.6 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # torchmetrics +markdown-it-py==3.0.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # rich +markupsafe==2.1.5 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # jinja2 +mdurl==0.1.2 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # markdown-it-py +mpmath==1.3.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # sympy +networkx==3.3 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # torch +numpy==1.26.4 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # fvcore + # scipy + # torchmetrics + # torchvision + # xformers +omegaconf==2.3.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # -r benchmarks/dinov2/requirements.in + # voir +ovld==0.3.8 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # voir +packaging==24.1 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # lightning-utilities + # torchmetrics +pillow==10.4.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # fvcore + # torchvision +portalocker==2.10.1 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # iopath +psutil==5.9.8 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # voir +ptera==1.4.1 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # voir +pygments==2.18.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # rich +pynvml==11.5.3 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # voir +pyyaml==6.0.2 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # fvcore + # omegaconf + # yacs +reactivex==4.0.4 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # giving +rich==13.7.1 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # voir +scipy==1.11.4 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # -r benchmarks/dinov2/requirements.in +six==1.16.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # asttokens +submitit==1.5.1 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # -r benchmarks/dinov2/requirements.in +sympy==1.13.1 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # torch +tabulate==0.9.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # fvcore +termcolor==2.4.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # fvcore +torch==2.4.0+cpu + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # -c .pin/../constraints/extra/torch.xpu.txt + # -c .pin/../constraints/xpu.txt + # -r benchmarks/dinov2/requirements.in + # torchmetrics + # torchvision + # xformers +torchmetrics==1.4.1 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # -r benchmarks/dinov2/requirements.in +torchvision==0.19.0+cpu + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # -c .pin/../constraints/extra/torch.xpu.txt + # -c .pin/../constraints/xpu.txt + # -r benchmarks/dinov2/requirements.in +tqdm==4.66.5 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # fvcore + # iopath +typing-extensions==4.12.2 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # iopath + # lightning-utilities + # reactivex + # submitit + # torch +varname==0.10.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # giving +voir==0.2.17 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # -c .pin/../constraints/extra/torch.xpu.txt + # -c .pin/../constraints/xpu.txt + # -r benchmarks/dinov2/requirements.in +xformers==0.0.27.post2 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # -r benchmarks/dinov2/requirements.in +yacs==0.1.8 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # fvcore + +# The following packages are considered to be unsafe in a requirements file: +# setuptools diff --git a/benchmarks/dinov2/voirfile.py b/benchmarks/dinov2/voirfile.py index f358914dc..fdc616b83 100644 --- a/benchmarks/dinov2/voirfile.py +++ b/benchmarks/dinov2/voirfile.py @@ -26,11 +26,30 @@ class Config: gpu_poll: int = 3 +def populate_slurm(): + import json + import os + + config = json.loads(os.environ["MILABENCH_CONFIG"]) + + nodes = [n["name"] for n in config["system"]["nodes"]] + + env = { + "SLURM_JOB_ID": "123", + "SLURM_JOB_NUM_NODES": "2", + "SLURM_JOB_NODELIST": ",".join(nodes), + "SLURM_NTASKS": str(len(config["system"]["nodes"])), + "SLURM_PROCID": "2", # RANK + "SLURM_LOCALID": "1", # Local RANK + } + + @configurable def instrument_main(ov, options: Config): + import os + yield ov.phases.init - import os import sys sys.path.append(os.path.dirname(__file__) + "/src/") diff --git a/benchmarks/flops/requirements.xpu.txt b/benchmarks/flops/requirements.xpu.txt index 0f73c2b33..5b59eacd4 100644 --- a/benchmarks/flops/requirements.xpu.txt +++ b/benchmarks/flops/requirements.xpu.txt @@ -4,10 +4,8 @@ # # pip-compile --output-file=benchmarks/flops/requirements.xpu.txt .pin/tmp-constraints-xpu-flops.txt benchmarks/flops/requirements.in # ---extra-index-url https://pypi.ngc.nvidia.com +--extra-index-url https://download.pytorch.org/whl/cpu --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ ---find-links https://storage.googleapis.com/jax-releases/jax_cuda_releases.html ---trusted-host pypi.ngc.nvidia.com antlr4-python3-runtime==4.9.3 # via @@ -17,14 +15,6 @@ asttokens==2.4.1 # via # -c .pin/../.pin/constraints-xpu-torch.txt # giving -certifi==2024.6.2 - # via - # -c .pin/../.pin/constraints-xpu-torch.txt - # requests -charset-normalizer==3.3.2 - # via - # -c .pin/../.pin/constraints-xpu-torch.txt - # requests codefind==0.1.6 # via # -c .pin/../.pin/constraints-xpu-torch.txt @@ -46,10 +36,6 @@ giving==0.4.2 # -c .pin/../.pin/constraints-xpu-torch.txt # ptera # voir -idna==3.7 - # via - # -c .pin/../.pin/constraints-xpu-torch.txt - # requests importlib-resources==6.4.0 # via # -c .pin/../.pin/constraints-xpu-torch.txt @@ -86,7 +72,7 @@ omegaconf==2.3.0 # via # -c .pin/../.pin/constraints-xpu-torch.txt # voir -ovld==0.3.5 +ovld==0.3.8 # via # -c .pin/../.pin/constraints-xpu-torch.txt # voir @@ -110,7 +96,7 @@ pynvml==11.5.3 # via # -c .pin/../.pin/constraints-xpu-torch.txt # voir -pyyaml==6.0.1 +pyyaml==6.0.2 # via # -c .pin/../.pin/constraints-xpu-torch.txt # omegaconf @@ -118,10 +104,6 @@ reactivex==4.0.4 # via # -c .pin/../.pin/constraints-xpu-torch.txt # giving -requests==2.32.3 - # via - # -c .pin/../.pin/constraints-xpu-torch.txt - # torchvision rich==13.7.1 # via # -c .pin/../.pin/constraints-xpu-torch.txt @@ -130,27 +112,30 @@ six==1.16.0 # via # -c .pin/../.pin/constraints-xpu-torch.txt # asttokens -sympy==1.13.0 +sympy==1.13.1 # via # -c .pin/../.pin/constraints-xpu-torch.txt # torch -torch==2.1.0.post2+cxx11.abi +torch==2.4.0+cpu # via # -c .pin/../.pin/constraints-xpu-torch.txt + # -c .pin/../constraints/extra/torch.xpu.txt # -c .pin/../constraints/xpu.txt # -r benchmarks/flops/requirements.in # torchvision torchcompat==1.1.4 # via # -c .pin/../.pin/constraints-xpu-torch.txt + # -c .pin/../constraints/extra/torch.xpu.txt # -c .pin/../constraints/xpu.txt # -r benchmarks/flops/requirements.in -torchvision==0.16.0.post2+cxx11.abi +torchvision==0.19.0+cpu # via # -c .pin/../.pin/constraints-xpu-torch.txt + # -c .pin/../constraints/extra/torch.xpu.txt # -c .pin/../constraints/xpu.txt # -r benchmarks/flops/requirements.in -tqdm==4.66.4 +tqdm==4.66.5 # via # -c .pin/../.pin/constraints-xpu-torch.txt # -r benchmarks/flops/requirements.in @@ -159,10 +144,6 @@ typing-extensions==4.12.2 # -c .pin/../.pin/constraints-xpu-torch.txt # reactivex # torch -urllib3==1.26.19 - # via - # -c .pin/../.pin/constraints-xpu-torch.txt - # requests varname==0.10.0 # via # -c .pin/../.pin/constraints-xpu-torch.txt @@ -170,5 +151,6 @@ varname==0.10.0 voir==0.2.17 # via # -c .pin/../.pin/constraints-xpu-torch.txt + # -c .pin/../constraints/extra/torch.xpu.txt # -c .pin/../constraints/xpu.txt # -r benchmarks/flops/requirements.in diff --git a/benchmarks/huggingface/requirements.xpu.txt b/benchmarks/huggingface/requirements.xpu.txt index c1806ada3..f2891e0db 100644 --- a/benchmarks/huggingface/requirements.xpu.txt +++ b/benchmarks/huggingface/requirements.xpu.txt @@ -4,10 +4,8 @@ # # pip-compile --output-file=benchmarks/huggingface/requirements.xpu.txt .pin/tmp-constraints-xpu-hf.txt benchmarks/huggingface/requirements.in # ---extra-index-url https://pypi.ngc.nvidia.com +--extra-index-url https://download.pytorch.org/whl/cpu --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ ---find-links https://storage.googleapis.com/jax-releases/jax_cuda_releases.html ---trusted-host pypi.ngc.nvidia.com antlr4-python3-runtime==4.9.3 # via @@ -17,7 +15,7 @@ asttokens==2.4.1 # via # -c .pin/../.pin/constraints-xpu-torch.txt # giving -certifi==2024.6.2 +certifi==2024.7.4 # via # -c .pin/../.pin/constraints-xpu-torch.txt # requests @@ -49,7 +47,7 @@ giving==0.4.2 # -c .pin/../.pin/constraints-xpu-torch.txt # ptera # voir -huggingface-hub==0.24.0 +huggingface-hub==0.24.5 # via # -c .pin/../.pin/constraints-xpu-torch.txt # tokenizers @@ -90,7 +88,7 @@ omegaconf==2.3.0 # via # -c .pin/../.pin/constraints-xpu-torch.txt # voir -ovld==0.3.5 +ovld==0.3.8 # via # -c .pin/../.pin/constraints-xpu-torch.txt # voir @@ -99,6 +97,10 @@ packaging==24.1 # -c .pin/../.pin/constraints-xpu-torch.txt # huggingface-hub # transformers +pillow==10.4.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # -r benchmarks/huggingface/requirements.in psutil==5.9.8 # via # -c .pin/../.pin/constraints-xpu-torch.txt @@ -115,7 +117,7 @@ pynvml==11.5.3 # via # -c .pin/../.pin/constraints-xpu-torch.txt # voir -pyyaml==6.0.1 +pyyaml==6.0.2 # via # -c .pin/../.pin/constraints-xpu-torch.txt # huggingface-hub @@ -125,7 +127,7 @@ reactivex==4.0.4 # via # -c .pin/../.pin/constraints-xpu-torch.txt # giving -regex==2024.5.15 +regex==2024.7.24 # via # -c .pin/../.pin/constraints-xpu-torch.txt # transformers @@ -138,7 +140,7 @@ rich==13.7.1 # via # -c .pin/../.pin/constraints-xpu-torch.txt # voir -safetensors==0.4.3 +safetensors==0.4.4 # via # -c .pin/../.pin/constraints-xpu-torch.txt # transformers @@ -146,7 +148,7 @@ six==1.16.0 # via # -c .pin/../.pin/constraints-xpu-torch.txt # asttokens -sympy==1.13.0 +sympy==1.13.1 # via # -c .pin/../.pin/constraints-xpu-torch.txt # torch @@ -154,17 +156,18 @@ tokenizers==0.19.1 # via # -c .pin/../.pin/constraints-xpu-torch.txt # transformers -torch==2.1.0.post2+cxx11.abi +torch==2.4.0+cpu # via # -c .pin/../.pin/constraints-xpu-torch.txt + # -c .pin/../constraints/extra/torch.xpu.txt # -c .pin/../constraints/xpu.txt # -r benchmarks/huggingface/requirements.in -tqdm==4.66.4 +tqdm==4.66.5 # via # -c .pin/../.pin/constraints-xpu-torch.txt # huggingface-hub # transformers -transformers==4.42.4 +transformers==4.44.0 # via # -c .pin/../.pin/constraints-xpu-torch.txt # -r benchmarks/huggingface/requirements.in @@ -174,7 +177,7 @@ typing-extensions==4.12.2 # huggingface-hub # reactivex # torch -urllib3==1.26.19 +urllib3==2.2.2 # via # -c .pin/../.pin/constraints-xpu-torch.txt # requests @@ -185,5 +188,6 @@ varname==0.10.0 voir==0.2.17 # via # -c .pin/../.pin/constraints-xpu-torch.txt + # -c .pin/../constraints/extra/torch.xpu.txt # -c .pin/../constraints/xpu.txt # -r benchmarks/huggingface/requirements.in diff --git a/benchmarks/lightning/requirements.xpu.txt b/benchmarks/lightning/requirements.xpu.txt new file mode 100644 index 000000000..338ee0fb9 --- /dev/null +++ b/benchmarks/lightning/requirements.xpu.txt @@ -0,0 +1,235 @@ +# +# This file is autogenerated by pip-compile with Python 3.10 +# by the following command: +# +# pip-compile --output-file=benchmarks/lightning/requirements.xpu.txt .pin/tmp-constraints-xpu-lightning-gpus.txt benchmarks/lightning/requirements.in +# +--extra-index-url https://download.pytorch.org/whl/cpu +--extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ + +aiohappyeyeballs==2.3.5 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # aiohttp +aiohttp==3.10.2 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # fsspec +aiosignal==1.3.1 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # aiohttp +antlr4-python3-runtime==4.9.3 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # omegaconf +asttokens==2.4.1 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # giving +async-timeout==4.0.3 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # aiohttp +attrs==24.2.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # aiohttp +codefind==0.1.6 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # ptera +executing==1.2.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # varname +filelock==3.15.4 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # torch +frozenlist==1.4.1 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # aiohttp + # aiosignal +fsspec[http]==2024.5.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # lightning + # pytorch-lightning + # torch +giving==0.4.2 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # ptera + # voir +idna==3.7 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # yarl +importlib-resources==6.4.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # torchcompat +jinja2==3.1.4 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # torch +lightning==2.4.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # -r benchmarks/lightning/requirements.in +lightning-utilities==0.11.6 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # lightning + # pytorch-lightning + # torchmetrics +markdown-it-py==3.0.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # rich +markupsafe==2.1.5 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # jinja2 +mdurl==0.1.2 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # markdown-it-py +mpmath==1.3.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # sympy +multidict==6.0.5 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # aiohttp + # yarl +networkx==3.3 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # torch +numpy==1.26.4 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # torchmetrics + # torchvision +omegaconf==2.3.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # voir +ovld==0.3.8 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # voir +packaging==24.1 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # lightning + # lightning-utilities + # pytorch-lightning + # torchmetrics +pillow==10.4.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # torchvision +psutil==5.9.8 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # voir +ptera==1.4.1 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # voir +pygments==2.18.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # rich +pynvml==11.5.3 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # voir +pytorch-lightning==2.4.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # lightning +pyyaml==6.0.2 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # lightning + # omegaconf + # pytorch-lightning +reactivex==4.0.4 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # giving +rich==13.7.1 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # voir +six==1.16.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # asttokens +sympy==1.13.1 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # torch +torch==2.4.0+cpu + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # -c .pin/../constraints/extra/torch.xpu.txt + # -c .pin/../constraints/xpu.txt + # -r benchmarks/lightning/requirements.in + # lightning + # pytorch-lightning + # torchmetrics + # torchvision +torchcompat==1.1.4 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # -c .pin/../constraints/extra/torch.xpu.txt + # -c .pin/../constraints/xpu.txt + # -r benchmarks/lightning/requirements.in +torchmetrics==1.4.1 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # lightning + # pytorch-lightning +torchvision==0.19.0+cpu + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # -c .pin/../constraints/extra/torch.xpu.txt + # -c .pin/../constraints/xpu.txt + # -r benchmarks/lightning/requirements.in +tqdm==4.66.5 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # lightning + # pytorch-lightning +typing-extensions==4.12.2 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # lightning + # lightning-utilities + # pytorch-lightning + # reactivex + # torch +varname==0.10.0 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # giving +voir==0.2.17 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # -c .pin/../constraints/extra/torch.xpu.txt + # -c .pin/../constraints/xpu.txt + # -r benchmarks/lightning/requirements.in +yarl==1.9.4 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # aiohttp + +# The following packages are considered to be unsafe in a requirements file: +# setuptools diff --git a/benchmarks/llama/requirements.xpu.txt b/benchmarks/llama/requirements.xpu.txt index bd69f7e55..95086f8eb 100644 --- a/benchmarks/llama/requirements.xpu.txt +++ b/benchmarks/llama/requirements.xpu.txt @@ -4,12 +4,14 @@ # # pip-compile --output-file=benchmarks/llama/requirements.xpu.txt .pin/tmp-constraints-xpu-llm.txt benchmarks/llama/requirements.in # ---extra-index-url https://pypi.ngc.nvidia.com +--extra-index-url https://download.pytorch.org/whl/cpu --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ ---find-links https://storage.googleapis.com/jax-releases/jax_cuda_releases.html ---trusted-host pypi.ngc.nvidia.com -aiohttp==3.9.5 +aiohappyeyeballs==2.3.5 + # via + # -c .pin/../.pin/constraints-xpu-torch.txt + # aiohttp +aiohttp==3.10.2 # via # -c .pin/../.pin/constraints-xpu-torch.txt # datasets @@ -30,11 +32,11 @@ async-timeout==4.0.3 # via # -c .pin/../.pin/constraints-xpu-torch.txt # aiohttp -attrs==23.2.0 +attrs==24.2.0 # via # -c .pin/../.pin/constraints-xpu-torch.txt # aiohttp -certifi==2024.6.2 +certifi==2024.7.4 # via # -c .pin/../.pin/constraints-xpu-torch.txt # requests @@ -90,7 +92,7 @@ giving==0.4.2 # -c .pin/../.pin/constraints-xpu-torch.txt # ptera # voir -huggingface-hub==0.24.0 +huggingface-hub==0.24.5 # via # -c .pin/../.pin/constraints-xpu-torch.txt # datasets @@ -146,7 +148,7 @@ omegaconf==2.3.0 # via # -c .pin/../.pin/constraints-xpu-torch.txt # voir -ovld==0.3.5 +ovld==0.3.8 # via # -c .pin/../.pin/constraints-xpu-torch.txt # voir @@ -192,7 +194,7 @@ pytz==2024.1 # via # -c .pin/../.pin/constraints-xpu-torch.txt # pandas -pyyaml==6.0.1 +pyyaml==6.0.2 # via # -c .pin/../.pin/constraints-xpu-torch.txt # datasets @@ -203,7 +205,7 @@ reactivex==4.0.4 # via # -c .pin/../.pin/constraints-xpu-torch.txt # giving -regex==2024.5.15 +regex==2024.7.24 # via # -c .pin/../.pin/constraints-xpu-torch.txt # transformers @@ -217,7 +219,7 @@ rich==13.7.1 # via # -c .pin/../.pin/constraints-xpu-torch.txt # voir -safetensors==0.4.3 +safetensors==0.4.4 # via # -c .pin/../.pin/constraints-xpu-torch.txt # transformers @@ -231,7 +233,7 @@ six==1.16.0 # asttokens # fire # python-dateutil -sympy==1.13.0 +sympy==1.13.1 # via # -c .pin/../.pin/constraints-xpu-torch.txt # torch @@ -243,19 +245,20 @@ tokenizers==0.19.1 # via # -c .pin/../.pin/constraints-xpu-torch.txt # transformers -torch==2.1.0.post2+cxx11.abi +torch==2.4.0+cpu # via # -c .pin/../.pin/constraints-xpu-torch.txt + # -c .pin/../constraints/extra/torch.xpu.txt # -c .pin/../constraints/xpu.txt # -r benchmarks/llama/requirements.in # fairscale -tqdm==4.66.4 +tqdm==4.66.5 # via # -c .pin/../.pin/constraints-xpu-torch.txt # datasets # huggingface-hub # transformers -transformers==4.42.4 +transformers==4.44.0 # via # -c .pin/../.pin/constraints-xpu-torch.txt # -r benchmarks/llama/requirements.in @@ -269,7 +272,7 @@ tzdata==2024.1 # via # -c .pin/../.pin/constraints-xpu-torch.txt # pandas -urllib3==1.26.19 +urllib3==2.2.2 # via # -c .pin/../.pin/constraints-xpu-torch.txt # requests @@ -280,6 +283,7 @@ varname==0.10.0 voir==0.2.17 # via # -c .pin/../.pin/constraints-xpu-torch.txt + # -c .pin/../constraints/extra/torch.xpu.txt # -c .pin/../constraints/xpu.txt # -r benchmarks/llama/requirements.in xxhash==3.4.1 diff --git a/benchmarks/super-slomo/requirements.rocm.txt b/benchmarks/super-slomo/requirements.rocm.txt index a157466d1..ccedc92ed 100644 --- a/benchmarks/super-slomo/requirements.rocm.txt +++ b/benchmarks/super-slomo/requirements.rocm.txt @@ -4,10 +4,8 @@ # # pip-compile --output-file=benchmarks/super-slomo/requirements.rocm.txt .pin/tmp-constraints-rocm-super-slomo.txt benchmarks/super-slomo/requirements.in # ---extra-index-url https://pypi.ngc.nvidia.com --extra-index-url https://download.pytorch.org/whl/rocm6.0 --find-links https://storage.googleapis.com/jax-releases/jax_cuda_releases.html ---trusted-host pypi.ngc.nvidia.com antlr4-python3-runtime==4.9.3 # via @@ -77,7 +75,7 @@ opencv-python==4.10.0.84 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/super-slomo/requirements.in -ovld==0.3.5 +ovld==0.3.8 # via # -c .pin/../.pin/constraints-rocm-torch.txt # voir @@ -101,11 +99,11 @@ pynvml==11.5.3 # via # -c .pin/../.pin/constraints-rocm-torch.txt # voir -pytorch-triton-rocm==2.3.1 +pytorch-triton-rocm==3.0.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torch -pyyaml==6.0.1 +pyyaml==6.0.2 # via # -c .pin/../.pin/constraints-rocm-torch.txt # omegaconf @@ -121,20 +119,20 @@ six==1.16.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # asttokens -sympy==1.13.0 +sympy==1.13.1 # via # -c .pin/../.pin/constraints-rocm-torch.txt # torch -torch==2.3.1+rocm6.0 +torch==2.4.0+rocm6.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/super-slomo/requirements.in # torchvision -torchvision==0.18.1+rocm6.0 +torchvision==0.19.0+rocm6.0 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/super-slomo/requirements.in -tqdm==4.66.4 +tqdm==4.66.5 # via # -c .pin/../.pin/constraints-rocm-torch.txt # -r benchmarks/super-slomo/requirements.in diff --git a/benchmarks/super-slomo/requirements.xpu.txt b/benchmarks/super-slomo/requirements.xpu.txt index 65c4ab94d..8dd4a983c 100644 --- a/benchmarks/super-slomo/requirements.xpu.txt +++ b/benchmarks/super-slomo/requirements.xpu.txt @@ -4,10 +4,8 @@ # # pip-compile --output-file=benchmarks/super-slomo/requirements.xpu.txt .pin/tmp-constraints-xpu-super-slomo.txt benchmarks/super-slomo/requirements.in # ---extra-index-url https://pypi.ngc.nvidia.com +--extra-index-url https://download.pytorch.org/whl/cpu --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ ---find-links https://storage.googleapis.com/jax-releases/jax_cuda_releases.html ---trusted-host pypi.ngc.nvidia.com antlr4-python3-runtime==4.9.3 # via @@ -17,14 +15,6 @@ asttokens==2.4.1 # via # -c .pin/../.pin/constraints-xpu-torch.txt # giving -certifi==2024.6.2 - # via - # -c .pin/../.pin/constraints-xpu-torch.txt - # requests -charset-normalizer==3.3.2 - # via - # -c .pin/../.pin/constraints-xpu-torch.txt - # requests codefind==0.1.6 # via # -c .pin/../.pin/constraints-xpu-torch.txt @@ -46,10 +36,6 @@ giving==0.4.2 # -c .pin/../.pin/constraints-xpu-torch.txt # ptera # voir -idna==3.7 - # via - # -c .pin/../.pin/constraints-xpu-torch.txt - # requests jinja2==3.1.4 # via # -c .pin/../.pin/constraints-xpu-torch.txt @@ -88,7 +74,7 @@ opencv-python==4.10.0.84 # via # -c .pin/../.pin/constraints-xpu-torch.txt # -r benchmarks/super-slomo/requirements.in -ovld==0.3.5 +ovld==0.3.8 # via # -c .pin/../.pin/constraints-xpu-torch.txt # voir @@ -112,7 +98,7 @@ pynvml==11.5.3 # via # -c .pin/../.pin/constraints-xpu-torch.txt # voir -pyyaml==6.0.1 +pyyaml==6.0.2 # via # -c .pin/../.pin/constraints-xpu-torch.txt # omegaconf @@ -120,10 +106,6 @@ reactivex==4.0.4 # via # -c .pin/../.pin/constraints-xpu-torch.txt # giving -requests==2.32.3 - # via - # -c .pin/../.pin/constraints-xpu-torch.txt - # torchvision rich==13.7.1 # via # -c .pin/../.pin/constraints-xpu-torch.txt @@ -132,22 +114,24 @@ six==1.16.0 # via # -c .pin/../.pin/constraints-xpu-torch.txt # asttokens -sympy==1.13.0 +sympy==1.13.1 # via # -c .pin/../.pin/constraints-xpu-torch.txt # torch -torch==2.1.0.post2+cxx11.abi +torch==2.4.0+cpu # via # -c .pin/../.pin/constraints-xpu-torch.txt + # -c .pin/../constraints/extra/torch.xpu.txt # -c .pin/../constraints/xpu.txt # -r benchmarks/super-slomo/requirements.in # torchvision -torchvision==0.16.0.post2+cxx11.abi +torchvision==0.19.0+cpu # via # -c .pin/../.pin/constraints-xpu-torch.txt + # -c .pin/../constraints/extra/torch.xpu.txt # -c .pin/../constraints/xpu.txt # -r benchmarks/super-slomo/requirements.in -tqdm==4.66.4 +tqdm==4.66.5 # via # -c .pin/../.pin/constraints-xpu-torch.txt # -r benchmarks/super-slomo/requirements.in @@ -156,10 +140,6 @@ typing-extensions==4.12.2 # -c .pin/../.pin/constraints-xpu-torch.txt # reactivex # torch -urllib3==1.26.19 - # via - # -c .pin/../.pin/constraints-xpu-torch.txt - # requests varname==0.10.0 # via # -c .pin/../.pin/constraints-xpu-torch.txt @@ -167,5 +147,6 @@ varname==0.10.0 voir==0.2.17 # via # -c .pin/../.pin/constraints-xpu-torch.txt + # -c .pin/../constraints/extra/torch.xpu.txt # -c .pin/../constraints/xpu.txt # -r benchmarks/super-slomo/requirements.in diff --git a/benchmarks/timm/requirements.xpu.txt b/benchmarks/timm/requirements.xpu.txt index 78f3c4a00..a39cd0043 100644 --- a/benchmarks/timm/requirements.xpu.txt +++ b/benchmarks/timm/requirements.xpu.txt @@ -4,10 +4,9 @@ # # pip-compile --output-file=benchmarks/timm/requirements.xpu.txt .pin/tmp-constraints-xpu-timm.txt benchmarks/timm/requirements.in # ---extra-index-url https://pypi.ngc.nvidia.com +--extra-index-url https://download.pytorch.org/whl/cpu --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ --find-links https://storage.googleapis.com/jax-releases/jax_cuda_releases.html ---trusted-host pypi.ngc.nvidia.com antlr4-python3-runtime==4.9.3 # via @@ -17,7 +16,7 @@ asttokens==2.4.1 # via # -c .pin/../.pin/constraints-xpu-torch.txt # giving -certifi==2024.6.2 +certifi==2024.7.4 # via # -c .pin/../.pin/constraints-xpu-torch.txt # requests @@ -48,7 +47,7 @@ giving==0.4.2 # -c .pin/../.pin/constraints-xpu-torch.txt # ptera # voir -huggingface-hub==0.24.0 +huggingface-hub==0.24.5 # via # -c .pin/../.pin/constraints-xpu-torch.txt # -r benchmarks/timm/requirements.in @@ -88,7 +87,7 @@ omegaconf==2.3.0 # via # -c .pin/../.pin/constraints-xpu-torch.txt # voir -ovld==0.3.5 +ovld==0.3.8 # via # -c .pin/../.pin/constraints-xpu-torch.txt # voir @@ -116,7 +115,7 @@ pynvml==11.5.3 # via # -c .pin/../.pin/constraints-xpu-torch.txt # voir -pyyaml==6.0.1 +pyyaml==6.0.2 # via # -c .pin/../.pin/constraints-xpu-torch.txt # -r benchmarks/timm/requirements.in @@ -130,12 +129,11 @@ requests==2.32.3 # via # -c .pin/../.pin/constraints-xpu-torch.txt # huggingface-hub - # torchvision rich==13.7.1 # via # -c .pin/../.pin/constraints-xpu-torch.txt # voir -safetensors==0.4.3 +safetensors==0.4.4 # via # -c .pin/../.pin/constraints-xpu-torch.txt # -r benchmarks/timm/requirements.in @@ -143,22 +141,22 @@ six==1.16.0 # via # -c .pin/../.pin/constraints-xpu-torch.txt # asttokens -sympy==1.13.0 +sympy==1.13.1 # via # -c .pin/../.pin/constraints-xpu-torch.txt # torch -torch==2.1.0.post2+cxx11.abi +torch==2.4.0+cpu # via # -c .pin/../.pin/constraints-xpu-torch.txt # -c .pin/../constraints/xpu.txt # -r benchmarks/timm/requirements.in # torchvision -torchvision==0.16.0.post2+cxx11.abi +torchvision==0.19.0+cpu # via # -c .pin/../.pin/constraints-xpu-torch.txt # -c .pin/../constraints/xpu.txt # -r benchmarks/timm/requirements.in -tqdm==4.66.4 +tqdm==4.66.5 # via # -c .pin/../.pin/constraints-xpu-torch.txt # huggingface-hub @@ -168,7 +166,7 @@ typing-extensions==4.12.2 # huggingface-hub # reactivex # torch -urllib3==1.26.19 +urllib3==2.2.2 # via # -c .pin/../.pin/constraints-xpu-torch.txt # requests diff --git a/benchmarks/torchvision/requirements.xpu.txt b/benchmarks/torchvision/requirements.xpu.txt index 677f04f8b..6ca557e09 100644 --- a/benchmarks/torchvision/requirements.xpu.txt +++ b/benchmarks/torchvision/requirements.xpu.txt @@ -4,10 +4,9 @@ # # pip-compile --output-file=benchmarks/torchvision/requirements.xpu.txt .pin/tmp-constraints-xpu-torchvision.txt benchmarks/torchvision/requirements.in # ---extra-index-url https://pypi.ngc.nvidia.com +--extra-index-url https://download.pytorch.org/whl/cpu --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ --find-links https://storage.googleapis.com/jax-releases/jax_cuda_releases.html ---trusted-host pypi.ngc.nvidia.com antlr4-python3-runtime==4.9.3 # via @@ -17,14 +16,6 @@ asttokens==2.4.1 # via # -c .pin/../.pin/constraints-xpu-torch.txt # giving -certifi==2024.6.2 - # via - # -c .pin/../.pin/constraints-xpu-torch.txt - # requests -charset-normalizer==3.3.2 - # via - # -c .pin/../.pin/constraints-xpu-torch.txt - # requests codefind==0.1.6 # via # -c .pin/../.pin/constraints-xpu-torch.txt @@ -46,10 +37,6 @@ giving==0.4.2 # -c .pin/../.pin/constraints-xpu-torch.txt # ptera # voir -idna==3.7 - # via - # -c .pin/../.pin/constraints-xpu-torch.txt - # requests importlib-resources==6.4.0 # via # -c .pin/../.pin/constraints-xpu-torch.txt @@ -86,7 +73,7 @@ omegaconf==2.3.0 # via # -c .pin/../.pin/constraints-xpu-torch.txt # voir -ovld==0.3.5 +ovld==0.3.8 # via # -c .pin/../.pin/constraints-xpu-torch.txt # voir @@ -110,7 +97,7 @@ pynvml==11.5.3 # via # -c .pin/../.pin/constraints-xpu-torch.txt # voir -pyyaml==6.0.1 +pyyaml==6.0.2 # via # -c .pin/../.pin/constraints-xpu-torch.txt # omegaconf @@ -118,10 +105,6 @@ reactivex==4.0.4 # via # -c .pin/../.pin/constraints-xpu-torch.txt # giving -requests==2.32.3 - # via - # -c .pin/../.pin/constraints-xpu-torch.txt - # torchvision rich==13.7.1 # via # -c .pin/../.pin/constraints-xpu-torch.txt @@ -130,11 +113,11 @@ six==1.16.0 # via # -c .pin/../.pin/constraints-xpu-torch.txt # asttokens -sympy==1.13.0 +sympy==1.13.1 # via # -c .pin/../.pin/constraints-xpu-torch.txt # torch -torch==2.1.0.post2+cxx11.abi +torch==2.4.0+cpu # via # -c .pin/../.pin/constraints-xpu-torch.txt # -c .pin/../constraints/xpu.txt @@ -145,12 +128,12 @@ torchcompat==1.1.4 # -c .pin/../.pin/constraints-xpu-torch.txt # -c .pin/../constraints/xpu.txt # -r benchmarks/torchvision/requirements.in -torchvision==0.16.0.post2+cxx11.abi +torchvision==0.19.0+cpu # via # -c .pin/../.pin/constraints-xpu-torch.txt # -c .pin/../constraints/xpu.txt # -r benchmarks/torchvision/requirements.in -tqdm==4.66.4 +tqdm==4.66.5 # via # -c .pin/../.pin/constraints-xpu-torch.txt # -r benchmarks/torchvision/requirements.in @@ -159,10 +142,6 @@ typing-extensions==4.12.2 # -c .pin/../.pin/constraints-xpu-torch.txt # reactivex # torch -urllib3==1.26.19 - # via - # -c .pin/../.pin/constraints-xpu-torch.txt - # requests varname==0.10.0 # via # -c .pin/../.pin/constraints-xpu-torch.txt diff --git a/benchmarks/torchvision_ddp/requirements.xpu.txt b/benchmarks/torchvision_ddp/requirements.xpu.txt index 4465de4fe..c741f5ad6 100644 --- a/benchmarks/torchvision_ddp/requirements.xpu.txt +++ b/benchmarks/torchvision_ddp/requirements.xpu.txt @@ -4,10 +4,8 @@ # # pip-compile --output-file=benchmarks/torchvision_ddp/requirements.xpu.txt .pin/tmp-constraints-xpu-torchvision.txt benchmarks/torchvision_ddp/requirements.in # ---extra-index-url https://pypi.ngc.nvidia.com +--extra-index-url https://download.pytorch.org/whl/cpu --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ ---find-links https://storage.googleapis.com/jax-releases/jax_cuda_releases.html ---trusted-host pypi.ngc.nvidia.com antlr4-python3-runtime==4.9.3 # via @@ -17,14 +15,6 @@ asttokens==2.4.1 # via # -c .pin/../.pin/constraints-xpu-torch.txt # giving -certifi==2024.6.2 - # via - # -c .pin/../.pin/constraints-xpu-torch.txt - # requests -charset-normalizer==3.3.2 - # via - # -c .pin/../.pin/constraints-xpu-torch.txt - # requests codefind==0.1.6 # via # -c .pin/../.pin/constraints-xpu-torch.txt @@ -46,10 +36,6 @@ giving==0.4.2 # -c .pin/../.pin/constraints-xpu-torch.txt # ptera # voir -idna==3.7 - # via - # -c .pin/../.pin/constraints-xpu-torch.txt - # requests importlib-resources==6.4.0 # via # -c .pin/../.pin/constraints-xpu-torch.txt @@ -86,7 +72,7 @@ omegaconf==2.3.0 # via # -c .pin/../.pin/constraints-xpu-torch.txt # voir -ovld==0.3.5 +ovld==0.3.8 # via # -c .pin/../.pin/constraints-xpu-torch.txt # voir @@ -110,7 +96,7 @@ pynvml==11.5.3 # via # -c .pin/../.pin/constraints-xpu-torch.txt # voir -pyyaml==6.0.1 +pyyaml==6.0.2 # via # -c .pin/../.pin/constraints-xpu-torch.txt # omegaconf @@ -118,10 +104,6 @@ reactivex==4.0.4 # via # -c .pin/../.pin/constraints-xpu-torch.txt # giving -requests==2.32.3 - # via - # -c .pin/../.pin/constraints-xpu-torch.txt - # torchvision rich==13.7.1 # via # -c .pin/../.pin/constraints-xpu-torch.txt @@ -130,11 +112,11 @@ six==1.16.0 # via # -c .pin/../.pin/constraints-xpu-torch.txt # asttokens -sympy==1.13.0 +sympy==1.13.1 # via # -c .pin/../.pin/constraints-xpu-torch.txt # torch -torch==2.1.0.post2+cxx11.abi +torch==2.4.0+cpu # via # -c .pin/../.pin/constraints-xpu-torch.txt # -c .pin/../constraints/xpu.txt @@ -145,12 +127,12 @@ torchcompat==1.1.4 # -c .pin/../.pin/constraints-xpu-torch.txt # -c .pin/../constraints/xpu.txt # -r benchmarks/torchvision_ddp/requirements.in -torchvision==0.16.0.post2+cxx11.abi +torchvision==0.19.0+cpu # via # -c .pin/../.pin/constraints-xpu-torch.txt # -c .pin/../constraints/xpu.txt # -r benchmarks/torchvision_ddp/requirements.in -tqdm==4.66.4 +tqdm==4.66.5 # via # -c .pin/../.pin/constraints-xpu-torch.txt # -r benchmarks/torchvision_ddp/requirements.in @@ -159,10 +141,6 @@ typing-extensions==4.12.2 # -c .pin/../.pin/constraints-xpu-torch.txt # reactivex # torch -urllib3==1.26.19 - # via - # -c .pin/../.pin/constraints-xpu-torch.txt - # requests varname==0.10.0 # via # -c .pin/../.pin/constraints-xpu-torch.txt diff --git a/config/base.yaml b/config/base.yaml index b9d0fdbdf..f7c378b93 100644 --- a/config/base.yaml +++ b/config/base.yaml @@ -402,6 +402,7 @@ _diffusion: --num_epochs: 5 --batch_size: 32 --num_workers: "auto({n_worker}, 8)" + --cache: "{milabench_cache}" diffusion-single: inherits: _diffusion @@ -414,6 +415,8 @@ diffusion-gpus: num_machines: 1 diffusion-nodes: + tags: + - multinode inherits: _diffusion num_machines: 2 requires_capabilities: @@ -463,7 +466,7 @@ dinov2-giant-single: method: per_gpu argv: - --config-file: src/dinov2/configs/train/vitg14.yaml + --config-file: "{benchmark_folder}/src/dinov2/configs/train/vitg14.yaml" # THOSE NEED TO BE LAST train.dataset_path=ImageNet:split=TRAIN:root={milabench_data}/FakeImageNet:extra={milabench_data}/FakeImageNet: true train.batch_size_per_gpu=32: true @@ -473,7 +476,7 @@ dinov2-giant-single: dinov2-giant-gpus: inherits: _dinov2 argv: - --config-file: src/dinov2/configs/train/vitg14.yaml + --config-file: "{benchmark_folder}/src/dinov2/configs/train/vitg14.yaml" # THOSE NEED TO BE LAST train.dataset_path=ImageNet:split=TRAIN:root={milabench_data}/FakeImageNet:extra={milabench_data}/FakeImageNet: true train.batch_size_per_gpu=32: true @@ -481,12 +484,15 @@ dinov2-giant-gpus: train.num_workers=10: true dinov2-giant-nodes: + tags: + - multinode + max_duration: 3600 inherits: _dinov2 argv: - --config-file: src/dinov2/configs/train/vitg14.yaml + --config-file: "{benchmark_folder}/src/dinov2/configs/train/vitg14.yaml" # THOSE NEED TO BE LAST - train.dataset_path=ImageNet:split=TRAIN:root={milabench_data}/FakeImageNet:extra={milabench_data}/FakeImageNet: true - train.batch_size_per_gpu=32: true + train.dataset_path=ImageFolder:root={milabench_data}/FakeImageNet: true + train.batch_size_per_gpu=12: true train.saveckp_freq=100: true train.num_workers=10: true @@ -546,6 +552,8 @@ llm-lora-ddp-gpus: llm-lora-ddp-nodes: + tags: + - multinode max_duration: 3600 inherits: _llm plan: @@ -612,6 +620,8 @@ llm-full-mp-gpus: llm-full-mp-nodes: + tags: + - multinode max_duration: 3600 inherits: _llm plan: diff --git a/constraints/extra/torch.cuda.txt b/constraints/extra/torch.cuda.txt new file mode 100644 index 000000000..aba504237 --- /dev/null +++ b/constraints/extra/torch.cuda.txt @@ -0,0 +1,2 @@ +jax[cuda12] +--find-links https://storage.googleapis.com/jax-releases/jax_cuda_releases.html diff --git a/constraints/extra/torch.hpu.txt b/constraints/extra/torch.hpu.txt new file mode 100644 index 000000000..ea8ea5176 --- /dev/null +++ b/constraints/extra/torch.hpu.txt @@ -0,0 +1,18 @@ +--extra-index-url https://download.pytorch.org/whl/cpu +--extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ + +# +# Including a package in a constraints file does not trigger installation of the package. +# +torch +torchvision +torchaudio +intel-extension-for-pytorch +oneccl_bind_pt +intel-extension-for-pytorch-deepspeed +intel-extension-for-openxla + +# +# +voir >= 0.2.15 +torchcompat >= 1.0.0 diff --git a/constraints/extra/torch.rocm.txt b/constraints/extra/torch.rocm.txt new file mode 100644 index 000000000..493c77672 --- /dev/null +++ b/constraints/extra/torch.rocm.txt @@ -0,0 +1 @@ +# No jax \ No newline at end of file diff --git a/constraints/extra/torch.xpu.txt b/constraints/extra/torch.xpu.txt new file mode 100644 index 000000000..6b7454cbc --- /dev/null +++ b/constraints/extra/torch.xpu.txt @@ -0,0 +1,20 @@ +--extra-index-url https://download.pytorch.org/whl/cpu +--extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ + +# +# Including a package in a constraints file does not trigger installation of the package. +# +torch +torchvision +torchaudio +intel-extension-for-pytorch +oneccl_bind_pt +intel-extension-for-pytorch-deepspeed + +# for jax as well +intel-extension-for-openxla + +# +# +voir >= 0.2.15 +torchcompat >= 1.0.0 diff --git a/constraints/xpu.txt b/constraints/xpu.txt index 8b8b39db7..03319b332 100644 --- a/constraints/xpu.txt +++ b/constraints/xpu.txt @@ -1,16 +1,16 @@ -# --extra-index-url https://download.pytorch.org/whl/cpu +--extra-index-url https://download.pytorch.org/whl/cpu --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ # # Including a package in a constraints file does not trigger installation of the package. # -torch>=2.1.0 -torchvision>=0.16.0a0 -torchaudio>=2.1.0a0 -intel-extension-for-pytorch>=2.1.10+xpu -oneccl_bind_pt==2.1.100+xpu -intel-extension-for-pytorch-deepspeed>=2.1.30 -intel-extension-for-openxla>=0.3.0 +torch +torchvision +torchaudio +intel-extension-for-pytorch +oneccl_bind_pt +intel-extension-for-pytorch-deepspeed +intel-extension-for-openxla # # diff --git a/milabench/_version.py b/milabench/_version.py index 0640b0ea1..715d03ad7 100644 --- a/milabench/_version.py +++ b/milabench/_version.py @@ -1,5 +1,5 @@ """This file is generated, do not modify""" -__tag__ = "v0.1.0-30-g3d8e9f5b" -__commit__ = "3d8e9f5b25206b42fac1c2030a0f56a4b6dac114" -__date__ = "2024-08-05 15:22:11 -0400" +__tag__ = "v0.1.0-32-gdaf3612e" +__commit__ = "daf3612e0efdcc0bb9fb041129d660fc12f06914" +__date__ = "2024-08-06 15:06:04 -0400" diff --git a/milabench/cli/__init__.py b/milabench/cli/__init__.py index 205942e47..0a2072368 100644 --- a/milabench/cli/__init__.py +++ b/milabench/cli/__init__.py @@ -21,6 +21,7 @@ from .summary import cli_summary from .resolve import cli_resolve from .new import cli_new +from .env import cli_env class Main: @@ -94,6 +95,10 @@ def matrix(): def resolve(): return cli_resolve() + + def env(): + """Print milabench environment variables""" + cli_env() def main(argv=None): diff --git a/milabench/cli/env.py b/milabench/cli/env.py new file mode 100644 index 000000000..3725aa9df --- /dev/null +++ b/milabench/cli/env.py @@ -0,0 +1,27 @@ + + +from milabench.system import _global_options, as_environment_variable, SystemConfig + + +from dataclasses import asdict + + +def cli_env(): + _ = SystemConfig() + + # import yaml + # print(yaml.dump(asdict(_))) + + for k, option in _global_options.items(): + env_name = as_environment_variable(k) + value = option["value"] + default = option["default"] + + if value is None or value == default: + print("# ", end="") + + print(f"export {env_name}={value}") + + +if __name__ == "__main__": + cli_env() diff --git a/milabench/cli/slurm.py b/milabench/cli/slurm.py index 9f245e415..70e660218 100644 --- a/milabench/cli/slurm.py +++ b/milabench/cli/slurm.py @@ -25,13 +25,7 @@ def resolve_hostname(ip): return hostname, False -@tooled -def cli_slurm_system(): - """Generate a system file based of slurm environment variables""" - - node_list = expand_node_list(os.getenv("SLURM_JOB_NODELIST", "")) - - +def make_node_list_from_slurm(node_list): def make_node(i, ip): hostname, local = resolve_hostname(ip) @@ -61,6 +55,29 @@ def make_node(i, ip): else: nodes[0]["main"] = True + return nodes + + +@tooled +def cli_slurm_system(): + """Generate a system file based of slurm environment variables""" + + node_list = expand_node_list(os.getenv("SLURM_JOB_NODELIST", "")) + + if len(node_list) > 0: + nodes = make_node_list_from_slurm(node_list) + else: + self = socket.gethostname() + nodes = [{ + "name": self, + "ip": self, + "hostname": self, + "user": getpass.getuser(), + "main": True, + "sshport": 22, + }] + + system = { "arch": "cuda", "nodes": nodes, diff --git a/milabench/commands/__init__.py b/milabench/commands/__init__.py index 51b239e07..15e13d81c 100644 --- a/milabench/commands/__init__.py +++ b/milabench/commands/__init__.py @@ -485,7 +485,7 @@ def _argv(self, **kwargs) -> List: argv.append(f"-i{key}") argv.append(host) - return argv + return argv # + ["env", "-i"] class SCPCommand(SSHCommand, CmdCommand): @@ -872,7 +872,7 @@ def __init__(self, pack: pack.BasePackage, **kwargs): super().__init__(pack, **kwargs) def _argv(self, **_) -> List: - return [f"{self.pack.dirs.code / 'activator'}", f"{self.pack.dirs.venv}"] + return [f"{self.pack.dirs.code / 'activator'}", f"{self.pack.dirs.venv}", f"{self.pack.dirs.cache}"] @@ -894,9 +894,10 @@ def make_new_node_executor(self, rank, node, base): config = base.pack.config pack = self.make_new_node_pack(rank, node, base) - + executor = base.copy(pack) + return DockerRunCommand( - AccelerateLaunchCommand(pack, rank=rank), + AccelerateLaunchCommand(executor, rank=rank, **self.options), config["system"].get("docker_image"), ) @@ -968,6 +969,8 @@ def _argv(self, **_) -> List: deepspeed_argv = [] cpu_per_process = self.pack.resolve_argument('--cpus_per_gpu', 4) + main_port = option("torchrun.port", int, default=29400) + return [ # -- Run the command in the right venv # This could be inside the SSH Command @@ -976,6 +979,7 @@ def _argv(self, **_) -> List: # inside a specifc venv activator_script(), f"{self.pack.dirs.venv}", + f"{self.pack.dirs.cache}", # -- "accelerate", "launch", @@ -987,7 +991,7 @@ def _argv(self, **_) -> List: f"--gradient_accumulation_steps={self.pack.config.get('gradient_accumulation_steps', 1)}", f"--num_cpu_threads_per_process={cpu_per_process}", f"--main_process_ip={manager['ip']}", - f"--main_process_port={manager['port']}", + f"--main_process_port={main_port}", f"--num_processes={nproc}", *self.accelerate_argv, ] diff --git a/milabench/scripts/activator b/milabench/scripts/activator index 083c28cb1..3ea5b3c86 100755 --- a/milabench/scripts/activator +++ b/milabench/scripts/activator @@ -3,5 +3,11 @@ venv="$1" shift +cache="$1" +shift + +echo "$cache" +export XDG_CACHE_HOME=$cache + source "$venv"/bin/activate exec "$@" diff --git a/milabench/sizer.py b/milabench/sizer.py index 2ae877213..19997e41d 100644 --- a/milabench/sizer.py +++ b/milabench/sizer.py @@ -381,6 +381,7 @@ def clamp(x, mn=cpu_opt.cpu_min, mx=cpu_opt.cpu_max): context["milabench_runs"] = dirs.get('runs', "") context["milabench_cache"] = dirs.get('cache', "") context["milabench_name"] = pack.config.get("name", None) + context["benchmark_folder"] = pack.config.get('definition', None) def auto_eval(arg): newvalue = str(arg).format(**context) diff --git a/milabench/system.py b/milabench/system.py index 6f7a9cf32..4a6f0a42b 100644 --- a/milabench/system.py +++ b/milabench/system.py @@ -16,6 +16,21 @@ system_global = contextvars.ContextVar("system", default=None) +def get_gpu_capacity(strict=False): + try: + capacity = 1e24 + + for k, v in get_gpu_info()["gpus"].items(): + capacity = min(v["memory"]["total"], capacity) + + return int(capacity) + except: + print("GPU not available, defaulting to 0 MiB") + if strict: + raise + return 0 + + def getenv(name, expected_type): value = os.getenv(name) @@ -126,7 +141,7 @@ class SizerOptions: optimized: bool = defaultfield("sizer.optimized", int) # Set a target VRAM capacity to use - capacity: str = defaultfield("sizer.capacity", str) + capacity: str = defaultfield("sizer.capacity", str, None) # Save the batch size, VRM usage data to a scaling file save: str = defaultfield("sizer.save", str, None) @@ -179,17 +194,17 @@ class Torchrun: @dataclass class Options: - sizer: SizerOptions - cpu: CPUOptions - dataset: DatasetConfig - dirs: Dirs - torchrun: Torchrun + sizer: SizerOptions = SizerOptions() + cpu: CPUOptions = CPUOptions() + dataset: DatasetConfig = DatasetConfig() + dirs: Dirs = Dirs() + torchrun: Torchrun = Torchrun() @dataclass class GPUConfig: arch: str = defaultfield("gpu.arch", str, None) - capacity: str = None + capacity: str = defaultfield("gpu.capacity", str, str(get_gpu_capacity())) @dataclass @@ -206,21 +221,29 @@ class Github: pat: str = defaultfield("github.path", str, None) +def default_device(): + try: + gpu_info = get_gpu_info() + return gpu_info["arch"] + except: + return "cpu" + + @dataclass class SystemConfig: """This is meant to be an exhaustive list of all the environment overrides""" - arch: str = defaultfield("gpu.arch", str, None) - sshkey: str = None + arch: str = defaultfield("gpu.arch", str, default_device()) + sshkey: str = defaultfield("ssh", str, "~/.ssh/id_rsa") docker_image: str = None nodes: list[Nodes] = field(default_factory=list) - gpu: GPUConfig = None - options: Options = None + gpu: GPUConfig = GPUConfig() + options: Options = Options() base: str = defaultfield("base", str, None) config: str = defaultfield("config", str, None) dash: bool = defaultfield("dash", bool, 1) noterm: bool = defaultfield("noterm", bool, 0) - github: Github = None + github: Github = Github() def check_node_config(nodes): @@ -364,21 +387,6 @@ def resolve_addresses(nodes): return self -def get_gpu_capacity(strict=False): - try: - capacity = 0 - - for k, v in get_gpu_info()["gpus"].items(): - capacity = min(v["memory"]["total"], capacity) - - return int(capacity) - except: - print("GPU not available, defaulting to 0 MiB") - if strict: - raise - return 0 - - def build_system_config(config_file, defaults=None, gpu=True): """Load the system configuration, verify its validity and resolve ip addresses diff --git a/scripts/article/run_cuda_dev.sh b/scripts/article/run_cuda_dev.sh index 0f52a1724..78224debd 100644 --- a/scripts/article/run_cuda_dev.sh +++ b/scripts/article/run_cuda_dev.sh @@ -3,9 +3,16 @@ set -ex # export MILABENCH_SOURCE=$HOME/milabench +# +# # put those on the shared drived +# export MILABENCH_DIRS_DATA=/home/mila/d/delaunap/scratch/milabench/data +# export MILABENCH_DIRS_VENV=/home/mila/d/delaunap/scratch/milabench/venv +# export MILABENCH_DIRS_RUNS=/home/mila/d/delaunap/scratch/milabench/runs +# +# # mkdir /tmp/workspace && cd /tmp/workspace # conda activate py310 -# +# bash $HOME/milabench/scripts/article/run_cuda_dev.sh # export MILABENCH_GPU_ARCH=cuda @@ -14,9 +21,14 @@ export MILABENCH_WORDIR="$(pwd)/$MILABENCH_GPU_ARCH" export MILABENCH_BASE="$MILABENCH_WORDIR/results" export MILABENCH_CONFIG="$MILABENCH_WORDIR/milabench/config/standard.yaml" export MILABENCH_VENV="$MILABENCH_WORDIR/env" -export BENCHMARK_VENV="$MILABENCH_WORDIR/results/venv/torch" export MILABENCH_SYSTEM="$MILABENCH_WORDIR/system.yaml" +if [ -z "${MILABENCH_DIRS_VENV}" ]; then + export BENCHMARK_VENV="$MILABENCH_WORDIR/results/venv/torch" +else + export BENCHMARK_VENV="$MILABENCH_DIRS_VENV" +fi + if [ -z "${MILABENCH_PREPARE}" ]; then export MILABENCH_PREPARE=0 fi @@ -87,7 +99,7 @@ install_prepare() { module load cuda/12.3.2 -if [ ! -d "$MILABENCH_WORDIR/results/venv/torch" ]; then +if [ ! -d "$BENCHMARK_VENV" ]; then install_prepare else echo "Reusing previous install" @@ -99,7 +111,7 @@ if [ "$MILABENCH_PREPARE" -eq 0 ]; then cd $MILABENCH_WORDIR - milabench prepare --system $MILABENCH_WORDIR/system.yaml "$@" + # milabench prepare --system $MILABENCH_WORDIR/system.yaml "$@" # milabench prepare "$@" # From aed0290cab92ff84eaf98115128afbc9a1c80acf Mon Sep 17 00:00:00 2001 From: "pierre.delaunay" Date: Fri, 9 Aug 2024 14:22:41 -0400 Subject: [PATCH 5/6] Tweaks for dino multi node --- milabench/cli/slurm.py | 18 +------------ milabench/commands/__init__.py | 2 +- milabench/system.py | 46 +++++++++++++++++++++++++++++++--- 3 files changed, 44 insertions(+), 22 deletions(-) diff --git a/milabench/cli/slurm.py b/milabench/cli/slurm.py index 70e660218..ede8cdabe 100644 --- a/milabench/cli/slurm.py +++ b/milabench/cli/slurm.py @@ -4,25 +4,9 @@ import subprocess from coleo import tooled -from ..system import get_gpu_capacity, is_loopback +from ..system import get_gpu_capacity, is_loopback, resolve_hostname, gethostname -def gethostname(host): - try: - return subprocess.check_output(["ssh", host, "cat", "/etc/hostname"], text=True).strip() - except: - print("Could not resolve hostname") - return host - - -def resolve_hostname(ip): - hostname, _, iplist = socket.gethostbyaddr(ip) - - for ip in iplist: - if is_loopback(ip): - return hostname, True - - return hostname, False def make_node_list_from_slurm(node_list): diff --git a/milabench/commands/__init__.py b/milabench/commands/__init__.py index 15e13d81c..a3967a6bb 100644 --- a/milabench/commands/__init__.py +++ b/milabench/commands/__init__.py @@ -456,7 +456,7 @@ def is_local(self): if localnode is not None: return (False # The ip belongs to the local node - or self.host in localnode["ipaddrlist"] + or self.host in localnode.get("ipaddrlist", []) # The hostname is the local node or self.host == localnode["hostname"] ) diff --git a/milabench/system.py b/milabench/system.py index 4a6f0a42b..b9ad93d9f 100644 --- a/milabench/system.py +++ b/milabench/system.py @@ -3,6 +3,7 @@ import socket from dataclasses import dataclass, field import sys +import subprocess from contextlib import contextmanager import ipaddress @@ -274,9 +275,6 @@ def get_remote_ip(): return set(result) - - - def is_loopback(address: str) -> bool: try: # Create an IP address object @@ -344,7 +342,7 @@ def enable_offline(enabled): offline = old -def resolve_addresses(nodes): +def _resolve_addresses(nodes): # Note: it is possible for self to be none # if we are running milabench on a node that is not part of the system # in that case it should still work; the local is then going to @@ -387,6 +385,46 @@ def resolve_addresses(nodes): return self +def gethostname(host): + try: + return subprocess.check_output(["ssh", host, "cat", "/etc/hostname"], text=True).strip() + except: + print("Could not resolve hostname") + return host + + +def resolve_hostname(ip): + hostname, _, iplist = socket.gethostbyaddr(ip) + + for ip in iplist: + if is_loopback(ip): + return hostname, True + + return hostname, False + + +def resolve_node_address(node): + hostname, local = resolve_hostname(node["ip"]) + + node["hostname"] = hostname + node["local"] = local + + if local: + node["hostname"] = socket.gethostname() + + return local + + +def resolve_addresses(nodes): + self = None + + for node in nodes: + if resolve_node_address(node): + self = node + + return self + + def build_system_config(config_file, defaults=None, gpu=True): """Load the system configuration, verify its validity and resolve ip addresses From a4edcfd715af72cd901c8a516d919426e6ae5828 Mon Sep 17 00:00:00 2001 From: "pierre.delaunay" Date: Mon, 12 Aug 2024 12:18:22 -0400 Subject: [PATCH 6/6] Tweaks --- benchmarks/flops/activator | 7 ------- config/base.yaml | 1 + milabench/commands/__init__.py | 2 +- milabench/system.py | 20 +++++++++++++------- 4 files changed, 15 insertions(+), 15 deletions(-) delete mode 100755 benchmarks/flops/activator diff --git a/benchmarks/flops/activator b/benchmarks/flops/activator deleted file mode 100755 index 083c28cb1..000000000 --- a/benchmarks/flops/activator +++ /dev/null @@ -1,7 +0,0 @@ -#!/bin/bash - -venv="$1" -shift - -source "$venv"/bin/activate -exec "$@" diff --git a/config/base.yaml b/config/base.yaml index f7c378b93..f5d5920d0 100644 --- a/config/base.yaml +++ b/config/base.yaml @@ -484,6 +484,7 @@ dinov2-giant-gpus: train.num_workers=10: true dinov2-giant-nodes: + enabled: false tags: - multinode max_duration: 3600 diff --git a/milabench/commands/__init__.py b/milabench/commands/__init__.py index a3967a6bb..e6d3639d5 100644 --- a/milabench/commands/__init__.py +++ b/milabench/commands/__init__.py @@ -872,7 +872,7 @@ def __init__(self, pack: pack.BasePackage, **kwargs): super().__init__(pack, **kwargs) def _argv(self, **_) -> List: - return [f"{self.pack.dirs.code / 'activator'}", f"{self.pack.dirs.venv}", f"{self.pack.dirs.cache}"] + return [activator_script(), f"{self.pack.dirs.venv}", f"{self.pack.dirs.cache}"] diff --git a/milabench/system.py b/milabench/system.py index b9ad93d9f..4c2d89953 100644 --- a/milabench/system.py +++ b/milabench/system.py @@ -84,8 +84,6 @@ def option(name, etype, default=None): system = system_global.get() if system: options = system.get("options", dict()) - else: - warn_no_config() frags = name.split(".") env_name = as_environment_variable(name) @@ -394,14 +392,20 @@ def gethostname(host): def resolve_hostname(ip): - hostname, _, iplist = socket.gethostbyaddr(ip) + try: + hostname, _, iplist = socket.gethostbyaddr(ip) - for ip in iplist: - if is_loopback(ip): - return hostname, True + for ip in iplist: + if is_loopback(ip): + return hostname, True - return hostname, False + return hostname, False + + except: + if offline: + return ip, False + raise def resolve_node_address(node): hostname, local = resolve_hostname(node["ip"]) @@ -410,6 +414,8 @@ def resolve_node_address(node): node["local"] = local if local: + # `gethostbyaddr` returns `cn-d003` but we want `cn-d003.server.mila.quebec` + # else torchrun does not recognize the main node node["hostname"] = socket.gethostname() return local