From e598089c0faa1e328c9aa34f1486d72bbc293584 Mon Sep 17 00:00:00 2001
From: Satya Ortiz-Gagne
Date: Tue, 3 Sep 2024 16:26:02 -0400
Subject: [PATCH] Fix llm requirements

* rename huggingface token to MILABENCH_* to automatically forward the
  env var to a remote in such cases
---
 .github/workflows/tests_unit.yml     |  2 +-
 benchmarks/llm/prepare.py            |  9 +++++++-
 benchmarks/llm/requirements.cuda.txt | 34 +++++++++++++++++++++++-----
 3 files changed, 37 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/tests_unit.yml b/.github/workflows/tests_unit.yml
index 90d6f4831..28262cf16 100644
--- a/.github/workflows/tests_unit.yml
+++ b/.github/workflows/tests_unit.yml
@@ -74,7 +74,7 @@ jobs:

     - name: tests
       env:
-        HUGGING_FACE_TOKEN: ${{ secrets.HUGGING_FACE_TOKEN}}
+        MILABENCH_HF_TOKEN: ${{ secrets.HUGGING_FACE_TOKEN}}
       run: |
         source $(poetry env info -p)/bin/activate
         coverage run --source=milabench -m pytest --ignore=tests/integration tests/
diff --git a/benchmarks/llm/prepare.py b/benchmarks/llm/prepare.py
index 9c64ac8fe..221162ffa 100755
--- a/benchmarks/llm/prepare.py
+++ b/benchmarks/llm/prepare.py
@@ -55,18 +55,25 @@ def generate_model(
     model_parallel_size=1
 ):
     try:
+        os.environ["MASTER_ADDR"] = "localhost"
+        os.environ["MASTER_PORT"] = "12355"
+
         torch.distributed.init_process_group(rank=rank, world_size=model_parallel_size)
         fairscale.nn.model_parallel.initialize.initialize_model_parallel(model_parallel_size)
+
         conn.send(os.getpid())
         while not conn.poll():
             time.sleep(0.1)
         conn.recv()
+
         params = json.loads(params_path.read_text())
         model = llama.model.Transformer(ModelArgs(**params))
         torch.save(model.state_dict(), params_path.with_name(f"consolidated.{rank:02}.pth"))
+
     except Exception as e:
         conn.send(e)
         raise
+
     finally:
         conn.close()

@@ -101,7 +108,7 @@ def main():
     config = OmegaConf.merge(base, cli)

     repo_id = config["repo_id"]
-    hf_token = os.getenv("HUGGING_FACE_TOKEN", None)
+    hf_token = os.getenv("MILABENCH_HF_TOKEN", None)
     output_dir = config["checkpointer"]["output_dir"]

     ignore_patterns = ["*.safetensors", "*consolidated.*.pth"]
diff --git a/benchmarks/llm/requirements.cuda.txt b/benchmarks/llm/requirements.cuda.txt
index a95035534..e7825b5f0 100644
--- a/benchmarks/llm/requirements.cuda.txt
+++ b/benchmarks/llm/requirements.cuda.txt
@@ -2,10 +2,10 @@
 # This file is autogenerated by pip-compile with Python 3.10
 # by the following command:
 #
-#    pip-compile --output-file=benchmarks/llm/requirements.cuda.txt .pin/tmp-constraints-cuda-llm-lora-single.txt benchmarks/llm/requirements.in
+#    pip-compile --output-file=benchmarks/llm/requirements.cuda.txt .pin/tmp-constraints-cuda-llm-full-mp-nodes.txt benchmarks/llm/requirements.in
 #
---extra-index-url https://pypi.ngc.nvidia.com
 --extra-index-url https://download.pytorch.org/whl/cu121
+--extra-index-url https://pypi.ngc.nvidia.com
 --find-links https://storage.googleapis.com/jax-releases/jax_cuda_releases.html
 --trusted-host pypi.ngc.nvidia.com

@@ -45,7 +45,7 @@ attrs==24.2.0
 blobfile==3.0.0
     # via
     #   -c .pin/../.pin/constraints-cuda-torch.txt
-    #   -r benchmarks/llm/llama3/requirements.txt
+    #   -r benchmarks/llm/requirements.txt
     #   torchtune
 certifi==2024.7.4
     # via
@@ -75,7 +75,7 @@ executing==1.2.0
 fairscale==0.4.13
     # via
     #   -c .pin/../.pin/constraints-cuda-torch.txt
-    #   -r benchmarks/llm/llama3/requirements.txt
+    #   -r benchmarks/llm/requirements.txt
 filelock==3.15.4
     # via
     #   -c .pin/../.pin/constraints-cuda-torch.txt
@@ -83,11 +83,12 @@ filelock==3.15.4
     #   datasets
     #   huggingface-hub
     #   torch
+    #   transformers
     #   triton
 fire==0.6.0
     # via
     #   -c .pin/../.pin/constraints-cuda-torch.txt
-    #   -r benchmarks/llm/llama3/requirements.txt
+    #   -r benchmarks/llm/requirements.txt
 frozenlist==1.4.1
     # via
     #   -c .pin/../.pin/constraints-cuda-torch.txt
@@ -111,7 +112,9 @@ hjson==3.1.0
 huggingface-hub==0.24.6
     # via
     #   -c .pin/../.pin/constraints-cuda-torch.txt
+    #   accelerate
     #   datasets
+    #   tokenizers
     #   torchtune
 idna==3.8
     # via
@@ -183,6 +186,7 @@ networkx==3.3
 numpy==1.26.4
     # via
     #   -c .pin/../.pin/constraints-cuda-torch.txt
+    #   accelerate
     #   datasets
     #   jax
     #   jaxlib
@@ -192,6 +196,7 @@ numpy==1.26.4
     #   pyarrow
     #   scipy
     #   torchtune
+    #   transformers
 nvidia-cublas-cu12==12.1.3.1
     # via
     #   -c .pin/../.pin/constraints-cuda-torch.txt
@@ -277,8 +282,10 @@ ovld==0.3.9
 packaging==24.1
     # via
     #   -c .pin/../.pin/constraints-cuda-torch.txt
+    #   accelerate
     #   datasets
     #   huggingface-hub
+    #   transformers
 pandas==2.2.2
     # via
     #   -c .pin/../.pin/constraints-cuda-torch.txt
@@ -286,6 +293,7 @@ pandas==2.2.2
 psutil==5.9.8
     # via
     #   -c .pin/../.pin/constraints-cuda-torch.txt
+    #   accelerate
     #   voir
 ptera==1.4.1
     # via
@@ -315,9 +323,11 @@ pyyaml==6.0.2
     # via
     #   -c .pin/../.pin/constraints-cuda-torch.txt
     #   -r benchmarks/llm/requirements.in
+    #   accelerate
     #   datasets
     #   huggingface-hub
     #   omegaconf
+    #   transformers
 reactivex==4.0.4
     # via
     #   -c .pin/../.pin/constraints-cuda-torch.txt
@@ -326,6 +336,7 @@ regex==2024.7.24
     # via
     #   -c .pin/../.pin/constraints-cuda-torch.txt
     #   tiktoken
+    #   transformers
 requests==2.32.3
     # via
     #   -c .pin/../.pin/constraints-cuda-torch.txt
@@ -339,6 +350,7 @@ rich==13.8.0
 safetensors==0.4.4
     # via
     #   -c .pin/../.pin/constraints-cuda-torch.txt
+    #   accelerate
     #   torchtune
 scipy==1.14.1
     # via
@@ -367,11 +379,16 @@ tiktoken==0.7.0
     # via
     #   -c .pin/../.pin/constraints-cuda-torch.txt
     #   torchtune
+tokenizers==0.19.1
+    # via
+    #   -c .pin/../.pin/constraints-cuda-torch.txt
+    #   transformers
 torch==2.4.0+cu121
     # via
     #   -c .pin/../.pin/constraints-cuda-torch.txt
-    #   -r benchmarks/llm/llama3/requirements.txt
     #   -r benchmarks/llm/requirements.in
+    #   -r benchmarks/llm/requirements.txt
+    #   accelerate
     #   fairscale
 torchao==0.3.1+cu121
     # via
@@ -387,6 +404,11 @@ tqdm==4.66.5
     #   datasets
     #   huggingface-hub
     #   torchtune
+    #   transformers
+transformers==4.43.3
+    # via
+    #   -c .pin/../.pin/constraints-cuda-torch.txt
+    #   -r benchmarks/llm/requirements.in
 triton==3.0.0
     # via
     #   -c .pin/../.pin/constraints-cuda-torch.txt