From 282e9c2ac96005d0b4e841c9a9e146f1fda20b37 Mon Sep 17 00:00:00 2001
From: Satya Ortiz-Gagne
Date: Tue, 27 Aug 2024 13:28:30 -0400
Subject: [PATCH] Fix llm requirements

* rename huggingface token to MILABENCH_* to automatically forward the
  env var to remote nodes when needed
---
 .github/workflows/tests_unit.yml     |  2 +-
 benchmarks/llm/prepare.py            |  9 +++++-
 benchmarks/llm/requirements.cuda.txt | 41 ++++++++++++++++++++++++----
 3 files changed, 44 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/tests_unit.yml b/.github/workflows/tests_unit.yml
index 90d6f4831..28262cf16 100644
--- a/.github/workflows/tests_unit.yml
+++ b/.github/workflows/tests_unit.yml
@@ -74,7 +74,7 @@ jobs:
 
       - name: tests
         env:
-          HUGGING_FACE_TOKEN: ${{ secrets.HUGGING_FACE_TOKEN}}
+          MILABENCH_HF_TOKEN: ${{ secrets.HUGGING_FACE_TOKEN}}
         run: |
           source $(poetry env info -p)/bin/activate
           coverage run --source=milabench -m pytest --ignore=tests/integration tests/
diff --git a/benchmarks/llm/prepare.py b/benchmarks/llm/prepare.py
index 631f3dd36..975332700 100755
--- a/benchmarks/llm/prepare.py
+++ b/benchmarks/llm/prepare.py
@@ -55,18 +55,25 @@ def generate_model(
     model_parallel_size=1
 ):
     try:
+        os.environ["MASTER_ADDR"] = "localhost"
+        os.environ["MASTER_PORT"] = "12355"
+
         torch.distributed.init_process_group(rank=rank, world_size=model_parallel_size)
         fairscale.nn.model_parallel.initialize.initialize_model_parallel(model_parallel_size)
+
         conn.send(os.getpid())
         while not conn.poll():
             time.sleep(0.1)
         conn.recv()
+
         params = json.loads(params_path.read_text())
         model = llama.model.Transformer(ModelArgs(**params))
         torch.save(model.state_dict(), params_path.with_name(f"consolidated.{rank:02}.pth"))
+
     except Exception as e:
         conn.send(e)
         raise
+
     finally:
         conn.close()
 
@@ -101,7 +108,7 @@ def main():
     config = OmegaConf.merge(base, cli)
 
     repo_id = config["repo_id"]
-    hf_token = os.getenv("HUGGING_FACE_TOKEN", None)
+    hf_token = os.getenv("MILABENCH_HF_TOKEN", None)
     output_dir = config["checkpointer"]["output_dir"]
     ignore_patterns = ["*.safetensors", "original/consolidated.*.pth"]
 
diff --git a/benchmarks/llm/requirements.cuda.txt b/benchmarks/llm/requirements.cuda.txt
index bc28b0084..c4c90a125 100644
--- a/benchmarks/llm/requirements.cuda.txt
+++ b/benchmarks/llm/requirements.cuda.txt
@@ -2,13 +2,17 @@
 # This file is autogenerated by pip-compile with Python 3.10
 # by the following command:
 #
-#    pip-compile --output-file=benchmarks/llm/requirements.cuda.txt .pin/tmp-constraints-cuda-llm-lora-single.txt benchmarks/llm/requirements.in
+#    pip-compile --output-file=benchmarks/llm/requirements.cuda.txt .pin/tmp-constraints-cuda-llm-full-mp-nodes.txt benchmarks/llm/requirements.in
 #
---extra-index-url https://pypi.ngc.nvidia.com
 --extra-index-url https://download.pytorch.org/whl/cu121
+--extra-index-url https://pypi.ngc.nvidia.com
 --find-links https://storage.googleapis.com/jax-releases/jax_cuda_releases.html
 --trusted-host pypi.ngc.nvidia.com
 
+accelerate==0.33.0
+    # via
+    #   -c .pin/../.pin/constraints-cuda-torch.txt
+    #   -r benchmarks/llm/requirements.in
 aiohappyeyeballs==2.3.4
     # via
     #   -c .pin/../.pin/constraints-cuda-torch.txt
@@ -45,7 +49,7 @@ attrs==23.2.0
 blobfile==2.1.1
     # via
     #   -c .pin/../.pin/constraints-cuda-torch.txt
-    #   -r benchmarks/llm/llama3/requirements.txt
+    #   -r benchmarks/llm/requirements.txt
     #   torchtune
 certifi==2024.7.4
     # via
@@ -75,7 +79,7 @@ executing==1.2.0
 fairscale==0.4.13
     # via
     #   -c .pin/../.pin/constraints-cuda-torch.txt
-    #   -r benchmarks/llm/llama3/requirements.txt
+    #   -r benchmarks/llm/requirements.txt
 filelock==3.15.4
     # via
     #   -c .pin/../.pin/constraints-cuda-torch.txt
@@ -83,11 +87,12 @@ filelock==3.15.4
     #   datasets
     #   huggingface-hub
     #   torch
+    #   transformers
     #   triton
 fire==0.6.0
     # via
     #   -c .pin/../.pin/constraints-cuda-torch.txt
-    #   -r benchmarks/llm/llama3/requirements.txt
+    #   -r benchmarks/llm/requirements.txt
 frozenlist==1.4.1
     # via
     #   -c .pin/../.pin/constraints-cuda-torch.txt
@@ -111,8 +116,11 @@ hjson==3.1.0
 huggingface-hub==0.24.5
     # via
     #   -c .pin/../.pin/constraints-cuda-torch.txt
+    #   accelerate
     #   datasets
+    #   tokenizers
     #   torchtune
+    #   transformers
 idna==3.7
     # via
     #   -c .pin/../.pin/constraints-cuda-torch.txt
@@ -162,11 +170,13 @@ networkx==3.3
 numpy==1.26.4
     # via
     #   -c .pin/../.pin/constraints-cuda-torch.txt
+    #   accelerate
     #   datasets
     #   fairscale
     #   pandas
     #   pyarrow
     #   torchtune
+    #   transformers
 nvidia-cublas-cu12==12.1.3.1
     # via
     #   -c .pin/../.pin/constraints-cuda-torch.txt
@@ -231,8 +241,10 @@ ovld==0.3.6
 packaging==24.1
     # via
     #   -c .pin/../.pin/constraints-cuda-torch.txt
+    #   accelerate
     #   datasets
     #   huggingface-hub
+    #   transformers
 pandas==2.2.2
     # via
     #   -c .pin/../.pin/constraints-cuda-torch.txt
@@ -240,6 +252,7 @@ pandas==2.2.2
 psutil==5.9.8
     # via
     #   -c .pin/../.pin/constraints-cuda-torch.txt
+    #   accelerate
     #   voir
 ptera==1.4.1
     # via
@@ -277,9 +290,11 @@ pyyaml==6.0.1
     # via
     #   -c .pin/../.pin/constraints-cuda-torch.txt
     #   -r benchmarks/llm/requirements.in
+    #   accelerate
     #   datasets
     #   huggingface-hub
     #   omegaconf
+    #   transformers
 reactivex==4.0.4
     # via
     #   -c .pin/../.pin/constraints-cuda-torch.txt
@@ -288,12 +303,14 @@ regex==2024.7.24
     # via
     #   -c .pin/../.pin/constraints-cuda-torch.txt
     #   tiktoken
+    #   transformers
 requests==2.32.3
     # via
     #   -c .pin/../.pin/constraints-cuda-torch.txt
     #   datasets
     #   huggingface-hub
     #   tiktoken
+    #   transformers
 rich==13.7.1
     # via
     #   -c .pin/../.pin/constraints-cuda-torch.txt
@@ -301,7 +318,9 @@ rich==13.7.1
 safetensors==0.4.3
     # via
     #   -c .pin/../.pin/constraints-cuda-torch.txt
+    #   accelerate
     #   torchtune
+    #   transformers
 sentencepiece==0.2.0
     # via
     #   -c .pin/../.pin/constraints-cuda-torch.txt
@@ -324,11 +343,16 @@ tiktoken==0.7.0
     # via
     #   -c .pin/../.pin/constraints-cuda-torch.txt
     #   torchtune
+tokenizers==0.19.1
+    # via
+    #   -c .pin/../.pin/constraints-cuda-torch.txt
+    #   transformers
 torch==2.4.0+cu121
     # via
     #   -c .pin/../.pin/constraints-cuda-torch.txt
-    #   -r benchmarks/llm/llama3/requirements.txt
     #   -r benchmarks/llm/requirements.in
+    #   -r benchmarks/llm/requirements.txt
+    #   accelerate
     #   fairscale
 torchao==0.3.1+cu121
     # via
@@ -344,6 +368,11 @@ tqdm==4.66.4
     #   datasets
     #   huggingface-hub
     #   torchtune
+    #   transformers
+transformers==4.43.3
+    # via
+    #   -c .pin/../.pin/constraints-cuda-torch.txt
+    #   -r benchmarks/llm/requirements.in
 triton==3.0.0
     # via
     #   -c .pin/../.pin/constraints-cuda-torch.txt