Merge branch 'main' into checkpointer

felipemello1 · Dec 6, 2024 · 300a4f2 · 300a4f2
2 parents fba9090 + 2b1ee6d
commit 300a4f2
Show file tree

Hide file tree

Showing 192 changed files with 6,145 additions and 822 deletions.
diff --git a/.github/workflows/build_linux_wheels.yaml b/.github/workflows/build_linux_wheels.yaml
@@ -36,6 +36,8 @@ jobs:
     with:
       repository: pytorch/torchtune
       ref: ""
+      test-infra-repository: pytorch/test-infra
+      test-infra-ref: main
       package-name: torchtune
       build-matrix: ${{ needs.generate-matrix.outputs.matrix }}
       pre-script: .github/scripts/pre_build_script.sh

diff --git a/.github/workflows/export.yaml b/.github/workflows/export.yaml
@@ -0,0 +1,51 @@
+name: Export
+
+on:
+  push:
+    paths:
+      - 'torchtune/modules/_export/**'
+      - 'tests/torchtune/modules/_export/**'
+  pull_request:
+    paths:
+      - 'torchtune/modules/_export/**'
+      - 'tests/torchtune/modules/_export/**'
+  schedule:
+    # Runs at midnight every day
+    - cron: '0 0 * * *'
+
+concurrency:
+  group: unit-test${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
+  cancel-in-progress: true
+
+defaults:
+  run:
+    shell: bash -l -eo pipefail {0}
+
+jobs:
+  export_unit_tests:
+    if: github.repository_owner == 'pytorch'
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ['3.9', '3.10', '3.11']
+    steps:
+      - name: Check out repo
+        uses: actions/checkout@v3
+      - name: Setup conda env
+        uses: conda-incubator/setup-miniconda@v2
+        with:
+          auto-update-conda: true
+          miniconda-version: "latest"
+          activate-environment: test
+          python-version: ${{ matrix.python-version }}
+      - name: Update pip
+        run: python -m pip install --upgrade pip
+      - name: Install dependencies
+        run: |
+          bash torchtune/modules/_export/install_requirements.sh
+          python -m pip install torchao
+          python -m pip install -e ".[dev]"
+      - name: Run unit tests with coverage
+        run: pytest tests/torchtune/modules/_export --cov=. --cov-report=xml --durations=20 -vv
+      - name: Upload Coverage to Codecov
+        uses: codecov/codecov-action@v3
diff --git a/.github/workflows/gpu_test.yaml b/.github/workflows/gpu_test.yaml
@@ -55,6 +55,6 @@ jobs:
           python -m pip install -e ".[dev]"
           python -m pip install lm-eval==0.4.5
       - name: Run recipe and unit tests with coverage
-        run: pytest tests --with-integration --cov=. --cov-report=xml --durations=20 -vv
+        run: pytest tests --ignore tests/torchtune/modules/_export --with-integration --cov=. --cov-report=xml --durations=20 -vv
       - name: Upload Coverage to Codecov
         uses: codecov/codecov-action@v3
diff --git a/.github/workflows/unit_test.yaml b/.github/workflows/unit_test.yaml
@@ -37,6 +37,6 @@ jobs:
           python -m pip install torch torchvision torchao
           python -m pip install -e ".[dev]"
       - name: Run unit tests with coverage
-        run: pytest tests --cov=. --cov-report=xml --durations=20 -vv
+        run: pytest tests --ignore tests/torchtune/modules/_export --cov=. --cov-report=xml --durations=20 -vv
       - name: Upload Coverage to Codecov
         uses: codecov/codecov-action@v3
diff --git a/README.md b/README.md
@@ -10,6 +10,7 @@
 [**Introduction**](#introduction) | [**Installation**](#installation) | [**Get Started**](#get-started) |  [**Documentation**](https://pytorch.org/torchtune/main/index.html) | [**Community**](#community) | [**License**](#license) | [**Citing torchtune**](#citing-torchtune)
 
 ### 📣 Recent updates 📣
+* *December 2024*: torchtune now supports **Llama 3.3 70B**! Try it out by following our installation instructions [here](#installation), then run any of the configs [here](recipes/configs/llama3_3).
 * *November 2024*: torchtune has released [v0.4.0](https://github.com/pytorch/torchtune/releases/tag/v0.4.0) which includes stable support for exciting features like activation offloading and multimodal QLoRA
 * *November 2024*: torchtune has added [Gemma2](recipes/configs/gemma2) to its models!
 * *October 2024*: torchtune added support for Qwen2.5 models - find the recipes [here](recipes/configs/qwen2_5/)
@@ -39,6 +40,7 @@ torchtune currently supports the following models.
 
 | Model                                         | Sizes     |
 |-----------------------------------------------|-----------|
+| [Llama3.3](https://www.llama.com/docs/model-cards-and-prompt-formats/llama3_3)    | 70B [[models](torchtune/models/llama3_3/_model_builders.py), [configs](recipes/configs/llama3_3/)]        |
 | [Llama3.2-Vision](https://www.llama.com/docs/model-cards-and-prompt-formats/llama3_2#-llama-3.2-vision-models-(11b/90b)-)    | 11B, 90B [[models](torchtune/models/llama3_2_vision/_model_builders.py), [configs](recipes/configs/llama3_2_vision/)]        |
 | [Llama3.2](https://www.llama.com/docs/model-cards-and-prompt-formats/llama3_2)    | 1B, 3B [[models](torchtune/models/llama3_2/_model_builders.py), [configs](recipes/configs/llama3_2/)]        |
 | [Llama3.1](https://llama.meta.com/docs/model-cards-and-prompt-formats/llama3_1)    | 8B, 70B, 405B [[models](torchtune/models/llama3_1/_model_builders.py), [configs](recipes/configs/llama3_1/)]        |
@@ -67,7 +69,8 @@ torchtune provides the following finetuning recipes for training on one or more
 | LoRA Finetuning | 1-8  | [lora_finetune_single_device](recipes/lora_finetune_single_device.py) <br> [lora_finetune_distributed](recipes/lora_finetune_distributed.py) | [Qwen2 0.5B single-device](recipes/configs/qwen2/0.5B_lora_single_device.yaml) <br> [Gemma 7B distributed](recipes/configs/gemma/7B_lora.yaml)
 | QLoRA Finetuning | 1-8 | [lora_finetune_single_device](recipes/lora_finetune_single_device.py) <br> [lora_finetune_distributed](recipes/lora_finetune_distributed.py)| [Phi3 Mini single-device](recipes/configs/phi3/mini_qlora_single_device.yaml) <br> [Llama 3.1 405B distributed](recipes/configs/llama3_1/405B_qlora.yaml)
 | DoRA/QDoRA Finetuning | 1-8 | [lora_finetune_single_device](recipes/lora_finetune_single_device.py) <br> [lora_finetune_distributed](recipes/lora_finetune_distributed.py)| [Llama3 8B QDoRA single-device](recipes/configs/llama3/8B_qdora_single_device.yaml) <br> [Llama3 8B DoRA distributed](recipes/configs/llama3/8B_dora.yaml)
-| Quantization-Aware Training | 4-8 | [qat_distributed](recipes/qat_distributed.py)| [Llama3 8B QAT](recipes/configs/llama3/8B_qat_full.yaml)
+| Quantization-Aware Training | 2-8 | [qat_distributed](recipes/qat_distributed.py)| [Llama3 8B QAT](recipes/configs/llama3/8B_qat_full.yaml)
+| Quantization-Aware Training and LoRA Finetuning | 2-8 | [qat_lora_finetune_distributed](recipes/qat_lora_finetune_distributed.py)| [Llama3 8B QAT](recipes/configs/llama3/8B_qat_lora.yaml)
 | Direct Preference Optimization |1-8 | [lora_dpo_single_device](recipes/lora_dpo_single_device.py) <br> [lora_dpo_distributed](recipes/lora_dpo_distributed.py) | [Llama2 7B single-device](recipes/configs/llama2/7B_lora_dpo_single_device.yaml) <br> [Llama2 7B distributed](recipes/configs/llama2/7B_lora_dpo.yaml)
 | Proximal Policy Optimization | 1 |  [ppo_full_finetune_single_device](recipes/ppo_full_finetune_single_device.py) | [Mistral 7B](recipes/configs/mistral/7B_full_ppo_low_memory.yaml)
 | Knowledge Distillation | 1 | [knowledge_distillation_single_device](recipes/knowledge_distillation_single_device.py) | [Qwen2 1.5B -> 0.5B](recipes/configs/qwen2/knowledge_distillation_single_device.yaml)

diff --git a/docs/source/api_ref_models.rst b/docs/source/api_ref_models.rst
@@ -6,6 +6,31 @@ torchtune.models
 
 .. currentmodule:: torchtune.models
 
+llama3.3
+--------
+
+Text-only models from the 3.3 version of `Llama3 family <https://llama.meta.com/llama3/>`_.
+
+Important: You need to request access on `Hugging Face <https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct>`__ before downloading it.
+
+To download the Llama-3.3-70B-Instruct model:
+
+.. code-block:: bash
+
+    tune download meta-llama/Llama-3.3-70B-Instruct --ignore-patterns "original/consolidated.00.pth" --hf-token <HF_TOKEN>
+
+.. autosummary::
+    :toctree: generated/
+    :nosignatures:
+
+    llama3_3.llama3_3_70b
+    llama3_3.lora_llama3_3_70b
+    llama3_3.qlora_llama3_3_70b
+
+.. note::
+
+    The Llama3.3 tokenizer reuses the :class:`~torchtune.models.llama3.llama3_tokenizer` class.
+
 llama3.2
 --------
 

diff --git a/docs/source/basics/multimodal_datasets.rst b/docs/source/basics/multimodal_datasets.rst
@@ -71,12 +71,12 @@ in the text, ``"<image>"`` for where to place the image tokens. This will get re
 
 .. code-block:: yaml
 
-    # In config - model_transforms takes the place of the tokenizer
-    model_transform:
+    tokenizer:
       _component_: torchtune.models.llama3_2_vision.llama3_2_vision_transform
       path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model
       prompt_template: torchtune.data.QuestionAnswerTemplate
       max_seq_len: 8192
+      image_size: 560
 
     dataset:
       _component_: torchtune.datasets.multimodal.multimodal_chat_dataset
@@ -137,7 +137,7 @@ For most datasets, you will also need to specify the ``split`` and/or the subset
 .. code-block:: yaml
 
     # In config
-    model_transform:
+    tokenizer:
       _component_: torchtune.models.llama3_2_vision.llama3_2_vision_transform
       path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model
       max_seq_len: 8192

diff --git a/docs/source/index.rst b/docs/source/index.rst
@@ -113,6 +113,7 @@ torchtune tutorials.
    recipes/recipes_overview
    recipes/lora_finetune_single_device
    recipes/qat_distributed
+   recipes/dpo
 
 .. toctree::
    :glob:

diff --git a/docs/source/recipes/dpo.rst b/docs/source/recipes/dpo.rst
@@ -0,0 +1,75 @@
+.. _dpo_recipe_label:
+
+====================================
+Direct Preference Optimization
+====================================
+
+This recipe supports several `Direct Preference Optimization <https://arxiv.org/abs/2305.18290>`_ (DPO)-style fine-tuning techniques.
+These techniques aim to steer (or `align <https://en.wikipedia.org/wiki/AI_alignment>`_) a model towards some desirable behaviours.
+For example, a common goal is to train language models to produce safe and honest outputs,
+or to be `helpful and harmless <https://arxiv.org/abs/2204.05862>`_.
+
+To see the best results when using this recipe, it may be helpful to first fine-tune your model using supervised fine-tuning to ensure your model is
+on-distribution for the domain you're interested in. To do this, check out our other fine-tuning recipes in the :ref:`recipe overview <recipes_overview_label>` which
+support a variety of SFT paradigms.
+
+After supervised fine-tuning, here is an example of DPO with Llama 3.1 8B:
+
+.. note::
+
+    You may need to be granted access to the Llama model you're interested in. See
+    :ref:`here <download_llama_label>` for details on accessing gated repositories.
+
+
+.. code-block:: bash
+
+    tune download meta-llama/Meta-Llama-3.1-8B-Instruct \
+    --ignore-patterns "original/consolidated.00.pth" \
+    --hf-token <HF_TOKEN>
+
+    # run on a single device
+    tune run lora_dpo_single_device --config llama3_1/8B_lora_dpo_single_device
+
+    # run on two gpus
+    tune run --nproc_per_node 2 lora_dpo_distributed --config llama3_1/8B_lora_dpo
+
+It's easy to get started with this recipe with your dataset of choice, including custom local datasets,
+and datasets from Hugging Face. Check out our primer on :ref:`preference datasets <preference_dataset_usage_label>` to
+see how to do this.
+
+For this recipe we include different DPO-style losses:
+
+* :class:`Direct Preference Optimization <torchtune.rlhf.loss.DPOLoss>` (DPO) loss [#]_. The DPO loss function
+  increases the relative log-probabilities of preferred to un-preferred responses, whilst using log probabilities
+  from a reference model to prevent policy degradation during training. Alongside RLHF, this is the most commonly used
+  alignment technique and is used to train a growing number of state-of-the-art LLMs e.g. Llama3.1, Gemma 2, Qwen2, etc.
+  This is a good starting point for alignment fine-tuning.
+* :class:`Statistical Rejection Sampling Optimization <torchtune.rlhf.loss.RSOLoss>` (RSO) or "hinge" loss [#]_.
+  RSO builds on concepts from support vector machines and DPO, applying a margin-based approach that penalizes
+  low-quality responses while ensuring a significant gap between chosen and un-chosen log probabilities.
+
+To use any of these, simply use the ``loss`` config entry or flag through the :ref:`cli_label`:
+
+.. code-block:: bash
+
+    tune run lora_dpo_single_device --config llama2/7B_lora_dpo_single_device \
+    loss=torchtune.rlhf.loss.RSOLoss \
+    gamma=0.5
+
+.. todo (@SalmanMohammadi) point to an example repo for SimPO
+
+For a deeper understanding of the different levers you can pull when using this recipe,
+see our documentation for the different PEFT training paradigms we support:
+
+* :ref:`glossary_lora`
+* :ref:`glossary_qlora`
+* :ref:`glossary_dora`
+
+Many of our other memory optimization features can be used in this recipe. You can learn more about all of our memory optimization features in our :ref:`memory optimization overview<memory_optimization_overview_label>`.
+
+.. rubric:: References:
+
+.. [#] Rafailov, R., Sharma, A., Mitchell, E., Manning, C.D., Ermon, S. and Finn, C., 2024.
+         Direct preference optimization: Your language model is secretly a reward model. Advances in Neural Information Processing Systems, 36.
+.. [#] Liu, T., Zhao, Y., Joshi, R., Khalman, M., Saleh, M., Liu, P.J. and Liu, J., 2023.
+         Statistical rejection sampling improves preference optimization. arXiv preprint arXiv:2309.06657.
diff --git a/docs/source/recipes/lora_finetune_single_device.rst b/docs/source/recipes/lora_finetune_single_device.rst
@@ -8,7 +8,7 @@ This recipe supports finetuning on next-token prediction tasks using parameter e
 such as :ref:`glossary_lora` and :ref:`glossary_qlora`. These techniques
 significantly reduce memory consumption during training whilst still maintaining competitive performance.
 
-We provide configs which you can get up and running quickly. Here is an example with llama 3.1 8B:
+We provide configs which you can get up and running quickly. Here is an example with Llama 3.1 8B:
 
 .. note::
 

diff --git a/docs/source/recipes/recipes_overview.rst b/docs/source/recipes/recipes_overview.rst
@@ -28,7 +28,7 @@ Our recipes include:
 * Single-device full fine-tuning
 * Distributed full fine-tuning
 * Distributed LoRA fine-tuning
-* Direct Preference Optimization (DPO)
+* :ref:`Direct Preference Optimization (DPO) <dpo_recipe_label>`
 * Proximal Policy Optimization (PPO)
 * :ref:`Distributed Quantization-Aware Training (QAT)<qat_distributed_recipe_label>`.
 

diff --git a/recipes/configs/code_llama2/7B_full_low_memory.yaml b/recipes/configs/code_llama2/7B_full_low_memory.yaml
@@ -19,6 +19,8 @@
 #
 # This config works only for training on single device.
 
+output_dir: /tmp/torchtune/code_llama2_7B/full_low_memory # /tmp may be deleted by your system. Change it to your preference.
+
 # Model arguments
 model:
   _component_: torchtune.models.code_llama2.code_llama2_7b
@@ -39,7 +41,7 @@ checkpointer:
     pytorch_model-00003-of-00003.bin
   ]
   recipe_checkpoint: null
-  output_dir: /tmp/CodeLlama-7b-hf
+  output_dir: ${output_dir}
   model_type: LLAMA2
 resume_from_checkpoint: False
 
@@ -55,14 +57,14 @@ shuffle: True
 epochs: 1
 max_steps_per_epoch: null
 batch_size: 2
-gradient_accumulation_steps: 1  # Use to increase virtual batch size
+gradient_accumulation_steps: 1  # Use to increase effective batch size
 optimizer:
   _component_: bitsandbytes.optim.PagedAdamW
   lr: 2e-5
 optimizer_in_bwd: True  # True saves memory. Requires gradient_accumulation_steps=1
 loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
-compile: False  # pytorch compile, set to true for better perf/memory
+compile: False  # torch.compile the model + loss, True increases speed + decreases memory
 
 # Training env
 device: cuda
@@ -73,13 +75,13 @@ enable_activation_offloading: True  # True reduces memory
 dtype: bf16
 
 # Logging
-output_dir: /tmp/codellama_finetune_output
 metric_logger:
   _component_: torchtune.training.metric_logging.DiskLogger
-  log_dir: /tmp/CodeLlama-7b-hf/logs
+  log_dir: ${output_dir}/logs
 log_every_n_steps: 1
 log_peak_memory_stats: True
 
+
 # Profiler (disabled)
 profiler:
   _component_: torchtune.training.setup_torch_profiler

diff --git a/recipes/configs/code_llama2/7B_lora_single_device.yaml b/recipes/configs/code_llama2/7B_lora_single_device.yaml
@@ -15,6 +15,8 @@
 #
 # This config works only for training on single device.
 
+output_dir: /tmp/torchtune/code_llama2_7B/lora_single_device # /tmp may be deleted by your system. Change it to your preference.
+
 # Model Arguments
 model:
   _component_: torchtune.models.code_llama2.lora_code_llama2_7b
@@ -42,7 +44,7 @@ checkpointer:
   ]
   adapter_checkpoint: null
   recipe_checkpoint: null
-  output_dir: /tmp/CodeLlama-7b-hf
+  output_dir: ${output_dir}
   model_type: LLAMA2
 resume_from_checkpoint: False
 save_adapter_weights_only: False
@@ -59,7 +61,7 @@ shuffle: True
 epochs: 1
 max_steps_per_epoch: null
 batch_size: 2
-gradient_accumulation_steps: 8  # Use to increase virtual batch size
+gradient_accumulation_steps: 8  # Use to increase effective batch size
 optimizer:
   _component_: torch.optim.AdamW
   fused: True
@@ -70,7 +72,7 @@ lr_scheduler:
   num_warmup_steps: 100
 loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
-compile: False  # pytorch compile, set to true for better perf/memory
+compile: False  # torch.compile the model + loss, True increases speed + decreases memory
 
 # Training env
 device: cuda
@@ -81,10 +83,9 @@ enable_activation_offloading: False  # True reduces memory
 dtype: bf16
 
 # Logging
-output_dir: /tmp/codellama_lora_finetune_output
 metric_logger:
   _component_: torchtune.training.metric_logging.DiskLogger
-  log_dir: /tmp/CodeLlama-7b-hf/logs
+  log_dir: ${output_dir}/logs
 log_every_n_steps: 1
 log_peak_memory_stats: True