From b13f3b9d0cac69c2bfe1908a551cf4ac38f2e316 Mon Sep 17 00:00:00 2001
From: Jason Greene
Date: Sat, 21 Dec 2024 04:04:00 +0000
Subject: [PATCH] [Bugfix] Fix fully sharded LoRAs with Mixtral

- Changes ReplicatedLinearWithLoRA to always apply, regardless of the
  fully sharded LoRA setting, since the layer must be replicated in
  either case
- Updates the existing Mixtral all-target-modules test to cover both
  values of fully_sharded_loras (the model includes a ReplicatedLinear
  layer [gate])

Signed-off-by: Jason Greene
---
 tests/lora/test_mixtral.py | 4 +++-
 vllm/lora/layers.py        | 3 ++-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/tests/lora/test_mixtral.py b/tests/lora/test_mixtral.py
index 150221dfce6ab..797a495201d33 100644
--- a/tests/lora/test_mixtral.py
+++ b/tests/lora/test_mixtral.py
@@ -62,8 +62,9 @@ def test_mixtral_lora(mixtral_lora_files, tp_size):
 
 
 @pytest.mark.parametrize("tp_size", [4])
+@pytest.mark.parametrize("fully_shard", [True, False])
 def test_mixtral_lora_all_target_modules(mixtral_lora_files_all_target_modules,
-                                         tp_size):
+                                         tp_size, fully_shard):
     """This LoRA model has all supported Mixtral target modules"""
 
     if torch.cuda.device_count() < tp_size:
@@ -82,6 +83,7 @@ def test_mixtral_lora_all_target_modules(mixtral_lora_files_all_target_modules,
         max_loras=4,
         distributed_executor_backend="ray",
         tensor_parallel_size=tp_size,
+        fully_sharded_loras=fully_shard,
         max_lora_rank=32,
     )
 
diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py
index a6c93a3d8bfe9..85164c2165a3c 100644
--- a/vllm/lora/layers.py
+++ b/vllm/lora/layers.py
@@ -425,8 +425,9 @@ def forward(self, input_):
                        if self.base_layer.skip_bias_add else None)
         return output, output_bias
 
+    # ReplicatedLinear should always be replaced, regardless of the fully
+    # sharded LoRAs setting, because it is, by definition, copied per GPU.
     @classmethod
-    @_not_fully_sharded_can_replace
     def can_replace_layer(
         cls,
         source_layer: nn.Module,
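
Reviewer note: a minimal, self-contained sketch of why removing the decorator
fixes the bug. It assumes _not_fully_sharded_can_replace simply vetoes
can_replace_layer() whenever lora_config.fully_sharded_loras is enabled;
LoRAConfig, ReplicatedLinear, _replicated_can_replace, and can_replace_layer_old
below are illustrative stand-ins, not the real vLLM types.

# Illustrative sketch only: the classes and the decorator body below are
# simplified stand-ins, not the real vLLM implementations.
from dataclasses import dataclass


@dataclass
class LoRAConfig:
    fully_sharded_loras: bool = False


class ReplicatedLinear:
    """Stand-in for vLLM's ReplicatedLinear: identical weights on every rank."""


def _not_fully_sharded_can_replace(can_replace):
    # Assumed behavior of the removed decorator: veto replacement whenever
    # fully sharded LoRAs are enabled.
    def dec(cls, source_layer, lora_config, **kwargs):
        return (can_replace(cls, source_layer, lora_config, **kwargs)
                and not lora_config.fully_sharded_loras)

    return dec


def _replicated_can_replace(cls, source_layer, lora_config, **kwargs):
    # Matches layers that are replicated (not sharded) across GPUs.
    return isinstance(source_layer, ReplicatedLinear)


class ReplicatedLinearWithLoRA:
    # Pre-fix: decorated, so with fully_sharded_loras=True the Mixtral gate
    # was never wrapped and its LoRA weights could not be applied.
    can_replace_layer_old = classmethod(
        _not_fully_sharded_can_replace(_replicated_can_replace))

    # Post-fix: undecorated, so the replicated gate is wrapped in both modes.
    can_replace_layer = classmethod(_replicated_can_replace)


gate = ReplicatedLinear()
config = LoRAConfig(fully_sharded_loras=True)
assert not ReplicatedLinearWithLoRA.can_replace_layer_old(gate, config)
assert ReplicatedLinearWithLoRA.can_replace_layer(gate, config)

The real can_replace_layer() in vllm/lora/layers.py takes additional arguments
(visible in the truncated hunk above); they are omitted here for brevity.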