diff --git a/vllm/lora/utils.py b/vllm/lora/utils.py
index d72b7638d84af..4e9649f02856a 100644
--- a/vllm/lora/utils.py
+++ b/vllm/lora/utils.py
@@ -23,7 +23,7 @@
                               LogitsProcessorWithLoRA,
                               MergedColumnParallelLinearWithLoRA,
                               MergedQKVParallelLinearWithLora,
-                              QKVParallelLinearWithLora,
+                              ModulesToSaveWrapper, QKVParallelLinearWithLora,
                               ReplicatedLinearWithLoRA,
                               RowParallelLinearWithLoRA,
                               VocabParallelEmbeddingWithLoRA)
@@ -49,6 +49,7 @@
     MergedQKVParallelLinearWithShardedLora,
     RowParallelLinearWithShardedLoRA,
     LinearScalingRotaryEmbeddingWithLora,
+    ModulesToSaveWrapper,
 }
 
 
@@ -94,6 +95,7 @@ def replace_submodule(model: nn.Module, module_name: str,
 
 def parse_fine_tuned_lora_name(
         name: str,
+        enable_lora_modules_to_save: bool = False,
         weights_mapper: Optional[WeightsMapper] = None
 ) -> Tuple[str, bool, bool]:
     """Parse the name of lora weights.
@@ -106,7 +108,8 @@ def parse_fine_tuned_lora_name(
     return:
         Tuple(module_name, is_lora_a):
             module_name: the name of the module, e.g. model.dense1,
-            is_lora_a whether the tensor is lora_a or lora_b.
+            is_lora_a whether the tensor is lora_a or lora_b,
+                lora_a=None if this is module_to_save lm_head or token_embeds
             is_bias whether the tensor is lora bias.
     """
 
@@ -120,11 +123,22 @@ def parse_fine_tuned_lora_name(
         name = "base_model.model." + name
 
     parts = name.split(".")
-    if parts[-1] == "weight" and (parts[-2] == "lora_A"
-                                  or parts[-2] == "lora_B"):
-        new_name = ".".join(parts[2:-2])
-        return new_name, parts[-2] == "lora_A", False
+
+    if parts[-1] == "weight":
+        if parts[-2] == "lora_A" or parts[-2] == "lora_B":
+            return ".".join(parts[2:-2]), parts[-2] == "lora_A", False
+
+        if parts[-2] in ModulesToSaveWrapper.implemented_layers:
+
+            if not enable_lora_modules_to_save:
+                error_msg = f"""enable_lora_modules_to_save is False,
+                but found tensor name {name} in LoRA checkpoint.
+                Set enable_lora_modules_to_save=True to process
+                lm_head and embed_tokens as fully trained tensors"""
+                raise ValueError(error_msg)
+
+            return '.'.join(parts[2:-1]), None, False
 
     if parts[-1] == "lora_embedding_A" or parts[-1] == "lora_embedding_B":
         new_name = ".".join(parts[2:-1])
         return new_name, parts[-1] == "lora_embedding_A", False
diff --git a/vllm/lora/worker_manager.py b/vllm/lora/worker_manager.py
index 10976fac23028..9047694934831 100644
--- a/vllm/lora/worker_manager.py
+++ b/vllm/lora/worker_manager.py
@@ -91,8 +91,8 @@ def _load_adapter(self, lora_request: LoRARequest) -> LoRAModel:
                         packed_modules_mapping[module])
                 else:
                     expected_lora_modules.append(module)
-
             expected_lora_modules = list(set(expected_lora_modules))
+            expected_modules_to_save: List[str] = model.modules_to_save
             lora_path = get_adapter_absolute_path(lora_request.lora_path)
 
             # For some models like Qwen2VL, we need to use hf_to_vllm_mapper
@@ -105,9 +105,12 @@ def _load_adapter(self, lora_request: LoRARequest) -> LoRAModel:
             lora = self._lora_model_cls.from_local_checkpoint(
                 lora_path,
                 expected_lora_modules,
+                expected_modules_to_save,
                 max_position_embeddings=self.max_position_embeddings,
                 lora_model_id=lora_request.lora_int_id,
                 device="cpu",
+                enable_lora_modules_to_save=self._adapter_manager.lora_config.
+                enable_lora_modules_to_save,
                 dtype=self.lora_config.lora_dtype,
                 target_embedding_padding=self.vocab_size +
                 self.lora_config.lora_extra_vocab_size,
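
Below is a minimal usage sketch of the new parsing behavior, not part of the patch. It assumes that ModulesToSaveWrapper.implemented_layers includes "lm_head" (its actual contents live in vllm/lora/layers.py and are not shown here) and that, as the parsing logic above expects, the checkpoint stores fully trained tensors under names like base_model.model.lm_head.weight.

from vllm.lora.utils import parse_fine_tuned_lora_name

# Ordinary LoRA tensor: returns (module_name, is_lora_a, is_bias).
print(parse_fine_tuned_lora_name(
    "base_model.model.model.layers.0.self_attn.q_proj.lora_A.weight"))
# ("model.layers.0.self_attn.q_proj", True, False)

# modules_to_save tensor with the feature enabled: is_lora_a is None
# because the weight is neither lora_A nor lora_B.
# (Assumes "lm_head" is in ModulesToSaveWrapper.implemented_layers.)
print(parse_fine_tuned_lora_name(
    "base_model.model.lm_head.weight",
    enable_lora_modules_to_save=True))
# ("lm_head", None, False)

# With the feature disabled (the default), the same tensor raises a
# ValueError telling the user to set enable_lora_modules_to_save=True.
try:
    parse_fine_tuned_lora_name("base_model.model.lm_head.weight")
except ValueError as exc:
    print(exc)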