From 3f5996c7ab661eb9340e84a3b8daf7a0a8d9cfe0 Mon Sep 17 00:00:00 2001 From: Sumit Vij Date: Tue, 17 Dec 2024 06:26:52 +0000 Subject: [PATCH] Fix lora modules and formatting Remove stale comment Add llama lora modules Add llama test case Add test case and log warning on missing lora modules Rollback unwanted changes and format fixes Signed-off-by: Sumit Vij --- tests/conftest.py | 16 +++- tests/lora/conftest.py | 16 +++- tests/lora/test_ultravox.py | 125 +++++++++++++++---------- vllm/assets/audio.py | 7 -- vllm/lora/models.py | 31 ++---- vllm/model_executor/models/ultravox.py | 32 +++---- 6 files changed, 128 insertions(+), 99 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 917151ddcb8d4..c42de316c1c01 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -733,6 +733,7 @@ def generate( images: Optional[PromptImageInput] = None, videos: Optional[PromptVideoInput] = None, audios: Optional[PromptAudioInput] = None, + **kwargs: Any, ) -> List[Tuple[List[List[int]], List[str]]]: inputs = self.get_inputs(prompts, images=images, @@ -740,7 +741,8 @@ def generate( audios=audios) req_outputs = self.model.generate(inputs, - sampling_params=sampling_params) + sampling_params=sampling_params, + **kwargs) outputs: List[Tuple[List[List[int]], List[str]]] = [] for req_output in req_outputs: @@ -778,6 +780,7 @@ def generate_w_logprobs( images: Optional[PromptImageInput] = None, audios: Optional[PromptAudioInput] = None, videos: Optional[PromptVideoInput] = None, + **kwargs: Any, ) -> Union[List[TokensTextLogprobs], List[TokensTextLogprobsPromptLogprobs]]: inputs = self.get_inputs(prompts, @@ -786,7 +789,8 @@ def generate_w_logprobs( audios=audios) req_outputs = self.model.generate(inputs, - sampling_params=sampling_params) + sampling_params=sampling_params, + **kwargs) toks_str_logsprobs_prompt_logprobs = ( self._final_steps_generate_w_logprobs(req_outputs)) @@ -822,13 +826,15 @@ def generate_greedy( images: Optional[PromptImageInput] = None, videos: Optional[PromptVideoInput] = None, audios: Optional[PromptAudioInput] = None, + **kwargs: Any, ) -> List[Tuple[List[int], str]]: greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens) outputs = self.generate(prompts, greedy_params, images=images, videos=videos, - audios=audios) + audios=audios, + **kwargs) return [(output_ids[0], output_str[0]) for output_ids, output_str in outputs] @@ -843,6 +849,7 @@ def generate_greedy_logprobs( videos: Optional[PromptVideoInput] = None, stop_token_ids: Optional[List[int]] = None, stop: Optional[List[str]] = None, + **kwargs: Any, ) -> Union[List[TokensTextLogprobs], List[TokensTextLogprobsPromptLogprobs]]: greedy_logprobs_params = SamplingParams( @@ -857,7 +864,8 @@ def generate_greedy_logprobs( greedy_logprobs_params, images=images, audios=audios, - videos=videos) + videos=videos, + **kwargs) def generate_encoder_decoder_greedy_logprobs( self, diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py index 022a920766188..00f55d621978f 100644 --- a/tests/lora/conftest.py +++ b/tests/lora/conftest.py @@ -147,18 +147,29 @@ def sql_lora_huggingface_id(): # huggingface repo id is used to test lora runtime downloading. 
return "yard1/llama-2-7b-sql-lora-test" + @pytest.fixture(scope="session") def sql_lora_files(sql_lora_huggingface_id): return snapshot_download(repo_id=sql_lora_huggingface_id) + @pytest.fixture(scope="session") def llama3_1_8b_chess_lora(): - return snapshot_download(repo_id="mkopecki/chess-lora-adapter-llama-3.1-8b") + return snapshot_download( + repo_id="mkopecki/chess-lora-adapter-llama-3.1-8b") + + +@pytest.fixture(scope="session") +def llama3_1_8b_ultravox_chess_lora(): + # ultravox chess lora is result of transformation of above chess llama lora + return snapshot_download(repo_id="thedebugger11/ultravox-chess-lora") + @pytest.fixture(scope="session") def lora_bias_files(): return snapshot_download(repo_id="followumesh/granite-3b-lora8-bias") + @pytest.fixture(scope="session") def mixtral_lora_files(): # Note: this module has incorrect adapter_config.json to test @@ -214,6 +225,7 @@ def baichuan_zero_lora_files(): # all the lora_B weights are initialized to zero. return snapshot_download(repo_id="jeeejeee/baichuan7b-zero-init") + @pytest.fixture(scope="session") def baichuan_regex_lora_files(): return snapshot_download(repo_id="jeeejeee/baichuan-7b-lora-zero-regex") @@ -223,6 +235,7 @@ def baichuan_regex_lora_files(): def minicpmv_lora_files(): return snapshot_download(repo_id="jeeejeee/minicpmv25-lora-pokemon") + @pytest.fixture(scope="session") def qwen2vl_lora_files(): return snapshot_download(repo_id="jeeejeee/qwen2-vl-lora-pokemon") @@ -232,6 +245,7 @@ def qwen2vl_lora_files(): def tinyllama_lora_files(): return snapshot_download(repo_id="jashing/tinyllama-colorist-lora") + @pytest.fixture(scope="session") def phi2_lora_files(): return snapshot_download(repo_id="isotr0py/phi-2-test-sql-lora") diff --git a/tests/lora/test_ultravox.py b/tests/lora/test_ultravox.py index f84f9fb9ab8e4..f3986c0ba29fc 100644 --- a/tests/lora/test_ultravox.py +++ b/tests/lora/test_ultravox.py @@ -1,24 +1,21 @@ +from typing import List, Tuple -from typing import List +from transformers import AutoTokenizer -import pytest - -import vllm - -from transformers import AutoTokenizer from vllm.lora.request import LoRARequest -from vllm.platforms import current_platform -MODEL_NAME = "fixie-ai/ultravox-v0_3" +from ..models.utils import check_outputs_equal + +ULTRAVOX_MODEL_NAME = "fixie-ai/ultravox-v0_3" +LLMA_MODEL_NAME = "meta-llama/Llama-3.1-8B-Instruct" VLLM_PLACEHOLDER = "<|reserved_special_token_0|>" -EXPECTED_OUTPUT = [ - "Fool mate" -] +PROMPT = "Tell me about a silly chess move in 20 words" + -def _get_prompt(audio_count, question, placeholder): - tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME) +def _get_prompt(audio_count, question, placeholder, model_name) -> str: + tokenizer = AutoTokenizer.from_pretrained(model_name) placeholder = f"{placeholder}\n" * audio_count return tokenizer.apply_chat_template([{ @@ -28,44 +25,74 @@ def _get_prompt(audio_count, question, placeholder): tokenize=False, add_generation_prompt=True) -def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: - sampling_params = vllm.SamplingParams( - temperature=0, - max_tokens=1000, - ) - inputs = [{ - "prompt":_get_prompt(1, "Tell me about a silly chess move in 20 words", VLLM_PLACEHOLDER), - }] +def test_ultravox_lora(vllm_runner, llama3_1_8b_chess_lora, + llama3_1_8b_ultravox_chess_lora): + with vllm_runner( + ULTRAVOX_MODEL_NAME, + enforce_eager=True, + max_num_seqs=128, + enable_lora=True, + max_loras=4, + max_lora_rank=128, + dtype="bfloat16", + max_model_len=4096, + ) as vllm_model: + 
ultravox_outputs: List[Tuple[List[int], + str]] = vllm_model.generate_greedy( + [ + _get_prompt( + 0, PROMPT, VLLM_PLACEHOLDER, + ULTRAVOX_MODEL_NAME) + ], + 256, + lora_request=LoRARequest( + str(1), 1, + llama3_1_8b_ultravox_chess_lora), + ) + + # run llama with and without lora to compare outputs with above + with vllm_runner( + LLMA_MODEL_NAME, + enforce_eager=True, + max_num_seqs=128, + enable_lora=True, + max_loras=4, + max_lora_rank=128, + dtype="bfloat16", + max_model_len=4096, + ) as vllm_model: + llama_outputs_no_lora: List[Tuple[List[int], + str]] = vllm_model.generate_greedy( + [ + _get_prompt( + 0, PROMPT, + VLLM_PLACEHOLDER, + LLMA_MODEL_NAME) + ], + 256, + ) + llama_outputs: List[Tuple[List[int], + str]] = vllm_model.generate_greedy( + [ + _get_prompt(0, PROMPT, + VLLM_PLACEHOLDER, + LLMA_MODEL_NAME) + ], + 256, + lora_request=LoRARequest( + str(1), 1, llama3_1_8b_chess_lora), + ) - outputs = llm.generate( - inputs, - sampling_params, - lora_request=LoRARequest(str(lora_id), lora_id, lora_path) - if lora_id else None, + check_outputs_equal( + outputs_0_lst=ultravox_outputs, + outputs_1_lst=llama_outputs, + name_0="ultravox", + name_1="llama", ) - generated_texts: List[str] = [] - for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text.strip() - generated_texts.append(generated_text) - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - return generated_texts + _, llama_no_lora_str = llama_outputs_no_lora[0] + _, ultravox_str = ultravox_outputs[0] -def test_fixie_lora(llama3_1_8b_chess_lora): - llm = vllm.LLM( - MODEL_NAME, - max_num_seqs=2, - enable_lora=True, - max_loras=4, - max_lora_rank=128, - trust_remote_code=True, - dtype="bfloat16", - max_model_len=4096, - enforce_eager=True - ) - output1 = do_sample(llm, llama3_1_8b_chess_lora, lora_id=1) - for i in range(len(EXPECTED_OUTPUT)): - assert EXPECTED_OUTPUT[i].startswith(output1[i]) - return None \ No newline at end of file + # verify that text don't match with no lora + assert llama_no_lora_str != ultravox_str diff --git a/vllm/assets/audio.py b/vllm/assets/audio.py index 77af766e54d1b..a46c67ad7e00e 100644 --- a/vllm/assets/audio.py +++ b/vllm/assets/audio.py @@ -20,13 +20,6 @@ class AudioAsset: name: Literal["winning_call", "mary_had_lamb"] - def __init__(self, audio_path=None): - if audio_path is None: - audio_path = get_vllm_public_assets(filename=f"{self.name}.ogg", - s3_prefix=ASSET_DIR) - - object.__setattr__(self, '_audio_path', audio_path) - @property def audio_and_sample_rate(self) -> tuple[npt.NDArray, float]: audio_path = get_vllm_public_assets(filename=f"{self.name}.ogg", diff --git a/vllm/lora/models.py b/vllm/lora/models.py index 70f941a52384c..278616c45d8a7 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -167,14 +167,9 @@ def from_lora_tensors( loras[module_name].lora_b = loras[ module_name].lora_b.pin_memory() - print_v=False for lora in loras.values(): - if "v_proj" in lora.module_name and not print_v: - print_v=True - logger.debug(f"Size of v_proj is: {lora.lora_a.size()}") lora.optimize() - logger.debug(f"Creating loras for {lora_model_id} with following modules {loras.keys()}") return cls(lora_model_id, peft_helper.r, loras, @@ -392,11 +387,10 @@ def activate_adapter( logger.debug("Activating LoRA. 
int id: %d, slot index: %d", lora_model.id, index) self.lora_index_to_id[index] = lora_model.id + missing_modules = [] for module_name, module in self.modules.items(): module_lora = lora_model.get_lora(module_name) if module_lora: - logger.debug("Setting LoRA. int id: %d, module: %s", - lora_model.id, module_name) module_lora.optimize() # Bias is not explicitly enabled with the flag enable_lora_bias. bias = module_lora.bias @@ -412,9 +406,14 @@ def activate_adapter( module_lora.embeddings_tensor, module_lora.bias) else: - logger.debug("Reseting lora. int id: %d, module: %s", - lora_model.id, module_name) + missing_modules.append(module_name) module.reset_lora(index) + + if len(missing_modules) > 0: + logger.warning( + "Lora adapter int id %d is activated but is missing \ + base model modules %s which could impact output", + lora_model.id, missing_modules) return True def _deactivate_adapter(self, lora_id: int): @@ -471,10 +470,6 @@ def _create_lora_modules(self): for module_name, module in self.model.named_modules( remove_duplicate=False): - logger.debug( - "Create lora module if applicable %s", - module_name, - ) if isinstance(module, PPMissingLayer): continue if not self._match_target_modules(module_name): @@ -521,15 +516,12 @@ def _create_lora_modules(self): if self.supports_mm and not isinstance(new_module, BaseLayerWithLoRA): logger.warning( - "%s module will be ignored because it isn't of type BaseLayerWithLoRA", + "%s module will be ignored because it isn't of type \ + BaseLayerWithLoRA", module_name, ) continue - logger.debug( - "Going to apply lora on %s module", - module_name, - ) self.register_module(module_name, new_module) self._register_packed_modules(module_name) # All lora layers share the same punica_wrapper based on reference. @@ -545,9 +537,6 @@ def create_dummy_lora( rank: int, scaling_factor: Optional[float], embedding_modules: Optional[Dict[str, str]] = None) -> LoRAModel: - logger.debug( - f"Creating a dummy lora with id: {lora_id}" - ) """Create zero-initialized LoRAModel for warmup.""" model = LoRAModel(lora_id, rank, {}, scaling_factor) for module_name, module in self.model.named_modules(): diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index be75ffa3946c9..d563aba8a2838 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -21,6 +21,7 @@ from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.model_loader.loader import DefaultModelLoader +from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs, @@ -30,11 +31,10 @@ MultiModalDataItems, ProcessorInputs, PromptReplacement) from vllm.sequence import IntermediateTensors -from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.transformers_utils.configs.ultravox import UltravoxConfig from vllm.utils import is_list_of -from .interfaces import SupportsMultiModal, SupportsPP, SupportsLoRA +from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsPP from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn, init_vllm_registered_model, maybe_prefix, merge_multimodal_embeddings_from_map) @@ -319,12 +319,18 @@ def forward( @MULTIMODAL_REGISTRY.register_processor(UltravoxMultiModalProcessor) class 
UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA):
-    #TODO: not sure what is right thing to do here yet
-    packed_modules_mapping = {}
-    #should all llama3 modules be supported here?
-    #source: https://github.com/fixie-ai/ultravox/blob/812f58c5f50c02589c08668d9afe6e4f8c6d0d74/ultravox/model/ultravox_config.py#L20
+    # Same as LlamaForCausalLM (the language model), minus the embedding
+    # and a few other modules. The embedding modules were left out as a
+    # precaution, since they could affect text but not audio.
+    packed_modules_mapping = {
+        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
+        "gate_up_proj": ["gate_proj", "up_proj"]
+    }
+
+    # lm_head is not added for now since it requires a logits_processor,
+    # which UltravoxModel does not define.
     supported_lora_modules = [
-        'linear_k', 'linear_q', 'k_proj', 'q_proj'
+        "qkv_proj", "o_proj", "gate_up_proj", "down_proj"
     ]
     embedding_modules = {}
     embedding_padding_modules = []
@@ -340,10 +346,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         self.multi_modal_config = multimodal_config
         assert self.multi_modal_config
 
-        #TODO: maybe log a warning if lora config is present in UltravoxConfig?
-        #TODO: figure out if these prefixes need tweaking to support LoRA and/or
-        #use LLMWrapper or not like this https://github.com/vllm-project/vllm/pull/7199/files#diff-7b8a4e258637b7c94389c745c449c52137d33cf92957f3e5bcb18a0ee204b21bR807
-
         self.secondary_weights = []
         self.audio_tower = ModifiedWhisperEncoder(config.audio_config)
         if config.audio_model_id is not None:
@@ -379,16 +381,12 @@ def sampler(self):
 
         return get_sampler()
 
-    # Following PR: https://github.com/vllm-project/vllm/pull/7199/files
-    # check language_model and audio_tower prefixes
-    # can't tell if vLLM will apply audio lora or not based on following warning:
-    # https://github.com/vllm-project/vllm/pull/7199/files#diff-d3df23c3e3bcfe97ee8507061c6de54f0eff23a8c75d7f5999062c42245290f8R1033
     def get_mm_mapping(self) -> MultiModelKeys:
         """
         Get the module prefix in multimodal models
         """
-        return MultiModelKeys.from_string_field(language_model="language_model",
-                                                tower_model="audio_tower")
+        return MultiModelKeys.from_string_field(
+            language_model="language_model", tower_model="audio_tower")
 
     def _audio_features_to_embeddings(
             self, input_features: torch.Tensor) -> torch.Tensor:
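
Usage sketch (editor's note, not part of the patch above): the snippet below shows one way the Ultravox LoRA support introduced here could be exercised through the offline vllm.LLM API, mirroring tests/lora/test_ultravox.py. The adapter directory is a placeholder and the engine arguments simply echo the values used in the new test; treat this as an illustrative sketch under those assumptions, not a documented interface.

from transformers import AutoTokenizer

from vllm import LLM, SamplingParams
from vllm.lora.request import LoRARequest

MODEL_NAME = "fixie-ai/ultravox-v0_3"
# Placeholder path; point this at a LoRA adapter trained on the
# Llama-3.1-8B language-model backbone (e.g. the chess adapter used
# in tests/lora/test_ultravox.py).
LORA_PATH = "/path/to/ultravox-chess-lora"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Text-only prompt (no audio placeholder), matching the new test.
prompt = tokenizer.apply_chat_template(
    [{
        "role": "user",
        "content": "Tell me about a silly chess move in 20 words"
    }],
    tokenize=False,
    add_generation_prompt=True)

llm = LLM(
    MODEL_NAME,
    enforce_eager=True,
    enable_lora=True,
    max_loras=4,
    max_lora_rank=128,  # the test adapter uses rank 128
    dtype="bfloat16",
    max_model_len=4096,
)

outputs = llm.generate(
    [prompt],
    SamplingParams(temperature=0.0, max_tokens=256),
    # The adapter is applied only to the mapped language-model modules
    # (qkv_proj, o_proj, gate_up_proj, down_proj).
    lora_request=LoRARequest("chess", 1, LORA_PATH),
)
print(outputs[0].outputs[0].text)

Because supported_lora_modules only maps the Llama language-model projections, an adapter trained against the Llama-3.1 backbone is applied to the text path while the Whisper-based audio tower is left untouched.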