Fix lora modules and formatting
Remove stale comment

Add llama lora modules

Add llama test case

Add test case and log warning on missing lora modules

Rollback unwanted changes and format fixes

Signed-off-by: Sumit Vij <[email protected]>
thedebugger committed Jan 1, 2025
1 parent 5a6b79f commit 3f5996c
Showing 6 changed files with 128 additions and 99 deletions.
16 changes: 12 additions & 4 deletions tests/conftest.py
@@ -733,14 +733,16 @@ def generate(
images: Optional[PromptImageInput] = None,
videos: Optional[PromptVideoInput] = None,
audios: Optional[PromptAudioInput] = None,
**kwargs: Any,
) -> List[Tuple[List[List[int]], List[str]]]:
inputs = self.get_inputs(prompts,
images=images,
videos=videos,
audios=audios)

req_outputs = self.model.generate(inputs,
sampling_params=sampling_params)
sampling_params=sampling_params,
**kwargs)

outputs: List[Tuple[List[List[int]], List[str]]] = []
for req_output in req_outputs:
@@ -778,6 +780,7 @@ def generate_w_logprobs(
images: Optional[PromptImageInput] = None,
audios: Optional[PromptAudioInput] = None,
videos: Optional[PromptVideoInput] = None,
**kwargs: Any,
) -> Union[List[TokensTextLogprobs],
List[TokensTextLogprobsPromptLogprobs]]:
inputs = self.get_inputs(prompts,
@@ -786,7 +789,8 @@ def generate_w_logprobs(
audios=audios)

req_outputs = self.model.generate(inputs,
sampling_params=sampling_params)
sampling_params=sampling_params,
**kwargs)

toks_str_logsprobs_prompt_logprobs = (
self._final_steps_generate_w_logprobs(req_outputs))
@@ -822,13 +826,15 @@ def generate_greedy(
images: Optional[PromptImageInput] = None,
videos: Optional[PromptVideoInput] = None,
audios: Optional[PromptAudioInput] = None,
**kwargs: Any,
) -> List[Tuple[List[int], str]]:
greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens)
outputs = self.generate(prompts,
greedy_params,
images=images,
videos=videos,
audios=audios)
audios=audios,
**kwargs)
return [(output_ids[0], output_str[0])
for output_ids, output_str in outputs]

@@ -843,6 +849,7 @@ def generate_greedy_logprobs(
videos: Optional[PromptVideoInput] = None,
stop_token_ids: Optional[List[int]] = None,
stop: Optional[List[str]] = None,
**kwargs: Any,
) -> Union[List[TokensTextLogprobs],
List[TokensTextLogprobsPromptLogprobs]]:
greedy_logprobs_params = SamplingParams(
@@ -857,7 +864,8 @@ def generate_greedy_logprobs(
greedy_logprobs_params,
images=images,
audios=audios,
videos=videos)
videos=videos,
**kwargs)

def generate_encoder_decoder_greedy_logprobs(
self,
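The practical effect of the new **kwargs plumbing above is that per-request options now reach LLM.generate() from the test helpers. A minimal sketch of how a test can thread a LoRA adapter through generate_greedy, mirroring the call pattern in tests/lora/test_ultravox.py below (the helper name greedy_with_adapter is illustrative, not part of the commit):

from typing import List, Tuple

from vllm.lora.request import LoRARequest


def greedy_with_adapter(vllm_model, prompt: str,
                        lora_path: str) -> List[Tuple[List[int], str]]:
    # Extra keyword arguments are forwarded generate_greedy -> generate ->
    # LLM.generate, so the LoRARequest rides along with the prompt.
    return vllm_model.generate_greedy(
        [prompt],
        256,  # max_tokens
        lora_request=LoRARequest("adapter", 1, lora_path),
    )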
16 changes: 15 additions & 1 deletion tests/lora/conftest.py
@@ -147,18 +147,29 @@ def sql_lora_huggingface_id():
# huggingface repo id is used to test lora runtime downloading.
return "yard1/llama-2-7b-sql-lora-test"


@pytest.fixture(scope="session")
def sql_lora_files(sql_lora_huggingface_id):
return snapshot_download(repo_id=sql_lora_huggingface_id)


@pytest.fixture(scope="session")
def llama3_1_8b_chess_lora():
return snapshot_download(repo_id="mkopecki/chess-lora-adapter-llama-3.1-8b")
return snapshot_download(
repo_id="mkopecki/chess-lora-adapter-llama-3.1-8b")


@pytest.fixture(scope="session")
def llama3_1_8b_ultravox_chess_lora():
    # the ultravox chess lora is the result of transforming the chess llama lora above
return snapshot_download(repo_id="thedebugger11/ultravox-chess-lora")


@pytest.fixture(scope="session")
def lora_bias_files():
return snapshot_download(repo_id="followumesh/granite-3b-lora8-bias")


@pytest.fixture(scope="session")
def mixtral_lora_files():
# Note: this module has incorrect adapter_config.json to test
@@ -214,6 +225,7 @@ def baichuan_zero_lora_files():
# all the lora_B weights are initialized to zero.
return snapshot_download(repo_id="jeeejeee/baichuan7b-zero-init")


@pytest.fixture(scope="session")
def baichuan_regex_lora_files():
return snapshot_download(repo_id="jeeejeee/baichuan-7b-lora-zero-regex")
@@ -223,6 +235,7 @@ def baichuan_regex_lora_files():
def minicpmv_lora_files():
return snapshot_download(repo_id="jeeejeee/minicpmv25-lora-pokemon")


@pytest.fixture(scope="session")
def qwen2vl_lora_files():
return snapshot_download(repo_id="jeeejeee/qwen2-vl-lora-pokemon")
@@ -232,6 +245,7 @@ def qwen2vl_lora_files():
def tinyllama_lora_files():
return snapshot_download(repo_id="jashing/tinyllama-colorist-lora")


@pytest.fixture(scope="session")
def phi2_lora_files():
return snapshot_download(repo_id="isotr0py/phi-2-test-sql-lora")
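For reference, each fixture above simply returns the local directory that snapshot_download resolves to, so a test consumes the adapters by declaring the fixture names. A hypothetical sketch (the test name is illustrative, not part of the suite):

import os


def test_chess_adapters_are_downloaded(llama3_1_8b_chess_lora,
                                       llama3_1_8b_ultravox_chess_lora):
    # Session-scoped fixtures hand back plain paths to the downloaded snapshots.
    assert os.path.isdir(llama3_1_8b_chess_lora)
    assert os.path.isdir(llama3_1_8b_ultravox_chess_lora)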
125 changes: 76 additions & 49 deletions tests/lora/test_ultravox.py
@@ -1,24 +1,21 @@
from typing import List, Tuple

from typing import List
from transformers import AutoTokenizer

import pytest

import vllm

from transformers import AutoTokenizer
from vllm.lora.request import LoRARequest
from vllm.platforms import current_platform

MODEL_NAME = "fixie-ai/ultravox-v0_3"
from ..models.utils import check_outputs_equal

ULTRAVOX_MODEL_NAME = "fixie-ai/ultravox-v0_3"
LLMA_MODEL_NAME = "meta-llama/Llama-3.1-8B-Instruct"

VLLM_PLACEHOLDER = "<|reserved_special_token_0|>"

EXPECTED_OUTPUT = [
"Fool mate"
]
PROMPT = "Tell me about a silly chess move in 20 words"


def _get_prompt(audio_count, question, placeholder):
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
def _get_prompt(audio_count, question, placeholder, model_name) -> str:
tokenizer = AutoTokenizer.from_pretrained(model_name)
placeholder = f"{placeholder}\n" * audio_count

return tokenizer.apply_chat_template([{
@@ -28,44 +25,74 @@ def _get_prompt(audio_count, question, placeholder):
tokenize=False,
add_generation_prompt=True)

def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]:
sampling_params = vllm.SamplingParams(
temperature=0,
max_tokens=1000,
)

inputs = [{
"prompt":_get_prompt(1, "Tell me about a silly chess move in 20 words", VLLM_PLACEHOLDER),
}]
def test_ultravox_lora(vllm_runner, llama3_1_8b_chess_lora,
llama3_1_8b_ultravox_chess_lora):
with vllm_runner(
ULTRAVOX_MODEL_NAME,
enforce_eager=True,
max_num_seqs=128,
enable_lora=True,
max_loras=4,
max_lora_rank=128,
dtype="bfloat16",
max_model_len=4096,
) as vllm_model:
ultravox_outputs: List[Tuple[List[int],
str]] = vllm_model.generate_greedy(
[
_get_prompt(
0, PROMPT, VLLM_PLACEHOLDER,
ULTRAVOX_MODEL_NAME)
],
256,
lora_request=LoRARequest(
str(1), 1,
llama3_1_8b_ultravox_chess_lora),
)

# run llama with and without lora to compare outputs with above
with vllm_runner(
LLMA_MODEL_NAME,
enforce_eager=True,
max_num_seqs=128,
enable_lora=True,
max_loras=4,
max_lora_rank=128,
dtype="bfloat16",
max_model_len=4096,
) as vllm_model:
llama_outputs_no_lora: List[Tuple[List[int],
str]] = vllm_model.generate_greedy(
[
_get_prompt(
0, PROMPT,
VLLM_PLACEHOLDER,
LLMA_MODEL_NAME)
],
256,
)
llama_outputs: List[Tuple[List[int],
str]] = vllm_model.generate_greedy(
[
_get_prompt(0, PROMPT,
VLLM_PLACEHOLDER,
LLMA_MODEL_NAME)
],
256,
lora_request=LoRARequest(
str(1), 1, llama3_1_8b_chess_lora),
)

outputs = llm.generate(
inputs,
sampling_params,
lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
if lora_id else None,
check_outputs_equal(
outputs_0_lst=ultravox_outputs,
outputs_1_lst=llama_outputs,
name_0="ultravox",
name_1="llama",
)
generated_texts: List[str] = []
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text.strip()
generated_texts.append(generated_text)
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
return generated_texts

_, llama_no_lora_str = llama_outputs_no_lora[0]
_, ultravox_str = ultravox_outputs[0]

def test_fixie_lora(llama3_1_8b_chess_lora):
llm = vllm.LLM(
MODEL_NAME,
max_num_seqs=2,
enable_lora=True,
max_loras=4,
max_lora_rank=128,
trust_remote_code=True,
dtype="bfloat16",
max_model_len=4096,
enforce_eager=True
)
output1 = do_sample(llm, llama3_1_8b_chess_lora, lora_id=1)
for i in range(len(EXPECTED_OUTPUT)):
assert EXPECTED_OUTPUT[i].startswith(output1[i])
return None
    # verify that the no-lora llama output does not match the ultravox lora output
assert llama_no_lora_str != ultravox_str
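check_outputs_equal, imported from ..models.utils, asserts an element-wise match between the two runs; it is roughly equivalent to the simplified stand-in below (a sketch only; the real helper also reports name_0/name_1 in its assertion messages):

from typing import List, Tuple


def outputs_equal(outputs_0: List[Tuple[List[int], str]],
                  outputs_1: List[Tuple[List[int], str]]) -> bool:
    # Each entry is (token_ids, text) from generate_greedy; both must match
    # position by position for the two adapters to be considered equivalent.
    if len(outputs_0) != len(outputs_1):
        return False
    return all(ids_0 == ids_1 and text_0 == text_1
               for (ids_0, text_0), (ids_1, text_1) in zip(outputs_0, outputs_1))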
7 changes: 0 additions & 7 deletions vllm/assets/audio.py
@@ -20,13 +20,6 @@
class AudioAsset:
name: Literal["winning_call", "mary_had_lamb"]

def __init__(self, audio_path=None):
if audio_path is None:
audio_path = get_vllm_public_assets(filename=f"{self.name}.ogg",
s3_prefix=ASSET_DIR)

object.__setattr__(self, '_audio_path', audio_path)

@property
def audio_and_sample_rate(self) -> tuple[npt.NDArray, float]:
audio_path = get_vllm_public_assets(filename=f"{self.name}.ogg",
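With the ad-hoc __init__ removed, AudioAsset goes back to being a plain dataclass keyed by name, and the .ogg file is resolved from the public asset bucket only when a property is accessed. Typical usage stays roughly as follows (a sketch, assuming the asset is reachable):

from vllm.assets.audio import AudioAsset

# The asset is fetched lazily on first property access, not at construction.
audio, sample_rate = AudioAsset("mary_had_lamb").audio_and_sample_rate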
31 changes: 10 additions & 21 deletions vllm/lora/models.py
@@ -167,14 +167,9 @@ def from_lora_tensors(
loras[module_name].lora_b = loras[
module_name].lora_b.pin_memory()

print_v=False
for lora in loras.values():
if "v_proj" in lora.module_name and not print_v:
print_v=True
logger.debug(f"Size of v_proj is: {lora.lora_a.size()}")
lora.optimize()

logger.debug(f"Creating loras for {lora_model_id} with following modules {loras.keys()}")
return cls(lora_model_id,
peft_helper.r,
loras,
@@ -392,11 +387,10 @@ def activate_adapter(
logger.debug("Activating LoRA. int id: %d, slot index: %d",
lora_model.id, index)
self.lora_index_to_id[index] = lora_model.id
missing_modules = []
for module_name, module in self.modules.items():
module_lora = lora_model.get_lora(module_name)
if module_lora:
logger.debug("Setting LoRA. int id: %d, module: %s",
lora_model.id, module_name)
module_lora.optimize()
# Bias is not explicitly enabled with the flag enable_lora_bias.
bias = module_lora.bias
@@ -412,9 +406,14 @@
module_lora.embeddings_tensor,
module_lora.bias)
else:
logger.debug("Reseting lora. int id: %d, module: %s",
lora_model.id, module_name)
missing_modules.append(module_name)
module.reset_lora(index)

if len(missing_modules) > 0:
logger.warning(
"Lora adapter int id %d is activated but is missing \
base model modules %s which could impact output",
lora_model.id, missing_modules)
return True

def _deactivate_adapter(self, lora_id: int):
@@ -471,10 +470,6 @@ def _create_lora_modules(self):
for module_name, module in self.model.named_modules(
remove_duplicate=False):

logger.debug(
"Create lora module if applicable %s",
module_name,
)
if isinstance(module, PPMissingLayer):
continue
if not self._match_target_modules(module_name):
@@ -521,15 +516,12 @@ def _create_lora_modules(self):
if self.supports_mm and not isinstance(new_module,
BaseLayerWithLoRA):
logger.warning(
"%s module will be ignored because it isn't of type BaseLayerWithLoRA",
"%s module will be ignored because it isn't of type \
BaseLayerWithLoRA",
module_name,
)
continue

logger.debug(
"Going to apply lora on %s module",
module_name,
)
self.register_module(module_name, new_module)
self._register_packed_modules(module_name)
# All lora layers share the same punica_wrapper based on reference.
@@ -545,9 +537,6 @@ def create_dummy_lora(
rank: int,
scaling_factor: Optional[float],
embedding_modules: Optional[Dict[str, str]] = None) -> LoRAModel:
logger.debug(
f"Creating a dummy lora with id: {lora_id}"
)
"""Create zero-initialized LoRAModel for warmup."""
model = LoRAModel(lora_id, rank, {}, scaling_factor)
for module_name, module in self.model.named_modules():
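Condensed from the interleaved diff above, the adapter-activation change amounts to the following paraphrased sketch (not a verbatim excerpt; bias gating and the unchanged set_lora arguments are abbreviated): modules the adapter does not cover are collected and reported once via a single warning, instead of one debug line per module.

import logging

logger = logging.getLogger(__name__)


def activate_adapter_sketch(modules, lora_model, index: int) -> bool:
    # modules: base-model module name -> LoRA-wrapped layer; lora_model: the
    # adapter being activated (duck-typed stand-ins for the real attributes).
    missing_modules = []
    for module_name, module in modules.items():
        module_lora = lora_model.get_lora(module_name)
        if module_lora:
            module_lora.optimize()
            module.set_lora(index, module_lora.lora_a, module_lora.lora_b,
                            module_lora.embeddings_tensor, module_lora.bias)
        else:
            missing_modules.append(module_name)
            module.reset_lora(index)
    if missing_modules:
        logger.warning(
            "Lora adapter int id %d is activated but is missing "
            "base model modules %s which could impact output",
            lora_model.id, missing_modules)
    return True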