From 0922d0dfd3e70e58f2d65d670d0d075bac9cd5a9 Mon Sep 17 00:00:00 2001
From: Sanju C Sudhakaran
Date: Tue, 19 Nov 2024 10:40:45 +0200
Subject: [PATCH] Enable LoRA support for Intel Gaudi

Signed-off-by: Sanju C Sudhakaran
---
 vllm/lora/layers.py             |  7 +++++++
 vllm/worker/hpu_model_runner.py | 17 ++++-------------
 2 files changed, 11 insertions(+), 13 deletions(-)

diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py
index 38cb846578d5c..6f7095d25b6d6 100644
--- a/vllm/lora/layers.py
+++ b/vllm/lora/layers.py
@@ -30,6 +30,10 @@
     LinearScalingRotaryEmbedding, RotaryEmbedding)
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     VocabParallelEmbedding)
+from vllm.platforms import current_platform
+
+if current_platform.is_hpu():
+    from vllm_hpu_extension.punica_hpu import GaudiPunicaWrapper
 
 if TYPE_CHECKING:
     from vllm.lora.punica_wrapper import PunicaWrapperBase
@@ -255,6 +259,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
                                                full_lora_a_embeddings,
                                                self.lora_b_stacked,
                                                add_input=True)
+
         return full_output.view_as(full_output_org)
 
     @classmethod
@@ -1068,6 +1073,8 @@ def _get_logits(
         ).index_select(0, indices_padded).nan_to_num_(nan=float("-inf"),
                                                       posinf=float("inf"),
                                                       neginf=float("-inf")))
+        if current_platform.is_hpu():
+            lora_logits = lora_logits[:logits.shape[0], :]
         logits[:,
                self.base_layer.org_vocab_size:self.base_layer.org_vocab_size +
                lora_logits.shape[1]] = lora_logits
diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py
index 99cf9a7e67256..76efdc7d5a5de 100644
--- a/vllm/worker/hpu_model_runner.py
+++ b/vllm/worker/hpu_model_runner.py
@@ -1282,11 +1282,9 @@ def create_dummy_seq_group_metadata(self,
     def profile_run(self) -> None:
         num_layers = self.model_config.get_num_layers(self.parallel_config)
         kv_caches = [None] * num_layers
-        max_batch_size = self.bucketing_global_state.prompt_bs_bucket_cfg[-1]
-        max_seq_len = min(
-            self.bucketing_global_state.prompt_seq_bucket_cfg[-1],
-            self.max_num_batched_tokens // max_batch_size)
-
+        max_seq_len = self.bucketing_global_state.prompt_seq_bucket_cfg[-1]
+        max_batch_size = min(self.max_num_batched_tokens // max_seq_len,
+                             self.scheduler_config.max_num_seqs)
         self.warmup_scenario(max_batch_size, max_seq_len, True, kv_caches,
                              False, True)
         return
@@ -1304,7 +1302,6 @@ def warmup_scenario(self,
                          f"bs{batch_size}_"
                          f"seq{seq_len}_"
                          f"graphs{'T' if use_graphs else 'F'}")
-        max_num_seqs = self.scheduler_config.max_num_seqs
         # This represents the maximum number of different requests
         # that will have unique loras, an therefore the max amount of memory
         # consumption create dummy lora request copies from the lora request
@@ -1326,16 +1323,10 @@
                     dummy_lora_requests.append(dummy_lora_request)
             dummy_lora_requests_per_seq = [
                 dummy_lora_requests[idx % len(dummy_lora_requests)]
-                for idx in range(max_num_seqs)
+                for idx in range(batch_size)
             ]
         self.profiler.start('internal', scenario_name)
         times = 3 if use_graphs or is_pt_profiler_run else 1
-        if self.lora_config and not is_lora_profile_run:
-            lora_mapping = LoRAMapping(
-                **dict(index_mapping=[0] * batch_size * seq_len,
-                       prompt_mapping=[0] * batch_size * seq_len,
-                       is_prefill=is_prompt))
-            self.set_active_loras(set(), lora_mapping)
         if is_prompt:
             seqs = [
                 self.create_dummy_seq_group_metadata(
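
Illustrative note (not applied by this patch): a minimal sketch of the two behavioural
changes above, using made-up values in place of the real vLLM config and engine state.
The values assigned to prompt_seq_bucket_cfg, max_num_batched_tokens, max_num_seqs and
the tensor shapes below are assumptions chosen only to show the sizing arithmetic and
the HPU-only slice, not values taken from a running engine.

    import torch

    # profile_run sizing after the patch: take the largest prompt sequence bucket
    # first, then derive the warmup batch size from the token budget and the
    # scheduler limit.
    prompt_seq_bucket_cfg = [128, 256, 1024]   # assumed bucket configuration
    max_num_batched_tokens = 8192              # assumed token budget
    max_num_seqs = 16                          # assumed scheduler limit

    max_seq_len = prompt_seq_bucket_cfg[-1]                      # 1024
    max_batch_size = min(max_num_batched_tokens // max_seq_len,  # 8192 // 1024 = 8
                         max_num_seqs)                           # -> 8

    # _get_logits trim after the patch: on HPU the LoRA logits may carry extra
    # padded rows, so they are sliced back to the real batch before being written
    # into the logits tensor.
    logits = torch.zeros(max_batch_size, 32000)          # real batch of logits
    lora_logits = torch.zeros(max_batch_size + 4, 256)   # padded LoRA logits
    lora_logits = lora_logits[:logits.shape[0], :]       # keep only real rows

    print(max_seq_len, max_batch_size, tuple(lora_logits.shape))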