diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index f5591f1098534..455f02a2062f1 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -481,7 +481,6 @@ steps: - label: LoRA TP Test (Distributed) num_gpus: 4 - soft_fail: true source_file_dependencies: - vllm/lora - tests/lora diff --git a/vllm/lora/fully_sharded_layers.py b/vllm/lora/fully_sharded_layers.py index 5f2d32defe030..e25e453201f01 100644 --- a/vllm/lora/fully_sharded_layers.py +++ b/vllm/lora/fully_sharded_layers.py @@ -77,13 +77,6 @@ def apply(self, x: torch.Tensor, add_input=True) # now have column partitioned output - if self.bias_stacked is not None: - self.bias_stacked = self.bias_stacked.view( - -1, self.bias_stacked.shape[-1]) - self.bias_stacked = self.bias_stacked[ - self.punica_wrapper.token_lora_indices] - output += self.bias_stacked - output = output.view(*out_orig_shape) return output @@ -222,7 +215,7 @@ def apply(self, x: torch.Tensor, self.punica_wrapper.add_expand(output, buffer, self.lora_b_stacked, - self.bias_all, + self.bias_stacked, add_input=True) # now have column partitioned output output = output.view(*out_orig_shape)