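Fix accuracy bug: write _PAD_SLOT_ID to the slot_mapping entries after num_decodes (the padding), not to the last num_decodes entries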

neuralmagic · Nov 17, 2024 · 02ee304
1 parent 1af03e0
commit 02ee304
Showing 1 changed file with 1 addition and 1 deletion.
diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py
@@ -305,7 +305,7 @@ def _prepare_decode_inputs(self, num_decodes: int) -> DecodeInputData:
         slot_mapping = block_number * self.block_size + block_offsets
         # Set an out of range value for the padding tokens so that they
         # are ignored when inserting into the KV cache.
-        slot_mapping[-num_decodes:] = _PAD_SLOT_ID
+        slot_mapping[num_decodes:] = _PAD_SLOT_ID
         slot_mapping = slot_mapping[:padded_batch_size]
 
         # BLOCK_TABLE [batch, max_num_blocks_per_req]