Fix for speculative-decoding (spec) model with tensor parallelism (TP) + chunked prefill

Signed-off-by: andoorve <[email protected]>
vllm-project · Nov 11, 2024 · 6863d1f · 6863d1f
1 parent 9d5b4e4
commit 6863d1f
Showing 1 changed file with 6 additions and 0 deletions.
diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py
@@ -641,6 +641,12 @@ def _run_non_driver_rank(self) -> bool:
         # that the hidden states can be propagated to proposer when needed.
         if data["no_spec"]:
             self.scorer_worker.execute_model()
+            # In the no-spec case we still want to run the proposer model,
+            # but ONLY once, to match `not skip_proposer` in the
+            # driver's `_run_no_spec`.
+            if not data["disable_all_speculation"]:
+                self.proposer_worker.execute_model()
+            return True
 
         if not data["disable_all_speculation"]:
             # Even if num_lookahead_slots is zero, we want to run the