Commit 58e85eb: a bit faster

robertgshaw2-neuralmagic committed Nov 17, 2024
1 parent 75c44b4 commit 58e85eb
Showing 1 changed file with 2 additions and 0 deletions.
2 changes: 2 additions & 0 deletions vllm/v1/worker/tpu_model_runner.py
@@ -376,6 +376,7 @@ def execute_model(
)

# NOTE: TPU<>CPU sync happens here.
# It is important to call .cpu() first to avoid compilation on hotpath.

Check failure on line 379 in vllm/v1/worker/tpu_model_runner.py
GitHub Actions / ruff (3.12): Ruff (E501)
vllm/v1/worker/tpu_model_runner.py:379:81: E501 Line too long (83 > 80)
token_ids = selected_token_ids.cpu()[:num_decodes]
sampled_token_ids_list = token_ids.tolist()
sampled_token_ids[:num_decodes] = token_ids
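
The intent behind the new comment can be shown with a minimal sketch (not the vLLM source; the tensor shapes, `num_decodes`, and the output buffer are illustrative stand-ins): move the sampler output to the host once with `.cpu()`, then do all slicing and list conversion on that CPU copy so no further device work, and no extra torch/XLA compilation, lands on the hot path.

```python
import torch

# Illustrative sizes; the real runner derives these from the batch.
num_decodes = 4
max_num_reqs = 8

# Stand-in for the sampler output. On a real TPU this would be an XLA
# device tensor; a CPU tensor is used here so the sketch runs anywhere.
selected_token_ids = torch.randint(0, 32_000, (max_num_reqs,))
sampled_token_ids = torch.zeros(max_num_reqs, dtype=torch.long)

# Single TPU<>CPU sync point: .cpu() materialises the data on the host,
# so the slicing, tolist(), and assignment below are pure host-side work
# and cannot trigger additional compilation on the device.
token_ids = selected_token_ids.cpu()[:num_decodes]
sampled_token_ids_list = token_ids.tolist()
sampled_token_ids[:num_decodes] = token_ids
```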
@@ -407,6 +408,7 @@ def execute_model(
is_prompt=True
)
# NOTE: TPU<>CPU sync happens here.
# It is important to call .cpu() first to avoid compilation on hotpath.

Check failure on line 411 in vllm/v1/worker/tpu_model_runner.py
GitHub Actions / ruff (3.12): Ruff (E501)
vllm/v1/worker/tpu_model_runner.py:411:81: E501 Line too long (83 > 80)
token_id = selected_token_ids.cpu()[prompt_len - 1].item()
sampled_token_ids[num_decodes + idx] = token_id
req_state = self.requests[req_id]
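
The prompt-side hunk applies the same idea inside the per-prompt loop. A hedged sketch follows; the loop structure, prompt lengths, and tensor shapes are assumed for illustration, and only the `.cpu()[prompt_len - 1].item()` pattern comes from the diff.

```python
import torch

num_decodes = 4
# Hypothetical prompt lengths and a stand-in sampler output per prompt;
# on a real TPU these tensors would live on the XLA device.
prompt_lens = [5, 9, 3]
outputs_per_prompt = [torch.randint(0, 32_000, (16,)) for _ in prompt_lens]
sampled_token_ids = torch.zeros(num_decodes + len(prompt_lens),
                                dtype=torch.long)

for idx, (prompt_len, selected_token_ids) in enumerate(
        zip(prompt_lens, outputs_per_prompt)):
    # Copy to the host first, then index and call .item() on the CPU
    # tensor. Calling .item() on the device tensor directly would force a
    # sync (and potentially extra compilation under torch/XLA) per prompt.
    token_id = selected_token_ids.cpu()[prompt_len - 1].item()
    sampled_token_ids[num_decodes + idx] = token_id
```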
