Commit d9dc36a

cleanup PR

robertgshaw2-redhat committed Nov 17, 2024
1 parent fcf4681 commit d9dc36a
Showing 9 changed files with 133 additions and 162 deletions.
2 changes: 1 addition & 1 deletion benchmarks/benchmark_throughput.py
@@ -171,7 +171,7 @@ def run_vllm(
ignore_eos=True,
))
end = time.perf_counter()

-# print(met.metrics_report())
+# print(met.short_metrics_report())

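The commented-out calls reference what looks like the torch_xla debug-metrics module (conventionally imported as `met`). A minimal usage sketch under that assumption, with torch_xla installed; note the commit only renames the commented-out line and does not enable it:

# Sketch only, not part of the commit; assumes `met` is torch_xla.debug.metrics.
import torch_xla.debug.metrics as met

# ... run the TPU workload being benchmarked ...

# Condensed counter/timer summary instead of the full metrics report.
print(met.short_metrics_report())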
4 changes: 2 additions & 2 deletions tests/entrypoints/openai/test_accuracy.py
@@ -67,8 +67,8 @@ def run_test(more_args):
), f"Expected: {EXPECTED_VALUE} | Measured: {measured_value}"


-@pytest.mark.skipif(not current_platform.is_cuda() and
-                    not current_platform.is_tpu(),
+@pytest.mark.skipif(not current_platform.is_cuda()
+                    and not current_platform.is_tpu(),
                    reason="V1 currently only supported on CUDA")
def test_lm_eval_accuracy_v1_engine(monkeypatch):
"""Run with the V1 Engine."""
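Most of the remaining hunks in this commit (here, in vllm/attention/selector.py, and in vllm/v1/core/scheduler.py) make the same mechanical change: multi-line boolean conditions are now split before the operator, so each continuation line starts with `and`. A self-contained sketch of the pattern using illustrative names, not code from the repository:

# Illustrative stubs; only the line-break style is the point here.
def backend_is_supported(name: str) -> bool:
    return name in {"PALLAS", "PALLAS_VLLM_V1"}


def budget_allows(name: str) -> bool:
    return len(name) < 32


name = "PALLAS"

# Old style: the boolean operator trails the first line.
if (backend_is_supported(name) and
        budget_allows(name)):
    print("old style")

# New style after this cleanup: the operator leads the continuation line,
# which keeps long conditions easier to scan.
if (backend_is_supported(name)
        and budget_allows(name)):
    print("new style")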
4 changes: 2 additions & 2 deletions vllm/attention/selector.py
@@ -237,8 +237,8 @@ def which_attn_to_use(head_size: int,
        return _Backend.IPEX

    if current_platform.is_tpu():
-        if (selected_backend != _Backend.PALLAS and
-                selected_backend != _Backend.PALLAS_VLLM_V1):
+        if (selected_backend != _Backend.PALLAS
+                and selected_backend != _Backend.PALLAS_VLLM_V1):
            logger.info("Cannot use %s backend on TPU.", selected_backend)
            if use_v1:
                return _Backend.PALLAS_VLLM_V1
2 changes: 1 addition & 1 deletion vllm/config.py
@@ -1225,7 +1225,7 @@ def __init__(self, device: str = "auto") -> None:
        # Some device types require processing inputs on CPU
        if self.device_type in ["neuron", "openvino"]:
            self.device = torch.device("cpu")
-        # Device initialization should happen after initializing the
+        # Device initialization should happen after initializing the
        # distributed runtime.
        elif self.device_type in ["tpu"]:
            self.device = None
8 changes: 4 additions & 4 deletions vllm/v1/core/scheduler.py
@@ -151,12 +151,12 @@ def schedule(self) -> "SchedulerOutput":
                    num_computed_tokens -= 1
                    num_new_tokens = 1
                    computed_blocks.pop()

                # If chunked prefill is not enabled, breakout of the loop.
-                if (not self.scheduler_config.chunked_prefill_enabled and
-                        num_new_tokens > token_budget):
+                if (not self.scheduler_config.chunked_prefill_enabled
+                        and num_new_tokens > token_budget):
                    break

                num_new_tokens = min(num_new_tokens, token_budget)
                assert num_new_tokens > 0
                new_blocks = self.kv_cache_manager.allocate_slots(
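For readability, a simplified sketch of the budget check this hunk reformats (names shortened; the real logic lives in Scheduler.schedule above): when chunked prefill is disabled, a request whose remaining prompt tokens exceed the step's token budget is not scheduled this step at all, whereas with chunked prefill the request is clamped to the budget.

# Simplified sketch of the check shown in the hunk, not the scheduler itself.
def tokens_to_schedule(num_new_tokens: int, token_budget: int,
                       chunked_prefill_enabled: bool) -> int:
    if (not chunked_prefill_enabled
            and num_new_tokens > token_budget):
        return 0  # the request does not fit; stop scheduling it this step
    return min(num_new_tokens, token_budget)


assert tokens_to_schedule(100, 50, chunked_prefill_enabled=False) == 0
assert tokens_to_schedule(100, 50, chunked_prefill_enabled=True) == 50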
10 changes: 5 additions & 5 deletions vllm/v1/executor/tpu_executor.py
@@ -10,6 +10,7 @@

# import torch_xla.debug.profiler as xp

+
class TPUExecutor:

    def __init__(self, vllm_config: VllmConfig) -> None:
@@ -32,11 +33,10 @@ def __init__(self, vllm_config: VllmConfig) -> None:
        # self.server = xp.start_server(9012)

    def _create_worker(
-        self,
-        local_rank: int = 0,
-        rank: int = 0,
-        distributed_init_method: Optional[str] = None
-    ) -> TPUWorker:
+            self,
+            local_rank: int = 0,
+            rank: int = 0,
+            distributed_init_method: Optional[str] = None) -> TPUWorker:
        """Return worker init args for a given rank."""

        if distributed_init_method is None:
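The body of the `if distributed_init_method is None:` branch is collapsed in this view, so what follows is an assumption for illustration only: vLLM executors commonly build a single-node init method from `vllm.utils` helpers, but whether TPUExecutor does exactly this is not shown here.

# Assumption only; the actual TPUExecutor code is collapsed above. Shows one
# common way vLLM executors derive a default distributed_init_method.
from vllm.utils import get_distributed_init_method, get_ip, get_open_port

distributed_init_method = get_distributed_init_method(get_ip(), get_open_port())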