Commit d9dc36a

cleanup PR

robertgshaw2-redhat committed Nov 17, 2024
1 parent fcf4681 commit d9dc36a
Showing 9 changed files with 133 additions and 162 deletions.
2 changes: 1 addition & 1 deletion benchmarks/benchmark_throughput.py
@@ -171,7 +171,7 @@ def run_vllm(
ignore_eos=True,
))
end = time.perf_counter()

-# print(met.metrics_report())
+# print(met.short_metrics_report())

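The commented-out calls reference what looks like the torch_xla debug-metrics module (conventionally imported as `met`). A minimal usage sketch under that assumption, with torch_xla installed; note the commit only renames the commented-out line and does not enable it:

# Sketch only, not part of the commit; assumes `met` is torch_xla.debug.metrics.
import torch_xla.debug.metrics as met

# ... run the TPU workload being benchmarked ...

# Condensed counter/timer summary instead of the full metrics report.
print(met.short_metrics_report())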
4 changes: 2 additions & 2 deletions tests/entrypoints/openai/test_accuracy.py
@@ -67,8 +67,8 @@ def run_test(more_args):
), f"Expected: {EXPECTED_VALUE} | Measured: {measured_value}"


-@pytest.mark.skipif(not current_platform.is_cuda() and
-                    not current_platform.is_tpu(),
+@pytest.mark.skipif(not current_platform.is_cuda()
+                    and not current_platform.is_tpu(),
                    reason="V1 currently only supported on CUDA")
def test_lm_eval_accuracy_v1_engine(monkeypatch):
"""Run with the V1 Engine."""
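Most of the remaining hunks in this commit (here, in vllm/attention/selector.py, and in vllm/v1/core/scheduler.py) make the same mechanical change: multi-line boolean conditions are now split before the operator, so each continuation line starts with `and`. A self-contained sketch of the pattern using illustrative names, not code from the repository:

# Illustrative stubs; only the line-break style is the point here.
def backend_is_supported(name: str) -> bool:
    return name in {"PALLAS", "PALLAS_VLLM_V1"}


def budget_allows(name: str) -> bool:
    return len(name) < 32


name = "PALLAS"

# Old style: the boolean operator trails the first line.
if (backend_is_supported(name) and
        budget_allows(name)):
    print("old style")

# New style after this cleanup: the operator leads the continuation line,
# which keeps long conditions easier to scan.
if (backend_is_supported(name)
        and budget_allows(name)):
    print("new style")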
4 changes: 2 additions & 2 deletions vllm/attention/selector.py
@@ -237,8 +237,8 @@ def which_attn_to_use(head_size: int,
        return _Backend.IPEX

    if current_platform.is_tpu():
-        if (selected_backend != _Backend.PALLAS and
-                selected_backend != _Backend.PALLAS_VLLM_V1):
+        if (selected_backend != _Backend.PALLAS
+                and selected_backend != _Backend.PALLAS_VLLM_V1):
            logger.info("Cannot use %s backend on TPU.", selected_backend)
            if use_v1:
                return _Backend.PALLAS_VLLM_V1
2 changes: 1 addition & 1 deletion vllm/config.py
@@ -1225,7 +1225,7 @@ def __init__(self, device: str = "auto") -> None:
        # Some device types require processing inputs on CPU
        if self.device_type in ["neuron", "openvino"]:
            self.device = torch.device("cpu")
-        # Device initialization should happen after initializing the
+        # Device initialization should happen after initializing the
        # distributed runtime.
        elif self.device_type in ["tpu"]:
            self.device = None
8 changes: 4 additions & 4 deletions vllm/v1/core/scheduler.py
@@ -151,12 +151,12 @@ def schedule(self) -> "SchedulerOutput":
                    num_computed_tokens -= 1
                    num_new_tokens = 1
                    computed_blocks.pop()

                # If chunked prefill is not enabled, breakout of the loop.
-                if (not self.scheduler_config.chunked_prefill_enabled and
-                        num_new_tokens > token_budget):
+                if (not self.scheduler_config.chunked_prefill_enabled
+                        and num_new_tokens > token_budget):
                    break

                num_new_tokens = min(num_new_tokens, token_budget)
                assert num_new_tokens > 0
                new_blocks = self.kv_cache_manager.allocate_slots(
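For readability, a simplified sketch of the budget check this hunk reformats (names shortened; the real logic lives in Scheduler.schedule above): when chunked prefill is disabled, a request whose remaining prompt tokens exceed the step's token budget is not scheduled this step at all, whereas with chunked prefill the request is clamped to the budget.

# Simplified sketch of the check shown in the hunk, not the scheduler itself.
def tokens_to_schedule(num_new_tokens: int, token_budget: int,
                       chunked_prefill_enabled: bool) -> int:
    if (not chunked_prefill_enabled
            and num_new_tokens > token_budget):
        return 0  # the request does not fit; stop scheduling it this step
    return min(num_new_tokens, token_budget)


assert tokens_to_schedule(100, 50, chunked_prefill_enabled=False) == 0
assert tokens_to_schedule(100, 50, chunked_prefill_enabled=True) == 50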
10 changes: 5 additions & 5 deletions vllm/v1/executor/tpu_executor.py
@@ -10,6 +10,7 @@

# import torch_xla.debug.profiler as xp

+
class TPUExecutor:

    def __init__(self, vllm_config: VllmConfig) -> None:
@@ -32,11 +33,10 @@ def __init__(self, vllm_config: VllmConfig) -> None:
        # self.server = xp.start_server(9012)

    def _create_worker(
-        self,
-        local_rank: int = 0,
-        rank: int = 0,
-        distributed_init_method: Optional[str] = None
-    ) -> TPUWorker:
+            self,
+            local_rank: int = 0,
+            rank: int = 0,
+            distributed_init_method: Optional[str] = None) -> TPUWorker:
        """Return worker init args for a given rank."""

        if distributed_init_method is None:
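The body of the `if distributed_init_method is None:` branch is collapsed in this view, so what follows is an assumption for illustration only: vLLM executors commonly build a single-node init method from `vllm.utils` helpers, but whether TPUExecutor does exactly this is not shown here.

# Assumption only; the actual TPUExecutor code is collapsed above. Shows one
# common way vLLM executors derive a default distributed_init_method.
from vllm.utils import get_distributed_init_method, get_ip, get_open_port

distributed_init_method = get_distributed_init_method(get_ip(), get_open_port())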