From 9307bfa2349e80083fb843395356a49c2d3e4e01 Mon Sep 17 00:00:00 2001
From: Hanzhi Zhou
Date: Thu, 7 Nov 2024 05:09:15 +0000
Subject: [PATCH] fix format

Signed-off-by: Hanzhi Zhou
---
 vllm/distributed/device_communicators/custom_all_reduce.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/distributed/device_communicators/custom_all_reduce.py b/vllm/distributed/device_communicators/custom_all_reduce.py
index 48bc8d7392a4e..42fc364b3e0dc 100644
--- a/vllm/distributed/device_communicators/custom_all_reduce.py
+++ b/vllm/distributed/device_communicators/custom_all_reduce.py
@@ -278,7 +278,7 @@ def custom_all_reduce(self, input: torch.Tensor) -> Optional[torch.Tensor]:
                 return torch.empty_like(input)
         else:
             # Note: outside of cuda graph context, custom allreduce incurs a
-            # cost of cudaMemcpy, which should be small (<=1% of overall 
+            # cost of cudaMemcpy, which should be small (<=1% of overall
             # latency) compared to the performance gain of using custom kernels
             return self.all_reduce(input, registered=False)
 
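The comment touched by this hunk records a tradeoff: when the custom allreduce runs outside of a captured CUDA graph, it first stages the input (one cudaMemcpy) before reducing, and that copy is expected to cost at most about 1% of overall latency compared to the gain from the custom kernel. Below is a minimal sketch of that kind of branching, not vLLM's implementation: the function name, the `capturing` flag, the use of `clone()` as the staging copy, and `torch.distributed.all_reduce` as a stand-in for the custom kernel are illustrative assumptions, and the eager path presumes an already-initialized process group.

    import torch
    import torch.distributed as dist

    def sketch_custom_all_reduce(inp: torch.Tensor, capturing: bool) -> torch.Tensor:
        # Hypothetical sketch, not vLLM's actual code.
        if capturing:
            # While a CUDA graph is being warmed up / captured, only the output
            # allocation pattern needs to be reproduced; the real reduction is
            # handled by the capture machinery elsewhere.
            return torch.empty_like(inp)
        # Eager path: one explicit device copy (the cudaMemcpy the patched
        # comment refers to) into a scratch buffer, then reduce that buffer.
        staging = inp.clone()
        dist.all_reduce(staging)  # stand-in for the custom unregistered-buffer kernel
        return staging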