Commit

update format of codes
Signed-off-by: zhenwei <[email protected]>
zhenwei-intel committed Jan 22, 2025
1 parent 65afb1f commit cd75854
Showing 7 changed files with 28 additions and 27 deletions.
1 change: 1 addition & 0 deletions vllm/attention/backends/hpu_attn.py
@@ -75,6 +75,7 @@ class HPUAttentionMetadata(HPUPagedAttentionMetadata, AttentionMetadata):
     seq_lens_tensor: Optional[torch.Tensor]
     context_lens_tensor: Optional[torch.Tensor]

+
 class HPUAttentionImpl(AttentionImpl, torch.nn.Module):
     """
     If the input tensors contain prompt tokens, the layout is as follows:
1 change: 1 addition & 0 deletions vllm/attention/ops/hpu_paged_attn.py
@@ -23,6 +23,7 @@ class HPUPagedAttentionMetadata:
     block_scales: Optional[torch.Tensor]
     block_groups: Optional[torch.Tensor]

+
 class HPUPagedAttention:

     @staticmethod
2 changes: 1 addition & 1 deletion vllm/executor/multiproc_worker_utils.py
@@ -12,9 +12,9 @@

 import torch
 from vllm import envs
-from vllm.platforms import current_platform
 from vllm.config import VllmConfig
 from vllm.logger import init_logger
+from vllm.platforms import current_platform
 from vllm.triton_utils.importing import HAS_TRITON
 from vllm.utils import _check_multiproc_method, get_mp_context, run_method

35 changes: 17 additions & 18 deletions vllm/model_executor/layers/fused_moe/layer.py
@@ -182,25 +182,24 @@ def forward_cpu(
             num_expert_group,
         )

-    def forward_hpu(self,
-                    layer: torch.nn.Module,
-                    x: torch.Tensor,
-                    use_grouped_topk: bool,
-                    top_k: int,
-                    router_logits: torch.Tensor,
-                    renormalize: bool,
-                    topk_group: Optional[int] = None,
-                    num_expert_group: Optional[int] = None,
-                    **kwargs,
-                    ):
-        assert not use_grouped_topk, 'use_grouped_topk must be False on HPU'
-        assert num_expert_group is None, ('num_expert_group is '
-                                          'not supported on HPU')
-        assert topk_group is None, 'topk_group is not supported on HPU'
+    def forward_hpu(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        use_grouped_topk: bool,
+        top_k: int,
+        router_logits: torch.Tensor,
+        renormalize: bool,
+        topk_group: Optional[int] = None,
+        num_expert_group: Optional[int] = None,
+        **kwargs,
+    ):
+        assert not use_grouped_topk, "use_grouped_topk must be False on HPU"
+        assert num_expert_group is None, "num_expert_group is not supported on HPU"

Check failure on line 198 in vllm/model_executor/layers/fused_moe/layer.py (GitHub Actions / pre-commit, Ruff E501): vllm/model_executor/layers/fused_moe/layer.py:198:81: E501 Line too long (83 > 80)

+        assert topk_group is None, "topk_group is not supported on HPU"
         if layer is not None:
-            return layer.hpu_fused_moe(x, layer.w13_weight,
-                                       layer.w2_weight, router_logits,
-                                       top_k)
+            return layer.hpu_fused_moe(x, layer.w13_weight, layer.w2_weight,
+                                       router_logits, top_k)

     def forward_tpu(
         self,
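The Ruff E501 annotation above flags the reformatted num_expert_group assertion as 83 characters, just over the 80-character limit. A minimal sketch of one way such an assertion could be wrapped to satisfy E501; the helper name _check_hpu_moe_args is illustrative and not part of this commit:

from typing import Optional


def _check_hpu_moe_args(topk_group: Optional[int],
                        num_expert_group: Optional[int]) -> None:
    # Wrapping the long message in parentheses keeps each line under the
    # 80-character limit enforced by Ruff's E501 check.
    assert num_expert_group is None, (
        "num_expert_group is not supported on HPU")
    assert topk_group is None, "topk_group is not supported on HPU"


_check_hpu_moe_args(topk_group=None, num_expert_group=None)  # passes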
2 changes: 1 addition & 1 deletion vllm/model_executor/models/mixtral.py
@@ -44,8 +44,8 @@
 from vllm.model_executor.model_loader.weight_utils import (
     default_weight_loader, maybe_remap_kv_scale_name)
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.sequence import IntermediateTensors
 from vllm.platforms import current_platform
+from vllm.sequence import IntermediateTensors
 from .interfaces import SupportsLoRA, SupportsPP
 from .utils import (is_pp_missing_parameter,
                     make_empty_intermediate_tensors_factory, make_layers,
2 changes: 2 additions & 0 deletions vllm/utils.py
@@ -351,10 +351,12 @@ def reset(self):
"""
self._index = 0


@lru_cache(maxsize=None)
def is_fake_hpu() -> bool:
return os.environ.get('VLLM_USE_FAKE_HPU', '0') != '0'


@lru_cache(maxsize=None)
def get_max_shared_memory_bytes(gpu: int = 0) -> int:
"""Returns the maximum shared memory per thread block in bytes."""
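Because is_fake_hpu above is wrapped in lru_cache, the VLLM_USE_FAKE_HPU environment variable is read only on the first call and the cached result is reused afterwards. A small standalone sketch of that behavior; the name fake_hpu_check is illustrative, not from the diff:

import os
from functools import lru_cache


@lru_cache(maxsize=None)
def fake_hpu_check() -> bool:
    # The environment variable is read once; later calls return the cached
    # result even if the variable changes afterwards.
    return os.environ.get('VLLM_USE_FAKE_HPU', '0') != '0'


os.environ['VLLM_USE_FAKE_HPU'] = '1'
print(fake_hpu_check())  # True: first call reads the environment
os.environ['VLLM_USE_FAKE_HPU'] = '0'
print(fake_hpu_check())  # still True: the cached value is reused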
12 changes: 5 additions & 7 deletions vllm/worker/hpu_model_runner.py
@@ -43,8 +43,8 @@
 from vllm.sampling_params import SamplingParams
 from vllm.sequence import (IntermediateTensors, SequenceData,
                            SequenceGroupMetadata)
-from vllm.utils import (bind_kv_cache, is_pin_memory_available,
-                        make_tensor_with_pad, is_fake_hpu)
+from vllm.utils import (bind_kv_cache, is_fake_hpu, is_pin_memory_available,
+                        make_tensor_with_pad)
 from vllm.worker.model_runner_base import (
     ModelRunnerBase, ModelRunnerInputBase,
     _add_attn_metadata_broadcastable_dict,
@@ -362,7 +362,6 @@ def _set_block_mapping(self, metadata, batch_size, device, dtype):
                                      attn_bias=attn_bias)
         return metadata

-
     def _set_block_scales(self, metadata, device):
         block_mapping = metadata.block_mapping
         ones = torch.ones((block_mapping.size(0), ),
@@ -634,6 +633,7 @@ def __init__(
         self._set_gc_threshold()
         self.use_contiguous_pa = os.environ.get('VLLM_CONTIGUOUS_PA',
                                                 'true').lower() == 'true'
+
     def _set_gc_threshold(self) -> None:
         # Read https://docs.python.org/3/library/gc.html#gc.set_threshold
         # for comprehensive description of gc generations.
@@ -941,7 +941,7 @@ def _prepare_prompt(
                                             dtype=torch.long,
                                             device='cpu')
         context_lens_tensor = context_lens_tensor.to(self.device,
-                                                     non_blocking=True)
+                                                     non_blocking=True)
         attn_metadata = self.attn_backend.make_metadata(
             is_prompt=True,
             block_list=None,
@@ -1127,8 +1127,7 @@ def _prepare_decode(
             num_prefill_tokens=0,
             num_decode_tokens=num_decode_tokens,
             slot_mapping=slot_mapping,
-            multi_modal_placeholder_index_maps=None
-        )
+            multi_modal_placeholder_index_maps=None)
         return PrepareDecodeMetadata(input_tokens=input_tokens,
                                      input_positions=input_positions,
                                      attn_metadata=attn_metadata,
@@ -1138,7 +1137,6 @@ def _prepare_decode(
                                      slot_mapping=slot_mapping,
                                      lora_ids=lora_ids)

-
     def prepare_input_tensors(
         self,
         seq_group_metadata_list: List[SequenceGroupMetadata],

0 comments on commit cd75854
