Commit

update format of codes
Signed-off-by: zhenwei <[email protected]>
zhenwei-intel committed Jan 22, 2025
1 parent 65afb1f commit cd75854
Showing 7 changed files with 28 additions and 27 deletions.
1 change: 1 addition & 0 deletions vllm/attention/backends/hpu_attn.py
@@ -75,6 +75,7 @@ class HPUAttentionMetadata(HPUPagedAttentionMetadata, AttentionMetadata):
     seq_lens_tensor: Optional[torch.Tensor]
     context_lens_tensor: Optional[torch.Tensor]

+
 class HPUAttentionImpl(AttentionImpl, torch.nn.Module):
     """
     If the input tensors contain prompt tokens, the layout is as follows:
1 change: 1 addition & 0 deletions vllm/attention/ops/hpu_paged_attn.py
@@ -23,6 +23,7 @@ class HPUPagedAttentionMetadata:
     block_scales: Optional[torch.Tensor]
     block_groups: Optional[torch.Tensor]

+
 class HPUPagedAttention:

     @staticmethod
2 changes: 1 addition & 1 deletion vllm/executor/multiproc_worker_utils.py
@@ -12,9 +12,9 @@

 import torch
 from vllm import envs
-from vllm.platforms import current_platform
 from vllm.config import VllmConfig
 from vllm.logger import init_logger
+from vllm.platforms import current_platform
 from vllm.triton_utils.importing import HAS_TRITON
 from vllm.utils import _check_multiproc_method, get_mp_context, run_method

35 changes: 17 additions & 18 deletions vllm/model_executor/layers/fused_moe/layer.py
@@ -182,25 +182,24 @@ def forward_cpu(
             num_expert_group,
         )

-    def forward_hpu(self,
-                    layer: torch.nn.Module,
-                    x: torch.Tensor,
-                    use_grouped_topk: bool,
-                    top_k: int,
-                    router_logits: torch.Tensor,
-                    renormalize: bool,
-                    topk_group: Optional[int] = None,
-                    num_expert_group: Optional[int] = None,
-                    **kwargs,
-                    ):
-        assert not use_grouped_topk, 'use_grouped_topk must be False on HPU'
-        assert num_expert_group is None, ('num_expert_group is '
-                                          'not supported on HPU')
-        assert topk_group is None, 'topk_group is not supported on HPU'
+    def forward_hpu(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        use_grouped_topk: bool,
+        top_k: int,
+        router_logits: torch.Tensor,
+        renormalize: bool,
+        topk_group: Optional[int] = None,
+        num_expert_group: Optional[int] = None,
+        **kwargs,
+    ):
+        assert not use_grouped_topk, "use_grouped_topk must be False on HPU"
+        assert num_expert_group is None, "num_expert_group is not supported on HPU"

Check failure on line 198 in vllm/model_executor/layers/fused_moe/layer.py (GitHub Actions / pre-commit, Ruff E501): vllm/model_executor/layers/fused_moe/layer.py:198:81: E501 Line too long (83 > 80)

+        assert topk_group is None, "topk_group is not supported on HPU"
         if layer is not None:
-            return layer.hpu_fused_moe(x, layer.w13_weight,
-                                       layer.w2_weight, router_logits,
-                                       top_k)
+            return layer.hpu_fused_moe(x, layer.w13_weight, layer.w2_weight,
+                                       router_logits, top_k)

     def forward_tpu(
         self,
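The Ruff E501 annotation above flags the reformatted num_expert_group assertion as 83 characters, just over the 80-character limit. A minimal sketch of one way such an assertion could be wrapped to satisfy E501; the helper name _check_hpu_moe_args is illustrative and not part of this commit:

from typing import Optional


def _check_hpu_moe_args(topk_group: Optional[int],
                        num_expert_group: Optional[int]) -> None:
    # Wrapping the long message in parentheses keeps each line under the
    # 80-character limit enforced by Ruff's E501 check.
    assert num_expert_group is None, (
        "num_expert_group is not supported on HPU")
    assert topk_group is None, "topk_group is not supported on HPU"


_check_hpu_moe_args(topk_group=None, num_expert_group=None)  # passes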
2 changes: 1 addition & 1 deletion vllm/model_executor/models/mixtral.py
@@ -44,8 +44,8 @@
 from vllm.model_executor.model_loader.weight_utils import (
     default_weight_loader, maybe_remap_kv_scale_name)
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.sequence import IntermediateTensors
 from vllm.platforms import current_platform
+from vllm.sequence import IntermediateTensors
 from .interfaces import SupportsLoRA, SupportsPP
 from .utils import (is_pp_missing_parameter,
                     make_empty_intermediate_tensors_factory, make_layers,
2 changes: 2 additions & 0 deletions vllm/utils.py
@@ -351,10 +351,12 @@ def reset(self):
"""
self._index = 0


@lru_cache(maxsize=None)
def is_fake_hpu() -> bool:
return os.environ.get('VLLM_USE_FAKE_HPU', '0') != '0'


@lru_cache(maxsize=None)
def get_max_shared_memory_bytes(gpu: int = 0) -> int:
"""Returns the maximum shared memory per thread block in bytes."""
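Because is_fake_hpu above is wrapped in lru_cache, the VLLM_USE_FAKE_HPU environment variable is read only on the first call and the cached result is reused afterwards. A small standalone sketch of that behavior; the name fake_hpu_check is illustrative, not from the diff:

import os
from functools import lru_cache


@lru_cache(maxsize=None)
def fake_hpu_check() -> bool:
    # The environment variable is read once; later calls return the cached
    # result even if the variable changes afterwards.
    return os.environ.get('VLLM_USE_FAKE_HPU', '0') != '0'


os.environ['VLLM_USE_FAKE_HPU'] = '1'
print(fake_hpu_check())  # True: first call reads the environment
os.environ['VLLM_USE_FAKE_HPU'] = '0'
print(fake_hpu_check())  # still True: the cached value is reused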
12 changes: 5 additions & 7 deletions vllm/worker/hpu_model_runner.py
@@ -43,8 +43,8 @@
 from vllm.sampling_params import SamplingParams
 from vllm.sequence import (IntermediateTensors, SequenceData,
                            SequenceGroupMetadata)
-from vllm.utils import (bind_kv_cache, is_pin_memory_available,
-                        make_tensor_with_pad, is_fake_hpu)
+from vllm.utils import (bind_kv_cache, is_fake_hpu, is_pin_memory_available,
+                        make_tensor_with_pad)
 from vllm.worker.model_runner_base import (
     ModelRunnerBase, ModelRunnerInputBase,
     _add_attn_metadata_broadcastable_dict,
@@ -362,7 +362,6 @@ def _set_block_mapping(self, metadata, batch_size, device, dtype):
                                      attn_bias=attn_bias)
         return metadata

-
     def _set_block_scales(self, metadata, device):
         block_mapping = metadata.block_mapping
         ones = torch.ones((block_mapping.size(0), ),
@@ -634,6 +633,7 @@ def __init__(
         self._set_gc_threshold()
         self.use_contiguous_pa = os.environ.get('VLLM_CONTIGUOUS_PA',
                                                 'true').lower() == 'true'
+
     def _set_gc_threshold(self) -> None:
         # Read https://docs.python.org/3/library/gc.html#gc.set_threshold
         # for comprehensive description of gc generations.
@@ -941,7 +941,7 @@ def _prepare_prompt(
                                             dtype=torch.long,
                                             device='cpu')
         context_lens_tensor = context_lens_tensor.to(self.device,
-                                                     non_blocking=True)
+                                                     non_blocking=True)
         attn_metadata = self.attn_backend.make_metadata(
             is_prompt=True,
             block_list=None,
@@ -1127,8 +1127,7 @@ def _prepare_decode(
             num_prefill_tokens=0,
             num_decode_tokens=num_decode_tokens,
             slot_mapping=slot_mapping,
-            multi_modal_placeholder_index_maps=None
-        )
+            multi_modal_placeholder_index_maps=None)
         return PrepareDecodeMetadata(input_tokens=input_tokens,
                                      input_positions=input_positions,
                                      attn_metadata=attn_metadata,
@@ -1138,7 +1137,6 @@ def _prepare_decode(
                                      slot_mapping=slot_mapping,
                                      lora_ids=lora_ids)

-
     def prepare_input_tensors(
         self,
         seq_group_metadata_list: List[SequenceGroupMetadata],

0 comments on commit cd75854
