Commit 7908e9b: format
gshtras committed Dec 19, 2024
1 parent a283f40 commit 7908e9b
Showing 3 changed files with 7 additions and 6 deletions.
vllm/envs.py (6 changes: 4 additions & 2 deletions)
@@ -534,11 +534,13 @@ def get_default_config_root():
"Q_SCALE_CONSTANT":
lambda: int(os.getenv("Q_SCALE_CONSTANT", "20")),

# Divisor for dynamic key scale factor calculation for FP8 KV Cache and attention
# Divisor for dynamic key scale factor calculation
# for FP8 KV Cache and attention
"K_SCALE_CONSTANT":
lambda: int(os.getenv("K_SCALE_CONSTANT", "20")),

# Divisor for dynamic value scale factor calculation for FP8 KV Cache and attention
# Divisor for dynamic value scale factor calculation
# for FP8 KV Cache and attention
"V_SCALE_CONSTANT":
lambda: int(os.getenv("V_SCALE_CONSTANT", "10")),

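For context, these constants divide into vLLM's dynamically computed FP8 scale factors for attention inputs. A minimal sketch of how such a divisor can enter the calculation (the function name and exact formula here are illustrative assumptions, not vLLM's implementation):

```python
import os

import torch

K_SCALE_CONSTANT = int(os.getenv("K_SCALE_CONSTANT", "20"))


def dynamic_k_scale(key: torch.Tensor) -> torch.Tensor:
    # Hypothetical sketch: derive a single per-tensor scale from the key
    # tensor's maximum magnitude, divided by the configured constant.
    return torch.abs(key).max().float() / K_SCALE_CONSTANT
```

Keeping Q, K, and V divisors as separate environment variables lets each tensor's dynamic scale be tuned independently, which is presumably why the value constant defaults to 10 while the query and key constants default to 20.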
vllm/model_executor/layers/quantization/kv_cache.py (5 changes: 2 additions & 3 deletions)
@@ -68,10 +68,9 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
            v_scale *= 2
        layer.calculate_kv_scales = False

-        if not isinstance(k_scale, float) or not isinstance(
-                v_scale, float):
+        if not isinstance(k_scale, float) or not isinstance(v_scale, float):
            raise ValueError("Only support per-tensor scaling factor "
-                "for fp8 KV cache")
+                             "for fp8 KV cache")

        # These are used in the final Attention.forward()
        layer._k_scale.copy_(k_scale)
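The reformatted guard enforces that FP8 KV-cache scales arrive as plain Python floats, i.e. one scalar per tensor. A standalone sketch of the same check (the wrapper function is added here for illustration):

```python
def validate_kv_scales(k_scale, v_scale) -> None:
    # Per-tensor scaling means a single scalar for each cache tensor; a
    # per-channel tensor or list of scales fails the isinstance checks.
    if not isinstance(k_scale, float) or not isinstance(v_scale, float):
        raise ValueError("Only support per-tensor scaling factor "
                         "for fp8 KV cache")


validate_kv_scales(0.02, 0.01)      # passes: scalar per-tensor scales
# validate_kv_scales([0.02], 0.01)  # would raise ValueError
```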
vllm/model_executor/models/llama.py (2 changes: 1 addition & 1 deletion)
@@ -201,7 +201,7 @@ def __init__(
        self.attn_fp8 = envs.VLLM_USE_ROCM_FP8_ATTN \
                        and current_platform.is_rocm() \
                        and not is_navi() \
-            and isinstance(quant_config, Fp8Config)
+                        and isinstance(quant_config, Fp8Config)

        self.attn = Attention(
            self.num_heads,
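The changed line is the tail of a backslash-continued boolean: ROCm FP8 attention is enabled only when the environment flag is set, the platform is ROCm but not Navi, and the model is quantized with Fp8Config. A parenthesized form of the same predicate avoids backslashes entirely, so a formatter never has to realign trailing lines; here is a runnable sketch with stubbed stand-ins for the vLLM internals (the stubs are assumptions, not vLLM's API):

```python
class Fp8Config:           # stand-in for vllm's Fp8Config
    pass


class _Platform:           # stand-in for vllm's current_platform
    @staticmethod
    def is_rocm() -> bool:
        return True


def is_navi() -> bool:     # stand-in for the ROCm Navi-GPU check
    return False


current_platform = _Platform()
VLLM_USE_ROCM_FP8_ATTN = True
quant_config = Fp8Config()

# Parenthesized form of the predicate from the diff: no backslash
# continuations, so indentation of the trailing lines cannot drift.
attn_fp8 = (VLLM_USE_ROCM_FP8_ATTN
            and current_platform.is_rocm()
            and not is_navi()
            and isinstance(quant_config, Fp8Config))
print(attn_fp8)  # True under these stubbed conditions
```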
