From 1f947b59d9422b2c2c5db321ad9bc86055749347 Mon Sep 17 00:00:00 2001 From: Hui Liu <96135754+hliuca@users.noreply.github.com> Date: Thu, 12 Dec 2024 13:23:35 -0600 Subject: [PATCH] disable triton FA by default --- vllm/envs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/envs.py b/vllm/envs.py index 3e458f9b6f25f..0e565c338d65b 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -14,7 +14,7 @@ VLLM_ROCM_PREFER_TORCH: bool = False VLLM_ROCM_PREFER_TRITON: bool = True VLLM_USE_SDPA_ATTENTION: bool = False - VLLM_USE_TRITON_FLASH_ATTN: bool = True + VLLM_USE_TRITON_FLASH_ATTN: bool = False VLLM_USE_ROCM_SKINNY_GEMM: bool = True VLLM_USE_ROCM_CUSTOM_PAGED_ATTN: bool = True VLLM_USE_ROCM_CUSTOM_PAGED_ATTN_FP8_OUT: bool = True @@ -227,7 +227,7 @@ def get_default_config_root(): # flag to control if vllm should use triton flash attention "VLLM_USE_TRITON_FLASH_ATTN": - lambda: (os.environ.get("VLLM_USE_TRITON_FLASH_ATTN", "True").lower() in + lambda: (os.environ.get("VLLM_USE_TRITON_FLASH_ATTN", "False").lower() in ("true", "1")), # Internal flag to enable Dynamo fullgraph capture