diff --git a/vllm/envs.py b/vllm/envs.py index 3e458f9b6f25f..0e565c338d65b 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -14,7 +14,7 @@ VLLM_ROCM_PREFER_TORCH: bool = False VLLM_ROCM_PREFER_TRITON: bool = True VLLM_USE_SDPA_ATTENTION: bool = False - VLLM_USE_TRITON_FLASH_ATTN: bool = True + VLLM_USE_TRITON_FLASH_ATTN: bool = False VLLM_USE_ROCM_SKINNY_GEMM: bool = True VLLM_USE_ROCM_CUSTOM_PAGED_ATTN: bool = True VLLM_USE_ROCM_CUSTOM_PAGED_ATTN_FP8_OUT: bool = True @@ -227,7 +227,7 @@ def get_default_config_root(): # flag to control if vllm should use triton flash attention "VLLM_USE_TRITON_FLASH_ATTN": - lambda: (os.environ.get("VLLM_USE_TRITON_FLASH_ATTN", "True").lower() in + lambda: (os.environ.get("VLLM_USE_TRITON_FLASH_ATTN", "False").lower() in ("true", "1")), # Internal flag to enable Dynamo fullgraph capture