chore: enable fast attention for Qwen2-1.5B-Instruct model (#2592)
Explicitly enable fast attention for the Qwen2-1.5B-Instruct model. It has been our recommended default since version 0.13, but it exhibits problems in certain CUDA environments, which forcing fast attention works around; for details, see #2550. Once the upstream issues are resolved, we should remove this workaround.
wsxiaoys authored Jul 8, 2024
1 parent 6749399 commit 9ab1466
Showing 3 changed files with 13 additions and 6 deletions.
13 changes: 10 additions & 3 deletions crates/llama-cpp-server/src/lib.rs
@@ -151,13 +151,20 @@ pub async fn create_chat_completion(config: &LocalModelConfig) -> Arc<dyn ChatCo
     let chat_template = info
         .chat_template
         .unwrap_or_else(|| panic!("Chat model requires specifying prompt template"));
+
+    let mut enable_fast_attention = config.enable_fast_attention;
+    // FIXME(wsxiaoys): Explicitly enable fast attention for Qwen2-1.5B-Instruct, as it has been our recommended default since version 0.13. However, it presents some problems in certain CUDA environments. For details, see https://github.com/TabbyML/tabby/issues/2550.
+    // Once upstream issues are resolved, we should remove this workaround.
+    if config.model_id.ends_with("Qwen2-1.5B-Instruct") {
+        enable_fast_attention = Some(true);
+    }
     Arc::new(
         ChatCompletionServer::new(
             config.num_gpu_layers,
             &model_path,
             config.parallelism,
             chat_template,
-            config.enable_fast_attention,
+            enable_fast_attention.unwrap_or_default(),
         )
         .await,
     )
@@ -173,7 +180,7 @@ pub async fn create_completion(
             config.num_gpu_layers,
             &model_path,
             config.parallelism,
-            config.enable_fast_attention,
+            config.enable_fast_attention.unwrap_or_default(),
         )
         .await,
     );
@@ -191,7 +198,7 @@ pub async fn create_embedding(config: &ModelConfig) -> Arc<dyn Embedding> {
             llama.num_gpu_layers,
             &model_path,
             llama.parallelism,
-            llama.enable_fast_attention,
+            llama.enable_fast_attention.unwrap_or_default(),
        )
        .await,
    )
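
The net effect in lib.rs: the chat path can override the user's setting for Qwen2-1.5B-Instruct, while all paths fall back to false when the option is unset. A minimal, self-contained sketch of that resolution logic (resolve_fast_attention is a hypothetical helper for illustration, not a function in this commit):

    // Hypothetical helper mirroring the override added to create_chat_completion.
    fn resolve_fast_attention(model_id: &str, configured: Option<bool>) -> bool {
        // Workaround for https://github.com/TabbyML/tabby/issues/2550: force fast
        // attention on for Qwen2-1.5B-Instruct, even over an explicit `false`.
        if model_id.ends_with("Qwen2-1.5B-Instruct") {
            return true;
        }
        // Otherwise honor the configured value; None falls back to bool::default(),
        // i.e. false, matching the `unwrap_or_default()` calls in the diff.
        configured.unwrap_or_default()
    }

    fn main() {
        assert!(resolve_fast_attention("Qwen2-1.5B-Instruct", Some(false)));
        assert!(!resolve_fast_attention("Nomic-Embed-Text", None));
    }
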
4 changes: 2 additions & 2 deletions crates/tabby-common/src/config.rs
@@ -176,7 +176,7 @@ fn default_embedding_config() -> ModelConfig {
         model_id: "Nomic-Embed-Text".into(),
         parallelism: 1,
         num_gpu_layers: 9999,
-        enable_fast_attention: false,
+        enable_fast_attention: None,
     })
 }
@@ -245,7 +245,7 @@ pub struct LocalModelConfig {
     pub num_gpu_layers: u16,

     #[serde(default)]
-    pub enable_fast_attention: bool,
+    pub enable_fast_attention: Option<bool>,
 }

 fn default_parallelism() -> u8 {
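
With the field changed from bool to Option<bool>, a missing key now deserializes to None instead of false, so downstream code can tell "unset" apart from an explicit false. A minimal sketch of that behavior, assuming the serde and toml crates (the struct is a trimmed stand-in for the real LocalModelConfig):

    use serde::Deserialize;

    #[derive(Debug, Deserialize)]
    struct LocalModelConfig {
        // With #[serde(default)], a missing key yields None rather than an error.
        #[serde(default)]
        enable_fast_attention: Option<bool>,
    }

    fn main() {
        // Key absent: None, so the server decides the effective default later.
        let unset: LocalModelConfig = toml::from_str("").unwrap();
        assert_eq!(unset.enable_fast_attention, None);

        // Key present: the user's explicit choice survives as Some(..).
        let off: LocalModelConfig = toml::from_str("enable_fast_attention = false").unwrap();
        assert_eq!(off.enable_fast_attention, Some(false));
    }

Note that for Qwen2-1.5B-Instruct even Some(false) is currently overridden to true by the workaround in lib.rs.
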
2 changes: 1 addition & 1 deletion crates/tabby/src/main.rs
@@ -135,6 +135,6 @@ fn to_local_config(model: &str, parallelism: u8, device: &Device) -> ModelConfig
         model_id: model.to_owned(),
         parallelism,
         num_gpu_layers,
-        enable_fast_attention: false,
+        enable_fast_attention: None,
     })
 }
