chore: enable fast attention for Qwen2-1.5B-Instruct model (#2592)
Explicitly enable fast attention for the Qwen2-1.5B-Instruct model. It has been our recommended default since version 0.13, but it exhibits problems in certain CUDA environments, which forcing fast attention works around; for details, see #2550. Once the upstream issues are resolved, we should remove this workaround.
wsxiaoys authored Jul 8, 2024
1 parent 6749399 commit 9ab1466
Showing 3 changed files with 13 additions and 6 deletions.
13 changes: 10 additions & 3 deletions crates/llama-cpp-server/src/lib.rs
@@ -151,13 +151,20 @@ pub async fn create_chat_completion(config: &LocalModelConfig) -> Arc<dyn ChatCo
     let chat_template = info
         .chat_template
         .unwrap_or_else(|| panic!("Chat model requires specifying prompt template"));
+
+    let mut enable_fast_attention = config.enable_fast_attention;
+    // FIXME(wsxiaoys): Explicitly enable fast attention for Qwen2-1.5B-Instruct, as it has been our recommended default since version 0.13. However, it presents some problems in certain CUDA environments. For details, see https://github.com/TabbyML/tabby/issues/2550.
+    // Once upstream issues are resolved, we should remove this workaround.
+    if config.model_id.ends_with("Qwen2-1.5B-Instruct") {
+        enable_fast_attention = Some(true);
+    }
     Arc::new(
         ChatCompletionServer::new(
             config.num_gpu_layers,
             &model_path,
             config.parallelism,
             chat_template,
-            config.enable_fast_attention,
+            enable_fast_attention.unwrap_or_default(),
         )
         .await,
     )
@@ -173,7 +180,7 @@ pub async fn create_completion(
             config.num_gpu_layers,
             &model_path,
             config.parallelism,
-            config.enable_fast_attention,
+            config.enable_fast_attention.unwrap_or_default(),
         )
         .await,
     );
@@ -191,7 +198,7 @@ pub async fn create_embedding(config: &ModelConfig) -> Arc<dyn Embedding> {
             llama.num_gpu_layers,
             &model_path,
             llama.parallelism,
-            llama.enable_fast_attention,
+            llama.enable_fast_attention.unwrap_or_default(),
        )
        .await,
    )
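
The net effect in lib.rs: the chat path can override the user's setting for Qwen2-1.5B-Instruct, while all paths fall back to false when the option is unset. A minimal, self-contained sketch of that resolution logic (resolve_fast_attention is a hypothetical helper for illustration, not a function in this commit):

    // Hypothetical helper mirroring the override added to create_chat_completion.
    fn resolve_fast_attention(model_id: &str, configured: Option<bool>) -> bool {
        // Workaround for https://github.com/TabbyML/tabby/issues/2550: force fast
        // attention on for Qwen2-1.5B-Instruct, even over an explicit `false`.
        if model_id.ends_with("Qwen2-1.5B-Instruct") {
            return true;
        }
        // Otherwise honor the configured value; None falls back to bool::default(),
        // i.e. false, matching the `unwrap_or_default()` calls in the diff.
        configured.unwrap_or_default()
    }

    fn main() {
        assert!(resolve_fast_attention("Qwen2-1.5B-Instruct", Some(false)));
        assert!(!resolve_fast_attention("Nomic-Embed-Text", None));
    }
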
4 changes: 2 additions & 2 deletions crates/tabby-common/src/config.rs
@@ -176,7 +176,7 @@ fn default_embedding_config() -> ModelConfig {
         model_id: "Nomic-Embed-Text".into(),
         parallelism: 1,
         num_gpu_layers: 9999,
-        enable_fast_attention: false,
+        enable_fast_attention: None,
     })
 }
@@ -245,7 +245,7 @@ pub struct LocalModelConfig {
     pub num_gpu_layers: u16,

     #[serde(default)]
-    pub enable_fast_attention: bool,
+    pub enable_fast_attention: Option<bool>,
 }

 fn default_parallelism() -> u8 {
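
With the field changed from bool to Option<bool>, a missing key now deserializes to None instead of false, so downstream code can tell "unset" apart from an explicit false. A minimal sketch of that behavior, assuming the serde and toml crates (the struct is a trimmed stand-in for the real LocalModelConfig):

    use serde::Deserialize;

    #[derive(Debug, Deserialize)]
    struct LocalModelConfig {
        // With #[serde(default)], a missing key yields None rather than an error.
        #[serde(default)]
        enable_fast_attention: Option<bool>,
    }

    fn main() {
        // Key absent: None, so the server decides the effective default later.
        let unset: LocalModelConfig = toml::from_str("").unwrap();
        assert_eq!(unset.enable_fast_attention, None);

        // Key present: the user's explicit choice survives as Some(..).
        let off: LocalModelConfig = toml::from_str("enable_fast_attention = false").unwrap();
        assert_eq!(off.enable_fast_attention, Some(false));
    }

Note that for Qwen2-1.5B-Instruct even Some(false) is currently overridden to true by the workaround in lib.rs.
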
2 changes: 1 addition & 1 deletion crates/tabby/src/main.rs
@@ -135,6 +135,6 @@ fn to_local_config(model: &str, parallelism: u8, device: &Device) -> ModelConfig
         model_id: model.to_owned(),
         parallelism,
         num_gpu_layers,
-        enable_fast_attention: false,
+        enable_fast_attention: None,
     })
 }
