diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index c37cf5c6390f..5b1cef0ff83f 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -111,7 +111,7 @@ jobs:
       - run: bash ./ci/prepare_build_environment.sh
 
       - name: Bulid release binary
-        run: cargo build --no-default-features --release --target ${{ matrix.target }}
+        run: cargo build --no-default-features --release --target ${{ matrix.target }} --package tabby
 
       - name: Rename release binary
         run: mv target/${{ matrix.target }}/release/tabby tabby_${{ matrix.target }}
diff --git a/CHANGELOG.md b/CHANGELOG.md
index fc32b6a58dfb..744449f2dafb 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,6 +3,8 @@
 ## Features
 
 ## Fixes and Improvements
+* Switch cpu backend to llama.cpp: https://github.com/TabbyML/tabby/pull/638
+* add `server.completion_timeout` to control the code completion interface timeout: https://github.com/TabbyML/tabby/pull/637
 
 # v0.4.0
 
diff --git a/crates/llama-cpp-bindings/include/engine.h b/crates/llama-cpp-bindings/include/engine.h
index 834a1d753f81..fffc0a25586f 100644
--- a/crates/llama-cpp-bindings/include/engine.h
+++ b/crates/llama-cpp-bindings/include/engine.h
@@ -16,5 +16,5 @@ class TextInferenceEngine {
   virtual uint32_t eos_token() const = 0;
 };
 
-std::unique_ptr<TextInferenceEngine> create_engine(rust::Str model_path);
+std::unique_ptr<TextInferenceEngine> create_engine(bool use_gpu, rust::Str model_path);
 }  // namespace
diff --git a/crates/llama-cpp-bindings/src/engine.cc b/crates/llama-cpp-bindings/src/engine.cc
index abcaee0e28db..7f3f2986cd2a 100644
--- a/crates/llama-cpp-bindings/src/engine.cc
+++ b/crates/llama-cpp-bindings/src/engine.cc
@@ -114,11 +114,11 @@ struct BackendInitializer {
 };
 }  // namespace
 
-std::unique_ptr<TextInferenceEngine> create_engine(rust::Str model_path) {
+std::unique_ptr<TextInferenceEngine> create_engine(bool use_gpu, rust::Str model_path) {
   static BackendInitializer initializer;
 
   llama_model_params model_params = llama_model_default_params();
-  model_params.n_gpu_layers = 1;
+  model_params.n_gpu_layers = use_gpu ? 1 : 0;
   llama_model* model = llama_load_model_from_file(std::string(model_path).c_str(), model_params);
 
   if (!model) {
diff --git a/crates/llama-cpp-bindings/src/lib.rs b/crates/llama-cpp-bindings/src/lib.rs
index 084280b62c95..53870fc5abd3 100644
--- a/crates/llama-cpp-bindings/src/lib.rs
+++ b/crates/llama-cpp-bindings/src/lib.rs
@@ -15,7 +15,7 @@ mod ffi {
 
         type TextInferenceEngine;
 
-        fn create_engine(model_path: &str) -> UniquePtr<TextInferenceEngine>;
+        fn create_engine(use_gpu: bool, model_path: &str) -> UniquePtr<TextInferenceEngine>;
 
         fn start(self: Pin<&mut TextInferenceEngine>, input_token_ids: &[u32]);
         fn step(self: Pin<&mut TextInferenceEngine>) -> Result<u32>;
@@ -32,6 +32,7 @@ unsafe impl Sync for ffi::TextInferenceEngine {}
 pub struct LlamaEngineOptions {
     model_path: String,
     tokenizer_path: String,
+    use_gpu: bool,
 }
 
 pub struct LlamaEngine {
@@ -42,7 +43,7 @@ pub struct LlamaEngine {
 
 impl LlamaEngine {
     pub fn create(options: LlamaEngineOptions) -> Self {
-        let engine = create_engine(&options.model_path);
+        let engine = create_engine(options.use_gpu, &options.model_path);
         if engine.is_null() {
             panic!("Unable to load model: {}", options.model_path);
         }
diff --git a/crates/tabby/Cargo.toml b/crates/tabby/Cargo.toml
index 87da33f7fca7..37bc80967e69 100644
--- a/crates/tabby/Cargo.toml
+++ b/crates/tabby/Cargo.toml
@@ -4,7 +4,6 @@ version = "0.5.0-dev"
 edition = "2021"
 
 [dependencies]
-ctranslate2-bindings = { path = "../ctranslate2-bindings" }
 tabby-common = { path = "../tabby-common" }
 tabby-scheduler = { path = "../tabby-scheduler" }
 tabby-download = { path = "../tabby-download" }
@@ -43,9 +42,8 @@ minijinja = { version = "1.0.8", features = ["loader"] }
 textdistance = "1.0.2"
 regex.workspace = true
 thiserror.workspace = true
-
-[target.'cfg(all(target_os="macos", target_arch="aarch64"))'.dependencies]
 llama-cpp-bindings = { path = "../llama-cpp-bindings" }
+ctranslate2-bindings = { path = "../ctranslate2-bindings", optional = true }
 
 [dependencies.uuid]
 version = "1.3.3"
@@ -57,6 +55,7 @@ features = [
 
 [features]
 link_shared = ["ctranslate2-bindings/link_shared"]
+link_cuda_static = ["ctranslate2-bindings"]
 
 [build-dependencies]
 vergen = { version = "8.0.0", features = ["build", "git", "gitcl"] }
diff --git a/crates/tabby/src/serve/engine.rs b/crates/tabby/src/serve/engine.rs
index 9eb86f91d767..8675bf32b097 100644
--- a/crates/tabby/src/serve/engine.rs
+++ b/crates/tabby/src/serve/engine.rs
@@ -1,6 +1,5 @@
 use std::path::Path;
 
-use ctranslate2_bindings::{CTranslate2Engine, CTranslate2EngineOptionsBuilder};
 use serde::Deserialize;
 use tabby_common::path::ModelDir;
 use tabby_inference::TextGeneration;
@@ -39,33 +38,36 @@ pub struct EngineInfo {
     pub chat_template: Option<String>,
 }
 
-#[cfg(not(all(target_os = "macos", target_arch = "aarch64")))]
+#[cfg(not(any(feature = "link_shared", feature = "link_cuda_static")))]
 fn create_local_engine(
     args: &crate::serve::ServeArgs,
     model_dir: &ModelDir,
-    metadata: &Metadata,
+    _metadata: &Metadata,
 ) -> Box<dyn TextGeneration> {
-    create_ctranslate2_engine(args, model_dir, metadata)
+    create_ggml_engine(&args.device, model_dir)
 }
 
-#[cfg(all(target_os = "macos", target_arch = "aarch64"))]
+#[cfg(any(feature = "link_shared", feature = "link_cuda_static"))]
 fn create_local_engine(
     args: &crate::serve::ServeArgs,
     model_dir: &ModelDir,
     metadata: &Metadata,
 ) -> Box<dyn TextGeneration> {
-    if args.device != super::Device::Metal {
-        create_ctranslate2_engine(args, model_dir, metadata)
+    if args.device.use_ggml_backend() {
+        create_ggml_engine(&args.device, model_dir)
     } else {
-        create_llama_engine(model_dir)
+        create_ctranslate2_engine(args, model_dir, metadata)
     }
 }
 
+#[cfg(any(feature = "link_shared", feature = "link_cuda_static"))]
 fn create_ctranslate2_engine(
     args: &crate::serve::ServeArgs,
     model_dir: &ModelDir,
     metadata: &Metadata,
 ) -> Box<dyn TextGeneration> {
+    use ctranslate2_bindings::{CTranslate2Engine, CTranslate2EngineOptionsBuilder};
+
     let device = format!("{}", args.device);
     let options = CTranslate2EngineOptionsBuilder::default()
         .model_path(model_dir.ctranslate2_dir())
@@ -78,11 +80,11 @@ fn create_ctranslate2_engine(
     Box::new(CTranslate2Engine::create(options))
 }
 
-#[cfg(all(target_os = "macos", target_arch = "aarch64"))]
-fn create_llama_engine(model_dir: &ModelDir) -> Box<dyn TextGeneration> {
+fn create_ggml_engine(device: &super::Device, model_dir: &ModelDir) -> Box<dyn TextGeneration> {
     let options = llama_cpp_bindings::LlamaEngineOptionsBuilder::default()
         .model_path(model_dir.ggml_q8_0_file())
         .tokenizer_path(model_dir.tokenizer_file())
+        .use_gpu(device.ggml_use_gpu())
         .build()
         .unwrap();
 
@@ -99,6 +101,7 @@ fn get_model_dir(model: &str) -> ModelDir {
 
 #[derive(Deserialize)]
 struct Metadata {
+    #[allow(dead_code)]
     auto_model: String,
     prompt_template: Option<String>,
     chat_template: Option<String>,
diff --git a/crates/tabby/src/serve/mod.rs b/crates/tabby/src/serve/mod.rs
index 4bd6d2c3801f..013a446787e3 100644
--- a/crates/tabby/src/serve/mod.rs
+++ b/crates/tabby/src/serve/mod.rs
@@ -74,7 +74,7 @@ pub enum Device {
     #[strum(serialize = "cpu")]
     Cpu,
 
-    #[strum(serialize = "cuda")]
+    #[cfg(any(feature = "link_shared", feature = "link_cuda_static"))]
     Cuda,
 
     #[cfg(all(target_os = "macos", target_arch = "aarch64"))]
@@ -85,6 +85,28 @@ pub enum Device {
     ExperimentalHttp,
 }
 
+impl Device {
+    #[cfg(all(target_os = "macos", target_arch = "aarch64"))]
+    fn use_ggml_backend(&self) -> bool {
+        *self == Device::Metal || *self == Device::Cpu
+    }
+
+    #[cfg(not(all(target_os = "macos", target_arch = "aarch64")))]
+    fn use_ggml_backend(&self) -> bool {
+        *self == Device::Cpu
+    }
+
+    #[cfg(all(target_os = "macos", target_arch = "aarch64"))]
+    fn ggml_use_gpu(&self) -> bool {
+        *self == Device::Metal
+    }
+
+    #[cfg(not(all(target_os = "macos", target_arch = "aarch64")))]
+    fn ggml_use_gpu(&self) -> bool {
+        false
+    }
+}
+
 #[derive(Args)]
 pub struct ServeArgs {
     /// Model id for `/completions` API endpoint.
@@ -115,16 +137,6 @@ pub struct ServeArgs {
     compute_type: Option<String>,
 }
 
-#[cfg(not(all(target_os = "macos", target_arch = "aarch64")))]
-fn should_download_ggml_files(_device: &Device) -> bool {
-    false
-}
-
-#[cfg(all(target_os = "macos", target_arch = "aarch64"))]
-fn should_download_ggml_files(device: &Device) -> bool {
-    *device == Device::Metal
-}
-
 pub async fn main(config: &Config, args: &ServeArgs) {
     valid_args(args);
 
@@ -275,7 +287,7 @@ fn start_heartbeat(args: &ServeArgs) {
 async fn download_model(model: &str, device: &Device) {
     let downloader = Downloader::new(model, /* prefer_local_file= */ true);
     let handler = |err| fatal!("Failed to fetch model '{}' due to '{}'", model, err,);
-    let download_result = if should_download_ggml_files(device) {
+    let download_result = if device.use_ggml_backend() {
         downloader.download_ggml_files().await
     } else {
         downloader.download_ctranslate2_files().await
diff --git a/website/docs/models/index.md b/website/docs/models/index.md
index 9463efa653f7..f0530d3880b5 100644
--- a/website/docs/models/index.md
+++ b/website/docs/models/index.md
@@ -17,7 +17,6 @@ We recommend using
 | [TabbyML/StarCoder-7B](https://huggingface.co/TabbyML/StarCoder-7B) | [BigCode-OpenRAIL-M](https://huggingface.co/spaces/bigcode/bigcode-model-license-agreement) | ✅ | ✅ |
 | [TabbyML/StarCoder-3B](https://huggingface.co/TabbyML/StarCoder-3B) | [BigCode-OpenRAIL-M](https://huggingface.co/spaces/bigcode/bigcode-model-license-agreement) | ✅ | ✅ |
 | [TabbyML/StarCoder-1B](https://huggingface.co/TabbyML/StarCoder-1B) | [BigCode-OpenRAIL-M](https://huggingface.co/spaces/bigcode/bigcode-model-license-agreement) | ✅ | ✅ |
-| [TabbyML/J-350M](https://huggingface.co/TabbyML/J-350M) | [BSD-3](https://opensource.org/license/bsd-3-clause/) | ❌ | ❌ |
 
 ## Chat models (`--chat-model`)
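
Note: the sketch below is not part of the diff. It is a minimal, standalone illustration of the runtime dispatch this change introduces in `crates/tabby/src/serve/mod.rs`: `Device::use_ggml_backend()` routes CPU (everywhere) and Metal (on Apple Silicon) to the llama.cpp/ggml backend, while `Device::ggml_use_gpu()` decides whether llama.cpp offloads layers to the GPU (`n_gpu_layers = use_gpu ? 1 : 0` in `engine.cc`). The flattened `Device` enum and the `main` driver here are assumptions made for illustration only; in the real crate the `Cuda` and `Metal` variants are gated behind cargo features and platform `cfg` attributes.

```rust
// Hypothetical sketch of the Device -> backend dispatch added in this PR,
// with the cfg-gated variants flattened into a plain enum so it runs anywhere.
#[derive(Debug, PartialEq)]
enum Device {
    Cpu,
    Cuda,  // in the real crate: only compiled with `link_shared`/`link_cuda_static`
    Metal, // in the real crate: only compiled on aarch64 macOS
}

impl Device {
    // llama.cpp (ggml) now serves Cpu on every platform, plus Metal on Apple Silicon.
    fn use_ggml_backend(&self) -> bool {
        matches!(self, Device::Cpu | Device::Metal)
    }

    // Only the Metal device asks llama.cpp to offload layers to the GPU.
    fn ggml_use_gpu(&self) -> bool {
        *self == Device::Metal
    }
}

fn main() {
    for device in [Device::Cpu, Device::Cuda, Device::Metal] {
        let backend = if device.use_ggml_backend() {
            "llama.cpp (ggml)"
        } else {
            "ctranslate2"
        };
        println!(
            "{:?}: backend = {}, ggml offloads to gpu = {}",
            device,
            backend,
            device.ggml_use_gpu()
        );
    }
}
```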