From 05c40e5880034be840ea8476b0f7f867f248420c Mon Sep 17 00:00:00 2001 From: Meng Zhang Date: Mon, 13 May 2024 13:50:46 -0700 Subject: [PATCH 1/5] chore: add llama-cpp-server sub crate --- Cargo.lock | 10 +++++ Cargo.toml | 2 +- crates/llama-cpp-bindings/llama.cpp | 2 +- crates/llama-cpp-server/Cargo.toml | 14 +++++++ crates/llama-cpp-server/src/lib.rs | 63 +++++++++++++++++++++++++++++ 5 files changed, 89 insertions(+), 2 deletions(-) create mode 100644 crates/llama-cpp-server/Cargo.toml create mode 100644 crates/llama-cpp-server/src/lib.rs diff --git a/Cargo.lock b/Cargo.lock index c81a9d81317d..5aff68db2d1f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2773,6 +2773,16 @@ dependencies = [ "tracing", ] +[[package]] +name = "llama-cpp-server" +version = "0.12.0-dev.0" +dependencies = [ + "http-api-bindings", + "serde_json", + "tabby-inference", + "tokio", +] + [[package]] name = "lock_api" version = "0.4.10" diff --git a/Cargo.toml b/Cargo.toml index c63ab8bc12d1..c07031371262 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,7 +13,7 @@ members = [ "ee/tabby-db", "ee/tabby-db-macros", "ee/tabby-search", - "ee/tabby-schema", + "ee/tabby-schema", "crates/llama-cpp-server", ] [workspace.package] diff --git a/crates/llama-cpp-bindings/llama.cpp b/crates/llama-cpp-bindings/llama.cpp index b4e4b8a9351d..9aa672490c84 160000 --- a/crates/llama-cpp-bindings/llama.cpp +++ b/crates/llama-cpp-bindings/llama.cpp @@ -1 +1 @@ -Subproject commit b4e4b8a9351d918a56831c73cf9f25c1837b80d1 +Subproject commit 9aa672490c848e45eaa704a554e0f1f6df995fc8 diff --git a/crates/llama-cpp-server/Cargo.toml b/crates/llama-cpp-server/Cargo.toml new file mode 100644 index 000000000000..55ffbea3c0bb --- /dev/null +++ b/crates/llama-cpp-server/Cargo.toml @@ -0,0 +1,14 @@ +[package] +name = "llama-cpp-server" +version.workspace = true +edition.workspace = true +authors.workspace = true +homepage.workspace = true + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +http-api-bindings = { version = "0.12.0-dev.0", path = "../http-api-bindings" } +serde_json.workspace = true +tabby-inference = { version = "0.12.0-dev.0", path = "../tabby-inference" } +tokio.workspace = true diff --git a/crates/llama-cpp-server/src/lib.rs b/crates/llama-cpp-server/src/lib.rs new file mode 100644 index 000000000000..1f78c632efd8 --- /dev/null +++ b/crates/llama-cpp-server/src/lib.rs @@ -0,0 +1,63 @@ +use std::{process::Stdio, sync::Arc}; + +use serde_json::json; +use tabby_inference::{ChatCompletionStream, CompletionStream, Embedding}; + +struct LlamaCppServer { + process: tokio::process::Child, +} + +const SERVER_PORT: u16 = 30888; + +impl LlamaCppServer { + pub fn new(model_path: &str, use_gpu: bool, parallelism: u8) -> Self { + let mut num_gpu_layers = std::env::var("LLAMA_CPP_N_GPU_LAYERS") + .unwrap_or("9999".into()); + if !use_gpu { + num_gpu_layers = "0".to_string(); + } + let process = tokio::process::Command::new("llama-cpp-server") + .arg("-m") + .arg(model_path) + .arg("--port") + .arg(SERVER_PORT.to_string()) + .arg("-ngl") + .arg(num_gpu_layers) + .arg("-np") + .arg(parallelism.to_string()) + .kill_on_drop(true) + .spawn() + .expect("Failed to spawn llama-cpp-server"); + + Self { process } + } + + pub fn completion(&self, prompt_template: String) -> Arc { + let model_spec: String = serde_json::to_string(&json!({ + "kind": "llama", + "api_endpoint": format!("http://localhost:{SERVER_PORT}"), + "prompt_template": prompt_template, + })) + .expect("Failed to serialize model 
spec"); + let (engine, _, _) = http_api_bindings::create(&model_spec); + engine + } + + pub fn chat(&self) -> Arc { + let model_spec: String = serde_json::to_string(&json!({ + "kind": "openai-chat", + "api_endpoint": format!("http://localhost:{SERVER_PORT}/v1"), + })) + .expect("Failed to serialize model spec"); + http_api_bindings::create_chat(&model_spec) + } + + pub fn embedding(self) -> Arc { + let model_spec: String = serde_json::to_string(&json!({ + "kind": "llama", + "api_endpoint": format!("http://localhost:{SERVER_PORT}"), + })) + .expect("Failed to serialize model spec"); + http_api_bindings::create_embedding(&model_spec) + } +} From bc2d09b3795f9670854581e7ba7369ee888a4ccc Mon Sep 17 00:00:00 2001 From: Meng Zhang Date: Mon, 13 May 2024 14:21:45 -0700 Subject: [PATCH 2/5] chore: add llama-cpp-server to embed llama-server directly --- Cargo.lock | 3 ++ crates/llama-cpp-server/Cargo.toml | 11 ++-- crates/llama-cpp-server/src/lib.rs | 87 +++++++++++++++++++++++++++--- ee/tabby-webserver/Cargo.toml | 2 +- 4 files changed, 92 insertions(+), 11 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 5aff68db2d1f..4561d4bec611 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2777,8 +2777,11 @@ dependencies = [ name = "llama-cpp-server" version = "0.12.0-dev.0" dependencies = [ + "futures", "http-api-bindings", + "reqwest 0.12.4", "serde_json", + "tabby-common", "tabby-inference", "tokio", ] diff --git a/crates/llama-cpp-server/Cargo.toml b/crates/llama-cpp-server/Cargo.toml index 55ffbea3c0bb..9b46e5624482 100644 --- a/crates/llama-cpp-server/Cargo.toml +++ b/crates/llama-cpp-server/Cargo.toml @@ -8,7 +8,12 @@ homepage.workspace = true # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] -http-api-bindings = { version = "0.12.0-dev.0", path = "../http-api-bindings" } +futures.workspace = true +http-api-bindings = { path = "../http-api-bindings" } +reqwest.workspace = true serde_json.workspace = true -tabby-inference = { version = "0.12.0-dev.0", path = "../tabby-inference" } -tokio.workspace = true +tabby-inference = { path = "../tabby-inference" } +tokio = { workspace = true, features = ["process"] } + +[dev-dependencies] +tabby-common = { path = "../tabby-common" } \ No newline at end of file diff --git a/crates/llama-cpp-server/src/lib.rs b/crates/llama-cpp-server/src/lib.rs index 1f78c632efd8..86276922d2a7 100644 --- a/crates/llama-cpp-server/src/lib.rs +++ b/crates/llama-cpp-server/src/lib.rs @@ -1,22 +1,25 @@ -use std::{process::Stdio, sync::Arc}; +use std::{ + process::{ExitStatus, Stdio}, + sync::Arc, +}; use serde_json::json; use tabby_inference::{ChatCompletionStream, CompletionStream, Embedding}; +use tokio::task::JoinHandle; struct LlamaCppServer { - process: tokio::process::Child, + handle: JoinHandle<()>, } const SERVER_PORT: u16 = 30888; impl LlamaCppServer { pub fn new(model_path: &str, use_gpu: bool, parallelism: u8) -> Self { - let mut num_gpu_layers = std::env::var("LLAMA_CPP_N_GPU_LAYERS") - .unwrap_or("9999".into()); + let mut num_gpu_layers = std::env::var("LLAMA_CPP_N_GPU_LAYERS").unwrap_or("9999".into()); if !use_gpu { num_gpu_layers = "0".to_string(); } - let process = tokio::process::Command::new("llama-cpp-server") + let mut process = tokio::process::Command::new("llama-server") .arg("-m") .arg(model_path) .arg("--port") @@ -26,16 +29,41 @@ impl LlamaCppServer { .arg("-np") .arg(parallelism.to_string()) .kill_on_drop(true) + .stderr(Stdio::null()) + .stdout(Stdio::null()) .spawn() .expect("Failed to spawn 
llama-cpp-server"); - Self { process } + let handle = tokio::spawn(async move { + let status_code = process + .wait() + .await + .ok() + .and_then(|s| s.code()) + .unwrap_or(-1); + println!("Exist with exit code {}", status_code); + }); + + Self { handle } + } + + async fn wait_for_health(&self) { + let client = reqwest::Client::new(); + loop { + let Ok(resp) = client.get(api_endpoint() + "/health").send().await else { + continue; + }; + + if resp.status().is_success() { + return; + } + } } pub fn completion(&self, prompt_template: String) -> Arc { let model_spec: String = serde_json::to_string(&json!({ "kind": "llama", - "api_endpoint": format!("http://localhost:{SERVER_PORT}"), + "api_endpoint": api_endpoint(), "prompt_template": prompt_template, })) .expect("Failed to serialize model spec"); @@ -61,3 +89,48 @@ impl LlamaCppServer { http_api_bindings::create_embedding(&model_spec) } } + +fn api_endpoint() -> String { + format!("http://localhost:{SERVER_PORT}") +} + +#[cfg(test)] +mod tests { + use futures::StreamExt; + use tabby_common::registry::{parse_model_id, ModelRegistry}; + use tabby_inference::CompletionOptionsBuilder; + + use super::*; + + #[tokio::test] + #[ignore = "Should only be run in local manual testing"] + async fn test_create_completion() { + let model_id = "StarCoder-1B"; + let (registry, name) = parse_model_id(model_id); + let registry = ModelRegistry::new(registry).await; + let model_path = registry.get_model_path(name).display().to_string(); + let model_info = registry.get_model_info(name); + + let server = LlamaCppServer::new(&model_path, false, 1); + server.wait_for_health().await; + + let completion = server.completion(model_info.prompt_template.clone().unwrap()); + let s = completion + .generate( + "def fib(n):", + CompletionOptionsBuilder::default() + .max_decoding_tokens(7) + .max_input_length(1024) + .sampling_temperature(0.0) + .seed(12345) + .build() + .unwrap(), + ) + .await; + + let content: Vec = s.collect().await; + + let content = content.join(""); + assert_eq!(content, "\n if n <= 1:") + } +} diff --git a/ee/tabby-webserver/Cargo.toml b/ee/tabby-webserver/Cargo.toml index 938091abc781..bdc5603628df 100644 --- a/ee/tabby-webserver/Cargo.toml +++ b/ee/tabby-webserver/Cargo.toml @@ -37,7 +37,7 @@ tabby-schema = { path = "../../ee/tabby-schema" } tabby-db = { path = "../../ee/tabby-db" } tarpc = { version = "0.33.0", features = ["serde-transport"] } thiserror.workspace = true -tokio = { workspace = true, features = ["fs", "process"] } +tokio = { workspace = true, features = ["fs"] } tokio-tungstenite = "0.21" tower = { version = "0.4", features = ["util", "limit"] } tower-http = { workspace = true, features = ["fs", "trace"] } From 8be5d2d17b2c90f814cd602a1c8e837e72022b06 Mon Sep 17 00:00:00 2001 From: Meng Zhang Date: Mon, 13 May 2024 14:26:02 -0700 Subject: [PATCH 3/5] update --- Cargo.lock | 1 + crates/llama-cpp-server/Cargo.toml | 1 + crates/llama-cpp-server/src/lib.rs | 56 +++++++++++++++++++----------- 3 files changed, 37 insertions(+), 21 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 4561d4bec611..fa4d90caad04 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2784,6 +2784,7 @@ dependencies = [ "tabby-common", "tabby-inference", "tokio", + "tracing", ] [[package]] diff --git a/crates/llama-cpp-server/Cargo.toml b/crates/llama-cpp-server/Cargo.toml index 9b46e5624482..f3d7f45e0895 100644 --- a/crates/llama-cpp-server/Cargo.toml +++ b/crates/llama-cpp-server/Cargo.toml @@ -13,6 +13,7 @@ http-api-bindings = { path = "../http-api-bindings" } 
reqwest.workspace = true serde_json.workspace = true tabby-inference = { path = "../tabby-inference" } +tracing.workspace = true tokio = { workspace = true, features = ["process"] } [dev-dependencies] diff --git a/crates/llama-cpp-server/src/lib.rs b/crates/llama-cpp-server/src/lib.rs index 86276922d2a7..016f1be1c33e 100644 --- a/crates/llama-cpp-server/src/lib.rs +++ b/crates/llama-cpp-server/src/lib.rs @@ -6,6 +6,7 @@ use std::{ use serde_json::json; use tabby_inference::{ChatCompletionStream, CompletionStream, Embedding}; use tokio::task::JoinHandle; +use tracing::warn; struct LlamaCppServer { handle: JoinHandle<()>, @@ -19,29 +20,36 @@ impl LlamaCppServer { if !use_gpu { num_gpu_layers = "0".to_string(); } - let mut process = tokio::process::Command::new("llama-server") - .arg("-m") - .arg(model_path) - .arg("--port") - .arg(SERVER_PORT.to_string()) - .arg("-ngl") - .arg(num_gpu_layers) - .arg("-np") - .arg(parallelism.to_string()) - .kill_on_drop(true) - .stderr(Stdio::null()) - .stdout(Stdio::null()) - .spawn() - .expect("Failed to spawn llama-cpp-server"); + let model_path = model_path.to_owned(); let handle = tokio::spawn(async move { - let status_code = process - .wait() - .await - .ok() - .and_then(|s| s.code()) - .unwrap_or(-1); - println!("Exist with exit code {}", status_code); + loop { + let mut process = tokio::process::Command::new("llama-server") + .arg("-m") + .arg(&model_path) + .arg("--port") + .arg(SERVER_PORT.to_string()) + .arg("-ngl") + .arg(&num_gpu_layers) + .arg("-np") + .arg(parallelism.to_string()) + .kill_on_drop(true) + .stderr(Stdio::inherit()) + .stdout(Stdio::inherit()) + .spawn() + .expect("Failed to spawn llama-cpp-server"); + + let status_code = process + .wait() + .await + .ok() + .and_then(|s| s.code()) + .unwrap_or(-1); + + if status_code != 0 { + warn!("llama-server exited with status code {}, restarting...", status_code); + } + } }); Self { handle } @@ -90,6 +98,12 @@ impl LlamaCppServer { } } +impl Drop for LlamaCppServer { + fn drop(&mut self) { + self.handle.abort(); + } +} + fn api_endpoint() -> String { format!("http://localhost:{SERVER_PORT}") } From bb272b0412160775d657b826c0c66ec84d6e67df Mon Sep 17 00:00:00 2001 From: "autofix-ci[bot]" <114827586+autofix-ci[bot]@users.noreply.github.com> Date: Mon, 13 May 2024 21:31:45 +0000 Subject: [PATCH 4/5] [autofix.ci] apply automated fixes --- crates/llama-cpp-server/src/lib.rs | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/crates/llama-cpp-server/src/lib.rs b/crates/llama-cpp-server/src/lib.rs index 016f1be1c33e..1313b86d6f8b 100644 --- a/crates/llama-cpp-server/src/lib.rs +++ b/crates/llama-cpp-server/src/lib.rs @@ -1,5 +1,5 @@ use std::{ - process::{ExitStatus, Stdio}, + process::{Stdio}, sync::Arc, }; @@ -47,7 +47,10 @@ impl LlamaCppServer { .unwrap_or(-1); if status_code != 0 { - warn!("llama-server exited with status code {}, restarting...", status_code); + warn!( + "llama-server exited with status code {}, restarting...", + status_code + ); } } }); From 85b37e65eaf72879bdeb8b0de8a2df284462e948 Mon Sep 17 00:00:00 2001 From: "autofix-ci[bot]" <114827586+autofix-ci[bot]@users.noreply.github.com> Date: Mon, 13 May 2024 21:38:26 +0000 Subject: [PATCH 5/5] [autofix.ci] apply automated fixes (attempt 2/3) --- crates/llama-cpp-server/src/lib.rs | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/crates/llama-cpp-server/src/lib.rs b/crates/llama-cpp-server/src/lib.rs index 1313b86d6f8b..385f2dbc4df5 100644 --- a/crates/llama-cpp-server/src/lib.rs +++ 
b/crates/llama-cpp-server/src/lib.rs @@ -1,7 +1,4 @@ -use std::{ - process::{Stdio}, - sync::Arc, -}; +use std::{process::Stdio, sync::Arc}; use serde_json::json; use tabby_inference::{ChatCompletionStream, CompletionStream, Embedding};
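
For reference, the pieces introduced above compose roughly as in the ignored test from PATCH 2/5. The sketch below restates that flow as a standalone async entry point: the model id ("StarCoder-1B"), the registry calls, and the completion options are taken directly from that test, while the #[tokio::main] wrapper and the comments are illustrative assumptions. As in the test, it needs a llama-server binary on PATH and a locally downloaded model, and it only compiles from inside the llama-cpp-server crate, since LlamaCppServer and wait_for_health are not public in these patches.

use futures::StreamExt;
use tabby_common::registry::{parse_model_id, ModelRegistry};
use tabby_inference::CompletionOptionsBuilder;

#[tokio::main]
async fn main() {
    // Resolve the model on disk through Tabby's model registry (same calls as the test).
    let (registry, name) = parse_model_id("StarCoder-1B");
    let registry = ModelRegistry::new(registry).await;
    let model_path = registry.get_model_path(name).display().to_string();
    let model_info = registry.get_model_info(name);

    // Spawn the embedded llama-server (CPU only, one slot) and block until
    // its /health endpoint answers.
    let server = LlamaCppServer::new(&model_path, false, 1);
    server.wait_for_health().await;

    // Build a completion engine against the local HTTP endpoint and stream
    // a short completion from it.
    let completion = server.completion(model_info.prompt_template.clone().unwrap());
    let stream = completion
        .generate(
            "def fib(n):",
            CompletionOptionsBuilder::default()
                .max_decoding_tokens(7)
                .max_input_length(1024)
                .sampling_temperature(0.0)
                .seed(12345)
                .build()
                .unwrap(),
        )
        .await;

    let content: Vec<String> = stream.collect().await;
    println!("{}", content.join(""));
}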