From 05c40e5880034be840ea8476b0f7f867f248420c Mon Sep 17 00:00:00 2001 From: Meng Zhang Date: Mon, 13 May 2024 13:50:46 -0700 Subject: [PATCH 1/5] chore: add llama-cpp-server sub crate --- Cargo.lock | 10 +++++ Cargo.toml | 2 +- crates/llama-cpp-bindings/llama.cpp | 2 +- crates/llama-cpp-server/Cargo.toml | 14 +++++++ crates/llama-cpp-server/src/lib.rs | 63 +++++++++++++++++++++++++++++ 5 files changed, 89 insertions(+), 2 deletions(-) create mode 100644 crates/llama-cpp-server/Cargo.toml create mode 100644 crates/llama-cpp-server/src/lib.rs diff --git a/Cargo.lock b/Cargo.lock index c81a9d81317d..5aff68db2d1f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2773,6 +2773,16 @@ dependencies = [ "tracing", ] +[[package]] +name = "llama-cpp-server" +version = "0.12.0-dev.0" +dependencies = [ + "http-api-bindings", + "serde_json", + "tabby-inference", + "tokio", +] + [[package]] name = "lock_api" version = "0.4.10" diff --git a/Cargo.toml b/Cargo.toml index c63ab8bc12d1..c07031371262 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,7 +13,7 @@ members = [ "ee/tabby-db", "ee/tabby-db-macros", "ee/tabby-search", - "ee/tabby-schema", + "ee/tabby-schema", "crates/llama-cpp-server", ] [workspace.package] diff --git a/crates/llama-cpp-bindings/llama.cpp b/crates/llama-cpp-bindings/llama.cpp index b4e4b8a9351d..9aa672490c84 160000 --- a/crates/llama-cpp-bindings/llama.cpp +++ b/crates/llama-cpp-bindings/llama.cpp @@ -1 +1 @@ -Subproject commit b4e4b8a9351d918a56831c73cf9f25c1837b80d1 +Subproject commit 9aa672490c848e45eaa704a554e0f1f6df995fc8 diff --git a/crates/llama-cpp-server/Cargo.toml b/crates/llama-cpp-server/Cargo.toml new file mode 100644 index 000000000000..55ffbea3c0bb --- /dev/null +++ b/crates/llama-cpp-server/Cargo.toml @@ -0,0 +1,14 @@ +[package] +name = "llama-cpp-server" +version.workspace = true +edition.workspace = true +authors.workspace = true +homepage.workspace = true + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +http-api-bindings = { version = "0.12.0-dev.0", path = "../http-api-bindings" } +serde_json.workspace = true +tabby-inference = { version = "0.12.0-dev.0", path = "../tabby-inference" } +tokio.workspace = true diff --git a/crates/llama-cpp-server/src/lib.rs b/crates/llama-cpp-server/src/lib.rs new file mode 100644 index 000000000000..1f78c632efd8 --- /dev/null +++ b/crates/llama-cpp-server/src/lib.rs @@ -0,0 +1,63 @@ +use std::{process::Stdio, sync::Arc}; + +use serde_json::json; +use tabby_inference::{ChatCompletionStream, CompletionStream, Embedding}; + +struct LlamaCppServer { + process: tokio::process::Child, +} + +const SERVER_PORT: u16 = 30888; + +impl LlamaCppServer { + pub fn new(model_path: &str, use_gpu: bool, parallelism: u8) -> Self { + let mut num_gpu_layers = std::env::var("LLAMA_CPP_N_GPU_LAYERS") + .unwrap_or("9999".into()); + if !use_gpu { + num_gpu_layers = "0".to_string(); + } + let process = tokio::process::Command::new("llama-cpp-server") + .arg("-m") + .arg(model_path) + .arg("--port") + .arg(SERVER_PORT.to_string()) + .arg("-ngl") + .arg(num_gpu_layers) + .arg("-np") + .arg(parallelism.to_string()) + .kill_on_drop(true) + .spawn() + .expect("Failed to spawn llama-cpp-server"); + + Self { process } + } + + pub fn completion(&self, prompt_template: String) -> Arc { + let model_spec: String = serde_json::to_string(&json!({ + "kind": "llama", + "api_endpoint": format!("http://localhost:{SERVER_PORT}"), + "prompt_template": prompt_template, + })) + .expect("Failed to serialize model 
spec"); + let (engine, _, _) = http_api_bindings::create(&model_spec); + engine + } + + pub fn chat(&self) -> Arc { + let model_spec: String = serde_json::to_string(&json!({ + "kind": "openai-chat", + "api_endpoint": format!("http://localhost:{SERVER_PORT}/v1"), + })) + .expect("Failed to serialize model spec"); + http_api_bindings::create_chat(&model_spec) + } + + pub fn embedding(self) -> Arc { + let model_spec: String = serde_json::to_string(&json!({ + "kind": "llama", + "api_endpoint": format!("http://localhost:{SERVER_PORT}"), + })) + .expect("Failed to serialize model spec"); + http_api_bindings::create_embedding(&model_spec) + } +} From bc2d09b3795f9670854581e7ba7369ee888a4ccc Mon Sep 17 00:00:00 2001 From: Meng Zhang Date: Mon, 13 May 2024 14:21:45 -0700 Subject: [PATCH 2/5] chore: add llama-cpp-server to embed llama-server directly --- Cargo.lock | 3 ++ crates/llama-cpp-server/Cargo.toml | 11 ++-- crates/llama-cpp-server/src/lib.rs | 87 +++++++++++++++++++++++++++--- ee/tabby-webserver/Cargo.toml | 2 +- 4 files changed, 92 insertions(+), 11 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 5aff68db2d1f..4561d4bec611 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2777,8 +2777,11 @@ dependencies = [ name = "llama-cpp-server" version = "0.12.0-dev.0" dependencies = [ + "futures", "http-api-bindings", + "reqwest 0.12.4", "serde_json", + "tabby-common", "tabby-inference", "tokio", ] diff --git a/crates/llama-cpp-server/Cargo.toml b/crates/llama-cpp-server/Cargo.toml index 55ffbea3c0bb..9b46e5624482 100644 --- a/crates/llama-cpp-server/Cargo.toml +++ b/crates/llama-cpp-server/Cargo.toml @@ -8,7 +8,12 @@ homepage.workspace = true # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] -http-api-bindings = { version = "0.12.0-dev.0", path = "../http-api-bindings" } +futures.workspace = true +http-api-bindings = { path = "../http-api-bindings" } +reqwest.workspace = true serde_json.workspace = true -tabby-inference = { version = "0.12.0-dev.0", path = "../tabby-inference" } -tokio.workspace = true +tabby-inference = { path = "../tabby-inference" } +tokio = { workspace = true, features = ["process"] } + +[dev-dependencies] +tabby-common = { path = "../tabby-common" } \ No newline at end of file diff --git a/crates/llama-cpp-server/src/lib.rs b/crates/llama-cpp-server/src/lib.rs index 1f78c632efd8..86276922d2a7 100644 --- a/crates/llama-cpp-server/src/lib.rs +++ b/crates/llama-cpp-server/src/lib.rs @@ -1,22 +1,25 @@ -use std::{process::Stdio, sync::Arc}; +use std::{ + process::{ExitStatus, Stdio}, + sync::Arc, +}; use serde_json::json; use tabby_inference::{ChatCompletionStream, CompletionStream, Embedding}; +use tokio::task::JoinHandle; struct LlamaCppServer { - process: tokio::process::Child, + handle: JoinHandle<()>, } const SERVER_PORT: u16 = 30888; impl LlamaCppServer { pub fn new(model_path: &str, use_gpu: bool, parallelism: u8) -> Self { - let mut num_gpu_layers = std::env::var("LLAMA_CPP_N_GPU_LAYERS") - .unwrap_or("9999".into()); + let mut num_gpu_layers = std::env::var("LLAMA_CPP_N_GPU_LAYERS").unwrap_or("9999".into()); if !use_gpu { num_gpu_layers = "0".to_string(); } - let process = tokio::process::Command::new("llama-cpp-server") + let mut process = tokio::process::Command::new("llama-server") .arg("-m") .arg(model_path) .arg("--port") @@ -26,16 +29,41 @@ impl LlamaCppServer { .arg("-np") .arg(parallelism.to_string()) .kill_on_drop(true) + .stderr(Stdio::null()) + .stdout(Stdio::null()) .spawn() .expect("Failed to spawn 
llama-cpp-server"); - Self { process } + let handle = tokio::spawn(async move { + let status_code = process + .wait() + .await + .ok() + .and_then(|s| s.code()) + .unwrap_or(-1); + println!("Exist with exit code {}", status_code); + }); + + Self { handle } + } + + async fn wait_for_health(&self) { + let client = reqwest::Client::new(); + loop { + let Ok(resp) = client.get(api_endpoint() + "/health").send().await else { + continue; + }; + + if resp.status().is_success() { + return; + } + } } pub fn completion(&self, prompt_template: String) -> Arc { let model_spec: String = serde_json::to_string(&json!({ "kind": "llama", - "api_endpoint": format!("http://localhost:{SERVER_PORT}"), + "api_endpoint": api_endpoint(), "prompt_template": prompt_template, })) .expect("Failed to serialize model spec"); @@ -61,3 +89,48 @@ impl LlamaCppServer { http_api_bindings::create_embedding(&model_spec) } } + +fn api_endpoint() -> String { + format!("http://localhost:{SERVER_PORT}") +} + +#[cfg(test)] +mod tests { + use futures::StreamExt; + use tabby_common::registry::{parse_model_id, ModelRegistry}; + use tabby_inference::CompletionOptionsBuilder; + + use super::*; + + #[tokio::test] + #[ignore = "Should only be run in local manual testing"] + async fn test_create_completion() { + let model_id = "StarCoder-1B"; + let (registry, name) = parse_model_id(model_id); + let registry = ModelRegistry::new(registry).await; + let model_path = registry.get_model_path(name).display().to_string(); + let model_info = registry.get_model_info(name); + + let server = LlamaCppServer::new(&model_path, false, 1); + server.wait_for_health().await; + + let completion = server.completion(model_info.prompt_template.clone().unwrap()); + let s = completion + .generate( + "def fib(n):", + CompletionOptionsBuilder::default() + .max_decoding_tokens(7) + .max_input_length(1024) + .sampling_temperature(0.0) + .seed(12345) + .build() + .unwrap(), + ) + .await; + + let content: Vec = s.collect().await; + + let content = content.join(""); + assert_eq!(content, "\n if n <= 1:") + } +} diff --git a/ee/tabby-webserver/Cargo.toml b/ee/tabby-webserver/Cargo.toml index 938091abc781..bdc5603628df 100644 --- a/ee/tabby-webserver/Cargo.toml +++ b/ee/tabby-webserver/Cargo.toml @@ -37,7 +37,7 @@ tabby-schema = { path = "../../ee/tabby-schema" } tabby-db = { path = "../../ee/tabby-db" } tarpc = { version = "0.33.0", features = ["serde-transport"] } thiserror.workspace = true -tokio = { workspace = true, features = ["fs", "process"] } +tokio = { workspace = true, features = ["fs"] } tokio-tungstenite = "0.21" tower = { version = "0.4", features = ["util", "limit"] } tower-http = { workspace = true, features = ["fs", "trace"] } From 8be5d2d17b2c90f814cd602a1c8e837e72022b06 Mon Sep 17 00:00:00 2001 From: Meng Zhang Date: Mon, 13 May 2024 14:26:02 -0700 Subject: [PATCH 3/5] update --- Cargo.lock | 1 + crates/llama-cpp-server/Cargo.toml | 1 + crates/llama-cpp-server/src/lib.rs | 56 +++++++++++++++++++----------- 3 files changed, 37 insertions(+), 21 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 4561d4bec611..fa4d90caad04 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2784,6 +2784,7 @@ dependencies = [ "tabby-common", "tabby-inference", "tokio", + "tracing", ] [[package]] diff --git a/crates/llama-cpp-server/Cargo.toml b/crates/llama-cpp-server/Cargo.toml index 9b46e5624482..f3d7f45e0895 100644 --- a/crates/llama-cpp-server/Cargo.toml +++ b/crates/llama-cpp-server/Cargo.toml @@ -13,6 +13,7 @@ http-api-bindings = { path = "../http-api-bindings" } 
reqwest.workspace = true serde_json.workspace = true tabby-inference = { path = "../tabby-inference" } +tracing.workspace = true tokio = { workspace = true, features = ["process"] } [dev-dependencies] diff --git a/crates/llama-cpp-server/src/lib.rs b/crates/llama-cpp-server/src/lib.rs index 86276922d2a7..016f1be1c33e 100644 --- a/crates/llama-cpp-server/src/lib.rs +++ b/crates/llama-cpp-server/src/lib.rs @@ -6,6 +6,7 @@ use std::{ use serde_json::json; use tabby_inference::{ChatCompletionStream, CompletionStream, Embedding}; use tokio::task::JoinHandle; +use tracing::warn; struct LlamaCppServer { handle: JoinHandle<()>, @@ -19,29 +20,36 @@ impl LlamaCppServer { if !use_gpu { num_gpu_layers = "0".to_string(); } - let mut process = tokio::process::Command::new("llama-server") - .arg("-m") - .arg(model_path) - .arg("--port") - .arg(SERVER_PORT.to_string()) - .arg("-ngl") - .arg(num_gpu_layers) - .arg("-np") - .arg(parallelism.to_string()) - .kill_on_drop(true) - .stderr(Stdio::null()) - .stdout(Stdio::null()) - .spawn() - .expect("Failed to spawn llama-cpp-server"); + let model_path = model_path.to_owned(); let handle = tokio::spawn(async move { - let status_code = process - .wait() - .await - .ok() - .and_then(|s| s.code()) - .unwrap_or(-1); - println!("Exist with exit code {}", status_code); + loop { + let mut process = tokio::process::Command::new("llama-server") + .arg("-m") + .arg(&model_path) + .arg("--port") + .arg(SERVER_PORT.to_string()) + .arg("-ngl") + .arg(&num_gpu_layers) + .arg("-np") + .arg(parallelism.to_string()) + .kill_on_drop(true) + .stderr(Stdio::inherit()) + .stdout(Stdio::inherit()) + .spawn() + .expect("Failed to spawn llama-cpp-server"); + + let status_code = process + .wait() + .await + .ok() + .and_then(|s| s.code()) + .unwrap_or(-1); + + if status_code != 0 { + warn!("llama-server exited with status code {}, restarting...", status_code); + } + } }); Self { handle } @@ -90,6 +98,12 @@ impl LlamaCppServer { } } +impl Drop for LlamaCppServer { + fn drop(&mut self) { + self.handle.abort(); + } +} + fn api_endpoint() -> String { format!("http://localhost:{SERVER_PORT}") } From bb272b0412160775d657b826c0c66ec84d6e67df Mon Sep 17 00:00:00 2001 From: "autofix-ci[bot]" <114827586+autofix-ci[bot]@users.noreply.github.com> Date: Mon, 13 May 2024 21:31:45 +0000 Subject: [PATCH 4/5] [autofix.ci] apply automated fixes --- crates/llama-cpp-server/src/lib.rs | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/crates/llama-cpp-server/src/lib.rs b/crates/llama-cpp-server/src/lib.rs index 016f1be1c33e..1313b86d6f8b 100644 --- a/crates/llama-cpp-server/src/lib.rs +++ b/crates/llama-cpp-server/src/lib.rs @@ -1,5 +1,5 @@ use std::{ - process::{ExitStatus, Stdio}, + process::{Stdio}, sync::Arc, }; @@ -47,7 +47,10 @@ impl LlamaCppServer { .unwrap_or(-1); if status_code != 0 { - warn!("llama-server exited with status code {}, restarting...", status_code); + warn!( + "llama-server exited with status code {}, restarting...", + status_code + ); } } }); From 85b37e65eaf72879bdeb8b0de8a2df284462e948 Mon Sep 17 00:00:00 2001 From: "autofix-ci[bot]" <114827586+autofix-ci[bot]@users.noreply.github.com> Date: Mon, 13 May 2024 21:38:26 +0000 Subject: [PATCH 5/5] [autofix.ci] apply automated fixes (attempt 2/3) --- crates/llama-cpp-server/src/lib.rs | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/crates/llama-cpp-server/src/lib.rs b/crates/llama-cpp-server/src/lib.rs index 1313b86d6f8b..385f2dbc4df5 100644 --- a/crates/llama-cpp-server/src/lib.rs +++ 
b/crates/llama-cpp-server/src/lib.rs @@ -1,7 +1,4 @@ -use std::{ - process::{Stdio}, - sync::Arc, -}; +use std::{process::Stdio, sync::Arc}; use serde_json::json; use tabby_inference::{ChatCompletionStream, CompletionStream, Embedding};
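
For reference, the pieces introduced above compose roughly as in the ignored test from PATCH 2/5. The sketch below restates that flow as a standalone async entry point: the model id ("StarCoder-1B"), the registry calls, and the completion options are taken directly from that test, while the #[tokio::main] wrapper and the comments are illustrative assumptions. As in the test, it needs a llama-server binary on PATH and a locally downloaded model, and it only compiles from inside the llama-cpp-server crate, since LlamaCppServer and wait_for_health are not public in these patches.

use futures::StreamExt;
use tabby_common::registry::{parse_model_id, ModelRegistry};
use tabby_inference::CompletionOptionsBuilder;

#[tokio::main]
async fn main() {
    // Resolve the model on disk through Tabby's model registry (same calls as the test).
    let (registry, name) = parse_model_id("StarCoder-1B");
    let registry = ModelRegistry::new(registry).await;
    let model_path = registry.get_model_path(name).display().to_string();
    let model_info = registry.get_model_info(name);

    // Spawn the embedded llama-server (CPU only, one slot) and block until
    // its /health endpoint answers.
    let server = LlamaCppServer::new(&model_path, false, 1);
    server.wait_for_health().await;

    // Build a completion engine against the local HTTP endpoint and stream
    // a short completion from it.
    let completion = server.completion(model_info.prompt_template.clone().unwrap());
    let stream = completion
        .generate(
            "def fib(n):",
            CompletionOptionsBuilder::default()
                .max_decoding_tokens(7)
                .max_input_length(1024)
                .sampling_temperature(0.0)
                .seed(12345)
                .build()
                .unwrap(),
        )
        .await;

    let content: Vec<String> = stream.collect().await;
    println!("{}", content.join(""));
}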