refactor(inference): use TextGenerationStream trait when possible. #1927

Closed · wants to merge 1 commit

crates/http-api-bindings/src/lib.rs (9 changes: 4 additions & 5 deletions)

@@ -6,9 +6,9 @@
 use openai::OpenAIEngine;
 use openai_chat::OpenAIChatEngine;
 use serde_json::Value;
-use tabby_inference::{chat::ChatCompletionStream, make_text_generation, TextGeneration};
+use tabby_inference::{chat::ChatCompletionStream, TextGenerationStream};

-pub fn create(model: &str) -> (Arc<dyn TextGeneration>, Option<String>, Option<String>) {
+pub fn create(model: &str) -> (impl TextGenerationStream, Option<String>, Option<String>) {
     let params = serde_json::from_str(model).expect("Failed to parse model string");
     let kind = get_param(&params, "kind");
     if kind == "openai" {
@@ -17,9 +17,8 @@
         let api_key = get_optional_param(&params, "api_key");
         let prompt_template = get_optional_param(&params, "prompt_template");
         let chat_template = get_optional_param(&params, "chat_template");
-        let engine =
-            make_text_generation(OpenAIEngine::create(&api_endpoint, &model_name, api_key));
-        (Arc::new(engine), prompt_template, chat_template)
+        let engine = OpenAIEngine::create(&api_endpoint, &model_name, api_key);
+        (engine, prompt_template, chat_template)
     } else {
         panic!("Only openai are supported for http completion");
     }
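
The signature change here is the heart of the refactor: `create` now returns the engine as `impl TextGenerationStream` instead of an eagerly wrapped `Arc<dyn TextGeneration>`, so the call site decides how to own it. A minimal sketch of that pattern, using local stand-in types rather than the real tabby_inference trait:

```rust
// Stand-in trait and engine for illustration; not the real tabby_inference types.
trait TextGen {
    fn next_token(&self) -> Option<String>;
}

struct OpenAiStandIn;

impl TextGen for OpenAiStandIn {
    fn next_token(&self) -> Option<String> {
        Some("token".into())
    }
}

// Returning `impl TextGen` keeps the concrete engine type opaque to callers
// while avoiding an eager `Arc::new` inside the constructor.
fn create() -> impl TextGen {
    OpenAiStandIn
}

fn main() {
    // The caller, not the constructor, now decides how to own the engine.
    let engine: Box<dyn TextGen> = Box::new(create());
    assert_eq!(engine.next_token().as_deref(), Some("token"));
}
```
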
crates/tabby-inference/src/imp.rs (40 changes: 20 additions & 20 deletions)

@@ -18,26 +18,6 @@
             stop_condition_factory: StopConditionFactory::default(),
         }
     }
-}
-
-#[async_trait]
-impl<T: TextGenerationStream> TextGeneration for TextGenerationImpl<T> {
-    async fn generate(&self, prompt: &str, options: TextGenerationOptions) -> String {
-        let prompt = prompt.to_owned();
-        let s = stream! {
-            for await (streaming, text) in self.generate_stream(&prompt, options).await {
-                if !streaming {
-                    yield text;
-                }
-            }
-        };
-
-        if let Some(text) = Box::pin(s).into_future().await.0 {
-            text
-        } else {
-            String::new()
-        }
-    }

     async fn generate_stream(
         &self,
@@ -71,3 +51,23 @@
         Box::pin(s)
     }
 }
+
+#[async_trait]
+impl<T: TextGenerationStream> TextGeneration for TextGenerationImpl<T> {
+    async fn generate(&self, prompt: &str, options: TextGenerationOptions) -> String {
+        let prompt = prompt.to_owned();
+        let s = stream! {
+            for await (streaming, text) in self.generate_stream(&prompt, options).await {
+                if !streaming {
+                    yield text;
+                }
+            }
+        };
+
+        if let Some(text) = Box::pin(s).into_future().await.0 {
+            text
+        } else {
+            String::new()
+        }
+    }
+}
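
The `TextGeneration` impl moves below the inherent `generate_stream` method, but its body is unchanged: it filters the `(streaming, text)` pairs down to the single final item and awaits it. A runnable sketch of that drain-the-stream pattern, assuming the `futures` and `async-stream` crates this file already uses, plus `tokio` (assumed) for the example's runtime:

```rust
use async_stream::stream;
use futures::StreamExt;

#[tokio::main]
async fn main() {
    // Stand-in for generate_stream's (streaming, text) pairs: partial chunks
    // are flagged `true`, the accumulated final text `false`.
    let pairs = futures::stream::iter(vec![
        (true, "hello".to_string()),
        (true, "hello wor".to_string()),
        (false, "hello world".to_string()),
    ]);

    // Keep only the final item, mirroring the `generate` impl above.
    let s = stream! {
        for await (streaming, text) in pairs {
            if !streaming {
                yield text;
            }
        }
    };

    // `into_future()` resolves to (first_item, remaining_stream); the stream
    // must be pinned first because `stream!` produces a !Unpin type.
    let (first, _rest) = Box::pin(s).into_future().await;
    assert_eq!(first.as_deref(), Some("hello world"));
}
```
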
crates/tabby-inference/src/lib.rs (12 changes: 7 additions & 5 deletions)

@@ -41,14 +41,16 @@
     async fn generate(&self, prompt: &str, options: TextGenerationOptions) -> BoxStream<String>;
 }

+#[async_trait]
+impl TextGenerationStream for Box<dyn TextGenerationStream> {
+    async fn generate(&self, prompt: &str, options: TextGenerationOptions) -> BoxStream<String> {
+        self.as_ref().generate(prompt, options).await
+    }
+}
+
 #[async_trait]
 pub trait TextGeneration: Sync + Send {
     async fn generate(&self, prompt: &str, options: TextGenerationOptions) -> String;
-    async fn generate_stream(
-        &self,
-        prompt: &str,
-        options: TextGenerationOptions,
-    ) -> BoxStream<(bool, String)>;
 }

 pub fn make_text_generation(imp: impl TextGenerationStream) -> impl TextGeneration {
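
The new blanket impl makes `Box<dyn TextGenerationStream>` itself satisfy the trait, which is what lets a boxed engine flow into generic consumers such as `make_text_generation`. A self-contained sketch with a stand-in trait (assumes the `async-trait` crate, which this file already uses, plus `tokio` for the example runtime):

```rust
use async_trait::async_trait;

#[async_trait]
trait Gen: Sync + Send {
    async fn generate(&self, prompt: &str) -> String;
}

// Forward the boxed object's calls to the inner implementation; without this
// impl, Box<dyn Gen> would not satisfy an `impl Gen` bound.
#[async_trait]
impl Gen for Box<dyn Gen> {
    async fn generate(&self, prompt: &str) -> String {
        self.as_ref().generate(prompt).await
    }
}

struct Echo;

#[async_trait]
impl Gen for Echo {
    async fn generate(&self, prompt: &str) -> String {
        prompt.to_owned()
    }
}

// Generic bound, analogous to make_text_generation(imp: impl TextGenerationStream).
async fn run(engine: impl Gen) -> String {
    engine.generate("hi").await
}

#[tokio::main]
async fn main() {
    let boxed: Box<dyn Gen> = Box::new(Echo);
    // Compiles only because of the `impl Gen for Box<dyn Gen>` above.
    assert_eq!(run(boxed).await, "hi");
}
```
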
crates/tabby/src/services/completion.rs (12 changes: 7 additions & 5 deletions)

@@ -11,7 +11,9 @@
     },
     languages::get_language,
 };
-use tabby_inference::{TextGeneration, TextGenerationOptions, TextGenerationOptionsBuilder};
+use tabby_inference::{
+    make_text_generation, TextGeneration, TextGenerationOptions, TextGenerationOptionsBuilder,
+};
 use thiserror::Error;
 use utoipa::ToSchema;

@@ -226,20 +228,20 @@
 }

 pub struct CompletionService {
-    engine: Arc<dyn TextGeneration>,
+    engine: Box<dyn TextGeneration>,
     logger: Arc<dyn EventLogger>,
     prompt_builder: completion_prompt::PromptBuilder,
 }

 impl CompletionService {
     fn new(
-        engine: Arc<dyn TextGeneration>,
+        engine: impl TextGeneration + 'static,
         code: Arc<dyn CodeSearch>,
         logger: Arc<dyn EventLogger>,
         prompt_template: Option<String>,
     ) -> Self {
         Self {
-            engine,
+            engine: Box::new(engine),
             prompt_builder: completion_prompt::PromptBuilder::new(prompt_template, Some(code)),
             logger,
         }
@@ -351,5 +353,5 @@
         },
     ) = model::load_text_generation(model, device, parallelism).await;

-    CompletionService::new(engine.clone(), code, logger, prompt_template)
+    CompletionService::new(make_text_generation(engine), code, logger, prompt_template)
 }
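
`CompletionService::new` now takes `impl TextGeneration + 'static` and boxes it once internally, so callers no longer construct an `Arc` themselves. A minimal sketch of that constructor pattern with stand-in types:

```rust
// Stand-in trait and service for illustration; not the real CompletionService.
trait Gen {
    fn generate(&self, prompt: &str) -> String;
}

struct Service {
    engine: Box<dyn Gen>,
}

impl Service {
    // The `'static` bound lets the generic engine live inside the box.
    fn new(engine: impl Gen + 'static) -> Self {
        Self {
            engine: Box::new(engine),
        }
    }

    fn complete(&self, prompt: &str) -> String {
        self.engine.generate(prompt)
    }
}

struct Echo;

impl Gen for Echo {
    fn generate(&self, prompt: &str) -> String {
        prompt.to_owned()
    }
}

fn main() {
    // Callers hand over a concrete engine; no Arc or Box at the call site.
    let svc = Service::new(Echo);
    assert_eq!(svc.complete("fn main"), "fn main");
}
```
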
crates/tabby/src/services/model/chat.rs (20 changes: 4 additions & 16 deletions)

@@ -1,13 +1,10 @@
-use std::sync::Arc;
-
 use anyhow::Result;
-use async_stream::stream;
 use futures::stream::BoxStream;
 use minijinja::{context, Environment};
 use tabby_common::api::chat::Message;
 use tabby_inference::{
     chat::{self, ChatCompletionStream},
-    TextGeneration, TextGenerationOptions, TextGenerationStream,
+    TextGenerationOptions, TextGenerationStream,
 };

 struct ChatPromptBuilder {
@@ -37,7 +34,7 @@
 }

 struct ChatCompletionImpl {
-    engine: Arc<dyn TextGeneration>,
+    engine: Box<dyn TextGenerationStream>,
     prompt_builder: ChatPromptBuilder,
 }

@@ -50,21 +47,12 @@
 #[async_trait::async_trait]
 impl TextGenerationStream for ChatCompletionImpl {
     async fn generate(&self, prompt: &str, options: TextGenerationOptions) -> BoxStream<String> {
-        let prompt = prompt.to_owned();
-        let s = stream! {
-            for await (streaming, text) in self.engine.generate_stream(&prompt, options).await {
-                if streaming {
-                    yield text;
-                }
-            }
-        };
-
-        Box::pin(s)
+        self.engine.generate(prompt, options).await
     }
 }

 pub fn make_chat_completion(
-    engine: Arc<dyn TextGeneration>,
+    engine: Box<dyn TextGenerationStream>,
     prompt_template: String,
 ) -> impl ChatCompletionStream {
     ChatCompletionImpl {
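
With the engine stored as `Box<dyn TextGenerationStream>`, `generate` becomes a one-line delegation and the old re-filtering of `(bool, String)` pairs disappears. The prompt side still goes through minijinja, per the imports above; a small sketch of that templating step (the template string here is invented for illustration, not Tabby's actual chat template):

```rust
use minijinja::{context, Environment};

fn main() {
    let mut env = Environment::new();
    // Register a toy chat template; ChatPromptBuilder does the equivalent
    // with the template string loaded from the model registry.
    env.add_template("chat", "<|user|>{{ message }}<|assistant|>")
        .expect("template should parse");

    let prompt = env
        .get_template("chat")
        .expect("template was registered above")
        .render(context!(message => "Explain the Box<dyn Trait> pattern"))
        .expect("rendering should succeed");

    assert_eq!(
        prompt,
        "<|user|>Explain the Box<dyn Trait> pattern<|assistant|>"
    );
}
```
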
crates/tabby/src/services/model/mod.rs (12 changes: 5 additions & 7 deletions)

@@ -8,9 +8,7 @@
     terminal::{HeaderFormat, InfoMessage},
 };
 use tabby_download::download_model;
-use tabby_inference::{
-    chat::ChatCompletionStream, make_text_generation, TextGeneration, TextGenerationStream,
-};
+use tabby_inference::{chat::ChatCompletionStream, TextGenerationStream};
 use tracing::info;

 use crate::{fatal, Device};
@@ -39,12 +37,12 @@
     model_id: &str,
     device: &Device,
     parallelism: u8,
-) -> (Arc<dyn TextGeneration>, PromptInfo) {
+) -> (Box<dyn TextGenerationStream>, PromptInfo) {
     #[cfg(feature = "experimental-http")]
     if device == &Device::ExperimentalHttp {
         let (engine, prompt_template, chat_template) = http_api_bindings::create(model_id);
         return (
-            engine,
+            Box::new(engine),
             PromptInfo {
                 prompt_template,
                 chat_template,
@@ -61,15 +59,15 @@
             parallelism,
        );
        let engine_info = PromptInfo::read(path.join("tabby.json"));
-        (Arc::new(make_text_generation(engine)), engine_info)
+        (Box::new(engine), engine_info)
     } else {
         let (registry, name) = parse_model_id(model_id);
         let registry = ModelRegistry::new(registry).await;
         let model_path = registry.get_model_path(name).display().to_string();
         let model_info = registry.get_model_info(name);
         let engine = create_ggml_engine(device, &model_path, parallelism);
         (
-            Arc::new(make_text_generation(engine)),
+            Box::new(engine),
             PromptInfo {
                 prompt_template: model_info.prompt_template.clone(),
                 chat_template: model_info.chat_template.clone(),
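
`load_text_generation` returns `Box<dyn TextGenerationStream>` rather than `impl TextGenerationStream` because its branches construct different concrete engines (the experimental HTTP binding versus local and registry ggml engines), and boxing is what unifies them into one return type. A stand-in sketch of that design choice:

```rust
// Local stand-in types, not the real Tabby engines.
trait Gen {
    fn backend(&self) -> &'static str;
}

struct HttpEngine;
struct GgmlEngine;

impl Gen for HttpEngine {
    fn backend(&self) -> &'static str {
        "http"
    }
}

impl Gen for GgmlEngine {
    fn backend(&self) -> &'static str {
        "ggml"
    }
}

// `-> impl Gen` would not compile here: the arms have different concrete
// types, so the trait object erases the difference instead.
fn load(experimental_http: bool) -> Box<dyn Gen> {
    if experimental_http {
        Box::new(HttpEngine)
    } else {
        Box::new(GgmlEngine)
    }
}

fn main() {
    assert_eq!(load(true).backend(), "http");
    assert_eq!(load(false).backend(), "ggml");
}
```
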