
feat(models-http-api): add rate limit in embedding api #3455

Closed · wants to merge 12 commits
38 changes: 28 additions & 10 deletions Cargo.lock


1 change: 1 addition & 0 deletions Cargo.toml
@@ -63,6 +63,7 @@ url = "2.5.0"
 temp_testdir = "0.2"
 git2 = "0.18.3"
 tower-http = "0.5"
+tower = "0.5"
 mime_guess = "2.0.4"
 assert_matches = "1.5"
 insta = "1.34.0"
2 changes: 2 additions & 0 deletions crates/http-api-bindings/Cargo.toml
@@ -18,6 +18,8 @@ tabby-common = { path = "../tabby-common" }
 tabby-inference = { path = "../tabby-inference" }
 ollama-api-bindings = { path = "../ollama-api-bindings" }
 async-openai.workspace = true
+tower = { workspace = true, features = ["limit", "util", "buffer"] }
+tokio = { workspace = true }
 
 [dev-dependencies]
 tokio = { workspace = true, features = ["rt", "macros"] }
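
The new `tower` dependency (with the `limit` and `util` features) supplies the rate-limiting primitive that the embedding wrapper below builds on: `ServiceBuilder::rate_limit(num, per)` admits at most `num` calls per `per` window, and `ServiceExt::ready` waits for capacity. A minimal standalone sketch of that primitive, assuming `tokio` and `anyhow` as used in this crate:

```rust
use std::time::Duration;

use tower::{service_fn, Service, ServiceBuilder, ServiceExt};

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    // Allow at most 2 calls per 1-second window.
    let mut svc = ServiceBuilder::new()
        .rate_limit(2, Duration::from_secs(1))
        .service(service_fn(|x: u32| async move {
            Ok::<u32, anyhow::Error>(x * 2)
        }));

    for i in 0..3u32 {
        // `ready()` resolves immediately for the first two calls and
        // parks the third until the window rolls over.
        let doubled = svc.ready().await?.call(i).await?;
        println!("{i} -> {doubled}");
    }
    Ok(())
}
```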
12 changes: 10 additions & 2 deletions crates/http-api-bindings/src/embedding/mod.rs
@@ -1,18 +1,22 @@
 mod llama;
 mod openai;
+mod rate_limit;
 mod voyage;
 
 use core::panic;
 use std::sync::Arc;
 
 use llama::LlamaCppEngine;
+use rate_limit::RateLimitedEmbedding;
 use tabby_common::config::HttpModelConfig;
 use tabby_inference::Embedding;
 
 use self::{openai::OpenAIEmbeddingEngine, voyage::VoyageEmbeddingEngine};
 
 pub async fn create(config: &HttpModelConfig) -> Arc<dyn Embedding> {
-    match config.kind.as_str() {
+    let rpm = config.rate_limit.request_per_minute;
+
+    let embedding: Arc<dyn Embedding> = match config.kind.as_str() {
         "llama.cpp/embedding" => {
             let engine = LlamaCppEngine::create(
                 config
@@ -53,5 +57,9 @@ pub async fn create(config: &HttpModelConfig) -> Arc<dyn Embedding> {
"Unsupported kind for http embedding model: {}",
unsupported_kind
),
}
};

Arc::new(
RateLimitedEmbedding::new(embedding, rpm).expect("Failed to create rate limited embedding"),
)
}
63 changes: 63 additions & 0 deletions crates/http-api-bindings/src/embedding/rate_limit.rs
@@ -0,0 +1,63 @@
use std::{
    sync::Arc,
    task::{Context, Poll},
    time,
};

use async_trait::async_trait;
use futures::future::BoxFuture;
use tabby_inference::Embedding;
use tokio::sync::Mutex;
use tower::{Service, ServiceBuilder, ServiceExt};

struct EmbeddingService {
    embedding: Arc<dyn Embedding>,
}

impl Service<String> for EmbeddingService {
    type Response = Vec<f32>;
    type Error = anyhow::Error;
    type Future = BoxFuture<'static, Result<Self::Response, Self::Error>>;

    fn poll_ready(&mut self, _cx: &mut Context<'_>) -> Poll<Result<(), Self::Error>> {
        Poll::Ready(Ok(()))
    }

    fn call(&mut self, prompt: String) -> Self::Future {
        let embedding = self.embedding.clone();
        Box::pin(async move { embedding.embed(&prompt).await })
    }
}

pub struct RateLimitedEmbedding {
    embedding: Arc<Mutex<tower::util::BoxService<String, Vec<f32>, anyhow::Error>>>,
}

impl RateLimitedEmbedding {
    pub fn new(embedding: Arc<dyn Embedding>, rpm: u64) -> anyhow::Result<Self> {
        if rpm == 0 {
            anyhow::bail!(
                "Can not create rate limited embedding client with 0 requests per minute"
            );
        }

        let service = ServiceBuilder::new()
            .rate_limit(rpm, time::Duration::from_secs(60))
            .service(EmbeddingService { embedding })
            .boxed();

        Ok(Self {
            embedding: Arc::new(Mutex::new(service)),
        })
    }
}

#[async_trait]
impl Embedding for RateLimitedEmbedding {
    async fn embed(&self, prompt: &str) -> anyhow::Result<Vec<f32>> {
        let mut service = self.embedding.lock().await;
        let prompt_owned = prompt.to_string();
        let response = service.ready().await?.call(prompt_owned).await?;
        Ok(response)
    }
}
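
`RateLimitedEmbedding` is module-private and normally constructed via `create()` in `mod.rs`, but a hypothetical in-module test sketches the behavior. `DummyEmbedding` is a stub invented here for illustration, and `#[tokio::test]` relies on the `rt`/`macros` dev-features already declared above:

```rust
use std::sync::Arc;

use async_trait::async_trait;
use tabby_inference::Embedding;

// Hypothetical stub implementation, for illustration only.
struct DummyEmbedding;

#[async_trait]
impl Embedding for DummyEmbedding {
    async fn embed(&self, prompt: &str) -> anyhow::Result<Vec<f32>> {
        Ok(vec![prompt.len() as f32])
    }
}

#[tokio::test]
async fn rate_limited_embedding_passes_through() -> anyhow::Result<()> {
    // 120 requests per minute; once the window is exhausted, `embed`
    // waits transparently instead of returning an error.
    let limited = RateLimitedEmbedding::new(Arc::new(DummyEmbedding), 120)?;
    assert_eq!(limited.embed("hello").await?, vec![5.0]);
    Ok(())
}
```

The `Mutex` around the boxed service is needed because tower's `Service::call` takes `&mut self` while `Embedding::embed` takes `&self`; serializing callers through the lock is the simplest bridge between the two.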
18 changes: 18 additions & 0 deletions crates/tabby-common/src/config.rs
@@ -289,6 +289,10 @@ pub struct HttpModelConfig {
     #[builder(default)]
     pub api_key: Option<String>,
 
+    #[builder(default)]
+    #[serde(default)]
+    pub rate_limit: RateLimit,
+
     /// Used by OpenAI style API for model name.
     #[builder(default)]
     pub model_name: Option<String>,
@@ -309,6 +313,20 @@ pub struct HttpModelConfig {
     pub additional_stop_words: Option<Vec<String>>,
 }
 
+#[derive(Serialize, Deserialize, Builder, Debug, Clone)]
+pub struct RateLimit {
+    /// The maximum number of requests allowed per minute.
+    pub request_per_minute: u64,
+}
+
+impl Default for RateLimit {
+    fn default() -> Self {
+        Self {
+            request_per_minute: 600,
+        }
+    }
+}
+
 #[derive(Serialize, Deserialize, Debug, Clone, PartialEq)]
 pub struct LocalModelConfig {
     pub model_id: String,
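
Because the field is marked `#[serde(default)]` and `RateLimit` implements `Default`, a configuration that omits the section falls back to 600 requests per minute. A minimal sketch of that behavior, assuming the `toml` crate is available; `Wrapper` is a stand-in invented here for the relevant slice of `HttpModelConfig`:

```rust
use serde::Deserialize;

#[derive(Deserialize, Debug)]
struct RateLimit {
    request_per_minute: u64,
}

impl Default for RateLimit {
    fn default() -> Self {
        Self {
            request_per_minute: 600,
        }
    }
}

// Stand-in for the relevant slice of `HttpModelConfig`.
#[derive(Deserialize, Debug)]
struct Wrapper {
    #[serde(default)]
    rate_limit: RateLimit,
}

fn main() {
    // Omitting the section falls back to the default of 600 rpm.
    let absent: Wrapper = toml::from_str("").unwrap();
    assert_eq!(absent.rate_limit.request_per_minute, 600);

    // An explicit section overrides it.
    let present: Wrapper =
        toml::from_str("[rate_limit]\nrequest_per_minute = 120\n").unwrap();
    assert_eq!(present.rate_limit.request_per_minute, 120);
}
```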