diff --git a/crates/http-api-bindings/src/rate_limit.rs b/crates/http-api-bindings/src/rate_limit.rs index ac3efb85d9ab..85c9f388b6a5 100644 --- a/crates/http-api-bindings/src/rate_limit.rs +++ b/crates/http-api-bindings/src/rate_limit.rs @@ -10,6 +10,7 @@ use async_trait::async_trait; use futures::stream::BoxStream; use ratelimit::Ratelimiter; use tabby_inference::{ChatCompletionStream, CompletionOptions, CompletionStream, Embedding}; +use tracing::warn; fn new_rate_limiter(rpm: u64) -> Ratelimiter { Ratelimiter::builder(rpm/60, Duration::from_secs(1)) @@ -34,7 +35,7 @@ pub fn new_embedding(embedding: Box, request_per_minute: u64) -> #[async_trait] impl Embedding for RateLimitedEmbedding { async fn embed(&self, prompt: &str) -> anyhow::Result> { - for _ in 0..5 { + for _ in 0..60 { if let Err(sleep) = self.rate_limiter.try_wait() { tokio::time::sleep(sleep).await; continue; @@ -43,7 +44,7 @@ impl Embedding for RateLimitedEmbedding { return self.embedding.embed(prompt).await; } - anyhow::bail!("Rate limit exceeded for embedding computation"); + anyhow::bail!("Failed to acquire request quota for embedding"); } } @@ -65,7 +66,7 @@ pub fn new_completion( #[async_trait] impl CompletionStream for RateLimitedCompletion { async fn generate(&self, prompt: &str, options: CompletionOptions) -> BoxStream { - for _ in 0..5 { + for _ in 0..60 { if let Err(sleep) = self.rate_limiter.try_wait() { tokio::time::sleep(sleep).await; continue; @@ -74,7 +75,7 @@ impl CompletionStream for RateLimitedCompletion { return self.completion.generate(prompt, options).await; } - // Return an empty stream if the rate limit is exceeded + warn!("Failed to acquire request quota for completion"); Box::pin(futures::stream::empty()) } } @@ -100,7 +101,7 @@ impl ChatCompletionStream for RateLimitedChatStream { &self, request: CreateChatCompletionRequest, ) -> Result { - for _ in 0..5 { + for _ in 0..60 { if let Err(sleep) = self.rate_limiter.try_wait() { tokio::time::sleep(sleep).await; continue; @@ -110,7 +111,7 @@ impl ChatCompletionStream for RateLimitedChatStream { } Err(OpenAIError::ApiError(ApiError { - message: "Rate limit exceeded for chat completion".to_owned(), + message: "Failed to acquire request quota for chat".to_owned(), r#type: None, param: None, code: None, @@ -121,7 +122,7 @@ impl ChatCompletionStream for RateLimitedChatStream { &self, request: CreateChatCompletionRequest, ) -> Result { - for _ in 0..5 { + for _ in 0..60 { if let Err(sleep) = self.rate_limiter.try_wait() { tokio::time::sleep(sleep).await; continue; @@ -131,7 +132,7 @@ impl ChatCompletionStream for RateLimitedChatStream { } Err(OpenAIError::ApiError(ApiError { - message: "Rate limit exceeded for chat completion".to_owned(), + message: "Failed to acquire request quota for chat stream".to_owned(), r#type: None, param: None, code: None,