From 39628f8932b48f956dc3e8b7cca9557757a8cd04 Mon Sep 17 00:00:00 2001
From: Meng Zhang
Date: Fri, 5 Jul 2024 17:51:37 +0900
Subject: [PATCH] refactor: use DocIndexer for web crawling (#2586)

* refactor: use DocIndexer for web crawling

* update
---
 crates/tabby-scheduler/src/doc/mod.rs         |  3 +--
 crates/tabby-scheduler/src/lib.rs             | 23 ++++---------------
 .../src/service/background_job/git.rs         |  2 +-
 3 files changed, 6 insertions(+), 22 deletions(-)

diff --git a/crates/tabby-scheduler/src/doc/mod.rs b/crates/tabby-scheduler/src/doc/mod.rs
index 106004509695..d3449c0643ff 100644
--- a/crates/tabby-scheduler/src/doc/mod.rs
+++ b/crates/tabby-scheduler/src/doc/mod.rs
@@ -88,8 +88,7 @@ async fn build_tokens(embedding: Arc<dyn Embedding>, text: &str) -> Vec<String>
     chunk_embedding_tokens
 }
 
-// FIXME(meng): make this private interface, always prefer using public::DocIndexer for web doc building.
-pub fn create_web_builder(embedding: Arc<dyn Embedding>) -> TantivyDocBuilder<WebDocument> {
+fn create_web_builder(embedding: Arc<dyn Embedding>) -> TantivyDocBuilder<WebDocument> {
     let builder = DocBuilder::new(embedding);
     TantivyDocBuilder::new(corpus::WEB, builder)
 }
diff --git a/crates/tabby-scheduler/src/lib.rs b/crates/tabby-scheduler/src/lib.rs
index 305110b5dda3..35f3000ef69f 100644
--- a/crates/tabby-scheduler/src/lib.rs
+++ b/crates/tabby-scheduler/src/lib.rs
@@ -5,9 +5,9 @@ mod code;
 mod crawl;
 mod indexer;
 
-pub use code::CodeIndexer;
+use chrono::Utc;
 use crawl::crawl_pipeline;
-use doc::create_web_builder;
+pub use doc::public::{DocIndexer, WebDocument};
 use futures::StreamExt;
 use indexer::{IndexAttributeBuilder, Indexer};
 use tabby_common::index::corpus;
@@ -23,8 +23,6 @@ pub mod public {
     };
 }
 
-use crate::doc::public::WebDocument;
-
 pub async fn crawl_index_docs(
     source_id: &str,
     start_url: &str,
@@ -33,8 +31,7 @@
     logkit::info!("Starting doc index pipeline for {}", start_url);
     let embedding = embedding.clone();
     let mut num_docs = 0;
-    let builder = create_web_builder(embedding.clone());
-    let indexer = Indexer::new(corpus::WEB);
+    let indexer = public::DocIndexer::new(embedding.clone());
 
     let mut pipeline = Box::pin(crawl_pipeline(start_url).await?);
     while let Some(doc) = pipeline.next().await {
@@ -48,19 +45,7 @@
         };
 
         num_docs += 1;
-
-        let (id, s) = builder.build(source_doc).await;
-        indexer.delete(&id);
-        s.buffer_unordered(std::cmp::max(
-            std::thread::available_parallelism().unwrap().get() * 2,
-            32,
-        ))
-        .for_each(|doc| async {
-            if let Ok(Some(doc)) = doc {
-                indexer.add(doc).await;
-            }
-        })
-        .await;
+        indexer.add(Utc::now(), source_doc).await;
     }
     logkit::info!("Crawled {} documents from '{}'", num_docs, start_url);
     indexer.commit();
diff --git a/ee/tabby-webserver/src/service/background_job/git.rs b/ee/tabby-webserver/src/service/background_job/git.rs
index c3ced05e85f6..05b68dddaa31 100644
--- a/ee/tabby-webserver/src/service/background_job/git.rs
+++ b/ee/tabby-webserver/src/service/background_job/git.rs
@@ -5,7 +5,7 @@ use chrono::{DateTime, Utc};
 use serde::{Deserialize, Serialize};
 use tabby_common::config::RepositoryConfig;
 use tabby_inference::Embedding;
-use tabby_scheduler::CodeIndexer;
+use tabby_scheduler::public::CodeIndexer;
 use tabby_schema::{job::JobService, repository::GitRepositoryService};
 
 use super::{helper::Job, BackgroundJobEvent};
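
Note on the new call pattern: after this patch, callers no longer compose
create_web_builder with a raw Indexer and fan out the chunk futures
themselves; DocIndexer owns the whole build/delete/add cycle. Below is a
minimal sketch of a caller, assuming only what the hunks above show (the
crate-root re-export of DocIndexer and WebDocument, DocIndexer::new(embedding),
add(updated_at, doc), and commit()); the index_page helper and its page
argument are illustrative, not part of the patch:

    use std::sync::Arc;

    use chrono::Utc;
    use tabby_inference::Embedding;
    use tabby_scheduler::{DocIndexer, WebDocument};

    // Hypothetical caller: index one crawled page via the high-level API.
    async fn index_page(embedding: Arc<dyn Embedding>, page: WebDocument) {
        // One DocIndexer replaces the old create_web_builder(..) +
        // Indexer::new(corpus::WEB) pair.
        let indexer = DocIndexer::new(embedding);

        // Per the diff, add() takes an updated-at timestamp plus the document,
        // and internally handles deleting the stale copy and indexing chunks.
        indexer.add(Utc::now(), page).await;

        // Flush pending writes to the index.
        indexer.commit();
    }

This mirrors the simplification inside crawl_index_docs itself, where the
hand-rolled builder/delete/buffer_unordered plumbing collapses into a single
indexer.add(Utc::now(), source_doc).await call.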