refactor: use DocIndexer for web crawling (#2586)
* refactor: use DocIndexer for web crawling

* update
wsxiaoys authored Jul 5, 2024
1 parent ac49f8e commit 39628f8
Showing 3 changed files with 6 additions and 22 deletions.
3 changes: 1 addition & 2 deletions crates/tabby-scheduler/src/doc/mod.rs
@@ -88,8 +88,7 @@ async fn build_tokens(embedding: Arc<dyn Embedding>, text: &str) -> Vec<String>
     chunk_embedding_tokens
 }
 
-// FIXME(meng): make this private interface, always prefer using public::DocIndexer for web doc building.
-pub fn create_web_builder(embedding: Arc<dyn Embedding>) -> TantivyDocBuilder<WebDocument> {
+fn create_web_builder(embedding: Arc<dyn Embedding>) -> TantivyDocBuilder<WebDocument> {
     let builder = DocBuilder::new(embedding);
     TantivyDocBuilder::new(corpus::WEB, builder)
 }
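With create_web_builder now private, web documents are indexed through the DocIndexer wrapper instead of a hand-assembled builder. A minimal sketch of the new call pattern, assuming only the new/add/commit surface visible in the lib.rs diff below and an import path inferred from this commit's re-exports:

    use std::sync::Arc;

    use chrono::Utc;
    use tabby_inference::Embedding;
    use tabby_scheduler::public::{DocIndexer, WebDocument};

    // Index a single crawled document via the public wrapper; the builder
    // from create_web_builder is presumably an internal detail of DocIndexer.
    async fn index_doc(embedding: Arc<dyn Embedding>, doc: WebDocument) {
        let indexer = DocIndexer::new(embedding);
        indexer.add(Utc::now(), doc).await;
        indexer.commit();
    }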
23 changes: 4 additions & 19 deletions crates/tabby-scheduler/src/lib.rs
@@ -5,9 +5,9 @@ mod code;
 mod crawl;
 mod indexer;
 
-pub use code::CodeIndexer;
+use chrono::Utc;
 use crawl::crawl_pipeline;
-use doc::create_web_builder;
+pub use doc::public::{DocIndexer, WebDocument};
 use futures::StreamExt;
 use indexer::{IndexAttributeBuilder, Indexer};
 use tabby_common::index::corpus;
@@ -23,8 +23,6 @@ pub mod public {
     };
 }
 
-use crate::doc::public::WebDocument;
-
 pub async fn crawl_index_docs(
     source_id: &str,
     start_url: &str,
@@ -33,8 +31,7 @@
     logkit::info!("Starting doc index pipeline for {}", start_url);
     let embedding = embedding.clone();
     let mut num_docs = 0;
-    let builder = create_web_builder(embedding.clone());
-    let indexer = Indexer::new(corpus::WEB);
+    let indexer = public::DocIndexer::new(embedding.clone());
 
     let mut pipeline = Box::pin(crawl_pipeline(start_url).await?);
     while let Some(doc) = pipeline.next().await {
@@ -48,19 +45,7 @@
         };
 
         num_docs += 1;
-
-        let (id, s) = builder.build(source_doc).await;
-        indexer.delete(&id);
-        s.buffer_unordered(std::cmp::max(
-            std::thread::available_parallelism().unwrap().get() * 2,
-            32,
-        ))
-        .for_each(|doc| async {
-            if let Ok(Some(doc)) = doc {
-                indexer.add(doc).await;
-            }
-        })
-        .await;
+        indexer.add(Utc::now(), source_doc).await;
     }
     logkit::info!("Crawled {} documents from '{}'", num_docs, start_url);
     indexer.commit();
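The deleted block above is exactly what DocIndexer::add now has to cover: build the Tantivy documents for a page, drop any stale copy by id, then write the new documents with bounded concurrency. A sketch of what such a method could look like, reconstructed from the removed lines; the builder and indexer fields and the handling of the updated_at timestamp are assumptions, not shown in this diff:

    use chrono::{DateTime, Utc};
    use futures::StreamExt;

    impl DocIndexer {
        // Hypothetical body, inferred from the inlined code this commit removes.
        pub async fn add(&self, updated_at: DateTime<Utc>, document: WebDocument) {
            // updated_at is accepted by the new call site; how it is consumed
            // (e.g. skipping unchanged documents) is not visible in this diff.
            let _ = updated_at;

            let (id, s) = self.builder.build(document).await;
            self.indexer.delete(&id);
            s.buffer_unordered(std::cmp::max(
                std::thread::available_parallelism().unwrap().get() * 2,
                32,
            ))
            .for_each(|doc| async {
                if let Ok(Some(doc)) = doc {
                    self.indexer.add(doc).await;
                }
            })
            .await;
        }
    }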
2 changes: 1 addition & 1 deletion ee/tabby-webserver/src/service/background_job/git.rs
@@ -5,7 +5,7 @@ use chrono::{DateTime, Utc};
 use serde::{Deserialize, Serialize};
 use tabby_common::config::RepositoryConfig;
 use tabby_inference::Embedding;
-use tabby_scheduler::CodeIndexer;
+use tabby_scheduler::public::CodeIndexer;
 use tabby_schema::{job::JobService, repository::GitRepositoryService};
 
 use super::{helper::Job, BackgroundJobEvent};
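This one-line change follows from the lib.rs refactor: CodeIndexer is no longer re-exported at the crate root, so consumers import it from the public module. A plausible shape for that module, assuming CodeIndexer simply joined the re-exports already gathered there; the module body is elided in the diff above:

    pub mod public {
        pub use super::code::CodeIndexer;
        // Existing re-exports (DocIndexer, WebDocument, ...) elided in the diff.
    }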
