From e01650be3ebd534448ed940edd1d917f8be5c9a2 Mon Sep 17 00:00:00 2001
From: Meng Zhang
Date: Sun, 12 May 2024 21:28:39 -0700
Subject: [PATCH] chore: add scaffold of doc indexing (#2104)

* add embedding computation

* add cmt

* update
---
 Cargo.lock                            |  1 +
 crates/tabby-common/src/index/doc.rs  |  6 +-
 crates/tabby-scheduler/Cargo.toml     |  1 +
 crates/tabby-scheduler/src/doc/mod.rs | 79 +++++++++++++++++++++++----
 4 files changed, 73 insertions(+), 14 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index e6f841d419cd..6002537bcd15 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -5449,6 +5449,7 @@ dependencies = [
  "serde",
  "serde_json",
  "tabby-common",
+ "tabby-inference",
  "tantivy",
  "temp_testdir",
  "text-splitter",
diff --git a/crates/tabby-common/src/index/doc.rs b/crates/tabby-common/src/index/doc.rs
index 4cb3e91c5c71..88a13568f917 100644
--- a/crates/tabby-common/src/index/doc.rs
+++ b/crates/tabby-common/src/index/doc.rs
@@ -11,7 +11,7 @@ pub struct DocSearchSchema {
 
     pub field_title: Field,
     pub field_link: Field,
-    pub field_snippet: Field,
+    pub field_body: Field,
 }
 
 impl DocSearchSchema {
@@ -22,7 +22,7 @@ impl DocSearchSchema {
         let field_embedding_token = builder.add_text_field("embedding_token", STRING);
         let field_title = builder.add_text_field("title", STORED);
         let field_link = builder.add_text_field("link", STORED);
-        let field_snippet = builder.add_text_field("snippet", STORED);
+        let field_body = builder.add_text_field("body", STORED);
 
         let schema = builder.build();
 
@@ -32,7 +32,7 @@ impl DocSearchSchema {
             field_embedding_token,
             field_title,
             field_link,
-            field_snippet,
+            field_body,
         }
     }
 }
diff --git a/crates/tabby-scheduler/Cargo.toml b/crates/tabby-scheduler/Cargo.toml
index 905e2084a31c..b97c713a5c72 100644
--- a/crates/tabby-scheduler/Cargo.toml
+++ b/crates/tabby-scheduler/Cargo.toml
@@ -36,6 +36,7 @@ mdka = "1.2.4"
 readable-readability = "0.4.0"
 url.workspace = true
 voca_rs = "1.15.2"
+tabby-inference = { path = "../tabby-inference" }
 
 [dev-dependencies]
 temp_testdir = { workspace = true }
diff --git a/crates/tabby-scheduler/src/doc/mod.rs b/crates/tabby-scheduler/src/doc/mod.rs
index ccb908902838..3ce19ffa51d3 100644
--- a/crates/tabby-scheduler/src/doc/mod.rs
+++ b/crates/tabby-scheduler/src/doc/mod.rs
@@ -1,5 +1,10 @@
+use std::{collections::HashSet, sync::Arc};
+
 use tabby_common::{index::DocSearchSchema, path};
+use tabby_inference::Embedding;
 use tantivy::{doc, Index, IndexWriter, Term};
+use text_splitter::{Characters, TextSplitter};
+use tracing::warn;
 
 use crate::tantivy_utils::open_or_create_index;
 
@@ -7,43 +12,95 @@ struct Document {
     pub id: String,
     pub title: String,
     pub link: String,
-    pub snippet: String,
+    pub body: String,
 }
 
 struct DocIndex {
+    embedding: Arc<dyn Embedding>,
     doc: DocSearchSchema,
     index: Index,
     writer: IndexWriter,
+    splitter: TextSplitter<Characters>,
+}
+
+const CHUNK_SIZE: usize = 2048;
+
+fn make_embedding_token(i: usize) -> String {
+    format!("embedding_{i}")
 }
 
 impl DocIndex {
-    pub fn new() -> Self {
+    pub fn new(embedding: Arc<dyn Embedding>) -> Self {
         let doc = DocSearchSchema::default();
         let index = open_or_create_index(&doc.schema, &path::doc_index_dir());
         let writer = index
             .writer(150_000_000)
             .expect("Failed to create index writer");
 
-        Self { doc, index, writer }
+        Self {
+            embedding,
+            doc,
+            index,
+            writer,
+            splitter: TextSplitter::default().with_trim_chunks(true),
+        }
     }
 
-    pub fn add(&mut self, document: Document) {
+    pub async fn add(&mut self, document: Document) {
         // Delete the document if it already exists
         self.writer
             .delete_term(Term::from_field_text(self.doc.field_id, &document.id));
 
+        let Some(embedding_tokens) = self.compute_embedding_tokens(&document.body).await else {
+            warn!(
+                "Failed to compute embedding tokens for document '{}'",
+                document.id
+            );
+            return;
+        };
+
+        let mut doc = doc! {
+            self.doc.field_id => document.id,
+            self.doc.field_title => document.title,
+            self.doc.field_link => document.link,
+            self.doc.field_body => document.body,
+        };
+
+        for token in embedding_tokens {
+            doc.add_text(self.doc.field_embedding_token, token);
+        }
+
         // Add the document
         self.writer
-            .add_document(doc! {
-                self.doc.field_id => document.id,
-                // FIXME: compute embedding token
-                self.doc.field_title => document.title,
-                self.doc.field_link => document.link,
-                self.doc.field_snippet => document.snippet,
-            })
+            .add_document(doc)
             .expect("Failed to add document");
     }
 
+    /// This function splits the document into chunks and computes the embedding for each chunk. It then converts the embeddings
+    /// into binarized tokens by thresholding on zero.
+    ///
+    /// The current implementation deduplicates tokens at the document level, but this may require further consideration in the future.
+    async fn compute_embedding_tokens(&self, content: &str) -> Option<Vec<String>> {
+        let mut tokens = HashSet::new();
+        for chunk in self.splitter.chunks(content, CHUNK_SIZE) {
+            let embedding = match self.embedding.embed(chunk).await {
+                Ok(embedding) => embedding,
+                Err(e) => {
+                    warn!("Failed to embed document: {}", e);
+                    return None;
+                }
+            };
+
+            for (i, value) in embedding.iter().enumerate() {
+                if *value > 0.0 {
+                    tokens.insert(make_embedding_token(i));
+                }
+            }
+        }
+
+        Some(tokens.into_iter().collect())
+    }
+
     pub fn delete(&mut self, id: &str) {
         self.writer
             .delete_term(Term::from_field_text(self.doc.field_id, id));
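
Note (editor's sketch, not part of the patch): the indexing path above binarizes each chunk
embedding by thresholding on zero and stores one "embedding_{i}" term per positive dimension,
so search-time similarity presumably reduces to term overlap on field_embedding_token. The
standalone Rust sketch below illustrates that scheme in isolation; embedding_token_for_dim and
binarize are hypothetical helpers mirroring the module-private make_embedding_token logic, not
APIs exported by this patch.

use std::collections::HashSet;

/// Mirrors the patch's `make_embedding_token`: a positive activation in
/// dimension `i` becomes the literal term "embedding_{i}".
fn embedding_token_for_dim(i: usize) -> String {
    format!("embedding_{i}")
}

/// Binarize an embedding by thresholding on zero, as
/// `compute_embedding_tokens` does for each indexed chunk.
fn binarize(embedding: &[f32]) -> HashSet<String> {
    embedding
        .iter()
        .enumerate()
        .filter(|(_, v)| **v > 0.0)
        .map(|(i, _)| embedding_token_for_dim(i))
        .collect()
}

fn main() {
    // A stored chunk and an incoming query, embedded into the same space.
    let chunk_tokens = binarize(&[0.3, -0.1, 0.7, -0.2]); // {embedding_0, embedding_2}
    let query_tokens = binarize(&[0.5, 0.2, 0.1, -0.9]);  // {embedding_0, embedding_1, embedding_2}

    // Term overlap between the binarized sets is what approximates
    // embedding similarity once the tokens are in the index.
    let overlap = query_tokens.intersection(&chunk_tokens).count();
    println!("{overlap} shared embedding tokens"); // prints "2 shared embedding tokens"
}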