chore: add scaffold of doc indexing (#2104)
* add embedding computation

* add cmt

* update
wsxiaoys authored May 13, 2024
1 parent 6f2bc49 commit e01650b
Showing 4 changed files with 73 additions and 14 deletions.
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default.

6 changes: 3 additions & 3 deletions crates/tabby-common/src/index/doc.rs
@@ -11,7 +11,7 @@ pub struct DocSearchSchema {

     pub field_title: Field,
     pub field_link: Field,
-    pub field_snippet: Field,
+    pub field_body: Field,
 }

impl DocSearchSchema {
@@ -22,7 +22,7 @@ impl DocSearchSchema {
         let field_embedding_token = builder.add_text_field("embedding_token", STRING);
         let field_title = builder.add_text_field("title", STORED);
         let field_link = builder.add_text_field("link", STORED);
-        let field_snippet = builder.add_text_field("snippet", STORED);
+        let field_body = builder.add_text_field("body", STORED);

         let schema = builder.build();

@@ -32,7 +32,7 @@
             field_embedding_token,
             field_title,
             field_link,
-            field_snippet,
+            field_body,
         }
     }
 }
1 change: 1 addition & 0 deletions crates/tabby-scheduler/Cargo.toml
@@ -36,6 +36,7 @@ mdka = "1.2.4"
 readable-readability = "0.4.0"
 url.workspace = true
 voca_rs = "1.15.2"
+tabby-inference = { path = "../tabby-inference" }

 [dev-dependencies]
 temp_testdir = { workspace = true }
79 changes: 68 additions & 11 deletions crates/tabby-scheduler/src/doc/mod.rs
@@ -1,49 +1,106 @@
+use std::{collections::HashSet, sync::Arc};
+
 use tabby_common::{index::DocSearchSchema, path};
+use tabby_inference::Embedding;
 use tantivy::{doc, Index, IndexWriter, Term};
+use text_splitter::{Characters, TextSplitter};
+use tracing::warn;

 use crate::tantivy_utils::open_or_create_index;

 struct Document {
     pub id: String,
     pub title: String,
     pub link: String,
-    pub snippet: String,
+    pub body: String,
 }

 struct DocIndex {
+    embedding: Arc<dyn Embedding>,
     doc: DocSearchSchema,
     index: Index,
     writer: IndexWriter,
+    splitter: TextSplitter<Characters>,
 }

+const CHUNK_SIZE: usize = 2048;
+
+fn make_embedding_token(i: usize) -> String {
+    format!("embedding_{i}")
+}
+
 impl DocIndex {
-    pub fn new() -> Self {
+    pub fn new(embedding: Arc<dyn Embedding>) -> Self {
         let doc = DocSearchSchema::default();
         let index = open_or_create_index(&doc.schema, &path::doc_index_dir());
         let writer = index
             .writer(150_000_000)
             .expect("Failed to create index writer");

-        Self { doc, index, writer }
+        Self {
+            embedding,
+            doc,
+            index,
+            writer,
+            splitter: TextSplitter::default().with_trim_chunks(true),
+        }
     }

-    pub fn add(&mut self, document: Document) {
+    pub async fn add(&mut self, document: Document) {
         // Delete the document if it already exists
         self.writer
             .delete_term(Term::from_field_text(self.doc.field_id, &document.id));

+        let Some(embedding_tokens) = self.compute_embedding_tokens(&document.body).await else {
+            warn!(
+                "Failed to compute embedding tokens for document '{}'",
+                document.id
+            );
+            return;
+        };
+
+        let mut doc = doc! {
+            self.doc.field_id => document.id,
+            self.doc.field_title => document.title,
+            self.doc.field_link => document.link,
+            self.doc.field_body => document.body,
+        };
+
+        for token in embedding_tokens {
+            doc.add_text(self.doc.field_embedding_token, token);
+        }
+
         // Add the document
         self.writer
-            .add_document(doc! {
-                self.doc.field_id => document.id,
-                // FIXME: compute embedding token
-                self.doc.field_title => document.title,
-                self.doc.field_link => document.link,
-                self.doc.field_snippet => document.snippet,
-            })
+            .add_document(doc)
             .expect("Failed to add document");
     }

+    /// This function splits the document into chunks and computes the embedding for each chunk. It then converts the embeddings
+    /// into binarized tokens by thresholding on zero.
+    ///
+    /// The current implementation deduplicates tokens at the document level, but this may require further consideration in the future.
+    async fn compute_embedding_tokens(&self, content: &str) -> Option<Vec<String>> {
+        let mut tokens = HashSet::new();
+        for chunk in self.splitter.chunks(content, CHUNK_SIZE) {
+            let embedding = match self.embedding.embed(chunk).await {
+                Ok(embedding) => embedding,
+                Err(e) => {
+                    warn!("Failed to embed document: {}", e);
+                    return None;
+                }
+            };
+
+            for (i, value) in embedding.iter().enumerate() {
+                if *value > 0.0 {
+                    tokens.insert(make_embedding_token(i));
+                }
+            }
+        }
+
+        Some(tokens.into_iter().collect())
+    }
+
     pub fn delete(&mut self, id: &str) {
         self.writer
             .delete_term(Term::from_field_text(self.doc.field_id, id));
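For readers of the `compute_embedding_tokens` scaffold above: each chunk embedding is reduced to a set of sparse terms by keeping only the dimensions whose value is positive, and the terms are deduplicated across chunks before being written to the `embedding_token` field. Below is a minimal, self-contained sketch of that binarization step; the `binarize_embeddings` helper and the example vectors are illustrative only, while `make_embedding_token` mirrors the helper introduced in this commit.

```rust
use std::collections::HashSet;

/// Mirrors the token naming used in the diff above.
fn make_embedding_token(i: usize) -> String {
    format!("embedding_{i}")
}

/// Binarize chunk embeddings by thresholding on zero and deduplicate
/// the resulting tokens across all chunks of a document.
fn binarize_embeddings(chunk_embeddings: &[Vec<f32>]) -> Vec<String> {
    let mut tokens = HashSet::new();
    for embedding in chunk_embeddings {
        for (i, value) in embedding.iter().enumerate() {
            if *value > 0.0 {
                tokens.insert(make_embedding_token(i));
            }
        }
    }
    tokens.into_iter().collect()
}

fn main() {
    // Two hypothetical chunk embeddings; the real ones come from the Embedding service.
    let chunks = vec![
        vec![0.12, -0.40, 0.03, -0.91],
        vec![-0.05, 0.22, 0.10, -0.33],
    ];
    let mut tokens = binarize_embeddings(&chunks);
    tokens.sort();
    // Dimensions 0 and 2 are positive in the first chunk; 1 and 2 in the second.
    // After deduplication: ["embedding_0", "embedding_1", "embedding_2"]
    println!("{tokens:?}");
}
```

Since each positive dimension becomes an ordinary term, the existing tantivy inverted index can later match documents against a query's binarized embedding, which appears to be the intent of the `embedding_token` STRING field in `DocSearchSchema`.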
