chore: add scaffold of doc indexing (#2104)
* add embedding computation

* add cmt

* update
wsxiaoys authored May 13, 2024
1 parent 6f2bc49 commit e01650b
Showing 4 changed files with 73 additions and 14 deletions.
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default.

6 changes: 3 additions & 3 deletions crates/tabby-common/src/index/doc.rs
@@ -11,7 +11,7 @@ pub struct DocSearchSchema {

     pub field_title: Field,
     pub field_link: Field,
-    pub field_snippet: Field,
+    pub field_body: Field,
 }

impl DocSearchSchema {
@@ -22,7 +22,7 @@ impl DocSearchSchema {
         let field_embedding_token = builder.add_text_field("embedding_token", STRING);
         let field_title = builder.add_text_field("title", STORED);
         let field_link = builder.add_text_field("link", STORED);
-        let field_snippet = builder.add_text_field("snippet", STORED);
+        let field_body = builder.add_text_field("body", STORED);

         let schema = builder.build();

@@ -32,7 +32,7 @@
             field_embedding_token,
             field_title,
             field_link,
-            field_snippet,
+            field_body,
         }
     }
 }
1 change: 1 addition & 0 deletions crates/tabby-scheduler/Cargo.toml
@@ -36,6 +36,7 @@ mdka = "1.2.4"
 readable-readability = "0.4.0"
 url.workspace = true
 voca_rs = "1.15.2"
+tabby-inference = { path = "../tabby-inference" }

 [dev-dependencies]
 temp_testdir = { workspace = true }
79 changes: 68 additions & 11 deletions crates/tabby-scheduler/src/doc/mod.rs
@@ -1,49 +1,106 @@
+use std::{collections::HashSet, sync::Arc};
+
 use tabby_common::{index::DocSearchSchema, path};
+use tabby_inference::Embedding;
 use tantivy::{doc, Index, IndexWriter, Term};
+use text_splitter::{Characters, TextSplitter};
+use tracing::warn;

 use crate::tantivy_utils::open_or_create_index;

 struct Document {
     pub id: String,
     pub title: String,
     pub link: String,
-    pub snippet: String,
+    pub body: String,
 }

 struct DocIndex {
+    embedding: Arc<dyn Embedding>,
     doc: DocSearchSchema,
     index: Index,
     writer: IndexWriter,
+    splitter: TextSplitter<Characters>,
 }

+const CHUNK_SIZE: usize = 2048;
+
+fn make_embedding_token(i: usize) -> String {
+    format!("embedding_{i}")
+}
+
 impl DocIndex {
-    pub fn new() -> Self {
+    pub fn new(embedding: Arc<dyn Embedding>) -> Self {
         let doc = DocSearchSchema::default();
         let index = open_or_create_index(&doc.schema, &path::doc_index_dir());
         let writer = index
             .writer(150_000_000)
             .expect("Failed to create index writer");

-        Self { doc, index, writer }
+        Self {
+            embedding,
+            doc,
+            index,
+            writer,
+            splitter: TextSplitter::default().with_trim_chunks(true),
+        }
     }

-    pub fn add(&mut self, document: Document) {
+    pub async fn add(&mut self, document: Document) {
         // Delete the document if it already exists
         self.writer
             .delete_term(Term::from_field_text(self.doc.field_id, &document.id));

+        let Some(embedding_tokens) = self.compute_embedding_tokens(&document.body).await else {
+            warn!(
+                "Failed to compute embedding tokens for document '{}'",
+                document.id
+            );
+            return;
+        };
+
+        let mut doc = doc! {
+            self.doc.field_id => document.id,
+            self.doc.field_title => document.title,
+            self.doc.field_link => document.link,
+            self.doc.field_body => document.body,
+        };
+
+        for token in embedding_tokens {
+            doc.add_text(self.doc.field_embedding_token, token);
+        }
+
         // Add the document
         self.writer
-            .add_document(doc! {
-                self.doc.field_id => document.id,
-                // FIXME: compute embedding token
-                self.doc.field_title => document.title,
-                self.doc.field_link => document.link,
-                self.doc.field_snippet => document.snippet,
-            })
+            .add_document(doc)
             .expect("Failed to add document");
     }

+    /// This function splits the document into chunks and computes the embedding for each chunk. It then converts the embeddings
+    /// into binarized tokens by thresholding on zero.
+    ///
+    /// The current implementation deduplicates tokens at the document level, but this may require further consideration in the future.
+    async fn compute_embedding_tokens(&self, content: &str) -> Option<Vec<String>> {
+        let mut tokens = HashSet::new();
+        for chunk in self.splitter.chunks(content, CHUNK_SIZE) {
+            let embedding = match self.embedding.embed(chunk).await {
+                Ok(embedding) => embedding,
+                Err(e) => {
+                    warn!("Failed to embed document: {}", e);
+                    return None;
+                }
+            };
+
+            for (i, value) in embedding.iter().enumerate() {
+                if *value > 0.0 {
+                    tokens.insert(make_embedding_token(i));
+                }
+            }
+        }
+
+        Some(tokens.into_iter().collect())
+    }
+
     pub fn delete(&mut self, id: &str) {
         self.writer
             .delete_term(Term::from_field_text(self.doc.field_id, id));
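For readers of the `compute_embedding_tokens` scaffold above: each chunk embedding is reduced to a set of sparse terms by keeping only the dimensions whose value is positive, and the terms are deduplicated across chunks before being written to the `embedding_token` field. Below is a minimal, self-contained sketch of that binarization step; the `binarize_embeddings` helper and the example vectors are illustrative only, while `make_embedding_token` mirrors the helper introduced in this commit.

```rust
use std::collections::HashSet;

/// Mirrors the token naming used in the diff above.
fn make_embedding_token(i: usize) -> String {
    format!("embedding_{i}")
}

/// Binarize chunk embeddings by thresholding on zero and deduplicate
/// the resulting tokens across all chunks of a document.
fn binarize_embeddings(chunk_embeddings: &[Vec<f32>]) -> Vec<String> {
    let mut tokens = HashSet::new();
    for embedding in chunk_embeddings {
        for (i, value) in embedding.iter().enumerate() {
            if *value > 0.0 {
                tokens.insert(make_embedding_token(i));
            }
        }
    }
    tokens.into_iter().collect()
}

fn main() {
    // Two hypothetical chunk embeddings; the real ones come from the Embedding service.
    let chunks = vec![
        vec![0.12, -0.40, 0.03, -0.91],
        vec![-0.05, 0.22, 0.10, -0.33],
    ];
    let mut tokens = binarize_embeddings(&chunks);
    tokens.sort();
    // Dimensions 0 and 2 are positive in the first chunk; 1 and 2 in the second.
    // After deduplication: ["embedding_0", "embedding_1", "embedding_2"]
    println!("{tokens:?}");
}
```

Since each positive dimension becomes an ordinary term, the existing tantivy inverted index can later match documents against a query's binarized embedding, which appears to be the intent of the `embedding_token` STRING field in `DocSearchSchema`.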
