diff --git a/crates/tabby-common/src/index/doc.rs b/crates/tabby-common/src/index/doc.rs index f379afdcd40b..4cb3e91c5c71 100644 --- a/crates/tabby-common/src/index/doc.rs +++ b/crates/tabby-common/src/index/doc.rs @@ -2,6 +2,8 @@ use tantivy::schema::{Field, Schema, STORED, STRING}; pub struct DocSearchSchema { pub schema: Schema, + pub field_id: Field, + /// Binarized embedding tokens with the following mapping: /// * [-1, 0] -> 0 /// * (0, 1] -> 1 @@ -16,6 +18,7 @@ impl DocSearchSchema { pub fn new() -> Self { let mut builder = Schema::builder(); + let field_id = builder.add_text_field("id", STRING | STORED); let field_embedding_token = builder.add_text_field("embedding_token", STRING); let field_title = builder.add_text_field("title", STORED); let field_link = builder.add_text_field("link", STORED); @@ -25,6 +28,7 @@ impl DocSearchSchema { Self { schema, + field_id, field_embedding_token, field_title, field_link, diff --git a/crates/tabby-common/src/path.rs b/crates/tabby-common/src/path.rs index f9feb8f72a39..3d43d51dc9dc 100644 --- a/crates/tabby-common/src/path.rs +++ b/crates/tabby-common/src/path.rs @@ -41,6 +41,10 @@ pub fn index_dir() -> PathBuf { tabby_root().join("index") } +pub fn doc_index_dir() -> PathBuf { + tabby_root().join("doc_index") +} + pub fn models_dir() -> PathBuf { if let Some(cache_root) = &*TABBY_MODEL_CACHE_ROOT { cache_root.clone() diff --git a/crates/tabby-scheduler/src/code/index.rs b/crates/tabby-scheduler/src/code/index.rs index d354bdd6a08d..511d54ea8596 100644 --- a/crates/tabby-scheduler/src/code/index.rs +++ b/crates/tabby-scheduler/src/code/index.rs @@ -1,19 +1,14 @@ -use std::{fs, path::Path}; - use ignore::Walk; use kv::Batch; -use tabby_common::{ - config::RepositoryConfig, - index::{register_tokenizers, CodeSearchSchema}, - path, -}; -use tantivy::{directory::MmapDirectory, doc, Index, Term}; -use tracing::{debug, warn}; +use tabby_common::{config::RepositoryConfig, index::CodeSearchSchema, path}; +use tantivy::{doc, 
Index, Term}; +use tracing::warn; use super::{ cache::CacheStore, intelligence::{CodeIntelligence, SourceFile}, }; +use crate::tantivy_utils::open_or_create_index; // Magic numbers static MAX_LINE_LENGTH_THRESHOLD: usize = 300; @@ -21,13 +16,13 @@ static AVG_LINE_LENGTH_THRESHOLD: f32 = 150f32; pub fn index_repository(cache: &mut CacheStore, repository: &RepositoryConfig) { let code = CodeSearchSchema::default(); - let index = open_or_create_index(&code, &path::index_dir()); + let index = open_or_create_index(&code.schema, &path::index_dir()); add_changed_documents(cache, &code, repository, &index); } pub fn garbage_collection(cache: &mut CacheStore) { let code = CodeSearchSchema::default(); - let index = open_or_create_index(&code, &path::index_dir()); + let index = open_or_create_index(&code.schema, &path::index_dir()); remove_staled_documents(cache, &code, &index); } @@ -37,8 +32,6 @@ fn add_changed_documents( repository: &RepositoryConfig, index: &Index, ) { - register_tokenizers(index); - // Initialize the search index writer with an initial arena size of 150 MB. 
let mut writer = index .writer(150_000_000) @@ -129,26 +122,3 @@ fn is_valid_file(file: &SourceFile) -> bool { file.max_line_length <= MAX_LINE_LENGTH_THRESHOLD && file.avg_line_length <= AVG_LINE_LENGTH_THRESHOLD } - -fn open_or_create_index(code: &CodeSearchSchema, path: &Path) -> Index { - match open_or_create_index_impl(code, path) { - Ok(index) => index, - Err(err) => { - warn!( - "Failed to open index repositories: {}, removing index directory '{}'...", - err, - path.display() - ); - fs::remove_dir_all(path).expect("Failed to remove index directory"); - - debug!("Reopening index repositories..."); - open_or_create_index_impl(code, path).expect("Failed to open index") - } - } -} - -fn open_or_create_index_impl(code: &CodeSearchSchema, path: &Path) -> tantivy::Result<Index> { - fs::create_dir_all(path).expect("Failed to create index directory"); - let directory = MmapDirectory::open(path).expect("Failed to open index directory"); - Index::open_or_create(directory, code.schema.clone()) -} diff --git a/crates/tabby-scheduler/src/doc/mod.rs b/crates/tabby-scheduler/src/doc/mod.rs new file mode 100644 index 000000000000..2f44a0959dd8 --- /dev/null +++ b/crates/tabby-scheduler/src/doc/mod.rs @@ -0,0 +1,57 @@ +use tabby_common::{index::DocSearchSchema, path}; +use tantivy::{doc, Index, IndexWriter, Term}; +use tracing::warn; + +use crate::tantivy_utils::open_or_create_index; + +struct Document { + pub id: String, + pub title: String, + pub link: String, + pub snippet: String, +} + +struct DocIndex { + doc: DocSearchSchema, + index: Index, + writer: IndexWriter, +} + +impl DocIndex { + pub fn new() -> Self { + let doc = DocSearchSchema::default(); + let index = open_or_create_index(&doc.schema, &path::doc_index_dir()); + let writer = index + .writer(150_000_000) + .expect("Failed to create index writer"); + + Self { doc, index, writer } + } + + pub fn add(&mut self, document: Document) { + // Delete the document if it already exists + self.writer + 
.delete_term(Term::from_field_text(self.doc.field_id, &document.id)); + + // Add the document + self.writer + .add_document(doc! { + self.doc.field_id => document.id, + // FIXME: compute embedding token + self.doc.field_title => document.title, + self.doc.field_link => document.link, + self.doc.field_snippet => document.snippet, + }) + .expect("Failed to add document"); + } + + pub fn delete(&mut self, id: &str) { + self.writer + .delete_term(Term::from_field_text(self.doc.field_id, id)); + } + + pub fn commit(mut self) { + self.writer.commit().expect("Failed to commit changes"); + self.writer.wait_merging_threads().expect("Failed to wait for merging threads"); + } +} \ No newline at end of file diff --git a/crates/tabby-scheduler/src/lib.rs b/crates/tabby-scheduler/src/lib.rs index da827b302277..cdcdb36e6183 100644 --- a/crates/tabby-scheduler/src/lib.rs +++ b/crates/tabby-scheduler/src/lib.rs @@ -1,9 +1,12 @@ //! Responsible for scheduling all of the background jobs for tabby. //! Includes syncing respositories and updating indices. 
+ +pub mod crawl; + mod code; pub use code::CodeIndex; -pub mod crawl; +mod doc; use std::sync::Arc; @@ -66,3 +69,36 @@ fn scheduler_pipeline(repositories: &[RepositoryConfig]) { code.garbage_collection(); } + +mod tantivy_utils { + use std::{fs, path::Path}; + + use tabby_common::index::register_tokenizers; + use tantivy::{directory::MmapDirectory, schema::Schema, Index}; + use tracing::{debug, warn}; + + pub fn open_or_create_index(code: &Schema, path: &Path) -> Index { + let index = match open_or_create_index_impl(code, path) { + Ok(index) => index, + Err(err) => { + warn!( + "Failed to open index repositories: {}, removing index directory '{}'...", + err, + path.display() + ); + fs::remove_dir_all(path).expect("Failed to remove index directory"); + + debug!("Reopening index repositories..."); + open_or_create_index_impl(code, path).expect("Failed to open index") + } + }; + register_tokenizers(&index); + index + } + + fn open_or_create_index_impl(code: &Schema, path: &Path) -> tantivy::Result<Index> { + fs::create_dir_all(path).expect("Failed to create index directory"); + let directory = MmapDirectory::open(path).expect("Failed to open index directory"); + Index::open_or_create(directory, code.clone()) + } +}