Skip to content

Commit

Permalink
chore: add scaffold of doc embedding
Browse files Browse the repository at this point in the history
  • Loading branch information
wsxiaoys committed May 13, 2024
1 parent c50806a commit 8b65a3b
Show file tree
Hide file tree
Showing 5 changed files with 108 additions and 37 deletions.
4 changes: 4 additions & 0 deletions crates/tabby-common/src/index/doc.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@ use tantivy::schema::{Field, Schema, STORED, STRING};

pub struct DocSearchSchema {
pub schema: Schema,
pub field_id: Field,

/// Binarized embedding tokens with the following mapping:
/// * [-1, 0] -> 0
/// * (0, 1] -> 1
Expand All @@ -16,6 +18,7 @@ impl DocSearchSchema {
pub fn new() -> Self {
let mut builder = Schema::builder();

let field_id = builder.add_text_field("id", STRING | STORED);

Check warning on line 21 in crates/tabby-common/src/index/doc.rs

View check run for this annotation

Codecov / codecov/patch

crates/tabby-common/src/index/doc.rs#L21

Added line #L21 was not covered by tests
let field_embedding_token = builder.add_text_field("embedding_token", STRING);
let field_title = builder.add_text_field("title", STORED);
let field_link = builder.add_text_field("link", STORED);
Expand All @@ -25,6 +28,7 @@ impl DocSearchSchema {

Self {
schema,
field_id,

Check warning on line 31 in crates/tabby-common/src/index/doc.rs

View check run for this annotation

Codecov / codecov/patch

crates/tabby-common/src/index/doc.rs#L31

Added line #L31 was not covered by tests
field_embedding_token,
field_title,
field_link,
Expand Down
4 changes: 4 additions & 0 deletions crates/tabby-common/src/path.rs
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,10 @@ pub fn index_dir() -> PathBuf {
tabby_root().join("index")
}

/// Returns the `doc_index` directory located directly under the tabby root directory.
pub fn doc_index_dir() -> PathBuf {
    let root = tabby_root();
    root.join("doc_index")
}

Check warning on line 46 in crates/tabby-common/src/path.rs

View check run for this annotation

Codecov / codecov/patch

crates/tabby-common/src/path.rs#L44-L46

Added lines #L44 - L46 were not covered by tests

pub fn models_dir() -> PathBuf {
if let Some(cache_root) = &*TABBY_MODEL_CACHE_ROOT {
cache_root.clone()
Expand Down
42 changes: 6 additions & 36 deletions crates/tabby-scheduler/src/code/index.rs
Original file line number Diff line number Diff line change
@@ -1,33 +1,28 @@
use std::{fs, path::Path};

use ignore::Walk;
use kv::Batch;
use tabby_common::{
config::RepositoryConfig,
index::{register_tokenizers, CodeSearchSchema},
path,
};
use tantivy::{directory::MmapDirectory, doc, Index, Term};
use tracing::{debug, warn};
use tabby_common::{config::RepositoryConfig, index::CodeSearchSchema, path};
use tantivy::{doc, Index, Term};
use tracing::warn;

use super::{
cache::CacheStore,
intelligence::{CodeIntelligence, SourceFile},
};
use crate::tantivy_utils::open_or_create_index;

// Magic numbers
static MAX_LINE_LENGTH_THRESHOLD: usize = 300;
static AVG_LINE_LENGTH_THRESHOLD: f32 = 150f32;

pub fn index_repository(cache: &mut CacheStore, repository: &RepositoryConfig) {
let code = CodeSearchSchema::default();
let index = open_or_create_index(&code, &path::index_dir());
let index = open_or_create_index(&code.schema, &path::index_dir());

Check warning on line 19 in crates/tabby-scheduler/src/code/index.rs

View check run for this annotation

Codecov / codecov/patch

crates/tabby-scheduler/src/code/index.rs#L19

Added line #L19 was not covered by tests
add_changed_documents(cache, &code, repository, &index);
}

pub fn garbage_collection(cache: &mut CacheStore) {
let code = CodeSearchSchema::default();
let index = open_or_create_index(&code, &path::index_dir());
let index = open_or_create_index(&code.schema, &path::index_dir());

Check warning on line 25 in crates/tabby-scheduler/src/code/index.rs

View check run for this annotation

Codecov / codecov/patch

crates/tabby-scheduler/src/code/index.rs#L25

Added line #L25 was not covered by tests
remove_staled_documents(cache, &code, &index);
}

Expand All @@ -37,8 +32,6 @@ fn add_changed_documents(
repository: &RepositoryConfig,
index: &Index,
) {
register_tokenizers(index);

// Initialize the search index writer with an initial arena size of 150 MB.
let mut writer = index
.writer(150_000_000)
Expand Down Expand Up @@ -129,26 +122,3 @@ fn is_valid_file(file: &SourceFile) -> bool {
file.max_line_length <= MAX_LINE_LENGTH_THRESHOLD
&& file.avg_line_length <= AVG_LINE_LENGTH_THRESHOLD
}

/// Opens the index stored at `path`, creating it when necessary.
///
/// If the first attempt fails, a warning is logged, the whole index directory is
/// removed, and the open is retried once on the now-empty location.
///
/// # Panics
///
/// Panics if the index directory cannot be removed or if the retry fails as well.
fn open_or_create_index(code: &CodeSearchSchema, path: &Path) -> Index {
    open_or_create_index_impl(code, path).unwrap_or_else(|err| {
        warn!(
            "Failed to open index repositories: {}, removing index directory '{}'...",
            err,
            path.display()
        );
        fs::remove_dir_all(path).expect("Failed to remove index directory");

        debug!("Reopening index repositories...");
        open_or_create_index_impl(code, path).expect("Failed to open index")
    })
}

/// Single attempt at opening (or creating) the index under `path`.
///
/// The directory is created first if it does not exist. Failures from
/// `Index::open_or_create` are returned to the caller.
///
/// # Panics
///
/// Panics if the directory cannot be created or cannot be opened as an `MmapDirectory`.
fn open_or_create_index_impl(code: &CodeSearchSchema, path: &Path) -> tantivy::Result<Index> {
    fs::create_dir_all(path).expect("Failed to create index directory");

    let mmap_dir = MmapDirectory::open(path).expect("Failed to open index directory");
    let schema = code.schema.clone();
    Index::open_or_create(mmap_dir, schema)
}
57 changes: 57 additions & 0 deletions crates/tabby-scheduler/src/doc/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
use tabby_common::{index::DocSearchSchema, path};
use tantivy::{doc, Index, IndexWriter, Term};
use tracing::warn;

use crate::tantivy_utils::open_or_create_index;

/// A single document handed to `DocIndex::add` for indexing.
struct Document {
/// Identifier of the document. Written to the schema's `field_id` and used as the
/// term for deleting a previously indexed entry with the same id.
pub id: String,
/// Written to the schema's `field_title`.
pub title: String,
/// Written to the schema's `field_link`.
pub link: String,
/// Written to the schema's `field_snippet`.
pub snippet: String,
}

/// Bundles the doc search schema, the tantivy index opened for it, and a writer on
/// that index.
struct DocIndex {
/// Schema wrapper providing the field handles used when adding or deleting documents.
doc: DocSearchSchema,
/// Index opened (or created) at the doc index directory.
index: Index,
/// Writer through which documents are added, deleted and committed.
writer: IndexWriter,
}

impl DocIndex {
/// Opens (or creates) the doc index at `path::doc_index_dir()` using the default
/// `DocSearchSchema`, and attaches a writer requested with a 150_000_000-byte budget.
///
/// # Panics
///
/// Panics if the index writer cannot be created.
pub fn new() -> Self {
    let doc = DocSearchSchema::default();

    let index_dir = path::doc_index_dir();
    let index = open_or_create_index(&doc.schema, &index_dir);

    let writer = index
        .writer(150_000_000)
        .expect("Failed to create index writer");

    Self { doc, index, writer }
}

Check warning on line 29 in crates/tabby-scheduler/src/doc/mod.rs

View check run for this annotation

Codecov / codecov/patch

crates/tabby-scheduler/src/doc/mod.rs#L21-L29

Added lines #L21 - L29 were not covered by tests

pub fn add(&mut self, document: Document) {
// Delete the document if it already exists
self.writer
.delete_term(Term::from_field_text(self.doc.field_id, &document.id));

// Add the document
self.writer
.add_document(doc! {
self.doc.field_id => document.id,
// FIXME: compute embedding token
self.doc.field_title => document.title,
self.doc.field_link => document.link,
self.doc.field_snippet => document.snippet,
})
.expect("Failed to add document");
}

Check warning on line 46 in crates/tabby-scheduler/src/doc/mod.rs

View check run for this annotation

Codecov / codecov/patch

crates/tabby-scheduler/src/doc/mod.rs#L31-L46

Added lines #L31 - L46 were not covered by tests

/// Queues deletion of the documents whose id field matches `id`.
///
/// Nothing is committed here.
pub fn delete(&mut self, id: &str) {
    let id_term = Term::from_field_text(self.doc.field_id, id);
    self.writer.delete_term(id_term);
}

Check warning on line 51 in crates/tabby-scheduler/src/doc/mod.rs

View check run for this annotation

Codecov / codecov/patch

crates/tabby-scheduler/src/doc/mod.rs#L48-L51

Added lines #L48 - L51 were not covered by tests

/// Commits the writer's pending changes and then waits for its merging threads,
/// consuming the `DocIndex`.
///
/// # Panics
///
/// Panics if the commit fails or if waiting for the merging threads fails.
pub fn commit(mut self) {
    let commit_result = self.writer.commit();
    commit_result.expect("Failed to commit changes");

    let merge_result = self.writer.wait_merging_threads();
    merge_result.expect("Failed to wait for merging threads");
}

Check warning on line 56 in crates/tabby-scheduler/src/doc/mod.rs

View check run for this annotation

Codecov / codecov/patch

crates/tabby-scheduler/src/doc/mod.rs#L53-L56

Added lines #L53 - L56 were not covered by tests
}
38 changes: 37 additions & 1 deletion crates/tabby-scheduler/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
//! Responsible for scheduling all of the background jobs for tabby.
//! Includes syncing repositories and updating indices.
pub mod crawl;

mod code;
pub use code::CodeIndex;

pub mod crawl;
mod doc;

use std::sync::Arc;

Expand Down Expand Up @@ -66,3 +69,36 @@ fn scheduler_pipeline(repositories: &[RepositoryConfig]) {

code.garbage_collection();

Check warning on line 70 in crates/tabby-scheduler/src/lib.rs

View check run for this annotation

Codecov / codecov/patch

crates/tabby-scheduler/src/lib.rs#L70

Added line #L70 was not covered by tests
}

mod tantivy_utils {
use std::{fs, path::Path};

use tabby_common::index::register_tokenizers;
use tantivy::{directory::MmapDirectory, schema::Schema, Index};
use tracing::{debug, warn};

pub fn open_or_create_index(code: &Schema, path: &Path) -> Index {
let index = match open_or_create_index_impl(code, path) {
Ok(index) => index,
Err(err) => {
warn!(
"Failed to open index repositories: {}, removing index directory '{}'...",
err,
path.display()
);
fs::remove_dir_all(path).expect("Failed to remove index directory");

debug!("Reopening index repositories...");
open_or_create_index_impl(code, path).expect("Failed to open index")

Check warning on line 92 in crates/tabby-scheduler/src/lib.rs

View check run for this annotation

Codecov / codecov/patch

crates/tabby-scheduler/src/lib.rs#L80-L92

Added lines #L80 - L92 were not covered by tests
}
};
register_tokenizers(&index);
index

Check warning on line 96 in crates/tabby-scheduler/src/lib.rs

View check run for this annotation

Codecov / codecov/patch

crates/tabby-scheduler/src/lib.rs#L95-L96

Added lines #L95 - L96 were not covered by tests
}

fn open_or_create_index_impl(code: &Schema, path: &Path) -> tantivy::Result<Index> {
fs::create_dir_all(path).expect("Failed to create index directory");
let directory = MmapDirectory::open(path).expect("Failed to open index directory");
Index::open_or_create(directory, code.clone())

Check warning on line 102 in crates/tabby-scheduler/src/lib.rs

View check run for this annotation

Codecov / codecov/patch

crates/tabby-scheduler/src/lib.rs#L99-L102

Added lines #L99 - L102 were not covered by tests
}
}

0 comments on commit 8b65a3b

Please sign in to comment.