Skip to content

Commit

Permalink
feat(scheduler): switch code indexing implementation to text splitter. (#1868)
Browse files Browse the repository at this point in the history

* feat(scheduler): switch code indexing implementation to text splitter.

* update

* update index
  • Loading branch information
wsxiaoys authored Apr 18, 2024
1 parent 138ef83 commit 9e06df3
Show file tree
Hide file tree
Showing 9 changed files with 124 additions and 227 deletions.
77 changes: 58 additions & 19 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 0 additions & 2 deletions crates/tabby-common/src/api/code.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,7 @@ pub struct HitDocument {
pub body: String,
pub filepath: String,
pub git_url: String,
pub kind: String,
pub language: String,
pub name: String,
}

#[derive(Error, Debug)]
Expand Down
29 changes: 4 additions & 25 deletions crates/tabby-common/src/index.rs
Original file line number Diff line number Diff line change
@@ -1,35 +1,25 @@
use tantivy::{
query::{TermQuery, TermSetQuery},
schema::{Field, IndexRecordOption, Schema, TextFieldIndexing, TextOptions, STORED, STRING},
tokenizer::{NgramTokenizer, RegexTokenizer, RemoveLongFilter, TextAnalyzer},
tokenizer::{RegexTokenizer, RemoveLongFilter, TextAnalyzer},
Index, Term,
};

static CODE_TOKENIZER: &str = "code";
static IDENTIFIER_TOKENIZER: &str = "identifier";

pub fn register_tokenizers(index: &Index) {
let code_tokenizer = TextAnalyzer::builder(RegexTokenizer::new(r"(?:\w+)").unwrap())
.filter(RemoveLongFilter::limit(128))
.filter(RemoveLongFilter::limit(64))
.build();

index.tokenizers().register(CODE_TOKENIZER, code_tokenizer);

let identifier_tokenzier =
TextAnalyzer::builder(NgramTokenizer::prefix_only(2, 5).unwrap()).build();

index
.tokenizers()
.register(IDENTIFIER_TOKENIZER, identifier_tokenzier);
}

pub struct CodeSearchSchema {
pub schema: Schema,
pub field_git_url: Field,
pub field_filepath: Field,
pub field_language: Field,
pub field_name: Field,
pub field_kind: Field,
pub field_body: Field,
}

Expand All @@ -39,23 +29,14 @@ impl CodeSearchSchema {

let code_indexing_options = TextFieldIndexing::default()
.set_tokenizer(CODE_TOKENIZER)
.set_index_option(tantivy::schema::IndexRecordOption::WithFreqsAndPositions);
.set_index_option(tantivy::schema::IndexRecordOption::WithFreqs);
let code_options = TextOptions::default()
.set_indexing_options(code_indexing_options)
.set_stored();

let name_indexing_options = TextFieldIndexing::default()
.set_tokenizer(IDENTIFIER_TOKENIZER)
.set_index_option(tantivy::schema::IndexRecordOption::WithFreqsAndPositions);
let name_options = TextOptions::default()
.set_indexing_options(name_indexing_options)
.set_stored();

let field_git_url = builder.add_text_field("git_url", STRING | STORED);
let field_filepath = builder.add_text_field("filepath", STRING | STORED);
let field_language = builder.add_text_field("language", STRING | STORED);
let field_name = builder.add_text_field("name", name_options);
let field_kind = builder.add_text_field("kind", STRING | STORED);
let field_body = builder.add_text_field("body", code_options);
let schema = builder.build();

Expand All @@ -64,8 +45,6 @@ impl CodeSearchSchema {
field_git_url,
field_filepath,
field_language,
field_name,
field_kind,
field_body,
}
}
Expand All @@ -87,7 +66,7 @@ impl CodeSearchSchema {
};
Box::new(TermQuery::new(
Term::from_field_text(self.field_language, language),
IndexRecordOption::WithFreqsAndPositions,
IndexRecordOption::Basic,
))
}

Expand Down
12 changes: 11 additions & 1 deletion crates/tabby-common/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ use std::{
fs::File,
io::{BufReader, Error},
ops::Range,
path::PathBuf,
path::{Path, PathBuf},
};

use path::dataset_dir;
Expand Down Expand Up @@ -47,6 +47,16 @@ impl SourceFile {
});
Ok(iter)
}

pub fn read_content(&self) -> std::io::Result<String> {
let path = Path::new(&self.basedir).join(&self.filepath);
std::fs::read_to_string(path)
}

pub fn read_file_size(&self) -> usize {
let path = Path::new(&self.basedir).join(&self.filepath);
std::fs::metadata(path).map(|x| x.len()).unwrap_or_default() as usize
}
}

#[derive(Serialize, Deserialize, Clone, Debug)]
Expand Down
1 change: 1 addition & 0 deletions crates/tabby-scheduler/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ tokio = { workspace = true, features = ["process"] }
package-lock-json-parser = "0.4.0"
npm-package-json = "0.1.3"
yarn-lock-parser = "0.7.0"
text-splitter = "0.10.0"

[dev-dependencies]
temp_testdir = { workspace = true }
Expand Down
11 changes: 11 additions & 0 deletions crates/tabby-scheduler/src/code/mod.rs
Original file line number Diff line number Diff line change
@@ -1,16 +1,19 @@
use tabby_common::{Point, Tag};
use text_splitter::{Characters, TextSplitter};
use tree_sitter_tags::TagsContext;

mod languages;

pub struct CodeIntelligence {
context: TagsContext,
splitter: TextSplitter<Characters>,
}

impl Default for CodeIntelligence {
fn default() -> Self {
Self {
context: TagsContext::new(),
splitter: TextSplitter::default().with_trim_chunks(true),
}
}
}
Expand Down Expand Up @@ -49,4 +52,12 @@ impl CodeIntelligence {
})
.collect()
}

// FIXME(meng): implement with treesitter based CodeSplitter.
pub fn chunks<'splitter, 'text: 'splitter>(
&'splitter self,
text: &'text str,
) -> impl Iterator<Item = &'text str> + 'splitter {
self.splitter.chunks(text, 192)
}
}
Loading

0 comments on commit 9e06df3

Please sign in to comment.