Skip to content

Commit

Permalink
feat(scheduler): switch code indexing implementation to text splitter. (#1868)
Browse files Browse the repository at this point in the history

* feat(scheduler): switch code indexing implementation to text splitter.

* update

* update index
  • Loading branch information
wsxiaoys authored Apr 18, 2024
1 parent 138ef83 commit 9e06df3
Show file tree
Hide file tree
Showing 9 changed files with 124 additions and 227 deletions.
77 changes: 58 additions & 19 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 0 additions & 2 deletions crates/tabby-common/src/api/code.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,7 @@ pub struct HitDocument {
pub body: String,
pub filepath: String,
pub git_url: String,
pub kind: String,
pub language: String,
pub name: String,
}

#[derive(Error, Debug)]
Expand Down
29 changes: 4 additions & 25 deletions crates/tabby-common/src/index.rs
Original file line number Diff line number Diff line change
@@ -1,35 +1,25 @@
use tantivy::{
query::{TermQuery, TermSetQuery},
schema::{Field, IndexRecordOption, Schema, TextFieldIndexing, TextOptions, STORED, STRING},
tokenizer::{NgramTokenizer, RegexTokenizer, RemoveLongFilter, TextAnalyzer},
tokenizer::{RegexTokenizer, RemoveLongFilter, TextAnalyzer},
Index, Term,
};

static CODE_TOKENIZER: &str = "code";
static IDENTIFIER_TOKENIZER: &str = "identifier";

pub fn register_tokenizers(index: &Index) {
let code_tokenizer = TextAnalyzer::builder(RegexTokenizer::new(r"(?:\w+)").unwrap())
.filter(RemoveLongFilter::limit(128))
.filter(RemoveLongFilter::limit(64))
.build();

index.tokenizers().register(CODE_TOKENIZER, code_tokenizer);

let identifier_tokenzier =
TextAnalyzer::builder(NgramTokenizer::prefix_only(2, 5).unwrap()).build();

index
.tokenizers()
.register(IDENTIFIER_TOKENIZER, identifier_tokenzier);
}

pub struct CodeSearchSchema {
pub schema: Schema,
pub field_git_url: Field,
pub field_filepath: Field,
pub field_language: Field,
pub field_name: Field,
pub field_kind: Field,
pub field_body: Field,
}

Expand All @@ -39,23 +29,14 @@ impl CodeSearchSchema {

let code_indexing_options = TextFieldIndexing::default()
.set_tokenizer(CODE_TOKENIZER)
.set_index_option(tantivy::schema::IndexRecordOption::WithFreqsAndPositions);
.set_index_option(tantivy::schema::IndexRecordOption::WithFreqs);
let code_options = TextOptions::default()
.set_indexing_options(code_indexing_options)
.set_stored();

let name_indexing_options = TextFieldIndexing::default()
.set_tokenizer(IDENTIFIER_TOKENIZER)
.set_index_option(tantivy::schema::IndexRecordOption::WithFreqsAndPositions);
let name_options = TextOptions::default()
.set_indexing_options(name_indexing_options)
.set_stored();

let field_git_url = builder.add_text_field("git_url", STRING | STORED);
let field_filepath = builder.add_text_field("filepath", STRING | STORED);
let field_language = builder.add_text_field("language", STRING | STORED);
let field_name = builder.add_text_field("name", name_options);
let field_kind = builder.add_text_field("kind", STRING | STORED);
let field_body = builder.add_text_field("body", code_options);
let schema = builder.build();

Expand All @@ -64,8 +45,6 @@ impl CodeSearchSchema {
field_git_url,
field_filepath,
field_language,
field_name,
field_kind,
field_body,
}
}
Expand All @@ -87,7 +66,7 @@ impl CodeSearchSchema {
};
Box::new(TermQuery::new(
Term::from_field_text(self.field_language, language),
IndexRecordOption::WithFreqsAndPositions,
IndexRecordOption::Basic,
))
}

Expand Down
12 changes: 11 additions & 1 deletion crates/tabby-common/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ use std::{
fs::File,
io::{BufReader, Error},
ops::Range,
path::PathBuf,
path::{Path, PathBuf},
};

use path::dataset_dir;
Expand Down Expand Up @@ -47,6 +47,16 @@ impl SourceFile {
});
Ok(iter)
}

pub fn read_content(&self) -> std::io::Result<String> {
let path = Path::new(&self.basedir).join(&self.filepath);
std::fs::read_to_string(path)
}

pub fn read_file_size(&self) -> usize {
let path = Path::new(&self.basedir).join(&self.filepath);
std::fs::metadata(path).map(|x| x.len()).unwrap_or_default() as usize
}
}

#[derive(Serialize, Deserialize, Clone, Debug)]
Expand Down
1 change: 1 addition & 0 deletions crates/tabby-scheduler/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ tokio = { workspace = true, features = ["process"] }
package-lock-json-parser = "0.4.0"
npm-package-json = "0.1.3"
yarn-lock-parser = "0.7.0"
text-splitter = "0.10.0"

[dev-dependencies]
temp_testdir = { workspace = true }
Expand Down
11 changes: 11 additions & 0 deletions crates/tabby-scheduler/src/code/mod.rs
Original file line number Diff line number Diff line change
@@ -1,16 +1,19 @@
use tabby_common::{Point, Tag};
use text_splitter::{Characters, TextSplitter};
use tree_sitter_tags::TagsContext;

mod languages;

pub struct CodeIntelligence {
context: TagsContext,
splitter: TextSplitter<Characters>,
}

impl Default for CodeIntelligence {
fn default() -> Self {
Self {
context: TagsContext::new(),
splitter: TextSplitter::default().with_trim_chunks(true),
}
}
}
Expand Down Expand Up @@ -49,4 +52,12 @@ impl CodeIntelligence {
})
.collect()
}

// FIXME(meng): implement with treesitter based CodeSplitter.
pub fn chunks<'splitter, 'text: 'splitter>(
&'splitter self,
text: &'text str,
) -> impl Iterator<Item = &'text str> + 'splitter {
self.splitter.chunks(text, 192)
}
}
Loading

0 comments on commit 9e06df3

Please sign in to comment.