From a15784342abbd6f3efe9f0a93b3313706e305567 Mon Sep 17 00:00:00 2001 From: Meng Zhang Date: Tue, 12 Nov 2024 22:29:19 -0800 Subject: [PATCH] refactor(code): simplify token handling and improve chunk body formatting --- crates/tabby-index/src/code/mod.rs | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/crates/tabby-index/src/code/mod.rs b/crates/tabby-index/src/code/mod.rs index 3ba33ebf704a..d480a50174e4 100644 --- a/crates/tabby-index/src/code/mod.rs +++ b/crates/tabby-index/src/code/mod.rs @@ -88,23 +88,19 @@ impl IndexAttributeBuilder for CodeBuilder { let source_code = source_code.clone(); let s = stream! { - let filepath_embedding_tokens = - build_binarize_embedding_tokens(embedding.clone(), &source_code.filepath).await; - for await (start_line, body) in CodeIntelligence::chunks(&text, &source_code.language) { let attributes = json!({ code::fields::CHUNK_FILEPATH: source_code.filepath, code::fields::CHUNK_GIT_URL: source_code.git_url, code::fields::CHUNK_LANGUAGE: source_code.language, - code::fields::CHUNK_BODY: body, + code::fields::CHUNK_BODY: body, code::fields::CHUNK_START_LINE: start_line, }); let embedding = embedding.clone(); - let filepath_embedding_tokens = filepath_embedding_tokens.clone(); + let rewritten_body = format!("```{}\n{}\n```", source_code.filepath, body); yield tokio::spawn(async move { - let tokens = build_binarize_embedding_tokens(embedding.clone(), &body).await; - let tokens= merge_tokens(vec![filepath_embedding_tokens, tokens]); + let tokens = build_binarize_embedding_tokens(embedding.clone(), &rewritten_body).await; (tokens, attributes) }); } @@ -131,11 +127,6 @@ async fn build_binarize_embedding_tokens(embedding: Arc, body: &s tokens } -pub fn merge_tokens(tokens: Vec>) -> Vec { - let tokens = tokens.into_iter().flatten().collect::>(); - tokens.into_iter().collect() -} - fn create_code_builder(embedding: Option>) -> TantivyDocBuilder { let builder = CodeBuilder::new(embedding); TantivyDocBuilder::new(corpus::CODE, builder)