Skip to content

Commit

Permalink
refactor(code): simplify token handling and improve chunk body formatting
Browse files Browse the repository at this point in the history
  • Loading branch information
wsxiaoys committed Nov 13, 2024
1 parent 5adb227 commit a157843
Showing 1 changed file with 3 additions and 12 deletions.
15 changes: 3 additions & 12 deletions crates/tabby-index/src/code/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -88,23 +88,19 @@ impl IndexAttributeBuilder<SourceCode> for CodeBuilder {

let source_code = source_code.clone();
let s = stream! {
let filepath_embedding_tokens =
build_binarize_embedding_tokens(embedding.clone(), &source_code.filepath).await;

for await (start_line, body) in CodeIntelligence::chunks(&text, &source_code.language) {
let attributes = json!({
code::fields::CHUNK_FILEPATH: source_code.filepath,
code::fields::CHUNK_GIT_URL: source_code.git_url,
code::fields::CHUNK_LANGUAGE: source_code.language,
code::fields::CHUNK_BODY: body,
code::fields::CHUNK_BODY: body,

Check warning on line 96 in crates/tabby-index/src/code/mod.rs

View check run for this annotation

Codecov / codecov/patch

crates/tabby-index/src/code/mod.rs#L96

Added line #L96 was not covered by tests
code::fields::CHUNK_START_LINE: start_line,
});

let embedding = embedding.clone();
let filepath_embedding_tokens = filepath_embedding_tokens.clone();
let rewritten_body = format!("```{}\n{}\n```", source_code.filepath, body);

Check warning on line 101 in crates/tabby-index/src/code/mod.rs

View check run for this annotation

Codecov / codecov/patch

crates/tabby-index/src/code/mod.rs#L101

Added line #L101 was not covered by tests
yield tokio::spawn(async move {
let tokens = build_binarize_embedding_tokens(embedding.clone(), &body).await;
let tokens= merge_tokens(vec![filepath_embedding_tokens, tokens]);
let tokens = build_binarize_embedding_tokens(embedding.clone(), &rewritten_body).await;

Check warning on line 103 in crates/tabby-index/src/code/mod.rs

View check run for this annotation

Codecov / codecov/patch

crates/tabby-index/src/code/mod.rs#L103

Added line #L103 was not covered by tests
(tokens, attributes)
});
}
Expand All @@ -131,11 +127,6 @@ async fn build_binarize_embedding_tokens(embedding: Arc<dyn Embedding>, body: &s
tokens
}

/// Merges several token lists into a single list with duplicates removed.
///
/// Every inner vector is drained into one `HashSet`, so each distinct token
/// appears exactly once in the result. The order of the returned tokens is
/// the set's iteration order and is therefore unspecified.
pub fn merge_tokens(tokens: Vec<Vec<String>>) -> Vec<String> {
    let mut unique: HashSet<String> = HashSet::new();
    for group in tokens {
        // `extend` silently drops tokens that are already present.
        unique.extend(group);
    }
    unique.into_iter().collect()
}

fn create_code_builder(embedding: Option<Arc<dyn Embedding>>) -> TantivyDocBuilder<SourceCode> {
let builder = CodeBuilder::new(embedding);
TantivyDocBuilder::new(corpus::CODE, builder)
Expand Down

0 comments on commit a157843

Please sign in to comment.