From cbf34f6d595995ef66af4f5a60799af6e7f478f8 Mon Sep 17 00:00:00 2001
From: Wei Zhang
Date: Mon, 23 Dec 2024 11:13:08 +0800
Subject: [PATCH] feat(index): add commit to CodeSearchDocument (#3577)

* WIP: add commit in code search

Signed-off-by: Wei Zhang

* chore: fix tests

Signed-off-by: Wei Zhang

* chore: commit fields as optional currently

Signed-off-by: Wei Zhang

* chore: fix tests

Signed-off-by: Wei Zhang

* [autofix.ci] apply automated fixes

* chore: commit should be in doc.attribute

Signed-off-by: Wei Zhang

* [autofix.ci] apply automated fixes

* chore: add commit when create_hit

Signed-off-by: Wei Zhang

* chore: add comment to note commit is the last updated commit

Signed-off-by: Wei Zhang

* backfill commit in source code without redo calculate (#3587)

---------

Signed-off-by: Wei Zhang
Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
---
 crates/tabby-common/src/api/code.rs         |  5 ++
 crates/tabby-common/src/index/code/mod.rs   |  2 +
 crates/tabby-common/src/index/mod.rs        | 35 ++++++----
 crates/tabby-index/src/code/index.rs        | 59 ++++++++++++++--
 crates/tabby-index/src/code/intelligence.rs | 13 +++-
 crates/tabby-index/src/code/mod.rs          | 10 +--
 crates/tabby-index/src/code/repository.rs   | 24 ++++---
 crates/tabby-index/src/code/types.rs        |  1 +
 crates/tabby-index/src/indexer.rs           | 54 +++++++++++++++
 crates/tabby-index/src/indexer_tests.rs     |  3 +-
 crates/tabby/src/services/code.rs           | 68 ++++++++++++++---
 ee/tabby-db/src/threads.rs                  |  1 +
 ee/tabby-schema/graphql/schema.graphql      |  1 +
 ee/tabby-schema/src/dao.rs                  |  2 +
 ee/tabby-schema/src/schema/thread/types.rs  |  2 +
 ee/tabby-webserver/src/service/answer.rs    | 10 ++-
 .../src/service/answer/testutils/mod.rs     |  2 +
 17 files changed, 242 insertions(+), 50 deletions(-)

diff --git a/crates/tabby-common/src/api/code.rs b/crates/tabby-common/src/api/code.rs
index ecd4aafc12c5..b458e96ee416 100644
--- a/crates/tabby-common/src/api/code.rs
+++ b/crates/tabby-common/src/api/code.rs
@@ -30,6 +30,11 @@ pub struct CodeSearchDocument {
     pub body: String,
     pub filepath: String,
     pub git_url: String,
+
+    // FIXME(kweizh): This should be a required field after 0.25.0.
+    // commit represents the specific revision at which the file was last edited.
+    pub commit: Option<String>,
+
     pub language: String,
     pub start_line: usize,
 }
diff --git a/crates/tabby-common/src/index/code/mod.rs b/crates/tabby-common/src/index/code/mod.rs
index 7380b1966e4a..d829959d462b 100644
--- a/crates/tabby-common/src/index/code/mod.rs
+++ b/crates/tabby-common/src/index/code/mod.rs
@@ -10,6 +10,8 @@ use super::{corpus, IndexSchema};
 use crate::api::code::CodeSearchQuery;
 
 pub mod fields {
+    pub const ATTRIBUTE_COMMIT: &str = "commit";
+
    pub const CHUNK_GIT_URL: &str = "chunk_git_url";
    pub const CHUNK_FILEPATH: &str = "chunk_filepath";
    pub const CHUNK_LANGUAGE: &str = "chunk_language";
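Because `commit` records the revision a file was indexed at, a consumer of `CodeSearchDocument` can pin a snippet to that exact revision. Below is a minimal sketch of such a consumer; the GitHub-style `blob` URL layout and the `permalink` helper are illustrative assumptions, not part of this patch, and the field stays `Option<String>` because documents indexed before this change have no commit recorded.

```rust
// Illustrative only: mirrors the fields this patch adds to CodeSearchDocument.
struct CodeSnippet {
    git_url: String,
    commit: Option<String>, // None for documents indexed before the commit field existed
    filepath: String,
    start_line: usize,
}

// Assumption: a GitHub-style "<git_url>/blob/<rev>/<path>#L<line>" layout.
fn permalink(doc: &CodeSnippet) -> String {
    let rev = doc.commit.as_deref().unwrap_or("HEAD");
    format!(
        "{}/blob/{}/{}#L{}",
        doc.git_url.trim_end_matches(".git"),
        rev,
        doc.filepath,
        doc.start_line
    )
}

fn main() {
    let doc = CodeSnippet {
        git_url: "https://github.com/test/repo.git".into(),
        commit: Some("cbf34f6d595995ef66af4f5a60799af6e7f478f8".into()),
        filepath: "server.py".into(),
        start_line: 1,
    };
    println!("{}", permalink(&doc));
}
```

Falling back to `HEAD` when the commit is absent keeps older hits usable until the field becomes required after 0.25.0.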
diff --git a/crates/tabby-common/src/index/mod.rs b/crates/tabby-common/src/index/mod.rs
index c3676d122ce1..d92798bc2888 100644
--- a/crates/tabby-common/src/index/mod.rs
+++ b/crates/tabby-common/src/index/mod.rs
@@ -178,6 +178,21 @@ impl IndexSchema {
         ])
     }
 
+    /// Build a query to find the document with the given `doc_id`, include chunks.
+    pub fn doc_query_with_chunks(&self, corpus: &str, doc_id: &str) -> impl Query {
+        let doc_id_query = TermQuery::new(
+            Term::from_field_text(self.field_id, doc_id),
+            tantivy::schema::IndexRecordOption::Basic,
+        );
+
+        BooleanQuery::new(vec![
+            // Must match the corpus
+            (Occur::Must, self.corpus_query(corpus)),
+            // Must match the doc id
+            (Occur::Must, Box::new(doc_id_query)),
+        ])
+    }
+
     pub fn doc_indexed_after(
         &self,
         corpus: &str,
@@ -261,21 +276,11 @@ impl IndexSchema {
                     FIELD_ATTRIBUTES, field
                 ))),
             ),
-        ])
-    }
-
-    /// Build a query to find the document with the given `doc_id`, include chunks.
-    pub fn doc_query_with_chunks(&self, corpus: &str, doc_id: &str) -> impl Query {
-        let doc_id_query = TermQuery::new(
-            Term::from_field_text(self.field_id, doc_id),
-            tantivy::schema::IndexRecordOption::Basic,
-        );
-
-        BooleanQuery::new(vec![
-            // Must match the corpus
-            (Occur::Must, self.corpus_query(corpus)),
-            // Must match the doc id
-            (Occur::Must, Box::new(doc_id_query)),
+            // Exclude chunk documents
+            (
+                Occur::MustNot,
+                Box::new(ExistsQuery::new_exists_query(FIELD_CHUNK_ID.into())),
+            ),
         ])
     }
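The commit itself is stored under the document-level `attributes` JSON of the parent document (key `"commit"`, per `ATTRIBUTE_COMMIT`), not on chunk documents, which is why `doc_query` now excludes chunks and why lookups must tolerate a missing key for documents written by older versions. A rough sketch of that shape, using `serde_json` in place of the tantivy JSON field and a hypothetical `attribute_commit` helper:

```rust
use serde_json::{json, Value};

// Hypothetical helper mirroring the optional attribute lookup this patch introduces.
fn attribute_commit(attributes: &Value) -> Option<&str> {
    attributes.get("commit").and_then(Value::as_str)
}

fn main() {
    // Document written by a version with this patch applied.
    let new_doc = json!({ "commit": "cbf34f6d595995ef66af4f5a60799af6e7f478f8" });
    // Document written before the commit attribute existed; backfill targets these.
    let old_doc = json!({});

    assert_eq!(
        attribute_commit(&new_doc),
        Some("cbf34f6d595995ef66af4f5a60799af6e7f478f8")
    );
    assert_eq!(attribute_commit(&old_doc), None);
}
```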
diff --git a/crates/tabby-index/src/code/index.rs b/crates/tabby-index/src/code/index.rs
index c58d97bb896d..5d0a4966bc4b 100644
--- a/crates/tabby-index/src/code/index.rs
+++ b/crates/tabby-index/src/code/index.rs
@@ -1,9 +1,10 @@
-use std::{pin::pin, sync::Arc};
+use std::{path::Path, pin::pin, sync::Arc};
 
+use anyhow::Result;
 use async_stream::stream;
 use futures::StreamExt;
 use ignore::{DirEntry, Walk};
-use tabby_common::index::corpus;
+use tabby_common::index::{code, corpus};
 use tabby_inference::Embedding;
 use tracing::warn;
 
@@ -12,7 +13,7 @@ use super::{
     intelligence::{CodeIntelligence, SourceCode},
     CodeRepository,
 };
-use crate::indexer::Indexer;
+use crate::indexer::{Indexer, TantivyDocBuilder};
 
 // Magic numbers
 static MAX_LINE_LENGTH_THRESHOLD: usize = 300;
@@ -21,7 +22,11 @@ static MIN_ALPHA_NUM_FRACTION: f32 = 0.25f32;
 static MAX_NUMBER_OF_LINES: usize = 100000;
 static MAX_NUMBER_FRACTION: f32 = 0.5f32;
 
-pub async fn index_repository(embedding: Arc<dyn Embedding>, repository: &CodeRepository) {
+pub async fn index_repository(
+    embedding: Arc<dyn Embedding>,
+    repository: &CodeRepository,
+    commit: &str,
+) {
     let total_files = Walk::new(repository.dir()).count();
     let file_stream = stream! {
         for file in Walk::new(repository.dir()) {
@@ -45,7 +50,7 @@ pub async fn index_repository(embedding: Arc<dyn Embedding>, repository: &CodeRe
     let mut count_chunks = 0;
     while let Some(files) = file_stream.next().await {
         count_files += files.len();
-        count_chunks += add_changed_documents(repository, embedding.clone(), files).await;
+        count_chunks += add_changed_documents(repository, commit, embedding.clone(), files).await;
         logkit::info!("Processed {count_files}/{total_files} files, updated {count_chunks} chunks",);
     }
 }
@@ -79,6 +84,7 @@ pub async fn garbage_collection() {
 
 async fn add_changed_documents(
     repository: &CodeRepository,
+    commit: &str,
     embedding: Arc<dyn Embedding>,
     files: Vec<DirEntry>,
 ) -> usize {
@@ -96,12 +102,23 @@ async fn add_changed_documents(
         let id = SourceCode::to_index_id(&repository.source_id, &key).id;
 
-        // Skip if already indexed and has no failed chunks
+        // Skip if already indexed and has no failed chunks,
+        // when skipping, we should check if the document needs to be backfilled.
+        if !require_updates(cloned_index.clone(), &id) {
+            backfill_commit_in_doc_if_needed(
+                builder.clone(),
+                cloned_index.clone(),
+                &id,
+                repository,
+                commit,
+                file.path()).await.unwrap_or_else(|e| {
+                    warn!("Failed to backfill commit for {id}: {e}");
+                }
+            );
             continue;
         }
 
-        let Some(code) = CodeIntelligence::compute_source_file(repository, file.path()) else {
+        let Some(code) = CodeIntelligence::compute_source_file(repository, commit, file.path()) else {
             continue;
         };
 
@@ -143,6 +160,34 @@ fn require_updates(indexer: Arc<Indexer>, id: &str) -> bool {
     true
 }
 
+// v0.23.0 adds the commit field to the code document.
+async fn backfill_commit_in_doc_if_needed(
+    builder: Arc<TantivyDocBuilder<SourceCode>>,
+    indexer: Arc<Indexer>,
+    id: &str,
+    repository: &CodeRepository,
+    commit: &str,
+    path: &Path,
+) -> Result<()> {
+    if indexer.has_attribute_field(id, code::fields::ATTRIBUTE_COMMIT) {
+        return Ok(());
+    }
+
+    let code = CodeIntelligence::compute_source_file(repository, commit, path)
+        .ok_or_else(|| anyhow::anyhow!("Failed to compute source file"))?;
+    if !is_valid_file(&code) {
+        anyhow::bail!("Invalid file");
+    }
+
+    let origin = indexer.get_doc(id).await?;
+    indexer.delete_doc(id);
+    indexer
+        .add(builder.backfill_doc_attributes(&origin, &code).await)
+        .await;
+
+    Ok(())
+}
+
 fn is_valid_file(file: &SourceCode) -> bool {
     file.max_line_length <= MAX_LINE_LENGTH_THRESHOLD
         && file.avg_line_length <= AVG_LINE_LENGTH_THRESHOLD
diff --git a/crates/tabby-index/src/code/intelligence.rs b/crates/tabby-index/src/code/intelligence.rs
index 0af8b3832aa9..3976d3c4e646 100644
--- a/crates/tabby-index/src/code/intelligence.rs
+++ b/crates/tabby-index/src/code/intelligence.rs
@@ -73,7 +73,11 @@ impl CodeIntelligence {
         file_key.to_string() == item_key
     }
 
-    pub fn compute_source_file(config: &CodeRepository, path: &Path) -> Option<SourceCode> {
+    pub fn compute_source_file(
+        config: &CodeRepository,
+        commit: &str,
+        path: &Path,
+    ) -> Option<SourceCode> {
         let source_file_id = Self::compute_source_file_id(path)?;
 
         if path.is_dir() || !path.exists() {
@@ -114,6 +118,7 @@ impl CodeIntelligence {
             source_file_id,
             source_id: config.source_id.clone(),
             git_url: config.canonical_git_url(),
+            commit: commit.to_owned(),
             basedir: config.dir().display().to_string(),
             filepath: relative_path.display().to_string(),
             max_line_length,
@@ -260,12 +265,14 @@ mod tests {
     fn test_create_source_file() {
         set_tabby_root(get_tabby_root());
         let config = get_repository_config();
-        let source_file = CodeIntelligence::compute_source_file(&config, &get_rust_source_file())
-            .expect("Failed to create source file");
+        let source_file =
+            CodeIntelligence::compute_source_file(&config, "commit", &get_rust_source_file())
+                .expect("Failed to create source file");
 
         // check source_file properties
         assert_eq!(source_file.language, "rust");
         assert_eq!(source_file.tags.len(), 3);
         assert_eq!(source_file.filepath, "rust.rs");
+        assert_eq!(source_file.commit, "commit");
     }
 }
diff --git a/crates/tabby-index/src/code/mod.rs b/crates/tabby-index/src/code/mod.rs
index acf913e8a97b..b91b5c14ebe1 100644
--- a/crates/tabby-index/src/code/mod.rs
+++ b/crates/tabby-index/src/code/mod.rs
@@ -38,9 +38,9 @@ impl CodeIndexer {
             "Building source code index: {}",
             repository.canonical_git_url()
         );
-        repository::sync_repository(repository)?;
+        let commit = repository::sync_repository(repository)?;
 
-        index::index_repository(embedding, repository).await;
+        index::index_repository(embedding, repository, &commit).await;
         index::garbage_collection().await;
 
         Ok(())
     }
@@ -62,8 +62,10 @@ impl CodeBuilder {
 
 #[async_trait]
 impl IndexAttributeBuilder<SourceCode> for CodeBuilder {
-    async fn build_attributes(&self, _source_code: &SourceCode) -> serde_json::Value {
-        json!({})
+    async fn build_attributes(&self, source_code: &SourceCode) -> serde_json::Value {
+        json!({
+            code::fields::ATTRIBUTE_COMMIT: source_code.commit,
+        })
     }
 
     async fn build_chunk_attributes<'a>(
diff --git a/crates/tabby-index/src/code/repository.rs b/crates/tabby-index/src/code/repository.rs
index f89094726c9e..7991cff46a72 100644
--- a/crates/tabby-index/src/code/repository.rs
+++ b/crates/tabby-index/src/code/repository.rs
@@ -12,11 +12,13 @@ use tracing::warn;
 use super::CodeRepository;
 
 trait RepositoryExt {
-    fn sync(&self) -> anyhow::Result<()>;
+    fn sync(&self) -> anyhow::Result<String>;
 }
 
 impl RepositoryExt for CodeRepository {
-    fn sync(&self) -> anyhow::Result<()> {
+    // sync clones the repository if it doesn't exist, otherwise it pulls the remote,
+    // and returns the commit SHA of the checked-out revision.
+    fn sync(&self) -> anyhow::Result<String> {
         let dir = self.dir();
         let mut finished = false;
         if dir.exists() {
@@ -47,10 +49,17 @@ impl RepositoryExt for CodeRepository {
             }
         }
 
-        Ok(())
+        get_commit_sha(self)
     }
 }
 
+fn get_commit_sha(repository: &CodeRepository) -> anyhow::Result<String> {
+    let repo = git2::Repository::open(repository.dir())?;
+    let head = repo.head()?;
+    let commit = head.peel_to_commit()?;
+    Ok(commit.id().to_string())
+}
+
 fn pull_remote(path: &Path) -> bool {
     let status = Command::new("git")
         .current_dir(path)
@@ -71,16 +80,15 @@ fn pull_remote(path: &Path) -> bool {
     true
 }
 
-pub fn sync_repository(repository: &CodeRepository) -> anyhow::Result<()> {
+pub fn sync_repository(repository: &CodeRepository) -> anyhow::Result<String> {
     if repository.is_local_dir() {
         if !repository.dir().exists() {
-            panic!("Directory {} does not exist", repository.dir().display());
+            bail!("Directory {} does not exist", repository.dir().display());
         }
+        get_commit_sha(repository)
     } else {
-        repository.sync()?;
+        repository.sync()
     }
-
-    Ok(())
 }
 
 pub fn garbage_collection(repositories: &[CodeRepository]) {
diff --git a/crates/tabby-index/src/code/types.rs b/crates/tabby-index/src/code/types.rs
index 1c41e99af76b..41a5239b7af8 100644
--- a/crates/tabby-index/src/code/types.rs
+++ b/crates/tabby-index/src/code/types.rs
@@ -12,6 +12,7 @@ pub struct SourceCode {
     pub source_file_id: String,
     pub source_id: String,
     pub git_url: String,
+    pub commit: String,
     pub basedir: String,
     pub filepath: String,
     pub language: String,
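`sync` and `sync_repository` now resolve the checked-out revision through git2's `Repository::open → head → peel_to_commit` chain, as shown above. A standalone usage sketch of that same call path (the repository path below is a placeholder):

```rust
use std::path::Path;

use anyhow::Result;

// Resolve the commit id of HEAD, the way the patched sync path does for a synced repository.
fn head_commit_sha(dir: &Path) -> Result<String> {
    let repo = git2::Repository::open(dir)?;
    let commit = repo.head()?.peel_to_commit()?;
    Ok(commit.id().to_string())
}

fn main() -> Result<()> {
    // Placeholder path: point this at any local clone.
    let sha = head_commit_sha(Path::new("/path/to/repository"))?;
    println!("HEAD is at {sha}");
    Ok(())
}
```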
diff --git a/crates/tabby-index/src/indexer.rs b/crates/tabby-index/src/indexer.rs
index 0735aa454907..86b58daa23d8 100644
--- a/crates/tabby-index/src/indexer.rs
+++ b/crates/tabby-index/src/indexer.rs
@@ -167,6 +167,26 @@ impl<T: ToIndexId> TantivyDocBuilder<T> {
             }
         }
     }
+
+    pub async fn backfill_doc_attributes(
+        &self,
+        origin: &TantivyDocument,
+        doc: &T,
+    ) -> TantivyDocument {
+        let schema = IndexSchema::instance();
+        let mut doc = doc! {
+            schema.field_id => get_text(origin, schema.field_id),
+            schema.field_source_id => get_text(origin, schema.field_source_id).to_string(),
+            schema.field_corpus => get_text(origin, schema.field_corpus).to_string(),
+            schema.field_attributes => self.builder.build_attributes(doc).await,
+            schema.field_updated_at => get_date(origin, schema.field_updated_at),
+        };
+        if let Some(failed_chunks) = get_number_optional(origin, schema.field_failed_chunks_count) {
+            doc.add_u64(schema.field_failed_chunks_count, failed_chunks as u64);
+        }
+
+        doc
+    }
 }
 
 pub struct Indexer {
@@ -197,6 +217,25 @@
             .expect("Failed to add document");
     }
 
+    pub async fn get_doc(&self, id: &str) -> Result<TantivyDocument> {
+        let schema = IndexSchema::instance();
+        let query = schema.doc_query(&self.corpus, id);
+        let docs = match self.searcher.search(&query, &TopDocs::with_limit(1)) {
+            Ok(docs) => docs,
+            Err(e) => {
+                debug!("query tantivy error: {}", e);
+                return Err(e.into());
+            }
+        };
+        if docs.is_empty() {
+            bail!("Document not found: {}", id);
+        }
+
+        self.searcher
+            .doc(docs.first().unwrap().1)
+            .map_err(|e| e.into())
+    }
+
     pub fn delete(&self, id: &str) {
         let schema = IndexSchema::instance();
         let _ = self
@@ -204,6 +243,13 @@
             .delete_query(Box::new(schema.doc_query_with_chunks(&self.corpus, id)));
     }
 
+    pub fn delete_doc(&self, id: &str) {
+        let schema = IndexSchema::instance();
+        let _ = self
+            .writer
+            .delete_query(Box::new(schema.doc_query(&self.corpus, id)));
+    }
+
     pub fn commit(mut self) {
         self.writer.commit().expect("Failed to commit changes");
         self.writer
@@ -369,3 +415,11 @@ impl IndexGarbageCollector {
 fn get_text(doc: &TantivyDocument, field: schema::Field) -> &str {
     doc.get_first(field).unwrap().as_str().unwrap()
 }
+
+fn get_date(doc: &TantivyDocument, field: schema::Field) -> tantivy::DateTime {
+    doc.get_first(field).unwrap().as_datetime().unwrap()
+}
+
+fn get_number_optional(doc: &TantivyDocument, field: schema::Field) -> Option<i64> {
+    doc.get_first(field)?.as_i64()
+}
diff --git a/crates/tabby-index/src/indexer_tests.rs b/crates/tabby-index/src/indexer_tests.rs
index 365deb539aa1..cabeef445d1a 100644
--- a/crates/tabby-index/src/indexer_tests.rs
+++ b/crates/tabby-index/src/indexer_tests.rs
@@ -206,7 +206,8 @@ mod builder_tests {
         let builder = Arc::new(create_code_builder(Some(Arc::new(embedding))));
 
         let repo = get_repository_config();
-        let code = CodeIntelligence::compute_source_file(&repo, &get_rust_source_file()).unwrap();
+        let code = CodeIntelligence::compute_source_file(&repo, "commit", &get_rust_source_file())
+            .unwrap();
         let index_id = code.to_index_id();
 
         let (id, s) = tokio::runtime::Runtime::new()
diff --git a/crates/tabby/src/services/code.rs b/crates/tabby/src/services/code.rs
index c235ddc6d0bb..7313fcd0007c 100644
--- a/crates/tabby/src/services/code.rs
+++ b/crates/tabby/src/services/code.rs
@@ -10,7 +10,7 @@ use tabby_common::{
     index::{
         self,
         code::{self, tokenize_code},
-        IndexSchema,
+        corpus, IndexSchema,
     },
 };
 use tabby_inference::Embedding;
@@ -76,17 +76,17 @@ impl CodeSearchImpl {
             .await?
         };
 
-        Ok(merge_code_responses_by_rank(
-            &params,
-            docs_from_embedding,
-            docs_from_bm25,
-        ))
+        Ok(
+            merge_code_responses_by_rank(reader, &params, docs_from_embedding, docs_from_bm25)
+                .await,
+        )
     }
 }
 
 const RANK_CONSTANT: f32 = 60.0;
 
-fn merge_code_responses_by_rank(
+async fn merge_code_responses_by_rank(
+    reader: &IndexReader,
     params: &CodeSearchParams,
     embedding_resp: Vec<(f32, TantivyDocument)>,
     bm25_resp: Vec<(f32, TantivyDocument)>,
@@ -118,9 +118,13 @@
         }
     }
 
-    let mut scored_hits: Vec<CodeSearchHit> = scored_hits
+    let scored_hits_futures: Vec<_> = scored_hits
         .into_values()
-        .map(|(scores, doc)| create_hit(scores, doc))
+        .map(|(scores, doc)| create_hit(reader, scores, doc))
+        .collect();
+    let mut scored_hits: Vec<CodeSearchHit> = futures::future::join_all(scored_hits_futures)
+        .await
+        .into_iter()
         .collect();
     scored_hits.sort_by(|a, b| b.scores.rrf.total_cmp(&a.scores.rrf));
     retain_at_most_two_hits_per_file(&mut scored_hits);
@@ -162,10 +166,17 @@ fn get_chunk_id(doc: &TantivyDocument) -> &str {
     get_text(doc, schema.field_chunk_id)
 }
 
-fn create_hit(scores: CodeSearchScores, doc: TantivyDocument) -> CodeSearchHit {
+async fn create_hit(
+    reader: &IndexReader,
+    scores: CodeSearchScores,
+    doc: TantivyDocument,
+) -> CodeSearchHit {
     let schema = IndexSchema::instance();
+    let file_id = get_text(&doc, schema.field_id).to_owned();
+    let commit = get_commit(reader, &file_id).await;
+
     let doc = CodeSearchDocument {
-        file_id: get_text(&doc, schema.field_id).to_owned(),
+        file_id,
         chunk_id: get_text(&doc, schema.field_chunk_id).to_owned(),
         body: get_json_text_field(
             &doc,
@@ -185,6 +196,9 @@ fn create_hit(scores: CodeSearchScores, doc: TantivyDocument) -> CodeSearchHit {
             code::fields::CHUNK_GIT_URL,
         )
         .to_owned(),
+        // commit was introduced in v0.23 and is not yet a required field,
+        // so we need to handle the case where it's not present.
+        commit,
         language: get_json_text_field(
             &doc,
             schema.field_chunk_attributes,
@@ -200,6 +214,26 @@ fn create_hit(scores: CodeSearchScores, doc: TantivyDocument) -> CodeSearchHit {
     CodeSearchHit { scores, doc }
 }
 
+async fn get_commit(reader: &IndexReader, id: &str) -> Option<String> {
+    let schema = IndexSchema::instance();
+    let query = schema.doc_query(corpus::CODE, id);
+    let doc = reader
+        .searcher()
+        .search(&query, &TopDocs::with_limit(1))
+        .ok()?;
+    if doc.is_empty() {
+        return None;
+    }
+
+    let doc = reader.searcher().doc(doc[0].1).ok()?;
+    get_json_text_field_optional(
+        &doc,
+        schema.field_attributes,
+        code::fields::ATTRIBUTE_COMMIT,
+    )
+    .map(|s| s.to_owned())
+}
+
 fn get_text(doc: &TantivyDocument, field: schema::Field) -> &str {
     doc.get_first(field).unwrap().as_str().unwrap()
 }
@@ -228,6 +262,17 @@ fn get_json_text_field<'a>(doc: &'a TantivyDocument, field: schema::Field, name:
         .unwrap()
 }
 
+fn get_json_text_field_optional<'a>(
+    doc: &'a TantivyDocument,
+    field: schema::Field,
+    name: &str,
+) -> Option<&'a str> {
+    doc.get_first(field)
+        .and_then(|value| value.as_object())
+        .and_then(|mut obj| obj.find(|(k, _)| *k == name))
+        .and_then(|(_, v)| v.as_str())
+}
+
 struct CodeSearchService {
     imp: CodeSearchImpl,
     provider: Arc<IndexReaderProvider>,
 }
@@ -278,6 +323,7 @@ mod tests {
                     body: body.to_string(),
                     filepath: "".to_owned(),
                     git_url: "".to_owned(),
+                    commit: Some("".to_owned()),
                     language: "".to_owned(),
                     start_line: 0,
                 },
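For context on the merge that `create_hit` now participates in: `merge_code_responses_by_rank` fuses the embedding and BM25 result lists with reciprocal rank fusion, where each list a chunk appears in contributes `1 / (RANK_CONSTANT + rank)`, using the `RANK_CONSTANT` of 60.0 defined in this file. The sketch below shows only that scoring rule; whether ranks are zero- or one-based is a detail of surrounding code not shown in this hunk, so the example assumes zero-based positions.

```rust
const RANK_CONSTANT: f32 = 60.0;

// Sum the reciprocal-rank contributions of one chunk across result lists.
// `ranks` holds the chunk's zero-based position in each list it appears in.
fn rrf_score(ranks: &[usize]) -> f32 {
    ranks
        .iter()
        .map(|rank| 1.0 / (RANK_CONSTANT + *rank as f32))
        .sum()
}

fn main() {
    // A chunk ranked 0th by embedding search and 2nd by BM25 ...
    let both = rrf_score(&[0, 2]);
    // ... beats a chunk that only the embedding search returned, even at rank 0.
    let embedding_only = rrf_score(&[0]);
    assert!(both > embedding_only);
    println!("both lists: {both:.4}, single list: {embedding_only:.4}");
}
```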
diff --git a/ee/tabby-db/src/threads.rs b/ee/tabby-db/src/threads.rs
index 37c6ae97d531..835ddfdc3bb5 100644
--- a/ee/tabby-db/src/threads.rs
+++ b/ee/tabby-db/src/threads.rs
@@ -77,6 +77,7 @@ pub struct ThreadMessageAttachmentAuthor {
 #[derive(Serialize, Deserialize)]
 pub struct ThreadMessageAttachmentCode {
     pub git_url: String,
+    pub commit: Option<String>,
     pub language: String,
     pub filepath: String,
     pub content: String,
diff --git a/ee/tabby-schema/graphql/schema.graphql b/ee/tabby-schema/graphql/schema.graphql
index b68e2a19c819..fef920d952c2 100644
--- a/ee/tabby-schema/graphql/schema.graphql
+++ b/ee/tabby-schema/graphql/schema.graphql
@@ -498,6 +498,7 @@ type MessageAttachmentClientCode {
 
 type MessageAttachmentCode {
   gitUrl: String!
+  commit: String
   filepath: String!
   language: String!
   content: String!
diff --git a/ee/tabby-schema/src/dao.rs b/ee/tabby-schema/src/dao.rs
index 9c1e8d70708e..60b7c64c8073 100644
--- a/ee/tabby-schema/src/dao.rs
+++ b/ee/tabby-schema/src/dao.rs
@@ -202,6 +202,7 @@ impl From<ThreadMessageAttachmentCode> for thread::MessageAttachmentCode {
     fn from(value: ThreadMessageAttachmentCode) -> Self {
         Self {
             git_url: value.git_url,
+            commit: value.commit,
             filepath: value.filepath,
             language: value.language,
             content: value.content,
@@ -214,6 +215,7 @@ impl From<&thread::MessageAttachmentCode> for ThreadMessageAttachmentCode {
     fn from(val: &thread::MessageAttachmentCode) -> Self {
         ThreadMessageAttachmentCode {
             git_url: val.git_url.clone(),
+            commit: val.commit.clone(),
             filepath: val.filepath.clone(),
             language: val.language.clone(),
             content: val.content.clone(),
diff --git a/ee/tabby-schema/src/schema/thread/types.rs b/ee/tabby-schema/src/schema/thread/types.rs
index 6d06adf7a61d..240ea5437c69 100644
--- a/ee/tabby-schema/src/schema/thread/types.rs
+++ b/ee/tabby-schema/src/schema/thread/types.rs
@@ -73,6 +73,7 @@ pub struct MessageAttachmentClientCode {
 #[derive(GraphQLObject, Clone)]
 pub struct MessageAttachmentCode {
     pub git_url: String,
+    pub commit: Option<String>,
     pub filepath: String,
     pub language: String,
     pub content: String,
@@ -83,6 +84,7 @@ impl From<CodeSearchDocument> for MessageAttachmentCode {
     fn from(doc: CodeSearchDocument) -> Self {
         Self {
             git_url: doc.git_url,
+            commit: doc.commit,
             filepath: doc.filepath,
             language: doc.language,
             content: doc.body,
diff --git a/ee/tabby-webserver/src/service/answer.rs b/ee/tabby-webserver/src/service/answer.rs
index ec49ae9562ce..4b7b1a07266b 100644
--- a/ee/tabby-webserver/src/service/answer.rs
+++ b/ee/tabby-webserver/src/service/answer.rs
@@ -577,7 +577,7 @@ pub async fn merge_code_snippets(
         if let Some(file_content) = file_content {
             debug!(
-                "file {} less than 200, it will be included whole file content",
+                "file {} less than 300, it will be included whole file content",
                 file_hits[0].doc.filepath
             );
             let mut insert_hit = file_hits[0].clone();
@@ -768,6 +768,7 @@ mod tests {
             })],
             code: vec![tabby_schema::thread::MessageAttachmentCode {
                 git_url: "https://github.com/".to_owned(),
+                commit: Some("commit".to_owned()),
                 filepath: "server.py".to_owned(),
                 language: "python".to_owned(),
                 content: "from flask import Flask\n\napp = Flask(__name__)\n\n@app.route('/')\ndef hello():\n    return 'Hello, World!'".to_owned(),
@@ -801,6 +802,7 @@
             )],
             code: vec![tabby_schema::thread::MessageAttachmentCode {
                 git_url: "https://github.com".to_owned(),
+                commit: Some("commit".to_owned()),
                 filepath: "server.py".to_owned(),
                 language: "python".to_owned(),
                 content: "print('Hello, server!')".to_owned(),
@@ -977,6 +979,7 @@
             )],
             code: vec![tabby_schema::thread::MessageAttachmentCode {
                 git_url: "https://github.com".to_owned(),
+                commit: Some("commit".to_owned()),
                 filepath: "server.py".to_owned(),
                 language: "python".to_owned(),
                 content: "print('Hello, server!')".to_owned(),
@@ -1038,6 +1041,7 @@
             )],
             code: vec![tabby_schema::thread::MessageAttachmentCode {
                 git_url: "https://github.com".to_owned(),
+                commit: Some("commit".to_owned()),
                 filepath: "server.py".to_owned(),
                 language: "python".to_owned(),
                 content: "print('Hello, server!')".to_owned(),
@@ -1335,6 +1339,7 @@
                     body: "fn test1() {}\nfn test2() {}".to_string(),
                     filepath: "test.rs".to_string(),
                     git_url: "https://github.com/test/repo.git".to_string(),
+                    commit: Some("commit".to_string()),
                     language: "rust".to_string(),
                     start_line: 1,
                 },
@@ -1351,6 +1356,7 @@
                     body: "fn test3() {}\nfn test4() {}".to_string(),
                     filepath: "test.rs".to_string(),
                     git_url: "https://github.com/test/repo.git".to_string(),
+                    commit: Some("commit".to_string()),
                     language: "rust".to_string(),
                     start_line: 3,
                 },
@@ -1365,5 +1371,7 @@
         let result = merge_code_snippets(&repo.unwrap(), hits).await;
 
         assert_eq!(result.len(), 2);
+        assert_eq!(result[0].doc.commit, Some("commit".to_string()));
+        assert_eq!(result[1].doc.commit, Some("commit".to_string()));
     }
 }
diff --git a/ee/tabby-webserver/src/service/answer/testutils/mod.rs b/ee/tabby-webserver/src/service/answer/testutils/mod.rs
index 0e205d6cccf1..db73e60288ba 100644
--- a/ee/tabby-webserver/src/service/answer/testutils/mod.rs
+++ b/ee/tabby-webserver/src/service/answer/testutils/mod.rs
@@ -148,6 +148,7 @@ impl CodeSearch for FakeCodeSearch {
                         file_id: "1".to_string(),
                         chunk_id: "chunk1".to_string(),
                         git_url: "https://github.com/test/repo".to_string(),
+                        commit: Some("commit".to_string()),
                     },
                     scores: CodeSearchScores {
                         bm25: 0.8,
@@ -164,6 +165,7 @@
                         file_id: "2".to_string(),
                         chunk_id: "chunk2".to_string(),
                         git_url: "https://github.com/test/repo".to_string(),
+                        commit: Some("commit".to_string()),
                     },
                     scores: CodeSearchScores {
                         bm25: 0.7,