From 6eb0493763a467ec9a6c5c3744f8aab5a8ad1cee Mon Sep 17 00:00:00 2001 From: Wei Zhang Date: Mon, 16 Dec 2024 22:04:12 +0800 Subject: [PATCH] WIP: add commit in code search Signed-off-by: Wei Zhang --- crates/tabby-common/src/api/code.rs | 1 + crates/tabby-common/src/index/code/mod.rs | 1 + crates/tabby-index/src/code/index.rs | 24 ++++++++++++++++----- crates/tabby-index/src/code/intelligence.rs | 7 +++++- crates/tabby-index/src/code/mod.rs | 5 +++-- crates/tabby-index/src/code/repository.rs | 24 ++++++++++++++------- crates/tabby-index/src/code/types.rs | 1 + crates/tabby/src/services/code.rs | 20 +++++++++++++++++ ee/tabby-db/src/threads.rs | 1 + ee/tabby-schema/graphql/schema.graphql | 1 + ee/tabby-schema/src/dao.rs | 2 ++ ee/tabby-schema/src/schema/thread/types.rs | 2 ++ ee/tabby-webserver/src/service/answer.rs | 2 +- 13 files changed, 74 insertions(+), 17 deletions(-) diff --git a/crates/tabby-common/src/api/code.rs b/crates/tabby-common/src/api/code.rs index ecd4aafc12c5..a170b6b023e7 100644 --- a/crates/tabby-common/src/api/code.rs +++ b/crates/tabby-common/src/api/code.rs @@ -30,6 +30,7 @@ pub struct CodeSearchDocument { pub body: String, pub filepath: String, pub git_url: String, + pub commit: String, pub language: String, pub start_line: usize, } diff --git a/crates/tabby-common/src/index/code/mod.rs b/crates/tabby-common/src/index/code/mod.rs index 7380b1966e4a..b143752e6155 100644 --- a/crates/tabby-common/src/index/code/mod.rs +++ b/crates/tabby-common/src/index/code/mod.rs @@ -11,6 +11,7 @@ use crate::api::code::CodeSearchQuery; pub mod fields { pub const CHUNK_GIT_URL: &str = "chunk_git_url"; + pub const CHUNK_COMMIT: &str = "chunk_commit"; pub const CHUNK_FILEPATH: &str = "chunk_filepath"; pub const CHUNK_LANGUAGE: &str = "chunk_language"; pub const CHUNK_BODY: &str = "chunk_body"; diff --git a/crates/tabby-index/src/code/index.rs b/crates/tabby-index/src/code/index.rs index c58d97bb896d..3b92ead5635f 100644 --- 
a/crates/tabby-index/src/code/index.rs +++ b/crates/tabby-index/src/code/index.rs @@ -3,7 +3,7 @@ use std::{pin::pin, sync::Arc}; use async_stream::stream; use futures::StreamExt; use ignore::{DirEntry, Walk}; -use tabby_common::index::corpus; +use tabby_common::index::{code, corpus}; use tabby_inference::Embedding; use tracing::warn; @@ -21,7 +21,11 @@ static MIN_ALPHA_NUM_FRACTION: f32 = 0.25f32; static MAX_NUMBER_OF_LINES: usize = 100000; static MAX_NUMBER_FRACTION: f32 = 0.5f32; -pub async fn index_repository(embedding: Arc, repository: &CodeRepository) { +pub async fn index_repository( + embedding: Arc, + repository: &CodeRepository, + commit: &str, +) { let total_files = Walk::new(repository.dir()).count(); let file_stream = stream! { for file in Walk::new(repository.dir()) { @@ -45,7 +49,7 @@ pub async fn index_repository(embedding: Arc, repository: &CodeRe let mut count_chunks = 0; while let Some(files) = file_stream.next().await { count_files += files.len(); - count_chunks += add_changed_documents(repository, embedding.clone(), files).await; + count_chunks += add_changed_documents(repository, commit, embedding.clone(), files).await; logkit::info!("Processed {count_files}/{total_files} files, updated {count_chunks} chunks",); } } @@ -79,6 +83,7 @@ pub async fn garbage_collection() { async fn add_changed_documents( repository: &CodeRepository, + commit: &str, embedding: Arc, files: Vec, ) -> usize { @@ -96,12 +101,11 @@ async fn add_changed_documents( let id = SourceCode::to_index_id(&repository.source_id, &key).id; - // Skip if already indexed and has no failed chunks if !require_updates(cloned_index.clone(), &id) { continue; } - let Some(code) = CodeIntelligence::compute_source_file(repository, file.path()) else { + let Some(code) = CodeIntelligence::compute_source_file(repository, commit, file.path()) else { continue; }; @@ -135,7 +139,12 @@ async fn add_changed_documents( count_docs } +// 1. Backfill if the document is missing the commit field +// 2. 
Skip if already indexed and has no failed chunks fn require_updates(indexer: Arc, id: &str) -> bool { + if should_backfill(indexer.clone(), id) { + return true; + } if indexer.is_indexed(id) && !indexer.has_failed_chunks(id) { return false; }; @@ -143,6 +152,11 @@ fn require_updates(indexer: Arc, id: &str) -> bool { true } +fn should_backfill(indexer: Arc, id: &str) -> bool { + // v0.23.0 added the commit field to the code document. + !indexer.has_attribute_field(id, code::fields::CHUNK_COMMIT) +} + fn is_valid_file(file: &SourceCode) -> bool { file.max_line_length <= MAX_LINE_LENGTH_THRESHOLD && file.avg_line_length <= AVG_LINE_LENGTH_THRESHOLD diff --git a/crates/tabby-index/src/code/intelligence.rs b/crates/tabby-index/src/code/intelligence.rs index 0af8b3832aa9..d6d6ae70099f 100644 --- a/crates/tabby-index/src/code/intelligence.rs +++ b/crates/tabby-index/src/code/intelligence.rs @@ -73,7 +73,11 @@ impl CodeIntelligence { file_key.to_string() == item_key } - pub fn compute_source_file(config: &CodeRepository, path: &Path) -> Option { + pub fn compute_source_file( + config: &CodeRepository, + commit: &str, + path: &Path, + ) -> Option { let source_file_id = Self::compute_source_file_id(path)?; if path.is_dir() || !path.exists() { @@ -114,6 +118,7 @@ impl CodeIntelligence { source_file_id, source_id: config.source_id.clone(), git_url: config.canonical_git_url(), + commit: commit.to_owned(), basedir: config.dir().display().to_string(), filepath: relative_path.display().to_string(), max_line_length, diff --git a/crates/tabby-index/src/code/mod.rs b/crates/tabby-index/src/code/mod.rs index f7d0c0fc7583..44c9391306d4 100644 --- a/crates/tabby-index/src/code/mod.rs +++ b/crates/tabby-index/src/code/mod.rs @@ -38,9 +38,9 @@ impl CodeIndexer { "Building source code index: {}", repository.canonical_git_url() ); - repository::sync_repository(repository)?; + let commit = repository::sync_repository(repository)?; - index::index_repository(embedding, repository).await; + 
index::index_repository(embedding, repository, &commit).await; index::garbage_collection().await; Ok(()) @@ -102,6 +102,7 @@ impl IndexAttributeBuilder for CodeBuilder { let attributes = json!({ code::fields::CHUNK_FILEPATH: source_code.filepath, code::fields::CHUNK_GIT_URL: source_code.git_url, + code::fields::CHUNK_COMMIT: source_code.commit, code::fields::CHUNK_LANGUAGE: source_code.language, code::fields::CHUNK_BODY: body, code::fields::CHUNK_START_LINE: start_line, diff --git a/crates/tabby-index/src/code/repository.rs b/crates/tabby-index/src/code/repository.rs index f89094726c9e..625182f1d6c6 100644 --- a/crates/tabby-index/src/code/repository.rs +++ b/crates/tabby-index/src/code/repository.rs @@ -12,11 +12,13 @@ use tracing::warn; use super::CodeRepository; trait RepositoryExt { - fn sync(&self) -> anyhow::Result<()>; + fn sync(&self) -> anyhow::Result<String>; } impl RepositoryExt for CodeRepository { - fn sync(&self) -> anyhow::Result<()> { + // sync clones the repository if it doesn't exist, otherwise it pulls the remote, + // and returns the commit hash (SHA) of HEAD. 
+ fn sync(&self) -> anyhow::Result<String> { let dir = self.dir(); let mut finished = false; if dir.exists() { @@ -47,10 +49,17 @@ impl RepositoryExt for CodeRepository { } } - Ok(()) + get_commit_sha(&self) } } +fn get_commit_sha(repository: &CodeRepository) -> anyhow::Result<String> { + let repo = git2::Repository::open(repository.dir())?; + let head = repo.head()?; + let commit = head.peel_to_commit()?; + Ok(commit.id().to_string()) +} + fn pull_remote(path: &Path) -> bool { let status = Command::new("git") .current_dir(path) @@ -71,16 +80,15 @@ fn pull_remote(path: &Path) -> bool { true } -pub fn sync_repository(repository: &CodeRepository) -> anyhow::Result<()> { +pub fn sync_repository(repository: &CodeRepository) -> anyhow::Result<String> { if repository.is_local_dir() { if !repository.dir().exists() { - panic!("Directory {} does not exist", repository.dir().display()); + bail!("Directory {} does not exist", repository.dir().display()); } + get_commit_sha(repository) } else { - repository.sync()?; + repository.sync() } - - Ok(()) } pub fn garbage_collection(repositories: &[CodeRepository]) { diff --git a/crates/tabby-index/src/code/types.rs b/crates/tabby-index/src/code/types.rs index 1c41e99af76b..41a5239b7af8 100644 --- a/crates/tabby-index/src/code/types.rs +++ b/crates/tabby-index/src/code/types.rs @@ -12,6 +12,7 @@ pub struct SourceCode { pub source_file_id: String, pub source_id: String, pub git_url: String, + pub commit: String, pub basedir: String, pub filepath: String, pub language: String, diff --git a/crates/tabby/src/services/code.rs b/crates/tabby/src/services/code.rs index c235ddc6d0bb..a99b102ba833 100644 --- a/crates/tabby/src/services/code.rs +++ b/crates/tabby/src/services/code.rs @@ -185,6 +185,14 @@ fn create_hit(scores: CodeSearchScores, doc: TantivyDocument) -> CodeSearchHit { code::fields::CHUNK_GIT_URL, ) .to_owned(), + // commit is introduced in v0.23, but it is also a required field + // so we need to handle the case where it's not present + commit: 
get_json_text_field_or_default( + &doc, + schema.field_chunk_attributes, + code::fields::CHUNK_COMMIT, + ) + .to_owned(), language: get_json_text_field( &doc, schema.field_chunk_attributes, @@ -228,6 +236,18 @@ fn get_json_text_field<'a>(doc: &'a TantivyDocument, field: schema::Field, name: .unwrap() } +fn get_json_text_field_or_default<'a>( + doc: &'a TantivyDocument, + field: schema::Field, + name: &str, +) -> &'a str { + doc.get_first(field) + .and_then(|value| value.as_object()) + .and_then(|mut obj| obj.find(|(k, _)| *k == name)) + .and_then(|(_, v)| v.as_str()) + .unwrap_or("") +} + struct CodeSearchService { imp: CodeSearchImpl, provider: Arc, diff --git a/ee/tabby-db/src/threads.rs b/ee/tabby-db/src/threads.rs index a68709402efc..8a1c51294b10 100644 --- a/ee/tabby-db/src/threads.rs +++ b/ee/tabby-db/src/threads.rs @@ -75,6 +75,7 @@ pub struct ThreadMessageAttachmentAuthor { #[derive(Serialize, Deserialize)] pub struct ThreadMessageAttachmentCode { pub git_url: String, + pub commit: Option, pub language: String, pub filepath: String, pub content: String, diff --git a/ee/tabby-schema/graphql/schema.graphql b/ee/tabby-schema/graphql/schema.graphql index a2296dd89afa..f18f1e050e2b 100644 --- a/ee/tabby-schema/graphql/schema.graphql +++ b/ee/tabby-schema/graphql/schema.graphql @@ -497,6 +497,7 @@ type MessageAttachmentClientCode { type MessageAttachmentCode { gitUrl: String! + commit: String! filepath: String! language: String! content: String! 
diff --git a/ee/tabby-schema/src/dao.rs b/ee/tabby-schema/src/dao.rs index 9c1e8d70708e..fad9b1928475 100644 --- a/ee/tabby-schema/src/dao.rs +++ b/ee/tabby-schema/src/dao.rs @@ -202,6 +202,7 @@ impl From for thread::MessageAttachmentCode { fn from(value: ThreadMessageAttachmentCode) -> Self { Self { git_url: value.git_url, + commit: value.commit.unwrap_or_default(), filepath: value.filepath, language: value.language, content: value.content, @@ -214,6 +215,7 @@ impl From<&thread::MessageAttachmentCode> for ThreadMessageAttachmentCode { fn from(val: &thread::MessageAttachmentCode) -> Self { ThreadMessageAttachmentCode { git_url: val.git_url.clone(), + commit: Some(val.commit.clone()), filepath: val.filepath.clone(), language: val.language.clone(), content: val.content.clone(), diff --git a/ee/tabby-schema/src/schema/thread/types.rs b/ee/tabby-schema/src/schema/thread/types.rs index 7ac4a4c9ebe4..99353069382c 100644 --- a/ee/tabby-schema/src/schema/thread/types.rs +++ b/ee/tabby-schema/src/schema/thread/types.rs @@ -72,6 +72,7 @@ pub struct MessageAttachmentClientCode { #[derive(GraphQLObject, Clone)] pub struct MessageAttachmentCode { pub git_url: String, + pub commit: String, pub filepath: String, pub language: String, pub content: String, @@ -82,6 +83,7 @@ impl From for MessageAttachmentCode { fn from(doc: CodeSearchDocument) -> Self { Self { git_url: doc.git_url, + commit: doc.commit, filepath: doc.filepath, language: doc.language, content: doc.body, diff --git a/ee/tabby-webserver/src/service/answer.rs b/ee/tabby-webserver/src/service/answer.rs index 7b9458a4aad3..b99849ea53f4 100644 --- a/ee/tabby-webserver/src/service/answer.rs +++ b/ee/tabby-webserver/src/service/answer.rs @@ -576,7 +576,7 @@ pub async fn merge_code_snippets( if let Some(file_content) = file_content { debug!( - "file {} less than 200, it will be included whole file content", + "file {} less than 300, it will be included whole file content", file_hits[0].doc.filepath ); let mut insert_hit = 
file_hits[0].clone();