Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP: add commit in code search #3573

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions crates/tabby-common/src/api/code.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ pub struct CodeSearchDocument {
pub body: String,
pub filepath: String,
pub git_url: String,
pub commit: String,
pub language: String,
pub start_line: usize,
}
Expand Down
1 change: 1 addition & 0 deletions crates/tabby-common/src/index/code/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ use crate::api::code::CodeSearchQuery;

pub mod fields {
pub const CHUNK_GIT_URL: &str = "chunk_git_url";
pub const CHUNK_COMMIT: &str = "chunk_commit";
pub const CHUNK_FILEPATH: &str = "chunk_filepath";
pub const CHUNK_LANGUAGE: &str = "chunk_language";
pub const CHUNK_BODY: &str = "chunk_body";
Expand Down
24 changes: 19 additions & 5 deletions crates/tabby-index/src/code/index.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ use std::{pin::pin, sync::Arc};
use async_stream::stream;
use futures::StreamExt;
use ignore::{DirEntry, Walk};
use tabby_common::index::corpus;
use tabby_common::index::{code, corpus};
use tabby_inference::Embedding;
use tracing::warn;

Expand All @@ -21,7 +21,11 @@ static MIN_ALPHA_NUM_FRACTION: f32 = 0.25f32;
static MAX_NUMBER_OF_LINES: usize = 100000;
static MAX_NUMBER_FRACTION: f32 = 0.5f32;

pub async fn index_repository(embedding: Arc<dyn Embedding>, repository: &CodeRepository) {
pub async fn index_repository(
embedding: Arc<dyn Embedding>,
repository: &CodeRepository,
commit: &str,
) {
let total_files = Walk::new(repository.dir()).count();
let file_stream = stream! {
for file in Walk::new(repository.dir()) {
Expand All @@ -45,7 +49,7 @@ pub async fn index_repository(embedding: Arc<dyn Embedding>, repository: &CodeRe
let mut count_chunks = 0;
while let Some(files) = file_stream.next().await {
count_files += files.len();
count_chunks += add_changed_documents(repository, embedding.clone(), files).await;
count_chunks += add_changed_documents(repository, commit, embedding.clone(), files).await;
logkit::info!("Processed {count_files}/{total_files} files, updated {count_chunks} chunks",);
}
}
Expand Down Expand Up @@ -79,6 +83,7 @@ pub async fn garbage_collection() {

async fn add_changed_documents(
repository: &CodeRepository,
commit: &str,
embedding: Arc<dyn Embedding>,
files: Vec<DirEntry>,
) -> usize {
Expand All @@ -96,12 +101,11 @@ async fn add_changed_documents(

let id = SourceCode::to_index_id(&repository.source_id, &key).id;

// Skip if already indexed and has no failed chunks
if !require_updates(cloned_index.clone(), &id) {
continue;
}

let Some(code) = CodeIntelligence::compute_source_file(repository, file.path()) else {
let Some(code) = CodeIntelligence::compute_source_file(repository, commit, file.path()) else {
continue;
};

Expand Down Expand Up @@ -135,14 +139,24 @@ async fn add_changed_documents(
count_docs
}

/// Decides whether the document identified by `id` has to be (re)indexed.
///
/// An update is required when any of the following holds:
/// 1. the document needs a backfill because it is missing the commit field
///    (see `should_backfill`);
/// 2. the document is not indexed yet;
/// 3. the document is indexed but has failed chunks.
///
/// Returns `false` only for a document that is already indexed, has no
/// failed chunks, and does not need a backfill.
fn require_updates(indexer: Arc<Indexer>, id: &str) -> bool {
    let needs_backfill = should_backfill(indexer.clone(), id);

    // `||` short-circuits, so the indexer is only queried as far as needed:
    // `has_failed_chunks` is consulted only for documents already indexed.
    needs_backfill || !indexer.is_indexed(id) || indexer.has_failed_chunks(id)
}

/// Reports whether the document identified by `id` must be re-indexed to
/// gain the commit attribute.
///
/// The commit field was added to code documents in v0.23.0, so a document
/// for which the indexer reports no `code::fields::CHUNK_COMMIT` attribute
/// needs a backfill.
fn should_backfill(indexer: Arc<Indexer>, id: &str) -> bool {
    let has_commit = indexer.has_attribute_field(id, code::fields::CHUNK_COMMIT);
    !has_commit
}

fn is_valid_file(file: &SourceCode) -> bool {
file.max_line_length <= MAX_LINE_LENGTH_THRESHOLD
&& file.avg_line_length <= AVG_LINE_LENGTH_THRESHOLD
Expand Down
7 changes: 6 additions & 1 deletion crates/tabby-index/src/code/intelligence.rs
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,11 @@ impl CodeIntelligence {
file_key.to_string() == item_key
}

pub fn compute_source_file(config: &CodeRepository, path: &Path) -> Option<SourceCode> {
pub fn compute_source_file(
config: &CodeRepository,
commit: &str,
path: &Path,
) -> Option<SourceCode> {
let source_file_id = Self::compute_source_file_id(path)?;

if path.is_dir() || !path.exists() {
Expand Down Expand Up @@ -114,6 +118,7 @@ impl CodeIntelligence {
source_file_id,
source_id: config.source_id.clone(),
git_url: config.canonical_git_url(),
commit: commit.to_owned(),
basedir: config.dir().display().to_string(),
filepath: relative_path.display().to_string(),
max_line_length,
Expand Down
5 changes: 3 additions & 2 deletions crates/tabby-index/src/code/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -38,9 +38,9 @@ impl CodeIndexer {
"Building source code index: {}",
repository.canonical_git_url()
);
repository::sync_repository(repository)?;
let commit = repository::sync_repository(repository)?;

index::index_repository(embedding, repository).await;
index::index_repository(embedding, repository, &commit).await;
index::garbage_collection().await;

Ok(())
Expand Down Expand Up @@ -102,6 +102,7 @@ impl IndexAttributeBuilder<SourceCode> for CodeBuilder {
let attributes = json!({
code::fields::CHUNK_FILEPATH: source_code.filepath,
code::fields::CHUNK_GIT_URL: source_code.git_url,
code::fields::CHUNK_COMMIT: source_code.commit,
code::fields::CHUNK_LANGUAGE: source_code.language,
code::fields::CHUNK_BODY: body,
code::fields::CHUNK_START_LINE: start_line,
Expand Down
24 changes: 16 additions & 8 deletions crates/tabby-index/src/code/repository.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,13 @@ use tracing::warn;
use super::CodeRepository;

trait RepositoryExt {
fn sync(&self) -> anyhow::Result<()>;
fn sync(&self) -> anyhow::Result<String>;
}

impl RepositoryExt for CodeRepository {
fn sync(&self) -> anyhow::Result<()> {
// sync clones the repository if it doesn't exist, otherwise it pulls the remote,
// and returns the commit SHA of the resulting HEAD.
fn sync(&self) -> anyhow::Result<String> {
let dir = self.dir();
let mut finished = false;
if dir.exists() {
Expand Down Expand Up @@ -47,10 +49,17 @@ impl RepositoryExt for CodeRepository {
}
}

Ok(())
get_commit_sha(&self)
}
}

/// Returns the id of the commit that `HEAD` points to in the repository
/// checked out at `repository.dir()`, rendered as a string.
///
/// # Errors
///
/// Propagates any `git2` error raised while opening the repository,
/// resolving `HEAD`, or peeling the reference to a commit.
fn get_commit_sha(repository: &CodeRepository) -> anyhow::Result<String> {
    let git_repo = git2::Repository::open(repository.dir())?;
    let head_commit = git_repo.head()?.peel_to_commit()?;
    let sha = head_commit.id().to_string();
    Ok(sha)
}

fn pull_remote(path: &Path) -> bool {
let status = Command::new("git")
.current_dir(path)
Expand All @@ -71,16 +80,15 @@ fn pull_remote(path: &Path) -> bool {
true
}

pub fn sync_repository(repository: &CodeRepository) -> anyhow::Result<()> {
pub fn sync_repository(repository: &CodeRepository) -> anyhow::Result<String> {
if repository.is_local_dir() {
if !repository.dir().exists() {
panic!("Directory {} does not exist", repository.dir().display());
bail!("Directory {} does not exist", repository.dir().display());
}
get_commit_sha(repository)
} else {
repository.sync()?;
repository.sync()
}

Ok(())
}

pub fn garbage_collection(repositories: &[CodeRepository]) {
Expand Down
1 change: 1 addition & 0 deletions crates/tabby-index/src/code/types.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ pub struct SourceCode {
pub source_file_id: String,
pub source_id: String,
pub git_url: String,
pub commit: String,
pub basedir: String,
pub filepath: String,
pub language: String,
Expand Down
20 changes: 20 additions & 0 deletions crates/tabby/src/services/code.rs
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,14 @@ fn create_hit(scores: CodeSearchScores, doc: TantivyDocument) -> CodeSearchHit {
code::fields::CHUNK_GIT_URL,
)
.to_owned(),
// commit is introduced in v0.23, but it is also a required field
// so we need to handle the case where it's not present
commit: get_json_text_field_or_default(
&doc,
schema.field_chunk_attributes,
code::fields::CHUNK_COMMIT,
)
.to_owned(),
language: get_json_text_field(
&doc,
schema.field_chunk_attributes,
Expand Down Expand Up @@ -228,6 +236,18 @@ fn get_json_text_field<'a>(doc: &'a TantivyDocument, field: schema::Field, name:
.unwrap()
}

/// Looks up the text value stored under key `name` inside the JSON object
/// held by `field` of `doc`.
///
/// Returns an empty string when the field is absent, is not an object, has
/// no entry named `name`, or the entry is not a string.
fn get_json_text_field_or_default<'a>(
    doc: &'a TantivyDocument,
    field: schema::Field,
    name: &str,
) -> &'a str {
    let Some(attributes) = doc.get_first(field) else {
        return "";
    };
    let Some(mut entries) = attributes.as_object() else {
        return "";
    };
    // Take the first entry whose key matches `name`.
    let Some((_, value)) = entries.find(|(key, _)| *key == name) else {
        return "";
    };
    value.as_str().unwrap_or("")
}

struct CodeSearchService {
imp: CodeSearchImpl,
provider: Arc<IndexReaderProvider>,
Expand Down
1 change: 1 addition & 0 deletions ee/tabby-db/src/threads.rs
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ pub struct ThreadMessageAttachmentAuthor {
#[derive(Serialize, Deserialize)]
pub struct ThreadMessageAttachmentCode {
pub git_url: String,
pub commit: Option<String>,
pub language: String,
pub filepath: String,
pub content: String,
Expand Down
1 change: 1 addition & 0 deletions ee/tabby-schema/graphql/schema.graphql
Original file line number Diff line number Diff line change
Expand Up @@ -497,6 +497,7 @@ type MessageAttachmentClientCode {

type MessageAttachmentCode {
gitUrl: String!
commit: String!
filepath: String!
language: String!
content: String!
Expand Down
2 changes: 2 additions & 0 deletions ee/tabby-schema/src/dao.rs
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,7 @@ impl From<ThreadMessageAttachmentCode> for thread::MessageAttachmentCode {
fn from(value: ThreadMessageAttachmentCode) -> Self {
Self {
git_url: value.git_url,
commit: value.commit.unwrap_or_default(),
filepath: value.filepath,
language: value.language,
content: value.content,
Expand All @@ -214,6 +215,7 @@ impl From<&thread::MessageAttachmentCode> for ThreadMessageAttachmentCode {
fn from(val: &thread::MessageAttachmentCode) -> Self {
ThreadMessageAttachmentCode {
git_url: val.git_url.clone(),
commit: Some(val.commit.clone()),
filepath: val.filepath.clone(),
language: val.language.clone(),
content: val.content.clone(),
Expand Down
2 changes: 2 additions & 0 deletions ee/tabby-schema/src/schema/thread/types.rs
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ pub struct MessageAttachmentClientCode {
#[derive(GraphQLObject, Clone)]
pub struct MessageAttachmentCode {
pub git_url: String,
pub commit: String,
pub filepath: String,
pub language: String,
pub content: String,
Expand All @@ -82,6 +83,7 @@ impl From<CodeSearchDocument> for MessageAttachmentCode {
fn from(doc: CodeSearchDocument) -> Self {
Self {
git_url: doc.git_url,
commit: doc.commit,
filepath: doc.filepath,
language: doc.language,
content: doc.body,
Expand Down
2 changes: 1 addition & 1 deletion ee/tabby-webserver/src/service/answer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -576,7 +576,7 @@ pub async fn merge_code_snippets(

if let Some(file_content) = file_content {
debug!(
"file {} less than 200, it will be included whole file content",
"file {} less than 300, it will be included whole file content",
file_hits[0].doc.filepath
);
let mut insert_hit = file_hits[0].clone();
Expand Down
Loading