Skip to content

Commit

Permalink
backfill commit in source code without redo calculate (#3587)
Browse files Browse the repository at this point in the history
  • Loading branch information
zwpaper authored Dec 23, 2024
1 parent 3d2c5f9 commit 3018718
Show file tree
Hide file tree
Showing 3 changed files with 115 additions and 25 deletions.
35 changes: 20 additions & 15 deletions crates/tabby-common/src/index/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,21 @@ impl IndexSchema {
])
}

/// Build a query matching everything stored under `doc_id` in `corpus`, chunks included.
///
/// Only two clauses are combined, both mandatory: the corpus clause produced
/// by `corpus_query` and an exact term match on the id field. No clause
/// filters on the chunk id field.
pub fn doc_query_with_chunks(&self, corpus: &str, doc_id: &str) -> impl Query {
    let id_term = Term::from_field_text(self.field_id, doc_id);
    let matches_id = TermQuery::new(id_term, tantivy::schema::IndexRecordOption::Basic);

    BooleanQuery::new(vec![
        // Restrict to the requested corpus.
        (Occur::Must, self.corpus_query(corpus)),
        // Restrict to the requested document id.
        (Occur::Must, Box::new(matches_id)),
    ])
}

pub fn doc_indexed_after(
&self,
corpus: &str,
Expand Down Expand Up @@ -261,21 +276,11 @@ impl IndexSchema {
FIELD_ATTRIBUTES, field
))),
),
])
}

/// Build a query to find the document with the given `doc_id`, include chunks.
pub fn doc_query_with_chunks(&self, corpus: &str, doc_id: &str) -> impl Query {
let doc_id_query = TermQuery::new(
Term::from_field_text(self.field_id, doc_id),
tantivy::schema::IndexRecordOption::Basic,
);

BooleanQuery::new(vec![
// Must match the corpus
(Occur::Must, self.corpus_query(corpus)),
// Must match the doc id
(Occur::Must, Box::new(doc_id_query)),
// Exclude chunk documents
(
Occur::MustNot,
Box::new(ExistsQuery::new_exists_query(FIELD_CHUNK_ID.into())),
),
])
}

Expand Down
51 changes: 41 additions & 10 deletions crates/tabby-index/src/code/index.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
use std::{pin::pin, sync::Arc};
use std::{path::Path, pin::pin, sync::Arc};

use anyhow::Result;
use async_stream::stream;
use futures::StreamExt;
use ignore::{DirEntry, Walk};
Expand All @@ -12,7 +13,7 @@ use super::{
intelligence::{CodeIntelligence, SourceCode},
CodeRepository,
};
use crate::indexer::Indexer;
use crate::indexer::{Indexer, TantivyDocBuilder};

// Magic numbers
static MAX_LINE_LENGTH_THRESHOLD: usize = 300;
Expand Down Expand Up @@ -101,7 +102,19 @@ async fn add_changed_documents(

let id = SourceCode::to_index_id(&repository.source_id, &key).id;

// Skip if already indexed and has no failed chunks;
// when skipping, check whether the document needs to be backfilled.

Check warning on line 106 in crates/tabby-index/src/code/index.rs

View check run for this annotation

Codecov / codecov/patch

crates/tabby-index/src/code/index.rs#L105-L106

Added lines #L105 - L106 were not covered by tests
if !require_updates(cloned_index.clone(), &id) {
backfill_commit_in_doc_if_needed(
builder.clone(),
cloned_index.clone(),
&id,
repository,
commit,
file.path()).await.unwrap_or_else(|e| {
warn!("Failed to backfill commit for {id}: {e}");
}
);

Check warning on line 117 in crates/tabby-index/src/code/index.rs

View check run for this annotation

Codecov / codecov/patch

crates/tabby-index/src/code/index.rs#L108-L117

Added lines #L108 - L117 were not covered by tests
continue;
}

Expand Down Expand Up @@ -139,22 +152,40 @@ async fn add_changed_documents(
count_docs
}

/// Decide whether the document stored under `id` must be processed again.
///
/// The checks run in this order and stop at the first one that is true:
/// 1. `should_backfill` reports that the document needs a backfill.
/// 2. The document is not indexed yet.
/// 3. The document has failed chunks.
///
/// `false` is returned only when the document needs no backfill, is already
/// indexed, and has no failed chunks.
fn require_updates(indexer: Arc<Indexer>, id: &str) -> bool {
    should_backfill(Arc::clone(&indexer), id)
        || !indexer.is_indexed(id)
        || indexer.has_failed_chunks(id)
}

fn should_backfill(indexer: Arc<Indexer>, id: &str) -> bool {
// v0.23.0 add the commit field to the code document.
!indexer.has_attribute_field(id, code::fields::ATTRIBUTE_COMMIT)
/// Backfill the commit attribute on an already-indexed code document.
///
/// v0.23.0 added the commit field to the code document; documents indexed
/// before that lack it. When the document identified by `id` already has the
/// commit attribute this is a no-op. Otherwise the source file at `path` is
/// recomputed for `commit`, the stored document is fetched, deleted via
/// `delete_doc`, and re-added with attributes rebuilt by
/// `backfill_doc_attributes`.
///
/// # Errors
///
/// Returns an error when the source file cannot be computed, when the file
/// fails `is_valid_file`, or when `get_doc` fails for `id`.
async fn backfill_commit_in_doc_if_needed(
builder: Arc<TantivyDocBuilder<SourceCode>>,
indexer: Arc<Indexer>,
id: &str,
repository: &CodeRepository,
commit: &str,
path: &Path,
) -> Result<()> {
// Nothing to do when the commit attribute is already present.
if indexer.has_attribute_field(id, code::fields::ATTRIBUTE_COMMIT) {
return Ok(());
}

Check warning on line 174 in crates/tabby-index/src/code/index.rs

View check run for this annotation

Codecov / codecov/patch

crates/tabby-index/src/code/index.rs#L164-L174

Added lines #L164 - L174 were not covered by tests

// Recompute the source file so its attributes can be rebuilt.
let code = CodeIntelligence::compute_source_file(repository, commit, path)
.ok_or_else(|| anyhow::anyhow!("Failed to compute source file"))?;
if !is_valid_file(&code) {
anyhow::bail!("Invalid file");
}

Check warning on line 180 in crates/tabby-index/src/code/index.rs

View check run for this annotation

Codecov / codecov/patch

crates/tabby-index/src/code/index.rs#L176-L180

Added lines #L176 - L180 were not covered by tests

// Fetch the stored document first: the replacement copies fields from it,
// so it must be read before the delete is issued.
let origin = indexer.get_doc(id).await?;
indexer.delete_doc(id);
indexer
.add(builder.backfill_doc_attributes(&origin, &code).await)
.await;

Check warning on line 186 in crates/tabby-index/src/code/index.rs

View check run for this annotation

Codecov / codecov/patch

crates/tabby-index/src/code/index.rs#L182-L186

Added lines #L182 - L186 were not covered by tests

Ok(())
}

Check warning on line 189 in crates/tabby-index/src/code/index.rs

View check run for this annotation

Codecov / codecov/patch

crates/tabby-index/src/code/index.rs#L188-L189

Added lines #L188 - L189 were not covered by tests

fn is_valid_file(file: &SourceCode) -> bool {
Expand Down
54 changes: 54 additions & 0 deletions crates/tabby-index/src/indexer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,26 @@ impl<T: ToIndexId> TantivyDocBuilder<T> {
}
}
}

/// Build a replacement for `origin` whose attributes are recomputed from `doc`.
///
/// The id, source id, corpus and updated-at values are copied from `origin`;
/// the attributes field is rebuilt with `self.builder.build_attributes(doc)`.
/// The failed-chunks counter is copied only when `get_number_optional` finds
/// one on `origin`.
///
/// # Panics
///
/// Panics if `origin` lacks the id, source id, corpus or updated-at field,
/// because `get_text` and `get_date` unwrap the stored value.
pub async fn backfill_doc_attributes(
&self,
origin: &TantivyDocument,
doc: &T,
) -> TantivyDocument {
let schema = IndexSchema::instance();
// The new binding shadows the `doc` parameter; the macro arguments are
// evaluated before the binding takes effect, so `build_attributes(doc)`
// still sees the parameter.
let mut doc = doc! {
schema.field_id => get_text(origin, schema.field_id),
schema.field_source_id => get_text(origin, schema.field_source_id).to_string(),
schema.field_corpus => get_text(origin, schema.field_corpus).to_string(),
schema.field_attributes => self.builder.build_attributes(doc).await,
schema.field_updated_at => get_date(origin, schema.field_updated_at),

Check warning on line 182 in crates/tabby-index/src/indexer.rs

View check run for this annotation

Codecov / codecov/patch

crates/tabby-index/src/indexer.rs#L171-L182

Added lines #L171 - L182 were not covered by tests
};
// NOTE(review): the counter is read as i64 and written back with an
// unchecked `as u64` cast; a negative value would wrap — confirm the
// stored type of this field.
if let Some(failed_chunks) = get_number_optional(origin, schema.field_failed_chunks_count) {
doc.add_u64(schema.field_failed_chunks_count, failed_chunks as u64);
}

Check warning on line 186 in crates/tabby-index/src/indexer.rs

View check run for this annotation

Codecov / codecov/patch

crates/tabby-index/src/indexer.rs#L184-L186

Added lines #L184 - L186 were not covered by tests

doc
}

Check warning on line 189 in crates/tabby-index/src/indexer.rs

View check run for this annotation

Codecov / codecov/patch

crates/tabby-index/src/indexer.rs#L188-L189

Added lines #L188 - L189 were not covered by tests
}

pub struct Indexer {
Expand Down Expand Up @@ -197,13 +217,39 @@ impl Indexer {
.expect("Failed to add document");
}

/// Fetch the stored document for `id` within this indexer's corpus.
///
/// Runs the query built by `schema.doc_query` against `self.searcher`, asking
/// for at most one hit, and loads the matching document.
///
/// # Errors
///
/// Returns the search error (after logging it at debug level) when the query
/// fails, a "Document not found" error when there is no hit, and any error
/// raised while loading the document.
///
/// NOTE(review): declared `async`, but the visible body contains no `.await`.
pub async fn get_doc(&self, id: &str) -> Result<TantivyDocument> {
let schema = IndexSchema::instance();
let query = schema.doc_query(&self.corpus, id);
// A single hit is enough: only the first result is used below.
let docs = match self.searcher.search(&query, &TopDocs::with_limit(1)) {
Ok(docs) => docs,
Err(e) => {
debug!("query tantivy error: {}", e);
return Err(e.into());

Check warning on line 227 in crates/tabby-index/src/indexer.rs

View check run for this annotation

Codecov / codecov/patch

crates/tabby-index/src/indexer.rs#L220-L227

Added lines #L220 - L227 were not covered by tests
}
};
if docs.is_empty() {
bail!("Document not found: {}", id);
}

// `docs` is non-empty here, so `first()` cannot be `None`; `.1` is the
// second element of the hit tuple, which is passed to `searcher.doc`.
self.searcher
.doc(docs.first().unwrap().1)
.map_err(|e| e.into())
}

Check warning on line 237 in crates/tabby-index/src/indexer.rs

View check run for this annotation

Codecov / codecov/patch

crates/tabby-index/src/indexer.rs#L230-L237

Added lines #L230 - L237 were not covered by tests

/// Ask the writer to delete everything matched by `doc_query_with_chunks`
/// for `id` in this indexer's corpus.
///
/// The result of `delete_query` is intentionally discarded.
pub fn delete(&self, id: &str) {
    let schema = IndexSchema::instance();
    let target = schema.doc_query_with_chunks(&self.corpus, id);
    let _ = self.writer.delete_query(Box::new(target));
}

/// Ask the writer to delete what `doc_query` matches for `id` in this
/// indexer's corpus.
///
/// Differs from `delete` only in the query used (`doc_query` instead of
/// `doc_query_with_chunks`). The result of `delete_query` is intentionally
/// discarded.
pub fn delete_doc(&self, id: &str) {
    let schema = IndexSchema::instance();
    let target = schema.doc_query(&self.corpus, id);
    let _ = self.writer.delete_query(Box::new(target));
}

Check warning on line 251 in crates/tabby-index/src/indexer.rs

View check run for this annotation

Codecov / codecov/patch

crates/tabby-index/src/indexer.rs#L246-L251

Added lines #L246 - L251 were not covered by tests

pub fn commit(mut self) {
self.writer.commit().expect("Failed to commit changes");
self.writer
Expand Down Expand Up @@ -369,3 +415,11 @@ impl IndexGarbageCollector {
/// Return the first value of `field` in `doc` as a string slice.
///
/// # Panics
///
/// Panics if `doc` has no value for `field`, or if that value is not a
/// string. The panic message names the field so the failing lookup can be
/// identified.
fn get_text(doc: &TantivyDocument, field: schema::Field) -> &str {
    doc.get_first(field)
        .unwrap_or_else(|| panic!("document has no value for field {field:?}"))
        .as_str()
        .unwrap_or_else(|| panic!("value of field {field:?} is not a string"))
}

/// Return the first value of `field` in `doc` as a tantivy date-time.
///
/// # Panics
///
/// Panics if `doc` has no value for `field`, or if that value is not a
/// date-time. The panic message names the field so the failing lookup can be
/// identified.
fn get_date(doc: &TantivyDocument, field: schema::Field) -> tantivy::DateTime {
    doc.get_first(field)
        .unwrap_or_else(|| panic!("document has no value for field {field:?}"))
        .as_datetime()
        .unwrap_or_else(|| panic!("value of field {field:?} is not a datetime"))
}

Check warning on line 421 in crates/tabby-index/src/indexer.rs

View check run for this annotation

Codecov / codecov/patch

crates/tabby-index/src/indexer.rs#L419-L421

Added lines #L419 - L421 were not covered by tests

/// Read the first value of `field` in `doc` as a signed integer.
///
/// Returns `None` when the document has no value for `field`, when the value
/// is not an integer, or when an unsigned value does not fit in an `i64`.
///
/// `as_i64` only matches values stored as `i64`. Counters written with
/// `add_u64` come back as `u64`, so fall back to `as_u64` with a checked
/// conversion rather than reporting them as missing.
fn get_number_optional(doc: &TantivyDocument, field: schema::Field) -> Option<i64> {
    let value = doc.get_first(field)?;
    value
        .as_i64()
        .or_else(|| value.as_u64().and_then(|unsigned| i64::try_from(unsigned).ok()))
}

Check warning on line 425 in crates/tabby-index/src/indexer.rs

View check run for this annotation

Codecov / codecov/patch

crates/tabby-index/src/indexer.rs#L423-L425

Added lines #L423 - L425 were not covered by tests

0 comments on commit 3018718

Please sign in to comment.