Skip to content

Commit

Permalink
feat: add author field to issue and pull context
Browse files Browse the repository at this point in the history
Signed-off-by: Wei Zhang <[email protected]>
  • Loading branch information
zwpaper committed Nov 28, 2024
1 parent 38e3198 commit 18a84d6
Show file tree
Hide file tree
Showing 14 changed files with 178 additions and 30 deletions.
41 changes: 31 additions & 10 deletions crates/tabby-common/src/api/structured_doc.rs
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ pub struct DocSearchWebDocument {
pub struct DocSearchIssueDocument {
    pub title: String,
    pub link: String,
    /// Author of the issue (added alongside the pull author field in this change).
    pub author: String,
    pub body: String,
    /// Whether the issue is closed, as stored in the index.
    pub closed: bool,
}
Expand All @@ -70,6 +71,7 @@ pub struct DocSearchIssueDocument {
pub struct DocSearchPullDocument {
pub title: String,
pub link: String,
pub author: String,
pub body: String,
pub diff: String,
pub merged: bool,
Expand Down Expand Up @@ -139,6 +141,11 @@ impl FromTantivyDocument for DocSearchIssueDocument {
schema.field_attributes,
structured_doc::fields::issue::LINK,
);
let author = get_json_text_field(
doc,
schema.field_attributes,
structured_doc::fields::issue::AUTHOR,
);
let body = get_json_text_field(
doc,
schema.field_attributes,
Expand All @@ -152,6 +159,7 @@ impl FromTantivyDocument for DocSearchIssueDocument {
Some(Self {
title: title.into(),
link: link.into(),
author: author.into(),
body: body.into(),
closed,
})
Expand All @@ -171,6 +179,11 @@ impl FromTantivyDocument for DocSearchPullDocument {
schema.field_attributes,
structured_doc::fields::pull::LINK,
);
let author = get_json_text_field(
doc,
schema.field_attributes,
structured_doc::fields::pull::AUTHOR,
);
let body = get_json_text_field(
doc,
schema.field_attributes,
Expand All @@ -189,6 +202,7 @@ impl FromTantivyDocument for DocSearchPullDocument {
Some(Self {
title: title.into(),
link: link.into(),
author: author.into(),
body: body.into(),
diff: diff.into(),
merged,
Expand All @@ -200,20 +214,27 @@ fn get_json_field<'a>(
doc: &'a TantivyDocument,
field: schema::Field,
name: &str,
) -> CompactDocValue<'a> {
doc.get_first(field)
.unwrap()
.as_object()
.unwrap()
.find(|(k, _)| *k == name)
.unwrap()
.1
) -> Option<CompactDocValue<'a>> {
Some(
doc.get_first(field)?
.as_object()?
.find(|(k, _)| *k == name)?
.1,
)
}

/// Read the boolean sub-field `name` from the JSON `field`.
///
/// Returns `false` when the field or key is missing, or when the stored
/// value is not a boolean — tolerant of documents indexed with older schemas.
fn get_json_bool_field(doc: &TantivyDocument, field: schema::Field, name: &str) -> bool {
    get_json_field(doc, field, name)
        .and_then(|value| value.as_bool())
        .unwrap_or_default()
}

/// Read the string sub-field `name` from the JSON `field`.
///
/// Returns `""` when the field or key is missing, or when the stored value
/// is not a string — tolerant of documents indexed with older schemas.
fn get_json_text_field<'a>(doc: &'a TantivyDocument, field: schema::Field, name: &str) -> &'a str {
    get_json_field(doc, field, name)
        .and_then(|value| value.as_str())
        .unwrap_or_default()
}
30 changes: 29 additions & 1 deletion crates/tabby-common/src/index/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ const FIELD_CHUNK_ID: &str = "chunk_id";
const FIELD_UPDATED_AT: &str = "updated_at";
const FIELD_FAILED_CHUNKS_COUNT: &str = "failed_chunks_count";
pub const FIELD_SOURCE_ID: &str = "source_id";
pub const FIELD_CHUNK_ATTRIBUTES: &str = "chunk_attributes";

pub mod corpus {
pub const CODE: &str = "code";
Expand Down Expand Up @@ -107,7 +108,7 @@ impl IndexSchema {

let field_chunk_id = builder.add_text_field(FIELD_CHUNK_ID, STRING | FAST | STORED);
let field_chunk_attributes = builder.add_json_field(
"chunk_attributes",
FIELD_CHUNK_ATTRIBUTES,
JsonObjectOptions::default()
.set_stored()
.set_indexing_options(
Expand Down Expand Up @@ -228,6 +229,33 @@ impl IndexSchema {
])
}

/// Build a query matching the non-chunk document with `doc_id` in `corpus`
/// that carries the attribute `field` under its `chunk_attributes` JSON.
///
/// Used to detect documents indexed before a schema change (a missing
/// attribute means the document needs reindexing).
pub fn doc_has_attribute_field(&self, corpus: &str, doc_id: &str, field: &str) -> impl Query {
    let doc_id_query = TermQuery::new(
        Term::from_field_text(self.field_id, doc_id),
        tantivy::schema::IndexRecordOption::Basic,
    );

    BooleanQuery::new(vec![
        // Must match the corpus
        (Occur::Must, self.corpus_query(corpus)),
        // Must match the doc id
        (Occur::Must, Box::new(doc_id_query)),
        // Must contain the requested attribute under chunk_attributes
        (
            Occur::Must,
            Box::new(ExistsQuery::new_exists_query(
                format!("{}.{}", FIELD_CHUNK_ATTRIBUTES, field).into(),
            )),
        ),
        // Exclude chunk documents
        (
            Occur::MustNot,
            Box::new(ExistsQuery::new_exists_query(FIELD_CHUNK_ID.into())),
        ),
    ])
}

/// Build a query to find the document with the given `doc_id`, include chunks.
pub fn doc_query_with_chunks(&self, corpus: &str, doc_id: &str) -> impl Query {
let doc_id_query = TermQuery::new(
Expand Down
2 changes: 2 additions & 0 deletions crates/tabby-common/src/index/structured_doc.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,15 @@ pub mod fields {
/// Index field names for issue documents.
pub mod issue {
    pub const TITLE: &str = "title";
    pub const LINK: &str = "link";
    // Added in this change; older indexed issues may lack this attribute.
    pub const AUTHOR: &str = "author";
    pub const BODY: &str = "body";
    pub const CLOSED: &str = "closed";
}

pub mod pull {
pub const TITLE: &str = "title";
pub const LINK: &str = "link";
pub const AUTHOR: &str = "author";
pub const BODY: &str = "body";
pub const DIFF: &str = "diff";
pub const MERGED: &str = "merged";
Expand Down
16 changes: 13 additions & 3 deletions crates/tabby-index/src/indexer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -252,10 +252,9 @@ impl Indexer {
!docs.is_empty()
}

/// Get the failed_chunks_count field for a document.
/// tracks the number of embedding indexing failed chunks for a document.
/// Check whether the document has failed chunks.
///
/// return 0 if the field is not found.
/// failed chunks tracks the number of embedding indexing failed chunks for a document.
pub fn has_failed_chunks(&self, id: &str) -> bool {
let schema = IndexSchema::instance();
let query = schema.doc_has_failed_chunks(&self.corpus, id);
Expand All @@ -265,6 +264,17 @@ impl Indexer {

!docs.is_empty()
}

/// Returns whether the indexed document `id` carries the attribute `field`.
///
/// Any search error is treated as "attribute not present".
pub fn has_attribute_field(&self, id: &str, field: &str) -> bool {
    let query = IndexSchema::instance().doc_has_attribute_field(&self.corpus, id, field);
    self.searcher
        .search(&query, &TopDocs::with_limit(1))
        .map(|docs| !docs.is_empty())
        .unwrap_or(false)
}
}

pub struct IndexGarbageCollector {
Expand Down
19 changes: 18 additions & 1 deletion crates/tabby-index/src/structured_doc/public.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ use std::sync::Arc;
use async_stream::stream;
use chrono::{DateTime, Utc};
use futures::StreamExt;
use tabby_common::index::corpus;
use tabby_common::index::{corpus, structured_doc::fields as StructuredDocIndexFields};
use tabby_inference::Embedding;

pub use super::types::{
Expand Down Expand Up @@ -85,6 +85,10 @@ impl StructuredDocIndexer {
return false;
}

if self.should_reindex(document) {
return true;
}

if self.indexer.is_indexed_after(document.id(), updated_at)
&& !self.indexer.has_failed_chunks(document.id())
{
Expand All @@ -93,4 +97,17 @@ impl StructuredDocIndexer {

true
}

/// Decide whether an already-indexed document must be rebuilt because it was
/// written with an older schema.
///
/// v0.22.0 added the author field to issue and pull documents, so an indexed
/// issue/pull that does NOT yet carry the author attribute needs reindexing.
/// (The original code returned `has_attribute_field(..)` un-negated, which
/// would skip exactly the stale documents and perpetually reindex fresh ones.)
fn should_reindex(&self, document: &StructuredDoc) -> bool {
    match &document.fields {
        StructuredDocFields::Issue(_) => !self
            .indexer
            .has_attribute_field(document.id(), StructuredDocIndexFields::issue::AUTHOR),
        StructuredDocFields::Pull(_) => !self
            .indexer
            .has_attribute_field(document.id(), StructuredDocIndexFields::pull::AUTHOR),
        // Other document kinds had no schema change; never force a reindex.
        _ => false,
    }
}
}
2 changes: 2 additions & 0 deletions crates/tabby-index/src/structured_doc/types/issue.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ use super::{build_tokens, BuildStructuredDoc};
/// Source fields of an issue to be built into a structured document.
pub struct IssueDocument {
    pub link: String,
    pub title: String,
    /// Author of the issue (added alongside the pull author field in this change).
    pub author: String,
    pub body: String,
    pub closed: bool,
}
Expand All @@ -27,6 +28,7 @@ impl BuildStructuredDoc for IssueDocument {
json!({
fields::issue::LINK: self.link,
fields::issue::TITLE: self.title,
fields::issue::AUTHOR: self.author,
fields::issue::BODY: self.body,
fields::issue::CLOSED: self.closed,
})
Expand Down
2 changes: 2 additions & 0 deletions crates/tabby-index/src/structured_doc/types/pull.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ use super::{build_tokens, BuildStructuredDoc};
pub struct PullDocument {
pub link: String,
pub title: String,
pub author: String,
pub body: String,

/// The diff represents the code changes in this PR,
Expand All @@ -33,6 +34,7 @@ impl BuildStructuredDoc for PullDocument {
json!({
fields::pull::LINK: self.link,
fields::pull::TITLE: self.title,
fields::pull::AUTHOR: self.author,
fields::pull::BODY: self.body,
fields::pull::DIFF: self.diff,
fields::pull::MERGED: self.merged,
Expand Down
69 changes: 54 additions & 15 deletions crates/tabby-index/src/structured_doc_tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ mod structured_doc_tests {
use std::sync::Arc;

use serial_test::file_serial;
use tabby_common::index::corpus;
use tabby_common::index::{corpus, structured_doc::fields as StructuredDocIndexFields};
use temp_testdir::TempDir;

use super::mock_embedding::MockEmbedding;
Expand Down Expand Up @@ -59,6 +59,7 @@ mod structured_doc_tests {
fields: StructuredDocFields::Issue(StructuredDocIssueFields {
link: id.to_owned(),
title: "title".to_owned(),
author: "author".to_owned(),
body: "body".to_owned(),
closed: false,
}),
Expand All @@ -82,13 +83,7 @@ mod structured_doc_tests {
indexer.commit();

let validator = Indexer::new(corpus::STRUCTURED_DOC);
// Wait for up to 60s for the document to be indexed.
for _ in 0..10 {
if validator.is_indexed(id) {
break;
}
std::thread::sleep(std::time::Duration::from_secs(1));
}

assert!(validator.is_indexed(id));
assert!(validator.has_failed_chunks(id));

Expand All @@ -111,6 +106,7 @@ mod structured_doc_tests {
fields: StructuredDocFields::Issue(StructuredDocIssueFields {
link: id.to_owned(),
title: "title".to_owned(),
author: "author".to_owned(),
body: "body".to_owned(),
closed: false,
}),
Expand All @@ -134,18 +130,59 @@ mod structured_doc_tests {
indexer.commit();

let validator = Indexer::new(corpus::STRUCTURED_DOC);
// Wait for up to 60s for the document to be indexed.
for _ in 0..10 {
if validator.is_indexed(id) {
break;
}
std::thread::sleep(std::time::Duration::from_secs(1));
}

assert!(validator.is_indexed(id));
assert!(!validator.has_failed_chunks(id));

tabby_common::path::set_tabby_root(root);
}

/// Verify that a freshly indexed issue document carries the `author`
/// attribute, i.e. `has_attribute_field` detects the new-schema field.
#[test]
#[file_serial(set_tabby_root)]
fn test_structured_doc_has_attribute_field() {
    let root = tabby_common::path::tabby_root();
    let temp_dir = TempDir::default();
    tabby_common::path::set_tabby_root(temp_dir.to_owned());

    // The id doubles as the issue link in this fixture.
    let id = "structured_doc_has_attribute_field";
    let embedding = MockEmbedding::new(vec![]);
    let embedding = Arc::new(embedding);
    let indexer = StructuredDocIndexer::new(embedding.clone());
    let doc = StructuredDoc {
        source_id: "source".to_owned(),
        fields: StructuredDocFields::Issue(StructuredDocIssueFields {
            link: id.to_owned(),
            title: "title".to_owned(),
            author: "author".to_owned(),
            body: "body".to_owned(),
            closed: false,
        }),
    };

    let updated_at = chrono::Utc::now();
    let updated = tokio::runtime::Runtime::new().unwrap().block_on(async {
        indexer
            .sync(
                StructuredDocState {
                    updated_at,
                    deleted: false,
                },
                doc,
            )
            .await
    });
    assert!(updated);
    indexer.commit();

    let validator = Indexer::new(corpus::STRUCTURED_DOC);
    assert!(validator.is_indexed(id));
    assert!(validator.has_attribute_field(id, StructuredDocIndexFields::issue::AUTHOR));

    // Restore the global tabby root for subsequent serial tests.
    tabby_common::path::set_tabby_root(root);
}
}

mod builder_tests {
Expand Down Expand Up @@ -185,6 +222,7 @@ mod builder_tests {
fields: StructuredDocFields::Issue(StructuredDocIssueFields {
link: test_id.to_owned(),
title: "title".to_owned(),
author: "author".to_owned(),
body: "body".to_owned(),
closed: false,
}),
Expand Down Expand Up @@ -240,6 +278,7 @@ mod builder_tests {
fields: StructuredDocFields::Issue(StructuredDocIssueFields {
link: test_id.to_owned(),
title: "title".to_owned(),
author: "author".to_owned(),
body: "body".to_owned(),
closed: false,
}),
Expand Down
Loading

0 comments on commit 18a84d6

Please sign in to comment.