Skip to content

Commit

Permalink
feat: add author field to issue and pull context
Browse files Browse the repository at this point in the history
Signed-off-by: Wei Zhang <[email protected]>
  • Loading branch information
zwpaper committed Nov 28, 2024
1 parent 38e3198 commit 18a84d6
Show file tree
Hide file tree
Showing 14 changed files with 178 additions and 30 deletions.
41 changes: 31 additions & 10 deletions crates/tabby-common/src/api/structured_doc.rs
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ pub struct DocSearchWebDocument {
pub struct DocSearchIssueDocument {
    pub title: String,
    pub link: String,
    /// Author of the issue (added alongside the pull author field in this change).
    pub author: String,
    pub body: String,
    /// Whether the issue is closed, as stored in the index.
    pub closed: bool,
}
Expand All @@ -70,6 +71,7 @@ pub struct DocSearchIssueDocument {
pub struct DocSearchPullDocument {
pub title: String,
pub link: String,
pub author: String,
pub body: String,
pub diff: String,
pub merged: bool,
Expand Down Expand Up @@ -139,6 +141,11 @@ impl FromTantivyDocument for DocSearchIssueDocument {
schema.field_attributes,
structured_doc::fields::issue::LINK,
);
let author = get_json_text_field(
doc,
schema.field_attributes,
structured_doc::fields::issue::AUTHOR,
);
let body = get_json_text_field(
doc,
schema.field_attributes,
Expand All @@ -152,6 +159,7 @@ impl FromTantivyDocument for DocSearchIssueDocument {
Some(Self {
title: title.into(),
link: link.into(),
author: author.into(),
body: body.into(),
closed,
})
Expand All @@ -171,6 +179,11 @@ impl FromTantivyDocument for DocSearchPullDocument {
schema.field_attributes,
structured_doc::fields::pull::LINK,
);
let author = get_json_text_field(
doc,
schema.field_attributes,
structured_doc::fields::pull::AUTHOR,
);
let body = get_json_text_field(
doc,
schema.field_attributes,
Expand All @@ -189,6 +202,7 @@ impl FromTantivyDocument for DocSearchPullDocument {
Some(Self {
title: title.into(),
link: link.into(),
author: author.into(),
body: body.into(),
diff: diff.into(),
merged,
Expand All @@ -200,20 +214,27 @@ fn get_json_field<'a>(
doc: &'a TantivyDocument,
field: schema::Field,
name: &str,
) -> CompactDocValue<'a> {
doc.get_first(field)
.unwrap()
.as_object()
.unwrap()
.find(|(k, _)| *k == name)
.unwrap()
.1
) -> Option<CompactDocValue<'a>> {
Some(
doc.get_first(field)?
.as_object()?
.find(|(k, _)| *k == name)?
.1,
)
}

/// Read the boolean sub-field `name` from the JSON `field`.
///
/// Returns `false` when the field or key is missing, or when the stored
/// value is not a boolean — tolerant of documents indexed with older schemas.
fn get_json_bool_field(doc: &TantivyDocument, field: schema::Field, name: &str) -> bool {
    get_json_field(doc, field, name)
        .and_then(|value| value.as_bool())
        .unwrap_or_default()
}

/// Read the string sub-field `name` from the JSON `field`.
///
/// Returns `""` when the field or key is missing, or when the stored value
/// is not a string — tolerant of documents indexed with older schemas.
fn get_json_text_field<'a>(doc: &'a TantivyDocument, field: schema::Field, name: &str) -> &'a str {
    get_json_field(doc, field, name)
        .and_then(|value| value.as_str())
        .unwrap_or_default()
}
30 changes: 29 additions & 1 deletion crates/tabby-common/src/index/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ const FIELD_CHUNK_ID: &str = "chunk_id";
const FIELD_UPDATED_AT: &str = "updated_at";
const FIELD_FAILED_CHUNKS_COUNT: &str = "failed_chunks_count";
pub const FIELD_SOURCE_ID: &str = "source_id";
pub const FIELD_CHUNK_ATTRIBUTES: &str = "chunk_attributes";

pub mod corpus {
pub const CODE: &str = "code";
Expand Down Expand Up @@ -107,7 +108,7 @@ impl IndexSchema {

let field_chunk_id = builder.add_text_field(FIELD_CHUNK_ID, STRING | FAST | STORED);
let field_chunk_attributes = builder.add_json_field(
"chunk_attributes",
FIELD_CHUNK_ATTRIBUTES,
JsonObjectOptions::default()
.set_stored()
.set_indexing_options(
Expand Down Expand Up @@ -228,6 +229,33 @@ impl IndexSchema {
])
}

/// Build a query matching the non-chunk document with `doc_id` in `corpus`
/// that carries the attribute `field` under its `chunk_attributes` JSON.
///
/// Used to detect documents indexed before a schema change (a missing
/// attribute means the document needs reindexing).
pub fn doc_has_attribute_field(&self, corpus: &str, doc_id: &str, field: &str) -> impl Query {
    let doc_id_query = TermQuery::new(
        Term::from_field_text(self.field_id, doc_id),
        tantivy::schema::IndexRecordOption::Basic,
    );

    BooleanQuery::new(vec![
        // Must match the corpus
        (Occur::Must, self.corpus_query(corpus)),
        // Must match the doc id
        (Occur::Must, Box::new(doc_id_query)),
        // Must contain the requested attribute under chunk_attributes
        (
            Occur::Must,
            Box::new(ExistsQuery::new_exists_query(
                format!("{}.{}", FIELD_CHUNK_ATTRIBUTES, field).into(),
            )),
        ),
        // Exclude chunk documents
        (
            Occur::MustNot,
            Box::new(ExistsQuery::new_exists_query(FIELD_CHUNK_ID.into())),
        ),
    ])
}

/// Build a query to find the document with the given `doc_id`, include chunks.
pub fn doc_query_with_chunks(&self, corpus: &str, doc_id: &str) -> impl Query {
let doc_id_query = TermQuery::new(
Expand Down
2 changes: 2 additions & 0 deletions crates/tabby-common/src/index/structured_doc.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,15 @@ pub mod fields {
/// Index field names for issue documents.
pub mod issue {
    pub const TITLE: &str = "title";
    pub const LINK: &str = "link";
    // Added in this change; older indexed issues may lack this attribute.
    pub const AUTHOR: &str = "author";
    pub const BODY: &str = "body";
    pub const CLOSED: &str = "closed";
}

pub mod pull {
pub const TITLE: &str = "title";
pub const LINK: &str = "link";
pub const AUTHOR: &str = "author";
pub const BODY: &str = "body";
pub const DIFF: &str = "diff";
pub const MERGED: &str = "merged";
Expand Down
16 changes: 13 additions & 3 deletions crates/tabby-index/src/indexer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -252,10 +252,9 @@ impl Indexer {
!docs.is_empty()
}

/// Get the failed_chunks_count field for a document.
/// tracks the number of embedding indexing failed chunks for a document.
/// Check whether the document has failed chunks.
///
/// return 0 if the field is not found.
/// failed chunks tracks the number of embedding indexing failed chunks for a document.
pub fn has_failed_chunks(&self, id: &str) -> bool {
let schema = IndexSchema::instance();
let query = schema.doc_has_failed_chunks(&self.corpus, id);
Expand All @@ -265,6 +264,17 @@ impl Indexer {

!docs.is_empty()
}

/// Returns whether the indexed document `id` carries the attribute `field`.
///
/// Any search error is treated as "attribute not present".
pub fn has_attribute_field(&self, id: &str, field: &str) -> bool {
    let query = IndexSchema::instance().doc_has_attribute_field(&self.corpus, id, field);
    self.searcher
        .search(&query, &TopDocs::with_limit(1))
        .map(|docs| !docs.is_empty())
        .unwrap_or(false)
}
}

pub struct IndexGarbageCollector {
Expand Down
19 changes: 18 additions & 1 deletion crates/tabby-index/src/structured_doc/public.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ use std::sync::Arc;
use async_stream::stream;
use chrono::{DateTime, Utc};
use futures::StreamExt;
use tabby_common::index::corpus;
use tabby_common::index::{corpus, structured_doc::fields as StructuredDocIndexFields};
use tabby_inference::Embedding;

pub use super::types::{
Expand Down Expand Up @@ -85,6 +85,10 @@ impl StructuredDocIndexer {
return false;
}

if self.should_reindex(document) {
return true;
}

if self.indexer.is_indexed_after(document.id(), updated_at)
&& !self.indexer.has_failed_chunks(document.id())
{
Expand All @@ -93,4 +97,17 @@ impl StructuredDocIndexer {

true
}

/// Decide whether an already-indexed document must be rebuilt because it was
/// written with an older schema.
///
/// v0.22.0 added the author field to issue and pull documents, so an indexed
/// issue/pull that does NOT yet carry the author attribute needs reindexing.
/// (The original code returned `has_attribute_field(..)` un-negated, which
/// would skip exactly the stale documents and perpetually reindex fresh ones.)
fn should_reindex(&self, document: &StructuredDoc) -> bool {
    match &document.fields {
        StructuredDocFields::Issue(_) => !self
            .indexer
            .has_attribute_field(document.id(), StructuredDocIndexFields::issue::AUTHOR),
        StructuredDocFields::Pull(_) => !self
            .indexer
            .has_attribute_field(document.id(), StructuredDocIndexFields::pull::AUTHOR),
        // Other document kinds had no schema change; never force a reindex.
        _ => false,
    }
}
}
2 changes: 2 additions & 0 deletions crates/tabby-index/src/structured_doc/types/issue.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ use super::{build_tokens, BuildStructuredDoc};
/// Source fields of an issue to be built into a structured document.
pub struct IssueDocument {
    pub link: String,
    pub title: String,
    /// Author of the issue (added alongside the pull author field in this change).
    pub author: String,
    pub body: String,
    pub closed: bool,
}
Expand All @@ -27,6 +28,7 @@ impl BuildStructuredDoc for IssueDocument {
json!({
fields::issue::LINK: self.link,
fields::issue::TITLE: self.title,
fields::issue::AUTHOR: self.author,
fields::issue::BODY: self.body,
fields::issue::CLOSED: self.closed,
})
Expand Down
2 changes: 2 additions & 0 deletions crates/tabby-index/src/structured_doc/types/pull.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ use super::{build_tokens, BuildStructuredDoc};
pub struct PullDocument {
pub link: String,
pub title: String,
pub author: String,
pub body: String,

/// The diff represents the code changes in this PR,
Expand All @@ -33,6 +34,7 @@ impl BuildStructuredDoc for PullDocument {
json!({
fields::pull::LINK: self.link,
fields::pull::TITLE: self.title,
fields::pull::AUTHOR: self.author,
fields::pull::BODY: self.body,
fields::pull::DIFF: self.diff,
fields::pull::MERGED: self.merged,
Expand Down
69 changes: 54 additions & 15 deletions crates/tabby-index/src/structured_doc_tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ mod structured_doc_tests {
use std::sync::Arc;

use serial_test::file_serial;
use tabby_common::index::corpus;
use tabby_common::index::{corpus, structured_doc::fields as StructuredDocIndexFields};
use temp_testdir::TempDir;

use super::mock_embedding::MockEmbedding;
Expand Down Expand Up @@ -59,6 +59,7 @@ mod structured_doc_tests {
fields: StructuredDocFields::Issue(StructuredDocIssueFields {
link: id.to_owned(),
title: "title".to_owned(),
author: "author".to_owned(),
body: "body".to_owned(),
closed: false,
}),
Expand All @@ -82,13 +83,7 @@ mod structured_doc_tests {
indexer.commit();

let validator = Indexer::new(corpus::STRUCTURED_DOC);
// Wait for up to 60s for the document to be indexed.
for _ in 0..10 {
if validator.is_indexed(id) {
break;
}
std::thread::sleep(std::time::Duration::from_secs(1));
}

assert!(validator.is_indexed(id));
assert!(validator.has_failed_chunks(id));

Expand All @@ -111,6 +106,7 @@ mod structured_doc_tests {
fields: StructuredDocFields::Issue(StructuredDocIssueFields {
link: id.to_owned(),
title: "title".to_owned(),
author: "author".to_owned(),
body: "body".to_owned(),
closed: false,
}),
Expand All @@ -134,18 +130,59 @@ mod structured_doc_tests {
indexer.commit();

let validator = Indexer::new(corpus::STRUCTURED_DOC);
// Wait for up to 60s for the document to be indexed.
for _ in 0..10 {
if validator.is_indexed(id) {
break;
}
std::thread::sleep(std::time::Duration::from_secs(1));
}

assert!(validator.is_indexed(id));
assert!(!validator.has_failed_chunks(id));

tabby_common::path::set_tabby_root(root);
}

/// Verify that a freshly indexed issue document carries the `author`
/// attribute, i.e. `has_attribute_field` detects the new-schema field.
#[test]
#[file_serial(set_tabby_root)]
fn test_structured_doc_has_attribute_field() {
    let root = tabby_common::path::tabby_root();
    let temp_dir = TempDir::default();
    tabby_common::path::set_tabby_root(temp_dir.to_owned());

    // The id doubles as the issue link in this fixture.
    let id = "structured_doc_has_attribute_field";
    let embedding = MockEmbedding::new(vec![]);
    let embedding = Arc::new(embedding);
    let indexer = StructuredDocIndexer::new(embedding.clone());
    let doc = StructuredDoc {
        source_id: "source".to_owned(),
        fields: StructuredDocFields::Issue(StructuredDocIssueFields {
            link: id.to_owned(),
            title: "title".to_owned(),
            author: "author".to_owned(),
            body: "body".to_owned(),
            closed: false,
        }),
    };

    let updated_at = chrono::Utc::now();
    let updated = tokio::runtime::Runtime::new().unwrap().block_on(async {
        indexer
            .sync(
                StructuredDocState {
                    updated_at,
                    deleted: false,
                },
                doc,
            )
            .await
    });
    assert!(updated);
    indexer.commit();

    let validator = Indexer::new(corpus::STRUCTURED_DOC);
    assert!(validator.is_indexed(id));
    assert!(validator.has_attribute_field(id, StructuredDocIndexFields::issue::AUTHOR));

    // Restore the global tabby root for subsequent serial tests.
    tabby_common::path::set_tabby_root(root);
}
}

mod builder_tests {
Expand Down Expand Up @@ -185,6 +222,7 @@ mod builder_tests {
fields: StructuredDocFields::Issue(StructuredDocIssueFields {
link: test_id.to_owned(),
title: "title".to_owned(),
author: "author".to_owned(),
body: "body".to_owned(),
closed: false,
}),
Expand Down Expand Up @@ -240,6 +278,7 @@ mod builder_tests {
fields: StructuredDocFields::Issue(StructuredDocIssueFields {
link: test_id.to_owned(),
title: "title".to_owned(),
author: "author".to_owned(),
body: "body".to_owned(),
closed: false,
}),
Expand Down
Loading

0 comments on commit 18a84d6

Please sign in to comment.