From b496c32a2c837730a9709f88a3a47f28db4ad307 Mon Sep 17 00:00:00 2001 From: Wei Zhang Date: Mon, 25 Nov 2024 13:47:31 +0800 Subject: [PATCH 1/9] chore: delete index when pr closed --- .../tabby-index/src/structured_doc/public.rs | 4 +++ .../background_job/third_party_integration.rs | 27 +++++++++----- .../third_party_integration/issues.rs | 16 ++++++--- .../third_party_integration/pulls.rs | 35 +++++++++++++++---- 4 files changed, 63 insertions(+), 19 deletions(-) diff --git a/crates/tabby-index/src/structured_doc/public.rs b/crates/tabby-index/src/structured_doc/public.rs index 0832d48c8bb9..854f21dbb370 100644 --- a/crates/tabby-index/src/structured_doc/public.rs +++ b/crates/tabby-index/src/structured_doc/public.rs @@ -44,6 +44,10 @@ impl StructuredDocIndexer { true } + pub async fn delete(&self, id: &str) { + self.indexer.delete(id); + } + pub fn commit(self) { self.indexer.commit(); } diff --git a/ee/tabby-webserver/src/service/background_job/third_party_integration.rs b/ee/tabby-webserver/src/service/background_job/third_party_integration.rs index 2bdd85244f38..587ecf8b905e 100644 --- a/ee/tabby-webserver/src/service/background_job/third_party_integration.rs +++ b/ee/tabby-webserver/src/service/background_job/third_party_integration.rs @@ -140,18 +140,24 @@ impl SchedulerGithubGitlabJob { stream! 
{ let mut count = 0; let mut num_updated = 0; - for await (updated_at, doc) in issue_stream.chain(pull_stream) { - if index.add(updated_at, doc).await { + let mut num_deleted = 0; + for await (state, doc) in issue_stream.chain(pull_stream) { + let id = &doc.id().to_owned(); + if index.add(state.updated_at, doc).await { num_updated += 1 } + if state.should_clean { + index.delete(&id).await; + num_deleted += 1; + } count += 1; if count % 100 == 0 { - logkit::info!("{} docs seen, {} docs updated", count, num_updated); + logkit::info!("{} docs seen, {} docs updated, {} docs deleted", count, num_updated, num_deleted); }; } - logkit::info!("{} docs seen, {} docs updated", count, num_updated); + logkit::info!("{} docs seen, {} docs updated, {} docs deleted", count, num_updated, num_deleted); index.commit(); } .count() @@ -179,11 +185,16 @@ impl SchedulerGithubGitlabJob { } } +pub struct FetchState { + updated_at: DateTime, + should_clean: bool, +} + async fn fetch_all_issues( integration: &Integration, repository: &ProvidedRepository, -) -> tabby_schema::Result, StructuredDoc)>> { - let s: BoxStream<(DateTime, StructuredDoc)> = match &integration.kind { +) -> tabby_schema::Result> { + let s: BoxStream<(FetchState, StructuredDoc)> = match &integration.kind { IntegrationKind::Github | IntegrationKind::GithubSelfHosted => list_github_issues( &repository.source_id(), integration.api_base(), @@ -207,8 +218,8 @@ async fn fetch_all_issues( async fn fetch_all_pulls( integration: &Integration, repository: &ProvidedRepository, -) -> tabby_schema::Result, StructuredDoc)>> { - let s: BoxStream<(DateTime, StructuredDoc)> = list_github_pulls( +) -> tabby_schema::Result> { + let s: BoxStream<(FetchState, StructuredDoc)> = list_github_pulls( &repository.source_id(), integration.api_base(), &repository.display_name, diff --git a/ee/tabby-webserver/src/service/background_job/third_party_integration/issues.rs b/ee/tabby-webserver/src/service/background_job/third_party_integration/issues.rs 
index d718d630e78e..cc04abbb1f13 100644 --- a/ee/tabby-webserver/src/service/background_job/third_party_integration/issues.rs +++ b/ee/tabby-webserver/src/service/background_job/third_party_integration/issues.rs @@ -1,3 +1,5 @@ +use super::FetchState; + use anyhow::{anyhow, Result}; use async_stream::stream; use chrono::{DateTime, Utc}; @@ -14,7 +16,7 @@ pub async fn list_github_issues( api_base: &str, full_name: &str, access_token: &str, -) -> Result, StructuredDoc)>> { +) -> Result> { let octocrab = Octocrab::builder() .personal_token(access_token.to_string()) .base_uri(api_base)? @@ -62,7 +64,10 @@ pub async fn list_github_issues( closed: issue.state == octocrab::models::IssueState::Closed, }) }; - yield (issue.updated_at, doc); + yield (FetchState { + updated_at: issue.updated_at, + should_clean: false, + }, doc); } page += 1; @@ -89,7 +94,7 @@ pub async fn list_gitlab_issues( api_base: &str, full_name: &str, access_token: &str, -) -> Result, StructuredDoc)>> { +) -> Result> { let gitlab = create_gitlab_client(api_base, access_token).await?; let source_id = source_id.to_owned(); @@ -118,7 +123,10 @@ pub async fn list_gitlab_issues( body: issue.description.unwrap_or_default(), closed: issue.state == "closed", })}; - yield (issue.updated_at, doc); + yield (FetchState { + updated_at: issue.updated_at, + should_clean: false, + }, doc); } }; diff --git a/ee/tabby-webserver/src/service/background_job/third_party_integration/pulls.rs b/ee/tabby-webserver/src/service/background_job/third_party_integration/pulls.rs index 7559773bf885..5c8bbecdefcc 100644 --- a/ee/tabby-webserver/src/service/background_job/third_party_integration/pulls.rs +++ b/ee/tabby-webserver/src/service/background_job/third_party_integration/pulls.rs @@ -1,6 +1,7 @@ +use super::FetchState; + use anyhow::{anyhow, Result}; use async_stream::stream; -use chrono::{DateTime, Utc}; use futures::Stream; use octocrab::{models::IssueState, Octocrab}; use tabby_index::public::{StructuredDoc, 
StructuredDocFields, StructuredDocPullDocumentFields}; @@ -10,7 +11,7 @@ pub async fn list_github_pulls( api_base: &str, full_name: &str, access_token: &str, -) -> Result, StructuredDoc)>> { +) -> Result> { let octocrab = Octocrab::builder() .personal_token(access_token.to_string()) .base_uri(api_base)? @@ -43,14 +44,31 @@ pub async fn list_github_pulls( let pages = response.number_of_pages().unwrap_or_default(); for pull in response.items { + let url = pull.html_url.map(|url| url.to_string()).unwrap_or_else(|| pull.url); + let title = pull.title.clone().unwrap_or_default(); + let body = pull.body.clone().unwrap_or_default(); + let doc = StructuredDoc { + source_id: source_id.to_string(), + fields: StructuredDocFields::Pull(StructuredDocPullDocumentFields { + link: url.clone(), + title, + body, + merged: pull.merged_at.is_some(), + diff: String::new(), + }), + }; + // skip closed but not merged pulls if let Some(state) = pull.state { if state == IssueState::Closed && pull.merged_at.is_none() { - continue + yield (FetchState{ + updated_at: pull.updated_at.unwrap(), + should_clean: true, + }, doc); } } - let url = pull.html_url.map(|url| url.to_string()).unwrap_or_else(|| pull.url); + let diff = match octocrab.pulls(&owner, &repo).get_diff(pull.number).await { Ok(x) if x.len() < 1024*1024*10 => x, Ok(_) => { @@ -71,10 +89,13 @@ pub async fn list_github_pulls( body: pull.body.unwrap_or_default(), diff, merged: pull.merged_at.is_some(), - }) - }; + })}; + - yield (pull.updated_at.unwrap(), doc); + yield (FetchState{ + updated_at: pull.updated_at.unwrap(), + should_clean: false, + }, doc); } page += 1; From 9dfd727dff74358e77883b7ffb5636401711ec9b Mon Sep 17 00:00:00 2001 From: "autofix-ci[bot]" <114827586+autofix-ci[bot]@users.noreply.github.com> Date: Mon, 25 Nov 2024 06:04:34 +0000 Subject: [PATCH 2/9] [autofix.ci] apply automated fixes --- .../src/service/background_job/third_party_integration.rs | 2 +- .../service/background_job/third_party_integration/issues.rs | 
3 +-- .../service/background_job/third_party_integration/pulls.rs | 4 ++-- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/ee/tabby-webserver/src/service/background_job/third_party_integration.rs b/ee/tabby-webserver/src/service/background_job/third_party_integration.rs index 587ecf8b905e..beb9dc7970c9 100644 --- a/ee/tabby-webserver/src/service/background_job/third_party_integration.rs +++ b/ee/tabby-webserver/src/service/background_job/third_party_integration.rs @@ -147,7 +147,7 @@ impl SchedulerGithubGitlabJob { num_updated += 1 } if state.should_clean { - index.delete(&id).await; + index.delete(id).await; num_deleted += 1; } diff --git a/ee/tabby-webserver/src/service/background_job/third_party_integration/issues.rs b/ee/tabby-webserver/src/service/background_job/third_party_integration/issues.rs index cc04abbb1f13..a9de29c9745e 100644 --- a/ee/tabby-webserver/src/service/background_job/third_party_integration/issues.rs +++ b/ee/tabby-webserver/src/service/background_job/third_party_integration/issues.rs @@ -1,5 +1,3 @@ -use super::FetchState; - use anyhow::{anyhow, Result}; use async_stream::stream; use chrono::{DateTime, Utc}; @@ -9,6 +7,7 @@ use octocrab::Octocrab; use serde::Deserialize; use tabby_index::public::{StructuredDoc, StructuredDocFields, StructuredDocIssueFields}; +use super::FetchState; use crate::service::create_gitlab_client; pub async fn list_github_issues( diff --git a/ee/tabby-webserver/src/service/background_job/third_party_integration/pulls.rs b/ee/tabby-webserver/src/service/background_job/third_party_integration/pulls.rs index 5c8bbecdefcc..6b4b03c8db9a 100644 --- a/ee/tabby-webserver/src/service/background_job/third_party_integration/pulls.rs +++ b/ee/tabby-webserver/src/service/background_job/third_party_integration/pulls.rs @@ -1,11 +1,11 @@ -use super::FetchState; - use anyhow::{anyhow, Result}; use async_stream::stream; use futures::Stream; use octocrab::{models::IssueState, Octocrab}; use 
tabby_index::public::{StructuredDoc, StructuredDocFields, StructuredDocPullDocumentFields}; +use super::FetchState; + pub async fn list_github_pulls( source_id: &str, api_base: &str, From b447bf0b32d34738490ce47d07cc4ea87a0db26f Mon Sep 17 00:00:00 2001 From: Wei Zhang Date: Mon, 25 Nov 2024 14:05:11 +0800 Subject: [PATCH 3/9] chore: not count as deleted when not existed --- crates/tabby-index/src/structured_doc/public.rs | 9 +++++++-- .../service/background_job/third_party_integration.rs | 3 +-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/crates/tabby-index/src/structured_doc/public.rs b/crates/tabby-index/src/structured_doc/public.rs index 854f21dbb370..2b54a5b9731a 100644 --- a/crates/tabby-index/src/structured_doc/public.rs +++ b/crates/tabby-index/src/structured_doc/public.rs @@ -44,8 +44,13 @@ impl StructuredDocIndexer { true } - pub async fn delete(&self, id: &str) { - self.indexer.delete(id); + pub async fn delete(&self, id: &str) -> bool { + if self.indexer.is_indexed(id) { + self.indexer.delete(id); + true + } else { + false + } } pub fn commit(self) { diff --git a/ee/tabby-webserver/src/service/background_job/third_party_integration.rs b/ee/tabby-webserver/src/service/background_job/third_party_integration.rs index beb9dc7970c9..d60d01f1bb43 100644 --- a/ee/tabby-webserver/src/service/background_job/third_party_integration.rs +++ b/ee/tabby-webserver/src/service/background_job/third_party_integration.rs @@ -146,8 +146,7 @@ impl SchedulerGithubGitlabJob { if index.add(state.updated_at, doc).await { num_updated += 1 } - if state.should_clean { - index.delete(id).await; + if state.should_clean && index.delete(id).await { num_deleted += 1; } From 5d0f6afaf68d719c062729d0e9a148e460a11f4f Mon Sep 17 00:00:00 2001 From: Wei Zhang Date: Mon, 25 Nov 2024 14:23:31 +0800 Subject: [PATCH 4/9] chore: fix duplicated add --- .../src/service/background_job/third_party_integration.rs | 2 +- .../service/background_job/third_party_integration/issues.rs | 
2 +- .../src/service/background_job/third_party_integration/pulls.rs | 1 + 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/ee/tabby-webserver/src/service/background_job/third_party_integration.rs b/ee/tabby-webserver/src/service/background_job/third_party_integration.rs index d60d01f1bb43..c84fbe14185b 100644 --- a/ee/tabby-webserver/src/service/background_job/third_party_integration.rs +++ b/ee/tabby-webserver/src/service/background_job/third_party_integration.rs @@ -143,7 +143,7 @@ impl SchedulerGithubGitlabJob { let mut num_deleted = 0; for await (state, doc) in issue_stream.chain(pull_stream) { let id = &doc.id().to_owned(); - if index.add(state.updated_at, doc).await { + if !state.should_clean && index.add(state.updated_at, doc).await { num_updated += 1 } if state.should_clean && index.delete(id).await { diff --git a/ee/tabby-webserver/src/service/background_job/third_party_integration/issues.rs b/ee/tabby-webserver/src/service/background_job/third_party_integration/issues.rs index a9de29c9745e..db641a0ed33e 100644 --- a/ee/tabby-webserver/src/service/background_job/third_party_integration/issues.rs +++ b/ee/tabby-webserver/src/service/background_job/third_party_integration/issues.rs @@ -65,7 +65,7 @@ pub async fn list_github_issues( }; yield (FetchState { updated_at: issue.updated_at, - should_clean: false, + should_clean: false, }, doc); } diff --git a/ee/tabby-webserver/src/service/background_job/third_party_integration/pulls.rs b/ee/tabby-webserver/src/service/background_job/third_party_integration/pulls.rs index 6b4b03c8db9a..83f6bfe56685 100644 --- a/ee/tabby-webserver/src/service/background_job/third_party_integration/pulls.rs +++ b/ee/tabby-webserver/src/service/background_job/third_party_integration/pulls.rs @@ -65,6 +65,7 @@ pub async fn list_github_pulls( updated_at: pull.updated_at.unwrap(), should_clean: true, }, doc); + continue; } } From 21ad094b627838babdc18a3d93d002623d07c98a Mon Sep 17 00:00:00 2001 From: Wei Zhang Date: Mon, 25 
Nov 2024 16:22:21 +0800 Subject: [PATCH 5/9] chore: use structuredDocState --- crates/tabby-index/src/lib.rs | 2 +- .../tabby-index/src/structured_doc/public.rs | 13 ++++++++-- .../background_job/third_party_integration.rs | 26 ++++++------------- .../third_party_integration/issues.rs | 17 ++++++------ .../third_party_integration/pulls.rs | 16 ++++++------ .../src/service/background_job/web_crawler.rs | 13 ++++++++-- 6 files changed, 48 insertions(+), 39 deletions(-) diff --git a/crates/tabby-index/src/lib.rs b/crates/tabby-index/src/lib.rs index 063706427258..443c0b14fd8e 100644 --- a/crates/tabby-index/src/lib.rs +++ b/crates/tabby-index/src/lib.rs @@ -20,7 +20,7 @@ pub mod public { code::CodeIndexer, structured_doc::public::{ StructuredDoc, StructuredDocFields, StructuredDocIndexer, StructuredDocIssueFields, - StructuredDocPullDocumentFields, StructuredDocWebFields, + StructuredDocPullDocumentFields, StructuredDocState, StructuredDocWebFields, }, }; diff --git a/crates/tabby-index/src/structured_doc/public.rs b/crates/tabby-index/src/structured_doc/public.rs index 2b54a5b9731a..efed332d3c13 100644 --- a/crates/tabby-index/src/structured_doc/public.rs +++ b/crates/tabby-index/src/structured_doc/public.rs @@ -14,6 +14,11 @@ pub use super::types::{ use super::{create_structured_doc_builder, types::BuildStructuredDoc}; use crate::{indexer::TantivyDocBuilder, Indexer}; +pub struct StructuredDocState { + pub updated_at: DateTime, + pub deleted: bool, +} + pub struct StructuredDocIndexer { builder: TantivyDocBuilder, indexer: Indexer, @@ -26,11 +31,15 @@ impl StructuredDocIndexer { Self { indexer, builder } } - pub async fn add(&self, updated_at: DateTime, document: StructuredDoc) -> bool { - if !self.require_updates(updated_at, &document) { + pub async fn add(&self, state: StructuredDocState, document: StructuredDoc) -> bool { + if !self.require_updates(state.updated_at, &document) { return false; } + if state.deleted { + return self.delete(document.id()).await; + } + 
stream! { let (id, s) = self.builder.build(document).await; self.indexer.delete(&id); diff --git a/ee/tabby-webserver/src/service/background_job/third_party_integration.rs b/ee/tabby-webserver/src/service/background_job/third_party_integration.rs index c84fbe14185b..8039ff6f062c 100644 --- a/ee/tabby-webserver/src/service/background_job/third_party_integration.rs +++ b/ee/tabby-webserver/src/service/background_job/third_party_integration.rs @@ -8,7 +8,7 @@ use juniper::ID; use pulls::list_github_pulls; use serde::{Deserialize, Serialize}; use tabby_common::config::CodeRepository; -use tabby_index::public::{CodeIndexer, StructuredDoc, StructuredDocIndexer}; +use tabby_index::public::{CodeIndexer, StructuredDoc, StructuredDocIndexer, StructuredDocState}; use tabby_inference::Embedding; use tabby_schema::{ integration::{Integration, IntegrationKind, IntegrationService}, @@ -140,23 +140,18 @@ impl SchedulerGithubGitlabJob { stream! { let mut count = 0; let mut num_updated = 0; - let mut num_deleted = 0; for await (state, doc) in issue_stream.chain(pull_stream) { - let id = &doc.id().to_owned(); - if !state.should_clean && index.add(state.updated_at, doc).await { + if index.add(state, doc).await { num_updated += 1 } - if state.should_clean && index.delete(id).await { - num_deleted += 1; - } count += 1; if count % 100 == 0 { - logkit::info!("{} docs seen, {} docs updated, {} docs deleted", count, num_updated, num_deleted); + logkit::info!("{} docs seen, {} docs updated", count, num_updated); }; } - logkit::info!("{} docs seen, {} docs updated, {} docs deleted", count, num_updated, num_deleted); + logkit::info!("{} docs seen, {} docs updated", count, num_updated); index.commit(); } .count() @@ -184,16 +179,11 @@ impl SchedulerGithubGitlabJob { } } -pub struct FetchState { - updated_at: DateTime, - should_clean: bool, -} - async fn fetch_all_issues( integration: &Integration, repository: &ProvidedRepository, -) -> tabby_schema::Result> { - let s: BoxStream<(FetchState, 
StructuredDoc)> = match &integration.kind { +) -> tabby_schema::Result> { + let s: BoxStream<(StructuredDocState, StructuredDoc)> = match &integration.kind { IntegrationKind::Github | IntegrationKind::GithubSelfHosted => list_github_issues( &repository.source_id(), integration.api_base(), @@ -217,8 +207,8 @@ async fn fetch_all_issues( async fn fetch_all_pulls( integration: &Integration, repository: &ProvidedRepository, -) -> tabby_schema::Result> { - let s: BoxStream<(FetchState, StructuredDoc)> = list_github_pulls( +) -> tabby_schema::Result> { + let s: BoxStream<(StructuredDocState, StructuredDoc)> = list_github_pulls( &repository.source_id(), integration.api_base(), &repository.display_name, diff --git a/ee/tabby-webserver/src/service/background_job/third_party_integration/issues.rs b/ee/tabby-webserver/src/service/background_job/third_party_integration/issues.rs index db641a0ed33e..ecf9d5fbbb92 100644 --- a/ee/tabby-webserver/src/service/background_job/third_party_integration/issues.rs +++ b/ee/tabby-webserver/src/service/background_job/third_party_integration/issues.rs @@ -5,9 +5,10 @@ use futures::Stream; use gitlab::api::{issues::ProjectIssues, AsyncQuery}; use octocrab::Octocrab; use serde::Deserialize; -use tabby_index::public::{StructuredDoc, StructuredDocFields, StructuredDocIssueFields}; +use tabby_index::public::{ + StructuredDoc, StructuredDocFields, StructuredDocIssueFields, StructuredDocState, +}; -use super::FetchState; use crate::service::create_gitlab_client; pub async fn list_github_issues( @@ -15,7 +16,7 @@ pub async fn list_github_issues( api_base: &str, full_name: &str, access_token: &str, -) -> Result> { +) -> Result> { let octocrab = Octocrab::builder() .personal_token(access_token.to_string()) .base_uri(api_base)? 
@@ -63,9 +64,9 @@ pub async fn list_github_issues( closed: issue.state == octocrab::models::IssueState::Closed, }) }; - yield (FetchState { + yield (StructuredDocState { updated_at: issue.updated_at, - should_clean: false, + deleted: false, }, doc); } @@ -93,7 +94,7 @@ pub async fn list_gitlab_issues( api_base: &str, full_name: &str, access_token: &str, -) -> Result> { +) -> Result> { let gitlab = create_gitlab_client(api_base, access_token).await?; let source_id = source_id.to_owned(); @@ -122,9 +123,9 @@ pub async fn list_gitlab_issues( body: issue.description.unwrap_or_default(), closed: issue.state == "closed", })}; - yield (FetchState { + yield (StructuredDocState { updated_at: issue.updated_at, - should_clean: false, + deleted: false, }, doc); } }; diff --git a/ee/tabby-webserver/src/service/background_job/third_party_integration/pulls.rs b/ee/tabby-webserver/src/service/background_job/third_party_integration/pulls.rs index 83f6bfe56685..3101497cb0e6 100644 --- a/ee/tabby-webserver/src/service/background_job/third_party_integration/pulls.rs +++ b/ee/tabby-webserver/src/service/background_job/third_party_integration/pulls.rs @@ -2,16 +2,16 @@ use anyhow::{anyhow, Result}; use async_stream::stream; use futures::Stream; use octocrab::{models::IssueState, Octocrab}; -use tabby_index::public::{StructuredDoc, StructuredDocFields, StructuredDocPullDocumentFields}; - -use super::FetchState; +use tabby_index::public::{ + StructuredDoc, StructuredDocFields, StructuredDocPullDocumentFields, StructuredDocState, +}; pub async fn list_github_pulls( source_id: &str, api_base: &str, full_name: &str, access_token: &str, -) -> Result> { +) -> Result> { let octocrab = Octocrab::builder() .personal_token(access_token.to_string()) .base_uri(api_base)? 
@@ -61,9 +61,9 @@ pub async fn list_github_pulls( // skip closed but not merged pulls if let Some(state) = pull.state { if state == IssueState::Closed && pull.merged_at.is_none() { - yield (FetchState{ + yield (StructuredDocState{ updated_at: pull.updated_at.unwrap(), - should_clean: true, + deleted: true, }, doc); continue; } @@ -93,9 +93,9 @@ pub async fn list_github_pulls( })}; - yield (FetchState{ + yield (StructuredDocState{ updated_at: pull.updated_at.unwrap(), - should_clean: false, + deleted: false, }, doc); } diff --git a/ee/tabby-webserver/src/service/background_job/web_crawler.rs b/ee/tabby-webserver/src/service/background_job/web_crawler.rs index 8d4450310309..ff5dd1ef9de4 100644 --- a/ee/tabby-webserver/src/service/background_job/web_crawler.rs +++ b/ee/tabby-webserver/src/service/background_job/web_crawler.rs @@ -5,7 +5,8 @@ use futures::StreamExt; use serde::{Deserialize, Serialize}; use tabby_crawler::crawl_pipeline; use tabby_index::public::{ - StructuredDoc, StructuredDocFields, StructuredDocIndexer, StructuredDocWebFields, + StructuredDoc, StructuredDocFields, StructuredDocIndexer, StructuredDocState, + StructuredDocWebFields, }; use tabby_inference::Embedding; @@ -53,7 +54,15 @@ impl WebCrawlerJob { }; num_docs += 1; - indexer.add(Utc::now(), source_doc).await; + indexer + .add( + StructuredDocState { + updated_at: Utc::now(), + deleted: false, + }, + source_doc, + ) + .await; } logkit::info!("Crawled {} documents from '{}'", num_docs, self.url); indexer.commit(); From 0d1085f920ddaccd12700ee7921886034cab1e86 Mon Sep 17 00:00:00 2001 From: Wei Zhang Date: Mon, 25 Nov 2024 16:36:56 +0800 Subject: [PATCH 6/9] chore: rename index add to sync --- .../tabby-index/src/structured_doc/public.rs | 2 +- .../tabby-index/src/structured_doc_tests.rs | 29 +++++++++++++++---- ...tests_chat__run_chat_golden_tests.snap.new | 6 ++++ .../background_job/third_party_integration.rs | 2 +- .../src/service/background_job/web_crawler.rs | 2 +- 5 files changed, 32 
insertions(+), 9 deletions(-) create mode 100644 crates/tabby/tests/snapshots/goldentests_chat__run_chat_golden_tests.snap.new diff --git a/crates/tabby-index/src/structured_doc/public.rs b/crates/tabby-index/src/structured_doc/public.rs index efed332d3c13..3c38ae756821 100644 --- a/crates/tabby-index/src/structured_doc/public.rs +++ b/crates/tabby-index/src/structured_doc/public.rs @@ -31,7 +31,7 @@ impl StructuredDocIndexer { Self { indexer, builder } } - pub async fn add(&self, state: StructuredDocState, document: StructuredDoc) -> bool { + pub async fn sync(&self, state: StructuredDocState, document: StructuredDoc) -> bool { if !self.require_updates(state.updated_at, &document) { return false; } diff --git a/crates/tabby-index/src/structured_doc_tests.rs b/crates/tabby-index/src/structured_doc_tests.rs index 0ac8fe9fdabf..9e1bfb1e3d53 100644 --- a/crates/tabby-index/src/structured_doc_tests.rs +++ b/crates/tabby-index/src/structured_doc_tests.rs @@ -35,6 +35,7 @@ mod structured_doc_tests { use super::mock_embedding::MockEmbedding; use crate::{ indexer::Indexer, + public::StructuredDocState, structured_doc::public::{ StructuredDoc, StructuredDocFields, StructuredDocIndexer, StructuredDocIssueFields, }, @@ -65,9 +66,17 @@ mod structured_doc_tests { let updated_at = chrono::Utc::now(); let res = tokio::runtime::Runtime::new().unwrap().block_on(async { - let added = indexer.add(updated_at, doc).await; - println!("{}", added); - added + let updated = indexer + .sync( + StructuredDocState { + updated_at, + deleted: false, + }, + doc, + ) + .await; + println!("{}", updated); + updated }); assert!(res); indexer.commit(); @@ -109,9 +118,17 @@ mod structured_doc_tests { let updated_at = chrono::Utc::now(); let res = tokio::runtime::Runtime::new().unwrap().block_on(async { - let added = indexer.add(updated_at, doc).await; - println!("{}", added); - added + let updated = indexer + .sync( + StructuredDocState { + updated_at, + deleted: false, + }, + doc, + ) + .await; + 
println!("{}", updated); + updated }); assert!(res); indexer.commit(); diff --git a/crates/tabby/tests/snapshots/goldentests_chat__run_chat_golden_tests.snap.new b/crates/tabby/tests/snapshots/goldentests_chat__run_chat_golden_tests.snap.new new file mode 100644 index 000000000000..ed775083e199 --- /dev/null +++ b/crates/tabby/tests/snapshots/goldentests_chat__run_chat_golden_tests.snap.new @@ -0,0 +1,6 @@ +--- +source: crates/tabby/tests/goldentests_chat.rs +assertion_line: 128 +expression: "golden_test(json!({\n \"seed\": 0, \"model\": \"default\", \"messages\":\n [{\n \"role\": \"user\", \"content\":\n \"How to convert a list of string to numbers in python\"\n }]\n})).await" +--- +" You can convert a list of strings to numbers in Python using the built-in `list()` function to convert the list of strings to a list of numbers, and then using the `int()` function to convert each element of the list to an integer. Here's an example:\n```\n# A list of strings\nnum_strings = ['1', '2', '3']\n\n# Convert the list of strings to a list of numbers\nnum_list = list(map(int, num_strings))\n\n# Print the list of numbers\nprint(num_list)\n```\nThis will output:\n```\n[1, 2, 3]\n```\nNote that this will only work if the strings represent integers. 
If the strings represent a different type of number, such as a decimal number, you will need to use a different function, such as `float()`, to convert them to a float.\n\nAlso, if you want to convert the string to a specific number type, you can use the built-in `int()` function and pass the number as an argument.\n\nFor example, to convert the string '123' to a float:\n```\nnum_string = '123'\nnum_float = float(num_string)\nprint(num_float)\n```\nThis will output:\n```\n123.0\n```\nAnd to convert the string '123' to a decimal:\n```\nnum_string = '123.45'\nnum_decimal = float(num_string)\nprint(num_decimal)\n```\nThis will output:\n```\n123.45\n```" diff --git a/ee/tabby-webserver/src/service/background_job/third_party_integration.rs b/ee/tabby-webserver/src/service/background_job/third_party_integration.rs index 8039ff6f062c..03c984fa3d45 100644 --- a/ee/tabby-webserver/src/service/background_job/third_party_integration.rs +++ b/ee/tabby-webserver/src/service/background_job/third_party_integration.rs @@ -141,7 +141,7 @@ impl SchedulerGithubGitlabJob { let mut count = 0; let mut num_updated = 0; for await (state, doc) in issue_stream.chain(pull_stream) { - if index.add(state, doc).await { + if index.sync(state, doc).await { num_updated += 1 } diff --git a/ee/tabby-webserver/src/service/background_job/web_crawler.rs b/ee/tabby-webserver/src/service/background_job/web_crawler.rs index ff5dd1ef9de4..da307f61e028 100644 --- a/ee/tabby-webserver/src/service/background_job/web_crawler.rs +++ b/ee/tabby-webserver/src/service/background_job/web_crawler.rs @@ -55,7 +55,7 @@ impl WebCrawlerJob { num_docs += 1; indexer - .add( + .sync( StructuredDocState { updated_at: Utc::now(), deleted: false, From 85567971f68fa2c5bfae965d9fd0a6b6a41aa10c Mon Sep 17 00:00:00 2001 From: Wei Zhang Date: Thu, 21 Nov 2024 14:20:41 +0800 Subject: [PATCH 7/9] doc(structured_doc): add description to structuredDocState --- crates/tabby-index/src/structured_doc/public.rs | 6 ++++++ 1 file changed, 
6 insertions(+) diff --git a/crates/tabby-index/src/structured_doc/public.rs b/crates/tabby-index/src/structured_doc/public.rs index 3c38ae756821..5d4341fb0188 100644 --- a/crates/tabby-index/src/structured_doc/public.rs +++ b/crates/tabby-index/src/structured_doc/public.rs @@ -14,8 +14,14 @@ pub use super::types::{ use super::{create_structured_doc_builder, types::BuildStructuredDoc}; use crate::{indexer::TantivyDocBuilder, Indexer}; +/// StructuredDocState is used to track the state of the document source. +/// It is used to determine whether the document should be updated or deleted. pub struct StructuredDocState { + // updated_at is the time when the document was last updated. pub updated_at: DateTime, + // deleted indecates whether the document should be deleted in indexer + // for example, a closed pull request will be marked as deleted, and + // the indexer will remove it from the index. pub deleted: bool, } From eee0f3cfeb60957e7515a26e346b7afc762e288e Mon Sep 17 00:00:00 2001 From: Wei Zhang Date: Mon, 25 Nov 2024 16:53:03 +0800 Subject: [PATCH 8/9] chore: deleted accidentally added file --- .../goldentests_chat__run_chat_golden_tests.snap.new | 6 ------ 1 file changed, 6 deletions(-) delete mode 100644 crates/tabby/tests/snapshots/goldentests_chat__run_chat_golden_tests.snap.new diff --git a/crates/tabby/tests/snapshots/goldentests_chat__run_chat_golden_tests.snap.new b/crates/tabby/tests/snapshots/goldentests_chat__run_chat_golden_tests.snap.new deleted file mode 100644 index ed775083e199..000000000000 --- a/crates/tabby/tests/snapshots/goldentests_chat__run_chat_golden_tests.snap.new +++ /dev/null @@ -1,6 +0,0 @@ ---- -source: crates/tabby/tests/goldentests_chat.rs -assertion_line: 128 -expression: "golden_test(json!({\n \"seed\": 0, \"model\": \"default\", \"messages\":\n [{\n \"role\": \"user\", \"content\":\n \"How to convert a list of string to numbers in python\"\n }]\n})).await" ---- -" You can convert a list of strings to numbers in Python using
the built-in `list()` function to convert the list of strings to a list of numbers, and then using the `int()` function to convert each element of the list to an integer. Here's an example:\n```\n# A list of strings\nnum_strings = ['1', '2', '3']\n\n# Convert the list of strings to a list of numbers\nnum_list = list(map(int, num_strings))\n\n# Print the list of numbers\nprint(num_list)\n```\nThis will output:\n```\n[1, 2, 3]\n```\nNote that this will only work if the strings represent integers. If the strings represent a different type of number, such as a decimal number, you will need to use a different function, such as `float()`, to convert them to a float.\n\nAlso, if you want to convert the string to a specific number type, you can use the built-in `int()` function and pass the number as an argument.\n\nFor example, to convert the string '123' to a float:\n```\nnum_string = '123'\nnum_float = float(num_string)\nprint(num_float)\n```\nThis will output:\n```\n123.0\n```\nAnd to convert the string '123' to a decimal:\n```\nnum_string = '123.45'\nnum_decimal = float(num_string)\nprint(num_decimal)\n```\nThis will output:\n```\n123.45\n```" From 5d4336092f85b2055b602e4c21b0483fac4ad4d8 Mon Sep 17 00:00:00 2001 From: Wei Zhang Date: Thu, 21 Nov 2024 14:20:41 +0800 Subject: [PATCH 9/9] doc(structured_doc): add sync logic comments --- .../tabby-index/src/structured_doc/public.rs | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/crates/tabby-index/src/structured_doc/public.rs b/crates/tabby-index/src/structured_doc/public.rs index 5d4341fb0188..aca3b6296b5a 100644 --- a/crates/tabby-index/src/structured_doc/public.rs +++ b/crates/tabby-index/src/structured_doc/public.rs @@ -14,14 +14,16 @@ pub use super::types::{ use super::{create_structured_doc_builder, types::BuildStructuredDoc}; use crate::{indexer::TantivyDocBuilder, Indexer}; -/// StructuredDocState is used to track the state of the document source. 
-/// It is used to determine whether the document should be updated or deleted. +/// StructuredDocState tracks the state of the document source. +/// It helps determine whether the document should be updated or deleted. pub struct StructuredDocState { // updated_at is the time when the document was last updated. + // when the updated_at is earlier than the document's index time, + // the update will be skipped. pub updated_at: DateTime, - // deleted indecates whether the document should be deleted in indexer - // for example, a closed pull request will be marked as deleted, and - // the indexer will remove it from the index. + // deleted indicates whether the document should be removed from the indexer. + // For instance, a closed pull request will be marked as deleted, + // prompting the indexer to remove it from the index. pub deleted: bool, } @@ -37,6 +39,12 @@ impl StructuredDocIndexer { Self { indexer, builder } } + // The sync process updates the document in the indexer incrementally. + // It first determines whether the document requires an update. + // + // If an update is needed, it checks the deletion state of the document. + // If the document is marked as deleted, it will be removed. + // Next, the document is rebuilt, the original is deleted, and the newly indexed document is added. pub async fn sync(&self, state: StructuredDocState, document: StructuredDoc) -> bool { if !self.require_updates(state.updated_at, &document) { return false;