diff --git a/crates/tabby-common/src/api/structured_doc.rs b/crates/tabby-common/src/api/structured_doc.rs
index 03d43f004dbc..e524222af63d 100644
--- a/crates/tabby-common/src/api/structured_doc.rs
+++ b/crates/tabby-common/src/api/structured_doc.rs
@@ -20,6 +20,7 @@ pub struct DocSearchHit {
 pub enum DocSearchDocument {
     Web(DocSearchWebDocument),
     Issue(DocSearchIssueDocument),
+    Pull(DocSearchPullDocument),
 }
 
 #[derive(Error, Debug)]
@@ -65,6 +66,15 @@ pub struct DocSearchIssueDocument {
     pub closed: bool,
 }
 
+#[derive(Clone)]
+pub struct DocSearchPullDocument {
+    pub title: String,
+    pub link: String,
+    pub body: String,
+    pub diff: String,
+    pub merged: bool,
+}
+
 pub trait FromTantivyDocument {
     fn from_tantivy_document(doc: &TantivyDocument, chunk: &TantivyDocument) -> Option<Self>
     where
@@ -82,6 +92,8 @@ impl FromTantivyDocument for DocSearchDocument {
             }
             "issue" => DocSearchIssueDocument::from_tantivy_document(doc, chunk)
                 .map(DocSearchDocument::Issue),
+            "pull" => DocSearchPullDocument::from_tantivy_document(doc, chunk)
+                .map(DocSearchDocument::Pull),
             _ => None,
         }
     }
@@ -146,6 +158,44 @@ impl FromTantivyDocument for DocSearchIssueDocument {
     }
 }
 
+impl FromTantivyDocument for DocSearchPullDocument {
+    fn from_tantivy_document(doc: &TantivyDocument, _: &TantivyDocument) -> Option<Self> {
+        let schema = IndexSchema::instance();
+        let title = get_json_text_field(
+            doc,
+            schema.field_attributes,
+            structured_doc::fields::pull::TITLE,
+        );
+        let link = get_json_text_field(
+            doc,
+            schema.field_attributes,
+            structured_doc::fields::pull::LINK,
+        );
+        let body = get_json_text_field(
+            doc,
+            schema.field_attributes,
+            structured_doc::fields::pull::BODY,
+        );
+        let diff = get_json_text_field(
+            doc,
+            schema.field_attributes,
+            structured_doc::fields::pull::DIFF,
+        );
+        let merged = get_json_bool_field(
+            doc,
+            schema.field_attributes,
+            structured_doc::fields::pull::MERGED,
+        );
+        Some(Self {
+            title: title.into(),
+            link: link.into(),
+            body: body.into(),
+            diff: diff.into(),
+            merged,
+        })
+    }
+}
+
 fn get_json_field<'a>(
     doc: &'a TantivyDocument,
     field: schema::Field,
diff --git a/crates/tabby-common/src/index/structured_doc.rs b/crates/tabby-common/src/index/structured_doc.rs
index 9dceabbe7506..978a82aa5371 100644
--- a/crates/tabby-common/src/index/structured_doc.rs
+++ b/crates/tabby-common/src/index/structured_doc.rs
@@ -13,4 +13,12 @@ pub mod fields {
         pub const BODY: &str = "body";
         pub const CLOSED: &str = "closed";
     }
+
+    pub mod pull {
+        pub const TITLE: &str = "title";
+        pub const LINK: &str = "link";
+        pub const BODY: &str = "body";
+        pub const DIFF: &str = "diff";
+        pub const MERGED: &str = "merged";
+    }
 }
diff --git a/crates/tabby-index/src/lib.rs b/crates/tabby-index/src/lib.rs
index dc87050a2b04..d284f4d7eaf9 100644
--- a/crates/tabby-index/src/lib.rs
+++ b/crates/tabby-index/src/lib.rs
@@ -17,7 +17,7 @@ pub mod public {
         code::CodeIndexer,
         structured_doc::public::{
             StructuredDoc, StructuredDocFields, StructuredDocIndexer, StructuredDocIssueFields,
-            StructuredDocWebFields,
+            StructuredDocPullDocumentFields, StructuredDocWebFields,
         },
     };
 
diff --git a/crates/tabby-index/src/structured_doc/public.rs b/crates/tabby-index/src/structured_doc/public.rs
index 56e45fadb50a..4104dfaf88f1 100644
--- a/crates/tabby-index/src/structured_doc/public.rs
+++ b/crates/tabby-index/src/structured_doc/public.rs
@@ -7,8 +7,9 @@ use tabby_common::index::corpus;
 use tabby_inference::Embedding;
 
 pub use super::types::{
-    issue::IssueDocument as StructuredDocIssueFields, web::WebDocument as StructuredDocWebFields,
-    StructuredDoc, StructuredDocFields,
+    issue::IssueDocument as StructuredDocIssueFields,
+    pull::PullDocument as StructuredDocPullDocumentFields,
+    web::WebDocument as StructuredDocWebFields, StructuredDoc, StructuredDocFields,
 };
 use super::{create_structured_doc_builder, types::BuildStructuredDoc};
 use crate::{indexer::TantivyDocBuilder, Indexer};
diff --git a/crates/tabby-index/src/structured_doc/types.rs b/crates/tabby-index/src/structured_doc/types.rs
index f447354878cc..69172139e62c 100644
--- a/crates/tabby-index/src/structured_doc/types.rs
+++ b/crates/tabby-index/src/structured_doc/types.rs
@@ -1,4 +1,5 @@
 pub mod issue;
+pub mod pull;
 pub mod web;
 
 use std::sync::Arc;
@@ -21,6 +22,7 @@ impl StructuredDoc {
         match &self.fields {
             StructuredDocFields::Web(web) => &web.link,
             StructuredDocFields::Issue(issue) => &issue.link,
+            StructuredDocFields::Pull(pull) => &pull.link,
         }
     }
 
@@ -28,6 +30,7 @@ impl StructuredDoc {
         match &self.fields {
             StructuredDocFields::Web(_) => "web",
             StructuredDocFields::Issue(_) => "issue",
+            StructuredDocFields::Pull(_) => "pull",
         }
     }
 }
@@ -55,6 +58,7 @@ pub trait BuildStructuredDoc {
 pub enum StructuredDocFields {
     Web(web::WebDocument),
     Issue(issue::IssueDocument),
+    Pull(pull::PullDocument),
 }
 
 #[async_trait]
@@ -63,6 +67,7 @@ impl BuildStructuredDoc for StructuredDoc {
         match &self.fields {
             StructuredDocFields::Web(doc) => doc.should_skip(),
             StructuredDocFields::Issue(doc) => doc.should_skip(),
+            StructuredDocFields::Pull(doc) => doc.should_skip(),
         }
     }
 
@@ -70,6 +75,7 @@ impl BuildStructuredDoc for StructuredDoc {
         match &self.fields {
             StructuredDocFields::Web(doc) => doc.build_attributes().await,
             StructuredDocFields::Issue(doc) => doc.build_attributes().await,
+            StructuredDocFields::Pull(doc) => doc.build_attributes().await,
         }
     }
 
@@ -80,6 +86,7 @@ impl BuildStructuredDoc for StructuredDoc {
         match &self.fields {
             StructuredDocFields::Web(doc) => doc.build_chunk_attributes(embedding).await,
             StructuredDocFields::Issue(doc) => doc.build_chunk_attributes(embedding).await,
+            StructuredDocFields::Pull(doc) => doc.build_chunk_attributes(embedding).await,
         }
     }
 }
diff --git a/crates/tabby-index/src/structured_doc/types/pull.rs b/crates/tabby-index/src/structured_doc/types/pull.rs
new file mode 100644
index 000000000000..67b7bf4ea2e2
--- /dev/null
+++ b/crates/tabby-index/src/structured_doc/types/pull.rs
@@ -0,0 +1,58 @@
+use std::sync::Arc;
+
+use async_stream::stream;
+use async_trait::async_trait;
+use futures::stream::BoxStream;
+use serde_json::json;
+use tabby_common::index::structured_doc::fields;
+use tabby_inference::Embedding;
+use tokio::task::JoinHandle;
+
+use super::{build_tokens, BuildStructuredDoc};
+
+pub struct PullDocument {
+    pub link: String,
+    pub title: String,
+    pub body: String,
+
+    /// The diff represents the code changes in this PR,
+    /// including metadata, affected line ranges, and added (+) or removed (-) lines.
+    /// For more details on the diff format, refer to:
+    /// https://git-scm.com/docs/diff-format#_combined_diff_format
+    pub diff: String,
+    pub merged: bool,
+}
+
+#[async_trait]
+impl BuildStructuredDoc for PullDocument {
+    fn should_skip(&self) -> bool {
+        false
+    }
+
+    async fn build_attributes(&self) -> serde_json::Value {
+        json!({
+            fields::pull::LINK: self.link,
+            fields::pull::TITLE: self.title,
+            fields::pull::BODY: self.body,
+            fields::pull::DIFF: self.diff,
+            fields::pull::MERGED: self.merged,
+        })
+    }
+
+    async fn build_chunk_attributes(
+        &self,
+        embedding: Arc<dyn Embedding>,
+    ) -> BoxStream<JoinHandle<(Vec<String>, serde_json::Value)>> {
+        // currently not indexing the diff
+        let text = format!("{}\n\n{}", self.title, self.body);
+        let s = stream! {
+            yield tokio::spawn(async move {
+                let tokens = build_tokens(embedding, &text).await;
+                let chunk_attributes = json!({});
+                (tokens, chunk_attributes)
+            })
+        };
+
+        Box::pin(s)
+    }
+}
diff --git a/ee/tabby-db/src/lib.rs b/ee/tabby-db/src/lib.rs
index a288d678185b..248d38886baf 100644
--- a/ee/tabby-db/src/lib.rs
+++ b/ee/tabby-db/src/lib.rs
@@ -15,8 +15,8 @@ pub use server_setting::ServerSettingDAO;
 use sqlx::{query, query_scalar, sqlite::SqliteQueryResult, Pool, Sqlite, SqlitePool};
 pub use threads::{
     ThreadDAO, ThreadMessageAttachmentClientCode, ThreadMessageAttachmentCode,
-    ThreadMessageAttachmentDoc, ThreadMessageAttachmentIssueDoc, ThreadMessageAttachmentWebDoc,
-    ThreadMessageDAO,
+    ThreadMessageAttachmentDoc, ThreadMessageAttachmentIssueDoc, ThreadMessageAttachmentPullDoc,
+    ThreadMessageAttachmentWebDoc, ThreadMessageDAO,
 };
 use tokio::sync::Mutex;
 use user_completions::UserCompletionDailyStatsDAO;
diff --git a/ee/tabby-db/src/threads.rs b/ee/tabby-db/src/threads.rs
index 01d2e0292853..1e3909c6bc3e 100644
--- a/ee/tabby-db/src/threads.rs
+++ b/ee/tabby-db/src/threads.rs
@@ -36,6 +36,7 @@ pub struct ThreadMessageDAO {
 pub enum ThreadMessageAttachmentDoc {
     Web(ThreadMessageAttachmentWebDoc),
     Issue(ThreadMessageAttachmentIssueDoc),
+    Pull(ThreadMessageAttachmentPullDoc),
 }
 
 #[derive(Serialize, Deserialize)]
@@ -53,6 +54,15 @@ pub struct ThreadMessageAttachmentIssueDoc {
     pub closed: bool,
 }
 
+#[derive(Serialize, Deserialize)]
+pub struct ThreadMessageAttachmentPullDoc {
+    pub title: String,
+    pub link: String,
+    pub body: String,
+    pub diff: String,
+    pub merged: bool,
+}
+
 #[derive(Serialize, Deserialize)]
 pub struct ThreadMessageAttachmentCode {
     pub git_url: String,
diff --git a/ee/tabby-schema/graphql/schema.graphql b/ee/tabby-schema/graphql/schema.graphql
index 085bac638402..97c59b64c9d0 100644
--- a/ee/tabby-schema/graphql/schema.graphql
+++ b/ee/tabby-schema/graphql/schema.graphql
@@ -516,6 +516,14 @@ type MessageAttachmentIssueDoc {
   closed: Boolean!
 }
 
+type MessageAttachmentPullDoc {
+  title: String!
+  link: String!
+  body: String!
+  patch: String!
+  merged: Boolean!
+}
+
 type MessageAttachmentWebDoc {
   title: String!
   link: String!
@@ -900,7 +908,7 @@ type WebContextSource implements ContextSourceId & ContextSource {
   sourceName: String!
 }
 
-union MessageAttachmentDoc = MessageAttachmentWebDoc | MessageAttachmentIssueDoc
+union MessageAttachmentDoc = MessageAttachmentWebDoc | MessageAttachmentIssueDoc | MessageAttachmentPullDoc
 
 """
 Schema of thread run stream.
diff --git a/ee/tabby-schema/src/dao.rs b/ee/tabby-schema/src/dao.rs
index dc0da9b3bc3b..8e8ba645d82e 100644
--- a/ee/tabby-schema/src/dao.rs
+++ b/ee/tabby-schema/src/dao.rs
@@ -4,8 +4,8 @@ use lazy_static::lazy_static;
 use tabby_db::{
     EmailSettingDAO, IntegrationDAO, InvitationDAO, JobRunDAO, OAuthCredentialDAO,
     ServerSettingDAO, ThreadDAO, ThreadMessageAttachmentClientCode, ThreadMessageAttachmentCode,
-    ThreadMessageAttachmentDoc, ThreadMessageAttachmentIssueDoc, ThreadMessageAttachmentWebDoc,
-    ThreadMessageDAO, UserEventDAO,
+    ThreadMessageAttachmentDoc, ThreadMessageAttachmentIssueDoc, ThreadMessageAttachmentPullDoc,
+    ThreadMessageAttachmentWebDoc, ThreadMessageDAO, UserEventDAO,
 };
 
 use crate::{
@@ -246,6 +246,15 @@ impl From<ThreadMessageAttachmentDoc> for thread::MessageAttachmentDoc {
                     closed: val.closed,
                 })
             }
+            ThreadMessageAttachmentDoc::Pull(val) => {
+                thread::MessageAttachmentDoc::Pull(thread::MessageAttachmentPullDoc {
+                    title: val.title,
+                    link: val.link,
+                    body: val.body,
+                    patch: val.diff,
+                    merged: val.merged,
+                })
+            }
         }
     }
 }
@@ -268,6 +277,15 @@ impl From<&thread::MessageAttachmentDoc> for ThreadMessageAttachmentDoc {
                     closed: val.closed,
                 })
             }
+            thread::MessageAttachmentDoc::Pull(val) => {
+                ThreadMessageAttachmentDoc::Pull(ThreadMessageAttachmentPullDoc {
+                    title: val.title.clone(),
+                    link: val.link.clone(),
+                    body: val.body.clone(),
+                    diff: val.patch.clone(),
+                    merged: val.merged,
+                })
+            }
         }
     }
 }
diff --git a/ee/tabby-schema/src/schema/thread/types.rs b/ee/tabby-schema/src/schema/thread/types.rs
index f89cc850575f..3b0cd7588a2f 100644
--- a/ee/tabby-schema/src/schema/thread/types.rs
+++ b/ee/tabby-schema/src/schema/thread/types.rs
@@ -125,6 +125,7 @@ impl From<CodeSearchHit> for MessageCodeSearchHit {
 pub enum MessageAttachmentDoc {
     Web(MessageAttachmentWebDoc),
     Issue(MessageAttachmentIssueDoc),
+    Pull(MessageAttachmentPullDoc),
 }
 
 #[derive(GraphQLObject, Clone)]
@@ -142,6 +143,15 @@ pub struct MessageAttachmentIssueDoc {
     pub closed: bool,
 }
 
+#[derive(GraphQLObject, Clone)]
+pub struct MessageAttachmentPullDoc {
+    pub title: String,
+    pub link: String,
+    pub body: String,
+    pub patch: String,
+    pub merged: bool,
+}
+
 impl From<DocSearchDocument> for MessageAttachmentDoc {
     fn from(doc: DocSearchDocument) -> Self {
         match doc {
@@ -158,6 +168,13 @@ impl From<DocSearchDocument> for MessageAttachmentDoc {
                     closed: issue.closed,
                 })
             }
+            DocSearchDocument::Pull(pull) => MessageAttachmentDoc::Pull(MessageAttachmentPullDoc {
+                title: pull.title,
+                link: pull.link,
+                body: pull.body,
+                patch: pull.diff,
+                merged: pull.merged,
+            }),
         }
     }
 }
diff --git a/ee/tabby-webserver/src/service/answer.rs b/ee/tabby-webserver/src/service/answer.rs
index ec2343401cfe..8ecf5e0527b6 100644
--- a/ee/tabby-webserver/src/service/answer.rs
+++ b/ee/tabby-webserver/src/service/answer.rs
@@ -126,7 +126,7 @@ impl AnswerService {
             .map(|x| x.doc.clone().into())
             .collect::<Vec<_>>();
 
-        debug!("doc content: {:?}", doc_query.content);
+        debug!("doc content: {:?}: {:?}", doc_query.content, attachment.doc.len());
 
         if !attachment.doc.is_empty() {
             let hits = hits.into_iter().map(|x| x.into()).collect::<Vec<_>>();
@@ -603,6 +603,7 @@ fn get_content(doc: &MessageAttachmentDoc) -> &str {
     match doc {
         MessageAttachmentDoc::Web(web) => &web.content,
         MessageAttachmentDoc::Issue(issue) => &issue.body,
+        MessageAttachmentDoc::Pull(pull) => &pull.body,
     }
 }
 
@@ -711,6 +712,7 @@ mod tests {
         match doc {
            DocSearchDocument::Web(web_doc) => &web_doc.title,
            DocSearchDocument::Issue(issue_doc) => &issue_doc.title,
+           DocSearchDocument::Pull(pull_doc) => &pull_doc.title,
         }
     }
 
diff --git a/ee/tabby-webserver/src/service/background_job/third_party_integration.rs b/ee/tabby-webserver/src/service/background_job/third_party_integration.rs
index 76c670a5dbce..2bdd85244f38 100644
--- a/ee/tabby-webserver/src/service/background_job/third_party_integration.rs
+++ b/ee/tabby-webserver/src/service/background_job/third_party_integration.rs
@@ -5,6 +5,7 @@ use chrono::{DateTime, Utc};
 use futures::{stream::BoxStream, StreamExt};
 use issues::{list_github_issues, list_gitlab_issues};
 use juniper::ID;
+use pulls::list_github_pulls;
 use serde::{Deserialize, Serialize};
 use tabby_common::config::CodeRepository;
 use tabby_index::public::{CodeIndexer, StructuredDoc, StructuredDocIndexer};
@@ -19,6 +20,7 @@ use tracing::debug;
 use super::{helper::Job, BackgroundJobEvent};
 
 mod issues;
+mod pulls;
 
 #[derive(Serialize, Deserialize, Clone)]
 pub struct SyncIntegrationJob {
@@ -113,7 +115,7 @@ impl SchedulerGithubGitlabJob {
             repository.display_name
         );
         let index = StructuredDocIndexer::new(embedding);
-        let s = match fetch_all_issues(&integration, &repository).await {
+        let issue_stream = match fetch_all_issues(&integration, &repository).await {
             Ok(s) => s,
             Err(e) => {
                 integration_service
@@ -124,10 +126,21 @@ impl SchedulerGithubGitlabJob {
             }
         };
 
+        let pull_stream = match fetch_all_pulls(&integration, &repository).await {
+            Ok(s) => s,
+            Err(e) => {
+                integration_service
+                    .update_integration_sync_status(integration.id, Some(e.to_string()))
+                    .await?;
+                logkit::error!("Failed to fetch pulls: {}", e);
+                return Err(e);
+            }
+        };
+
         stream! {
             let mut count = 0;
             let mut num_updated = 0;
-            for await (updated_at, doc) in s {
+            for await (updated_at, doc) in issue_stream.chain(pull_stream) {
                 if index.add(updated_at, doc).await {
                     num_updated += 1
                 }
@@ -191,3 +204,18 @@
 
     Ok(s)
 }
+async fn fetch_all_pulls(
+    integration: &Integration,
+    repository: &ProvidedRepository,
+) -> tabby_schema::Result<BoxStream<'static, (DateTime<Utc>, StructuredDoc)>> {
+    let s: BoxStream<(DateTime<Utc>, StructuredDoc)> = list_github_pulls(
+        &repository.source_id(),
+        integration.api_base(),
+        &repository.display_name,
+        &integration.access_token,
+    )
+    .await?
+    .boxed();
+
+    Ok(s)
+}
diff --git a/ee/tabby-webserver/src/service/background_job/third_party_integration/issues.rs b/ee/tabby-webserver/src/service/background_job/third_party_integration/issues.rs
index 84bf63c7203f..d718d630e78e 100644
--- a/ee/tabby-webserver/src/service/background_job/third_party_integration/issues.rs
+++ b/ee/tabby-webserver/src/service/background_job/third_party_integration/issues.rs
@@ -47,6 +47,12 @@ pub async fn list_github_issues(
         let pages = response.number_of_pages().unwrap_or_default();
 
         for issue in response.items {
+            // pull request is also an issue in GitHub,
+            // skip them here
+            if issue.pull_request.is_some() {
+                continue;
+            }
+
             let doc = StructuredDoc {
                 source_id: source_id.to_string(),
                 fields: StructuredDocFields::Issue(StructuredDocIssueFields {
diff --git a/ee/tabby-webserver/src/service/background_job/third_party_integration/pulls.rs b/ee/tabby-webserver/src/service/background_job/third_party_integration/pulls.rs
new file mode 100644
index 000000000000..7559773bf885
--- /dev/null
+++ b/ee/tabby-webserver/src/service/background_job/third_party_integration/pulls.rs
@@ -0,0 +1,88 @@
+use anyhow::{anyhow, Result};
+use async_stream::stream;
+use chrono::{DateTime, Utc};
+use futures::Stream;
+use octocrab::{models::IssueState, Octocrab};
+use tabby_index::public::{StructuredDoc, StructuredDocFields, StructuredDocPullDocumentFields};
+
+pub async fn list_github_pulls(
+    source_id: &str,
+    api_base: &str,
+    full_name: &str,
+    access_token: &str,
+) -> Result<impl Stream<Item = (DateTime<Utc>, StructuredDoc)>> {
+    let octocrab = Octocrab::builder()
+        .personal_token(access_token.to_string())
+        .base_uri(api_base)?
+        .build()?;
+
+    let (owner, repo) = full_name
+        .split_once('/')
+        .ok_or_else(|| anyhow!("Invalid repository name"))?;
+
+    let owner = owner.to_owned();
+    let repo = repo.to_owned();
+    let source_id = source_id.to_owned();
+    let s = stream! {
+        let mut page = 1u32;
+        loop {
+            let response = match octocrab
+                .pulls(&owner, &repo)
+                .list()
+                .state(octocrab::params::State::All)
+                .page(page)
+                .send()
+                .await {
+                Ok(x) => x,
+                Err(e) => {
+                    logkit::error!("Failed to fetch pull requests: {}", e);
+                    break;
+                }
+            };
+
+            let pages = response.number_of_pages().unwrap_or_default();
+
+            for pull in response.items {
+                // skip closed but not merged pulls
+                if let Some(state) = pull.state {
+                    if state == IssueState::Closed && pull.merged_at.is_none() {
+                        continue
+                    }
+                }
+
+                let url = pull.html_url.map(|url| url.to_string()).unwrap_or_else(|| pull.url);
+                let diff = match octocrab.pulls(&owner, &repo).get_diff(pull.number).await {
+                    Ok(x) if x.len() < 1024*1024*10 => x,
+                    Ok(_) => {
+                        logkit::warn!("Pull request {} diff is larger than 10MB, skipping", url);
+                        continue
+                    }
+                    Err(e) => {
+                        logkit::error!("Failed to fetch pull request diff for {}: {}", url, e);
+                        continue
+                    }
+                };
+
+                let doc = StructuredDoc {
+                    source_id: source_id.to_string(),
+                    fields: StructuredDocFields::Pull(StructuredDocPullDocumentFields {
+                        link: url,
+                        title: pull.title.unwrap_or_default(),
+                        body: pull.body.unwrap_or_default(),
+                        diff,
+                        merged: pull.merged_at.is_some(),
+                    })
+                };
+
+                yield (pull.updated_at.unwrap(), doc);
+            }
+
+            page += 1;
+            if page > pages {
+                break;
+            }
+        }
+    };
+
+    Ok(s)
+}