diff --git a/crates/tabby-common/src/config.rs b/crates/tabby-common/src/config.rs index 8e4e1daf1ced..e6a0a3093a7c 100644 --- a/crates/tabby-common/src/config.rs +++ b/crates/tabby-common/src/config.rs @@ -9,6 +9,7 @@ use serde::{Deserialize, Serialize}; use crate::{ path::repositories_dir, terminal::{HeaderFormat, InfoMessage}, + SourceFile, }; #[derive(Serialize, Deserialize, Default)] @@ -149,6 +150,9 @@ impl Default for ServerConfig { #[async_trait] pub trait RepositoryAccess: Send + Sync { async fn list_repositories(&self) -> Result>; + fn start_snapshot(&self, _version: u64) {} + fn process_file(&self, _version: u64, _file: SourceFile) {} + fn finish_snapshot(&self, _version: u64) {} } pub struct ConfigRepositoryAccess; diff --git a/crates/tabby-common/src/lib.rs b/crates/tabby-common/src/lib.rs index 0c91be5b7cc1..f8fa66f647a0 100644 --- a/crates/tabby-common/src/lib.rs +++ b/crates/tabby-common/src/lib.rs @@ -21,7 +21,7 @@ use path::dataset_dir; use serde::{Deserialize, Serialize}; use serde_jsonlines::JsonLinesReader; -#[derive(Serialize, Deserialize)] +#[derive(Serialize, Deserialize, Clone)] pub struct SourceFile { pub git_url: String, pub filepath: String, diff --git a/crates/tabby-scheduler/src/dataset.rs b/crates/tabby-scheduler/src/dataset.rs index db746f566a3a..a1dd17981959 100644 --- a/crates/tabby-scheduler/src/dataset.rs +++ b/crates/tabby-scheduler/src/dataset.rs @@ -15,7 +15,7 @@ use kdam::BarExt; use lazy_static::lazy_static; use serde_jsonlines::WriteExt; use tabby_common::{ - config::RepositoryConfig, + config::{RepositoryAccess, RepositoryConfig}, path::{dataset_dir, dependency_file}, DependencyFile, SourceFile, }; @@ -25,11 +25,21 @@ use tree_sitter_tags::TagsContext; use crate::utils::tqdm; trait RepositoryExt { - fn create_dataset(&self, writer: &mut impl Write) -> Result<()>; + fn create_dataset( + &self, + writer: &mut impl Write, + access: &impl RepositoryAccess, + snapshot_version: u64, + ) -> Result<()>; } impl RepositoryExt for RepositoryConfig { - fn create_dataset(&self, writer: &mut impl Write) -> Result<()> { + fn create_dataset( + &self, + writer: &mut impl Write, + access: &impl RepositoryAccess, + snapshot_version: u64, + ) -> Result<()> { let dir = self.dir(); let walk_dir_iter = || { @@ -70,7 +80,8 @@ impl RepositoryExt for RepositoryConfig { language, content: file_content, }; - writer.write_json_lines([source_file])?; + writer.write_json_lines([source_file.clone()])?; + access.process_file(snapshot_version, source_file); } Err(e) => { error!("Cannot read {relative_path:?}: {e:?}"); @@ -95,9 +106,10 @@ fn is_source_code(entry: &DirEntry) -> bool { } } -pub fn create_dataset(config: &[RepositoryConfig]) -> Result<()> { +pub fn create_dataset(config: &[RepositoryConfig], access: &impl RepositoryAccess) -> Result<()> { fs::remove_dir_all(dataset_dir()).ok(); fs::create_dir_all(dataset_dir())?; + let mut writer = FileRotate::new( SourceFile::files_jsonl(), AppendCount::new(usize::max_value()), @@ -107,10 +119,16 @@ pub fn create_dataset(config: &[RepositoryConfig]) -> Result<()> { None, ); + let snapshot_version = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .expect("Failed to read system clock") + .as_millis() as u64; + access.start_snapshot(snapshot_version); + let mut deps = DependencyFile::default(); for repository in config { deps::collect(repository.dir().as_path(), &mut deps); - repository.create_dataset(&mut writer)?; + repository.create_dataset(&mut writer, access, snapshot_version)?; } serdeconv::to_json_file(&deps, dependency_file())?; diff --git a/crates/tabby-scheduler/src/lib.rs b/crates/tabby-scheduler/src/lib.rs index e4b55d38a91f..7b14e9155f7e 100644 --- a/crates/tabby-scheduler/src/lib.rs +++ b/crates/tabby-scheduler/src/lib.rs @@ -15,7 +15,7 @@ use tracing::{error, info, warn}; pub async fn scheduler(now: bool, access: T) -> Result<()> { if now { let repositories = access.list_repositories().await?; - job_sync(&repositories)?; + job_sync(&repositories, &access)?; job_index(&repositories)?; } else { let access = Arc::new(access); @@ -37,7 +37,7 @@ pub async fn scheduler(now: bool, access: T) -> R .list_repositories() .await .expect("Must be able to retrieve repositories for sync"); - if let Err(e) = job_sync(&repositories) { + if let Err(e) = job_sync(&repositories, &*access) { error!("{e}"); } if let Err(e) = job_index(&repositories) { @@ -66,7 +66,7 @@ fn job_index(repositories: &[RepositoryConfig]) -> Result<()> { Ok(()) } -fn job_sync(repositories: &[RepositoryConfig]) -> Result<()> { +fn job_sync(repositories: &[RepositoryConfig], access: &impl RepositoryAccess) -> Result<()> { println!("Syncing {} repositories...", repositories.len()); let ret = repository::sync_repositories(repositories); if let Err(err) = ret { @@ -74,7 +74,7 @@ fn job_sync(repositories: &[RepositoryConfig]) -> Result<()> { } println!("Building dataset..."); - let ret = dataset::create_dataset(repositories); + let ret = dataset::create_dataset(repositories, access); if let Err(err) = ret { return Err(err.context("Failed to build dataset")); }