From 043b1824482ab562f6d2d3073391726f5d206240 Mon Sep 17 00:00:00 2001 From: Meng Zhang Date: Tue, 23 Apr 2024 00:40:23 -0700 Subject: [PATCH 01/12] refactor(webserver): switch implementation of file search tree walking with git ls-files --- Cargo.lock | 57 +++++++++++++++++++++++++++++++++++++- Cargo.toml | 1 + ee/tabby-search/Cargo.toml | 2 +- ee/tabby-search/src/lib.rs | 56 +++++++++++++++++++++++++++---------- 4 files changed, 99 insertions(+), 17 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index eb14dab9c06f..40b0522775b5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1480,6 +1480,21 @@ version = "0.27.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ad0a93d233ebf96623465aad4046a8d3aa4da22d4f4beba5388838c8a434bbb4" +[[package]] +name = "git2" +version = "0.18.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "232e6a7bfe35766bf715e55a88b39a700596c0ccfd88cd3680b4cdb40d66ef70" +dependencies = [ + "bitflags 2.4.0", + "libc", + "libgit2-sys", + "log", + "openssl-probe", + "openssl-sys", + "url", +] + [[package]] name = "glob" version = "0.3.1" @@ -2140,6 +2155,20 @@ version = "0.2.149" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a08173bc88b7955d1b3145aa561539096c421ac8debde8cbc3612ec635fee29b" +[[package]] +name = "libgit2-sys" +version = "0.16.2+1.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee4126d8b4ee5c9d9ea891dd875cfdc1e9d0950437179104b183d7d8a74d24e8" +dependencies = [ + "cc", + "libc", + "libssh2-sys", + "libz-sys", + "openssl-sys", + "pkg-config", +] + [[package]] name = "libloading" version = "0.7.4" @@ -2167,6 +2196,32 @@ dependencies = [ "vcpkg", ] +[[package]] +name = "libssh2-sys" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2dc8a030b787e2119a731f1951d6a773e2280c660f8ec4b0f5e1505a386e71ee" +dependencies = [ + "cc", + "libc", + "libz-sys", + "openssl-sys", + "pkg-config", + "vcpkg", +] + +[[package]] +name = "libz-sys" +version = "1.1.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e143b5e666b2695d28f6bca6497720813f699c9602dd7f5cac91008b8ada7f9" +dependencies = [ + "cc", + "libc", + "pkg-config", + "vcpkg", +] + [[package]] name = "link-cplusplus" version = "1.0.8" @@ -4668,7 +4723,7 @@ name = "tabby-search" version = "0.11.0-dev.0" dependencies = [ "anyhow", - "ignore", + "git2", "nucleo", ] diff --git a/Cargo.toml b/Cargo.toml index 450daded67fa..1d73c71f74f2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -55,6 +55,7 @@ ignore = "0.4.20" nucleo = "0.5.0" url = "2.5.0" temp_testdir = "0.2" +git2 = "0.18.3" [workspace.dependencies.uuid] version = "1.3.3" diff --git a/ee/tabby-search/Cargo.toml b/ee/tabby-search/Cargo.toml index 5283635fdb9e..1eaac19f9e86 100644 --- a/ee/tabby-search/Cargo.toml +++ b/ee/tabby-search/Cargo.toml @@ -7,5 +7,5 @@ homepage.workspace = true [dependencies] anyhow.workspace = true -ignore.workspace = true +git2.workspace = true nucleo.workspace = true diff --git a/ee/tabby-search/src/lib.rs b/ee/tabby-search/src/lib.rs index 70af80e7b4a1..1e36a98b87f3 100644 --- a/ee/tabby-search/src/lib.rs +++ b/ee/tabby-search/src/lib.rs @@ -1,6 +1,7 @@ -use std::path::Path; - -use ignore::Walk; +use std::{ + collections::HashSet, + path::{Path, PathBuf}, +}; pub struct FileSearch { pub r#type: String, @@ -24,6 +25,34 @@ impl FileSearch { pattern: &str, limit: usize, ) -> Result, anyhow::Error> { + let repo = git2::Repository::open(base)?; + let paths: Vec = { + let mut options = git2::StatusOptions::default(); + options.include_unmodified(true); + let statuses = repo.statuses(Some(&mut options))?; + + let mut dirs = HashSet::new(); + statuses + .iter() + .filter_map(|x| x.path().map(|x| x.to_owned())) + .flat_map(|relpath| { + let relpath = PathBuf::from(relpath); + let Some(parent) = relpath.parent() else { + return vec![relpath.to_owned()]; + }; + + // Add directories to paths as git statues only tracks files. + if !dirs.contains(parent) { + // De-dupe directories with `dirs` + dirs.insert(parent.to_owned()); + vec![parent.to_owned(), relpath] + } else { + vec![relpath] + } + }) + .collect() + }; + let mut nucleo = nucleo::Matcher::new(nucleo::Config::DEFAULT.match_paths()); let needle = nucleo::pattern::Pattern::new( pattern, @@ -32,26 +61,23 @@ impl FileSearch { nucleo::pattern::AtomKind::Fuzzy, ); - let mut scored_entries: Vec<(_, _)> = Walk::new(base) + let mut scored_entries: Vec<(_, _)> = paths + .into_iter() // Limit traversal for at most 1M entries for performance reasons. .take(1_000_000) - .filter_map(|path| { - let entry = path.ok()?; - let r#type = if entry.file_type().map(|x| x.is_dir()).unwrap_or_default() { + .filter_map(|basepath| { + let path = PathBuf::from(base).join(&basepath); + let metadata = path.metadata().ok()?; + let r#type = if metadata.is_dir() { "dir".into() } else { "file".into() }; - let path = entry - .into_path() - .strip_prefix(base) - .ok()? - .to_string_lossy() - .into_owned(); - let haystack: nucleo::Utf32String = path.clone().into(); + let basepath = basepath.display().to_string(); + let haystack: nucleo::Utf32String = basepath.clone().into(); let mut indices = Vec::new(); let score = needle.indices(haystack.slice(..), &mut nucleo, &mut indices); - score.map(|score| (score, FileSearch::new(r#type, path, indices))) + score.map(|score| (score, FileSearch::new(r#type, basepath, indices))) }) // Ensure there's at least 1000 entries with scores > 0 for quality. .take(1000) From 3c93057da124dd2a09be5efd80def576f80fde5a Mon Sep 17 00:00:00 2001 From: Meng Zhang Date: Tue, 23 Apr 2024 00:48:10 -0700 Subject: [PATCH 02/12] simplify --- ee/tabby-search/src/lib.rs | 25 ++++++++----------------- 1 file changed, 8 insertions(+), 17 deletions(-) diff --git a/ee/tabby-search/src/lib.rs b/ee/tabby-search/src/lib.rs index 1e36a98b87f3..228eec35a103 100644 --- a/ee/tabby-search/src/lib.rs +++ b/ee/tabby-search/src/lib.rs @@ -26,31 +26,23 @@ impl FileSearch { limit: usize, ) -> Result, anyhow::Error> { let repo = git2::Repository::open(base)?; - let paths: Vec = { + let paths = { let mut options = git2::StatusOptions::default(); options.include_unmodified(true); let statuses = repo.statuses(Some(&mut options))?; - let mut dirs = HashSet::new(); + let mut paths = HashSet::new(); statuses .iter() .filter_map(|x| x.path().map(|x| x.to_owned())) - .flat_map(|relpath| { + .for_each(|relpath| { let relpath = PathBuf::from(relpath); - let Some(parent) = relpath.parent() else { - return vec![relpath.to_owned()]; + if let Some(parent) = relpath.parent() { + paths.insert(parent.to_owned()); }; - - // Add directories to paths as git statues only tracks files. - if !dirs.contains(parent) { - // De-dupe directories with `dirs` - dirs.insert(parent.to_owned()); - vec![parent.to_owned(), relpath] - } else { - vec![relpath] - } - }) - .collect() + paths.insert(relpath); + }); + paths.into_iter() }; let mut nucleo = nucleo::Matcher::new(nucleo::Config::DEFAULT.match_paths()); @@ -62,7 +54,6 @@ impl FileSearch { ); let mut scored_entries: Vec<(_, _)> = paths - .into_iter() // Limit traversal for at most 1M entries for performance reasons. .take(1_000_000) .filter_map(|basepath| { From 91565b265c664648e57454769e1450b3cccfa612 Mon Sep 17 00:00:00 2001 From: Meng Zhang Date: Tue, 23 Apr 2024 11:48:28 -0700 Subject: [PATCH 03/12] update --- ee/tabby-search/src/lib.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/ee/tabby-search/src/lib.rs b/ee/tabby-search/src/lib.rs index 228eec35a103..088f71bb53d5 100644 --- a/ee/tabby-search/src/lib.rs +++ b/ee/tabby-search/src/lib.rs @@ -54,8 +54,6 @@ impl FileSearch { ); let mut scored_entries: Vec<(_, _)> = paths - // Limit traversal for at most 1M entries for performance reasons. - .take(1_000_000) .filter_map(|basepath| { let path = PathBuf::from(base).join(&basepath); let metadata = path.metadata().ok()?; From beb178665a3f7c18acf5dedebb6ff0ecb285076b Mon Sep 17 00:00:00 2001 From: Meng Zhang Date: Tue, 23 Apr 2024 11:48:28 -0700 Subject: [PATCH 04/12] update --- ee/tabby-search/src/lib.rs | 41 +++++++++++++++++++++++++++++++------- 1 file changed, 34 insertions(+), 7 deletions(-) diff --git a/ee/tabby-search/src/lib.rs b/ee/tabby-search/src/lib.rs index 088f71bb53d5..938283753572 100644 --- a/ee/tabby-search/src/lib.rs +++ b/ee/tabby-search/src/lib.rs @@ -3,6 +3,7 @@ use std::{ path::{Path, PathBuf}, }; +#[derive(Debug)] pub struct FileSearch { pub r#type: String, pub path: String, @@ -25,16 +26,13 @@ impl FileSearch { pattern: &str, limit: usize, ) -> Result, anyhow::Error> { - let repo = git2::Repository::open(base)?; let paths = { - let mut options = git2::StatusOptions::default(); - options.include_unmodified(true); - let statuses = repo.statuses(Some(&mut options))?; - + let repo = git2::Repository::open(base)?; + let index = repo.index()?; let mut paths = HashSet::new(); - statuses + index .iter() - .filter_map(|x| x.path().map(|x| x.to_owned())) + .map(|x| bytes2path(&x.path).to_owned()) .for_each(|relpath| { let relpath = PathBuf::from(relpath); if let Some(parent) = relpath.parent() { @@ -82,3 +80,32 @@ impl FileSearch { Ok(entries) } } + +#[cfg(unix)] +pub fn bytes2path(b: &[u8]) -> &Path { + use std::os::unix::prelude::*; + Path::new(std::ffi::OsStr::from_bytes(b)) +} +#[cfg(windows)] +pub fn bytes2path(b: &[u8]) -> &Path { + use std::str; + Path::new(str::from_utf8(b).unwrap()) +} + +#[cfg(test)] +mod tests { + use std::path::PathBuf; + + use crate::FileSearch; + + #[test] + fn it_search() { + let result = FileSearch::search( + &PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../../"), + "website".into(), + 1, + ) + .unwrap(); + assert_eq!(result.len(), 1); + } +} From a5257bd66e7c42f2a1d7d2d5d48d9b77c1b53008 Mon Sep 17 00:00:00 2001 From: Meng Zhang Date: Tue, 23 Apr 2024 15:36:39 -0700 Subject: [PATCH 05/12] update --- ee/tabby-search/src/lib.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/ee/tabby-search/src/lib.rs b/ee/tabby-search/src/lib.rs index 938283753572..3cc19b1dd3dd 100644 --- a/ee/tabby-search/src/lib.rs +++ b/ee/tabby-search/src/lib.rs @@ -99,6 +99,7 @@ mod tests { use crate::FileSearch; #[test] + #[ignore] fn it_search() { let result = FileSearch::search( &PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../../"), From 21d06fa7993f7618c8cc8e713df28de6232a6bc5 Mon Sep 17 00:00:00 2001 From: Meng Zhang Date: Tue, 23 Apr 2024 15:46:43 -0700 Subject: [PATCH 06/12] update --- .../src/service/git_repository.rs | 71 ------------------- 1 file changed, 71 deletions(-) diff --git a/ee/tabby-webserver/src/service/git_repository.rs b/ee/tabby-webserver/src/service/git_repository.rs index 7de1eab07b7c..3c198d507be9 100644 --- a/ee/tabby-webserver/src/service/git_repository.rs +++ b/ee/tabby-webserver/src/service/git_repository.rs @@ -157,75 +157,4 @@ mod tests { "Example2" ); } - - #[tokio::test] - pub async fn test_search_files() { - let db = DbConn::new_in_memory().await.unwrap(); - let service: &dyn GitRepositoryService = &db; - - let dir = TempDir::default(); - let repo_name = "test_repo".to_owned(); - let test_repo_dir = dir.join(&repo_name); - service - .create( - repo_name.clone(), - format!("file://{}", test_repo_dir.display()), - ) - .await - .unwrap(); - tokio::fs::create_dir(&test_repo_dir).await.unwrap(); - tokio::fs::write(test_repo_dir.join("file1.txt"), []) - .await - .unwrap(); - tokio::fs::write(test_repo_dir.join("file2.txt"), []) - .await - .unwrap(); - tokio::fs::write(test_repo_dir.join("file3.txt"), []) - .await - .unwrap(); - - let inner = test_repo_dir.join("inner"); - tokio::fs::create_dir(&inner).await.unwrap(); - tokio::fs::write(inner.join("main.rs"), []).await.unwrap(); - - let matches: Vec<_> = service - .search_files(&repo_name, "ex 1", 100) - .await - .unwrap() - .into_iter() - .map(|f| f.path) - .collect(); - - assert!(matches.iter().any(|p| p.contains("file1.txt"))); - assert!(!matches.iter().any(|p| p.contains("file2.txt"))); - - let matches: Vec<_> = service - .search_files(&repo_name, "rs", 10) - .await - .unwrap() - .into_iter() - .map(|f| f.path) - .collect(); - - assert_eq!(matches.len(), 1); - assert!(matches.iter().any(|p| p.contains("main.rs"))); - - let matches: Vec<_> = service - .search_files(&repo_name, "inner", 10) - .await - .unwrap() - .into_iter() - .collect(); - - assert!(matches.iter().any(|f| f.r#type == "dir")); - assert_eq!(matches.len(), 2); - - let matches: Vec<_> = service - .search_files(&repo_name, "", 10) - .await - .unwrap() - .into_iter() - .collect(); - assert_eq!(matches.len(), 0); - } } From 4f16f1cbac2f8f74aeba1b03e4ee758923aee6a0 Mon Sep 17 00:00:00 2001 From: Meng Zhang Date: Tue, 23 Apr 2024 15:47:12 -0700 Subject: [PATCH 07/12] update --- ee/tabby-search/src/lib.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/ee/tabby-search/src/lib.rs b/ee/tabby-search/src/lib.rs index 3cc19b1dd3dd..312119f957b3 100644 --- a/ee/tabby-search/src/lib.rs +++ b/ee/tabby-search/src/lib.rs @@ -34,7 +34,6 @@ impl FileSearch { .iter() .map(|x| bytes2path(&x.path).to_owned()) .for_each(|relpath| { - let relpath = PathBuf::from(relpath); if let Some(parent) = relpath.parent() { paths.insert(parent.to_owned()); }; @@ -103,7 +102,7 @@ mod tests { fn it_search() { let result = FileSearch::search( &PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../../"), - "website".into(), + "website", 1, ) .unwrap(); From e7f0897d8f4b0c7ef801bfe57b1628a18cb9f8f3 Mon Sep 17 00:00:00 2001 From: Meng Zhang Date: Thu, 25 Apr 2024 20:39:00 -0700 Subject: [PATCH 08/12] add test --- Cargo.lock | 1 + ee/tabby-search/Cargo.toml | 1 + ee/tabby-search/src/lib.rs | 24 ++++++++++++++++-------- 3 files changed, 18 insertions(+), 8 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 909398787fe7..92db4adeab3b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5076,6 +5076,7 @@ dependencies = [ "anyhow", "git2", "nucleo", + "temp_testdir", ] [[package]] diff --git a/ee/tabby-search/Cargo.toml b/ee/tabby-search/Cargo.toml index 1eaac19f9e86..6af5d9d55782 100644 --- a/ee/tabby-search/Cargo.toml +++ b/ee/tabby-search/Cargo.toml @@ -9,3 +9,4 @@ homepage.workspace = true anyhow.workspace = true git2.workspace = true nucleo.workspace = true +temp_testdir.workspace = true diff --git a/ee/tabby-search/src/lib.rs b/ee/tabby-search/src/lib.rs index 312119f957b3..ebb6a3c2f272 100644 --- a/ee/tabby-search/src/lib.rs +++ b/ee/tabby-search/src/lib.rs @@ -93,19 +93,27 @@ pub fn bytes2path(b: &[u8]) -> &Path { #[cfg(test)] mod tests { - use std::path::PathBuf; + use std::{path::PathBuf, process::Command}; + + use temp_testdir::TempDir; use crate::FileSearch; #[test] - #[ignore] fn it_search() { - let result = FileSearch::search( - &PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../../"), - "website", - 1, - ) - .unwrap(); + let root = TempDir::default(); + + Command::new("git") + .current_dir(&root) + .arg("clone") + .args(["--depth", "1"]) + .arg("https://github.com/TabbyML/interview-questions") + .status() + .unwrap(); + + let dir = root.join("interview-questions"); + + let result = FileSearch::search(dir.as_path(), "moonscript_lora", 1).unwrap(); assert_eq!(result.len(), 1); } } From f0a5c6ca35ae5b431de17978e3fd687fac1d7a42 Mon Sep 17 00:00:00 2001 From: Meng Zhang Date: Thu, 25 Apr 2024 20:41:54 -0700 Subject: [PATCH 09/12] add test --- ee/tabby-search/src/lib.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ee/tabby-search/src/lib.rs b/ee/tabby-search/src/lib.rs index ebb6a3c2f272..e92580789bd5 100644 --- a/ee/tabby-search/src/lib.rs +++ b/ee/tabby-search/src/lib.rs @@ -113,7 +113,9 @@ mod tests { let dir = root.join("interview-questions"); - let result = FileSearch::search(dir.as_path(), "moonscript_lora", 1).unwrap(); + let result = FileSearch::search(dir.as_path(), "moonscript_lora md", 5).unwrap(); assert_eq!(result.len(), 1); + assert_eq!(result[0].r#type, "file"); + assert_eq!(result[0].path, "201_lm_moonscript_lora/README.md"); } } From 02f13687b6a19b8a10e2350d2222547cae6efa24 Mon Sep 17 00:00:00 2001 From: "autofix-ci[bot]" <114827586+autofix-ci[bot]@users.noreply.github.com> Date: Fri, 26 Apr 2024 03:50:15 +0000 Subject: [PATCH 10/12] [autofix.ci] apply automated fixes --- ee/tabby-search/src/lib.rs | 2 +- ee/tabby-webserver/src/service/git_repository.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ee/tabby-search/src/lib.rs b/ee/tabby-search/src/lib.rs index e92580789bd5..93da901dd97a 100644 --- a/ee/tabby-search/src/lib.rs +++ b/ee/tabby-search/src/lib.rs @@ -93,7 +93,7 @@ pub fn bytes2path(b: &[u8]) -> &Path { #[cfg(test)] mod tests { - use std::{path::PathBuf, process::Command}; + use std::{process::Command}; use temp_testdir::TempDir; diff --git a/ee/tabby-webserver/src/service/git_repository.rs b/ee/tabby-webserver/src/service/git_repository.rs index 3c198d507be9..38f8859ad3a5 100644 --- a/ee/tabby-webserver/src/service/git_repository.rs +++ b/ee/tabby-webserver/src/service/git_repository.rs @@ -72,7 +72,7 @@ impl GitRepositoryService for DbConn { #[cfg(test)] mod tests { use tabby_db::DbConn; - use temp_testdir::TempDir; + use super::*; From 180db88c313ff0f833378171b6084b0d05078f8e Mon Sep 17 00:00:00 2001 From: "autofix-ci[bot]" <114827586+autofix-ci[bot]@users.noreply.github.com> Date: Fri, 26 Apr 2024 03:58:16 +0000 Subject: [PATCH 11/12] [autofix.ci] apply automated fixes (attempt 2/3) --- ee/tabby-search/src/lib.rs | 2 +- ee/tabby-webserver/src/service/git_repository.rs | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/ee/tabby-search/src/lib.rs b/ee/tabby-search/src/lib.rs index 93da901dd97a..65b68e3e5dea 100644 --- a/ee/tabby-search/src/lib.rs +++ b/ee/tabby-search/src/lib.rs @@ -93,7 +93,7 @@ pub fn bytes2path(b: &[u8]) -> &Path { #[cfg(test)] mod tests { - use std::{process::Command}; + use std::process::Command; use temp_testdir::TempDir; diff --git a/ee/tabby-webserver/src/service/git_repository.rs b/ee/tabby-webserver/src/service/git_repository.rs index 38f8859ad3a5..4cac5c789b36 100644 --- a/ee/tabby-webserver/src/service/git_repository.rs +++ b/ee/tabby-webserver/src/service/git_repository.rs @@ -72,7 +72,6 @@ impl GitRepositoryService for DbConn { #[cfg(test)] mod tests { use tabby_db::DbConn; - use super::*; From d5979ebe987809f64c5b0f4ab460554033d77870 Mon Sep 17 00:00:00 2001 From: Meng Zhang Date: Thu, 25 Apr 2024 21:08:24 -0700 Subject: [PATCH 12/12] update --- ee/tabby-search/src/lib.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/ee/tabby-search/src/lib.rs b/ee/tabby-search/src/lib.rs index 65b68e3e5dea..30d2a065ef26 100644 --- a/ee/tabby-search/src/lib.rs +++ b/ee/tabby-search/src/lib.rs @@ -3,7 +3,6 @@ use std::{ path::{Path, PathBuf}, }; -#[derive(Debug)] pub struct FileSearch { pub r#type: String, pub path: String,