Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

refactor(webserver): switch implementation of file search tree walkin… #1934

Merged
merged 13 commits into from
Apr 26, 2024
62 changes: 59 additions & 3 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ ignore = "0.4.20"
nucleo = "0.5.0"
url = "2.5.0"
temp_testdir = "0.2"
git2 = "0.18.3"

[workspace.dependencies.uuid]
version = "1.3.3"
Expand Down
3 changes: 2 additions & 1 deletion ee/tabby-search/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,5 +7,6 @@ homepage.workspace = true

[dependencies]
anyhow.workspace = true
ignore.workspace = true
git2.workspace = true
nucleo.workspace = true
temp_testdir.workspace = true
85 changes: 68 additions & 17 deletions ee/tabby-search/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
use std::path::Path;

use ignore::Walk;
use std::{
collections::HashSet,
path::{Path, PathBuf},
};

pub struct FileSearch {
pub r#type: String,
Expand All @@ -24,6 +25,22 @@ impl FileSearch {
pattern: &str,
limit: usize,
) -> Result<Vec<FileSearch>, anyhow::Error> {
let paths = {
let repo = git2::Repository::open(base)?;
let index = repo.index()?;
let mut paths = HashSet::new();
index
.iter()
.map(|x| bytes2path(&x.path).to_owned())
.for_each(|relpath| {
if let Some(parent) = relpath.parent() {
paths.insert(parent.to_owned());
};
paths.insert(relpath);
});
paths.into_iter()
};

let mut nucleo = nucleo::Matcher::new(nucleo::Config::DEFAULT.match_paths());
let needle = nucleo::pattern::Pattern::new(
pattern,
Expand All @@ -32,26 +49,20 @@ impl FileSearch {
nucleo::pattern::AtomKind::Fuzzy,
);

let mut scored_entries: Vec<(_, _)> = Walk::new(base)
// Limit traversal for at most 1M entries for performance reasons.
.take(1_000_000)
.filter_map(|path| {
let entry = path.ok()?;
let r#type = if entry.file_type().map(|x| x.is_dir()).unwrap_or_default() {
let mut scored_entries: Vec<(_, _)> = paths
.filter_map(|basepath| {
let path = PathBuf::from(base).join(&basepath);
let metadata = path.metadata().ok()?;
let r#type = if metadata.is_dir() {
"dir".into()
} else {
"file".into()
};
let path = entry
.into_path()
.strip_prefix(base)
.ok()?
.to_string_lossy()
.into_owned();
let haystack: nucleo::Utf32String = path.clone().into();
let basepath = basepath.display().to_string();
let haystack: nucleo::Utf32String = basepath.clone().into();
let mut indices = Vec::new();
let score = needle.indices(haystack.slice(..), &mut nucleo, &mut indices);
score.map(|score| (score, FileSearch::new(r#type, path, indices)))
score.map(|score| (score, FileSearch::new(r#type, basepath, indices)))
})
// Ensure there's at least 1000 entries with scores > 0 for quality.
.take(1000)
Expand All @@ -67,3 +78,43 @@ impl FileSearch {
Ok(entries)
}
}

/// Views a raw byte string (for example the `path` field of a git index
/// entry) as a borrowed [`Path`], without copying.
///
/// Unix flavour: the bytes are wrapped as an `OsStr` unchanged, so no
/// encoding validation takes place.
#[cfg(unix)]
pub fn bytes2path(b: &[u8]) -> &Path {
    use std::{ffi::OsStr, os::unix::ffi::OsStrExt};

    let raw = OsStr::from_bytes(b);
    Path::new(raw)
}

/// Views a raw byte string (for example the `path` field of a git index
/// entry) as a borrowed [`Path`], without copying.
///
/// Windows flavour: the bytes are first decoded as UTF-8.
///
/// # Panics
///
/// Panics if `b` is not valid UTF-8.
#[cfg(windows)]
pub fn bytes2path(b: &[u8]) -> &Path {
    let text = std::str::from_utf8(b).unwrap();
    Path::new(text)
}

#[cfg(test)]
mod tests {
    use std::process::Command;

    use temp_testdir::TempDir;

    use crate::FileSearch;

    /// Shallow-clones a public repository into a temporary directory and
    /// checks that a fuzzy query resolves to the single expected file.
    ///
    /// Needs the `git` executable on `PATH` and access to the remote URL.
    #[test]
    fn it_search() {
        let root = TempDir::default();

        let status = Command::new("git")
            .current_dir(&root)
            .arg("clone")
            .args(["--depth", "1"])
            .arg("https://github.com/TabbyML/interview-questions")
            .status()
            .expect("failed to launch `git clone`");
        // A spawned-but-failed clone still yields `Ok(status)`; stop here with
        // a clear message instead of failing later inside the search call.
        assert!(status.success(), "`git clone` failed: {}", status);

        let dir = root.join("interview-questions");

        let result = FileSearch::search(dir.as_path(), "moonscript_lora md", 5).unwrap();
        assert_eq!(result.len(), 1);
        assert_eq!(result[0].r#type, "file");
        assert_eq!(result[0].path, "201_lm_moonscript_lora/README.md");
    }
}
72 changes: 0 additions & 72 deletions ee/tabby-webserver/src/service/git_repository.rs
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,6 @@ impl GitRepositoryService for DbConn {
#[cfg(test)]
mod tests {
use tabby_db::DbConn;
use temp_testdir::TempDir;

use super::*;

Expand Down Expand Up @@ -157,75 +156,4 @@ mod tests {
"Example2"
);
}

/// Exercises `search_files` against a small on-disk directory tree that is
/// registered as a repository through the service.
#[tokio::test]
pub async fn test_search_files() {
    let conn = DbConn::new_in_memory().await.unwrap();
    let service: &dyn GitRepositoryService = &conn;

    // Register the repository first; its directory is only created afterwards.
    let tmp = TempDir::default();
    let repo_name = "test_repo".to_owned();
    let repo_root = tmp.join(&repo_name);
    let git_url = format!("file://{}", repo_root.display());
    service.create(repo_name.clone(), git_url).await.unwrap();

    // Layout: three empty top-level files plus `inner/main.rs`.
    tokio::fs::create_dir(&repo_root).await.unwrap();
    for name in ["file1.txt", "file2.txt", "file3.txt"] {
        tokio::fs::write(repo_root.join(name), []).await.unwrap();
    }
    let inner_dir = repo_root.join("inner");
    tokio::fs::create_dir(&inner_dir).await.unwrap();
    tokio::fs::write(inner_dir.join("main.rs"), [])
        .await
        .unwrap();

    // Query "ex 1": `file1.txt` must be among the hits, `file2.txt` must not.
    let found = service.search_files(&repo_name, "ex 1", 100).await.unwrap();
    let paths: Vec<_> = found.into_iter().map(|f| f.path).collect();
    assert!(paths.iter().any(|p| p.contains("file1.txt")));
    assert!(paths.iter().all(|p| !p.contains("file2.txt")));

    // Query "rs": exactly one hit, and it is `main.rs`.
    let found = service.search_files(&repo_name, "rs", 10).await.unwrap();
    let paths: Vec<_> = found.into_iter().map(|f| f.path).collect();
    assert_eq!(paths.len(), 1);
    assert!(paths.iter().any(|p| p.contains("main.rs")));

    // Query "inner": two hits, at least one of them typed as a directory.
    let found = service.search_files(&repo_name, "inner", 10).await.unwrap();
    let hits: Vec<_> = found.into_iter().collect();
    assert!(hits.iter().any(|f| f.r#type == "dir"));
    assert_eq!(hits.len(), 2);

    // An empty pattern yields no hits.
    let found = service.search_files(&repo_name, "", 10).await.unwrap();
    let hits: Vec<_> = found.into_iter().collect();
    assert_eq!(hits.len(), 0);
}
}
Loading