From 0eeefde04a5c3b7609d47525e59cc27c1d7be08b Mon Sep 17 00:00:00 2001 From: Meng Zhang Date: Sun, 12 May 2024 21:18:53 -0700 Subject: [PATCH 1/8] add cmt --- crates/tabby-scheduler/src/doc/mod.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/crates/tabby-scheduler/src/doc/mod.rs b/crates/tabby-scheduler/src/doc/mod.rs index 3ce19ffa51d3..6efda335bcca 100644 --- a/crates/tabby-scheduler/src/doc/mod.rs +++ b/crates/tabby-scheduler/src/doc/mod.rs @@ -80,6 +80,10 @@ impl DocIndex { /// into binarized tokens by thresholding on zero. /// /// The current implementation deduplicates tokens at the document level, but this may require further consideration in the future. + /// We split the document into chunks and compute the embedding for each chunk, and convert embeddings + /// to binarized tokens by thresholding on zero. + /// + /// Current implementation deduplicate tokens at the document level, this might need further revise in future. async fn compute_embedding_tokens(&self, content: &str) -> Option> { let mut tokens = HashSet::new(); for chunk in self.splitter.chunks(content, CHUNK_SIZE) { From 679d83cfa3ec13c5f94c470034230d69dd6a7611 Mon Sep 17 00:00:00 2001 From: Meng Zhang Date: Sun, 12 May 2024 21:21:01 -0700 Subject: [PATCH 2/8] update --- crates/tabby-scheduler/src/doc/mod.rs | 4 ---- 1 file changed, 4 deletions(-) diff --git a/crates/tabby-scheduler/src/doc/mod.rs b/crates/tabby-scheduler/src/doc/mod.rs index 6efda335bcca..3ce19ffa51d3 100644 --- a/crates/tabby-scheduler/src/doc/mod.rs +++ b/crates/tabby-scheduler/src/doc/mod.rs @@ -80,10 +80,6 @@ impl DocIndex { /// into binarized tokens by thresholding on zero. /// /// The current implementation deduplicates tokens at the document level, but this may require further consideration in the future. - /// We split the document into chunks and compute the embedding for each chunk, and convert embeddings - /// to binarized tokens by thresholding on zero. - /// - /// Current implementation deduplicate tokens at the document level, this might need further revise in future. async fn compute_embedding_tokens(&self, content: &str) -> Option> { let mut tokens = HashSet::new(); for chunk in self.splitter.chunks(content, CHUNK_SIZE) { From 8fe1233da2b0f4fc07836d20216513c8f16bf369 Mon Sep 17 00:00:00 2001 From: Meng Zhang Date: Sun, 12 May 2024 21:38:30 -0700 Subject: [PATCH 3/8] renamings --- crates/tabby-common/src/path.rs | 4 +++- .../examples/{crawler.rs => crawl_and_index.rs} | 0 crates/tabby-scheduler/src/code/cache.rs | 8 ++++---- crates/tabby-scheduler/src/code/index.rs | 4 ++-- crates/tabby-scheduler/src/code/intelligence.rs | 6 +++--- crates/tabby-scheduler/src/code/types.rs | 4 ++-- crates/tabby-scheduler/src/doc/mod.rs | 8 +++----- crates/tabby-scheduler/src/lib.rs | 1 + 8 files changed, 18 insertions(+), 17 deletions(-) rename crates/tabby-scheduler/examples/{crawler.rs => crawl_and_index.rs} (100%) diff --git a/crates/tabby-common/src/path.rs b/crates/tabby-common/src/path.rs index 3d43d51dc9dc..2acc991aebd1 100644 --- a/crates/tabby-common/src/path.rs +++ b/crates/tabby-common/src/path.rs @@ -37,12 +37,13 @@ pub fn repositories_dir() -> PathBuf { tabby_root().join("repositories") } +// FIXME: migrate to /code_index/tantivy pub fn index_dir() -> PathBuf { tabby_root().join("index") } pub fn doc_index_dir() -> PathBuf { - tabby_root().join("doc_index") + tabby_root().join("doc_index").join("tantivy") } pub fn models_dir() -> PathBuf { @@ -57,6 +58,7 @@ pub fn events_dir() -> PathBuf { tabby_root().join("events") } +// FIXME: migrate to /code_index/cache pub fn cache_dir() -> PathBuf { tabby_root().join("cache") } diff --git a/crates/tabby-scheduler/examples/crawler.rs b/crates/tabby-scheduler/examples/crawl_and_index.rs similarity index 100% rename from crates/tabby-scheduler/examples/crawler.rs rename to crates/tabby-scheduler/examples/crawl_and_index.rs diff --git a/crates/tabby-scheduler/src/code/cache.rs b/crates/tabby-scheduler/src/code/cache.rs index 4bf7c979a99e..3b86eca0d8bd 100644 --- a/crates/tabby-scheduler/src/code/cache.rs +++ b/crates/tabby-scheduler/src/code/cache.rs @@ -10,7 +10,7 @@ use serde::{Deserialize, Serialize}; use tabby_common::{config::RepositoryConfig, languages::get_language_by_ext}; use tracing::info; -use super::intelligence::{CodeIntelligence, SourceFile}; +use super::intelligence::{CodeIntelligence, SourceCode}; const SOURCE_FILE_BUCKET_KEY: &str = "source_files"; const INDEX_BUCKET_KEY: &str = "indexed_files"; @@ -145,10 +145,10 @@ impl CacheStore { &mut self, config: &RepositoryConfig, path: &Path, - ) -> Option { + ) -> Option { let key: String = SourceFileKey::try_from(path).ok()?.to_string(); - let dataset_bucket: Bucket>> = self + let dataset_bucket: Bucket>> = self .store .bucket(Some(SOURCE_FILE_BUCKET_KEY)) .expect("Could not access dataset bucket"); @@ -171,7 +171,7 @@ impl CacheStore { pub fn garbage_collection_for_source_files(&self) { info!("Started cleaning up 'source_files' bucket"); - let bucket: Bucket> = self + let bucket: Bucket> = self .store .bucket(Some(SOURCE_FILE_BUCKET_KEY)) .expect("Could not access dataset bucket"); diff --git a/crates/tabby-scheduler/src/code/index.rs b/crates/tabby-scheduler/src/code/index.rs index 511d54ea8596..7d266c77a034 100644 --- a/crates/tabby-scheduler/src/code/index.rs +++ b/crates/tabby-scheduler/src/code/index.rs @@ -6,7 +6,7 @@ use tracing::warn; use super::{ cache::CacheStore, - intelligence::{CodeIntelligence, SourceFile}, + intelligence::{CodeIntelligence, SourceCode}, }; use crate::tantivy_utils::open_or_create_index; @@ -118,7 +118,7 @@ pub fn remove_staled_documents(cache: &mut CacheStore, code: &CodeSearchSchema, gc_commit(); } -fn is_valid_file(file: &SourceFile) -> bool { +fn is_valid_file(file: &SourceCode) -> bool { file.max_line_length <= MAX_LINE_LENGTH_THRESHOLD && file.avg_line_length <= AVG_LINE_LENGTH_THRESHOLD } diff --git a/crates/tabby-scheduler/src/code/intelligence.rs b/crates/tabby-scheduler/src/code/intelligence.rs index c0bef0e18e1d..ca7c02383a9c 100644 --- a/crates/tabby-scheduler/src/code/intelligence.rs +++ b/crates/tabby-scheduler/src/code/intelligence.rs @@ -6,7 +6,7 @@ use tracing::warn; use tree_sitter_tags::TagsContext; use super::languages; -pub use super::types::{Point, SourceFile, Tag}; +pub use super::types::{Point, SourceCode, Tag}; pub struct CodeIntelligence { context: TagsContext, @@ -61,7 +61,7 @@ impl CodeIntelligence { &mut self, config: &RepositoryConfig, path: &Path, - ) -> Option { + ) -> Option { if path.is_dir() || !path.exists() { return None; } @@ -86,7 +86,7 @@ impl CodeIntelligence { return None; } }; - let source_file = SourceFile { + let source_file = SourceCode { git_url: config.canonical_git_url(), basedir: config.dir().display().to_string(), filepath: relative_path.display().to_string(), diff --git a/crates/tabby-scheduler/src/code/types.rs b/crates/tabby-scheduler/src/code/types.rs index 36222e880811..93133e16ebfc 100644 --- a/crates/tabby-scheduler/src/code/types.rs +++ b/crates/tabby-scheduler/src/code/types.rs @@ -3,7 +3,7 @@ use std::{ops::Range, path::Path}; use serde::{Deserialize, Serialize}; #[derive(Serialize, Deserialize, Clone)] -pub struct SourceFile { +pub struct SourceCode { pub git_url: String, pub basedir: String, pub filepath: String, @@ -14,7 +14,7 @@ pub struct SourceFile { pub tags: Vec, } -impl SourceFile { +impl SourceCode { pub fn read_content(&self) -> std::io::Result { let path = Path::new(&self.basedir).join(&self.filepath); std::fs::read_to_string(path) diff --git a/crates/tabby-scheduler/src/doc/mod.rs b/crates/tabby-scheduler/src/doc/mod.rs index 3ce19ffa51d3..89f4f2bb11c8 100644 --- a/crates/tabby-scheduler/src/doc/mod.rs +++ b/crates/tabby-scheduler/src/doc/mod.rs @@ -8,17 +8,16 @@ use tracing::warn; use crate::tantivy_utils::open_or_create_index; -struct Document { +pub struct SourceDocument { pub id: String, pub title: String, pub link: String, pub body: String, } -struct DocIndex { +pub struct DocIndex { embedding: Arc, doc: DocSearchSchema, - index: Index, writer: IndexWriter, splitter: TextSplitter, } @@ -40,13 +39,12 @@ impl DocIndex { Self { embedding, doc, - index, writer, splitter: TextSplitter::default().with_trim_chunks(true), } } - pub async fn add(&mut self, document: Document) { + pub async fn add(&mut self, document: SourceDocument) { // Delete the document if it already exists self.writer .delete_term(Term::from_field_text(self.doc.field_id, &document.id)); diff --git a/crates/tabby-scheduler/src/lib.rs b/crates/tabby-scheduler/src/lib.rs index cdcdb36e6183..625e286c9b7e 100644 --- a/crates/tabby-scheduler/src/lib.rs +++ b/crates/tabby-scheduler/src/lib.rs @@ -7,6 +7,7 @@ mod code; pub use code::CodeIndex; mod doc; +pub use doc::DocIndex; use std::sync::Arc; From fe83090edd352bc8ef74e0d161e19f04ec624eb6 Mon Sep 17 00:00:00 2001 From: Meng Zhang Date: Sun, 12 May 2024 21:58:53 -0700 Subject: [PATCH 4/8] add crawl_and_index example --- Cargo.lock | 1 + crates/tabby-scheduler/Cargo.toml | 1 + .../examples/crawl_and_index.rs | 51 +++++++++++++++---- crates/tabby-scheduler/src/lib.rs | 3 +- 4 files changed, 45 insertions(+), 11 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 6002537bcd15..c81a9d81317d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5456,6 +5456,7 @@ dependencies = [ "tokio", "tokio-cron-scheduler", "tracing", + "tracing-subscriber 0.3.17", "tracing-test", "tree-sitter-c", "tree-sitter-c-sharp", diff --git a/crates/tabby-scheduler/Cargo.toml b/crates/tabby-scheduler/Cargo.toml index b97c713a5c72..fd99ce4ed487 100644 --- a/crates/tabby-scheduler/Cargo.toml +++ b/crates/tabby-scheduler/Cargo.toml @@ -45,3 +45,4 @@ tracing-test = "0.1" tokio = { workspace = true, features = ["rt", "macros", "rt-multi-thread"] } serde_json = { workspace = true } async-trait = { workspace = true } +tracing-subscriber = { workspace = true } \ No newline at end of file diff --git a/crates/tabby-scheduler/examples/crawl_and_index.rs b/crates/tabby-scheduler/examples/crawl_and_index.rs index eccc16ea68ed..d016bfe1531e 100644 --- a/crates/tabby-scheduler/examples/crawl_and_index.rs +++ b/crates/tabby-scheduler/examples/crawl_and_index.rs @@ -1,23 +1,56 @@ +use std::sync::Arc; + use async_stream::stream; +use async_trait::async_trait; use futures::StreamExt; -use tabby_scheduler::crawl::crawl_pipeline; +use tabby_inference::Embedding; +use tabby_scheduler::{crawl::crawl_pipeline, DocIndex, SourceDocument}; +use tracing::debug; #[tokio::main] async fn main() { - let mut cnt = 3; + let _ = tracing_subscriber::fmt() + .with_env_filter("tabby=debug,crawl_and_index=debug") + .init(); + + let mut doc_index = DocIndex::new(Arc::new(FakeEmbedding)); + let mut cnt = 0; stream! { - println!("Crawling https://tabby.tabbyml.com/"); for await doc in crawl_pipeline("https://tabby.tabbyml.com/").await { - println!("Title: {:?}", doc.metadata.title); - println!("Description: {:?}", doc.metadata.description); - println!("URL: {}\n", doc.url); - println!("Markdown: {}", doc.markdown); - cnt -= 1; - if cnt <= 0 { + debug!("Title: {:?}", doc.metadata.title); + debug!("Description: {:?}", doc.metadata.description); + debug!("URL: {}\n", doc.url); + cnt += 1; + if cnt >= 3 { break; } + + let id = cnt.to_string(); + debug!("Adding document {} to index...", id); + let source_doc = SourceDocument { + id, + title: doc.metadata.title.unwrap_or_default(), + link: doc.url, + body: doc.markdown, + }; + + doc_index.add(source_doc).await; } + + doc_index.commit(); } .collect::<()>() .await; } + +struct FakeEmbedding; + +#[async_trait] +impl Embedding for FakeEmbedding { + async fn embed(&self, _: &str) -> anyhow::Result> { + let mut embedding = vec![0.0; 512]; + embedding[3] = 1.0; + embedding[128] = 1.0; + Ok(embedding) + } +} diff --git a/crates/tabby-scheduler/src/lib.rs b/crates/tabby-scheduler/src/lib.rs index 625e286c9b7e..a4d193f9dd4c 100644 --- a/crates/tabby-scheduler/src/lib.rs +++ b/crates/tabby-scheduler/src/lib.rs @@ -7,10 +7,9 @@ mod code; pub use code::CodeIndex; mod doc; -pub use doc::DocIndex; - use std::sync::Arc; +pub use doc::{DocIndex, SourceDocument}; use tabby_common::config::{RepositoryAccess, RepositoryConfig}; use tokio_cron_scheduler::{Job, JobScheduler}; use tracing::{info, warn}; From fef1a4d53fa085ccbf5c7970dc1290a8a740d986 Mon Sep 17 00:00:00 2001 From: Meng Zhang Date: Sun, 12 May 2024 21:59:44 -0700 Subject: [PATCH 5/8] update --- crates/tabby-common/src/path.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/crates/tabby-common/src/path.rs b/crates/tabby-common/src/path.rs index 2acc991aebd1..d0a38fbe6971 100644 --- a/crates/tabby-common/src/path.rs +++ b/crates/tabby-common/src/path.rs @@ -33,6 +33,7 @@ pub fn usage_id_file() -> PathBuf { tabby_root().join("usage_anonymous_id") } +// FIXME: migrate to /code_index/repositories pub fn repositories_dir() -> PathBuf { tabby_root().join("repositories") } From 9b64597693bd23c8e26ff0aa99e1812fe2da798f Mon Sep 17 00:00:00 2001 From: Meng Zhang Date: Sun, 12 May 2024 22:00:01 -0700 Subject: [PATCH 6/8] update --- crates/tabby-common/src/path.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/crates/tabby-common/src/path.rs b/crates/tabby-common/src/path.rs index d0a38fbe6971..d74c2eb18801 100644 --- a/crates/tabby-common/src/path.rs +++ b/crates/tabby-common/src/path.rs @@ -33,18 +33,18 @@ pub fn usage_id_file() -> PathBuf { tabby_root().join("usage_anonymous_id") } -// FIXME: migrate to /code_index/repositories +// FIXME: migrate to /code/repositories pub fn repositories_dir() -> PathBuf { tabby_root().join("repositories") } -// FIXME: migrate to /code_index/tantivy +// FIXME: migrate to /code/tantivy pub fn index_dir() -> PathBuf { tabby_root().join("index") } pub fn doc_index_dir() -> PathBuf { - tabby_root().join("doc_index").join("tantivy") + tabby_root().join("doc").join("tantivy") } pub fn models_dir() -> PathBuf { @@ -59,7 +59,7 @@ pub fn events_dir() -> PathBuf { tabby_root().join("events") } -// FIXME: migrate to /code_index/cache +// FIXME: migrate to /code/cache pub fn cache_dir() -> PathBuf { tabby_root().join("cache") } From c455afb48450a36e1b3454b768e8336589865a18 Mon Sep 17 00:00:00 2001 From: Meng Zhang Date: Sun, 12 May 2024 22:01:45 -0700 Subject: [PATCH 7/8] update --- crates/tabby-common/src/path.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/crates/tabby-common/src/path.rs b/crates/tabby-common/src/path.rs index d74c2eb18801..38bbe7698b0b 100644 --- a/crates/tabby-common/src/path.rs +++ b/crates/tabby-common/src/path.rs @@ -33,18 +33,18 @@ pub fn usage_id_file() -> PathBuf { tabby_root().join("usage_anonymous_id") } -// FIXME: migrate to /code/repositories +// FIXME: migrate to /corpus/code/repositories pub fn repositories_dir() -> PathBuf { tabby_root().join("repositories") } -// FIXME: migrate to /code/tantivy +// FIXME: migrate to /corpus/code/tantivy pub fn index_dir() -> PathBuf { tabby_root().join("index") } pub fn doc_index_dir() -> PathBuf { - tabby_root().join("doc").join("tantivy") + tabby_root().join("corpus").join("doc").join("tantivy") } pub fn models_dir() -> PathBuf { @@ -59,7 +59,7 @@ pub fn events_dir() -> PathBuf { tabby_root().join("events") } -// FIXME: migrate to /code/cache +// FIXME: migrate to /corpus/code/cache pub fn cache_dir() -> PathBuf { tabby_root().join("cache") } From 978094a969e28a90423aac8a06444e7517cf52f5 Mon Sep 17 00:00:00 2001 From: "autofix-ci[bot]" <114827586+autofix-ci[bot]@users.noreply.github.com> Date: Mon, 13 May 2024 05:08:24 +0000 Subject: [PATCH 8/8] [autofix.ci] apply automated fixes --- crates/tabby-scheduler/examples/crawl_and_index.rs | 2 +- crates/tabby-scheduler/src/doc/mod.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/tabby-scheduler/examples/crawl_and_index.rs b/crates/tabby-scheduler/examples/crawl_and_index.rs index d016bfe1531e..b10b9e9c4c2a 100644 --- a/crates/tabby-scheduler/examples/crawl_and_index.rs +++ b/crates/tabby-scheduler/examples/crawl_and_index.rs @@ -9,7 +9,7 @@ use tracing::debug; #[tokio::main] async fn main() { - let _ = tracing_subscriber::fmt() + tracing_subscriber::fmt() .with_env_filter("tabby=debug,crawl_and_index=debug") .init(); diff --git a/crates/tabby-scheduler/src/doc/mod.rs b/crates/tabby-scheduler/src/doc/mod.rs index 89f4f2bb11c8..970e53c79579 100644 --- a/crates/tabby-scheduler/src/doc/mod.rs +++ b/crates/tabby-scheduler/src/doc/mod.rs @@ -2,7 +2,7 @@ use std::{collections::HashSet, sync::Arc}; use tabby_common::{index::DocSearchSchema, path}; use tabby_inference::Embedding; -use tantivy::{doc, Index, IndexWriter, Term}; +use tantivy::{doc, IndexWriter, Term}; use text_splitter::{Characters, TextSplitter}; use tracing::warn;