From 1a7f07d50b2bc6201a831a253017d3d0a4c2106a Mon Sep 17 00:00:00 2001 From: Meng Zhang Date: Sun, 12 May 2024 22:14:38 -0700 Subject: [PATCH] chore: add crawl_and_index example (#2105) * add cmt * update * renamings * add crawl_and_index example * update * update * update --- Cargo.lock | 1 + crates/tabby-common/src/path.rs | 5 +- crates/tabby-scheduler/Cargo.toml | 1 + .../examples/crawl_and_index.rs | 56 +++++++++++++++++++ crates/tabby-scheduler/examples/crawler.rs | 23 -------- crates/tabby-scheduler/src/code/cache.rs | 8 +-- crates/tabby-scheduler/src/code/index.rs | 4 +- .../tabby-scheduler/src/code/intelligence.rs | 6 +- crates/tabby-scheduler/src/code/types.rs | 4 +- crates/tabby-scheduler/src/doc/mod.rs | 10 ++-- crates/tabby-scheduler/src/lib.rs | 2 +- 11 files changed, 78 insertions(+), 42 deletions(-) create mode 100644 crates/tabby-scheduler/examples/crawl_and_index.rs delete mode 100644 crates/tabby-scheduler/examples/crawler.rs diff --git a/Cargo.lock b/Cargo.lock index 6002537bcd15..c81a9d81317d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5456,6 +5456,7 @@ dependencies = [ "tokio", "tokio-cron-scheduler", "tracing", + "tracing-subscriber 0.3.17", "tracing-test", "tree-sitter-c", "tree-sitter-c-sharp", diff --git a/crates/tabby-common/src/path.rs b/crates/tabby-common/src/path.rs index 3d43d51dc9dc..38bbe7698b0b 100644 --- a/crates/tabby-common/src/path.rs +++ b/crates/tabby-common/src/path.rs @@ -33,16 +33,18 @@ pub fn usage_id_file() -> PathBuf { tabby_root().join("usage_anonymous_id") } +// FIXME: migrate to /corpus/code/repositories pub fn repositories_dir() -> PathBuf { tabby_root().join("repositories") } +// FIXME: migrate to /corpus/code/tantivy pub fn index_dir() -> PathBuf { tabby_root().join("index") } pub fn doc_index_dir() -> PathBuf { - tabby_root().join("doc_index") + tabby_root().join("corpus").join("doc").join("tantivy") } pub fn models_dir() -> PathBuf { @@ -57,6 +59,7 @@ pub fn events_dir() -> PathBuf { tabby_root().join("events") } +// FIXME: migrate to /corpus/code/cache pub fn cache_dir() -> PathBuf { tabby_root().join("cache") } diff --git a/crates/tabby-scheduler/Cargo.toml b/crates/tabby-scheduler/Cargo.toml index b97c713a5c72..fd99ce4ed487 100644 --- a/crates/tabby-scheduler/Cargo.toml +++ b/crates/tabby-scheduler/Cargo.toml @@ -45,3 +45,4 @@ tracing-test = "0.1" tokio = { workspace = true, features = ["rt", "macros", "rt-multi-thread"] } serde_json = { workspace = true } async-trait = { workspace = true } +tracing-subscriber = { workspace = true } \ No newline at end of file diff --git a/crates/tabby-scheduler/examples/crawl_and_index.rs b/crates/tabby-scheduler/examples/crawl_and_index.rs new file mode 100644 index 000000000000..b10b9e9c4c2a --- /dev/null +++ b/crates/tabby-scheduler/examples/crawl_and_index.rs @@ -0,0 +1,56 @@ +use std::sync::Arc; + +use async_stream::stream; +use async_trait::async_trait; +use futures::StreamExt; +use tabby_inference::Embedding; +use tabby_scheduler::{crawl::crawl_pipeline, DocIndex, SourceDocument}; +use tracing::debug; + +#[tokio::main] +async fn main() { + tracing_subscriber::fmt() + .with_env_filter("tabby=debug,crawl_and_index=debug") + .init(); + + let mut doc_index = DocIndex::new(Arc::new(FakeEmbedding)); + let mut cnt = 0; + stream! { + for await doc in crawl_pipeline("https://tabby.tabbyml.com/").await { + debug!("Title: {:?}", doc.metadata.title); + debug!("Description: {:?}", doc.metadata.description); + debug!("URL: {}\n", doc.url); + cnt += 1; + if cnt >= 3 { + break; + } + + let id = cnt.to_string(); + debug!("Adding document {} to index...", id); + let source_doc = SourceDocument { + id, + title: doc.metadata.title.unwrap_or_default(), + link: doc.url, + body: doc.markdown, + }; + + doc_index.add(source_doc).await; + } + + doc_index.commit(); + } + .collect::<()>() + .await; +} + +struct FakeEmbedding; + +#[async_trait] +impl Embedding for FakeEmbedding { + async fn embed(&self, _: &str) -> anyhow::Result> { + let mut embedding = vec![0.0; 512]; + embedding[3] = 1.0; + embedding[128] = 1.0; + Ok(embedding) + } +} diff --git a/crates/tabby-scheduler/examples/crawler.rs b/crates/tabby-scheduler/examples/crawler.rs deleted file mode 100644 index eccc16ea68ed..000000000000 --- a/crates/tabby-scheduler/examples/crawler.rs +++ /dev/null @@ -1,23 +0,0 @@ -use async_stream::stream; -use futures::StreamExt; -use tabby_scheduler::crawl::crawl_pipeline; - -#[tokio::main] -async fn main() { - let mut cnt = 3; - stream! { - println!("Crawling https://tabby.tabbyml.com/"); - for await doc in crawl_pipeline("https://tabby.tabbyml.com/").await { - println!("Title: {:?}", doc.metadata.title); - println!("Description: {:?}", doc.metadata.description); - println!("URL: {}\n", doc.url); - println!("Markdown: {}", doc.markdown); - cnt -= 1; - if cnt <= 0 { - break; - } - } - } - .collect::<()>() - .await; -} diff --git a/crates/tabby-scheduler/src/code/cache.rs b/crates/tabby-scheduler/src/code/cache.rs index 4bf7c979a99e..3b86eca0d8bd 100644 --- a/crates/tabby-scheduler/src/code/cache.rs +++ b/crates/tabby-scheduler/src/code/cache.rs @@ -10,7 +10,7 @@ use serde::{Deserialize, Serialize}; use tabby_common::{config::RepositoryConfig, languages::get_language_by_ext}; use tracing::info; -use super::intelligence::{CodeIntelligence, SourceFile}; +use super::intelligence::{CodeIntelligence, SourceCode}; const SOURCE_FILE_BUCKET_KEY: &str = "source_files"; const INDEX_BUCKET_KEY: &str = "indexed_files"; @@ -145,10 +145,10 @@ impl CacheStore { &mut self, config: &RepositoryConfig, path: &Path, - ) -> Option { + ) -> Option { let key: String = SourceFileKey::try_from(path).ok()?.to_string(); - let dataset_bucket: Bucket>> = self + let dataset_bucket: Bucket>> = self .store .bucket(Some(SOURCE_FILE_BUCKET_KEY)) .expect("Could not access dataset bucket"); @@ -171,7 +171,7 @@ impl CacheStore { pub fn garbage_collection_for_source_files(&self) { info!("Started cleaning up 'source_files' bucket"); - let bucket: Bucket> = self + let bucket: Bucket> = self .store .bucket(Some(SOURCE_FILE_BUCKET_KEY)) .expect("Could not access dataset bucket"); diff --git a/crates/tabby-scheduler/src/code/index.rs b/crates/tabby-scheduler/src/code/index.rs index 511d54ea8596..7d266c77a034 100644 --- a/crates/tabby-scheduler/src/code/index.rs +++ b/crates/tabby-scheduler/src/code/index.rs @@ -6,7 +6,7 @@ use tracing::warn; use super::{ cache::CacheStore, - intelligence::{CodeIntelligence, SourceFile}, + intelligence::{CodeIntelligence, SourceCode}, }; use crate::tantivy_utils::open_or_create_index; @@ -118,7 +118,7 @@ pub fn remove_staled_documents(cache: &mut CacheStore, code: &CodeSearchSchema, gc_commit(); } -fn is_valid_file(file: &SourceFile) -> bool { +fn is_valid_file(file: &SourceCode) -> bool { file.max_line_length <= MAX_LINE_LENGTH_THRESHOLD && file.avg_line_length <= AVG_LINE_LENGTH_THRESHOLD } diff --git a/crates/tabby-scheduler/src/code/intelligence.rs b/crates/tabby-scheduler/src/code/intelligence.rs index c0bef0e18e1d..ca7c02383a9c 100644 --- a/crates/tabby-scheduler/src/code/intelligence.rs +++ b/crates/tabby-scheduler/src/code/intelligence.rs @@ -6,7 +6,7 @@ use tracing::warn; use tree_sitter_tags::TagsContext; use super::languages; -pub use super::types::{Point, SourceFile, Tag}; +pub use super::types::{Point, SourceCode, Tag}; pub struct CodeIntelligence { context: TagsContext, @@ -61,7 +61,7 @@ impl CodeIntelligence { &mut self, config: &RepositoryConfig, path: &Path, - ) -> Option { + ) -> Option { if path.is_dir() || !path.exists() { return None; } @@ -86,7 +86,7 @@ impl CodeIntelligence { return None; } }; - let source_file = SourceFile { + let source_file = SourceCode { git_url: config.canonical_git_url(), basedir: config.dir().display().to_string(), filepath: relative_path.display().to_string(), diff --git a/crates/tabby-scheduler/src/code/types.rs b/crates/tabby-scheduler/src/code/types.rs index 36222e880811..93133e16ebfc 100644 --- a/crates/tabby-scheduler/src/code/types.rs +++ b/crates/tabby-scheduler/src/code/types.rs @@ -3,7 +3,7 @@ use std::{ops::Range, path::Path}; use serde::{Deserialize, Serialize}; #[derive(Serialize, Deserialize, Clone)] -pub struct SourceFile { +pub struct SourceCode { pub git_url: String, pub basedir: String, pub filepath: String, @@ -14,7 +14,7 @@ pub struct SourceFile { pub tags: Vec, } -impl SourceFile { +impl SourceCode { pub fn read_content(&self) -> std::io::Result { let path = Path::new(&self.basedir).join(&self.filepath); std::fs::read_to_string(path) diff --git a/crates/tabby-scheduler/src/doc/mod.rs b/crates/tabby-scheduler/src/doc/mod.rs index 3ce19ffa51d3..970e53c79579 100644 --- a/crates/tabby-scheduler/src/doc/mod.rs +++ b/crates/tabby-scheduler/src/doc/mod.rs @@ -2,23 +2,22 @@ use std::{collections::HashSet, sync::Arc}; use tabby_common::{index::DocSearchSchema, path}; use tabby_inference::Embedding; -use tantivy::{doc, Index, IndexWriter, Term}; +use tantivy::{doc, IndexWriter, Term}; use text_splitter::{Characters, TextSplitter}; use tracing::warn; use crate::tantivy_utils::open_or_create_index; -struct Document { +pub struct SourceDocument { pub id: String, pub title: String, pub link: String, pub body: String, } -struct DocIndex { +pub struct DocIndex { embedding: Arc, doc: DocSearchSchema, - index: Index, writer: IndexWriter, splitter: TextSplitter, } @@ -40,13 +39,12 @@ impl DocIndex { Self { embedding, doc, - index, writer, splitter: TextSplitter::default().with_trim_chunks(true), } } - pub async fn add(&mut self, document: Document) { + pub async fn add(&mut self, document: SourceDocument) { // Delete the document if it already exists self.writer .delete_term(Term::from_field_text(self.doc.field_id, &document.id)); diff --git a/crates/tabby-scheduler/src/lib.rs b/crates/tabby-scheduler/src/lib.rs index cdcdb36e6183..a4d193f9dd4c 100644 --- a/crates/tabby-scheduler/src/lib.rs +++ b/crates/tabby-scheduler/src/lib.rs @@ -7,9 +7,9 @@ mod code; pub use code::CodeIndex; mod doc; - use std::sync::Arc; +pub use doc::{DocIndex, SourceDocument}; use tabby_common::config::{RepositoryAccess, RepositoryConfig}; use tokio_cron_scheduler::{Job, JobScheduler}; use tracing::{info, warn};