Skip to content

Commit

Permalink
chore: add crawl_and_index example (#2105)
Browse files Browse the repository at this point in the history
* add cmt

* update

* renamings

* add crawl_and_index example

* update

* update

* update
  • Loading branch information
wsxiaoys authored May 13, 2024
1 parent e01650b commit 1a7f07d
Show file tree
Hide file tree
Showing 11 changed files with 78 additions and 42 deletions.
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 4 additions & 1 deletion crates/tabby-common/src/path.rs
Original file line number Diff line number Diff line change
Expand Up @@ -33,16 +33,18 @@ pub fn usage_id_file() -> PathBuf {
tabby_root().join("usage_anonymous_id")
}

// FIXME: migrate to /corpus/code/repositories
/// Root directory under which configured git repositories are cloned.
pub fn repositories_dir() -> PathBuf {
    let root = tabby_root();
    root.join("repositories")
}

// FIXME: migrate to /corpus/code/tantivy
/// Directory holding the tantivy index for source code.
pub fn index_dir() -> PathBuf {
    let root = tabby_root();
    root.join("index")
}

pub fn doc_index_dir() -> PathBuf {
tabby_root().join("doc_index")
tabby_root().join("corpus").join("doc").join("tantivy")
}

pub fn models_dir() -> PathBuf {
Expand All @@ -57,6 +59,7 @@ pub fn events_dir() -> PathBuf {
tabby_root().join("events")
}

// FIXME: migrate to /corpus/code/cache
/// Directory used as the on-disk cache.
pub fn cache_dir() -> PathBuf {
    let root = tabby_root();
    root.join("cache")
}
Expand Down
1 change: 1 addition & 0 deletions crates/tabby-scheduler/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -45,3 +45,4 @@ tracing-test = "0.1"
tokio = { workspace = true, features = ["rt", "macros", "rt-multi-thread"] }
serde_json = { workspace = true }
async-trait = { workspace = true }
tracing-subscriber = { workspace = true }
56 changes: 56 additions & 0 deletions crates/tabby-scheduler/examples/crawl_and_index.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
use std::sync::Arc;

use async_stream::stream;
use async_trait::async_trait;
use futures::StreamExt;
use tabby_inference::Embedding;
use tabby_scheduler::{crawl::crawl_pipeline, DocIndex, SourceDocument};
use tracing::debug;

#[tokio::main]
async fn main() {
    // Debug-level logging for the tabby crates and this example binary.
    tracing_subscriber::fmt()
        .with_env_filter("tabby=debug,crawl_and_index=debug")
        .init();

    // Use a fake (constant) embedding so the example runs without a model.
    let mut doc_index = DocIndex::new(Arc::new(FakeEmbedding));
    let mut cnt = 0;
    stream! {
        for await doc in crawl_pipeline("https://tabby.tabbyml.com/").await {
            debug!("Title: {:?}", doc.metadata.title);
            debug!("Description: {:?}", doc.metadata.description);
            debug!("URL: {}\n", doc.url);

            cnt += 1;
            let id = cnt.to_string();
            debug!("Adding document {} to index...", id);
            let source_doc = SourceDocument {
                id,
                title: doc.metadata.title.unwrap_or_default(),
                link: doc.url,
                body: doc.markdown,
            };
            doc_index.add(source_doc).await;

            // Fix: check the limit AFTER indexing so exactly three documents
            // are added. Previously the loop broke before building the third
            // document, so it was logged but never indexed.
            if cnt >= 3 {
                break;
            }
        }

        // Persist all pending additions.
        doc_index.commit();
    }
    .collect::<()>()
    .await;
}

/// Stand-in embedding backend for the example: returns the same fixed
/// 512-dimensional vector for every input, so no real model is needed.
struct FakeEmbedding;

#[async_trait]
impl Embedding for FakeEmbedding {
    async fn embed(&self, _: &str) -> anyhow::Result<Vec<f32>> {
        // Constant vector: 1.0 at positions 3 and 128, 0.0 everywhere else.
        let embedding = (0..512)
            .map(|i| if i == 3 || i == 128 { 1.0 } else { 0.0 })
            .collect();
        Ok(embedding)
    }
}
23 changes: 0 additions & 23 deletions crates/tabby-scheduler/examples/crawler.rs

This file was deleted.

8 changes: 4 additions & 4 deletions crates/tabby-scheduler/src/code/cache.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ use serde::{Deserialize, Serialize};
use tabby_common::{config::RepositoryConfig, languages::get_language_by_ext};
use tracing::info;

use super::intelligence::{CodeIntelligence, SourceFile};
use super::intelligence::{CodeIntelligence, SourceCode};

const SOURCE_FILE_BUCKET_KEY: &str = "source_files";
const INDEX_BUCKET_KEY: &str = "indexed_files";
Expand Down Expand Up @@ -145,10 +145,10 @@ impl CacheStore {
&mut self,
config: &RepositoryConfig,
path: &Path,
) -> Option<SourceFile> {
) -> Option<SourceCode> {
let key: String = SourceFileKey::try_from(path).ok()?.to_string();

let dataset_bucket: Bucket<String, Json<Option<SourceFile>>> = self
let dataset_bucket: Bucket<String, Json<Option<SourceCode>>> = self
.store
.bucket(Some(SOURCE_FILE_BUCKET_KEY))
.expect("Could not access dataset bucket");
Expand All @@ -171,7 +171,7 @@ impl CacheStore {

pub fn garbage_collection_for_source_files(&self) {
info!("Started cleaning up 'source_files' bucket");
let bucket: Bucket<String, Json<SourceFile>> = self
let bucket: Bucket<String, Json<SourceCode>> = self
.store
.bucket(Some(SOURCE_FILE_BUCKET_KEY))
.expect("Could not access dataset bucket");
Expand Down
4 changes: 2 additions & 2 deletions crates/tabby-scheduler/src/code/index.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ use tracing::warn;

use super::{
cache::CacheStore,
intelligence::{CodeIntelligence, SourceFile},
intelligence::{CodeIntelligence, SourceCode},
};
use crate::tantivy_utils::open_or_create_index;

Expand Down Expand Up @@ -118,7 +118,7 @@ pub fn remove_staled_documents(cache: &mut CacheStore, code: &CodeSearchSchema,
gc_commit();
}

fn is_valid_file(file: &SourceFile) -> bool {
fn is_valid_file(file: &SourceCode) -> bool {
file.max_line_length <= MAX_LINE_LENGTH_THRESHOLD
&& file.avg_line_length <= AVG_LINE_LENGTH_THRESHOLD
}
6 changes: 3 additions & 3 deletions crates/tabby-scheduler/src/code/intelligence.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ use tracing::warn;
use tree_sitter_tags::TagsContext;

use super::languages;
pub use super::types::{Point, SourceFile, Tag};
pub use super::types::{Point, SourceCode, Tag};

pub struct CodeIntelligence {
context: TagsContext,
Expand Down Expand Up @@ -61,7 +61,7 @@ impl CodeIntelligence {
&mut self,
config: &RepositoryConfig,
path: &Path,
) -> Option<SourceFile> {
) -> Option<SourceCode> {
if path.is_dir() || !path.exists() {
return None;
}
Expand All @@ -86,7 +86,7 @@ impl CodeIntelligence {
return None;
}
};
let source_file = SourceFile {
let source_file = SourceCode {
git_url: config.canonical_git_url(),
basedir: config.dir().display().to_string(),
filepath: relative_path.display().to_string(),
Expand Down
4 changes: 2 additions & 2 deletions crates/tabby-scheduler/src/code/types.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ use std::{ops::Range, path::Path};
use serde::{Deserialize, Serialize};

#[derive(Serialize, Deserialize, Clone)]
pub struct SourceFile {
pub struct SourceCode {
pub git_url: String,
pub basedir: String,
pub filepath: String,
Expand All @@ -14,7 +14,7 @@ pub struct SourceFile {
pub tags: Vec<Tag>,
}

impl SourceFile {
impl SourceCode {
pub fn read_content(&self) -> std::io::Result<String> {
let path = Path::new(&self.basedir).join(&self.filepath);
std::fs::read_to_string(path)
Expand Down
10 changes: 4 additions & 6 deletions crates/tabby-scheduler/src/doc/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,23 +2,22 @@ use std::{collections::HashSet, sync::Arc};

use tabby_common::{index::DocSearchSchema, path};
use tabby_inference::Embedding;
use tantivy::{doc, Index, IndexWriter, Term};
use tantivy::{doc, IndexWriter, Term};
use text_splitter::{Characters, TextSplitter};
use tracing::warn;

use crate::tantivy_utils::open_or_create_index;

/// A crawled document to be added to the document index.
// Note: the stale pre-rename `struct Document` declaration left over from the
// diff has been removed; the public name is `SourceDocument`.
pub struct SourceDocument {
    /// Unique identifier; used to replace any existing document with the same id.
    pub id: String,
    pub title: String,
    /// URL the document was crawled from.
    pub link: String,
    /// Document content (markdown, per the crawl_and_index example).
    pub body: String,
}

struct DocIndex {
pub struct DocIndex {
embedding: Arc<dyn Embedding>,
doc: DocSearchSchema,
index: Index,
writer: IndexWriter,
splitter: TextSplitter<Characters>,
}
Expand All @@ -40,13 +39,12 @@ impl DocIndex {
Self {
embedding,
doc,
index,
writer,
splitter: TextSplitter::default().with_trim_chunks(true),
}
}

pub async fn add(&mut self, document: Document) {
pub async fn add(&mut self, document: SourceDocument) {
// Delete the document if it already exists
self.writer
.delete_term(Term::from_field_text(self.doc.field_id, &document.id));
Expand Down
2 changes: 1 addition & 1 deletion crates/tabby-scheduler/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,9 @@ mod code;
pub use code::CodeIndex;

mod doc;

use std::sync::Arc;

pub use doc::{DocIndex, SourceDocument};
use tabby_common::config::{RepositoryAccess, RepositoryConfig};
use tokio_cron_scheduler::{Job, JobScheduler};
use tracing::{info, warn};
Expand Down

0 comments on commit 1a7f07d

Please sign in to comment.