Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

chore: add crawl_and_index example #2105

Merged
merged 8 commits into from
May 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 4 additions & 1 deletion crates/tabby-common/src/path.rs
Original file line number Diff line number Diff line change
Expand Up @@ -33,16 +33,18 @@
tabby_root().join("usage_anonymous_id")
}

// FIXME: migrate to /corpus/code/repositories
/// Path of the `repositories` entry directly under the location returned by `tabby_root()`.
pub fn repositories_dir() -> PathBuf {
    let root = tabby_root();
    root.join("repositories")
}

// FIXME: migrate to /corpus/code/tantivy
/// Path of the `index` entry directly under the location returned by `tabby_root()`.
pub fn index_dir() -> PathBuf {
    let root = tabby_root();
    root.join("index")
}

pub fn doc_index_dir() -> PathBuf {
tabby_root().join("doc_index")
tabby_root().join("corpus").join("doc").join("tantivy")

Check warning on line 47 in crates/tabby-common/src/path.rs

View check run for this annotation

Codecov / codecov/patch

crates/tabby-common/src/path.rs#L47

Added line #L47 was not covered by tests
}

pub fn models_dir() -> PathBuf {
Expand All @@ -57,6 +59,7 @@
tabby_root().join("events")
}

// FIXME: migrate to /corpus/code/cache
/// Path of the `cache` entry directly under the location returned by `tabby_root()`.
pub fn cache_dir() -> PathBuf {
    let root = tabby_root();
    root.join("cache")
}
Expand Down
1 change: 1 addition & 0 deletions crates/tabby-scheduler/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -45,3 +45,4 @@ tracing-test = "0.1"
tokio = { workspace = true, features = ["rt", "macros", "rt-multi-thread"] }
serde_json = { workspace = true }
async-trait = { workspace = true }
tracing-subscriber = { workspace = true }
56 changes: 56 additions & 0 deletions crates/tabby-scheduler/examples/crawl_and_index.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
use std::sync::Arc;

use async_stream::stream;
use async_trait::async_trait;
use futures::StreamExt;
use tabby_inference::Embedding;
use tabby_scheduler::{crawl::crawl_pipeline, DocIndex, SourceDocument};
use tracing::debug;

/// Example entry point: crawls `https://tabby.tabbyml.com/` and adds the crawled
/// pages to a [`DocIndex`] backed by [`FakeEmbedding`].
///
/// Every crawled page is logged at debug level. The loop stops as soon as the
/// third page has been seen, so pages one and two are indexed while the third
/// is only logged. The index is committed once the loop ends.
#[tokio::main]
async fn main() {
    // Enable debug output for the `tabby` and `crawl_and_index` targets.
    tracing_subscriber::fmt()
        .with_env_filter("tabby=debug,crawl_and_index=debug")
        .init();

    let mut index = DocIndex::new(Arc::new(FakeEmbedding));
    let mut seen = 0;

    // The stream yields no items; it only serves to drive the `for await` loop.
    let pipeline = stream! {
        for await page in crawl_pipeline("https://tabby.tabbyml.com/").await {
            debug!("Title: {:?}", page.metadata.title);
            debug!("Description: {:?}", page.metadata.description);
            debug!("URL: {}\n", page.url);

            // The counter is bumped before the limit check, so the page that
            // trips the limit is logged above but never reaches the index.
            seen += 1;
            if seen >= 3 {
                break;
            }

            // The running counter doubles as the document id.
            let id = seen.to_string();
            debug!("Adding document {} to index...", id);
            index
                .add(SourceDocument {
                    id,
                    title: page.metadata.title.unwrap_or_default(),
                    link: page.url,
                    body: page.markdown,
                })
                .await;
        }

        index.commit();
    };

    pipeline.collect::<()>().await;
}

/// Stand-in [`Embedding`] implementation that returns a constant vector
/// regardless of the text it is given.
struct FakeEmbedding;

#[async_trait]
impl Embedding for FakeEmbedding {
    /// Ignores the input and returns a 512-element vector that is `0.0`
    /// everywhere except for `1.0` at indices 3 and 128.
    async fn embed(&self, _: &str) -> anyhow::Result<Vec<f32>> {
        let embedding = (0..512)
            .map(|slot| if matches!(slot, 3 | 128) { 1.0 } else { 0.0 })
            .collect::<Vec<f32>>();
        Ok(embedding)
    }
}
23 changes: 0 additions & 23 deletions crates/tabby-scheduler/examples/crawler.rs

This file was deleted.

8 changes: 4 additions & 4 deletions crates/tabby-scheduler/src/code/cache.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
use tabby_common::{config::RepositoryConfig, languages::get_language_by_ext};
use tracing::info;

use super::intelligence::{CodeIntelligence, SourceFile};
use super::intelligence::{CodeIntelligence, SourceCode};

const SOURCE_FILE_BUCKET_KEY: &str = "source_files";
const INDEX_BUCKET_KEY: &str = "indexed_files";
Expand Down Expand Up @@ -145,10 +145,10 @@
&mut self,
config: &RepositoryConfig,
path: &Path,
) -> Option<SourceFile> {
) -> Option<SourceCode> {

Check warning on line 148 in crates/tabby-scheduler/src/code/cache.rs

View check run for this annotation

Codecov / codecov/patch

crates/tabby-scheduler/src/code/cache.rs#L148

Added line #L148 was not covered by tests
let key: String = SourceFileKey::try_from(path).ok()?.to_string();

let dataset_bucket: Bucket<String, Json<Option<SourceFile>>> = self
let dataset_bucket: Bucket<String, Json<Option<SourceCode>>> = self

Check warning on line 151 in crates/tabby-scheduler/src/code/cache.rs

View check run for this annotation

Codecov / codecov/patch

crates/tabby-scheduler/src/code/cache.rs#L151

Added line #L151 was not covered by tests
.store
.bucket(Some(SOURCE_FILE_BUCKET_KEY))
.expect("Could not access dataset bucket");
Expand All @@ -171,7 +171,7 @@

pub fn garbage_collection_for_source_files(&self) {
info!("Started cleaning up 'source_files' bucket");
let bucket: Bucket<String, Json<SourceFile>> = self
let bucket: Bucket<String, Json<SourceCode>> = self

Check warning on line 174 in crates/tabby-scheduler/src/code/cache.rs

View check run for this annotation

Codecov / codecov/patch

crates/tabby-scheduler/src/code/cache.rs#L174

Added line #L174 was not covered by tests
.store
.bucket(Some(SOURCE_FILE_BUCKET_KEY))
.expect("Could not access dataset bucket");
Expand Down
4 changes: 2 additions & 2 deletions crates/tabby-scheduler/src/code/index.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

use super::{
cache::CacheStore,
intelligence::{CodeIntelligence, SourceFile},
intelligence::{CodeIntelligence, SourceCode},
};
use crate::tantivy_utils::open_or_create_index;

Expand Down Expand Up @@ -118,7 +118,7 @@
gc_commit();
}

fn is_valid_file(file: &SourceFile) -> bool {
fn is_valid_file(file: &SourceCode) -> bool {

Check warning on line 121 in crates/tabby-scheduler/src/code/index.rs

View check run for this annotation

Codecov / codecov/patch

crates/tabby-scheduler/src/code/index.rs#L121

Added line #L121 was not covered by tests
file.max_line_length <= MAX_LINE_LENGTH_THRESHOLD
&& file.avg_line_length <= AVG_LINE_LENGTH_THRESHOLD
}
6 changes: 3 additions & 3 deletions crates/tabby-scheduler/src/code/intelligence.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
use tree_sitter_tags::TagsContext;

use super::languages;
pub use super::types::{Point, SourceFile, Tag};
pub use super::types::{Point, SourceCode, Tag};

pub struct CodeIntelligence {
context: TagsContext,
Expand Down Expand Up @@ -61,7 +61,7 @@
&mut self,
config: &RepositoryConfig,
path: &Path,
) -> Option<SourceFile> {
) -> Option<SourceCode> {

Check warning on line 64 in crates/tabby-scheduler/src/code/intelligence.rs

View check run for this annotation

Codecov / codecov/patch

crates/tabby-scheduler/src/code/intelligence.rs#L64

Added line #L64 was not covered by tests
if path.is_dir() || !path.exists() {
return None;
}
Expand All @@ -86,7 +86,7 @@
return None;
}
};
let source_file = SourceFile {
let source_file = SourceCode {

Check warning on line 89 in crates/tabby-scheduler/src/code/intelligence.rs

View check run for this annotation

Codecov / codecov/patch

crates/tabby-scheduler/src/code/intelligence.rs#L89

Added line #L89 was not covered by tests
git_url: config.canonical_git_url(),
basedir: config.dir().display().to_string(),
filepath: relative_path.display().to_string(),
Expand Down
4 changes: 2 additions & 2 deletions crates/tabby-scheduler/src/code/types.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ use std::{ops::Range, path::Path};
use serde::{Deserialize, Serialize};

#[derive(Serialize, Deserialize, Clone)]
pub struct SourceFile {
pub struct SourceCode {
pub git_url: String,
pub basedir: String,
pub filepath: String,
Expand All @@ -14,7 +14,7 @@ pub struct SourceFile {
pub tags: Vec<Tag>,
}

impl SourceFile {
impl SourceCode {
pub fn read_content(&self) -> std::io::Result<String> {
let path = Path::new(&self.basedir).join(&self.filepath);
std::fs::read_to_string(path)
Expand Down
10 changes: 4 additions & 6 deletions crates/tabby-scheduler/src/doc/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,23 +2,22 @@

use tabby_common::{index::DocSearchSchema, path};
use tabby_inference::Embedding;
use tantivy::{doc, Index, IndexWriter, Term};
use tantivy::{doc, IndexWriter, Term};
use text_splitter::{Characters, TextSplitter};
use tracing::warn;

use crate::tantivy_utils::open_or_create_index;

struct Document {
pub struct SourceDocument {
pub id: String,
pub title: String,
pub link: String,
pub body: String,
}

struct DocIndex {
pub struct DocIndex {
embedding: Arc<dyn Embedding>,
doc: DocSearchSchema,
index: Index,
writer: IndexWriter,
splitter: TextSplitter<Characters>,
}
Expand All @@ -40,13 +39,12 @@
Self {
embedding,
doc,
index,
writer,
splitter: TextSplitter::default().with_trim_chunks(true),
}
}

pub async fn add(&mut self, document: Document) {
pub async fn add(&mut self, document: SourceDocument) {

Check warning on line 47 in crates/tabby-scheduler/src/doc/mod.rs

View check run for this annotation

Codecov / codecov/patch

crates/tabby-scheduler/src/doc/mod.rs#L47

Added line #L47 was not covered by tests
// Delete the document if it already exists
self.writer
.delete_term(Term::from_field_text(self.doc.field_id, &document.id));
Expand Down
2 changes: 1 addition & 1 deletion crates/tabby-scheduler/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,9 @@ mod code;
pub use code::CodeIndex;

mod doc;

use std::sync::Arc;

pub use doc::{DocIndex, SourceDocument};
use tabby_common::config::{RepositoryAccess, RepositoryConfig};
use tokio_cron_scheduler::{Job, JobScheduler};
use tracing::{info, warn};
Expand Down
Loading