Skip to content

Commit

Permalink
chore: add field_corpus to search index to search from different corpus (#2344)
Browse files Browse the repository at this point in the history

* add field_kind to filter query

* update

* update

* update

* [autofix.ci] apply automated fixes

* when index schema mismatch, don't load

---------

Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
  • Loading branch information
wsxiaoys and autofix-ci[bot] authored Jun 4, 2024
1 parent f5fcec2 commit 6d9adef
Show file tree
Hide file tree
Showing 7 changed files with 58 additions and 25 deletions.
8 changes: 7 additions & 1 deletion crates/tabby-common/src/index/code/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ use tantivy::{
};
pub use tokenizer::tokenize_code;

use super::IndexSchema;
use super::{corpus, IndexSchema};
use crate::api::code::CodeSearchQuery;

pub mod fields {
Expand Down Expand Up @@ -68,11 +68,17 @@ pub fn code_search_query(
query: &CodeSearchQuery,
chunk_tokens_query: Box<dyn Query>,
) -> BooleanQuery {
let schema = IndexSchema::instance();
let corpus_query = schema.corpus_query(corpus::CODE);
let language_query = language_query(&query.language);
let git_url_query = git_url_query(&query.git_url);

// The corpus / language / git_url / filepath filters shouldn't contribute to the score, so each is wrapped with a constant score of 0.0.
let mut subqueries: Vec<(Occur, Box<dyn Query>)> = vec![
(
Occur::Must,
Box::new(ConstScoreQuery::new(corpus_query, 0.0)),
),
(
Occur::Must,
Box::new(ConstScoreQuery::new(language_query, 0.0)),
Expand Down
15 changes: 15 additions & 0 deletions crates/tabby-common/src/index/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ pub struct IndexSchema {

// === Fields for both document and chunk ===
pub field_id: Field,
pub field_corpus: Field,
pub field_updated_at: Field,

// === Fields for document ===
Expand All @@ -32,6 +33,11 @@ pub struct IndexSchema {

const FIELD_CHUNK_ID: &str = "chunk_id";

/// Values written to the index's `corpus` field so that entries of different
/// kinds can be told apart (and filtered) within the same search index.
pub mod corpus {
    /// Corpus value used for source-code entries.
    pub const CODE: &str = "code";

    /// Corpus value used for web document entries.
    pub const WEB: &str = "web";
}

impl IndexSchema {
pub fn instance() -> &'static Self {
&INDEX_SCHEMA
Expand All @@ -41,6 +47,7 @@ impl IndexSchema {
let mut builder = Schema::builder();

let field_id = builder.add_text_field("id", STRING | STORED);
let field_corpus = builder.add_text_field("corpus", STRING | FAST);
let field_updated_at = builder.add_date_field("updated_at", INDEXED);
let field_attributes = builder.add_text_field("attributes", STORED);

Expand All @@ -64,6 +71,7 @@ impl IndexSchema {
Self {
schema,
field_id,
field_corpus,
field_updated_at,
field_attributes,

Expand All @@ -90,6 +98,13 @@ impl IndexSchema {
),
])
}

/// Builds a query that matches entries whose `corpus` field equals `kind`.
///
/// The term is looked up with `IndexRecordOption::Basic`. Callers in this
/// change wrap the result in a `ConstScoreQuery` so that it acts purely as a
/// filter.
pub fn corpus_query(&self, kind: &str) -> Box<dyn Query> {
    let corpus_term = Term::from_field_text(self.field_corpus, kind);
    let term_query = TermQuery::new(corpus_term, tantivy::schema::IndexRecordOption::Basic);
    Box::new(term_query)
}
}

lazy_static! {
Expand Down
13 changes: 6 additions & 7 deletions crates/tabby-scheduler/src/code/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,10 @@ use async_stream::stream;
use async_trait::async_trait;
use futures::stream::BoxStream;
use serde_json::json;
use tabby_common::{config::RepositoryConfig, index::code};
use tabby_common::{
config::RepositoryConfig,
index::{code, corpus},
};
use tabby_inference::Embedding;
use tracing::{info, warn};

Expand Down Expand Up @@ -61,12 +64,8 @@ impl CodeBuilder {

#[async_trait]
impl IndexAttributeBuilder<KeyedSourceCode> for CodeBuilder {
fn format_id(&self, id: &str) -> String {
format!("code:{}", id)
}

async fn build_id(&self, source_code: &KeyedSourceCode) -> String {
self.format_id(&source_code.key)
source_code.key.clone()
}

async fn build_attributes(&self, _source_code: &KeyedSourceCode) -> serde_json::Value {
Expand Down Expand Up @@ -127,5 +126,5 @@ impl IndexAttributeBuilder<KeyedSourceCode> for CodeBuilder {

fn create_code_index(embedding: Option<Arc<dyn Embedding>>) -> Indexer<KeyedSourceCode> {
let builder = CodeBuilder::new(embedding);
Indexer::new(builder)
Indexer::new(corpus::CODE, builder)
}
10 changes: 3 additions & 7 deletions crates/tabby-scheduler/src/doc/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ use async_stream::stream;
use async_trait::async_trait;
use futures::stream::BoxStream;
use serde_json::json;
use tabby_common::index::{self, doc};
use tabby_common::index::{self, corpus, doc};
use tabby_inference::Embedding;
use tantivy::doc;
use text_splitter::TextSplitter;
Expand Down Expand Up @@ -33,12 +33,8 @@ impl DocBuilder {

#[async_trait]
impl IndexAttributeBuilder<SourceDocument> for DocBuilder {
fn format_id(&self, id: &str) -> String {
format!("web:{id}")
}

async fn build_id(&self, document: &SourceDocument) -> String {
self.format_id(&document.id)
document.id.clone()
}

async fn build_attributes(&self, document: &SourceDocument) -> serde_json::Value {
Expand Down Expand Up @@ -89,5 +85,5 @@ impl IndexAttributeBuilder<SourceDocument> for DocBuilder {

pub fn create_web_index(embedding: Arc<dyn Embedding>) -> Indexer<SourceDocument> {
let builder = DocBuilder::new(embedding);
Indexer::new(builder)
Indexer::new(corpus::WEB, builder)
}
15 changes: 11 additions & 4 deletions crates/tabby-scheduler/src/indexer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@ use crate::tantivy_utils::open_or_create_index;

#[async_trait::async_trait]
pub trait IndexAttributeBuilder<T>: Send + Sync {
fn format_id(&self, id: &str) -> String;
async fn build_id(&self, document: &T) -> String;
async fn build_attributes(&self, document: &T) -> serde_json::Value;
async fn build_chunk_attributes(
Expand All @@ -16,20 +15,22 @@ pub trait IndexAttributeBuilder<T>: Send + Sync {
}

/// Writes documents of type `T` into the search index on behalf of a single corpus.
pub struct Indexer<T> {
// Corpus identifier: stored in the `corpus` field of every entry this indexer
// writes, and used as the prefix of the stored id (`"{kind}:{id}"`).
kind: &'static str,
// Produces the id and the attribute payloads for each document.
builder: Box<dyn IndexAttributeBuilder<T>>,
// Underlying tantivy writer used for deleting terms and committing.
writer: IndexWriter,
// First value returned by `open_or_create_index` when this indexer was built.
// NOTE(review): presumably true when the on-disk index had to be recreated — confirm.
pub recreated: bool,
}

impl<T> Indexer<T> {
pub fn new(builder: impl IndexAttributeBuilder<T> + 'static) -> Self {
pub fn new(kind: &'static str, builder: impl IndexAttributeBuilder<T> + 'static) -> Self {
let doc = IndexSchema::instance();
let (recreated, index) = open_or_create_index(&doc.schema, &path::index_dir());
let writer = index
.writer(150_000_000)
.expect("Failed to create index writer");

Self {
kind,
builder: Box::new(builder),
writer,
recreated,
Expand Down Expand Up @@ -60,7 +61,8 @@ impl<T> Indexer<T> {
let updated_at = tantivy::DateTime::from_utc(now);

let doc = doc! {
schema.field_id => id,
schema.field_id => self.format_id(&id),
schema.field_corpus => self.kind,
schema.field_attributes => self.builder.build_attributes(&document).await,
schema.field_updated_at => updated_at,
};
Expand All @@ -82,6 +84,7 @@ impl<T> Indexer<T> {
.map(move |(chunk_id, (tokens, chunk_attributes))| {
let mut doc = doc! {
schema.field_id => id,
schema.field_corpus => self.kind,
schema.field_updated_at => updated_at,
schema.field_chunk_id => format!("{}-{}", id, chunk_id),
schema.field_chunk_attributes => chunk_attributes,
Expand All @@ -98,10 +101,14 @@ impl<T> Indexer<T> {
pub fn delete(&self, id: &str) {
self.writer.delete_term(Term::from_field_text(
IndexSchema::instance().field_id,
&self.builder.format_id(id),
&self.format_id(id),
));
}

/// Returns the id as stored in the index: the corpus kind, a colon, then `id`.
fn format_id(&self, id: &str) -> String {
    let corpus = self.kind;
    format!("{}:{}", corpus, id)
}

pub fn commit(mut self) {
self.writer.commit().expect("Failed to commit changes");
self.writer
Expand Down
16 changes: 11 additions & 5 deletions crates/tabby/src/services/doc/tantivy.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,12 @@ use anyhow::Result;
use async_trait::async_trait;
use tabby_common::{
api::doc::{DocSearch, DocSearchDocument, DocSearchError, DocSearchHit, DocSearchResponse},
index::{self, doc},
index::{self, corpus, doc},
};
use tabby_inference::Embedding;
use tantivy::{
collector::TopDocs,
query::{BooleanQuery, ConstScoreQuery, Occur},
schema::{self, Value},
IndexReader, TantivyDocument,
};
Expand Down Expand Up @@ -36,12 +37,17 @@ impl DocSearchImpl {
let embedding = self.embedding.embed(q).await?;
let embedding_tokens_query =
index::embedding_tokens_query(embedding.len(), embedding.iter());
let corpus_query = schema.corpus_query(corpus::WEB);
let query = BooleanQuery::new(vec![
(
Occur::Must,
Box::new(ConstScoreQuery::new(corpus_query, 0.0)),
),
(Occur::Must, Box::new(embedding_tokens_query)),
]);

let searcher = reader.searcher();
let top_chunks = searcher.search(
&embedding_tokens_query,
&TopDocs::with_limit(limit).and_offset(offset),
)?;
let top_chunks = searcher.search(&query, &TopDocs::with_limit(limit).and_offset(offset))?;

let hits = top_chunks
.iter()
Expand Down
6 changes: 5 additions & 1 deletion crates/tabby/src/services/tantivy.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
use std::{sync::Arc, time::Duration};

use tabby_common::path;
use tabby_common::{index::IndexSchema, path};
use tantivy::{Index, IndexReader};
use tokio::sync::RwLock;
use tracing::debug;
Expand All @@ -20,6 +20,10 @@ impl IndexReaderProvider {
/// Opens the index located at `path::index_dir()` and builds a reader for it.
///
/// # Errors
///
/// Returns an error when the directory cannot be opened as an index, when the
/// on-disk schema differs from `IndexSchema::instance().schema`, or when the
/// reader cannot be constructed.
fn load() -> anyhow::Result<IndexReader> {
    let index = Index::open_in_dir(path::index_dir())?;

    // Refuse to serve an index that was written with a different schema.
    let expected_schema = &IndexSchema::instance().schema;
    if index.schema() != *expected_schema {
        anyhow::bail!("Index schema mismatch");
    }

    let reader: IndexReader = index.reader_builder().try_into()?;
    Ok(reader)
}

Expand Down

0 comments on commit 6d9adef

Please sign in to comment.