-
Notifications
You must be signed in to change notification settings - Fork 1.3k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
refactor: restructure code search (#2102)
* refactor: restructure search 1. Remove dataset- and deps-related functionality. 2. Add the document search index and APIs. * update
- Loading branch information
Showing
26 changed files
with
274 additions
and
644 deletions.
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
use async_trait::async_trait; | ||
use serde::{Deserialize, Serialize}; | ||
use thiserror::Error; | ||
use utoipa::ToSchema; | ||
|
||
#[derive(Default, Serialize, Deserialize, Debug, ToSchema)] | ||
pub struct DocSearchResponse { | ||
pub num_hits: usize, | ||
pub hits: Vec<DocSearchHit>, | ||
} | ||
|
||
#[derive(Serialize, Deserialize, Debug, ToSchema)] | ||
pub struct DocSearchHit { | ||
pub score: f32, | ||
pub doc: DocSearchDocument, | ||
pub id: u32, | ||
} | ||
|
||
#[derive(Serialize, Deserialize, Debug, ToSchema)] | ||
pub struct DocSearchDocument { | ||
pub title: String, | ||
pub link: String, | ||
pub snippet: String, | ||
} | ||
|
||
#[derive(Error, Debug)] | ||
pub enum DocSearchError { | ||
#[error("index not ready")] | ||
NotReady, | ||
|
||
#[error(transparent)] | ||
QueryParserError(#[from] tantivy::query::QueryParserError), | ||
|
||
#[error(transparent)] | ||
TantivyError(#[from] tantivy::TantivyError), | ||
|
||
#[error(transparent)] | ||
Other(#[from] anyhow::Error), | ||
} | ||
|
||
#[async_trait] | ||
pub trait DocSearch: Send + Sync { | ||
async fn search( | ||
&self, | ||
q: &str, | ||
limit: usize, | ||
offset: usize, | ||
) -> Result<DocSearchResponse, DocSearchError>; | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,5 @@ | ||
pub mod code; | ||
pub mod doc; | ||
pub mod event; | ||
pub mod server_setting; | ||
|
||
|
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
use tantivy::schema::{Field, Schema, STORED, STRING}; | ||
|
||
pub struct DocSearchSchema { | ||
pub schema: Schema, | ||
/// Binarized embedding tokens with the following mapping: | ||
/// * [-1, 0] -> 0 | ||
/// * (0, 1] -> 1 | ||
pub field_embedding_token: Field, | ||
|
||
pub field_title: Field, | ||
pub field_link: Field, | ||
pub field_snippet: Field, | ||
} | ||
|
||
impl DocSearchSchema { | ||
pub fn new() -> Self { | ||
let mut builder = Schema::builder(); | ||
|
||
let field_embedding_token = builder.add_text_field("embedding_token", STRING); | ||
let field_title = builder.add_text_field("title", STORED); | ||
let field_link = builder.add_text_field("link", STORED); | ||
let field_snippet = builder.add_text_field("snippet", STORED); | ||
|
||
let schema = builder.build(); | ||
|
||
Self { | ||
schema, | ||
field_embedding_token, | ||
field_title, | ||
field_link, | ||
field_snippet, | ||
} | ||
} | ||
} | ||
|
||
impl Default for DocSearchSchema { | ||
fn default() -> Self { | ||
Self::new() | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
mod code; | ||
pub use code::{register_tokenizers, CodeSearchSchema}; | ||
|
||
mod doc; | ||
pub use doc::DocSearchSchema; |
Oops, something went wrong.