-
Notifications
You must be signed in to change notification settings - Fork 1.2k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
refactor(index): migrate corpus::WEB to corpus::STRUCTURED_DOC (#3352)
* add structured doc * [autofix.ci] apply automated fixes * chore: implement structured_doc::DocService * refactor(index): refactored `web_crawler.rs` to use updated `StructuredDoc` and `StructuredDocFields` types. run make fix * switch doc search * chore: adapt frontend * delete doc related files * run make fix * add deprecation notes for corpus::WEB * [autofix.ci] apply automated fixes * [autofix.ci] apply automated fixes (attempt 2/3) --------- Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
- Loading branch information
1 parent
414cac3
commit a52c4e6
Showing
39 changed files
with
773 additions
and
440 deletions.
There are no files selected for viewing
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,4 @@ | ||
pub mod code; | ||
pub mod doc; | ||
pub mod event; | ||
pub mod server_setting; | ||
pub mod structured_doc; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,169 @@ | ||
use async_trait::async_trait; | ||
use tantivy::{ | ||
schema::{self, document::CompactDocValue, Value}, | ||
TantivyDocument, | ||
}; | ||
use thiserror::Error; | ||
|
||
use crate::index::{structured_doc, IndexSchema}; | ||
|
||
pub struct DocSearchResponse { | ||
pub hits: Vec<DocSearchHit>, | ||
} | ||
|
||
pub struct DocSearchHit { | ||
pub score: f32, | ||
pub doc: DocSearchDocument, | ||
} | ||
|
||
#[derive(Clone)] | ||
pub enum DocSearchDocument { | ||
Web(DocSearchWebDocument), | ||
Issue(DocSearchIssueDocument), | ||
} | ||
|
||
#[derive(Error, Debug)] | ||
pub enum DocSearchError { | ||
#[error("index not ready")] | ||
NotReady, | ||
|
||
#[error(transparent)] | ||
QueryParserError(#[from] tantivy::query::QueryParserError), | ||
|
||
#[error(transparent)] | ||
TantivyError(#[from] tantivy::TantivyError), | ||
|
||
#[error(transparent)] | ||
Other(#[from] anyhow::Error), | ||
} | ||
|
||
#[async_trait] | ||
pub trait DocSearch: Send + Sync { | ||
/// Search docs from underlying index. | ||
/// | ||
/// * `source_ids`: Filter documents by source IDs, when empty, search all sources. | ||
async fn search( | ||
&self, | ||
source_ids: &[String], | ||
q: &str, | ||
limit: usize, | ||
) -> Result<DocSearchResponse, DocSearchError>; | ||
} | ||
|
||
/// A search result document of kind `"web"`.
#[derive(Clone)]
pub struct DocSearchWebDocument {
    /// Title attribute of the document.
    pub title: String,
    /// Link attribute of the document.
    pub link: String,
    /// Text of the matched chunk.
    pub snippet: String,
}
|
||
/// A search result document of kind `"issue"`.
#[derive(Clone)]
pub struct DocSearchIssueDocument {
    /// Title attribute of the issue.
    pub title: String,
    /// Link attribute of the issue.
    pub link: String,
    /// Body attribute of the issue.
    pub body: String,
    /// Value of the issue's `closed` attribute.
    pub closed: bool,
}
|
||
pub trait FromTantivyDocument { | ||
fn from_tantivy_document(doc: &TantivyDocument, chunk: &TantivyDocument) -> Option<Self> | ||
where | ||
Self: Sized; | ||
} | ||
|
||
impl FromTantivyDocument for DocSearchDocument { | ||
fn from_tantivy_document(doc: &TantivyDocument, chunk: &TantivyDocument) -> Option<Self> { | ||
let schema = IndexSchema::instance(); | ||
let kind = get_json_text_field(doc, schema.field_attributes, structured_doc::fields::KIND); | ||
|
||
match kind { | ||
"web" => { | ||
DocSearchWebDocument::from_tantivy_document(doc, chunk).map(DocSearchDocument::Web) | ||
} | ||
"issue" => DocSearchIssueDocument::from_tantivy_document(doc, chunk) | ||
.map(DocSearchDocument::Issue), | ||
_ => None, | ||
} | ||
} | ||
} | ||
|
||
impl FromTantivyDocument for DocSearchWebDocument { | ||
fn from_tantivy_document(doc: &TantivyDocument, chunk: &TantivyDocument) -> Option<Self> { | ||
let schema = IndexSchema::instance(); | ||
let title = get_json_text_field( | ||
doc, | ||
schema.field_attributes, | ||
structured_doc::fields::web::TITLE, | ||
); | ||
let link = get_json_text_field( | ||
doc, | ||
schema.field_attributes, | ||
structured_doc::fields::web::LINK, | ||
); | ||
let snippet = get_json_text_field( | ||
chunk, | ||
schema.field_chunk_attributes, | ||
structured_doc::fields::web::CHUNK_TEXT, | ||
); | ||
|
||
Some(Self { | ||
title: title.into(), | ||
link: link.into(), | ||
snippet: snippet.into(), | ||
}) | ||
} | ||
} | ||
|
||
impl FromTantivyDocument for DocSearchIssueDocument { | ||
fn from_tantivy_document(doc: &TantivyDocument, _: &TantivyDocument) -> Option<Self> { | ||
let schema = IndexSchema::instance(); | ||
let title = get_json_text_field( | ||
doc, | ||
schema.field_attributes, | ||
structured_doc::fields::issue::TITLE, | ||
); | ||
let link = get_json_text_field( | ||
doc, | ||
schema.field_attributes, | ||
structured_doc::fields::issue::LINK, | ||
); | ||
let body = get_json_text_field( | ||
doc, | ||
schema.field_attributes, | ||
structured_doc::fields::issue::BODY, | ||
); | ||
let closed = get_json_bool_field( | ||
doc, | ||
schema.field_attributes, | ||
structured_doc::fields::issue::CLOSED, | ||
); | ||
Some(Self { | ||
title: title.into(), | ||
link: link.into(), | ||
body: body.into(), | ||
closed, | ||
}) | ||
} | ||
} | ||
|
||
fn get_json_field<'a>( | ||
doc: &'a TantivyDocument, | ||
field: schema::Field, | ||
name: &str, | ||
) -> CompactDocValue<'a> { | ||
doc.get_first(field) | ||
.unwrap() | ||
.as_object() | ||
.unwrap() | ||
.find(|(k, _)| *k == name) | ||
.unwrap() | ||
.1 | ||
} | ||
|
||
fn get_json_bool_field(doc: &TantivyDocument, field: schema::Field, name: &str) -> bool { | ||
get_json_field(doc, field, name).as_bool().unwrap() | ||
} | ||
|
||
fn get_json_text_field<'a>(doc: &'a TantivyDocument, field: schema::Field, name: &str) -> &'a str { | ||
get_json_field(doc, field, name).as_str().unwrap() | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
/// Attribute names used for structured documents.
pub mod fields {
    /// Name of the attribute holding the document kind.
    pub const KIND: &str = "kind";

    /// Attribute names for documents of the web kind.
    pub mod web {
        pub const TITLE: &str = "title";
        pub const LINK: &str = "link";
        pub const CHUNK_TEXT: &str = "chunk_text";
    }

    /// Attribute names for documents of the issue kind.
    pub mod issue {
        pub const TITLE: &str = "title";
        pub const LINK: &str = "link";
        pub const BODY: &str = "body";
        pub const CLOSED: &str = "closed";
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file was deleted.
Oops, something went wrong.
Oops, something went wrong.