diff --git a/CMakeLists.txt b/CMakeLists.txt
index 525a6f2..a3ba4f2 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,4 +1,5 @@
 cmake_minimum_required(VERSION 3.15)
+project(mgcxx VERSION 0.0.3)
 set (CMAKE_CXX_STANDARD 20)
 if (NOT CMAKE_BUILD_TYPE)
   set(CMAKE_BUILD_TYPE "Debug")
@@ -6,7 +7,6 @@ endif()
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 include(ExternalProject)
 include(FetchContent)
-project(cxxtantivy)
 option(ENABLE_TESTS "Enable tests" ON)
 
 # NOTE: Be careful with moving this outside of the if block (it should not be
diff --git a/README.md b/README.md
index e34bf85..64714a0 100644
--- a/README.md
+++ b/README.md
@@ -1,11 +1,17 @@
-# cxxtantivy
+# mgcxx (experimental)
 
-## Work in Progress
+A collection of C++ wrappers around Rust libraries.
+The list includes:
+  * full-text search enabled by [tantivy](https://github.com/quickwit-oss/tantivy)
+
+## text_search
 
 ### TODOs
 
-- [ ] Figure out the right API ⏳
-  - [ ] All READ methods (`search`, `aggregate`, `find`) depend on the exact schema -> make it robust
+- [ ] Implement full API
+  - [ ] delete
+  - [ ] update
+- [ ] Polish & test all error messages
 - [ ] Write unit / integration test to compare STRING vs JSON fields search query syntax.
 - [ ] Figure out the right search syntax for a property graph
 - [ ] Add some notion of pagination
@@ -29,14 +35,21 @@
 - [ ] Note [DocAddress](https://docs.rs/tantivy/latest/tantivy/struct.DocAddress.html) is composed of 2 u32 but the `SegmentOrdinal` is tied to the `Searcher` -> is it possible/wise to cache the address (`SegmentId` is UUID)
 - [ ] A [searcher](https://docs.rs/tantivy/latest/tantivy/struct.IndexReader.html#method.searcher) per transaction -> cache `DocAddress` inside Memgraph's `ElementAccessors`?
 - [ ] Implement the stress test by adding & searching to the same index concurrently + large dataset generator.
+- [ ] Consider implementing a panic! handler that prevents the host process from crashing (optional).
 
 ### NOTEs
 
 * if a field doesn't get specified in the schema, it's ignored
-* `TEXT` means the field will be tokenized and indexed (required to be able to search)
-* Tantivy add_json_object accepts serde_json::map::Map.
+* `TEXT` means the field will be tokenized and indexed (required to be able to
+  search)
+* Tantivy `add_json_object` accepts `serde_json::map::Map`
+* The C++ text-search API is snake_case because it's implemented in Rust
+* Writing each document and then committing (writing to disk) will be
+  expensive. In a standard OLTP workload that's a common case -> introduce some
+  form of batching (see the sketch below).
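+
+A minimal sketch of that batching idea with the current `add`/`commit` API (the
+`context` and the `batch` of `DocumentInput`s here are hypothetical placeholders):
+
+```cpp
+// Skip the per-document commit and pay the disk write once per batch.
+for (const auto &doc : batch) {
+  memcxx::text_search::add(context, doc, /*skip_commit=*/true);
+}
+memcxx::text_search::commit(context);
+```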
 
 ## Resources
 
 * https://fulmicoton.com/posts/behold-tantivy-part2
 * https://stackoverflow.com/questions/37924383/combining-several-static-libraries-into-one-using-cmake
+  --> decided to build 2 separate libraries that user code has to link
diff --git a/text_search/ci.sh b/text_search/ci.sh
index feb3c12..e92ce3b 100755
--- a/text_search/ci.sh
+++ b/text_search/ci.sh
@@ -29,23 +29,35 @@ echo "Run config:"
 echo "  full   : $MGCXX_TEXT_SEARCH_CI_FULL"
 echo "  release: $MGCXX_TEXT_SEARCH_CI_RELEASE"
 
+cd "$SCRIPT_DIR/.."
+FILES_TO_FIX=$({ git diff --name-only ; git diff --name-only --staged ; } | sort | uniq | egrep "\.c$|\.cpp$|\.cxx$|\.h$|\.hpp$|\.hxx$" || true)
+if [ ! -z "$FILES_TO_FIX" ]; then
+  # NOTE: FILES_TO_FIX is intentionally unquoted so the list word-splits into
+  # one file per iteration.
+  for file in ${FILES_TO_FIX}; do
+    clang-format -i -verbose "${file}"
+  done
+fi
 cd "$SCRIPT_DIR"
-# TODO(gitbuda): Add clang-format call here.
 cargo fmt
 
 mkdir -p "$SCRIPT_DIR/../build"
 cd "$SCRIPT_DIR/../build"
 if [ "$MGCXX_TEXT_SEARCH_CI_FULL" = true ]; then
   rm -rf ./* && rm -rf .cache
+  # Added here because Cargo.lock is ignored for libraries, but it's not
+  # located under the build folder. Rebuilding from scratch should also start
+  # clean from cargo's perspective.
+  rm "$SCRIPT_DIR/Cargo.lock" || true
 else
   rm -rf index*
 fi
+
 if [ "$MGCXX_TEXT_SEARCH_CI_RELEASE" = true ]; then
   cmake -DCMAKE_BUILD_TYPE=Release ..
 else
   cmake ..
 fi
 make -j8
+
 cd "$SCRIPT_DIR/../build/text_search"
 ./test_unit
 ./test_bench
diff --git a/text_search/src/lib.rs b/text_search/src/lib.rs
index 10fb809..23eb066 100644
--- a/text_search/src/lib.rs
+++ b/text_search/src/lib.rs
@@ -35,6 +35,7 @@ mod ffi {
     /// semantics might be different.
     struct IndexConfig {
         mappings: String,
+        // TODO(gitbuda): Add tokenizer as an option (each field can have one).
     }
 
     struct DocumentInput {
@@ -42,24 +43,19 @@
         /// Mappings inside IndexConfig define how data will be handled.
         data: String,
     }
-    // NOTE: The input struct is / should be aligned with the schema.
-    // NOTE: Having a specific input object under ffi is a challange for general solution.
-    // NOTE: The following are metadata fields required by Memgraph
-    //   metadata: String,
-    //   gid: u64,
-    //   txid: u64,
-    //   deleted: bool,
-    //   is_node: bool,
-    //   props: String, // TODO(gitbuda): Consider using https://cxx.rs/binding/cxxstring.html
+    // NOTE: The input struct is / should be aligned with the schema.
+
+    struct DocumentOutput {
+        data: String, // NOTE: Here should probably be Option but it's not supported in cxx.
+    }
 
     struct SearchInput {
+        search_fields: Vec<String>,
         search_query: String,
+        return_fields: Vec<String>,
         aggregation_query: String,
         // TODO(gitbuda): Add stuff like skip & limit.
-    }
-
-    struct DocumentOutput {
-        data: String, // NOTE: Here should probably be Option but it's not supported in cxx.
+        // NOTE: Any primitive value here is a bit of a problem because of default value on the C++
+        // side.
     }
 
     struct SearchOutput {
         docs: Vec<DocumentOutput>,
     }
@@ -69,276 +65,31 @@
     // NOTE: Since return type is Result, always return Result.
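+    // NOTE: cxx exposes each fallible function below to C++ as a call that
+    // throws ::rust::Error when the Rust side returns Err (that is the
+    // exception type the C++ tests catch).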
extern "Rust" { type TantivyContext; - fn drop_index(name: &String) -> Result<()>; fn init(_log_level: &String) -> Result<()>; /// path is just passed into std::path::Path::new -> pass any absolute or relative path to /// yours process working directory /// config contains mappings definition, take a look under [IndexConfig] fn create_index(path: &String, config: &IndexConfig) -> Result; - fn aggregate(context: &mut Context, input: &SearchInput) -> Result; - fn search(context: &mut Context, input: &SearchInput) -> Result; - fn find(context: &mut Context, input: &SearchInput) -> Result; fn add(context: &mut Context, input: &DocumentInput, skip_commit: bool) -> Result<()>; fn commit(context: &mut Context) -> Result<()>; fn rollback(context: &mut Context) -> Result<()>; + fn search(context: &mut Context, input: &SearchInput) -> Result; + fn aggregate(context: &mut Context, input: &SearchInput) -> Result; + fn drop_index(path: &String) -> Result<()>; } } pub struct TantivyContext { + pub index_path: std::path::PathBuf, pub schema: Schema, pub index: Index, pub index_writer: IndexWriter, } -fn rollback(context: &mut ffi::Context) -> Result<(), std::io::Error> { - let index_writer = &mut context.tantivyContext.index_writer; - match index_writer.rollback() { - Ok(_) => { - return Ok(()); - } - Err(e) => { - return Err(Error::new( - ErrorKind::Other, - format!("Unable to rollback -> {}", e), - )); - } - } -} - -fn commit_(index_writer: &mut IndexWriter) -> Result<(), std::io::Error> { - match index_writer.commit() { - Ok(_) => { - return Ok(()); - } - Err(e) => { - return Err(Error::new( - ErrorKind::Other, - format!("Unable to commit -> {}", e), - )); - } - } -} - -fn commit(context: &mut ffi::Context) -> Result<(), std::io::Error> { - let index_writer = &mut context.tantivyContext.index_writer; - commit_(index_writer) -} - -fn add_document( - index_writer: &mut IndexWriter, - document: Document, - skip_commit: bool, -) -> Result<(), std::io::Error> { - match index_writer.add_document(document) { - Ok(_) => { - if skip_commit { - return Ok(()); - } else { - commit_(index_writer) - } - } - Err(e) => { - return Err(Error::new( - ErrorKind::Other, - format!("Unable to add document -> {}", e), - )); - } - } -} - -fn add( - context: &mut ffi::Context, - input: &ffi::DocumentInput, - skip_commit: bool, -) -> Result<(), std::io::Error> { - let schema = &context.tantivyContext.schema; - let index_writer = &mut context.tantivyContext.index_writer; - // TODO(gitbuda): schema.parse_document > TantivyDocument::parse_json (LATEST UNSTABLE) - let document = match schema.parse_document(&input.data) { - Ok(json) => json, - Err(e) => panic!("failed to parser metadata {}", e), - }; - add_document(index_writer, document, skip_commit) -} - -fn aggregate( - context: &mut ffi::Context, - input: &ffi::SearchInput, -) -> Result { - let index = &context.tantivyContext.index; - let schema = &context.tantivyContext.schema; - let reader = match index - .reader_builder() - .reload_policy(ReloadPolicy::OnCommit) - .try_into() - { - Ok(r) => r, - Err(e) => { - return Err(Error::new( - ErrorKind::Other, - format!("Unable to read (reader init failed): {}", e), - )); - } - }; - let data_field = schema.get_field("data").unwrap(); - let query_parser = QueryParser::for_index(index, vec![data_field]); - let query = match query_parser.parse_query(&input.search_query) { - Ok(q) => q, - Err(e) => { - return Err(Error::new( - ErrorKind::Other, - format!("Unable to create search query {}", e), - )); - } - }; - let searcher = 
-fn rollback(context: &mut ffi::Context) -> Result<(), std::io::Error> {
-    let index_writer = &mut context.tantivyContext.index_writer;
-    match index_writer.rollback() {
-        Ok(_) => {
-            return Ok(());
-        }
-        Err(e) => {
-            return Err(Error::new(
-                ErrorKind::Other,
-                format!("Unable to rollback -> {}", e),
-            ));
-        }
-    }
-}
-
-fn commit_(index_writer: &mut IndexWriter) -> Result<(), std::io::Error> {
-    match index_writer.commit() {
-        Ok(_) => {
-            return Ok(());
-        }
-        Err(e) => {
-            return Err(Error::new(
-                ErrorKind::Other,
-                format!("Unable to commit -> {}", e),
-            ));
-        }
-    }
-}
-
-fn commit(context: &mut ffi::Context) -> Result<(), std::io::Error> {
-    let index_writer = &mut context.tantivyContext.index_writer;
-    commit_(index_writer)
-}
-
-fn add_document(
-    index_writer: &mut IndexWriter,
-    document: Document,
-    skip_commit: bool,
-) -> Result<(), std::io::Error> {
-    match index_writer.add_document(document) {
-        Ok(_) => {
-            if skip_commit {
-                return Ok(());
-            } else {
-                commit_(index_writer)
-            }
-        }
-        Err(e) => {
-            return Err(Error::new(
-                ErrorKind::Other,
-                format!("Unable to add document -> {}", e),
-            ));
-        }
-    }
-}
-
-fn add(
-    context: &mut ffi::Context,
-    input: &ffi::DocumentInput,
-    skip_commit: bool,
-) -> Result<(), std::io::Error> {
-    let schema = &context.tantivyContext.schema;
-    let index_writer = &mut context.tantivyContext.index_writer;
-    // TODO(gitbuda): schema.parse_document > TantivyDocument::parse_json (LATEST UNSTABLE)
-    let document = match schema.parse_document(&input.data) {
-        Ok(json) => json,
-        Err(e) => panic!("failed to parser metadata {}", e),
-    };
-    add_document(index_writer, document, skip_commit)
-}
-
-fn aggregate(
-    context: &mut ffi::Context,
-    input: &ffi::SearchInput,
-) -> Result<ffi::DocumentOutput, std::io::Error> {
-    let index = &context.tantivyContext.index;
-    let schema = &context.tantivyContext.schema;
-    let reader = match index
-        .reader_builder()
-        .reload_policy(ReloadPolicy::OnCommit)
-        .try_into()
-    {
-        Ok(r) => r,
-        Err(e) => {
-            return Err(Error::new(
-                ErrorKind::Other,
-                format!("Unable to read (reader init failed): {}", e),
-            ));
-        }
-    };
-    let data_field = schema.get_field("data").unwrap();
-    let query_parser = QueryParser::for_index(index, vec![data_field]);
-    let query = match query_parser.parse_query(&input.search_query) {
-        Ok(q) => q,
-        Err(e) => {
-            return Err(Error::new(
-                ErrorKind::Other,
-                format!("Unable to create search query {}", e),
-            ));
-        }
-    };
-    let searcher = reader.searcher();
-    let agg_req: Aggregations = serde_json::from_str(&input.aggregation_query)?;
-    let collector = AggregationCollector::from_aggs(agg_req, Default::default());
-    let agg_res: AggregationResults = searcher.search(&query, &collector).unwrap();
-    let res: Value = serde_json::to_value(agg_res)?;
-    Ok(ffi::DocumentOutput {
-        data: res.to_string(),
-    })
-}
-
-fn find(
-    context: &mut ffi::Context,
-    input: &ffi::SearchInput,
-) -> Result<ffi::SearchOutput, std::io::Error> {
-    let index = &context.tantivyContext.index;
-    let schema = &context.tantivyContext.schema;
-    let reader = match index
-        .reader_builder()
-        .reload_policy(ReloadPolicy::OnCommit)
-        .try_into()
-    {
-        Ok(r) => r,
-        Err(e) => {
-            return Err(Error::new(
-                ErrorKind::Other,
-                format!("Unable to read (reader init failed): {}", e),
-            ));
-        }
-    };
-    let gid_field = schema.get_field("gid").unwrap();
-    let data_field = schema.get_field("data").unwrap();
-    let query_parser = QueryParser::for_index(index, vec![gid_field]);
-    let query = match query_parser.parse_query(&input.search_query) {
-        Ok(q) => q,
-        Err(e) => {
-            return Err(Error::new(
-                ErrorKind::Other,
-                format!("Unable to create search query {}", e),
-            ));
-        }
-    };
-    let top_docs = match reader.searcher().search(&query, &TopDocs::with_limit(10)) {
-        Ok(docs) => docs,
-        Err(_e) => {
-            return Err(Error::new(ErrorKind::Other, "Unable to perform search"));
-        }
-    };
-    let mut docs: Vec<ffi::DocumentOutput> = vec![];
-    for (_score, doc_address) in top_docs {
-        let doc = match reader.searcher().doc(doc_address) {
-            Ok(d) => d,
-            Err(_) => {
-                panic!("Unable to find document returned by the search query.");
-            }
-        };
-        let data = doc.get_first(data_field).unwrap().as_json().unwrap();
-        docs.push(ffi::DocumentOutput {
-            data: match to_string(&data) {
-                Ok(s) => s,
-                Err(_e) => {
-                    panic!("stored data not JSON");
-                }
-            },
-        });
-    }
-    Ok(ffi::SearchOutput { docs })
-}
-
-fn search(
-    context: &mut ffi::Context,
-    input: &ffi::SearchInput,
-) -> Result<ffi::SearchOutput, std::io::Error> {
-    let index = &context.tantivyContext.index;
-    let schema = &context.tantivyContext.schema;
-    let reader = match index
-        .reader_builder()
-        .reload_policy(ReloadPolicy::OnCommit)
-        .try_into()
-    {
-        Ok(r) => r,
-        Err(e) => {
-            return Err(Error::new(
-                ErrorKind::Other,
-                format!("Unable to read (reader init failed): {}", e),
-            ));
-        }
-    };
-    let metadata_field = schema.get_field("metadata").unwrap();
-    let data_field = schema.get_field("data").unwrap();
-    let query_parser = QueryParser::for_index(index, vec![metadata_field]);
-    let query = match query_parser.parse_query(&input.search_query) {
-        Ok(q) => q,
-        Err(e) => {
-            return Err(Error::new(
-                ErrorKind::Other,
-                format!("Unable to create search query {}", e),
-            ));
-        }
-    };
-    let top_docs = match reader.searcher().search(&query, &TopDocs::with_limit(10)) {
-        Ok(docs) => docs,
-        Err(_e) => {
-            return Err(Error::new(ErrorKind::Other, "Unable to perform search"));
-        }
-    };
-    let mut docs: Vec<ffi::DocumentOutput> = vec![];
-    for (_score, doc_address) in top_docs {
-        let doc = match reader.searcher().doc(doc_address) {
-            Ok(d) => d,
-            Err(_) => {
-                panic!("Unable to find document returned by the search query.");
-            }
-        };
-        // let metadata = doc.get_first(metadata_field).unwrap().as_json().unwrap();
-        let data = doc.get_first(data_field).unwrap().as_json().unwrap();
-        // let data = schema.to_json(&doc);
-        docs.push(ffi::DocumentOutput {
-            data: match to_string(&data) {
-                Ok(s) => s,
-                Err(_e) => {
-                    panic!("stored data not JSON");
-                }
-            },
-        });
-    }
-    Ok(ffi::SearchOutput { docs })
-}
-
-fn drop_index(name: &String) -> Result<(), std::io::Error> {
-    let index_path = std::path::Path::new(name);
-    if index_path.exists() {
-        match std::fs::remove_dir_all(index_path) {
-            Ok(_) => {
-                debug!("tantivy_index removed");
-            }
-            Err(_) => {
-                // panic!("Failed to remove tantivy_index folder {}", e);
-            }
-        }
-    } else {
-        debug!("tantivy_index folder doesn't exist");
-    }
-    Ok(())
-}
-
 fn init(_log_level: &String) -> Result<(), std::io::Error> {
+    // TODO(gitbuda): Used as library code inside a C++ application -> align logger format.
     let log_init_res = env_logger::try_init_from_env(
-        env_logger::Env::default().filter_or(env_logger::DEFAULT_FILTER_ENV, "info"),
+        env_logger::Env::default().filter_or(env_logger::DEFAULT_FILTER_ENV, "warn"),
     );
     // TODO(gitbuda): If more than one module tries to do this -> the later call might fail ->
     // in that case, this code should be adjusted (or the error should be ignored because the
@@ -352,54 +103,10 @@ fn init(_log_level: &String) -> Result<(), std::io::Error> {
     Ok(())
 }
 
-fn ensure_index_dir_structure(path: &String, schema: &Schema) -> Result<Index, std::io::Error> {
-    let index_path = std::path::Path::new(path);
-    if !index_path.exists() {
-        match std::fs::create_dir(index_path) {
-            Ok(_) => {
-                debug!("{:?} folder created", index_path);
-            }
-            Err(e) => {
-                return Err(Error::new(
-                    ErrorKind::Other,
-                    format!(
-                        "Failed to create {:?} text search index folder -> {}",
-                        index_path, e
-                    ),
-                ));
-            }
-        }
-    }
-    let mmap_directory = MmapDirectory::open(&index_path).unwrap();
-    // NOTE: If schema doesn't match, open_or_create is going to return an error.
-    let index = match Index::open_or_create(mmap_directory, schema.clone()) {
-        Ok(index) => index,
-        Err(e) => {
-            return Err(Error::new(
-                ErrorKind::Other,
-                format!(
-                    "Unable to initialize text search index under {:?} -> {}",
-                    index_path, e
-                ),
-            ));
-        }
-    };
-    Ok(index)
-}
-
-fn create_index_writter(index: &Index) -> Result<IndexWriter, std::io::Error> {
-    let index_writer: IndexWriter = match index.writer(50_000_000) {
-        Ok(writer) => writer,
-        Err(_e) => {
-            // TODO(gitbuda): This message won't be intuitive to the user -> rewrite.
-            return Err(Error::new(ErrorKind::Other, "Unable to initialize writer"));
-        }
-    };
-    Ok(index_writer)
-}
-
 // TODO(gitbuda): Implement full range of extract_schema options.
-fn extract_schema(mappings: &serde_json::Map<String, serde_json::Value>) -> Result<Schema, std::io::Error> {
+fn create_index_schema(
+    mappings: &serde_json::Map<String, serde_json::Value>,
+) -> Result<Schema, std::io::Error> {
     let mut schema_builder = Schema::builder();
     if let Some(properties) = mappings.get("properties") {
         if let Some(properties_map) = properties.as_object() {
@@ -540,24 +247,366 @@ fn extract_schema(mappings: &serde_json::Map<String, serde_json::Value>) -> Result<Schema, std::io::Error> {
+fn create_index_dir_structure(
+    path: &String,
+    schema: &Schema,
+) -> Result<(Index, std::path::PathBuf), std::io::Error> {
+    let index_path = std::path::Path::new(path);
+    if !index_path.exists() {
+        match std::fs::create_dir(index_path) {
+            Ok(_) => {
+                debug!("{:?} folder created", index_path);
+            }
+            Err(e) => {
+                return Err(Error::new(
+                    ErrorKind::Other,
+                    format!(
+                        "Failed to create {:?} text search index folder -> {}",
+                        index_path, e
+                    ),
+                ));
+            }
+        }
+    }
+    let mmap_directory = match MmapDirectory::open(&index_path) {
+        Ok(d) => d,
+        Err(e) => {
+            return Err(Error::new(
+                ErrorKind::Other,
+                format!(
+                    "Failed to mmap text search index folder at {:?} -> {}",
+                    index_path, e
+                ),
+            ));
+        }
+    };
+    // NOTE: If schema doesn't match, open_or_create is going to return an error.
+    let index = match Index::open_or_create(mmap_directory, schema.clone()) {
+        Ok(index) => index,
+        Err(e) => {
+            return Err(Error::new(
+                ErrorKind::Other,
+                format!(
+                    "Unable to initialize text search index under {:?} -> {}",
+                    index_path, e
+                ),
+            ));
+        }
+    };
+    Ok((index, index_path.to_path_buf()))
+}
+
 fn create_index(path: &String, config: &ffi::IndexConfig) -> Result<ffi::Context, std::io::Error> {
     let mappings = match serde_json::from_str::<serde_json::Map<String, serde_json::Value>>(&config.mappings) {
         Ok(r) => r,
         Err(e) => {
             return Err(Error::new(
                 ErrorKind::Other,
-                format!("Unable to parse mappings: {}", e),
+                format!("Unable to parse mappings for index at {} -> {}", path, e),
             ));
         }
     };
-    let schema = extract_schema(&mappings)?;
-    let index = ensure_index_dir_structure(path, &schema)?;
-    let index_writer = create_index_writter(&index)?;
+    let schema = create_index_schema(&mappings)?;
+    let (index, path) = create_index_dir_structure(path, &schema)?;
+    let index_writer: IndexWriter = match index.writer(50_000_000) {
+        Ok(writer) => writer,
+        Err(e) => {
+            return Err(Error::new(ErrorKind::Other, format!("Unable to initialize {:?} text search index writer -> {}. This happened during index creation. Make sure the underlying machine is properly configured and try to execute create index again.", path, e)));
+        }
+    };
     Ok(ffi::Context {
         tantivyContext: Box::new(TantivyContext {
+            index_path: path,
             schema,
             index,
             index_writer,
         }),
     })
 }
+
+fn add(
+    context: &mut ffi::Context,
+    input: &ffi::DocumentInput,
+    skip_commit: bool,
+) -> Result<(), std::io::Error> {
+    let index_path = &context.tantivyContext.index_path;
+    let schema = &context.tantivyContext.schema;
+    let index_writer = &mut context.tantivyContext.index_writer;
+    // TODO(gitbuda): schema.parse_document > TantivyDocument::parse_json (LATEST UNSTABLE)
+    let document = match schema.parse_document(&input.data) {
+        Ok(json) => json,
+        Err(e) => {
+            return Err(Error::new(
+                ErrorKind::Other,
+                format!(
+                    "Unable to add document into text search index {:?} because schema doesn't match -> {}. Please check mappings.",
+                    index_path, e
+                ),
+            ));
+        }
+    };
+    match index_writer.add_document(document) {
+        Ok(_) => {
+            if skip_commit {
+                return Ok(());
+            } else {
+                commit(context)
+            }
+        }
+        Err(e) => {
+            return Err(Error::new(
+                ErrorKind::Other,
+                format!("Unable to add document -> {}", e),
+            ));
+        }
+    }
+}
+
+fn commit(context: &mut ffi::Context) -> Result<(), std::io::Error> {
+    let index_writer = &mut context.tantivyContext.index_writer;
+    let index_path = &context.tantivyContext.index_path;
+    match index_writer.commit() {
+        Ok(_) => {
+            return Ok(());
+        }
+        Err(e) => {
+            return Err(Error::new(
+                ErrorKind::Other,
+                format!(
+                    "Unable to commit text search index at {:?} -> {}",
+                    index_path, e
+                ),
+            ));
+        }
+    }
+}
+
+fn rollback(context: &mut ffi::Context) -> Result<(), std::io::Error> {
+    let index_writer = &mut context.tantivyContext.index_writer;
+    let index_path = &context.tantivyContext.index_path;
+    match index_writer.rollback() {
+        Ok(_) => {
+            return Ok(());
+        }
+        Err(e) => {
+            return Err(Error::new(
+                ErrorKind::Other,
+                format!(
+                    "Unable to rollback text search index at {:?} -> {}",
+                    index_path, e
+                ),
+            ));
+        }
+    }
+}
+
+fn search_get_fields(
+    fields: &Vec<String>,
+    schema: &Schema,
+    index_path: &std::path::PathBuf,
+) -> Result<Vec<Field>, std::io::Error> {
+    let mut result: Vec<Field> = Vec::new();
+    result.reserve(fields.len());
+    for name in fields {
+        match schema.get_field(name) {
+            Ok(f) => result.push(f),
+            Err(e) => {
+                return Err(Error::new(
+                    ErrorKind::Other,
+                    format!("{} inside {:?} text search index", e, index_path),
+                ));
+            }
+        }
+    }
+    Ok(result)
+}
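+
+/// Runs search_query against search_fields and, for every hit, serializes the
+/// requested return_fields into the JSON payload of a DocumentOutput.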
+fn search(
+    context: &mut ffi::Context,
+    input: &ffi::SearchInput,
+) -> Result<ffi::SearchOutput, std::io::Error> {
+    let index_path = &context.tantivyContext.index_path;
+    let index = &context.tantivyContext.index;
+    let schema = &context.tantivyContext.schema;
+    let reader = match index
+        .reader_builder()
+        .reload_policy(ReloadPolicy::OnCommit)
+        .try_into()
+    {
+        Ok(r) => r,
+        Err(e) => {
+            return Err(Error::new(
+                ErrorKind::Other,
+                format!(
+                    "Unable to read (reader for {:?} text search index failed) -> {}",
+                    index_path, e
+                ),
+            ));
+        }
+    };
+    let search_fields = search_get_fields(&input.search_fields, schema, index_path)?;
+    let return_fields = search_get_fields(&input.return_fields, schema, index_path)?;
+    let query_parser = QueryParser::for_index(index, search_fields);
+    let query = match query_parser.parse_query(&input.search_query) {
+        Ok(q) => q,
+        Err(e) => {
+            return Err(Error::new(
+                ErrorKind::Other,
+                format!(
+                    "Unable to create search query for {:?} text search index -> {}",
+                    index_path, e
+                ),
+            ));
+        }
+    };
+    // TODO(gitbuda): Replace hardcoded limit 10 inside the search function.
+    let top_docs = match reader.searcher().search(&query, &TopDocs::with_limit(10)) {
+        Ok(docs) => docs,
+        Err(e) => {
+            return Err(Error::new(
+                ErrorKind::Other,
+                format!(
+                    "Unable to perform text search under {:?} -> {}",
+                    index_path, e
+                ),
+            ));
+        }
+    };
+    let mut docs: Vec<ffi::DocumentOutput> = vec![];
+    for (_score, doc_address) in top_docs {
+        let doc = match reader.searcher().doc(doc_address) {
+            Ok(d) => d,
+            Err(e) => {
+                return Err(Error::new(
+                    ErrorKind::Other,
+                    format!(
+                        "Unable to find document inside {:?} text search index -> {}",
+                        index_path, e
+                    ),
+                ));
+            }
+        };
+        let mut data: serde_json::Map<String, serde_json::Value> = serde_json::Map::new();
+        for (name, field) in input.return_fields.iter().zip(return_fields.iter()) {
+            let field_data = match doc.get_first(*field) {
+                Some(f) => f,
+                None => continue,
+            };
+            // TODO(gitbuda): Shouldn't just be JSON -> deduce from mappings!
+            let field_as_tantivy_json = match field_data.as_json() {
+                Some(f) => f,
+                None => {
+                    // TODO(gitbuda): Is error here the best?
+                    return Err(Error::new(
+                        ErrorKind::Other,
+                        format!("Unable to convert field data to json"),
+                    ));
+                }
+            };
+            let field_as_json = match serde_json::to_value(field_as_tantivy_json) {
+                Ok(f) => f,
+                Err(_) => {
+                    return Err(Error::new(
+                        ErrorKind::Other,
+                        format!("Unable to convert field data to json"),
+                    ));
+                }
+            };
+            data.insert(name.to_string(), field_as_json);
+        }
+        docs.push(ffi::DocumentOutput {
+            data: match to_string(&data) {
+                Ok(s) => s,
+                Err(e) => {
+                    return Err(Error::new(
+                        ErrorKind::Other,
+                        format!(
+                            "Unable to serialize {:?} text search index data into a string -> {}",
+                            index_path, e
+                        ),
+                    ));
+                }
+            },
+        });
+    }
+    Ok(ffi::SearchOutput { docs })
+}
+
+fn aggregate(
+    context: &mut ffi::Context,
+    input: &ffi::SearchInput,
+) -> Result<ffi::DocumentOutput, std::io::Error> {
+    let index_path = &context.tantivyContext.index_path;
+    let index = &context.tantivyContext.index;
+    let schema = &context.tantivyContext.schema;
+    let reader = match index
+        .reader_builder()
+        .reload_policy(ReloadPolicy::OnCommit)
+        .try_into()
+    {
+        Ok(r) => r,
+        Err(e) => {
+            return Err(Error::new(
+                ErrorKind::Other,
+                format!("Unable to read (reader init failed): {}", e),
+            ));
+        }
+    };
+    let search_fields = search_get_fields(&input.search_fields, schema, index_path)?;
+    let query_parser = QueryParser::for_index(index, search_fields);
+    let query = match query_parser.parse_query(&input.search_query) {
+        Ok(q) => q,
+        Err(e) => {
+            return Err(Error::new(
+                ErrorKind::Other,
+                format!("Unable to create search query {}", e),
+            ));
+        }
+    };
+    let searcher = reader.searcher();
+    let agg_req: Aggregations = serde_json::from_str(&input.aggregation_query)?;
+    let collector = AggregationCollector::from_aggs(agg_req, Default::default());
+    let agg_res: AggregationResults = match searcher.search(&query, &collector) {
+        Ok(r) => r,
+        Err(e) => {
+            return Err(Error::new(
+                ErrorKind::Other,
+                format!(
+                    "Failed to gather aggregation results for {:?} text search index -> {}",
+                    index_path, e
+                ),
+            ));
+        }
+    };
+    let res: Value = serde_json::to_value(agg_res)?;
+    Ok(ffi::DocumentOutput {
+        data: res.to_string(),
+    })
+}
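+
+// An example aggregation_query value (tantivy aggregation JSON), as exercised
+// by test_unit.cpp:
+//   {"count": {"value_count": {"field": "metadata.txid"}}}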
+
+/// Removes the underlying data on disk.
+/// NOTE: Before executing this operation, make sure no code is actively using
+/// the underlying index.
+fn drop_index(path: &String) -> Result<(), std::io::Error> {
+    let index_path = std::path::Path::new(path);
+    if index_path.exists() {
+        match std::fs::remove_dir_all(index_path) {
+            Ok(_) => {
+                debug!("Text search index at {:?} removed", index_path);
+            }
+            Err(e) => {
+                return Err(Error::new(
+                    ErrorKind::Other,
+                    format!(
+                        "Failed to remove underlying text search index folder -> {}",
+                        e
+                    ),
+                ));
+            }
+        }
+    } else {
+        debug!("Index at {:?} does NOT exist", index_path);
+    }
+    Ok(())
+}
diff --git a/text_search/test_bench.cpp b/text_search/test_bench.cpp
index 2f433f5..f623d9c 100644
--- a/text_search/test_bench.cpp
+++ b/text_search/test_bench.cpp
@@ -95,7 +95,10 @@ BENCHMARK_DEFINE_F(MyFixture1, BM_BenchLookup)(benchmark::State &state) {
   memcxx::text_search::commit(*context);
 
   memcxx::text_search::SearchInput search_input = {
-      .search_query = fmt::format("metadata.gid:{}", 0)};
+      .search_fields = {"metadata"},
+      .search_query = fmt::format("metadata.gid:{}", 0),
+      .return_fields = {"data"},
+  };
   for (auto _ : state) {
     auto result = memcxx::text_search::search(*context, search_input);
     if (result.docs.size() < 1) {
@@ -112,10 +115,13 @@ BENCHMARK_DEFINE_F(MyFixture2, BM_BenchLookup)(benchmark::State &state) {
   }
   memcxx::text_search::commit(*context);
 
-  memcxx::text_search::SearchInput search_input = {.search_query =
-                                                       fmt::format("{}", 0)};
+  memcxx::text_search::SearchInput search_input = {
+      .search_fields = {"gid"},
+      .search_query = fmt::format("{}", 0),
+      .return_fields = {"data"},
+  };
   for (auto _ : state) {
-    auto result = memcxx::text_search::find(*context, search_input);
+    auto result = memcxx::text_search::search(*context, search_input);
     if (result.docs.size() < 1) {
       std::exit(1);
     }
diff --git a/text_search/test_unit.cpp b/text_search/test_unit.cpp
index 0790c63..a110f43 100644
--- a/text_search/test_unit.cpp
+++ b/text_search/test_unit.cpp
@@ -18,8 +18,10 @@ TEST(text_search_test_case, simple_test1) {
     });
   }
 
-  memcxx::text_search::SearchInput search_input = {.search_query =
-                                                       "data.key1:AWESOME"};
+  memcxx::text_search::SearchInput search_input = {
+      .search_fields = {"metadata"},
+      .search_query = "data.key1:AWESOME",
+      .return_fields = {"data"}};
 
   auto result1 = measure_time_diff("search1", [&]() {
     return memcxx::text_search::search(context, search_input);
@@ -36,12 +38,13 @@ TEST(text_search_test_case, simple_test1) {
 
     nlohmann::json aggregation_query = {};
     aggregation_query["count"]["value_count"]["field"] = "metadata.txid";
-    memcxx::text_search::SearchInput aggregate = {
+    memcxx::text_search::SearchInput aggregate_input = {
+        .search_fields = {"data"},
         .search_query = "data.key1:AWESOME",
         .aggregation_query = aggregation_query.dump(),
     };
     auto aggregation_result = nlohmann::json::parse(
-        memcxx::text_search::aggregate(context, aggregate).data);
+        memcxx::text_search::aggregate(context, aggregate_input).data);
     EXPECT_NEAR(aggregation_result["count"]["value"], 5, 1e-6);
     std::cout << aggregation_result << std::endl;
   } catch (const ::rust::Error &error) {
@@ -65,9 +68,11 @@ TEST(text_search_test_case, simple_test2) {
     });
   }
 
-  memcxx::text_search::SearchInput search_input = {.search_query =
-                                                       fmt::format("{}", 0)};
-  auto result = memcxx::text_search::find(context, search_input);
+  memcxx::text_search::SearchInput search_input = {.search_fields = {"gid"},
+                                                   .search_query =
+                                                       fmt::format("{}", 0),
+                                                   .return_fields = {"data"}};
+  auto result = memcxx::text_search::search(context, search_input);
   ASSERT_EQ(result.docs.size(), 1);
   for (const auto &doc : result.docs) {
     std::cout << doc << std::endl;
@@ -92,15 +97,21 @@ TEST(text_search_test_case, mappings) {
         {"type", "json"}, {"stored", true}, {"text", true}, {"fast", true}};
     mappings["properties"]["prop4"] = {
         {"type", "bool"}, {"stored", true}, {"text", true}, {"fast", true}};
-    memcxx::text_search::create_index(
+    auto context = memcxx::text_search::create_index(
         index_name,
         memcxx::text_search::IndexConfig{.mappings = mappings.dump()});
     // NOTE: This test just verifies the code can be called, add deeper test
     // when improving extract_schema.
     // TODO(gitbuda): Implement full range of extract_schema options.
+    memcxx::text_search::SearchInput search_input = {
+        .search_fields = {"prop1000"},
+        .search_query = "bla",
+        .return_fields = {"data"}};
+    memcxx::text_search::search(context, search_input);
   } catch (const ::rust::Error &error) {
     std::cout << error.what() << std::endl;
-    FAIL();
+    EXPECT_STREQ(error.what(), "The field does not exist: 'prop1000' inside "
+                               "\"tantivy_index_mappings\" text search index");
   }
 }