From 14e53ea2cfa51ecb2a7863c2d641632a2d08fbc2 Mon Sep 17 00:00:00 2001 From: Tor Egge Date: Mon, 24 Jun 2019 15:18:28 +0200 Subject: [PATCH] Cap interleaved features in memory index (field_length, num_occs) to prevent them wrapping around to low values. Cap reconstucted interleaved features the same way. Use interleaved features from memory index when writing disk index. --- .../field_index/field_index_test.cpp | 11 ++++++ .../vespa/searchlib/diskindex/fieldreader.cpp | 7 ++-- .../searchlib/memoryindex/field_index.cpp | 34 ++++++------------- .../ordered_field_index_inserter.cpp | 6 ++-- 4 files changed, 30 insertions(+), 28 deletions(-) diff --git a/searchlib/src/tests/memoryindex/field_index/field_index_test.cpp b/searchlib/src/tests/memoryindex/field_index/field_index_test.cpp index 7f2014a22072..ac1735e0549f 100644 --- a/searchlib/src/tests/memoryindex/field_index/field_index_test.cpp +++ b/searchlib/src/tests/memoryindex/field_index/field_index_test.cpp @@ -679,6 +679,17 @@ TEST_F(FieldIndexInterleavedFeaturesTest, no_features_are_unpacked) expect_features_unpacked("{1000000:}", 0, 0); } +TEST_F(FieldIndexInterleavedFeaturesTest, interleaved_features_are_capped) +{ + FeatureStore::DecodeContextCooked decoder(nullptr); + WrapInserter(idx).word("b").add(11, getFeatures(66001, 66000)).flush(); + auto itr = this->idx.find("b"); + EXPECT_EQ(11, itr.getKey()); + auto &entry = itr.getData(); + EXPECT_EQ(std::numeric_limits::max(), entry.get_num_occs()); + EXPECT_EQ(std::numeric_limits::max(), entry.get_field_length()); +} + Schema make_multi_field_schema() { diff --git a/searchlib/src/vespa/searchlib/diskindex/fieldreader.cpp b/searchlib/src/vespa/searchlib/diskindex/fieldreader.cpp index aeb4e2976b5d..ab77a6bfd082 100644 --- a/searchlib/src/vespa/searchlib/diskindex/fieldreader.cpp +++ b/searchlib/src/vespa/searchlib/diskindex/fieldreader.cpp @@ -17,6 +17,8 @@ namespace { vespalib::string PosOccIdCooked = "PosOcc.3.Cooked"; vespalib::string interleaved_features("interleaved_features"); +uint16_t cap_u16(uint32_t val) { return std::min(val, static_cast(std::numeric_limits::max())); } + } using vespalib::getLastErrorString; @@ -346,8 +348,9 @@ FieldReaderStripInfo::read() if (_hasElements && _field_length_scanner) { field_length = _field_length_scanner->get_field_length(features.doc_id()); } - features.set_field_length(field_length); - features.set_num_occs(num_occs); + // cap interleaved features to 16 bits each, to match memory index + features.set_field_length(cap_u16(field_length)); + features.set_num_occs(cap_u16(num_occs)); } } diff --git a/searchlib/src/vespa/searchlib/memoryindex/field_index.cpp b/searchlib/src/vespa/searchlib/memoryindex/field_index.cpp index 10cd200bdb00..352fd8606422 100644 --- a/searchlib/src/vespa/searchlib/memoryindex/field_index.cpp +++ b/searchlib/src/vespa/searchlib/memoryindex/field_index.cpp @@ -32,20 +32,6 @@ using vespalib::GenerationHandler; namespace search::memoryindex { -namespace { - -void set_interleaved_features(DocIdAndFeatures &features) -{ - // Set cheap features based on normal features. - // TODO: Update when proper cheap features are present in memory index. - assert(!features.elements().empty()); - const auto &element = features.elements().front(); - features.set_field_length(element.getElementLen()); - features.set_num_occs(element.getNumOccs()); -} - -} - using datastore::EntryRef; template @@ -194,12 +180,12 @@ FieldIndex::dump(search::index::IndexBuilder & indexBuilde auto pitr = tree->begin(_postingListStore.getAllocator()); assert(pitr.valid()); for (; pitr.valid(); ++pitr) { - uint32_t docId = pitr.getKey(); - EntryRef featureRef(pitr.getData().get_features()); - _featureStore.setupForReadFeatures(featureRef, decoder); + features.set_doc_id(pitr.getKey()); + const PostingListEntryType &entry(pitr.getData()); + features.set_num_occs(entry.get_num_occs()); + features.set_field_length(entry.get_field_length()); + _featureStore.setupForReadFeatures(entry.get_features(), decoder); decoder.readFeatures(features); - features.set_doc_id(docId); - set_interleaved_features(features); indexBuilder.add_document(features); } } else { @@ -207,12 +193,12 @@ FieldIndex::dump(search::index::IndexBuilder & indexBuilde _postingListStore.getKeyDataEntry(plist, clusterSize); const PostingListKeyDataType *kde = kd + clusterSize; for (; kd != kde; ++kd) { - uint32_t docId = kd->_key; - EntryRef featureRef(kd->getData().get_features()); - _featureStore.setupForReadFeatures(featureRef, decoder); + features.set_doc_id(kd->_key); + const PostingListEntryType &entry(kd->getData()); + features.set_num_occs(entry.get_num_occs()); + features.set_field_length(entry.get_field_length()); + _featureStore.setupForReadFeatures(entry.get_features(), decoder); decoder.readFeatures(features); - features.set_doc_id(docId); - set_interleaved_features(features); indexBuilder.add_document(features); } } diff --git a/searchlib/src/vespa/searchlib/memoryindex/ordered_field_index_inserter.cpp b/searchlib/src/vespa/searchlib/memoryindex/ordered_field_index_inserter.cpp index c75087e85775..4eda7cf48bda 100644 --- a/searchlib/src/vespa/searchlib/memoryindex/ordered_field_index_inserter.cpp +++ b/searchlib/src/vespa/searchlib/memoryindex/ordered_field_index_inserter.cpp @@ -25,6 +25,8 @@ namespace { const vespalib::string emptyWord = ""; +uint16_t cap_u16(uint32_t val) { return std::min(val, static_cast(std::numeric_limits::max())); } + } template @@ -119,8 +121,8 @@ OrderedFieldIndexInserter::add(uint32_t docId, (_prevDocId == docId && !_prevAdd)); datastore::EntryRef featureRef = _fieldIndex.addFeatures(features); _adds.push_back(PostingListKeyDataType(docId, PostingListEntryType(featureRef, - features.num_occs(), - features.field_length()))); + cap_u16(features.num_occs()), + cap_u16(features.field_length())))); _listener.insert(_dItr.getKey()._wordRef, docId); _prevDocId = docId; _prevAdd = true;