diff --git a/.github/wordlist.txt b/.github/wordlist.txt index 8851c5f67..0bd257ec1 100644 --- a/.github/wordlist.txt +++ b/.github/wordlist.txt @@ -1,29 +1,51 @@ +AVX +BatchIterator +DQ +Datatypes +FP HDF HNSW +KNN +RediSearch +RedisAI +SIMD TBD +TopK VSCode +VecSimBasics +VecSimGeneral +VecSimUpdatedIndex VectorSimilarity ZSH +allocators +ann benchmarked benchmarking -byndings +bm cmake +cpp +dataset +datasets +destructor devcontainer dir +enum +fp +frac gcc github gnist hnsw +hnswlib mnist neighbor pre py repo +runtime +templated tox valgrind +vecsim virtualenv whl -datasets -runtime -RedisAI -dataset diff --git a/.github/workflows/arm.yml b/.github/workflows/arm.yml index 1f831ff00..fea7e637c 100644 --- a/.github/workflows/arm.yml +++ b/.github/workflows/arm.yml @@ -24,7 +24,7 @@ jobs: github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }} # Ubuntu 22.04 region AMI for ARM ec2-image-id: ami-062b37d89f25c958f - ec2-instance-type: t4g.small + ec2-instance-type: t4g.medium subnet-id: ${{ secrets.AWS_EC2_SUBNET_ID }} security-group-id: ${{ secrets.AWS_EC2_SG_ID }} diff --git a/Makefile b/Makefile index 55c39d647..9e144dd3d 100644 --- a/Makefile +++ b/Makefile @@ -71,6 +71,7 @@ make clean # remove binary files make unit_test # run unit tests CTEST_ARGS=args # extra CTest arguments VG|VALGRIND=1 # run tests with valgrind + FP_64=1 # run tests with 64-bit floating point make valgrind # build for Valgrind and run tests make flow_test # run flow tests (with pytest) TEST=file::name # run specific test @@ -124,6 +125,11 @@ ifeq ($(VERBOSE),1) CMAKE_FLAGS += -DCMAKE_VERBOSE_MAKEFILE=on endif +# CMake flags for fp64 unit tests +ifeq ($(FP_64),1) +CMAKE_FLAGS += -DFP64_TESTS=on +endif + CMAKE_FLAGS += \ -Wno-deprecated \ -DCMAKE_WARN_DEPRECATED=OFF \ diff --git a/setup.py b/setup.py index 9da9ca609..8d3819388 100644 --- a/setup.py +++ b/setup.py @@ -92,6 +92,5 @@ def build_extension(self, ext): description="Python library around collection of vector similarity algorithm", long_description="", ext_modules=[CMakeExtension("VecSim", "src/python_bindings")], - py_modules=['src/python_bindings/Mybytearray'], cmdclass={"build_ext": CMakeBuild} ) diff --git a/src/VecSim/CMakeLists.txt b/src/VecSim/CMakeLists.txt index 5967e3c5e..54986b9ff 100644 --- a/src/VecSim/CMakeLists.txt +++ b/src/VecSim/CMakeLists.txt @@ -15,8 +15,10 @@ add_subdirectory(spaces) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror -Wall") add_library(VectorSimilarity ${VECSIM_LIBTYPE} - algorithms/brute_force/brute_force_factory.cpp - algorithms/hnsw/hnsw_factory.cpp + index_factories/brute_force_factory.cpp + index_factories/hnsw_factory.cpp + index_factories/tiered_factory.cpp + index_factories/index_factory.cpp algorithms/brute_force/vector_block.cpp algorithms/hnsw/visited_nodes_handler.cpp vec_sim.cpp diff --git a/src/VecSim/algorithms/brute_force/bf_batch_iterator.h b/src/VecSim/algorithms/brute_force/bf_batch_iterator.h index 68ea25c9e..76e47ff87 100644 --- a/src/VecSim/algorithms/brute_force/bf_batch_iterator.h +++ b/src/VecSim/algorithms/brute_force/bf_batch_iterator.h @@ -23,6 +23,8 @@ template class BF_BatchIterator : public VecSimBatchIterator { protected: const BruteForceIndex *index; + size_t index_label_count; // number of labels in the index when calculating the scores, + // which is the only time we access the index. vecsim_stl::vector> scores; // vector of scores for every label. size_t scores_valid_start_pos; // the first index in the scores vector that contains a vector // that hasn't been returned already. @@ -56,13 +58,15 @@ template VecSimQueryResult_List BF_BatchIterator::searchByHeuristics(size_t n_res, VecSimQueryResult_Order order) { - if ((this->index->indexLabelCount() - this->getResultsCount()) / 1000 > n_res) { + if ((this->index_label_count - this->getResultsCount()) / 1000 > n_res) { // Heap based search always returns the results ordered by score return this->heapBasedSearch(n_res); } VecSimQueryResult_List rl = this->selectBasedSearch(n_res); if (order == BY_SCORE) { sort_results_by_score(rl); + } else if (order == BY_SCORE_THEN_ID) { + sort_results_by_score_then_id(rl); } return rl; } @@ -167,17 +171,17 @@ BF_BatchIterator::BF_BatchIterator( void *query_vector, const BruteForceIndex *bf_index, VecSimQueryParams *queryParams, std::shared_ptr allocator) : VecSimBatchIterator(query_vector, queryParams ? queryParams->timeoutCtx : nullptr, allocator), - index(bf_index), scores(allocator), scores_valid_start_pos(0) {} + index(bf_index), index_label_count(index->indexLabelCount()), scores(allocator), + scores_valid_start_pos(0) {} template VecSimQueryResult_List BF_BatchIterator::getNextResults(size_t n_res, VecSimQueryResult_Order order) { - assert((order == BY_ID || order == BY_SCORE) && - "Possible order values are only 'BY_ID' or 'BY_SCORE'"); // Only in the first iteration we need to compute all the scores if (this->scores.empty()) { assert(getResultsCount() == 0); + // The only time we access the index. This function also updates the iterator's label count. auto rc = calculateScores(); if (VecSim_OK != rc) { @@ -198,8 +202,8 @@ BF_BatchIterator::getNextResults(size_t n_res, VecSimQueryRe template bool BF_BatchIterator::isDepleted() { - assert(this->getResultsCount() <= this->index->indexLabelCount()); - bool depleted = this->getResultsCount() == this->index->indexLabelCount(); + assert(this->getResultsCount() <= this->index_label_count); + bool depleted = this->getResultsCount() == this->index_label_count; return depleted; } diff --git a/src/VecSim/algorithms/brute_force/bfm_batch_iterator.h b/src/VecSim/algorithms/brute_force/bfm_batch_iterator.h index 5fa4bb497..ff4bc8e75 100644 --- a/src/VecSim/algorithms/brute_force/bfm_batch_iterator.h +++ b/src/VecSim/algorithms/brute_force/bfm_batch_iterator.h @@ -20,9 +20,9 @@ class BFM_BatchIterator : public BF_BatchIterator { private: inline VecSimQueryResult_Code calculateScores() override { - - this->scores.reserve(this->index->indexLabelCount()); - vecsim_stl::unordered_map tmp_scores(this->index->indexLabelCount(), + this->index_label_count = this->index->indexLabelCount(); + this->scores.reserve(this->index_label_count); + vecsim_stl::unordered_map tmp_scores(this->index_label_count, this->allocator); vecsim_stl::vector blocks = this->index->getVectorBlocks(); VecSimQueryResult_Code rc; diff --git a/src/VecSim/algorithms/brute_force/bfs_batch_iterator.h b/src/VecSim/algorithms/brute_force/bfs_batch_iterator.h index dac7d3819..9e77d1a7e 100644 --- a/src/VecSim/algorithms/brute_force/bfs_batch_iterator.h +++ b/src/VecSim/algorithms/brute_force/bfs_batch_iterator.h @@ -20,8 +20,8 @@ class BFS_BatchIterator : public BF_BatchIterator { private: inline VecSimQueryResult_Code calculateScores() override { - - this->scores.reserve(this->index->indexLabelCount()); + this->index_label_count = this->index->indexLabelCount(); + this->scores.reserve(this->index_label_count); vecsim_stl::vector blocks = this->index->getVectorBlocks(); VecSimQueryResult_Code rc; diff --git a/src/VecSim/algorithms/brute_force/brute_force.h b/src/VecSim/algorithms/brute_force/brute_force.h index 4ff435451..0058f2b2c 100644 --- a/src/VecSim/algorithms/brute_force/brute_force.h +++ b/src/VecSim/algorithms/brute_force/brute_force.h @@ -11,7 +11,7 @@ #include "VecSim/spaces/spaces.h" #include "VecSim/utils/vecsim_stl.h" #include "VecSim/utils/vecsim_results_container.h" -#include "VecSim/algorithms/brute_force/brute_force_factory.h" +#include "VecSim/index_factories/brute_force_factory.h" #include "VecSim/spaces/spaces.h" #include "VecSim/query_result_struct.h" #include "VecSim/utils/vec_utils.h" @@ -33,7 +33,7 @@ class BruteForceIndex : public VecSimIndexAbstract { idType count; public: - BruteForceIndex(const BFParams *params, std::shared_ptr allocator); + BruteForceIndex(const BFParams *params, const AbstractIndexInitParams &abstractInitParams); size_t indexSize() const override; size_t indexCapacity() const override; @@ -45,18 +45,46 @@ class BruteForceIndex : public VecSimIndexAbstract { return (DataType *)vectorBlocks.at(id / this->blockSize)->getVector(id % this->blockSize); } virtual VecSimQueryResult_List topKQuery(const void *queryBlob, size_t k, - VecSimQueryParams *queryParams) override; - VecSimQueryResult_List rangeQuery(const void *queryBlob, double radius, - VecSimQueryParams *queryParams) override; + VecSimQueryParams *queryParams) const override; + virtual VecSimQueryResult_List rangeQuery(const void *queryBlob, double radius, + VecSimQueryParams *queryParams) const override; virtual VecSimIndexInfo info() const override; virtual VecSimInfoIterator *infoIterator() const override; + VecSimIndexBasicInfo basicInfo() const override; virtual VecSimBatchIterator *newBatchIterator(const void *queryBlob, VecSimQueryParams *queryParams) const override; - bool preferAdHocSearch(size_t subsetSize, size_t k, bool initial_check) override; + bool preferAdHocSearch(size_t subsetSize, size_t k, bool initial_check) const override; inline labelType getVectorLabel(idType id) const { return idToLabelMapping.at(id); } inline vecsim_stl::vector getVectorBlocks() const { return vectorBlocks; } + inline const labelType getLabelByInternalId(idType internal_id) const { + return idToLabelMapping.at(internal_id); + } + // Remove a specific vector that is stored under a label from the index by its internal id. + virtual int deleteVectorById(labelType label, idType id) = 0; + // Remove a vector and return a map between internal ids and the original internal ids of the + // vector that they hold as a result of the overall removals and swaps, along with its label. + virtual std::unordered_map> + deleteVectorAndGetUpdatedIds(labelType label) = 0; + // Check if a certain label exists in the index. + virtual inline bool isLabelExists(labelType label) = 0; + // Return a set of all labels that are stored in the index (helper for computing label count + // without duplicates in tiered index). Caller should hold the flat buffer lock for read. + virtual inline vecsim_stl::set getLabelsSet() const = 0; + virtual ~BruteForceIndex(); +#ifdef BUILD_TESTS + /** + * @brief Used for testing - store vector(s) data associated with a given label. This function + * copies the vector(s)' data buffer(s) and place it in the output vector + * + * @param label + * @param vectors_output empty vector to be modified, should store the blob(s) associated with + * the label. + */ + virtual void getDataByLabel(labelType label, + std::vector> &vectors_output) const = 0; +#endif protected: // Private internal function that implements generic single vector insertion. @@ -74,7 +102,7 @@ class BruteForceIndex : public VecSimIndexAbstract { } // inline priority queue getter that need to be implemented by derived class virtual inline vecsim_stl::abstract_priority_queue * - getNewMaxPriorityQueue() = 0; + getNewMaxPriorityQueue() const = 0; // inline label to id setters that need to be implemented by derived class virtual inline std::unique_ptr @@ -96,11 +124,10 @@ class BruteForceIndex : public VecSimIndexAbstract { /******************** Ctor / Dtor **************/ template -BruteForceIndex::BruteForceIndex(const BFParams *params, - std::shared_ptr allocator) - : VecSimIndexAbstract(allocator, params->dim, params->type, params->metric, - params->blockSize, params->multi), - idToLabelMapping(allocator), vectorBlocks(allocator), count(0) { +BruteForceIndex::BruteForceIndex( + const BFParams *params, const AbstractIndexInitParams &abstractInitParams) + : VecSimIndexAbstract(abstractInitParams), idToLabelMapping(this->allocator), + vectorBlocks(this->allocator), count(0) { assert(VecSimType_sizeof(this->vecType) == sizeof(DataType)); this->idToLabelMapping.resize(params->initialCapacity); } @@ -135,6 +162,7 @@ void BruteForceIndex::appendVector(const void *vector_data, size_t last_block_vectors_count = id % this->blockSize; this->idToLabelMapping.resize( idToLabelMapping_size + this->blockSize - last_block_vectors_count, 0); + this->idToLabelMapping.shrink_to_fit(); } // add label to idToLabelMapping @@ -160,6 +188,7 @@ void BruteForceIndex::removeVector(idType id_to_delete) { // If we are *not* trying to remove the last vector, update mapping and move // the data of the last vector in the index in place of the deleted vector. if (id_to_delete != last_idx) { + assert(id_to_delete < last_idx); // Update idToLabelMapping. // Put the label of the last_id in the deleted_id. setVectorLabel(id_to_delete, last_idx_label); @@ -184,10 +213,11 @@ void BruteForceIndex::removeVector(idType id_to_delete) { // Resize and align the idToLabelMapping. size_t idToLabel_size = idToLabelMapping.size(); // If the new size is smaller by at least one block comparing to the idToLabelMapping - // align to be a multiplication of blocksize and resize by one block. + // align to be a multiplication of block size and resize by one block. if (this->count + this->blockSize <= idToLabel_size) { size_t vector_to_align_count = idToLabel_size % this->blockSize; this->idToLabelMapping.resize(idToLabel_size - this->blockSize - vector_to_align_count); + this->idToLabelMapping.shrink_to_fit(); } } } @@ -230,7 +260,7 @@ vecsim_stl::vector BruteForceIndex::computeBlockSc template VecSimQueryResult_List BruteForceIndex::topKQuery(const void *queryBlob, size_t k, - VecSimQueryParams *queryParams) { + VecSimQueryParams *queryParams) const { VecSimQueryResult_List rl = {0}; void *timeoutCtx = queryParams ? queryParams->timeoutCtx : NULL; @@ -241,14 +271,6 @@ BruteForceIndex::topKQuery(const void *queryBlob, size_t k, return rl; } - DataType normalized_blob[this->dim]; // This will be use only if metric == VecSimMetric_Cosine. - if (this->metric == VecSimMetric_Cosine) { - memcpy(normalized_blob, queryBlob, this->dim * sizeof(DataType)); - normalizeVector(normalized_blob, this->dim); - - queryBlob = normalized_blob; - } - DistType upperBound = std::numeric_limits::lowest(); vecsim_stl::abstract_priority_queue *TopCandidates = getNewMaxPriorityQueue(); @@ -289,18 +311,11 @@ BruteForceIndex::topKQuery(const void *queryBlob, size_t k, template VecSimQueryResult_List BruteForceIndex::rangeQuery(const void *queryBlob, double radius, - VecSimQueryParams *queryParams) { + VecSimQueryParams *queryParams) const { auto rl = (VecSimQueryResult_List){0}; void *timeoutCtx = queryParams ? queryParams->timeoutCtx : nullptr; this->last_mode = RANGE_QUERY; - DataType normalized_blob[this->dim]; // This will be use only if metric == VecSimMetric_Cosine. - if (this->metric == VecSimMetric_Cosine) { - memcpy(normalized_blob, queryBlob, this->dim * sizeof(DataType)); - normalizeVector(normalized_blob, this->dim); - queryBlob = normalized_blob; - } - // Compute scores in every block and save results that are within the range. auto res_container = getNewResultsContainer(10); // Use 10 as the initial capacity for the dynamic array. @@ -330,16 +345,18 @@ template VecSimIndexInfo BruteForceIndex::info() const { VecSimIndexInfo info; + info.commonInfo = this->getCommonInfo(); + info.commonInfo.basicInfo.algo = VecSimAlgo_BF; + + return info; +} + +template +VecSimIndexBasicInfo BruteForceIndex::basicInfo() const { + + VecSimIndexBasicInfo info = this->getBasicInfo(); info.algo = VecSimAlgo_BF; - info.bfInfo.dim = this->dim; - info.bfInfo.type = this->vecType; - info.bfInfo.metric = this->metric; - info.bfInfo.indexSize = this->count; - info.bfInfo.indexLabelCount = this->indexLabelCount(); - info.bfInfo.blockSize = this->blockSize; - info.bfInfo.memory = this->getAllocationSize(); - info.bfInfo.isMulti = this->isMulti; - info.bfInfo.last_mode = this->last_mode; + info.isTiered = false; return info; } @@ -347,51 +364,19 @@ template VecSimInfoIterator *BruteForceIndex::infoIterator() const { VecSimIndexInfo info = this->info(); // For readability. Update this number when needed. - size_t numberOfInfoFields = 8; + size_t numberOfInfoFields = 10; VecSimInfoIterator *infoIterator = new VecSimInfoIterator(numberOfInfoFields); - infoIterator->addInfoField(VecSim_InfoField{ - .fieldName = VecSimCommonStrings::ALGORITHM_STRING, - .fieldType = INFOFIELD_STRING, - .fieldValue = {FieldValue{.stringValue = VecSimAlgo_ToString(info.algo)}}}); - infoIterator->addInfoField(VecSim_InfoField{ - .fieldName = VecSimCommonStrings::TYPE_STRING, - .fieldType = INFOFIELD_STRING, - .fieldValue = {FieldValue{.stringValue = VecSimType_ToString(info.bfInfo.type)}}}); - infoIterator->addInfoField( - VecSim_InfoField{.fieldName = VecSimCommonStrings::DIMENSION_STRING, - .fieldType = INFOFIELD_UINT64, - .fieldValue = {FieldValue{.uintegerValue = info.bfInfo.dim}}}); - infoIterator->addInfoField(VecSim_InfoField{ - .fieldName = VecSimCommonStrings::METRIC_STRING, - .fieldType = INFOFIELD_STRING, - .fieldValue = {FieldValue{.stringValue = VecSimMetric_ToString(info.bfInfo.metric)}}}); - infoIterator->addInfoField( - VecSim_InfoField{.fieldName = VecSimCommonStrings::IS_MULTI_STRING, - .fieldType = INFOFIELD_UINT64, - .fieldValue = {FieldValue{.uintegerValue = info.bfInfo.isMulti}}}); - infoIterator->addInfoField( - VecSim_InfoField{.fieldName = VecSimCommonStrings::INDEX_SIZE_STRING, - .fieldType = INFOFIELD_UINT64, - .fieldValue = {FieldValue{.uintegerValue = info.bfInfo.indexSize}}}); - infoIterator->addInfoField( - VecSim_InfoField{.fieldName = VecSimCommonStrings::INDEX_LABEL_COUNT_STRING, - .fieldType = INFOFIELD_UINT64, - .fieldValue = {FieldValue{.uintegerValue = info.bfInfo.indexLabelCount}}}); infoIterator->addInfoField( - VecSim_InfoField{.fieldName = VecSimCommonStrings::BLOCK_SIZE_STRING, - .fieldType = INFOFIELD_UINT64, - .fieldValue = {FieldValue{.uintegerValue = info.bfInfo.blockSize}}}); - infoIterator->addInfoField( - VecSim_InfoField{.fieldName = VecSimCommonStrings::MEMORY_STRING, - .fieldType = INFOFIELD_UINT64, - .fieldValue = {FieldValue{.uintegerValue = info.bfInfo.memory}}}); - infoIterator->addInfoField( - VecSim_InfoField{.fieldName = VecSimCommonStrings::SEARCH_MODE_STRING, + VecSim_InfoField{.fieldName = VecSimCommonStrings::ALGORITHM_STRING, .fieldType = INFOFIELD_STRING, .fieldValue = {FieldValue{ - .stringValue = VecSimSearchMode_ToString(info.bfInfo.last_mode)}}}); - + .stringValue = VecSimAlgo_ToString(info.commonInfo.basicInfo.algo)}}}); + this->addCommonInfoToIterator(infoIterator, info.commonInfo); + infoIterator->addInfoField(VecSim_InfoField{ + .fieldName = VecSimCommonStrings::BLOCK_SIZE_STRING, + .fieldType = INFOFIELD_UINT64, + .fieldValue = {FieldValue{.uintegerValue = info.commonInfo.basicInfo.blockSize}}}); return infoIterator; } @@ -401,22 +386,19 @@ BruteForceIndex::newBatchIterator(const void *queryBlob, VecSimQueryParams *queryParams) const { auto *queryBlobCopy = this->allocator->allocate(sizeof(DataType) * this->dim); memcpy(queryBlobCopy, queryBlob, this->dim * sizeof(DataType)); - if (this->metric == VecSimMetric_Cosine) { - normalizeVector((DataType *)queryBlobCopy, this->dim); - } // Ownership of queryBlobCopy moves to BF_BatchIterator that will free it at the end. return newBatchIterator_Instance(queryBlobCopy, queryParams); } template bool BruteForceIndex::preferAdHocSearch(size_t subsetSize, size_t k, - bool initial_check) { + bool initial_check) const { // This heuristic is based on sklearn decision tree classifier (with 10 leaves nodes) - // see scripts/BF_batches_clf.py size_t index_size = this->indexSize(); - if (subsetSize > index_size) { - throw std::runtime_error("internal error: subset size cannot be larger than index size"); - } + // Referring to too large subset size as if it was the maximum possible size. + subsetSize = std::min(subsetSize, index_size); + size_t d = this->dim; float r = (index_size == 0) ? 0.0f : (float)(subsetSize) / (float)this->indexLabelCount(); bool res; diff --git a/src/VecSim/algorithms/brute_force/brute_force_factory.cpp b/src/VecSim/algorithms/brute_force/brute_force_factory.cpp deleted file mode 100644 index 3278ee57a..000000000 --- a/src/VecSim/algorithms/brute_force/brute_force_factory.cpp +++ /dev/null @@ -1,64 +0,0 @@ -/* - *Copyright Redis Ltd. 2021 - present - *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or - *the Server Side Public License v1 (SSPLv1). - */ - -#include "VecSim/algorithms/brute_force/brute_force_factory.h" -#include "VecSim/algorithms/brute_force/brute_force.h" -#include "VecSim/algorithms/brute_force/brute_force_single.h" -#include "VecSim/algorithms/brute_force/brute_force_multi.h" - -namespace BruteForceFactory { -template -inline VecSimIndex *NewIndex_ChooseMultiOrSingle(const BFParams *params, - std::shared_ptr allocator) { - // check if single and return new bf_index - if (params->multi) - return new (allocator) BruteForceIndex_Multi(params, allocator); - else - return new (allocator) BruteForceIndex_Single(params, allocator); -} - -VecSimIndex *NewIndex(const BFParams *params, std::shared_ptr allocator) { - if (params->type == VecSimType_FLOAT32) { - return NewIndex_ChooseMultiOrSingle(params, allocator); - } else if (params->type == VecSimType_FLOAT64) { - return NewIndex_ChooseMultiOrSingle(params, allocator); - } - - // If we got here something is wrong. - return NULL; -} - -template -inline size_t EstimateInitialSize_ChooseMultiOrSingle(bool is_multi) { - // check if single and return new bf_index - if (is_multi) - return sizeof(BruteForceIndex_Multi); - else - return sizeof(BruteForceIndex_Single); -} - -size_t EstimateInitialSize(const BFParams *params) { - - // Constant part (not effected by parameters). - size_t est = sizeof(VecSimAllocator) + sizeof(size_t); - if (params->type == VecSimType_FLOAT32) { - est += EstimateInitialSize_ChooseMultiOrSingle(params->multi); - } else if (params->type == VecSimType_FLOAT64) { - est += EstimateInitialSize_ChooseMultiOrSingle(params->multi); - } - // Parameters related part. - - if (params->initialCapacity) { - est += params->initialCapacity * sizeof(labelType) + sizeof(size_t); - } - - return est; -} - -size_t EstimateElementSize(const BFParams *params) { - return params->dim * VecSimType_sizeof(params->type) + sizeof(labelType); -} -}; // namespace BruteForceFactory diff --git a/src/VecSim/algorithms/brute_force/brute_force_multi.h b/src/VecSim/algorithms/brute_force/brute_force_multi.h index 71639c38d..0f963e2ba 100644 --- a/src/VecSim/algorithms/brute_force/brute_force_multi.h +++ b/src/VecSim/algorithms/brute_force/brute_force_multi.h @@ -17,13 +17,15 @@ class BruteForceIndex_Multi : public BruteForceIndex { vecsim_stl::unordered_map> labelToIdsLookup; public: - BruteForceIndex_Multi(const BFParams *params, std::shared_ptr allocator) - : BruteForceIndex(params, allocator), labelToIdsLookup(allocator) {} + BruteForceIndex_Multi(const BFParams *params, const AbstractIndexInitParams &abstractInitParams) + : BruteForceIndex(params, abstractInitParams), + labelToIdsLookup(this->allocator) {} ~BruteForceIndex_Multi() {} - int addVector(const void *vector_data, labelType label, bool overwrite_allowed = true) override; + int addVector(const void *vector_data, labelType label, void *auxiliaryCtx = nullptr) override; int deleteVector(labelType labelType) override; + int deleteVectorById(labelType label, idType id) override; double getDistanceFrom(labelType label, const void *vector_data) const override; inline size_t indexLabelCount() const override { return this->labelToIdsLookup.size(); } @@ -32,8 +34,11 @@ class BruteForceIndex_Multi : public BruteForceIndex { return std::unique_ptr( new (this->allocator) vecsim_stl::unique_results_container(cap, this->allocator)); } + std::unordered_map> + deleteVectorAndGetUpdatedIds(labelType label) override; #ifdef BUILD_TESTS - void GetDataByLabel(labelType label, std::vector> &vectors_output) { + void getDataByLabel(labelType label, + std::vector> &vectors_output) const override { auto ids = labelToIdsLookup.find(label); @@ -51,8 +56,21 @@ class BruteForceIndex_Multi : public BruteForceIndex { inline void replaceIdOfLabel(labelType label, idType new_id, idType old_id) override; + inline bool isLabelExists(labelType label) override { + return labelToIdsLookup.find(label) != labelToIdsLookup.end(); + } + // Return a set of all labels that are stored in the index (helper for computing label count + // without duplicates in tiered index). Caller should hold the flat buffer lock for read. + inline vecsim_stl::set getLabelsSet() const override { + vecsim_stl::set keys(this->allocator); + for (auto &it : labelToIdsLookup) { + keys.insert(it.first); + } + return keys; + }; + inline vecsim_stl::abstract_priority_queue * - getNewMaxPriorityQueue() override { + getNewMaxPriorityQueue() const override { return new (this->allocator) vecsim_stl::updatable_max_heap(this->allocator); } @@ -72,15 +90,7 @@ class BruteForceIndex_Multi : public BruteForceIndex { template int BruteForceIndex_Multi::addVector(const void *vector_data, labelType label, - bool overwrite_allowed) { - - DataType normalized_blob[this->dim]; // This will be use only if metric == VecSimMetric_Cosine. - if (this->metric == VecSimMetric_Cosine) { - memcpy(normalized_blob, vector_data, this->dim * sizeof(DataType)); - normalizeVector(normalized_blob, this->dim); - vector_data = normalized_blob; - } - + void *auxiliaryCtx) { this->appendVector(vector_data, label); return 1; } @@ -107,6 +117,76 @@ int BruteForceIndex_Multi::deleteVector(labelType label) { return ret; } +template +std::unordered_map> +BruteForceIndex_Multi::deleteVectorAndGetUpdatedIds(labelType label) { + // Hold a mapping from ids that are removed and changed to the original ids that were swapped + // into it. For example, if we have ids 0, 1, 2, 3, 4 and are about to remove ids 1, 3, 4, we + // should get the following scenario: {1->4} => {1->4} => {1->2}. + // Explanation: first we delete 1 and swap it with 4. Then, we remove 3 and have no swap since 3 + // is the last id. Lastly, we delete the original 4 which is now in id 1, and swap it with 2. + // Eventually, in id 1 we should have the original vector whose id was 2. + std::unordered_map> updated_ids; + + // Find the id to delete. + auto deleted_label_ids_pair = this->labelToIdsLookup.find(label); + if (deleted_label_ids_pair == this->labelToIdsLookup.end()) { + // Nothing to delete. + return updated_ids; + } + + // Deletes all vectors under the given label. + for (size_t i = 0; i < deleted_label_ids_pair->second.size(); i++) { + idType cur_id_to_delete = deleted_label_ids_pair->second[i]; + // The removal take into consideration the current internal id to remove, even if it is not + // the original id, and it has swapped into this id after previous swap of another id that + // belongs to this label. + labelType last_id_label = this->idToLabelMapping[this->count - 1]; + this->removeVector(cur_id_to_delete); + // If cur_id_to_delete exists in the map, remove it as it is no longer valid, whether it + // will get a new value due to a swap, or it is the last element in the index. + updated_ids.erase(cur_id_to_delete); + // If a swap was made, update who was the original id that now resides in cur_id_to_delete. + if (cur_id_to_delete != this->count) { + if (updated_ids.find(this->count) != updated_ids.end()) { + updated_ids[cur_id_to_delete] = updated_ids[this->count]; + updated_ids.erase(this->count); + } else { + // Otherwise, the last id now resides where the deleted id was. + updated_ids[cur_id_to_delete] = {this->count, last_id_label}; + } + } + } + // Remove the pair of the deleted vector. + labelToIdsLookup.erase(label); + return updated_ids; +} + +template +int BruteForceIndex_Multi::deleteVectorById(labelType label, idType id) { + // Find the id to delete. + auto deleted_label_ids_pair = this->labelToIdsLookup.find(label); + if (deleted_label_ids_pair == this->labelToIdsLookup.end()) { + // Nothing to delete. + return 0; + } + + // Delete the specific vector id which is under the given label. + auto &ids = deleted_label_ids_pair->second; + for (size_t i = 0; i < ids.size(); i++) { + if (ids[i] == id) { + this->removeVector(id); + ids.erase(ids.begin() + i); + if (ids.empty()) { + labelToIdsLookup.erase(label); + } + return 1; + } + } + assert(false && "id to delete was not found under the given label"); + return 0; +} + template double BruteForceIndex_Multi::getDistanceFrom(labelType label, const void *vector_data) const { @@ -129,8 +209,18 @@ template void BruteForceIndex_Multi::replaceIdOfLabel(labelType label, idType new_id, idType old_id) { assert(labelToIdsLookup.find(label) != labelToIdsLookup.end()); + // *Non-trivial code here* - in every iteration we replace the internal id of the previous last + // id that has been swapped with the deleted id. Note that if the old and the new replaced ids + // both belong to the same label, then we are going to delete the new id later on as well, since + // we are currently iterating on this exact array of ids in 'deleteVector'. Hence, the relevant + // part of the vector that should be updated is the "tail" that comes after the position of + // old_id, while the "head" may contain old occurrences of old_id that are irrelevant for the + // future deletions. Therefore, we iterate from end to beginning. For example, assuming we are + // deleting a label that contains the only 3 ids that exist in the index. Hence, we would + // expect the following scenario w.r.t. the ids array: + // [|1, 0, 2] -> [1, |0, 1] -> [1, 0, |0] (where | marks the current position) auto &ids = labelToIdsLookup.at(label); - for (size_t i = 0; i < ids.size(); i++) { + for (int i = ids.size() - 1; i >= 0; i--) { if (ids[i] == old_id) { ids[i] = new_id; return; diff --git a/src/VecSim/algorithms/brute_force/brute_force_multi_tests_friends.h b/src/VecSim/algorithms/brute_force/brute_force_multi_tests_friends.h index 43c7e0ec6..c4c3f8770 100644 --- a/src/VecSim/algorithms/brute_force/brute_force_multi_tests_friends.h +++ b/src/VecSim/algorithms/brute_force/brute_force_multi_tests_friends.h @@ -14,3 +14,4 @@ INDEX_TEST_FRIEND_CLASS(BruteForceMultiTest_indexing_same_vector_Test) INDEX_TEST_FRIEND_CLASS(BruteForceMultiTest_test_delete_swap_block_Test) INDEX_TEST_FRIEND_CLASS(BruteForceMultiTest_test_dynamic_bf_info_iterator_Test) INDEX_TEST_FRIEND_CLASS(BruteForceMultiTest_remove_vector_after_replacing_block_Test) +INDEX_TEST_FRIEND_CLASS(BruteForceMultiTest_removeVectorWithSwaps_Test) diff --git a/src/VecSim/algorithms/brute_force/brute_force_single.h b/src/VecSim/algorithms/brute_force/brute_force_single.h index adb4840ef..15e018150 100644 --- a/src/VecSim/algorithms/brute_force/brute_force_single.h +++ b/src/VecSim/algorithms/brute_force/brute_force_single.h @@ -17,11 +17,13 @@ class BruteForceIndex_Single : public BruteForceIndex { vecsim_stl::unordered_map labelToIdLookup; public: - BruteForceIndex_Single(const BFParams *params, std::shared_ptr allocator); + BruteForceIndex_Single(const BFParams *params, + const AbstractIndexInitParams &abstractInitParams); ~BruteForceIndex_Single(); - int addVector(const void *vector_data, labelType label, bool overwrite_allowed = true) override; + int addVector(const void *vector_data, labelType label, void *auxiliaryCtx = nullptr) override; int deleteVector(labelType label) override; + int deleteVectorById(labelType label, idType id) override; double getDistanceFrom(labelType label, const void *vector_data) const override; inline std::unique_ptr @@ -31,8 +33,15 @@ class BruteForceIndex_Single : public BruteForceIndex { } inline size_t indexLabelCount() const override { return this->count; } + std::unordered_map> + deleteVectorAndGetUpdatedIds(labelType label) override; + + // We call this when we KNOW that the label exists in the index. + idType getIdOfLabel(labelType label) const { return labelToIdLookup.find(label)->second; } + #ifdef BUILD_TESTS - void GetDataByLabel(labelType label, std::vector> &vectors_output) { + void getDataByLabel(labelType label, + std::vector> &vectors_output) const override { auto id = labelToIdLookup.at(label); @@ -62,8 +71,21 @@ class BruteForceIndex_Single : public BruteForceIndex { labelToIdLookup.at(label) = new_id; } + inline bool isLabelExists(labelType label) override { + return labelToIdLookup.find(label) != labelToIdLookup.end(); + } + // Return a set of all labels that are stored in the index (helper for computing label count + // without duplicates in tiered index). Caller should hold the flat buffer lock for read. + inline vecsim_stl::set getLabelsSet() const override { + vecsim_stl::set keys(this->allocator); + for (auto &it : labelToIdLookup) { + keys.insert(it.first); + } + return keys; + }; + inline vecsim_stl::abstract_priority_queue * - getNewMaxPriorityQueue() override { + getNewMaxPriorityQueue() const override { return new (this->allocator) vecsim_stl::max_priority_queue(this->allocator); } @@ -84,22 +106,16 @@ class BruteForceIndex_Single : public BruteForceIndex { template BruteForceIndex_Single::BruteForceIndex_Single( - const BFParams *params, std::shared_ptr allocator) - : BruteForceIndex(params, allocator), labelToIdLookup(allocator) {} + const BFParams *params, const AbstractIndexInitParams &abstractInitParams) + : BruteForceIndex(params, abstractInitParams), + labelToIdLookup(this->allocator) {} template BruteForceIndex_Single::~BruteForceIndex_Single() {} template int BruteForceIndex_Single::addVector(const void *vector_data, labelType label, - bool overwrite_allowed) { - - DataType normalized_blob[this->dim]; // This will be use only if metric == VecSimMetric_Cosine - if (this->metric == VecSimMetric_Cosine) { - memcpy(normalized_blob, vector_data, this->dim * sizeof(DataType)); - normalizeVector(normalized_blob, this->dim); - vector_data = normalized_blob; - } + void *auxiliaryCtx) { auto optionalID = this->labelToIdLookup.find(label); // Check if label already exists, so it is an update operation. @@ -133,6 +149,36 @@ int BruteForceIndex_Single::deleteVector(labelType label) { return 1; } +template +std::unordered_map> +BruteForceIndex_Single::deleteVectorAndGetUpdatedIds(labelType label) { + + std::unordered_map> updated_ids; + // Find the id to delete. + auto deleted_label_id_pair = this->labelToIdLookup.find(label); + if (deleted_label_id_pair == this->labelToIdLookup.end()) { + // Nothing to delete. + return updated_ids; + } + + // Get deleted vector id. + idType id_to_delete = deleted_label_id_pair->second; + + // Remove the pair of the deleted vector. + labelToIdLookup.erase(label); + labelType last_id_label = this->idToLabelMapping[this->count - 1]; + this->removeVector(id_to_delete); // this will decrease this->count and make the swap + if (id_to_delete != this->count) { + updated_ids[id_to_delete] = {this->count, last_id_label}; + } + return updated_ids; +} + +template +int BruteForceIndex_Single::deleteVectorById(labelType label, idType id) { + return deleteVector(label); +} + template double BruteForceIndex_Single::getDistanceFrom(labelType label, const void *vector_data) const { diff --git a/src/VecSim/algorithms/hnsw/hnsw.h b/src/VecSim/algorithms/hnsw/hnsw.h index 779ec40b6..c4726ef66 100644 --- a/src/VecSim/algorithms/hnsw/hnsw.h +++ b/src/VecSim/algorithms/hnsw/hnsw.h @@ -34,12 +34,10 @@ #include #include #include +#include using std::pair; -#define HNSW_INVALID_ID UINT_MAX -#define HNSW_INVALID_LEVEL SIZE_MAX - typedef uint16_t linkListSize; typedef uint16_t elementFlags; @@ -47,6 +45,24 @@ template using candidatesMaxHeap = vecsim_stl::max_priority_queue; template using candidatesLabelsMaxHeap = vecsim_stl::abstract_priority_queue; +using graphNodeType = pair; // represented as: (element_id, level) + +// Vectors flags (for marking a specific vector) +typedef enum { + DELETE_MARK = 0x1, // element is logically deleted, but still exists in the graph + IN_PROCESS = 0x2, // element is being inserted into the graph +} Flags; + +// The state of the index and the newly inserted vector to be passed into addVector API in case that +// the index global data structures are updated atomically from an external scope (such as in +// tiered index), +// TODO: this might need to be generalized for future usages of async indexing. +struct AddVectorCtx { + idType newElementId; + int elementMaxLevel; + idType currEntryPoint; + int currMaxLevel; +}; template class HNSWIndex : public VecSimIndexAbstract, @@ -69,7 +85,6 @@ class HNSWIndex : public VecSimIndexAbstract, double epsilon_; // Index meta-data (based on the data dimensionality and index parameters) - size_t data_size_; size_t size_data_per_element_; size_t size_links_per_element_; size_t size_links_level0_; @@ -82,24 +97,23 @@ class HNSWIndex : public VecSimIndexAbstract, // Index level generator of the top level for a new element std::default_random_engine level_generator_; - // Index state + // Index global state - these should be guarded by the index_data_guard_ lock in + // multithreaded scenario. size_t cur_element_count; - size_t maxlevel_; - - // Index data structures - idType entrypoint_node_; - char *data_level0_memory_; - char **linkLists_; vecsim_stl::vector element_levels_; + idType entrypoint_node_; + size_t max_level_; // this is the top level of the entry point's element + + // Index data + char *data_level0_memory_; // neighbors in level 0, element label, flags and data (vector) + char **linkLists_; // neighbors in level higher than 0 // Used for marking the visited nodes in graph scans (the pool supports parallel graph scans). // This is mutable since the object changes upon search operations as well (which are const). mutable VisitedNodesHandlerPool visited_nodes_handler_pool; -#ifdef ENABLE_PARALLELIZATION - std::mutex global; - std::mutex cur_element_count_guard_; - std::vector link_list_locks_; -#endif + + mutable std::shared_mutex index_data_guard_; + mutable vecsim_stl::vector element_neighbors_locks_; #ifdef BUILD_TESTS #include "VecSim/algorithms/hnsw/hnsw_base_tests_friends.h" @@ -115,10 +129,10 @@ class HNSWIndex : public VecSimIndexAbstract, inline size_t getRandomLevel(double reverse_size); inline vecsim_stl::vector *getIncomingEdgesPtr(idType internal_id, size_t level) const; inline void setIncomingEdgesPtr(idType internal_id, size_t level, void *edges_ptr); - inline elementFlags *get_flags(idType internal_id) const; - inline idType *get_linklist0(idType internal_id) const; - inline idType *get_linklist(idType internal_id, size_t level) const; - inline void setListCount(idType *list, linkListSize size); + inline elementFlags *getElementFlags(idType internal_id) const; + inline idType *getNodeNeighborsAtBaseLevel(idType internal_id) const; + inline idType *getNodeNeighborsAtNonBaseLevel(idType internal_id, size_t level) const; + inline void setNodeNeighborsCount(idType *list, linkListSize size); inline void removeExtraLinks(candidatesMaxHeap candidates, size_t Mcurmax, idType *node_neighbors, const vecsim_stl::vector &bitmap, idType *removed_links, size_t *removed_links_num); @@ -147,10 +161,26 @@ class HNSWIndex : public VecSimIndexAbstract, void *timeoutCtx, VecSimQueryResult_Code *rc) const; void getNeighborsByHeuristic2(candidatesMaxHeap &top_candidates, size_t M); - inline idType mutuallyConnectNewElement(idType cur_c, + // Helper function for re-selecting node's neighbors which was selected as a neighbor for + // a newly inserted node. Also, responsible for mutually connect the new node and the neighbor + // (unidirectional or bidirectional connection). + // *Note that node_lock and neighbor_lock should be locked upon calling this function* + void revisitNeighborConnections(size_t level, idType new_node_id, + const std::pair &neighbor_data, + idType *new_node_neighbors_list, + idType *neighbor_neighbors_list, + std::unique_lock &node_lock, + std::unique_lock &neighbor_lock); + inline idType mutuallyConnectNewElement(idType new_node_id, candidatesMaxHeap &top_candidates, size_t level); - template + void mutuallyUpdateForRepairedNode(idType node_id, size_t level, + vecsim_stl::vector &neighbors_to_remove, + vecsim_stl::vector &nodes_to_update, + vecsim_stl::vector &chosen_neighbors, + size_t max_M_cur); + + template void greedySearchLevel(const void *vector_data, size_t level, idType &curObj, DistType &curDist, void *timeoutCtx = nullptr, VecSimQueryResult_Code *rc = nullptr) const; void repairConnectionsForDeletion(idType element_internal_id, idType neighbour_id, @@ -158,21 +188,30 @@ class HNSWIndex : public VecSimIndexAbstract, size_t level, vecsim_stl::vector &neighbours_bitmap); inline void replaceEntryPoint(); inline void resizeIndexInternal(size_t new_max_elements); + + template inline void SwapLastIdWithDeletedId(idType element_internal_id); // Protected internal function that implements generic single vector insertion. - void appendVector(const void *vector_data, labelType label); + void appendVector(const void *vector_data, labelType label, + AddVectorCtx *auxiliaryCtx = nullptr); // Protected internal function that implements generic single vector deletion. - void removeVector(idType id); + void removeVectorInPlace(idType id); inline void emplaceToHeap(vecsim_stl::abstract_priority_queue &heap, DistType dist, idType id) const; inline void emplaceToHeap(vecsim_stl::abstract_priority_queue &heap, DistType dist, idType id) const; + // Helper method that swaps the last element in the ids list with the given one (equivalent to + // removing the given element id from the list). + inline bool removeIdFromList(vecsim_stl::vector &element_ids_list, idType element_id); + + template + void removeAndSwap(idType internalId); public: - HNSWIndex(const HNSWParams *params, std::shared_ptr allocator, + HNSWIndex(const HNSWParams *params, const AbstractIndexInitParams &abstractInitParams, size_t random_seed = 100, size_t initial_pool_size = 1); virtual ~HNSWIndex(); @@ -185,32 +224,67 @@ class HNSWIndex : public VecSimIndexAbstract, inline size_t getEfConstruction() const; inline size_t getM() const; inline size_t getMaxLevel() const; - inline idType getEntryPointId() const; inline labelType getEntryPointLabel() const; inline labelType getExternalLabel(idType internal_id) const; + // Check if the given label exists in the labels lookup while holding the index data lock. + // Optionally validate that the associated vector(s) are not in process and done indexing + // (this option is used currently for tests). + virtual inline bool safeCheckIfLabelExistsInIndex(labelType label, + bool also_done_processing = false) const = 0; + inline auto safeGetEntryPointState() const; + inline void lockIndexDataGuard() const; + inline void unlockIndexDataGuard() const; + inline void lockNodeLinks(idType node_id) const; + inline void unlockNodeLinks(idType node_id) const; inline VisitedNodesHandler *getVisitedList() const; inline void returnVisitedList(VisitedNodesHandler *visited_nodes_handler) const; VecSimIndexInfo info() const override; + VecSimIndexBasicInfo basicInfo() const override; VecSimInfoIterator *infoIterator() const override; - bool preferAdHocSearch(size_t subsetSize, size_t k, bool initial_check) override; + bool preferAdHocSearch(size_t subsetSize, size_t k, bool initial_check) const override; char *getDataByInternalId(idType internal_id) const; - inline idType *get_linklist_at_level(idType internal_id, size_t level) const; - inline linkListSize getListCount(const idType *list) const; + inline idType *getNodeNeighborsAtLevel(idType internal_id, size_t level) const; + inline linkListSize getNodeNeighborsCount(const idType *list) const; inline idType searchBottomLayerEP(const void *query_data, void *timeoutCtx, VecSimQueryResult_Code *rc) const; VecSimQueryResult_List topKQuery(const void *query_data, size_t k, - VecSimQueryParams *queryParams) override; + VecSimQueryParams *queryParams) const override; VecSimQueryResult_List rangeQuery(const void *query_data, double radius, - VecSimQueryParams *queryParams) override; + VecSimQueryParams *queryParams) const override; inline void markDeletedInternal(idType internalId); inline bool isMarkedDeleted(idType internalId) const; + inline bool isInProcess(idType internalId) const; + inline void markInProcess(idType internalId); + inline void unmarkInProcess(idType internalId); void increaseCapacity() override; - - // inline priority queue getter that need to be implemented by derived class + AddVectorCtx storeNewElement(labelType label); + void removeAndSwapDeletedElement(idType internalId); + void repairNodeConnections(idType node_id, size_t level); + inline size_t getElementTopLevel(idType internalId); + vecsim_stl::vector safeCollectAllNodeIncomingNeighbors(idType node_id, + size_t node_top_level); + // Return all the labels in the index - this should be used for computing the number of distinct + // labels in a tiered index, and caller should hold the index data guard. + virtual inline vecsim_stl::set getLabelsSet() const = 0; + + // Inline priority queue getter that need to be implemented by derived class. virtual inline candidatesLabelsMaxHeap *getNewMaxPriorityQueue() const = 0; + virtual double safeGetDistanceFrom(labelType label, const void *vector_data) const = 0; +#ifdef BUILD_TESTS + /** + * @brief Used for testing - store vector(s) data associated with a given label. This function + * copies the vector(s)' data buffer(s) and place it in the output vector + * + * @param label + * @param vectors_output empty vector to be modified, should store the blob(s) associated with + * the label. + */ + virtual void getDataByLabel(labelType label, + std::vector> &vectors_output) const = 0; +#endif protected: // inline label to id setters that need to be implemented by derived class virtual inline std::unique_ptr @@ -266,12 +340,12 @@ size_t HNSWIndex::getM() const { template size_t HNSWIndex::getMaxLevel() const { - return maxlevel_; + return max_level_; } template labelType HNSWIndex::getEntryPointLabel() const { - if (entrypoint_node_ != HNSW_INVALID_ID) + if (entrypoint_node_ != INVALID_ID) return getExternalLabel(entrypoint_node_); return SIZE_MAX; } @@ -336,14 +410,14 @@ void HNSWIndex::setIncomingEdgesPtr(idType internal_id, size } template -elementFlags *HNSWIndex::get_flags(idType internal_id) const { +elementFlags *HNSWIndex::getElementFlags(idType internal_id) const { // elementFlags offset is 0 from the start of the element metadata return (elementFlags *)(data_level0_memory_ + internal_id * size_data_per_element_ + offsetLevel0_); } template -idType *HNSWIndex::get_linklist0(idType internal_id) const { +idType *HNSWIndex::getNodeNeighborsAtBaseLevel(idType internal_id) const { // links offset at level 0 is `sizeof(elementFlags) + sizeof(linkListSize)` from the start of // the element metadata return (idType *)(data_level0_memory_ + internal_id * size_data_per_element_ + @@ -351,33 +425,30 @@ idType *HNSWIndex::get_linklist0(idType internal_id) const { } template -idType *HNSWIndex::get_linklist(idType internal_id, size_t level) const { +idType *HNSWIndex::getNodeNeighborsAtNonBaseLevel(idType internal_id, + size_t level) const { // links offset at level >0 is `sizeof(linkListSize)` from the start of the element metadata return (idType *)(linkLists_[internal_id] + (level - 1) * size_links_per_element_ + sizeof(linkListSize)); } template -idType *HNSWIndex::get_linklist_at_level(idType internal_id, - size_t level) const { - return level == 0 ? get_linklist0(internal_id) : get_linklist(internal_id, level); +idType *HNSWIndex::getNodeNeighborsAtLevel(idType internal_id, + size_t level) const { + return level == 0 ? getNodeNeighborsAtBaseLevel(internal_id) + : getNodeNeighborsAtNonBaseLevel(internal_id, level); } template -linkListSize HNSWIndex::getListCount(const idType *list) const { +linkListSize HNSWIndex::getNodeNeighborsCount(const idType *list) const { return *(((linkListSize *)list) - 1); } template -void HNSWIndex::setListCount(idType *list, const linkListSize size) { +void HNSWIndex::setNodeNeighborsCount(idType *list, const linkListSize size) { *(((linkListSize *)list) - 1) = size; } -template -idType HNSWIndex::getEntryPointId() const { - return entrypoint_node_; -} - template VisitedNodesHandler *HNSWIndex::getVisitedList() const { return visited_nodes_handler_pool.getAvailableVisitedNodesHandler(); @@ -391,20 +462,73 @@ void HNSWIndex::returnVisitedList( template void HNSWIndex::markDeletedInternal(idType internalId) { + // Here we are holding the global index data guard (and the main index lock of the tiered index + // for shared ownership). assert(internalId < this->cur_element_count); if (!isMarkedDeleted(internalId)) { - elementFlags *flags = get_flags(internalId); - *flags |= DELETE_MARK; + if (internalId == entrypoint_node_) { + // Internally, we hold and release the entrypoint neighbors lock. + replaceEntryPoint(); + } + // Atomically set the deletion mark flag (note that other parallel threads may set the flags + // at the same time (for changing the IN_PROCESS flag). + __atomic_fetch_or(getElementFlags(internalId), DELETE_MARK, 0); this->num_marked_deleted++; } } template bool HNSWIndex::isMarkedDeleted(idType internalId) const { - elementFlags *flags = get_flags(internalId); + elementFlags *flags = getElementFlags(internalId); return *flags & DELETE_MARK; } +template +bool HNSWIndex::isInProcess(idType internalId) const { + elementFlags *flags = getElementFlags(internalId); + return *flags & IN_PROCESS; +} + +template +void HNSWIndex::markInProcess(idType internalId) { + // Atomically set the IN_PROCESS mark flag. Even though other threads shouldn't modify the flags + // at that time (we're holding index global data guard, so this element cannot be marked as + // deleted in parallel), we do it for safety. + __atomic_fetch_or(getElementFlags(internalId), IN_PROCESS, 0); +} + +template +void HNSWIndex::unmarkInProcess(idType internalId) { + // Atomically unset the IN_PROCESS mark flag (note that other parallel threads may set the flags + // at the same time (for marking the element with MARK_DELETE flag). + __atomic_fetch_and(getElementFlags(internalId), ~IN_PROCESS, 0); +} + +template +void HNSWIndex::lockIndexDataGuard() const { + index_data_guard_.lock(); +} + +template +void HNSWIndex::unlockIndexDataGuard() const { + index_data_guard_.unlock(); +} + +template +void HNSWIndex::lockNodeLinks(idType node_id) const { + element_neighbors_locks_[node_id].lock(); +} + +template +void HNSWIndex::unlockNodeLinks(idType node_id) const { + element_neighbors_locks_[node_id].unlock(); +} + +template +inline size_t HNSWIndex::getElementTopLevel(idType internalId) { + return element_levels_[internalId]; +} + /** * helper functions */ @@ -435,7 +559,7 @@ void HNSWIndex::removeExtraLinks( orig_candidates.pop(); } } - setListCount(node_neighbors, link_idx); + setNodeNeighborsCount(node_neighbors, link_idx); *removed_links_num = removed_idx; } @@ -461,11 +585,9 @@ DistType HNSWIndex::processCandidate( tag_t *elements_tags, vecsim_stl::abstract_priority_queue &top_candidates, candidatesMaxHeap &candidate_set, DistType lowerBound) const { -#ifdef ENABLE_PARALLELIZATION - std::unique_lock lock(link_list_locks_[curNodeId]); -#endif - idType *node_links = get_linklist_at_level(curNodeId, layer); - linkListSize links_num = getListCount(node_links); + std::unique_lock lock(element_neighbors_locks_[curNodeId]); + idType *node_links = getNodeNeighborsAtLevel(curNodeId, layer); + linkListSize links_num = getNodeNeighborsCount(node_links); __builtin_prefetch(elements_tags + *node_links); __builtin_prefetch(getDataByInternalId(*node_links)); @@ -478,7 +600,7 @@ DistType HNSWIndex::processCandidate( __builtin_prefetch(elements_tags + *next_candidate_pos); __builtin_prefetch(getDataByInternalId(*next_candidate_pos)); - if (elements_tags[candidate_id] == visited_tag) + if (elements_tags[candidate_id] == visited_tag || isInProcess(candidate_id)) continue; elements_tags[candidate_id] = visited_tag; @@ -503,7 +625,7 @@ DistType HNSWIndex::processCandidate( } // Pre-fetch the neighbours list of the top candidate (the one that is going // to be processed in the next iteration) into memory cache, to improve performance. - __builtin_prefetch(get_linklist_at_level(candidate_set.top().second, layer)); + __builtin_prefetch(getNodeNeighborsAtLevel(candidate_set.top().second, layer)); return lowerBound; } @@ -515,11 +637,9 @@ void HNSWIndex::processCandidate_RangeSearch( tag_t *elements_tags, std::unique_ptr &results, candidatesMaxHeap &candidate_set, DistType dyn_range, double radius) const { -#ifdef ENABLE_PARALLELIZATION - std::unique_lock lock(link_list_locks_[curNodeId]); -#endif - idType *node_links = get_linklist_at_level(curNodeId, layer); - linkListSize links_num = getListCount(node_links); + std::unique_lock lock(element_neighbors_locks_[curNodeId]); + idType *node_links = getNodeNeighborsAtLevel(curNodeId, layer); + linkListSize links_num = getNodeNeighborsCount(node_links); __builtin_prefetch(elements_tags + *node_links); __builtin_prefetch(getDataByInternalId(*node_links)); @@ -535,7 +655,7 @@ void HNSWIndex::processCandidate_RangeSearch( __builtin_prefetch(elements_tags + *next_candidate_pos); __builtin_prefetch(getDataByInternalId(*next_candidate_pos)); - if (elements_tags[candidate_id] == visited_tag) + if (elements_tags[candidate_id] == visited_tag || isInProcess(candidate_id)) continue; elements_tags[candidate_id] = visited_tag; char *candidate_data = getDataByInternalId(candidate_id); @@ -553,7 +673,7 @@ void HNSWIndex::processCandidate_RangeSearch( } // Pre-fetch the neighbours list of the top candidate (the one that is going // to be processed in the next iteration) into memory cache, to improve performance. - __builtin_prefetch(get_linklist_at_level(candidate_set.top().second, layer)); + __builtin_prefetch(getNodeNeighborsAtLevel(candidate_set.top().second, layer)); } template @@ -644,116 +764,204 @@ void HNSWIndex::getNeighborsByHeuristic2( } template -idType HNSWIndex::mutuallyConnectNewElement( - idType cur_c, candidatesMaxHeap &top_candidates, size_t level) { - size_t Mcurmax = level ? maxM_ : maxM0_; - getNeighborsByHeuristic2(top_candidates, M_); - if (top_candidates.size() > M_) - throw std::runtime_error( - "Should be not be more than M_ candidates returned by the heuristic"); +void HNSWIndex::revisitNeighborConnections( + size_t level, idType new_node_id, const std::pair &neighbor_data, + idType *new_node_neighbors_list, idType *neighbor_neighbors_list, + std::unique_lock &node_lock, std::unique_lock &neighbor_lock) { + // Note - expect that node_lock and neighbor_lock are locked at that point. - vecsim_stl::vector selectedNeighbors(this->allocator); - selectedNeighbors.reserve(M_); - while (top_candidates.size() > 0) { - selectedNeighbors.push_back(top_candidates.top().second); - top_candidates.pop(); + // Collect the existing neighbors and the new node as the neighbor's neighbors candidates. + candidatesMaxHeap candidates(this->allocator); + // Add the new node along with the pre-calculated distance to the current neighbor, + candidates.emplace(neighbor_data.first, new_node_id); + + idType selected_neighbor = neighbor_data.second; + for (size_t j = 0; j < getNodeNeighborsCount(neighbor_neighbors_list); j++) { + candidates.emplace(this->dist_func(getDataByInternalId(neighbor_neighbors_list[j]), + getDataByInternalId(selected_neighbor), this->dim), + neighbor_neighbors_list[j]); } - idType next_closest_entry_point = selectedNeighbors.back(); - { - idType *ll_cur = get_linklist_at_level(cur_c, level); - assert(getListCount(ll_cur) == 0 && - "The newly inserted element should have blank link list"); - const linkListSize size = selectedNeighbors.size(); - setListCount(ll_cur, size); + std::vector nodes_to_update; + auto orig_candidates = candidates; + + // Candidates will store the newly selected neighbours (for the neighbor). + size_t max_M_cur = level ? maxM_ : maxM0_; + getNeighborsByHeuristic2(candidates, max_M_cur); + + // Go over the original candidates set, and save the ones chosen to be removed to update later + // on. + bool cur_node_chosen = false; + while (orig_candidates.size() > 0) { + idType orig_candidate = orig_candidates.top().second; + // If the current original candidate was not selected as neighbor by the heuristics, it + // should be updated and removed from the neighbor's neighbors. + if (candidates.empty() || orig_candidate != candidates.top().second) { + // Don't add the new_node_id to nodes_to_update, it will be inserted either way later. + if (orig_candidate != new_node_id) { + nodes_to_update.push_back(orig_candidate); + } + orig_candidates.pop(); + // Otherwise, the original candidate was selected to remain a neighbor - no need to + // update. + } else { + candidates.pop(); + orig_candidates.pop(); + if (orig_candidate == new_node_id) { + cur_node_chosen = true; + } + } + } + + // Acquire all relevant locks for making the updates for the selected neighbor - all its removed + // neighbors, along with the neighbors itself and the cur node. + // but first, we release the node and neighbors lock to avoid deadlocks. + node_lock.unlock(); + neighbor_lock.unlock(); - for (auto cur_neighbor = selectedNeighbors.rbegin(); - cur_neighbor != selectedNeighbors.rend(); ++cur_neighbor) { + nodes_to_update.push_back(selected_neighbor); + nodes_to_update.push_back(new_node_id); - assert(*ll_cur == 0 && "Possible memory corruption"); - assert(level <= element_levels_[*cur_neighbor] && - "Trying to make a link on a non-existent level"); + std::sort(nodes_to_update.begin(), nodes_to_update.end()); + size_t nodes_to_update_count = nodes_to_update.size(); + std::unique_lock locks[nodes_to_update_count]; + for (size_t i = 0; i < nodes_to_update_count; i++) { + locks[i] = std::unique_lock(element_neighbors_locks_[nodes_to_update[i]]); + } + + auto *neighbour_incoming_edges = getIncomingEdgesPtr(selected_neighbor, level); + size_t neighbor_neighbors_count = getNodeNeighborsCount(neighbor_neighbors_list); - *ll_cur = *cur_neighbor; - ll_cur++; + size_t neighbour_neighbours_idx = 0; + bool update_cur_node_required = true; + for (size_t i = 0; i < neighbor_neighbors_count; i++) { + if (!std::binary_search(nodes_to_update.begin(), nodes_to_update.end(), + neighbor_neighbors_list[i])) { + // The neighbor is not in the "to_update" nodes list - leave it as is. + neighbor_neighbors_list[neighbour_neighbours_idx++] = neighbor_neighbors_list[i]; + continue; + } else if (neighbor_neighbors_list[i] == new_node_id) { + // The new node got into the neighbor's neighbours - this means there was an update in + // another thread during between we released and reacquire the locks - leave it + // as is. + neighbor_neighbors_list[neighbour_neighbours_idx++] = neighbor_neighbors_list[i]; + update_cur_node_required = false; + continue; + } + // Now we know that we are looking at a node to be removed from the neighbor's neighbors. + auto removed_node = neighbor_neighbors_list[i]; + auto *removed_node_incoming_edges = getIncomingEdgesPtr(removed_node, level); + // Perform the mutual update: + // if the removed node id (the neighbour's neighbour to be removed) + // wasn't pointing to the neighbour (i.e., the edge was uni-directional), + // we should remove the current neighbor from the node's incoming edges. + // otherwise, the edge turned from bidirectional to uni-directional, so we insert it to the + // neighbour's incoming edges set. Note: we assume that every update is performed atomically + // mutually, so it should be sufficient to look at the removed node's incoming edges set + // alone. + if (!removeIdFromList(*removed_node_incoming_edges, selected_neighbor)) { + neighbour_incoming_edges->push_back(removed_node); } + } - auto *incoming_edges = new (this->allocator) vecsim_stl::vector(this->allocator); - setIncomingEdgesPtr(cur_c, level, (void *)incoming_edges); + size_t cur_node_neighbors_count = getNodeNeighborsCount(new_node_neighbors_list); + if (update_cur_node_required && cur_node_neighbors_count < max_M_cur && + !isMarkedDeleted(new_node_id) && !isMarkedDeleted(selected_neighbor)) { + // update the connection between the new node and the neighbor. + new_node_neighbors_list[cur_node_neighbors_count++] = selected_neighbor; + setNodeNeighborsCount(new_node_neighbors_list, cur_node_neighbors_count); + if (cur_node_chosen && neighbour_neighbours_idx < max_M_cur) { + // connection is mutual - both new node and the selected neighbor in each other's list. + neighbor_neighbors_list[neighbour_neighbours_idx++] = new_node_id; + } else { + // unidirectional connection - put the new node in the neighbour's incoming edges. + neighbour_incoming_edges->push_back(new_node_id); + } } + // Done updating the neighbor's neighbors. + setNodeNeighborsCount(neighbor_neighbors_list, neighbour_neighbours_idx); +} - // go over the selected neighbours - selectedNeighbor is the neighbour id - vecsim_stl::vector neighbors_bitmap(this->allocator); - for (idType selectedNeighbor : selectedNeighbors) { -#ifdef ENABLE_PARALLELIZATION - std::unique_lock lock(link_list_locks_[selectedNeighbor]); -#endif - idType *neighbor_neighbors = get_linklist_at_level(selectedNeighbor, level); - linkListSize sz_link_list_other = getListCount(neighbor_neighbors); - - if (sz_link_list_other > Mcurmax) - throw std::runtime_error("Bad value of sz_link_list_other"); - if (selectedNeighbor == cur_c) - throw std::runtime_error("Trying to connect an element to itself"); - if (level > element_levels_[selectedNeighbor]) - throw std::runtime_error("Trying to make a link on a non-existent level"); - - // If the selected neighbor can add another link (hasn't reached the max) - add it. - if (sz_link_list_other < Mcurmax) { - neighbor_neighbors[sz_link_list_other] = cur_c; - setListCount(neighbor_neighbors, sz_link_list_other + 1); +template +idType HNSWIndex::mutuallyConnectNewElement( + idType new_node_id, candidatesMaxHeap &top_candidates, size_t level) { + + // The maximum number of neighbors allowed for an existing neighbor (not new). + size_t max_M_cur = level ? maxM_ : maxM0_; + + // Filter the top candidates to the selected neighbors by the algorithm heuristics. + getNeighborsByHeuristic2(top_candidates, M_); + assert(top_candidates.size() <= M_ && + "Should be not be more than M_ candidates returned by the heuristic"); + + // Hold (distance_from_new_node_id, neighbor_id) pair for every selected neighbor. + vecsim_stl::vector> selected_neighbors(this->allocator); + selected_neighbors.reserve(M_); + while (!top_candidates.empty()) { + selected_neighbors.push_back(top_candidates.top()); + top_candidates.pop(); + } + + // The closest vector that has found to be returned (and start the scan from it in the next + // level). + idType next_closest_entry_point = selected_neighbors.back().second; + idType *new_node_neighbors_list = getNodeNeighborsAtLevel(new_node_id, level); + assert(getNodeNeighborsCount(new_node_neighbors_list) == 0 && + "The newly inserted element should have blank link list"); + + // Create the incoming edges for the new node in the current level. + auto *incoming_edges = new (this->allocator) vecsim_stl::vector(this->allocator); + setIncomingEdgesPtr(new_node_id, level, (void *)incoming_edges); + + for (auto &neighbor_data : selected_neighbors) { + idType selected_neighbor = neighbor_data.second; // neighbor's id + std::unique_lock node_lock; + std::unique_lock neighbor_lock; + idType lower_id = (new_node_id < selected_neighbor) ? new_node_id : selected_neighbor; + if (lower_id == new_node_id) { + node_lock = std::unique_lock(element_neighbors_locks_[new_node_id]); + neighbor_lock = + std::unique_lock(element_neighbors_locks_[selected_neighbor]); } else { - // try finding "weak" elements to replace it with the new one with the heuristic: - candidatesMaxHeap candidates(this->allocator); - // (re)use the bitmap to represent the set of the original neighbours for the current - // selected neighbour. - neighbors_bitmap.assign(cur_element_count, false); - DistType d_max = this->dist_func(getDataByInternalId(cur_c), - getDataByInternalId(selectedNeighbor), this->dim); - candidates.emplace(d_max, cur_c); - // consider cur_c as if it was a link of the selected neighbor - neighbors_bitmap[cur_c] = true; - for (size_t j = 0; j < sz_link_list_other; j++) { - candidates.emplace(this->dist_func(getDataByInternalId(neighbor_neighbors[j]), - getDataByInternalId(selectedNeighbor), - this->dim), - neighbor_neighbors[j]); - neighbors_bitmap[neighbor_neighbors[j]] = true; - } + neighbor_lock = + std::unique_lock(element_neighbors_locks_[selected_neighbor]); + node_lock = std::unique_lock(element_neighbors_locks_[new_node_id]); + } - idType removed_links[sz_link_list_other + 1]; - size_t removed_links_num; - removeExtraLinks(candidates, Mcurmax, neighbor_neighbors, neighbors_bitmap, - removed_links, &removed_links_num); - - // remove the current neighbor from the incoming list of nodes for the - // neighbours that were chosen to remove (if edge wasn't bidirectional) - auto *neighbour_incoming_edges = getIncomingEdgesPtr(selectedNeighbor, level); - for (size_t i = 0; i < removed_links_num; i++) { - idType node_id = removed_links[i]; - auto *node_incoming_edges = getIncomingEdgesPtr(node_id, level); - // if we removed cur_c (the node just inserted), then it points to the current - // neighbour, but not vise versa. - if (node_id == cur_c) { - neighbour_incoming_edges->push_back(cur_c); - continue; - } + // get the updated count - this may change between iterations due to releasing the lock. + linkListSize cur_node_neighbors_count = getNodeNeighborsCount(new_node_neighbors_list); + idType *neighbor_neighbors_list = getNodeNeighborsAtLevel(selected_neighbor, level); + linkListSize neighbor_neighbors_count = getNodeNeighborsCount(neighbor_neighbors_list); - // if the node id (the neighbour's neighbour to be removed) - // wasn't pointing to the neighbour (i.e., the edge was uni-directional), - // we should remove the current neighbor from the node's incoming edges. - // otherwise, the edge turned from bidirectional to - // uni-directional, so we insert it to the neighbour's - // incoming edges set. - auto it = std::find(node_incoming_edges->begin(), node_incoming_edges->end(), - selectedNeighbor); - if (it != node_incoming_edges->end()) { - node_incoming_edges->erase(it); - } else { - neighbour_incoming_edges->push_back(node_id); - } - } + // validations... + assert(cur_node_neighbors_count <= max_M_cur && "Neighbors number exceeds limit"); + assert(selected_neighbor != new_node_id && "Trying to connect an element to itself"); + + if (cur_node_neighbors_count == max_M_cur) { + // The new node cannot add more neighbors + break; } + + // If one of the two nodes has already deleted - skip the operation. + if (isMarkedDeleted(new_node_id) || isMarkedDeleted(selected_neighbor)) { + continue; + } + + // if the neighbor's neighbors list has the capacity to add the new node, make the update + // and finish. + if (neighbor_neighbors_count < max_M_cur) { + new_node_neighbors_list[cur_node_neighbors_count] = selected_neighbor; + setNodeNeighborsCount(new_node_neighbors_list, cur_node_neighbors_count + 1); + neighbor_neighbors_list[neighbor_neighbors_count] = new_node_id; + setNodeNeighborsCount(neighbor_neighbors_list, neighbor_neighbors_count + 1); + continue; + } + + // Otherwise - we need to re-evaluate the neighbor's neighbors. + // We collect all the existing neighbors and the new node as candidates, and mutually update + // the neighbor's neighbors. + revisitNeighborConnections(level, new_node_id, neighbor_data, new_node_neighbors_list, + neighbor_neighbors_list, node_lock, neighbor_lock); } return next_closest_entry_point; } @@ -765,7 +973,7 @@ void HNSWIndex::repairConnectionsForDeletion( // put the deleted element's neighbours in the candidates. candidatesMaxHeap candidates(this->allocator); - linkListSize neighbours_count = getListCount(neighbours); + linkListSize neighbours_count = getNodeNeighborsCount(neighbours); for (size_t j = 0; j < neighbours_count; j++) { // Don't put the neighbor itself in his own candidates if (neighbours[j] == neighbour_id) { @@ -779,7 +987,7 @@ void HNSWIndex::repairConnectionsForDeletion( // add the deleted element's neighbour's original neighbors in the candidates. vecsim_stl::vector neighbour_orig_neighbours_set(cur_element_count, false, this->allocator); - linkListSize neighbour_neighbours_count = getListCount(neighbour_neighbours); + linkListSize neighbour_neighbours_count = getNodeNeighborsCount(neighbour_neighbours); for (size_t j = 0; j < neighbour_neighbours_count; j++) { neighbour_orig_neighbours_set[neighbour_neighbours[j]] = true; @@ -813,16 +1021,13 @@ void HNSWIndex::repairConnectionsForDeletion( // we should remove it from the node's incoming edges. // otherwise, edge turned from bidirectional to one directional, // and it should be saved in the neighbor's incoming edges. - auto it = std::find(node_incoming_edges->begin(), node_incoming_edges->end(), neighbour_id); - if (it != node_incoming_edges->end()) { - node_incoming_edges->erase(it); - } else { + if (!removeIdFromList(*node_incoming_edges, neighbour_id)) { neighbour_incoming_edges->push_back(node_id); } } // updates for the new edges created - linkListSize updated_links_num = getListCount(neighbour_neighbours); + linkListSize updated_links_num = getNodeNeighborsCount(neighbour_neighbours); for (size_t i = 0; i < updated_links_num; i++) { idType node_id = neighbour_neighbours[i]; if (!neighbour_orig_neighbours_set[node_id]) { @@ -830,15 +1035,15 @@ void HNSWIndex::repairConnectionsForDeletion( // if the node has an edge to the neighbour as well, remove it // from the incoming nodes of the neighbour // otherwise, need to update the edge as incoming. - idType *node_links = get_linklist_at_level(node_id, level); - unsigned short node_links_size = getListCount(node_links); + idType *node_links = getNodeNeighborsAtLevel(node_id, level); + unsigned short node_links_size = getNodeNeighborsCount(node_links); bool bidirectional_edge = false; for (size_t j = 0; j < node_links_size; j++) { if (node_links[j] == neighbour_id) { - neighbour_incoming_edges->erase(std::find(neighbour_incoming_edges->begin(), - neighbour_incoming_edges->end(), - node_id)); + // Swap the last element with the current one (equivalent to removing the + // neighbor from the list) - this should always succeed and return true. + removeIdFromList(*neighbour_incoming_edges, node_id); bidirectional_edge = true; break; } @@ -855,49 +1060,91 @@ void HNSWIndex::replaceEntryPoint() { idType old_entry = entrypoint_node_; // Sets an (arbitrary) new entry point, after deleting the current entry point. while (old_entry == entrypoint_node_) { - idType *top_level_list = get_linklist_at_level(old_entry, maxlevel_); - if (getListCount(top_level_list) > 0) { - // Tries to set the (arbitrary) first neighbor as the entry point, if exists. - entrypoint_node_ = *top_level_list; - } else { - // If there is no neighbors in the current level, check for any vector at - // this level to be the new entry point. - for (idType cur_id = 0; cur_id < cur_element_count; cur_id++) { - if (element_levels_[cur_id] == maxlevel_ && cur_id != old_entry) { + // Use volatile for this variable, so that in case we would have to busy wait for this + // element to finish its indexing, the compiler will not use optimizations. Otherwise, + // the compiler might evaluate 'isInProcess(candidate_in_process)' once instead of calling + // it multiple times in a busy wait manner, and we'll run into an infinite loop if the + // candidate is in process when we reach the loop. + volatile idType candidate_in_process = INVALID_ID; + { + // Go over the entry point's neighbors at the top level. + std::unique_lock lock(this->element_neighbors_locks_[entrypoint_node_]); + idType *top_level_list = getNodeNeighborsAtLevel(old_entry, max_level_); + auto neighbors_count = getNodeNeighborsCount(top_level_list); + // Tries to set the (arbitrary) first neighbor as the entry point which is not deleted, + // if exists. + for (size_t i = 0; i < neighbors_count; i++) { + if (!isMarkedDeleted(top_level_list[i])) { + if (!isInProcess(top_level_list[i])) { + entrypoint_node_ = top_level_list[i]; + return; + } else { + // Store this candidate which is currently being inserted into the graph in + // case we won't find other candidate at the top level. + candidate_in_process = top_level_list[i]; + } + } + } + } + // If there is no neighbors in the current level, check for any vector at + // this level to be the new entry point. + for (idType cur_id = 0; cur_id < cur_element_count; cur_id++) { + if (element_levels_[cur_id] == max_level_ && cur_id != old_entry && + !isMarkedDeleted(cur_id)) { + // Found a non element in the current max level. + if (!isInProcess(cur_id)) { entrypoint_node_ = cur_id; - break; + return; + } else if (candidate_in_process == INVALID_ID) { + // This element is still in process, and there hasn't been another candidate in + // process that has found in this level. + candidate_in_process = cur_id; } } } - // If we didn't find any vector at the top level, decrease the maxlevel_ and try again, + // If we only found candidates which are in process at this level, do busy wait until they + // are done being processed (this should happen in very rare cases...). Since + // candidate_in_process was declared volatile, we can be sure that isInProcess is called in + // every iteration. + if (candidate_in_process != INVALID_ID) { + while (isInProcess(candidate_in_process)) + ; + entrypoint_node_ = candidate_in_process; + return; + } + // If we didn't find any vector at the top level, decrease the max_level_ and try again, // until we find a new entry point, or the index is empty. - if (old_entry == entrypoint_node_) { - maxlevel_--; - if ((int)maxlevel_ < 0) { - maxlevel_ = HNSW_INVALID_LEVEL; - entrypoint_node_ = HNSW_INVALID_ID; - } + assert(old_entry == entrypoint_node_); + max_level_--; + if ((int)max_level_ < 0) { + max_level_ = HNSW_INVALID_LEVEL; + entrypoint_node_ = INVALID_ID; } } } template +template void HNSWIndex::SwapLastIdWithDeletedId(idType element_internal_id) { - // swap label - replaceIdOfLabel(getExternalLabel(cur_element_count), element_internal_id, cur_element_count); + // Swap label - this is relevant when the last element's label exists (it is not marked as + // deleted). For inplace delete, this is always the case. + if (!has_marked_deleted || !isMarkedDeleted(cur_element_count)) { + replaceIdOfLabel(getExternalLabel(cur_element_count), element_internal_id, + cur_element_count); + } - // swap neighbours + // Swap neighbours size_t last_element_top_level = element_levels_[cur_element_count]; for (size_t level = 0; level <= last_element_top_level; level++) { - idType *neighbours = get_linklist_at_level(cur_element_count, level); - linkListSize neighbours_count = getListCount(neighbours); + idType *neighbours = getNodeNeighborsAtLevel(cur_element_count, level); + linkListSize neighbours_count = getNodeNeighborsCount(neighbours); - // go over the neighbours that also points back to the last element whose is going to + // Go over the neighbours that also points back to the last element whose is going to // change, and update the id. for (size_t i = 0; i < neighbours_count; i++) { idType neighbour_id = neighbours[i]; - idType *neighbour_neighbours = get_linklist_at_level(neighbour_id, level); - linkListSize neighbour_neighbours_count = getListCount(neighbour_neighbours); + idType *neighbour_neighbours = getNodeNeighborsAtLevel(neighbour_id, level); + linkListSize neighbour_neighbours_count = getNodeNeighborsCount(neighbour_neighbours); bool bidirectional_edge = false; for (size_t j = 0; j < neighbour_neighbours_count; j++) { @@ -909,25 +1156,23 @@ void HNSWIndex::SwapLastIdWithDeletedId(idType element_inter } } - // if this edge is uni-directional, we should update the id in the neighbor's + // If this edge is uni-directional, we should update the id in the neighbor's // incoming edges. if (!bidirectional_edge) { auto *neighbour_incoming_edges = getIncomingEdgesPtr(neighbour_id, level); - auto it = std::find(neighbour_incoming_edges->begin(), - neighbour_incoming_edges->end(), cur_element_count); - assert(it != neighbour_incoming_edges->end()); - neighbour_incoming_edges->erase(it); + // This should always succeed and return true. + removeIdFromList(*neighbour_incoming_edges, cur_element_count); neighbour_incoming_edges->push_back(element_internal_id); } } - // next, go over the rest of incoming edges (the ones that are not bidirectional) and make + // Next, go over the rest of incoming edges (the ones that are not bidirectional) and make // updates. auto *incoming_edges = getIncomingEdgesPtr(cur_element_count, level); for (auto incoming_edge : *incoming_edges) { - idType *incoming_neighbour_neighbours = get_linklist_at_level(incoming_edge, level); + idType *incoming_neighbour_neighbours = getNodeNeighborsAtLevel(incoming_edge, level); linkListSize incoming_neighbour_neighbours_count = - getListCount(incoming_neighbour_neighbours); + getNodeNeighborsCount(incoming_neighbour_neighbours); for (size_t j = 0; j < incoming_neighbour_neighbours_count; j++) { if (incoming_neighbour_neighbours[j] == cur_element_count) { incoming_neighbour_neighbours[j] = element_internal_id; @@ -937,18 +1182,18 @@ void HNSWIndex::SwapLastIdWithDeletedId(idType element_inter } } - // swap the last_id level 0 data, and invalidate the deleted id's data + // Swap the last_id level 0 data, and invalidate the deleted id's data. memcpy(data_level0_memory_ + element_internal_id * size_data_per_element_ + offsetLevel0_, data_level0_memory_ + cur_element_count * size_data_per_element_ + offsetLevel0_, size_data_per_element_); memset(data_level0_memory_ + cur_element_count * size_data_per_element_ + offsetLevel0_, 0, size_data_per_element_); - // swap pointer of higher levels links + // Swap pointer of higher levels links. linkLists_[element_internal_id] = linkLists_[cur_element_count]; linkLists_[cur_element_count] = nullptr; - // swap top element level + // Swap top element level. element_levels_[element_internal_id] = element_levels_[cur_element_count]; element_levels_[cur_element_count] = HNSW_INVALID_LEVEL; @@ -959,41 +1204,106 @@ void HNSWIndex::SwapLastIdWithDeletedId(idType element_inter // This function is greedily searching for the closest candidate to the given data point at the // given level, starting at the given node. It sets `curObj` to the closest node found, and -// `curDist` to the distance to this node. If `with_timeout` is true, the search will check for -// timeout and return if it has occurred. `timeoutCtx` and `rc` must be valid if `with_timeout` is -// true. +// `curDist` to the distance to this node. If `running_query` is true, the search will check for +// timeout and return if it has occurred. `timeoutCtx` and `rc` must be valid if `running_query` is +// true. *Note that we assume that level is higher than 0*. Also, if we're not running a query (we +// are searching neighbors for a new vector), then bestCand should be a non-deleted element! template -template +template void HNSWIndex::greedySearchLevel(const void *vector_data, size_t level, - idType &curObj, DistType &curDist, + idType &bestCand, DistType &curDist, void *timeoutCtx, VecSimQueryResult_Code *rc) const { bool changed; + // Don't allow choosing a deleted node as an entry point upon searching for neighbors + // candidates (that is, we're NOT running a query, but inserting a new vector). + idType bestNonDeletedCand = bestCand; + do { - if (with_timeout && VECSIM_TIMEOUT(timeoutCtx)) { + if (running_query && VECSIM_TIMEOUT(timeoutCtx)) { *rc = VecSim_QueryResult_TimedOut; - curObj = HNSW_INVALID_ID; + bestCand = INVALID_ID; return; } + changed = false; -#ifdef ENABLE_PARALLELIZATION - std::unique_lock lock(link_list_locks_[currObj]); -#endif - idType *node_links = get_linklist(curObj, level); - linkListSize links_count = getListCount(node_links); + std::unique_lock lock(element_neighbors_locks_[bestCand]); + idType *node_links = getNodeNeighborsAtNonBaseLevel(bestCand, level); + linkListSize links_count = getNodeNeighborsCount(node_links); for (int i = 0; i < links_count; i++) { idType candidate = node_links[i]; assert(candidate < this->cur_element_count && "candidate error: out of index range"); - + if (isInProcess(candidate)) { + continue; + } DistType d = this->dist_func(vector_data, getDataByInternalId(candidate), this->dim); if (d < curDist) { curDist = d; - curObj = candidate; + bestCand = candidate; changed = true; + // Run this code only for non-query code - update the best non deleted cand as well. + // Upon running a query, we don't mind having a deleted element as an entry point + // for the next level, as eventually we return non-deleted elements in level 0. + if (!running_query && !isMarkedDeleted(candidate)) { + bestNonDeletedCand = bestCand; + } } } } while (changed); + if (!running_query) { + bestCand = bestNonDeletedCand; + } +} + +template +vecsim_stl::vector +HNSWIndex::safeCollectAllNodeIncomingNeighbors(idType node_id, + size_t node_top_level) { + vecsim_stl::vector incoming_neighbors(this->allocator); + + for (size_t level = 0; level <= node_top_level; level++) { + // Save the node neighbor's in the current level while holding its neighbors lock. + std::vector neighbors_copy; + std::unique_lock element_lock(element_neighbors_locks_[node_id]); + auto *neighbours = getNodeNeighborsAtLevel(node_id, level); + unsigned short neighbours_count = getNodeNeighborsCount(neighbours); + // Store the deleted element's neighbours. + neighbors_copy.assign(neighbours, neighbours + neighbours_count); + element_lock.unlock(); + + // Go over the neighbours and collect tho ones that also points back to the removed node. + for (auto neighbour_id : neighbors_copy) { + // Hold the neighbor's lock while we are going over its neighbors. + std::unique_lock neighbor_lock(element_neighbors_locks_[neighbour_id]); + auto *neighbour_neighbours = getNodeNeighborsAtLevel(neighbour_id, level); + unsigned short neighbour_neighbours_count = getNodeNeighborsCount(neighbour_neighbours); + for (size_t j = 0; j < neighbour_neighbours_count; j++) { + // A bidirectional edge was found - this connection should be repaired. + if (neighbour_neighbours[j] == node_id) { + incoming_neighbors.emplace_back(neighbour_id, (ushort)level); + break; + } + } + } + + // Next, collect the rest of incoming edges (the ones that are not bidirectional) in the + // current level to repair them. + element_lock.lock(); + auto *incoming_edges = getIncomingEdgesPtr(node_id, level); + // Note that the deleted element might be in the process of indexing into the graph in the + // meantime (in async mode). Since the incoming_edges lists in every level are allocated + // while the element is being indexed into that level (in lazy mode), we may find ourselves + // in a situation where the incoming edges was not allocated yet in this level (but we do + // guarantee that the pointer is NULL in that case). In which case, we just continue. We + // also validate that we won't add new edges to a deleted node later on. + if (!incoming_edges) + continue; + for (auto incoming_edge : *incoming_edges) { + incoming_neighbors.emplace_back(incoming_edge, (ushort)level); + } + } + return incoming_neighbors; } template @@ -1002,9 +1312,8 @@ void HNSWIndex::resizeIndexInternal(size_t new_max_elements) element_levels_.shrink_to_fit(); resizeLabelLookup(new_max_elements); visited_nodes_handler_pool.resize(new_max_elements); -#ifdef ENABLE_PARALLELIZATION - std::vector(new_max_elements).swap(link_list_locks_); -#endif + vecsim_stl::vector(new_max_elements, this->allocator) + .swap(element_neighbors_locks_); // Reallocate base layer char *data_level0_memory_new = (char *)this->allocator->reallocate( data_level0_memory_, new_max_elements * size_data_per_element_); @@ -1022,6 +1331,217 @@ void HNSWIndex::resizeIndexInternal(size_t new_max_elements) max_elements_ = new_max_elements; } +template +void HNSWIndex::mutuallyUpdateForRepairedNode( + idType node_id, size_t level, vecsim_stl::vector &neighbors_to_remove, + vecsim_stl::vector &nodes_to_update, vecsim_stl::vector &chosen_neighbors, + size_t max_M_cur) { + // Sort the nodes to remove set for fast lookup. + std::sort(neighbors_to_remove.begin(), neighbors_to_remove.end()); + + // Acquire the required locks for the updates, after sorting the nodes to update + // (to avoid deadlocks) + nodes_to_update.push_back(node_id); + std::sort(nodes_to_update.begin(), nodes_to_update.end()); + size_t nodes_to_update_count = nodes_to_update.size(); + std::unique_lock locks[nodes_to_update_count]; + for (size_t i = 0; i < nodes_to_update_count; i++) { + locks[i] = std::unique_lock(element_neighbors_locks_[nodes_to_update[i]]); + } + + idType *node_neighbors = getNodeNeighborsAtLevel(node_id, level); + linkListSize node_neighbors_count = getNodeNeighborsCount(node_neighbors); + auto *node_incoming_edges = getIncomingEdgesPtr(node_id, level); + + // Perform mutual updates: go over the node's neighbors and overwrite the neighbors to remove + // that are still exist. + size_t node_neighbors_idx = 0; + for (size_t i = 0; i < node_neighbors_count; i++) { + if (!std::binary_search(nodes_to_update.begin(), nodes_to_update.end(), + node_neighbors[i])) { + // The repaired node added a new neighbor that we didn't account for before in the + // meantime - leave it as is. + node_neighbors[node_neighbors_idx++] = node_neighbors[i]; + continue; + } + // Check if the current neighbor is in the chosen neighbors list, and remove it from there + // if so. + if (removeIdFromList(chosen_neighbors, node_neighbors[i])) { + // A chosen neighbor is already connected to the node - leave it as is. + node_neighbors[node_neighbors_idx++] = node_neighbors[i]; + continue; + } + // Now we know that we are looking at a neighbor that needs to be removed. + auto removed_node = node_neighbors[i]; + auto *removed_node_incoming_edges = getIncomingEdgesPtr(removed_node, level); + // Perform the mutual update: + // if the removed node id (the node's neighbour to be removed) + // wasn't pointing to the node (i.e., the edge was uni-directional), + // we should remove the current neighbor from the node's incoming edges. + // otherwise, the edge turned from bidirectional to uni-directional, so we insert it to the + // neighbour's incoming edges set. Note: we assume that every update is performed atomically + // mutually, so it should be sufficient to look at the removed node's incoming edges set + // alone. + if (!removeIdFromList(*removed_node_incoming_edges, node_id)) { + node_incoming_edges->push_back(removed_node); + } + } + + // Go over the chosen new neighbors that are not connected yet and perform updates. + for (auto chosen_id : chosen_neighbors) { + if (node_neighbors_idx == max_M_cur) { + // Cannot add more new neighbors, we reached the capacity. + this->log("Couldn't add all the chosen new nodes upon updating %u, as we reached the" + " maximum number of neighbors per node", + node_id); + break; + } + // We don't add new neighbors for deleted nodes - if node_id is deleted we can finish. + // Also, don't add new neighbors to a node who is currently being indexed in parallel, as it + // may choose the same element as its neighbor right after the repair is done and connect it + // to it, and have a duplicate neighbor as a result. + if (isMarkedDeleted(node_id) || isInProcess(node_id)) { + break; + } + // If this specific new neighbor is deleted, we don't add this connection and continue. + // Also, don't add a new node whose being indexed in parallel, as it may choose this node + // as its neighbor and create a double connection (then this node will have a duplicate + // neighbor). + if (isMarkedDeleted(chosen_id) || isInProcess(chosen_id)) { + continue; + } + auto *new_neighbor_incoming_edges = getIncomingEdgesPtr(chosen_id, level); + node_neighbors[node_neighbors_idx++] = chosen_id; + // If the node is in the chosen new node incoming edges, there is a unidirectional + // connection from the chosen node to the repaired node that turns into bidirectional. Then, + // remove it from the incoming edges set. Otherwise, the edge is created unidirectional, so + // we add it to the unidirectional edges set. Note: we assume that all updates occur + // mutually and atomically, then can rely on this assumption. + if (!removeIdFromList(*node_incoming_edges, chosen_id)) { + new_neighbor_incoming_edges->push_back(node_id); + } + } + // Done updating the node's neighbors. + setNodeNeighborsCount(node_neighbors, node_neighbors_idx); +} + +template +void HNSWIndex::repairNodeConnections(idType node_id, size_t level) { + + candidatesMaxHeap neighbors_candidates(this->allocator); + // Use bitmaps for fast accesses: + // node_orig_neighbours_set is used to diffrentiate between the neighboes that will *not* be + // selected by the heuritics - only the ones that were originally neighbors should be removed. + vecsim_stl::vector node_orig_neighbours_set(max_elements_, false, this->allocator); + // neighbors_candidates_set is used to store the nodes that were already collected as + // candidates, so we will not collect them again as candidates if we run into them from another + // path. + vecsim_stl::vector neighbors_candidates_set(max_elements_, false, this->allocator); + vecsim_stl::vector deleted_neighbors(this->allocator); + + // Go over the repaired node neighbors, collect the non-deleted ones to be neighbors candidates + // after the repair as well. + { + std::unique_lock node_lock(element_neighbors_locks_[node_id]); + idType *node_neighbors = getNodeNeighborsAtLevel(node_id, level); + linkListSize node_neighbors_count = getNodeNeighborsCount(node_neighbors); + for (size_t j = 0; j < node_neighbors_count; j++) { + node_orig_neighbours_set[node_neighbors[j]] = true; + // Don't add the removed element to the candidates. + if (isMarkedDeleted(node_neighbors[j])) { + deleted_neighbors.push_back(node_neighbors[j]); + continue; + } + neighbors_candidates_set[node_neighbors[j]] = true; + neighbors_candidates.emplace(this->dist_func(getDataByInternalId(node_id), + getDataByInternalId(node_neighbors[j]), + this->dim), + node_neighbors[j]); + } + } + // If there are not deleted neighbors at that point the repair job has already been made by + // another parallel job, and there is no need to repair the node anymore. + if (deleted_neighbors.empty()) { + return; + } + + // Hold 3 sets of nodes - all the original neighbors at that point to later (potentially) + // update, subset of these which are the chosen neighbors nodes, and a subset of the original + // neighbors that are going to be removed. + vecsim_stl::vector nodes_to_update(this->allocator); + vecsim_stl::vector chosen_neighbors(this->allocator); + vecsim_stl::vector neighbors_to_remove(this->allocator); + + // Go over the deleted nodes and collect their neighbors to the candidates set. + for (idType deleted_neighbor_id : deleted_neighbors) { + nodes_to_update.push_back(deleted_neighbor_id); + neighbors_to_remove.push_back(deleted_neighbor_id); + + std::unique_lock neighbor_lock( + this->element_neighbors_locks_[deleted_neighbor_id]); + idType *neighbor_neighbours = getNodeNeighborsAtLevel(deleted_neighbor_id, level); + linkListSize neighbor_neighbours_count = getNodeNeighborsCount(neighbor_neighbours); + + for (size_t j = 0; j < neighbor_neighbours_count; j++) { + // Don't add removed elements to the candidates, nor nodes that are already in the + // candidates set, nor the original node to repair itself. + if (isMarkedDeleted(neighbor_neighbours[j]) || + neighbors_candidates_set[neighbor_neighbours[j]] || + neighbor_neighbours[j] == node_id) { + continue; + } + neighbors_candidates_set[neighbor_neighbours[j]] = true; + neighbors_candidates.emplace( + this->dist_func(getDataByInternalId(node_id), + getDataByInternalId(neighbor_neighbours[j]), this->dim), + neighbor_neighbours[j]); + } + } + + // Copy the original candidates, and run the heuristics. Afterwards, neighbors_candidates will + // store the newly selected neighbours (for the node), while candidates which were originally + // neighbors and are not going to be selected, are going to be removed. + auto orig_candidates = neighbors_candidates; + size_t max_M_cur = level ? maxM_ : maxM0_; + getNeighborsByHeuristic2(neighbors_candidates, max_M_cur); + + while (!orig_candidates.empty()) { + idType orig_candidate = orig_candidates.top().second; + if (neighbors_candidates.empty() || orig_candidate != neighbors_candidates.top().second) { + if (node_orig_neighbours_set[orig_candidate]) { + neighbors_to_remove.push_back(orig_candidate); + nodes_to_update.push_back(orig_candidate); + } + orig_candidates.pop(); + } else { + chosen_neighbors.push_back(orig_candidate); + nodes_to_update.push_back(orig_candidate); + neighbors_candidates.pop(); + orig_candidates.pop(); + } + } + + // Perform the actual updates for the node and the impacted neighbors while holding the nodes' + // locks. + mutuallyUpdateForRepairedNode(node_id, level, neighbors_to_remove, nodes_to_update, + chosen_neighbors, max_M_cur); +} + +template +inline bool +HNSWIndex::removeIdFromList(vecsim_stl::vector &element_ids_list, + idType element_id) { + auto it = std::find(element_ids_list.begin(), element_ids_list.end(), element_id); + if (it != element_ids_list.end()) { + // Swap the last element with the current one (equivalent to removing the element id from + // the list). + *it = element_ids_list.back(); + element_ids_list.pop_back(); + return true; + } + return false; +} + /** * Ctor / Dtor */ @@ -1038,20 +1558,12 @@ void HNSWIndex::resizeIndexInternal(size_t new_max_elements) } HNSWParams; */ template HNSWIndex::HNSWIndex(const HNSWParams *params, - std::shared_ptr allocator, + const AbstractIndexInitParams &abstractInitParams, size_t random_seed, size_t pool_initial_size) - : VecSimIndexAbstract(allocator, params->dim, params->type, params->metric, - params->blockSize, params->multi), - VecSimIndexTombstone(), max_elements_(params->initialCapacity), - data_size_(VecSimType_sizeof(params->type) * this->dim), - element_levels_(max_elements_, allocator), - visited_nodes_handler_pool(pool_initial_size, max_elements_, this->allocator) - -#ifdef ENABLE_PARALLELIZATION - , - link_list_locks_(max_elements_) -#endif -{ + : VecSimIndexAbstract(abstractInitParams), VecSimIndexTombstone(), + max_elements_(params->initialCapacity), element_levels_(max_elements_, this->allocator), + visited_nodes_handler_pool(pool_initial_size, max_elements_, this->allocator), + element_neighbors_locks_(max_elements_, this->allocator) { size_t M = params->M ? params->M : HNSW_DEFAULT_M; if (M > UINT16_MAX / 2) throw std::runtime_error("HNSW index parameter M is too large: argument overflow"); @@ -1068,8 +1580,8 @@ HNSWIndex::HNSWIndex(const HNSWParams *params, num_marked_deleted = 0; // initializations for special treatment of the first node - entrypoint_node_ = HNSW_INVALID_ID; - maxlevel_ = HNSW_INVALID_LEVEL; + entrypoint_node_ = INVALID_ID; + max_level_ = HNSW_INVALID_LEVEL; if (M <= 1) throw std::runtime_error("HNSW index parameter M cannot be 1"); @@ -1077,18 +1589,18 @@ HNSWIndex::HNSWIndex(const HNSWParams *params, level_generator_.seed(random_seed); // data_level0_memory will look like this: - // | ---2--- | -----2----- | -----4*M0----------- | ---------8-------- |-data_size_-| ---8--- | + // | ---2--- | -----2----- | -----4*M0----------- | ---------8-------- |-data_size-| ---8--- | // | | | ... || |