diff --git a/.github/wordlist.txt b/.github/wordlist.txt
index 8851c5f67..0bd257ec1 100644
--- a/.github/wordlist.txt
+++ b/.github/wordlist.txt
@@ -1,29 +1,51 @@
+AVX
+BatchIterator
+DQ
+Datatypes
+FP
 HDF
 HNSW
+KNN
+RediSearch
+RedisAI
+SIMD
 TBD
+TopK
 VSCode
+VecSimBasics
+VecSimGeneral
+VecSimUpdatedIndex
 VectorSimilarity
 ZSH
+allocators
+ann
 benchmarked
 benchmarking
-byndings
+bm
 cmake
+cpp
+dataset
+datasets
+destructor
 devcontainer
 dir
+enum
+fp
+frac
 gcc
 github
 gnist
 hnsw
+hnswlib
 mnist
 neighbor
 pre
 py
 repo
+runtime
+templated
 tox
 valgrind
+vecsim
 virtualenv
 whl
-datasets
-runtime
-RedisAI
-dataset
diff --git a/.github/workflows/arm.yml b/.github/workflows/arm.yml
index 1f831ff00..fea7e637c 100644
--- a/.github/workflows/arm.yml
+++ b/.github/workflows/arm.yml
@@ -24,7 +24,7 @@ jobs:
           github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
           # Ubuntu 22.04 region AMI for ARM
           ec2-image-id: ami-062b37d89f25c958f
-          ec2-instance-type: t4g.small
+          ec2-instance-type: t4g.medium
           subnet-id: ${{ secrets.AWS_EC2_SUBNET_ID }}
           security-group-id: ${{ secrets.AWS_EC2_SG_ID }}
 
diff --git a/Makefile b/Makefile
index 55c39d647..9e144dd3d 100644
--- a/Makefile
+++ b/Makefile
@@ -71,6 +71,7 @@ make clean         # remove binary files
 make unit_test     # run unit tests
   CTEST_ARGS=args    # extra CTest arguments
   VG|VALGRIND=1      # run tests with valgrind
+  FP_64=1            # run tests with 64-bit floating point
 make valgrind      # build for Valgrind and run tests
 make flow_test     # run flow tests (with pytest)
   TEST=file::name    # run specific test
@@ -124,6 +125,11 @@ ifeq ($(VERBOSE),1)
 CMAKE_FLAGS += -DCMAKE_VERBOSE_MAKEFILE=on
 endif
 
+# CMake flags for fp64 unit tests
+ifeq ($(FP_64),1)
+CMAKE_FLAGS += -DFP64_TESTS=on
+endif
+
 CMAKE_FLAGS += \
 	-Wno-deprecated \
 	-DCMAKE_WARN_DEPRECATED=OFF \
diff --git a/setup.py b/setup.py
index 9da9ca609..8d3819388 100644
--- a/setup.py
+++ b/setup.py
@@ -92,6 +92,5 @@ def build_extension(self, ext):
     description="Python library around collection of vector similarity algorithm",
     long_description="",
     ext_modules=[CMakeExtension("VecSim", "src/python_bindings")],
-    py_modules=['src/python_bindings/Mybytearray'],
     cmdclass={"build_ext": CMakeBuild}
 )
diff --git a/src/VecSim/CMakeLists.txt b/src/VecSim/CMakeLists.txt
index 5967e3c5e..54986b9ff 100644
--- a/src/VecSim/CMakeLists.txt
+++ b/src/VecSim/CMakeLists.txt
@@ -15,8 +15,10 @@ add_subdirectory(spaces)
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror -Wall")
 
 add_library(VectorSimilarity ${VECSIM_LIBTYPE}
-    algorithms/brute_force/brute_force_factory.cpp
-    algorithms/hnsw/hnsw_factory.cpp
+    index_factories/brute_force_factory.cpp
+    index_factories/hnsw_factory.cpp
+    index_factories/tiered_factory.cpp
+    index_factories/index_factory.cpp
     algorithms/brute_force/vector_block.cpp
     algorithms/hnsw/visited_nodes_handler.cpp
     vec_sim.cpp
diff --git a/src/VecSim/algorithms/brute_force/bf_batch_iterator.h b/src/VecSim/algorithms/brute_force/bf_batch_iterator.h
index 68ea25c9e..76e47ff87 100644
--- a/src/VecSim/algorithms/brute_force/bf_batch_iterator.h
+++ b/src/VecSim/algorithms/brute_force/bf_batch_iterator.h
@@ -23,6 +23,8 @@ template <typename DataType, typename DistType>
 class BF_BatchIterator : public VecSimBatchIterator {
 protected:
     const BruteForceIndex<DataType, DistType> *index;
+    size_t index_label_count; // number of labels in the index when calculating the scores,
+                              // which is the only time we access the index.
     vecsim_stl::vector<pair<DistType, labelType>> scores; // vector of scores for every label.
     size_t scores_valid_start_pos; // the first index in the scores vector that contains a vector
                                    // that hasn't been returned already.
@@ -56,13 +58,15 @@ template <typename DataType, typename DistType>
 VecSimQueryResult_List
 BF_BatchIterator<DataType, DistType>::searchByHeuristics(size_t n_res,
                                                          VecSimQueryResult_Order order) {
-    if ((this->index->indexLabelCount() - this->getResultsCount()) / 1000 > n_res) {
+    if ((this->index_label_count - this->getResultsCount()) / 1000 > n_res) {
         // Heap based search always returns the results ordered by score
         return this->heapBasedSearch(n_res);
     }
     VecSimQueryResult_List rl = this->selectBasedSearch(n_res);
     if (order == BY_SCORE) {
         sort_results_by_score(rl);
+    } else if (order == BY_SCORE_THEN_ID) {
+        sort_results_by_score_then_id(rl);
     }
     return rl;
 }
@@ -167,17 +171,17 @@ BF_BatchIterator<DataType, DistType>::BF_BatchIterator(
     void *query_vector, const BruteForceIndex<DataType, DistType> *bf_index,
     VecSimQueryParams *queryParams, std::shared_ptr<VecSimAllocator> allocator)
     : VecSimBatchIterator(query_vector, queryParams ? queryParams->timeoutCtx : nullptr, allocator),
-      index(bf_index), scores(allocator), scores_valid_start_pos(0) {}
+      index(bf_index), index_label_count(index->indexLabelCount()), scores(allocator),
+      scores_valid_start_pos(0) {}
 
 template <typename DataType, typename DistType>
 VecSimQueryResult_List
 BF_BatchIterator<DataType, DistType>::getNextResults(size_t n_res, VecSimQueryResult_Order order) {
-    assert((order == BY_ID || order == BY_SCORE) &&
-           "Possible order values are only 'BY_ID' or 'BY_SCORE'");
     // Only in the first iteration we need to compute all the scores
     if (this->scores.empty()) {
         assert(getResultsCount() == 0);
 
+        // The only time we access the index. This function also updates the iterator's label count.
         auto rc = calculateScores();
 
         if (VecSim_OK != rc) {
@@ -198,8 +202,8 @@ BF_BatchIterator<DataType, DistType>::getNextResults(size_t n_res, VecSimQueryRe
 
 template <typename DataType, typename DistType>
 bool BF_BatchIterator<DataType, DistType>::isDepleted() {
-    assert(this->getResultsCount() <= this->index->indexLabelCount());
-    bool depleted = this->getResultsCount() == this->index->indexLabelCount();
+    assert(this->getResultsCount() <= this->index_label_count);
+    bool depleted = this->getResultsCount() == this->index_label_count;
     return depleted;
 }
 
diff --git a/src/VecSim/algorithms/brute_force/bfm_batch_iterator.h b/src/VecSim/algorithms/brute_force/bfm_batch_iterator.h
index 5fa4bb497..ff4bc8e75 100644
--- a/src/VecSim/algorithms/brute_force/bfm_batch_iterator.h
+++ b/src/VecSim/algorithms/brute_force/bfm_batch_iterator.h
@@ -20,9 +20,9 @@ class BFM_BatchIterator : public BF_BatchIterator<DataType, DistType> {
 
 private:
     inline VecSimQueryResult_Code calculateScores() override {
-
-        this->scores.reserve(this->index->indexLabelCount());
-        vecsim_stl::unordered_map<labelType, DistType> tmp_scores(this->index->indexLabelCount(),
+        this->index_label_count = this->index->indexLabelCount();
+        this->scores.reserve(this->index_label_count);
+        vecsim_stl::unordered_map<labelType, DistType> tmp_scores(this->index_label_count,
                                                                   this->allocator);
         vecsim_stl::vector<VectorBlock *> blocks = this->index->getVectorBlocks();
         VecSimQueryResult_Code rc;
diff --git a/src/VecSim/algorithms/brute_force/bfs_batch_iterator.h b/src/VecSim/algorithms/brute_force/bfs_batch_iterator.h
index dac7d3819..9e77d1a7e 100644
--- a/src/VecSim/algorithms/brute_force/bfs_batch_iterator.h
+++ b/src/VecSim/algorithms/brute_force/bfs_batch_iterator.h
@@ -20,8 +20,8 @@ class BFS_BatchIterator : public BF_BatchIterator<DataType, DistType> {
 
 private:
     inline VecSimQueryResult_Code calculateScores() override {
-
-        this->scores.reserve(this->index->indexLabelCount());
+        this->index_label_count = this->index->indexLabelCount();
+        this->scores.reserve(this->index_label_count);
         vecsim_stl::vector<VectorBlock *> blocks = this->index->getVectorBlocks();
         VecSimQueryResult_Code rc;
 
diff --git a/src/VecSim/algorithms/brute_force/brute_force.h b/src/VecSim/algorithms/brute_force/brute_force.h
index 4ff435451..0058f2b2c 100644
--- a/src/VecSim/algorithms/brute_force/brute_force.h
+++ b/src/VecSim/algorithms/brute_force/brute_force.h
@@ -11,7 +11,7 @@
 #include "VecSim/spaces/spaces.h"
 #include "VecSim/utils/vecsim_stl.h"
 #include "VecSim/utils/vecsim_results_container.h"
-#include "VecSim/algorithms/brute_force/brute_force_factory.h"
+#include "VecSim/index_factories/brute_force_factory.h"
 #include "VecSim/spaces/spaces.h"
 #include "VecSim/query_result_struct.h"
 #include "VecSim/utils/vec_utils.h"
@@ -33,7 +33,7 @@ class BruteForceIndex : public VecSimIndexAbstract<DistType> {
     idType count;
 
 public:
-    BruteForceIndex(const BFParams *params, std::shared_ptr<VecSimAllocator> allocator);
+    BruteForceIndex(const BFParams *params, const AbstractIndexInitParams &abstractInitParams);
 
     size_t indexSize() const override;
     size_t indexCapacity() const override;
@@ -45,18 +45,46 @@ class BruteForceIndex : public VecSimIndexAbstract<DistType> {
         return (DataType *)vectorBlocks.at(id / this->blockSize)->getVector(id % this->blockSize);
     }
     virtual VecSimQueryResult_List topKQuery(const void *queryBlob, size_t k,
-                                             VecSimQueryParams *queryParams) override;
-    VecSimQueryResult_List rangeQuery(const void *queryBlob, double radius,
-                                      VecSimQueryParams *queryParams) override;
+                                             VecSimQueryParams *queryParams) const override;
+    virtual VecSimQueryResult_List rangeQuery(const void *queryBlob, double radius,
+                                              VecSimQueryParams *queryParams) const override;
     virtual VecSimIndexInfo info() const override;
     virtual VecSimInfoIterator *infoIterator() const override;
+    VecSimIndexBasicInfo basicInfo() const override;
     virtual VecSimBatchIterator *newBatchIterator(const void *queryBlob,
                                                   VecSimQueryParams *queryParams) const override;
-    bool preferAdHocSearch(size_t subsetSize, size_t k, bool initial_check) override;
+    bool preferAdHocSearch(size_t subsetSize, size_t k, bool initial_check) const override;
     inline labelType getVectorLabel(idType id) const { return idToLabelMapping.at(id); }
 
     inline vecsim_stl::vector<VectorBlock *> getVectorBlocks() const { return vectorBlocks; }
+    inline const labelType getLabelByInternalId(idType internal_id) const {
+        return idToLabelMapping.at(internal_id);
+    }
+    // Remove a specific vector that is stored under a label from the index by its internal id.
+    virtual int deleteVectorById(labelType label, idType id) = 0;
+    // Remove the vector(s) of a label and return a map from each affected internal id to the
+    // original internal id (and the label) of the vector it holds after the removals and swaps.
+    virtual std::unordered_map<idType, std::pair<idType, labelType>>
+    deleteVectorAndGetUpdatedIds(labelType label) = 0;
+    // Check if a certain label exists in the index.
+    virtual inline bool isLabelExists(labelType label) = 0;
+    // Return a set of all labels that are stored in the index (helper for computing label count
+    // without duplicates in tiered index). Caller should hold the flat buffer lock for read.
+    virtual inline vecsim_stl::set<labelType> getLabelsSet() const = 0;
+
     virtual ~BruteForceIndex();
+#ifdef BUILD_TESTS
+    /**
+     * @brief Used for testing - store vector(s) data associated with a given label. This function
+     * copies the vector(s)' data buffer(s) and place it in the output vector
+     *
+     * @param label
+     * @param vectors_output empty vector to be modified, should store the blob(s) associated with
+     * the label.
+     */
+    virtual void getDataByLabel(labelType label,
+                                std::vector<std::vector<DataType>> &vectors_output) const = 0;
+#endif
 
 protected:
     // Private internal function that implements generic single vector insertion.
@@ -74,7 +102,7 @@ class BruteForceIndex : public VecSimIndexAbstract<DistType> {
     }
     // inline priority queue getter that need to be implemented by derived class
     virtual inline vecsim_stl::abstract_priority_queue<DistType, labelType> *
-    getNewMaxPriorityQueue() = 0;
+    getNewMaxPriorityQueue() const = 0;
 
     // inline label to id setters that need to be implemented by derived class
     virtual inline std::unique_ptr<vecsim_stl::abstract_results_container>
@@ -96,11 +124,10 @@ class BruteForceIndex : public VecSimIndexAbstract<DistType> {
 
 /******************** Ctor / Dtor **************/
 template <typename DataType, typename DistType>
-BruteForceIndex<DataType, DistType>::BruteForceIndex(const BFParams *params,
-                                                     std::shared_ptr<VecSimAllocator> allocator)
-    : VecSimIndexAbstract<DistType>(allocator, params->dim, params->type, params->metric,
-                                    params->blockSize, params->multi),
-      idToLabelMapping(allocator), vectorBlocks(allocator), count(0) {
+BruteForceIndex<DataType, DistType>::BruteForceIndex(
+    const BFParams *params, const AbstractIndexInitParams &abstractInitParams)
+    : VecSimIndexAbstract<DistType>(abstractInitParams), idToLabelMapping(this->allocator),
+      vectorBlocks(this->allocator), count(0) {
     assert(VecSimType_sizeof(this->vecType) == sizeof(DataType));
     this->idToLabelMapping.resize(params->initialCapacity);
 }
@@ -135,6 +162,7 @@ void BruteForceIndex<DataType, DistType>::appendVector(const void *vector_data,
         size_t last_block_vectors_count = id % this->blockSize;
         this->idToLabelMapping.resize(
             idToLabelMapping_size + this->blockSize - last_block_vectors_count, 0);
+        this->idToLabelMapping.shrink_to_fit();
     }
 
     // add label to idToLabelMapping
@@ -160,6 +188,7 @@ void BruteForceIndex<DataType, DistType>::removeVector(idType id_to_delete) {
     // If we are *not* trying to remove the last vector, update mapping and move
     // the data of the last vector in the index in place of the deleted vector.
     if (id_to_delete != last_idx) {
+        assert(id_to_delete < last_idx);
         // Update idToLabelMapping.
         // Put the label of the last_id in the deleted_id.
         setVectorLabel(id_to_delete, last_idx_label);
@@ -184,10 +213,11 @@ void BruteForceIndex<DataType, DistType>::removeVector(idType id_to_delete) {
         // Resize and align the idToLabelMapping.
         size_t idToLabel_size = idToLabelMapping.size();
         // If the new size is smaller by at least one block comparing to the idToLabelMapping
-        // align to be a multiplication of blocksize  and resize by one block.
+        // align to be a multiple of the block size and shrink by one block.
         if (this->count + this->blockSize <= idToLabel_size) {
             size_t vector_to_align_count = idToLabel_size % this->blockSize;
             this->idToLabelMapping.resize(idToLabel_size - this->blockSize - vector_to_align_count);
+            this->idToLabelMapping.shrink_to_fit();
         }
     }
 }
@@ -230,7 +260,7 @@ vecsim_stl::vector<DistType> BruteForceIndex<DataType, DistType>::computeBlockSc
 template <typename DataType, typename DistType>
 VecSimQueryResult_List
 BruteForceIndex<DataType, DistType>::topKQuery(const void *queryBlob, size_t k,
-                                               VecSimQueryParams *queryParams) {
+                                               VecSimQueryParams *queryParams) const {
 
     VecSimQueryResult_List rl = {0};
     void *timeoutCtx = queryParams ? queryParams->timeoutCtx : NULL;
@@ -241,14 +271,6 @@ BruteForceIndex<DataType, DistType>::topKQuery(const void *queryBlob, size_t k,
         return rl;
     }
 
-    DataType normalized_blob[this->dim]; // This will be use only if metric == VecSimMetric_Cosine.
-    if (this->metric == VecSimMetric_Cosine) {
-        memcpy(normalized_blob, queryBlob, this->dim * sizeof(DataType));
-        normalizeVector(normalized_blob, this->dim);
-
-        queryBlob = normalized_blob;
-    }
-
     DistType upperBound = std::numeric_limits<DistType>::lowest();
     vecsim_stl::abstract_priority_queue<DistType, labelType> *TopCandidates =
         getNewMaxPriorityQueue();
@@ -289,18 +311,11 @@ BruteForceIndex<DataType, DistType>::topKQuery(const void *queryBlob, size_t k,
 template <typename DataType, typename DistType>
 VecSimQueryResult_List
 BruteForceIndex<DataType, DistType>::rangeQuery(const void *queryBlob, double radius,
-                                                VecSimQueryParams *queryParams) {
+                                                VecSimQueryParams *queryParams) const {
     auto rl = (VecSimQueryResult_List){0};
     void *timeoutCtx = queryParams ? queryParams->timeoutCtx : nullptr;
     this->last_mode = RANGE_QUERY;
 
-    DataType normalized_blob[this->dim]; // This will be use only if metric == VecSimMetric_Cosine.
-    if (this->metric == VecSimMetric_Cosine) {
-        memcpy(normalized_blob, queryBlob, this->dim * sizeof(DataType));
-        normalizeVector(normalized_blob, this->dim);
-        queryBlob = normalized_blob;
-    }
-
     // Compute scores in every block and save results that are within the range.
     auto res_container =
         getNewResultsContainer(10); // Use 10 as the initial capacity for the dynamic array.
@@ -330,16 +345,18 @@ template <typename DataType, typename DistType>
 VecSimIndexInfo BruteForceIndex<DataType, DistType>::info() const {
 
     VecSimIndexInfo info;
+    info.commonInfo = this->getCommonInfo();
+    info.commonInfo.basicInfo.algo = VecSimAlgo_BF;
+
+    return info;
+}
+
+template <typename DataType, typename DistType>
+VecSimIndexBasicInfo BruteForceIndex<DataType, DistType>::basicInfo() const {
+
+    VecSimIndexBasicInfo info = this->getBasicInfo();
     info.algo = VecSimAlgo_BF;
-    info.bfInfo.dim = this->dim;
-    info.bfInfo.type = this->vecType;
-    info.bfInfo.metric = this->metric;
-    info.bfInfo.indexSize = this->count;
-    info.bfInfo.indexLabelCount = this->indexLabelCount();
-    info.bfInfo.blockSize = this->blockSize;
-    info.bfInfo.memory = this->getAllocationSize();
-    info.bfInfo.isMulti = this->isMulti;
-    info.bfInfo.last_mode = this->last_mode;
+    info.isTiered = false;
     return info;
 }
 
@@ -347,51 +364,19 @@ template <typename DataType, typename DistType>
 VecSimInfoIterator *BruteForceIndex<DataType, DistType>::infoIterator() const {
     VecSimIndexInfo info = this->info();
     // For readability. Update this number when needed.
-    size_t numberOfInfoFields = 8;
+    size_t numberOfInfoFields = 10;
     VecSimInfoIterator *infoIterator = new VecSimInfoIterator(numberOfInfoFields);
 
-    infoIterator->addInfoField(VecSim_InfoField{
-        .fieldName = VecSimCommonStrings::ALGORITHM_STRING,
-        .fieldType = INFOFIELD_STRING,
-        .fieldValue = {FieldValue{.stringValue = VecSimAlgo_ToString(info.algo)}}});
-    infoIterator->addInfoField(VecSim_InfoField{
-        .fieldName = VecSimCommonStrings::TYPE_STRING,
-        .fieldType = INFOFIELD_STRING,
-        .fieldValue = {FieldValue{.stringValue = VecSimType_ToString(info.bfInfo.type)}}});
-    infoIterator->addInfoField(
-        VecSim_InfoField{.fieldName = VecSimCommonStrings::DIMENSION_STRING,
-                         .fieldType = INFOFIELD_UINT64,
-                         .fieldValue = {FieldValue{.uintegerValue = info.bfInfo.dim}}});
-    infoIterator->addInfoField(VecSim_InfoField{
-        .fieldName = VecSimCommonStrings::METRIC_STRING,
-        .fieldType = INFOFIELD_STRING,
-        .fieldValue = {FieldValue{.stringValue = VecSimMetric_ToString(info.bfInfo.metric)}}});
-    infoIterator->addInfoField(
-        VecSim_InfoField{.fieldName = VecSimCommonStrings::IS_MULTI_STRING,
-                         .fieldType = INFOFIELD_UINT64,
-                         .fieldValue = {FieldValue{.uintegerValue = info.bfInfo.isMulti}}});
-    infoIterator->addInfoField(
-        VecSim_InfoField{.fieldName = VecSimCommonStrings::INDEX_SIZE_STRING,
-                         .fieldType = INFOFIELD_UINT64,
-                         .fieldValue = {FieldValue{.uintegerValue = info.bfInfo.indexSize}}});
-    infoIterator->addInfoField(
-        VecSim_InfoField{.fieldName = VecSimCommonStrings::INDEX_LABEL_COUNT_STRING,
-                         .fieldType = INFOFIELD_UINT64,
-                         .fieldValue = {FieldValue{.uintegerValue = info.bfInfo.indexLabelCount}}});
     infoIterator->addInfoField(
-        VecSim_InfoField{.fieldName = VecSimCommonStrings::BLOCK_SIZE_STRING,
-                         .fieldType = INFOFIELD_UINT64,
-                         .fieldValue = {FieldValue{.uintegerValue = info.bfInfo.blockSize}}});
-    infoIterator->addInfoField(
-        VecSim_InfoField{.fieldName = VecSimCommonStrings::MEMORY_STRING,
-                         .fieldType = INFOFIELD_UINT64,
-                         .fieldValue = {FieldValue{.uintegerValue = info.bfInfo.memory}}});
-    infoIterator->addInfoField(
-        VecSim_InfoField{.fieldName = VecSimCommonStrings::SEARCH_MODE_STRING,
+        VecSim_InfoField{.fieldName = VecSimCommonStrings::ALGORITHM_STRING,
                          .fieldType = INFOFIELD_STRING,
                          .fieldValue = {FieldValue{
-                             .stringValue = VecSimSearchMode_ToString(info.bfInfo.last_mode)}}});
-
+                             .stringValue = VecSimAlgo_ToString(info.commonInfo.basicInfo.algo)}}});
+    this->addCommonInfoToIterator(infoIterator, info.commonInfo);
+    infoIterator->addInfoField(VecSim_InfoField{
+        .fieldName = VecSimCommonStrings::BLOCK_SIZE_STRING,
+        .fieldType = INFOFIELD_UINT64,
+        .fieldValue = {FieldValue{.uintegerValue = info.commonInfo.basicInfo.blockSize}}});
     return infoIterator;
 }
 
@@ -401,22 +386,19 @@ BruteForceIndex<DataType, DistType>::newBatchIterator(const void *queryBlob,
                                                       VecSimQueryParams *queryParams) const {
     auto *queryBlobCopy = this->allocator->allocate(sizeof(DataType) * this->dim);
     memcpy(queryBlobCopy, queryBlob, this->dim * sizeof(DataType));
-    if (this->metric == VecSimMetric_Cosine) {
-        normalizeVector((DataType *)queryBlobCopy, this->dim);
-    }
     // Ownership of queryBlobCopy moves to BF_BatchIterator that will free it at the end.
     return newBatchIterator_Instance(queryBlobCopy, queryParams);
 }
 
 template <typename DataType, typename DistType>
 bool BruteForceIndex<DataType, DistType>::preferAdHocSearch(size_t subsetSize, size_t k,
-                                                            bool initial_check) {
+                                                            bool initial_check) const {
     // This heuristic is based on sklearn decision tree classifier (with 10 leaves nodes) -
     // see scripts/BF_batches_clf.py
     size_t index_size = this->indexSize();
-    if (subsetSize > index_size) {
-        throw std::runtime_error("internal error: subset size cannot be larger than index size");
-    }
+    // Clamp a too-large subset size to the maximum possible size (the index size).
+    subsetSize = std::min(subsetSize, index_size);
+
     size_t d = this->dim;
     float r = (index_size == 0) ? 0.0f : (float)(subsetSize) / (float)this->indexLabelCount();
     bool res;
diff --git a/src/VecSim/algorithms/brute_force/brute_force_factory.cpp b/src/VecSim/algorithms/brute_force/brute_force_factory.cpp
deleted file mode 100644
index 3278ee57a..000000000
--- a/src/VecSim/algorithms/brute_force/brute_force_factory.cpp
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- *Copyright Redis Ltd. 2021 - present
- *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or
- *the Server Side Public License v1 (SSPLv1).
- */
-
-#include "VecSim/algorithms/brute_force/brute_force_factory.h"
-#include "VecSim/algorithms/brute_force/brute_force.h"
-#include "VecSim/algorithms/brute_force/brute_force_single.h"
-#include "VecSim/algorithms/brute_force/brute_force_multi.h"
-
-namespace BruteForceFactory {
-template <typename DataType, typename DistType = DataType>
-inline VecSimIndex *NewIndex_ChooseMultiOrSingle(const BFParams *params,
-                                                 std::shared_ptr<VecSimAllocator> allocator) {
-    // check if single and return new bf_index
-    if (params->multi)
-        return new (allocator) BruteForceIndex_Multi<DataType, DistType>(params, allocator);
-    else
-        return new (allocator) BruteForceIndex_Single<DataType, DistType>(params, allocator);
-}
-
-VecSimIndex *NewIndex(const BFParams *params, std::shared_ptr<VecSimAllocator> allocator) {
-    if (params->type == VecSimType_FLOAT32) {
-        return NewIndex_ChooseMultiOrSingle<float>(params, allocator);
-    } else if (params->type == VecSimType_FLOAT64) {
-        return NewIndex_ChooseMultiOrSingle<double>(params, allocator);
-    }
-
-    // If we got here something is wrong.
-    return NULL;
-}
-
-template <typename DataType, typename DistType = DataType>
-inline size_t EstimateInitialSize_ChooseMultiOrSingle(bool is_multi) {
-    // check if single and return new bf_index
-    if (is_multi)
-        return sizeof(BruteForceIndex_Multi<DataType, DistType>);
-    else
-        return sizeof(BruteForceIndex_Single<DataType, DistType>);
-}
-
-size_t EstimateInitialSize(const BFParams *params) {
-
-    // Constant part (not effected by parameters).
-    size_t est = sizeof(VecSimAllocator) + sizeof(size_t);
-    if (params->type == VecSimType_FLOAT32) {
-        est += EstimateInitialSize_ChooseMultiOrSingle<float>(params->multi);
-    } else if (params->type == VecSimType_FLOAT64) {
-        est += EstimateInitialSize_ChooseMultiOrSingle<double>(params->multi);
-    }
-    // Parameters related part.
-
-    if (params->initialCapacity) {
-        est += params->initialCapacity * sizeof(labelType) + sizeof(size_t);
-    }
-
-    return est;
-}
-
-size_t EstimateElementSize(const BFParams *params) {
-    return params->dim * VecSimType_sizeof(params->type) + sizeof(labelType);
-}
-}; // namespace BruteForceFactory
diff --git a/src/VecSim/algorithms/brute_force/brute_force_multi.h b/src/VecSim/algorithms/brute_force/brute_force_multi.h
index 71639c38d..0f963e2ba 100644
--- a/src/VecSim/algorithms/brute_force/brute_force_multi.h
+++ b/src/VecSim/algorithms/brute_force/brute_force_multi.h
@@ -17,13 +17,15 @@ class BruteForceIndex_Multi : public BruteForceIndex<DataType, DistType> {
     vecsim_stl::unordered_map<labelType, vecsim_stl::vector<idType>> labelToIdsLookup;
 
 public:
-    BruteForceIndex_Multi(const BFParams *params, std::shared_ptr<VecSimAllocator> allocator)
-        : BruteForceIndex<DataType, DistType>(params, allocator), labelToIdsLookup(allocator) {}
+    BruteForceIndex_Multi(const BFParams *params, const AbstractIndexInitParams &abstractInitParams)
+        : BruteForceIndex<DataType, DistType>(params, abstractInitParams),
+          labelToIdsLookup(this->allocator) {}
 
     ~BruteForceIndex_Multi() {}
 
-    int addVector(const void *vector_data, labelType label, bool overwrite_allowed = true) override;
+    int addVector(const void *vector_data, labelType label, void *auxiliaryCtx = nullptr) override;
     int deleteVector(labelType labelType) override;
+    int deleteVectorById(labelType label, idType id) override;
     double getDistanceFrom(labelType label, const void *vector_data) const override;
     inline size_t indexLabelCount() const override { return this->labelToIdsLookup.size(); }
 
@@ -32,8 +34,11 @@ class BruteForceIndex_Multi : public BruteForceIndex<DataType, DistType> {
         return std::unique_ptr<vecsim_stl::abstract_results_container>(
             new (this->allocator) vecsim_stl::unique_results_container(cap, this->allocator));
     }
+    std::unordered_map<idType, std::pair<idType, labelType>>
+    deleteVectorAndGetUpdatedIds(labelType label) override;
 #ifdef BUILD_TESTS
-    void GetDataByLabel(labelType label, std::vector<std::vector<DataType>> &vectors_output) {
+    void getDataByLabel(labelType label,
+                        std::vector<std::vector<DataType>> &vectors_output) const override {
 
         auto ids = labelToIdsLookup.find(label);
 
@@ -51,8 +56,21 @@ class BruteForceIndex_Multi : public BruteForceIndex<DataType, DistType> {
 
     inline void replaceIdOfLabel(labelType label, idType new_id, idType old_id) override;
 
+    inline bool isLabelExists(labelType label) override {
+        return labelToIdsLookup.find(label) != labelToIdsLookup.end();
+    }
+    // Return a set of all labels that are stored in the index (helper for computing label count
+    // without duplicates in tiered index). Caller should hold the flat buffer lock for read.
+    inline vecsim_stl::set<labelType> getLabelsSet() const override {
+        vecsim_stl::set<labelType> keys(this->allocator);
+        for (auto &it : labelToIdsLookup) {
+            keys.insert(it.first);
+        }
+        return keys;
+    };
+
     inline vecsim_stl::abstract_priority_queue<DistType, labelType> *
-    getNewMaxPriorityQueue() override {
+    getNewMaxPriorityQueue() const override {
         return new (this->allocator)
             vecsim_stl::updatable_max_heap<DistType, labelType>(this->allocator);
     }
@@ -72,15 +90,7 @@ class BruteForceIndex_Multi : public BruteForceIndex<DataType, DistType> {
 
 template <typename DataType, typename DistType>
 int BruteForceIndex_Multi<DataType, DistType>::addVector(const void *vector_data, labelType label,
-                                                         bool overwrite_allowed) {
-
-    DataType normalized_blob[this->dim]; // This will be use only if metric == VecSimMetric_Cosine.
-    if (this->metric == VecSimMetric_Cosine) {
-        memcpy(normalized_blob, vector_data, this->dim * sizeof(DataType));
-        normalizeVector(normalized_blob, this->dim);
-        vector_data = normalized_blob;
-    }
-
+                                                         void *auxiliaryCtx) {
     this->appendVector(vector_data, label);
     return 1;
 }
@@ -107,6 +117,76 @@ int BruteForceIndex_Multi<DataType, DistType>::deleteVector(labelType label) {
     return ret;
 }
 
+template <typename DataType, typename DistType>
+std::unordered_map<idType, std::pair<idType, labelType>>
+BruteForceIndex_Multi<DataType, DistType>::deleteVectorAndGetUpdatedIds(labelType label) {
+    // Maps every internal id whose content changed as a result of this deletion to the pair
+    // (original id, label) of the vector that was swapped into it. For example, with ids
+    // 0, 1, 2, 3, 4 and ids 1, 3, 4 to remove, the map evolves as {1->4} => {1->4} => {1->2}.
+    // Explanation: first we delete 1 and swap it with 4. Then, we remove 3 and have no swap since 3
+    // is the last id. Lastly, we delete the original 4 which is now in id 1, and swap it with 2.
+    // Eventually, in id 1 we should have the original vector whose id was 2.
+    std::unordered_map<idType, std::pair<idType, labelType>> updated_ids;
+
+    // Find the ids stored under the given label.
+    auto deleted_label_ids_pair = this->labelToIdsLookup.find(label);
+    if (deleted_label_ids_pair == this->labelToIdsLookup.end()) {
+        // Nothing to delete - an empty map is returned.
+        return updated_ids;
+    }
+
+    // Delete all the vectors under the given label.
+    for (size_t i = 0; i < deleted_label_ids_pair->second.size(); i++) {
+        idType cur_id_to_delete = deleted_label_ids_pair->second[i];
+        // The removal takes into consideration the current internal id to remove, even if it is
+        // not the original id of the vector but an id it was swapped into by a previous removal of
+        // another id that belongs to this label.
+        labelType last_id_label = this->idToLabelMapping[this->count - 1];
+        this->removeVector(cur_id_to_delete);
+        // If cur_id_to_delete exists in the map, remove it as it is no longer valid, whether it
+        // will get a new value due to a swap, or it is the last element in the index.
+        updated_ids.erase(cur_id_to_delete);
+        // If a swap was made, update who was the original id that now resides in cur_id_to_delete.
+        if (cur_id_to_delete != this->count) {
+            if (updated_ids.find(this->count) != updated_ids.end()) {
+                updated_ids[cur_id_to_delete] = updated_ids[this->count];
+                updated_ids.erase(this->count);
+            } else {
+                // Otherwise, the last id now resides where the deleted id was.
+                updated_ids[cur_id_to_delete] = {this->count, last_id_label};
+            }
+        }
+    }
+    // Remove the label's entry (the label and its ids list) from the lookup.
+    labelToIdsLookup.erase(label);
+    return updated_ids;
+}
+
+template <typename DataType, typename DistType>
+int BruteForceIndex_Multi<DataType, DistType>::deleteVectorById(labelType label, idType id) {
+    // Find the ids stored under the given label.
+    auto deleted_label_ids_pair = this->labelToIdsLookup.find(label);
+    if (deleted_label_ids_pair == this->labelToIdsLookup.end()) {
+        // Nothing to delete.
+        return 0; // unknown label: no vector was removed
+    }
+
+    // Delete the specific vector id which is under the given label.
+    auto &ids = deleted_label_ids_pair->second;
+    for (size_t i = 0; i < ids.size(); i++) {
+        if (ids[i] == id) {
+            this->removeVector(id);
+            ids.erase(ids.begin() + i); // drop position i, which held the removed id
+            if (ids.empty()) {          // that was the last vector under this label
+                labelToIdsLookup.erase(label);
+            }
+            return 1; // exactly one vector was removed
+        }
+    }
+    assert(false && "id to delete was not found under the given label");
+    return 0; // reached only when assertions are compiled out
+}
+
 template <typename DataType, typename DistType>
 double BruteForceIndex_Multi<DataType, DistType>::getDistanceFrom(labelType label,
                                                                   const void *vector_data) const {
@@ -129,8 +209,18 @@ template <typename DataType, typename DistType>
 void BruteForceIndex_Multi<DataType, DistType>::replaceIdOfLabel(labelType label, idType new_id,
                                                                  idType old_id) {
     assert(labelToIdsLookup.find(label) != labelToIdsLookup.end());
+    // *Non-trivial code here* - in every iteration we replace the internal id of the previous last
+    // id that has been swapped with the deleted id. Note that if the old and the new replaced ids
+    // both belong to the same label, then we are going to delete the new id later on as well, since
+    // we are currently iterating on this exact array of ids in 'deleteVector'. Hence, the relevant
+    // part of the vector that should be updated is the "tail" that comes after the position of
+    // old_id, while the "head" may contain old occurrences of old_id that are irrelevant for the
+    // future deletions. Therefore, we iterate from end to beginning. For example, assuming we are
+    // deleting a label that contains the only 3 ids that exist in the index. Hence, we would
+    // expect the following scenario w.r.t. the ids array:
+    // [|1, 0, 2] -> [1, |0, 1] -> [1, 0, |0] (where | marks the current position)
     auto &ids = labelToIdsLookup.at(label);
-    for (size_t i = 0; i < ids.size(); i++) {
+    for (int i = ids.size() - 1; i >= 0; i--) {
         if (ids[i] == old_id) {
             ids[i] = new_id;
             return;
diff --git a/src/VecSim/algorithms/brute_force/brute_force_multi_tests_friends.h b/src/VecSim/algorithms/brute_force/brute_force_multi_tests_friends.h
index 43c7e0ec6..c4c3f8770 100644
--- a/src/VecSim/algorithms/brute_force/brute_force_multi_tests_friends.h
+++ b/src/VecSim/algorithms/brute_force/brute_force_multi_tests_friends.h
@@ -14,3 +14,4 @@ INDEX_TEST_FRIEND_CLASS(BruteForceMultiTest_indexing_same_vector_Test)
 INDEX_TEST_FRIEND_CLASS(BruteForceMultiTest_test_delete_swap_block_Test)
 INDEX_TEST_FRIEND_CLASS(BruteForceMultiTest_test_dynamic_bf_info_iterator_Test)
 INDEX_TEST_FRIEND_CLASS(BruteForceMultiTest_remove_vector_after_replacing_block_Test)
+INDEX_TEST_FRIEND_CLASS(BruteForceMultiTest_removeVectorWithSwaps_Test)
diff --git a/src/VecSim/algorithms/brute_force/brute_force_single.h b/src/VecSim/algorithms/brute_force/brute_force_single.h
index adb4840ef..15e018150 100644
--- a/src/VecSim/algorithms/brute_force/brute_force_single.h
+++ b/src/VecSim/algorithms/brute_force/brute_force_single.h
@@ -17,11 +17,13 @@ class BruteForceIndex_Single : public BruteForceIndex<DataType, DistType> {
     vecsim_stl::unordered_map<labelType, idType> labelToIdLookup;
 
 public:
-    BruteForceIndex_Single(const BFParams *params, std::shared_ptr<VecSimAllocator> allocator);
+    BruteForceIndex_Single(const BFParams *params,
+                           const AbstractIndexInitParams &abstractInitParams);
     ~BruteForceIndex_Single();
 
-    int addVector(const void *vector_data, labelType label, bool overwrite_allowed = true) override;
+    int addVector(const void *vector_data, labelType label, void *auxiliaryCtx = nullptr) override;
     int deleteVector(labelType label) override;
+    int deleteVectorById(labelType label, idType id) override;
     double getDistanceFrom(labelType label, const void *vector_data) const override;
 
     inline std::unique_ptr<vecsim_stl::abstract_results_container>
@@ -31,8 +33,15 @@ class BruteForceIndex_Single : public BruteForceIndex<DataType, DistType> {
     }
 
     inline size_t indexLabelCount() const override { return this->count; }
+    std::unordered_map<idType, std::pair<idType, labelType>>
+    deleteVectorAndGetUpdatedIds(labelType label) override;
+
+    // Precondition: the label exists in the index (the find() result is dereferenced unchecked).
+    idType getIdOfLabel(labelType label) const { return labelToIdLookup.find(label)->second; }
+
 #ifdef BUILD_TESTS
-    void GetDataByLabel(labelType label, std::vector<std::vector<DataType>> &vectors_output) {
+    void getDataByLabel(labelType label,
+                        std::vector<std::vector<DataType>> &vectors_output) const override {
 
         auto id = labelToIdLookup.at(label);
 
@@ -62,8 +71,21 @@ class BruteForceIndex_Single : public BruteForceIndex<DataType, DistType> {
         labelToIdLookup.at(label) = new_id;
     }
 
+    inline bool isLabelExists(labelType label) override { // true iff label is a lookup key
+        return labelToIdLookup.find(label) != labelToIdLookup.end();
+    }
+    // Return a copy of all the labels stored in the index, as a set built with this index's
+    // allocator (helper for computing the label count without duplicates in tiered index).
+    // Caller should hold the flat buffer lock for read.
+    inline vecsim_stl::set<labelType> getLabelsSet() const override {
+        vecsim_stl::set<labelType> keys(this->allocator);
+        for (auto &it : labelToIdLookup) { keys.insert(it.first); } // it.first is the label
+        return keys;
+    };
+
     inline vecsim_stl::abstract_priority_queue<DistType, labelType> *
-    getNewMaxPriorityQueue() override {
+    getNewMaxPriorityQueue() const override {
         return new (this->allocator)
             vecsim_stl::max_priority_queue<DistType, labelType>(this->allocator);
     }
@@ -84,22 +106,16 @@ class BruteForceIndex_Single : public BruteForceIndex<DataType, DistType> {
 
 template <typename DataType, typename DistType>
 BruteForceIndex_Single<DataType, DistType>::BruteForceIndex_Single(
-    const BFParams *params, std::shared_ptr<VecSimAllocator> allocator)
-    : BruteForceIndex<DataType, DistType>(params, allocator), labelToIdLookup(allocator) {}
+    const BFParams *params, const AbstractIndexInitParams &abstractInitParams)
+    : BruteForceIndex<DataType, DistType>(params, abstractInitParams),
+      labelToIdLookup(this->allocator) {}
 
 template <typename DataType, typename DistType>
 BruteForceIndex_Single<DataType, DistType>::~BruteForceIndex_Single() {}
 
 template <typename DataType, typename DistType>
 int BruteForceIndex_Single<DataType, DistType>::addVector(const void *vector_data, labelType label,
-                                                          bool overwrite_allowed) {
-
-    DataType normalized_blob[this->dim]; // This will be use only if metric == VecSimMetric_Cosine
-    if (this->metric == VecSimMetric_Cosine) {
-        memcpy(normalized_blob, vector_data, this->dim * sizeof(DataType));
-        normalizeVector(normalized_blob, this->dim);
-        vector_data = normalized_blob;
-    }
+                                                          void *auxiliaryCtx) {
 
     auto optionalID = this->labelToIdLookup.find(label);
     // Check if label already exists, so it is an update operation.
@@ -133,6 +149,36 @@ int BruteForceIndex_Single<DataType, DistType>::deleteVector(labelType label) {
     return 1;
 }
 
+template <typename DataType, typename DistType>
+std::unordered_map<idType, std::pair<idType, labelType>>
+BruteForceIndex_Single<DataType, DistType>::deleteVectorAndGetUpdatedIds(labelType label) {
+
+    std::unordered_map<idType, std::pair<idType, labelType>> updated_ids;
+    // Find the id to delete; updated_ids is returned empty if the label is not in the index.
+    auto deleted_label_id_pair = this->labelToIdLookup.find(label);
+    if (deleted_label_id_pair == this->labelToIdLookup.end()) {
+        // Nothing to delete.
+        return updated_ids;
+    }
+
+    // Copy the deleted vector id out before its lookup entry is erased below.
+    idType id_to_delete = deleted_label_id_pair->second;
+
+    // Remove the label -> id entry of the deleted vector.
+    labelToIdLookup.erase(label);
+    labelType last_id_label = this->idToLabelMapping[this->count - 1];
+    this->removeVector(id_to_delete); // this will decrease this->count and make the swap
+    if (id_to_delete != this->count) { // the deleted id was not the last one
+        updated_ids[id_to_delete] = {this->count, last_id_label}; // slot -> (old id, label)
+    }
+    return updated_ids;
+}
+
+template <typename DataType, typename DistType>
+int BruteForceIndex_Single<DataType, DistType>::deleteVectorById(labelType label, idType id) {
+    return deleteVector(label); // 'id' is not used here: deletion is delegated by label only
+}
+
 template <typename DataType, typename DistType>
 double BruteForceIndex_Single<DataType, DistType>::getDistanceFrom(labelType label,
                                                                    const void *vector_data) const {
diff --git a/src/VecSim/algorithms/hnsw/hnsw.h b/src/VecSim/algorithms/hnsw/hnsw.h
index 779ec40b6..c4726ef66 100644
--- a/src/VecSim/algorithms/hnsw/hnsw.h
+++ b/src/VecSim/algorithms/hnsw/hnsw.h
@@ -34,12 +34,10 @@
 #include <unordered_map>
 #include <sys/resource.h>
 #include <fstream>
+#include <shared_mutex>
 
 using std::pair;
 
-#define HNSW_INVALID_ID    UINT_MAX
-#define HNSW_INVALID_LEVEL SIZE_MAX
-
 typedef uint16_t linkListSize;
 typedef uint16_t elementFlags;
 
@@ -47,6 +45,24 @@ template <typename DistType>
 using candidatesMaxHeap = vecsim_stl::max_priority_queue<DistType, idType>;
 template <typename DistType>
 using candidatesLabelsMaxHeap = vecsim_stl::abstract_priority_queue<DistType, labelType>;
+using graphNodeType = pair<idType, ushort>; // represented as: (element_id, level)
+
+// Per-vector flags: bit masks that are OR-ed into / tested against an element's flags word.
+typedef enum {
+    DELETE_MARK = 0x1, // element is logically deleted, but still exists in the graph
+    IN_PROCESS = 0x2,  // element is being inserted into the graph
+} Flags;
+
+// The state of the index and of the newly inserted vector, to be passed into the addVector API in
+// case the index global data structures are updated atomically from an external scope (such as
+// in a tiered index).
+// TODO: this might need to be generalized for future usages of async indexing.
+struct AddVectorCtx {
+    idType newElementId;
+    int elementMaxLevel;
+    idType currEntryPoint;
+    int currMaxLevel;
+};
 
 template <typename DataType, typename DistType>
 class HNSWIndex : public VecSimIndexAbstract<DistType>,
@@ -69,7 +85,6 @@ class HNSWIndex : public VecSimIndexAbstract<DistType>,
     double epsilon_;
 
     // Index meta-data (based on the data dimensionality and index parameters)
-    size_t data_size_;
     size_t size_data_per_element_;
     size_t size_links_per_element_;
     size_t size_links_level0_;
@@ -82,24 +97,23 @@ class HNSWIndex : public VecSimIndexAbstract<DistType>,
     // Index level generator of the top level for a new element
     std::default_random_engine level_generator_;
 
-    // Index state
+    // Index global state - these should be guarded by the index_data_guard_ lock in
+    // multithreaded scenario.
     size_t cur_element_count;
-    size_t maxlevel_;
-
-    // Index data structures
-    idType entrypoint_node_;
-    char *data_level0_memory_;
-    char **linkLists_;
     vecsim_stl::vector<size_t> element_levels_;
+    idType entrypoint_node_;
+    size_t max_level_; // this is the top level of the entry point's element
+
+    // Index data
+    char *data_level0_memory_; // neighbors in level 0, element label, flags and data (vector)
+    char **linkLists_;         // neighbors in level higher than 0
 
     // Used for marking the visited nodes in graph scans (the pool supports parallel graph scans).
     // This is mutable since the object changes upon search operations as well (which are const).
     mutable VisitedNodesHandlerPool visited_nodes_handler_pool;
-#ifdef ENABLE_PARALLELIZATION
-    std::mutex global;
-    std::mutex cur_element_count_guard_;
-    std::vector<std::mutex> link_list_locks_;
-#endif
+
+    mutable std::shared_mutex index_data_guard_;
+    mutable vecsim_stl::vector<std::mutex> element_neighbors_locks_;
 
 #ifdef BUILD_TESTS
 #include "VecSim/algorithms/hnsw/hnsw_base_tests_friends.h"
@@ -115,10 +129,10 @@ class HNSWIndex : public VecSimIndexAbstract<DistType>,
     inline size_t getRandomLevel(double reverse_size);
     inline vecsim_stl::vector<idType> *getIncomingEdgesPtr(idType internal_id, size_t level) const;
     inline void setIncomingEdgesPtr(idType internal_id, size_t level, void *edges_ptr);
-    inline elementFlags *get_flags(idType internal_id) const;
-    inline idType *get_linklist0(idType internal_id) const;
-    inline idType *get_linklist(idType internal_id, size_t level) const;
-    inline void setListCount(idType *list, linkListSize size);
+    inline elementFlags *getElementFlags(idType internal_id) const;
+    inline idType *getNodeNeighborsAtBaseLevel(idType internal_id) const;
+    inline idType *getNodeNeighborsAtNonBaseLevel(idType internal_id, size_t level) const;
+    inline void setNodeNeighborsCount(idType *list, linkListSize size);
     inline void removeExtraLinks(candidatesMaxHeap<DistType> candidates, size_t Mcurmax,
                                  idType *node_neighbors, const vecsim_stl::vector<bool> &bitmap,
                                  idType *removed_links, size_t *removed_links_num);
@@ -147,10 +161,26 @@ class HNSWIndex : public VecSimIndexAbstract<DistType>,
                                                           void *timeoutCtx,
                                                           VecSimQueryResult_Code *rc) const;
     void getNeighborsByHeuristic2(candidatesMaxHeap<DistType> &top_candidates, size_t M);
-    inline idType mutuallyConnectNewElement(idType cur_c,
+    // Helper function for re-selecting the neighbors of a node that was chosen as a neighbor of
+    // a newly inserted node. It is also responsible for connecting the new node and the neighbor
+    // to each other (unidirectional or bidirectional connection).
+    // *Note that node_lock and neighbor_lock should be locked upon calling this function*
+    void revisitNeighborConnections(size_t level, idType new_node_id,
+                                    const std::pair<DistType, idType> &neighbor_data,
+                                    idType *new_node_neighbors_list,
+                                    idType *neighbor_neighbors_list,
+                                    std::unique_lock<std::mutex> &node_lock,
+                                    std::unique_lock<std::mutex> &neighbor_lock);
+    inline idType mutuallyConnectNewElement(idType new_node_id,
                                             candidatesMaxHeap<DistType> &top_candidates,
                                             size_t level);
-    template <bool with_timeout>
+    void mutuallyUpdateForRepairedNode(idType node_id, size_t level,
+                                       vecsim_stl::vector<idType> &neighbors_to_remove,
+                                       vecsim_stl::vector<idType> &nodes_to_update,
+                                       vecsim_stl::vector<idType> &chosen_neighbors,
+                                       size_t max_M_cur);
+
+    template <bool running_query>
     void greedySearchLevel(const void *vector_data, size_t level, idType &curObj, DistType &curDist,
                            void *timeoutCtx = nullptr, VecSimQueryResult_Code *rc = nullptr) const;
     void repairConnectionsForDeletion(idType element_internal_id, idType neighbour_id,
@@ -158,21 +188,30 @@ class HNSWIndex : public VecSimIndexAbstract<DistType>,
                                       size_t level, vecsim_stl::vector<bool> &neighbours_bitmap);
     inline void replaceEntryPoint();
     inline void resizeIndexInternal(size_t new_max_elements);
+
+    template <bool has_marked_deleted>
     inline void SwapLastIdWithDeletedId(idType element_internal_id);
 
     // Protected internal function that implements generic single vector insertion.
-    void appendVector(const void *vector_data, labelType label);
+    void appendVector(const void *vector_data, labelType label,
+                      AddVectorCtx *auxiliaryCtx = nullptr);
 
     // Protected internal function that implements generic single vector deletion.
-    void removeVector(idType id);
+    void removeVectorInPlace(idType id);
 
     inline void emplaceToHeap(vecsim_stl::abstract_priority_queue<DistType, idType> &heap,
                               DistType dist, idType id) const;
     inline void emplaceToHeap(vecsim_stl::abstract_priority_queue<DistType, labelType> &heap,
                               DistType dist, idType id) const;
+    // Helper method that swaps the last element in the ids list with the given one (equivalent to
+    // removing the given element id from the list).
+    inline bool removeIdFromList(vecsim_stl::vector<idType> &element_ids_list, idType element_id);
+
+    template <bool has_marked_deleted>
+    void removeAndSwap(idType internalId);
 
 public:
-    HNSWIndex(const HNSWParams *params, std::shared_ptr<VecSimAllocator> allocator,
+    HNSWIndex(const HNSWParams *params, const AbstractIndexInitParams &abstractInitParams,
               size_t random_seed = 100, size_t initial_pool_size = 1);
     virtual ~HNSWIndex();
 
@@ -185,32 +224,67 @@ class HNSWIndex : public VecSimIndexAbstract<DistType>,
     inline size_t getEfConstruction() const;
     inline size_t getM() const;
     inline size_t getMaxLevel() const;
-    inline idType getEntryPointId() const;
     inline labelType getEntryPointLabel() const;
     inline labelType getExternalLabel(idType internal_id) const;
+    // Check if the given label exists in the labels lookup while holding the index data lock.
+    // Optionally validate that the associated vector(s) are not in process and done indexing
+    // (this option is used currently for tests).
+    virtual inline bool safeCheckIfLabelExistsInIndex(labelType label,
+                                                      bool also_done_processing = false) const = 0;
+    inline auto safeGetEntryPointState() const;
+    inline void lockIndexDataGuard() const;
+    inline void unlockIndexDataGuard() const;
+    inline void lockNodeLinks(idType node_id) const;
+    inline void unlockNodeLinks(idType node_id) const;
     inline VisitedNodesHandler *getVisitedList() const;
     inline void returnVisitedList(VisitedNodesHandler *visited_nodes_handler) const;
     VecSimIndexInfo info() const override;
+    VecSimIndexBasicInfo basicInfo() const override;
     VecSimInfoIterator *infoIterator() const override;
-    bool preferAdHocSearch(size_t subsetSize, size_t k, bool initial_check) override;
+    bool preferAdHocSearch(size_t subsetSize, size_t k, bool initial_check) const override;
     char *getDataByInternalId(idType internal_id) const;
-    inline idType *get_linklist_at_level(idType internal_id, size_t level) const;
-    inline linkListSize getListCount(const idType *list) const;
+    inline idType *getNodeNeighborsAtLevel(idType internal_id, size_t level) const;
+    inline linkListSize getNodeNeighborsCount(const idType *list) const;
     inline idType searchBottomLayerEP(const void *query_data, void *timeoutCtx,
                                       VecSimQueryResult_Code *rc) const;
 
     VecSimQueryResult_List topKQuery(const void *query_data, size_t k,
-                                     VecSimQueryParams *queryParams) override;
+                                     VecSimQueryParams *queryParams) const override;
     VecSimQueryResult_List rangeQuery(const void *query_data, double radius,
-                                      VecSimQueryParams *queryParams) override;
+                                      VecSimQueryParams *queryParams) const override;
 
     inline void markDeletedInternal(idType internalId);
     inline bool isMarkedDeleted(idType internalId) const;
+    inline bool isInProcess(idType internalId) const;
+    inline void markInProcess(idType internalId);
+    inline void unmarkInProcess(idType internalId);
     void increaseCapacity() override;
-
-    // inline priority queue getter that need to be implemented by derived class
+    AddVectorCtx storeNewElement(labelType label);
+    void removeAndSwapDeletedElement(idType internalId);
+    void repairNodeConnections(idType node_id, size_t level);
+    inline size_t getElementTopLevel(idType internalId);
+    vecsim_stl::vector<graphNodeType> safeCollectAllNodeIncomingNeighbors(idType node_id,
+                                                                          size_t node_top_level);
+    // Return all the labels in the index - this should be used for computing the number of distinct
+    // labels in a tiered index, and caller should hold the index data guard.
+    virtual inline vecsim_stl::set<labelType> getLabelsSet() const = 0;
+
+    // Inline priority queue getter that needs to be implemented by the derived class.
     virtual inline candidatesLabelsMaxHeap<DistType> *getNewMaxPriorityQueue() const = 0;
+    virtual double safeGetDistanceFrom(labelType label, const void *vector_data) const = 0;
 
+#ifdef BUILD_TESTS
+    /**
+     * @brief Used for testing - store vector(s) data associated with a given label. This function
+     * copies the vector(s)' data buffer(s) and place it in the output vector
+     *
+     * @param label
+     * @param vectors_output empty vector to be modified, should store the blob(s) associated with
+     * the label.
+     */
+    virtual void getDataByLabel(labelType label,
+                                std::vector<std::vector<DataType>> &vectors_output) const = 0;
+#endif
 protected:
     // inline label to id setters that need to be implemented by derived class
     virtual inline std::unique_ptr<vecsim_stl::abstract_results_container>
@@ -266,12 +340,12 @@ size_t HNSWIndex<DataType, DistType>::getM() const {
 
 template <typename DataType, typename DistType>
 size_t HNSWIndex<DataType, DistType>::getMaxLevel() const {
-    return maxlevel_;
+    return max_level_;
 }
 
 template <typename DataType, typename DistType>
 labelType HNSWIndex<DataType, DistType>::getEntryPointLabel() const {
-    if (entrypoint_node_ != HNSW_INVALID_ID)
+    if (entrypoint_node_ != INVALID_ID)
         return getExternalLabel(entrypoint_node_);
     return SIZE_MAX;
 }
@@ -336,14 +410,14 @@ void HNSWIndex<DataType, DistType>::setIncomingEdgesPtr(idType internal_id, size
 }
 
 template <typename DataType, typename DistType>
-elementFlags *HNSWIndex<DataType, DistType>::get_flags(idType internal_id) const {
+elementFlags *HNSWIndex<DataType, DistType>::getElementFlags(idType internal_id) const {
     // elementFlags offset is 0 from the start of the element metadata
     return (elementFlags *)(data_level0_memory_ + internal_id * size_data_per_element_ +
                             offsetLevel0_);
 }
 
 template <typename DataType, typename DistType>
-idType *HNSWIndex<DataType, DistType>::get_linklist0(idType internal_id) const {
+idType *HNSWIndex<DataType, DistType>::getNodeNeighborsAtBaseLevel(idType internal_id) const {
     // links offset at level 0 is `sizeof(elementFlags) + sizeof(linkListSize)` from the start of
     // the element metadata
     return (idType *)(data_level0_memory_ + internal_id * size_data_per_element_ +
@@ -351,33 +425,30 @@ idType *HNSWIndex<DataType, DistType>::get_linklist0(idType internal_id) const {
 }
 
 template <typename DataType, typename DistType>
-idType *HNSWIndex<DataType, DistType>::get_linklist(idType internal_id, size_t level) const {
+idType *HNSWIndex<DataType, DistType>::getNodeNeighborsAtNonBaseLevel(idType internal_id,
+                                                                      size_t level) const {
     // links offset at level >0 is `sizeof(linkListSize)` from the start of the element metadata
     return (idType *)(linkLists_[internal_id] + (level - 1) * size_links_per_element_ +
                       sizeof(linkListSize));
 }
 
 template <typename DataType, typename DistType>
-idType *HNSWIndex<DataType, DistType>::get_linklist_at_level(idType internal_id,
-                                                             size_t level) const {
-    return level == 0 ? get_linklist0(internal_id) : get_linklist(internal_id, level);
+idType *HNSWIndex<DataType, DistType>::getNodeNeighborsAtLevel(idType internal_id,
+                                                               size_t level) const {
+    return level == 0 ? getNodeNeighborsAtBaseLevel(internal_id)
+                      : getNodeNeighborsAtNonBaseLevel(internal_id, level);
 }
 
 template <typename DataType, typename DistType>
-linkListSize HNSWIndex<DataType, DistType>::getListCount(const idType *list) const {
+linkListSize HNSWIndex<DataType, DistType>::getNodeNeighborsCount(const idType *list) const {
     return *(((linkListSize *)list) - 1);
 }
 
 template <typename DataType, typename DistType>
-void HNSWIndex<DataType, DistType>::setListCount(idType *list, const linkListSize size) {
+void HNSWIndex<DataType, DistType>::setNodeNeighborsCount(idType *list, const linkListSize size) {
     *(((linkListSize *)list) - 1) = size;
 }
 
-template <typename DataType, typename DistType>
-idType HNSWIndex<DataType, DistType>::getEntryPointId() const {
-    return entrypoint_node_;
-}
-
 template <typename DataType, typename DistType>
 VisitedNodesHandler *HNSWIndex<DataType, DistType>::getVisitedList() const {
     return visited_nodes_handler_pool.getAvailableVisitedNodesHandler();
@@ -391,20 +462,73 @@ void HNSWIndex<DataType, DistType>::returnVisitedList(
 
 template <typename DataType, typename DistType>
 void HNSWIndex<DataType, DistType>::markDeletedInternal(idType internalId) {
+    // Here we are holding the global index data guard (and the main index lock of the tiered index
+    // for shared ownership).
     assert(internalId < this->cur_element_count);
     if (!isMarkedDeleted(internalId)) {
-        elementFlags *flags = get_flags(internalId);
-        *flags |= DELETE_MARK;
+        if (internalId == entrypoint_node_) {
+            // Internally, we hold and release the entrypoint neighbors lock.
+            replaceEntryPoint();
+        }
+        // Atomically set the deletion mark flag (note that other parallel threads may set the flags
+        // at the same time, for changing the IN_PROCESS flag).
+        __atomic_fetch_or(getElementFlags(internalId), DELETE_MARK, 0);
         this->num_marked_deleted++;
     }
 }
 
 template <typename DataType, typename DistType>
 bool HNSWIndex<DataType, DistType>::isMarkedDeleted(idType internalId) const {
-    elementFlags *flags = get_flags(internalId);
+    elementFlags *flags = getElementFlags(internalId);
     return *flags & DELETE_MARK;
 }
 
+template <typename DataType, typename DistType>
+bool HNSWIndex<DataType, DistType>::isInProcess(idType internalId) const {
+    elementFlags *flags = getElementFlags(internalId);
+    return *flags & IN_PROCESS; // plain (non-atomic) read of the element's flags word
+}
+
+template <typename DataType, typename DistType>
+void HNSWIndex<DataType, DistType>::markInProcess(idType internalId) {
+    // Atomically set the IN_PROCESS mark flag. Even though other threads shouldn't modify the flags
+    // at that time (we're holding index global data guard, so this element cannot be marked as
+    // deleted in parallel), we do it for safety.
+    __atomic_fetch_or(getElementFlags(internalId), IN_PROCESS, 0); // 0: __ATOMIC_RELAXED (GCC)
+}
+
+template <typename DataType, typename DistType>
+void HNSWIndex<DataType, DistType>::unmarkInProcess(idType internalId) {
+    // Atomically unset the IN_PROCESS mark flag (note that other parallel threads may set the flags
+    // at the same time, for marking the element with the DELETE_MARK flag).
+    __atomic_fetch_and(getElementFlags(internalId), ~IN_PROCESS, 0); // 0: __ATOMIC_RELAXED (GCC)
+}
+
+template <typename DataType, typename DistType>
+void HNSWIndex<DataType, DistType>::lockIndexDataGuard() const {
+    index_data_guard_.lock(); // exclusive (not shared) ownership of the std::shared_mutex
+}
+
+template <typename DataType, typename DistType>
+void HNSWIndex<DataType, DistType>::unlockIndexDataGuard() const {
+    index_data_guard_.unlock(); // releases exclusive ownership; pairs with lockIndexDataGuard()
+}
+
+template <typename DataType, typename DistType>
+void HNSWIndex<DataType, DistType>::lockNodeLinks(idType node_id) const {
+    element_neighbors_locks_[node_id].lock(); // per-element std::mutex, indexed by internal id
+}
+
+template <typename DataType, typename DistType>
+void HNSWIndex<DataType, DistType>::unlockNodeLinks(idType node_id) const {
+    element_neighbors_locks_[node_id].unlock(); // pairs with lockNodeLinks(node_id)
+}
+
+template <typename DataType, typename DistType>
+inline size_t HNSWIndex<DataType, DistType>::getElementTopLevel(idType internalId) {
+    return element_levels_[internalId]; // direct lookup in element_levels_ by internal id
+}
+
 /**
  * helper functions
  */
@@ -435,7 +559,7 @@ void HNSWIndex<DataType, DistType>::removeExtraLinks(
             orig_candidates.pop();
         }
     }
-    setListCount(node_neighbors, link_idx);
+    setNodeNeighborsCount(node_neighbors, link_idx);
     *removed_links_num = removed_idx;
 }
 
@@ -461,11 +585,9 @@ DistType HNSWIndex<DataType, DistType>::processCandidate(
     tag_t *elements_tags, vecsim_stl::abstract_priority_queue<DistType, Identifier> &top_candidates,
     candidatesMaxHeap<DistType> &candidate_set, DistType lowerBound) const {
 
-#ifdef ENABLE_PARALLELIZATION
-    std::unique_lock<std::mutex> lock(link_list_locks_[curNodeId]);
-#endif
-    idType *node_links = get_linklist_at_level(curNodeId, layer);
-    linkListSize links_num = getListCount(node_links);
+    std::unique_lock<std::mutex> lock(element_neighbors_locks_[curNodeId]);
+    idType *node_links = getNodeNeighborsAtLevel(curNodeId, layer);
+    linkListSize links_num = getNodeNeighborsCount(node_links);
 
     __builtin_prefetch(elements_tags + *node_links);
     __builtin_prefetch(getDataByInternalId(*node_links));
@@ -478,7 +600,7 @@ DistType HNSWIndex<DataType, DistType>::processCandidate(
         __builtin_prefetch(elements_tags + *next_candidate_pos);
         __builtin_prefetch(getDataByInternalId(*next_candidate_pos));
 
-        if (elements_tags[candidate_id] == visited_tag)
+        if (elements_tags[candidate_id] == visited_tag || isInProcess(candidate_id))
             continue;
 
         elements_tags[candidate_id] = visited_tag;
@@ -503,7 +625,7 @@ DistType HNSWIndex<DataType, DistType>::processCandidate(
     }
     // Pre-fetch the neighbours list of the top candidate (the one that is going
     // to be processed in the next iteration) into memory cache, to improve performance.
-    __builtin_prefetch(get_linklist_at_level(candidate_set.top().second, layer));
+    __builtin_prefetch(getNodeNeighborsAtLevel(candidate_set.top().second, layer));
 
     return lowerBound;
 }
@@ -515,11 +637,9 @@ void HNSWIndex<DataType, DistType>::processCandidate_RangeSearch(
     tag_t *elements_tags, std::unique_ptr<vecsim_stl::abstract_results_container> &results,
     candidatesMaxHeap<DistType> &candidate_set, DistType dyn_range, double radius) const {
 
-#ifdef ENABLE_PARALLELIZATION
-    std::unique_lock<std::mutex> lock(link_list_locks_[curNodeId]);
-#endif
-    idType *node_links = get_linklist_at_level(curNodeId, layer);
-    linkListSize links_num = getListCount(node_links);
+    std::unique_lock<std::mutex> lock(element_neighbors_locks_[curNodeId]);
+    idType *node_links = getNodeNeighborsAtLevel(curNodeId, layer);
+    linkListSize links_num = getNodeNeighborsCount(node_links);
 
     __builtin_prefetch(elements_tags + *node_links);
     __builtin_prefetch(getDataByInternalId(*node_links));
@@ -535,7 +655,7 @@ void HNSWIndex<DataType, DistType>::processCandidate_RangeSearch(
         __builtin_prefetch(elements_tags + *next_candidate_pos);
         __builtin_prefetch(getDataByInternalId(*next_candidate_pos));
 
-        if (elements_tags[candidate_id] == visited_tag)
+        if (elements_tags[candidate_id] == visited_tag || isInProcess(candidate_id))
             continue;
         elements_tags[candidate_id] = visited_tag;
         char *candidate_data = getDataByInternalId(candidate_id);
@@ -553,7 +673,7 @@ void HNSWIndex<DataType, DistType>::processCandidate_RangeSearch(
     }
     // Pre-fetch the neighbours list of the top candidate (the one that is going
     // to be processed in the next iteration) into memory cache, to improve performance.
-    __builtin_prefetch(get_linklist_at_level(candidate_set.top().second, layer));
+    __builtin_prefetch(getNodeNeighborsAtLevel(candidate_set.top().second, layer));
 }
 
 template <typename DataType, typename DistType>
@@ -644,116 +764,204 @@ void HNSWIndex<DataType, DistType>::getNeighborsByHeuristic2(
 }
 
 template <typename DataType, typename DistType>
-idType HNSWIndex<DataType, DistType>::mutuallyConnectNewElement(
-    idType cur_c, candidatesMaxHeap<DistType> &top_candidates, size_t level) {
-    size_t Mcurmax = level ? maxM_ : maxM0_;
-    getNeighborsByHeuristic2(top_candidates, M_);
-    if (top_candidates.size() > M_)
-        throw std::runtime_error(
-            "Should be not be more than M_ candidates returned by the heuristic");
+void HNSWIndex<DataType, DistType>::revisitNeighborConnections(
+    size_t level, idType new_node_id, const std::pair<DistType, idType> &neighbor_data,
+    idType *new_node_neighbors_list, idType *neighbor_neighbors_list,
+    std::unique_lock<std::mutex> &node_lock, std::unique_lock<std::mutex> &neighbor_lock) {
+    // Note - node_lock and neighbor_lock must be held on entry; both are released below.
 
-    vecsim_stl::vector<idType> selectedNeighbors(this->allocator);
-    selectedNeighbors.reserve(M_);
-    while (top_candidates.size() > 0) {
-        selectedNeighbors.push_back(top_candidates.top().second);
-        top_candidates.pop();
+    // Collect the existing neighbors and the new node as the neighbor's neighbors candidates.
+    candidatesMaxHeap<DistType> candidates(this->allocator);
+    // Add the new node along with the pre-calculated distance to the current neighbor.
+    candidates.emplace(neighbor_data.first, new_node_id);
+
+    idType selected_neighbor = neighbor_data.second;
+    for (size_t j = 0; j < getNodeNeighborsCount(neighbor_neighbors_list); j++) {
+        candidates.emplace(this->dist_func(getDataByInternalId(neighbor_neighbors_list[j]),
+                                           getDataByInternalId(selected_neighbor), this->dim),
+                           neighbor_neighbors_list[j]);
     }
 
-    idType next_closest_entry_point = selectedNeighbors.back();
-    {
-        idType *ll_cur = get_linklist_at_level(cur_c, level);
-        assert(getListCount(ll_cur) == 0 &&
-               "The newly inserted element should have blank link list");
-        const linkListSize size = selectedNeighbors.size();
-        setListCount(ll_cur, size);
+    std::vector<idType> nodes_to_update;
+    auto orig_candidates = candidates;
+
+    // Candidates will store the newly selected neighbours (for the neighbor).
+    size_t max_M_cur = level ? maxM_ : maxM0_;
+    getNeighborsByHeuristic2(candidates, max_M_cur);
+
+    // Go over the original candidates set, and save the ones chosen to be removed to update later
+    // on.
+    bool cur_node_chosen = false;
+    while (orig_candidates.size() > 0) {
+        idType orig_candidate = orig_candidates.top().second;
+        // If the current original candidate was not selected as neighbor by the heuristics, it
+        // should be updated and removed from the neighbor's neighbors.
+        if (candidates.empty() || orig_candidate != candidates.top().second) {
+            // Don't add the new_node_id to nodes_to_update, it will be inserted either way later.
+            if (orig_candidate != new_node_id) {
+                nodes_to_update.push_back(orig_candidate);
+            }
+            orig_candidates.pop();
+            // Otherwise, the original candidate was selected to remain a neighbor - no need to
+            // update.
+        } else {
+            candidates.pop();
+            orig_candidates.pop();
+            if (orig_candidate == new_node_id) {
+                cur_node_chosen = true;
+            }
+        }
+    }
+
+    // Acquire all relevant locks for making the updates for the selected neighbor - all its removed
+    // neighbors, along with the neighbor itself and the new node (in ascending id order).
+    // But first, we release the node and neighbor locks to avoid deadlocks.
+    node_lock.unlock();
+    neighbor_lock.unlock();
 
-        for (auto cur_neighbor = selectedNeighbors.rbegin();
-             cur_neighbor != selectedNeighbors.rend(); ++cur_neighbor) {
+    nodes_to_update.push_back(selected_neighbor);
+    nodes_to_update.push_back(new_node_id);
 
-            assert(*ll_cur == 0 && "Possible memory corruption");
-            assert(level <= element_levels_[*cur_neighbor] &&
-                   "Trying to make a link on a non-existent level");
+    std::sort(nodes_to_update.begin(), nodes_to_update.end());
+    size_t nodes_to_update_count = nodes_to_update.size();
+    std::vector<std::unique_lock<std::mutex>> locks(nodes_to_update_count);
+    for (size_t i = 0; i < nodes_to_update_count; i++) {
+        locks[i] = std::unique_lock<std::mutex>(element_neighbors_locks_[nodes_to_update[i]]);
+    }
+
+    auto *neighbour_incoming_edges = getIncomingEdgesPtr(selected_neighbor, level);
+    size_t neighbor_neighbors_count = getNodeNeighborsCount(neighbor_neighbors_list);
 
-            *ll_cur = *cur_neighbor;
-            ll_cur++;
+    size_t neighbour_neighbours_idx = 0;
+    bool update_cur_node_required = true;
+    for (size_t i = 0; i < neighbor_neighbors_count; i++) {
+        if (!std::binary_search(nodes_to_update.begin(), nodes_to_update.end(),
+                                neighbor_neighbors_list[i])) {
+            // The neighbor is not in the "to_update" nodes list - leave it as is.
+            neighbor_neighbors_list[neighbour_neighbours_idx++] = neighbor_neighbors_list[i];
+            continue;
+        } else if (neighbor_neighbors_list[i] == new_node_id) {
+            // The new node got into the neighbor's neighbours - this means there was an update in
+            // another thread between releasing and reacquiring the locks - leave it
+            // as is.
+            neighbor_neighbors_list[neighbour_neighbours_idx++] = neighbor_neighbors_list[i];
+            update_cur_node_required = false;
+            continue;
+        }
+        // Now we know that we are looking at a node to be removed from the neighbor's neighbors.
+        auto removed_node = neighbor_neighbors_list[i];
+        auto *removed_node_incoming_edges = getIncomingEdgesPtr(removed_node, level);
+        // Perform the mutual update:
+        // if the removed node id (the neighbour's neighbour to be removed)
+        // wasn't pointing to the neighbour (i.e., the edge was uni-directional),
+        // we should remove the current neighbor from the node's incoming edges.
+        // otherwise, the edge turned from bidirectional to uni-directional, so we insert it to the
+        // neighbour's incoming edges set. Note: we assume that every update is performed atomically
+        // mutually, so it should be sufficient to look at the removed node's incoming edges set
+        // alone.
+        if (!removeIdFromList(*removed_node_incoming_edges, selected_neighbor)) {
+            neighbour_incoming_edges->push_back(removed_node);
         }
+    }
 
-        auto *incoming_edges = new (this->allocator) vecsim_stl::vector<idType>(this->allocator);
-        setIncomingEdgesPtr(cur_c, level, (void *)incoming_edges);
+    size_t cur_node_neighbors_count = getNodeNeighborsCount(new_node_neighbors_list);
+    if (update_cur_node_required && cur_node_neighbors_count < max_M_cur &&
+        !isMarkedDeleted(new_node_id) && !isMarkedDeleted(selected_neighbor)) {
+        // update the connection between the new node and the neighbor.
+        new_node_neighbors_list[cur_node_neighbors_count++] = selected_neighbor;
+        setNodeNeighborsCount(new_node_neighbors_list, cur_node_neighbors_count);
+        if (cur_node_chosen && neighbour_neighbours_idx < max_M_cur) {
+            // connection is mutual - both new node and the selected neighbor in each other's list.
+            neighbor_neighbors_list[neighbour_neighbours_idx++] = new_node_id;
+        } else {
+            // unidirectional connection - put the new node in the neighbour's incoming edges.
+            neighbour_incoming_edges->push_back(new_node_id);
+        }
     }
+    // Done updating the neighbor's neighbors.
+    setNodeNeighborsCount(neighbor_neighbors_list, neighbour_neighbours_idx);
+}
 
-    // go over the selected neighbours - selectedNeighbor is the neighbour id
-    vecsim_stl::vector<bool> neighbors_bitmap(this->allocator);
-    for (idType selectedNeighbor : selectedNeighbors) {
-#ifdef ENABLE_PARALLELIZATION
-        std::unique_lock<std::mutex> lock(link_list_locks_[selectedNeighbor]);
-#endif
-        idType *neighbor_neighbors = get_linklist_at_level(selectedNeighbor, level);
-        linkListSize sz_link_list_other = getListCount(neighbor_neighbors);
-
-        if (sz_link_list_other > Mcurmax)
-            throw std::runtime_error("Bad value of sz_link_list_other");
-        if (selectedNeighbor == cur_c)
-            throw std::runtime_error("Trying to connect an element to itself");
-        if (level > element_levels_[selectedNeighbor])
-            throw std::runtime_error("Trying to make a link on a non-existent level");
-
-        // If the selected neighbor can add another link (hasn't reached the max) - add it.
-        if (sz_link_list_other < Mcurmax) {
-            neighbor_neighbors[sz_link_list_other] = cur_c;
-            setListCount(neighbor_neighbors, sz_link_list_other + 1);
+template <typename DataType, typename DistType>
+idType HNSWIndex<DataType, DistType>::mutuallyConnectNewElement(
+    idType new_node_id, candidatesMaxHeap<DistType> &top_candidates, size_t level) {
+
+    // The maximum number of neighbors allowed for an existing neighbor (not new).
+    size_t max_M_cur = level ? maxM_ : maxM0_;
+
+    // Filter the top candidates to the selected neighbors by the algorithm heuristics.
+    getNeighborsByHeuristic2(top_candidates, M_);
+    assert(top_candidates.size() <= M_ &&
+           "Should be not be more than M_ candidates returned by the heuristic");
+
+    // Hold (distance_from_new_node_id, neighbor_id) pair for every selected neighbor.
+    vecsim_stl::vector<std::pair<DistType, idType>> selected_neighbors(this->allocator);
+    selected_neighbors.reserve(M_);
+    while (!top_candidates.empty()) {
+        selected_neighbors.push_back(top_candidates.top());
+        top_candidates.pop();
+    }
+
+    // The closest vector that was found - to be returned (so the scan in the next level starts
+    // from it).
+    idType next_closest_entry_point = selected_neighbors.back().second;
+    idType *new_node_neighbors_list = getNodeNeighborsAtLevel(new_node_id, level);
+    assert(getNodeNeighborsCount(new_node_neighbors_list) == 0 &&
+           "The newly inserted element should have blank link list");
+
+    // Create the incoming edges for the new node in the current level.
+    auto *incoming_edges = new (this->allocator) vecsim_stl::vector<idType>(this->allocator);
+    setIncomingEdgesPtr(new_node_id, level, (void *)incoming_edges);
+
+    for (auto &neighbor_data : selected_neighbors) {
+        idType selected_neighbor = neighbor_data.second; // neighbor's id
+        std::unique_lock<std::mutex> node_lock;
+        std::unique_lock<std::mutex> neighbor_lock;
+        idType lower_id = (new_node_id < selected_neighbor) ? new_node_id : selected_neighbor;
+        if (lower_id == new_node_id) {
+            node_lock = std::unique_lock<std::mutex>(element_neighbors_locks_[new_node_id]);
+            neighbor_lock =
+                std::unique_lock<std::mutex>(element_neighbors_locks_[selected_neighbor]);
         } else {
-            // try finding "weak" elements to replace it with the new one with the heuristic:
-            candidatesMaxHeap<DistType> candidates(this->allocator);
-            // (re)use the bitmap to represent the set of the original neighbours for the current
-            // selected neighbour.
-            neighbors_bitmap.assign(cur_element_count, false);
-            DistType d_max = this->dist_func(getDataByInternalId(cur_c),
-                                             getDataByInternalId(selectedNeighbor), this->dim);
-            candidates.emplace(d_max, cur_c);
-            // consider cur_c as if it was a link of the selected neighbor
-            neighbors_bitmap[cur_c] = true;
-            for (size_t j = 0; j < sz_link_list_other; j++) {
-                candidates.emplace(this->dist_func(getDataByInternalId(neighbor_neighbors[j]),
-                                                   getDataByInternalId(selectedNeighbor),
-                                                   this->dim),
-                                   neighbor_neighbors[j]);
-                neighbors_bitmap[neighbor_neighbors[j]] = true;
-            }
+            neighbor_lock =
+                std::unique_lock<std::mutex>(element_neighbors_locks_[selected_neighbor]);
+            node_lock = std::unique_lock<std::mutex>(element_neighbors_locks_[new_node_id]);
+        }
 
-            idType removed_links[sz_link_list_other + 1];
-            size_t removed_links_num;
-            removeExtraLinks(candidates, Mcurmax, neighbor_neighbors, neighbors_bitmap,
-                             removed_links, &removed_links_num);
-
-            // remove the current neighbor from the incoming list of nodes for the
-            // neighbours that were chosen to remove (if edge wasn't bidirectional)
-            auto *neighbour_incoming_edges = getIncomingEdgesPtr(selectedNeighbor, level);
-            for (size_t i = 0; i < removed_links_num; i++) {
-                idType node_id = removed_links[i];
-                auto *node_incoming_edges = getIncomingEdgesPtr(node_id, level);
-                // if we removed cur_c (the node just inserted), then it points to the current
-                // neighbour, but not vise versa.
-                if (node_id == cur_c) {
-                    neighbour_incoming_edges->push_back(cur_c);
-                    continue;
-                }
+        // get the updated count - this may change between iterations due to releasing the lock.
+        linkListSize cur_node_neighbors_count = getNodeNeighborsCount(new_node_neighbors_list);
+        idType *neighbor_neighbors_list = getNodeNeighborsAtLevel(selected_neighbor, level);
+        linkListSize neighbor_neighbors_count = getNodeNeighborsCount(neighbor_neighbors_list);
 
-                // if the node id (the neighbour's neighbour to be removed)
-                // wasn't pointing to the neighbour (i.e., the edge was uni-directional),
-                // we should remove the current neighbor from the node's incoming edges.
-                // otherwise, the edge turned from bidirectional to
-                // uni-directional, so we insert it to the neighbour's
-                // incoming edges set.
-                auto it = std::find(node_incoming_edges->begin(), node_incoming_edges->end(),
-                                    selectedNeighbor);
-                if (it != node_incoming_edges->end()) {
-                    node_incoming_edges->erase(it);
-                } else {
-                    neighbour_incoming_edges->push_back(node_id);
-                }
-            }
+        // validations...
+        assert(cur_node_neighbors_count <= max_M_cur && "Neighbors number exceeds limit");
+        assert(selected_neighbor != new_node_id && "Trying to connect an element to itself");
+
+        if (cur_node_neighbors_count == max_M_cur) {
+            // The new node cannot add more neighbors
+            break;
         }
+
+        // If one of the two nodes has already been deleted - skip the operation.
+        if (isMarkedDeleted(new_node_id) || isMarkedDeleted(selected_neighbor)) {
+            continue;
+        }
+
+        // if the neighbor's neighbors list has the capacity to add the new node, make the update
+        // and finish.
+        if (neighbor_neighbors_count < max_M_cur) {
+            new_node_neighbors_list[cur_node_neighbors_count] = selected_neighbor;
+            setNodeNeighborsCount(new_node_neighbors_list, cur_node_neighbors_count + 1);
+            neighbor_neighbors_list[neighbor_neighbors_count] = new_node_id;
+            setNodeNeighborsCount(neighbor_neighbors_list, neighbor_neighbors_count + 1);
+            continue;
+        }
+
+        // Otherwise - we need to re-evaluate the neighbor's neighbors.
+        // We collect all the existing neighbors and the new node as candidates, and mutually update
+        // the neighbor's neighbors.
+        revisitNeighborConnections(level, new_node_id, neighbor_data, new_node_neighbors_list,
+                                   neighbor_neighbors_list, node_lock, neighbor_lock);
     }
     return next_closest_entry_point;
 }
@@ -765,7 +973,7 @@ void HNSWIndex<DataType, DistType>::repairConnectionsForDeletion(
 
     // put the deleted element's neighbours in the candidates.
     candidatesMaxHeap<DistType> candidates(this->allocator);
-    linkListSize neighbours_count = getListCount(neighbours);
+    linkListSize neighbours_count = getNodeNeighborsCount(neighbours);
     for (size_t j = 0; j < neighbours_count; j++) {
         // Don't put the neighbor itself in his own candidates
         if (neighbours[j] == neighbour_id) {
@@ -779,7 +987,7 @@ void HNSWIndex<DataType, DistType>::repairConnectionsForDeletion(
     // add the deleted element's neighbour's original neighbors in the candidates.
     vecsim_stl::vector<bool> neighbour_orig_neighbours_set(cur_element_count, false,
                                                            this->allocator);
-    linkListSize neighbour_neighbours_count = getListCount(neighbour_neighbours);
+    linkListSize neighbour_neighbours_count = getNodeNeighborsCount(neighbour_neighbours);
 
     for (size_t j = 0; j < neighbour_neighbours_count; j++) {
         neighbour_orig_neighbours_set[neighbour_neighbours[j]] = true;
@@ -813,16 +1021,13 @@ void HNSWIndex<DataType, DistType>::repairConnectionsForDeletion(
         // we should remove it from the node's incoming edges.
         // otherwise, edge turned from bidirectional to one directional,
         // and it should be saved in the neighbor's incoming edges.
-        auto it = std::find(node_incoming_edges->begin(), node_incoming_edges->end(), neighbour_id);
-        if (it != node_incoming_edges->end()) {
-            node_incoming_edges->erase(it);
-        } else {
+        if (!removeIdFromList(*node_incoming_edges, neighbour_id)) {
             neighbour_incoming_edges->push_back(node_id);
         }
     }
 
     // updates for the new edges created
-    linkListSize updated_links_num = getListCount(neighbour_neighbours);
+    linkListSize updated_links_num = getNodeNeighborsCount(neighbour_neighbours);
     for (size_t i = 0; i < updated_links_num; i++) {
         idType node_id = neighbour_neighbours[i];
         if (!neighbour_orig_neighbours_set[node_id]) {
@@ -830,15 +1035,15 @@ void HNSWIndex<DataType, DistType>::repairConnectionsForDeletion(
             // if the node has an edge to the neighbour as well, remove it
             // from the incoming nodes of the neighbour
             // otherwise, need to update the edge as incoming.
-            idType *node_links = get_linklist_at_level(node_id, level);
-            unsigned short node_links_size = getListCount(node_links);
+            idType *node_links = getNodeNeighborsAtLevel(node_id, level);
+            unsigned short node_links_size = getNodeNeighborsCount(node_links);
 
             bool bidirectional_edge = false;
             for (size_t j = 0; j < node_links_size; j++) {
                 if (node_links[j] == neighbour_id) {
-                    neighbour_incoming_edges->erase(std::find(neighbour_incoming_edges->begin(),
-                                                              neighbour_incoming_edges->end(),
-                                                              node_id));
+                    // Swap the last element with the current one (equivalent to removing the
+                    // neighbor from the list) - this should always succeed and return true.
+                    removeIdFromList(*neighbour_incoming_edges, node_id);
                     bidirectional_edge = true;
                     break;
                 }
@@ -855,49 +1060,91 @@ void HNSWIndex<DataType, DistType>::replaceEntryPoint() {
     idType old_entry = entrypoint_node_;
     // Sets an (arbitrary) new entry point, after deleting the current entry point.
     while (old_entry == entrypoint_node_) {
-        idType *top_level_list = get_linklist_at_level(old_entry, maxlevel_);
-        if (getListCount(top_level_list) > 0) {
-            // Tries to set the (arbitrary) first neighbor as the entry point, if exists.
-            entrypoint_node_ = *top_level_list;
-        } else {
-            // If there is no neighbors in the current level, check for any vector at
-            // this level to be the new entry point.
-            for (idType cur_id = 0; cur_id < cur_element_count; cur_id++) {
-                if (element_levels_[cur_id] == maxlevel_ && cur_id != old_entry) {
+        // Use volatile for this variable, so that in case we would have to busy wait for this
+        // element to finish its indexing, the compiler will not use optimizations. Otherwise,
+        // the compiler might evaluate 'isInProcess(candidate_in_process)' once instead of calling
+        // it multiple times in a busy wait manner, and we'll run into an infinite loop if the
+        // candidate is in process when we reach the loop.
+        volatile idType candidate_in_process = INVALID_ID;
+        {
+            // Go over the entry point's neighbors at the top level.
+            std::unique_lock<std::mutex> lock(this->element_neighbors_locks_[entrypoint_node_]);
+            idType *top_level_list = getNodeNeighborsAtLevel(old_entry, max_level_);
+            auto neighbors_count = getNodeNeighborsCount(top_level_list);
+            // Tries to set the (arbitrary) first neighbor as the entry point which is not deleted,
+            // if exists.
+            for (size_t i = 0; i < neighbors_count; i++) {
+                if (!isMarkedDeleted(top_level_list[i])) {
+                    if (!isInProcess(top_level_list[i])) {
+                        entrypoint_node_ = top_level_list[i];
+                        return;
+                    } else {
+                        // Store this candidate which is currently being inserted into the graph in
+                        // case we won't find other candidate at the top level.
+                        candidate_in_process = top_level_list[i];
+                    }
+                }
+            }
+        }
+        // If there is no neighbors in the current level, check for any vector at
+        // this level to be the new entry point.
+        for (idType cur_id = 0; cur_id < cur_element_count; cur_id++) {
+            if (element_levels_[cur_id] == max_level_ && cur_id != old_entry &&
+                !isMarkedDeleted(cur_id)) {
+                // Found a non-deleted element in the current max level.
+                if (!isInProcess(cur_id)) {
                     entrypoint_node_ = cur_id;
-                    break;
+                    return;
+                } else if (candidate_in_process == INVALID_ID) {
+                    // This element is still in process, and no other in-process candidate has
+                    // been found at this level so far.
+                    candidate_in_process = cur_id;
                 }
             }
         }
-        // If we didn't find any vector at the top level, decrease the maxlevel_ and try again,
+        // If we only found candidates which are in process at this level, do busy wait until they
+        // are done being processed (this should happen in very rare cases...). Since
+        // candidate_in_process was declared volatile, we can be sure that isInProcess is called in
+        // every iteration.
+        if (candidate_in_process != INVALID_ID) {
+            while (isInProcess(candidate_in_process))
+                ;
+            entrypoint_node_ = candidate_in_process;
+            return;
+        }
+        // If we didn't find any vector at the top level, decrease the max_level_ and try again,
         // until we find a new entry point, or the index is empty.
-        if (old_entry == entrypoint_node_) {
-            maxlevel_--;
-            if ((int)maxlevel_ < 0) {
-                maxlevel_ = HNSW_INVALID_LEVEL;
-                entrypoint_node_ = HNSW_INVALID_ID;
-            }
+        assert(old_entry == entrypoint_node_);
+        max_level_--;
+        if ((int)max_level_ < 0) {
+            max_level_ = HNSW_INVALID_LEVEL;
+            entrypoint_node_ = INVALID_ID;
         }
     }
 }
 
 template <typename DataType, typename DistType>
+template <bool has_marked_deleted>
 void HNSWIndex<DataType, DistType>::SwapLastIdWithDeletedId(idType element_internal_id) {
-    // swap label
-    replaceIdOfLabel(getExternalLabel(cur_element_count), element_internal_id, cur_element_count);
+    // Swap label - this is relevant when the last element's label exists (it is not marked as
+    // deleted). For inplace delete, this is always the case.
+    if (!has_marked_deleted || !isMarkedDeleted(cur_element_count)) {
+        replaceIdOfLabel(getExternalLabel(cur_element_count), element_internal_id,
+                         cur_element_count);
+    }
 
-    // swap neighbours
+    // Swap neighbours
     size_t last_element_top_level = element_levels_[cur_element_count];
     for (size_t level = 0; level <= last_element_top_level; level++) {
-        idType *neighbours = get_linklist_at_level(cur_element_count, level);
-        linkListSize neighbours_count = getListCount(neighbours);
+        idType *neighbours = getNodeNeighborsAtLevel(cur_element_count, level);
+        linkListSize neighbours_count = getNodeNeighborsCount(neighbours);
 
-        // go over the neighbours that also points back to the last element whose is going to
+        // Go over the neighbours that also point back to the last element, whose id is going to
         // change, and update the id.
         for (size_t i = 0; i < neighbours_count; i++) {
             idType neighbour_id = neighbours[i];
-            idType *neighbour_neighbours = get_linklist_at_level(neighbour_id, level);
-            linkListSize neighbour_neighbours_count = getListCount(neighbour_neighbours);
+            idType *neighbour_neighbours = getNodeNeighborsAtLevel(neighbour_id, level);
+            linkListSize neighbour_neighbours_count = getNodeNeighborsCount(neighbour_neighbours);
 
             bool bidirectional_edge = false;
             for (size_t j = 0; j < neighbour_neighbours_count; j++) {
@@ -909,25 +1156,23 @@ void HNSWIndex<DataType, DistType>::SwapLastIdWithDeletedId(idType element_inter
                 }
             }
 
-            // if this edge is uni-directional, we should update the id in the neighbor's
+            // If this edge is uni-directional, we should update the id in the neighbor's
             // incoming edges.
             if (!bidirectional_edge) {
                 auto *neighbour_incoming_edges = getIncomingEdgesPtr(neighbour_id, level);
-                auto it = std::find(neighbour_incoming_edges->begin(),
-                                    neighbour_incoming_edges->end(), cur_element_count);
-                assert(it != neighbour_incoming_edges->end());
-                neighbour_incoming_edges->erase(it);
+                // This should always succeed and return true.
+                removeIdFromList(*neighbour_incoming_edges, cur_element_count);
                 neighbour_incoming_edges->push_back(element_internal_id);
             }
         }
 
-        // next, go over the rest of incoming edges (the ones that are not bidirectional) and make
+        // Next, go over the rest of incoming edges (the ones that are not bidirectional) and make
         // updates.
         auto *incoming_edges = getIncomingEdgesPtr(cur_element_count, level);
         for (auto incoming_edge : *incoming_edges) {
-            idType *incoming_neighbour_neighbours = get_linklist_at_level(incoming_edge, level);
+            idType *incoming_neighbour_neighbours = getNodeNeighborsAtLevel(incoming_edge, level);
             linkListSize incoming_neighbour_neighbours_count =
-                getListCount(incoming_neighbour_neighbours);
+                getNodeNeighborsCount(incoming_neighbour_neighbours);
             for (size_t j = 0; j < incoming_neighbour_neighbours_count; j++) {
                 if (incoming_neighbour_neighbours[j] == cur_element_count) {
                     incoming_neighbour_neighbours[j] = element_internal_id;
@@ -937,18 +1182,18 @@ void HNSWIndex<DataType, DistType>::SwapLastIdWithDeletedId(idType element_inter
         }
     }
 
-    // swap the last_id level 0 data, and invalidate the deleted id's data
+    // Swap the last_id level 0 data, and invalidate the deleted id's data.
     memcpy(data_level0_memory_ + element_internal_id * size_data_per_element_ + offsetLevel0_,
            data_level0_memory_ + cur_element_count * size_data_per_element_ + offsetLevel0_,
            size_data_per_element_);
     memset(data_level0_memory_ + cur_element_count * size_data_per_element_ + offsetLevel0_, 0,
            size_data_per_element_);
 
-    // swap pointer of higher levels links
+    // Swap pointer of higher levels links.
     linkLists_[element_internal_id] = linkLists_[cur_element_count];
     linkLists_[cur_element_count] = nullptr;
 
-    // swap top element level
+    // Swap top element level.
     element_levels_[element_internal_id] = element_levels_[cur_element_count];
     element_levels_[cur_element_count] = HNSW_INVALID_LEVEL;
 
@@ -959,41 +1204,106 @@ void HNSWIndex<DataType, DistType>::SwapLastIdWithDeletedId(idType element_inter
 
 // This function is greedily searching for the closest candidate to the given data point at the
 // given level, starting at the given node. It sets `curObj` to the closest node found, and
-// `curDist` to the distance to this node. If `with_timeout` is true, the search will check for
-// timeout and return if it has occurred. `timeoutCtx` and `rc` must be valid if `with_timeout` is
-// true.
+// `curDist` to the distance to this node. If `running_query` is true, the search will check for
+// timeout and return if it has occurred. `timeoutCtx` and `rc` must be valid if `running_query` is
+// true. *Note that we assume that level is higher than 0*. Also, if we're not running a query (we
+// are searching neighbors for a new vector), then bestCand should be a non-deleted element!
 template <typename DataType, typename DistType>
-template <bool with_timeout>
+template <bool running_query>
 void HNSWIndex<DataType, DistType>::greedySearchLevel(const void *vector_data, size_t level,
-                                                      idType &curObj, DistType &curDist,
+                                                      idType &bestCand, DistType &curDist,
                                                       void *timeoutCtx,
                                                       VecSimQueryResult_Code *rc) const {
     bool changed;
+    // Don't allow choosing a deleted node as an entry point upon searching for neighbors
+    // candidates (that is, we're NOT running a query, but inserting a new vector).
+    idType bestNonDeletedCand = bestCand;
+
     do {
-        if (with_timeout && VECSIM_TIMEOUT(timeoutCtx)) {
+        if (running_query && VECSIM_TIMEOUT(timeoutCtx)) {
             *rc = VecSim_QueryResult_TimedOut;
-            curObj = HNSW_INVALID_ID;
+            bestCand = INVALID_ID;
             return;
         }
+
         changed = false;
-#ifdef ENABLE_PARALLELIZATION
-        std::unique_lock<std::mutex> lock(link_list_locks_[currObj]);
-#endif
-        idType *node_links = get_linklist(curObj, level);
-        linkListSize links_count = getListCount(node_links);
+        std::unique_lock<std::mutex> lock(element_neighbors_locks_[bestCand]);
+        idType *node_links = getNodeNeighborsAtNonBaseLevel(bestCand, level);
+        linkListSize links_count = getNodeNeighborsCount(node_links);
 
         for (int i = 0; i < links_count; i++) {
             idType candidate = node_links[i];
             assert(candidate < this->cur_element_count && "candidate error: out of index range");
-
+            if (isInProcess(candidate)) {
+                continue;
+            }
             DistType d = this->dist_func(vector_data, getDataByInternalId(candidate), this->dim);
             if (d < curDist) {
                 curDist = d;
-                curObj = candidate;
+                bestCand = candidate;
                 changed = true;
+                // Run this code only for non-query code - update the best non deleted cand as well.
+                // Upon running a query, we don't mind having a deleted element as an entry point
+                // for the next level, as eventually we return non-deleted elements in level 0.
+                if (!running_query && !isMarkedDeleted(candidate)) {
+                    bestNonDeletedCand = bestCand;
+                }
             }
         }
     } while (changed);
+    if (!running_query) {
+        bestCand = bestNonDeletedCand;
+    }
+}
+
+template <typename DataType, typename DistType>
+vecsim_stl::vector<graphNodeType>
+HNSWIndex<DataType, DistType>::safeCollectAllNodeIncomingNeighbors(idType node_id,
+                                                                   size_t node_top_level) {
+    vecsim_stl::vector<graphNodeType> incoming_neighbors(this->allocator);
+
+    for (size_t level = 0; level <= node_top_level; level++) {
+        // Save the node's neighbors in the current level while holding its neighbors lock.
+        std::vector<idType> neighbors_copy;
+        std::unique_lock<std::mutex> element_lock(element_neighbors_locks_[node_id]);
+        auto *neighbours = getNodeNeighborsAtLevel(node_id, level);
+        unsigned short neighbours_count = getNodeNeighborsCount(neighbours);
+        // Store the deleted element's neighbours.
+        neighbors_copy.assign(neighbours, neighbours + neighbours_count);
+        element_lock.unlock();
+
+        // Go over the neighbours and collect the ones that also point back to the removed node.
+        for (auto neighbour_id : neighbors_copy) {
+            // Hold the neighbor's lock while we are going over its neighbors.
+            std::unique_lock<std::mutex> neighbor_lock(element_neighbors_locks_[neighbour_id]);
+            auto *neighbour_neighbours = getNodeNeighborsAtLevel(neighbour_id, level);
+            unsigned short neighbour_neighbours_count = getNodeNeighborsCount(neighbour_neighbours);
+            for (size_t j = 0; j < neighbour_neighbours_count; j++) {
+                // A bidirectional edge was found - this connection should be repaired.
+                if (neighbour_neighbours[j] == node_id) {
+                    incoming_neighbors.emplace_back(neighbour_id, (ushort)level);
+                    break;
+                }
+            }
+        }
+
+        // Next, collect the rest of incoming edges (the ones that are not bidirectional) in the
+        // current level to repair them.
+        element_lock.lock();
+        auto *incoming_edges = getIncomingEdgesPtr(node_id, level);
+        // Note that the deleted element might be in the process of indexing into the graph in the
+        // meantime (in async mode). Since the incoming_edges lists in every level are allocated
+        // while the element is being indexed into that level (in lazy mode), we may find ourselves
+        // in a situation where the incoming edges were not allocated yet in this level (but we do
+        // guarantee that the pointer is NULL in that case). In which case, we just continue. We
+        // also validate that we won't add new edges to a deleted node later on.
+        if (!incoming_edges)
+            continue;
+        for (auto incoming_edge : *incoming_edges) {
+            incoming_neighbors.emplace_back(incoming_edge, (ushort)level);
+        }
+    }
+    return incoming_neighbors;
 }
 
 template <typename DataType, typename DistType>
@@ -1002,9 +1312,8 @@ void HNSWIndex<DataType, DistType>::resizeIndexInternal(size_t new_max_elements)
     element_levels_.shrink_to_fit();
     resizeLabelLookup(new_max_elements);
     visited_nodes_handler_pool.resize(new_max_elements);
-#ifdef ENABLE_PARALLELIZATION
-    std::vector<std::mutex>(new_max_elements).swap(link_list_locks_);
-#endif
+    vecsim_stl::vector<std::mutex>(new_max_elements, this->allocator)
+        .swap(element_neighbors_locks_);
     // Reallocate base layer
     char *data_level0_memory_new = (char *)this->allocator->reallocate(
         data_level0_memory_, new_max_elements * size_data_per_element_);
@@ -1022,6 +1331,217 @@ void HNSWIndex<DataType, DistType>::resizeIndexInternal(size_t new_max_elements)
     max_elements_ = new_max_elements;
 }
 
+template <typename DataType, typename DistType>
+void HNSWIndex<DataType, DistType>::mutuallyUpdateForRepairedNode(
+    idType node_id, size_t level, vecsim_stl::vector<idType> &neighbors_to_remove,
+    vecsim_stl::vector<idType> &nodes_to_update, vecsim_stl::vector<idType> &chosen_neighbors,
+    size_t max_M_cur) {
+    // Sort the nodes to remove set for fast lookup.
+    std::sort(neighbors_to_remove.begin(), neighbors_to_remove.end());
+
+    // Acquire the required locks for the updates, after sorting the nodes to update
+    // (to avoid deadlocks)
+    nodes_to_update.push_back(node_id);
+    std::sort(nodes_to_update.begin(), nodes_to_update.end());
+    size_t nodes_to_update_count = nodes_to_update.size();
+    std::unique_lock<std::mutex> locks[nodes_to_update_count];
+    for (size_t i = 0; i < nodes_to_update_count; i++) {
+        locks[i] = std::unique_lock<std::mutex>(element_neighbors_locks_[nodes_to_update[i]]);
+    }
+
+    idType *node_neighbors = getNodeNeighborsAtLevel(node_id, level);
+    linkListSize node_neighbors_count = getNodeNeighborsCount(node_neighbors);
+    auto *node_incoming_edges = getIncomingEdgesPtr(node_id, level);
+
+    // Perform mutual updates: go over the node's neighbors and overwrite the neighbors to remove
+    // that still exist.
+    size_t node_neighbors_idx = 0;
+    for (size_t i = 0; i < node_neighbors_count; i++) {
+        if (!std::binary_search(nodes_to_update.begin(), nodes_to_update.end(),
+                                node_neighbors[i])) {
+            // The repaired node added a new neighbor that we didn't account for before in the
+            // meantime - leave it as is.
+            node_neighbors[node_neighbors_idx++] = node_neighbors[i];
+            continue;
+        }
+        // Check if the current neighbor is in the chosen neighbors list, and remove it from there
+        // if so.
+        if (removeIdFromList(chosen_neighbors, node_neighbors[i])) {
+            // A chosen neighbor is already connected to the node - leave it as is.
+            node_neighbors[node_neighbors_idx++] = node_neighbors[i];
+            continue;
+        }
+        // Now we know that we are looking at a neighbor that needs to be removed.
+        auto removed_node = node_neighbors[i];
+        auto *removed_node_incoming_edges = getIncomingEdgesPtr(removed_node, level);
+        // Perform the mutual update:
+        // if the removed node id (the node's neighbour to be removed)
+        // wasn't pointing to the node (i.e., the edge was uni-directional),
+        // we should remove the current neighbor from the node's incoming edges.
+        // otherwise, the edge turned from bidirectional to uni-directional, so we insert it to the
+        // neighbour's incoming edges set. Note: we assume that every update is performed atomically
+        // mutually, so it should be sufficient to look at the removed node's incoming edges set
+        // alone.
+        if (!removeIdFromList(*removed_node_incoming_edges, node_id)) {
+            node_incoming_edges->push_back(removed_node);
+        }
+    }
+
+    // Go over the chosen new neighbors that are not connected yet and perform updates.
+    for (auto chosen_id : chosen_neighbors) {
+        if (node_neighbors_idx == max_M_cur) {
+            // Cannot add more new neighbors, we reached the capacity.
+            this->log("Couldn't add all the chosen new nodes upon updating %u, as we reached the"
+                      " maximum number of neighbors per node",
+                      node_id);
+            break;
+        }
+        // We don't add new neighbors for deleted nodes - if node_id is deleted we can finish.
+        // Also, don't add new neighbors to a node who is currently being indexed in parallel, as it
+        // may choose the same element as its neighbor right after the repair is done and connect it
+        // to it, and have a duplicate neighbor as a result.
+        if (isMarkedDeleted(node_id) || isInProcess(node_id)) {
+            break;
+        }
+        // If this specific new neighbor is deleted, we don't add this connection and continue.
+        // Also, don't add a new node that is being indexed in parallel, as it may choose this node
+        // as its neighbor and create a double connection (then this node will have a duplicate
+        // neighbor).
+        if (isMarkedDeleted(chosen_id) || isInProcess(chosen_id)) {
+            continue;
+        }
+        auto *new_neighbor_incoming_edges = getIncomingEdgesPtr(chosen_id, level);
+        node_neighbors[node_neighbors_idx++] = chosen_id;
+        // If the node is in the chosen new node incoming edges, there is a unidirectional
+        // connection from the chosen node to the repaired node that turns into bidirectional. Then,
+        // remove it from the incoming edges set. Otherwise, the edge is created unidirectional, so
+        // we add it to the unidirectional edges set. Note: we assume that all updates occur
+        // mutually and atomically, so we can rely on this assumption.
+        if (!removeIdFromList(*node_incoming_edges, chosen_id)) {
+            new_neighbor_incoming_edges->push_back(node_id);
+        }
+    }
+    // Done updating the node's neighbors.
+    setNodeNeighborsCount(node_neighbors, node_neighbors_idx);
+}
+
+template <typename DataType, typename DistType>
+void HNSWIndex<DataType, DistType>::repairNodeConnections(idType node_id, size_t level) {
+
+    candidatesMaxHeap<DistType> neighbors_candidates(this->allocator);
+    // Use bitmaps for fast accesses:
+    // node_orig_neighbours_set is used to differentiate between the neighbors that will *not* be
+    // selected by the heuristics - only the ones that were originally neighbors should be removed.
+    vecsim_stl::vector<bool> node_orig_neighbours_set(max_elements_, false, this->allocator);
+    // neighbors_candidates_set is used to store the nodes that were already collected as
+    // candidates, so we will not collect them again as candidates if we run into them from another
+    // path.
+    vecsim_stl::vector<bool> neighbors_candidates_set(max_elements_, false, this->allocator);
+    vecsim_stl::vector<idType> deleted_neighbors(this->allocator);
+
+    // Go over the repaired node neighbors, collect the non-deleted ones to be neighbors candidates
+    // after the repair as well.
+    {
+        std::unique_lock<std::mutex> node_lock(element_neighbors_locks_[node_id]);
+        idType *node_neighbors = getNodeNeighborsAtLevel(node_id, level);
+        linkListSize node_neighbors_count = getNodeNeighborsCount(node_neighbors);
+        for (size_t j = 0; j < node_neighbors_count; j++) {
+            node_orig_neighbours_set[node_neighbors[j]] = true;
+            // Don't add the removed element to the candidates.
+            if (isMarkedDeleted(node_neighbors[j])) {
+                deleted_neighbors.push_back(node_neighbors[j]);
+                continue;
+            }
+            neighbors_candidates_set[node_neighbors[j]] = true;
+            neighbors_candidates.emplace(this->dist_func(getDataByInternalId(node_id),
+                                                         getDataByInternalId(node_neighbors[j]),
+                                                         this->dim),
+                                         node_neighbors[j]);
+        }
+    }
+    // If there are no deleted neighbors at this point, the repair job has already been done by
+    // another parallel job, and there is no need to repair the node anymore.
+    if (deleted_neighbors.empty()) {
+        return;
+    }
+
+    // Hold 3 sets of nodes - all the original neighbors at that point to later (potentially)
+    // update, subset of these which are the chosen neighbors nodes, and a subset of the original
+    // neighbors that are going to be removed.
+    vecsim_stl::vector<idType> nodes_to_update(this->allocator);
+    vecsim_stl::vector<idType> chosen_neighbors(this->allocator);
+    vecsim_stl::vector<idType> neighbors_to_remove(this->allocator);
+
+    // Go over the deleted nodes and collect their neighbors to the candidates set.
+    for (idType deleted_neighbor_id : deleted_neighbors) {
+        nodes_to_update.push_back(deleted_neighbor_id);
+        neighbors_to_remove.push_back(deleted_neighbor_id);
+
+        std::unique_lock<std::mutex> neighbor_lock(
+            this->element_neighbors_locks_[deleted_neighbor_id]);
+        idType *neighbor_neighbours = getNodeNeighborsAtLevel(deleted_neighbor_id, level);
+        linkListSize neighbor_neighbours_count = getNodeNeighborsCount(neighbor_neighbours);
+
+        for (size_t j = 0; j < neighbor_neighbours_count; j++) {
+            // Don't add removed elements to the candidates, nor nodes that are already in the
+            // candidates set, nor the original node to repair itself.
+            if (isMarkedDeleted(neighbor_neighbours[j]) ||
+                neighbors_candidates_set[neighbor_neighbours[j]] ||
+                neighbor_neighbours[j] == node_id) {
+                continue;
+            }
+            neighbors_candidates_set[neighbor_neighbours[j]] = true;
+            neighbors_candidates.emplace(
+                this->dist_func(getDataByInternalId(node_id),
+                                getDataByInternalId(neighbor_neighbours[j]), this->dim),
+                neighbor_neighbours[j]);
+        }
+    }
+
+    // Copy the original candidates, and run the heuristics. Afterwards, neighbors_candidates will
+    // store the newly selected neighbours (for the node), while candidates which were originally
+    // neighbors and are not going to be selected, are going to be removed.
+    auto orig_candidates = neighbors_candidates;
+    size_t max_M_cur = level ? maxM_ : maxM0_;
+    getNeighborsByHeuristic2(neighbors_candidates, max_M_cur);
+
+    while (!orig_candidates.empty()) {
+        idType orig_candidate = orig_candidates.top().second;
+        if (neighbors_candidates.empty() || orig_candidate != neighbors_candidates.top().second) {
+            if (node_orig_neighbours_set[orig_candidate]) {
+                neighbors_to_remove.push_back(orig_candidate);
+                nodes_to_update.push_back(orig_candidate);
+            }
+            orig_candidates.pop();
+        } else {
+            chosen_neighbors.push_back(orig_candidate);
+            nodes_to_update.push_back(orig_candidate);
+            neighbors_candidates.pop();
+            orig_candidates.pop();
+        }
+    }
+
+    // Perform the actual updates for the node and the impacted neighbors while holding the nodes'
+    // locks.
+    mutuallyUpdateForRepairedNode(node_id, level, neighbors_to_remove, nodes_to_update,
+                                  chosen_neighbors, max_M_cur);
+}
+
+template <typename DataType, typename DistType>
+inline bool
+HNSWIndex<DataType, DistType>::removeIdFromList(vecsim_stl::vector<idType> &element_ids_list,
+                                                idType element_id) {
+    auto it = std::find(element_ids_list.begin(), element_ids_list.end(), element_id);
+    if (it != element_ids_list.end()) {
+        // Swap the last element with the current one (equivalent to removing the element id from
+        // the list).
+        *it = element_ids_list.back();
+        element_ids_list.pop_back();
+        return true;
+    }
+    return false;
+}
+
 /**
  * Ctor / Dtor
  */
@@ -1038,20 +1558,12 @@ void HNSWIndex<DataType, DistType>::resizeIndexInternal(size_t new_max_elements)
 } HNSWParams; */
 template <typename DataType, typename DistType>
 HNSWIndex<DataType, DistType>::HNSWIndex(const HNSWParams *params,
-                                         std::shared_ptr<VecSimAllocator> allocator,
+                                         const AbstractIndexInitParams &abstractInitParams,
                                          size_t random_seed, size_t pool_initial_size)
-    : VecSimIndexAbstract<DistType>(allocator, params->dim, params->type, params->metric,
-                                    params->blockSize, params->multi),
-      VecSimIndexTombstone(), max_elements_(params->initialCapacity),
-      data_size_(VecSimType_sizeof(params->type) * this->dim),
-      element_levels_(max_elements_, allocator),
-      visited_nodes_handler_pool(pool_initial_size, max_elements_, this->allocator)
-
-#ifdef ENABLE_PARALLELIZATION
-      ,
-      link_list_locks_(max_elements_)
-#endif
-{
+    : VecSimIndexAbstract<DistType>(abstractInitParams), VecSimIndexTombstone(),
+      max_elements_(params->initialCapacity), element_levels_(max_elements_, this->allocator),
+      visited_nodes_handler_pool(pool_initial_size, max_elements_, this->allocator),
+      element_neighbors_locks_(max_elements_, this->allocator) {
     size_t M = params->M ? params->M : HNSW_DEFAULT_M;
     if (M > UINT16_MAX / 2)
         throw std::runtime_error("HNSW index parameter M is too large: argument overflow");
@@ -1068,8 +1580,8 @@ HNSWIndex<DataType, DistType>::HNSWIndex(const HNSWParams *params,
     num_marked_deleted = 0;
 
     // initializations for special treatment of the first node
-    entrypoint_node_ = HNSW_INVALID_ID;
-    maxlevel_ = HNSW_INVALID_LEVEL;
+    entrypoint_node_ = INVALID_ID;
+    max_level_ = HNSW_INVALID_LEVEL;
 
     if (M <= 1)
         throw std::runtime_error("HNSW index parameter M cannot be 1");
@@ -1077,18 +1589,18 @@ HNSWIndex<DataType, DistType>::HNSWIndex(const HNSWParams *params,
     level_generator_.seed(random_seed);
 
     // data_level0_memory will look like this:
-    // | ---2--- | -----2----- | -----4*M0----------- | ---------8-------- |-data_size_-| ---8--- |
+    // | ---2--- | -----2----- | -----4*M0----------- | ---------8-------- |-data_size-| ---8--- |
     // | <flags> | <links_len> | <link_1> <link_2>... |<incoming_links_ptr>|   <data>   | <label> |
 
     size_links_level0_ =
         sizeof(linkListSize) + sizeof(elementFlags) + maxM0_ * sizeof(idType) + sizeof(void *);
-    size_data_per_element_ = size_links_level0_ + data_size_ + sizeof(labelType);
+    size_data_per_element_ = size_links_level0_ + this->data_size + sizeof(labelType);
 
     // No need to test for overflow because we passed the test for size_links_level0_ and this is
     // less.
     incoming_links_offset0 = maxM0_ * sizeof(idType) + sizeof(linkListSize) + sizeof(elementFlags);
     offsetData_ = size_links_level0_;
-    label_offset_ = size_links_level0_ + data_size_;
+    label_offset_ = size_links_level0_ + this->data_size;
     offsetLevel0_ = 0;
 
     data_level0_memory_ =
@@ -1134,31 +1646,103 @@ void HNSWIndex<DataType, DistType>::increaseCapacity() {
 }
 
 template <typename DataType, typename DistType>
-void HNSWIndex<DataType, DistType>::removeVector(const idType element_internal_id) {
+template <bool has_marked_deleted>
+void HNSWIndex<DataType, DistType>::removeAndSwap(idType internalId) {
+
+    // Delete the incoming edges sets for this element at every level.
+    size_t element_top_level = element_levels_[internalId];
+    for (size_t level = 0; level <= element_top_level; level++) {
+        auto *incoming_edges = getIncomingEdgesPtr(internalId, level);
+        assert(!has_marked_deleted || incoming_edges->size() == 0);
+        delete incoming_edges;
+    }
+
+    if (has_marked_deleted) {
+        // If the index allows marking vectors as deleted (as in tiered HNSW), the id to remove
+        // cannot be the entry point, as it should have been replaced upon marking it as deleted.
+        assert(entrypoint_node_ != internalId);
+    } else if (internalId == entrypoint_node_) {
+        // For inplace delete, we replace entry point now.
+        assert(element_top_level == max_level_);
+        replaceEntryPoint();
+    }
+
+    // We can say now that the element has been removed completely from the index.
+    --cur_element_count;
+    if (has_marked_deleted) {
+        --num_marked_deleted;
+    }
+
+    // Remove the deleted id from the relevant incoming edges sets in which it appears.
+    for (size_t level = 0; level <= element_top_level; level++) {
+        auto *neighbours = getNodeNeighborsAtLevel(internalId, level);
+        auto neighbours_count = getNodeNeighborsCount(neighbours);
+        for (size_t i = 0; i < neighbours_count; i++) {
+            idType neighbour_id = neighbours[i];
+            // This should always succeed, since every outgoing edge should be unidirectional at
+            // this point (after all the repair jobs are done).
+            auto *neighbour_incoming_edges = getIncomingEdgesPtr(neighbour_id, level);
+            removeIdFromList(*neighbour_incoming_edges, internalId);
+        }
+    }
+
+    // Swap the last id with the deleted one, and invalidate the last id data.
+    if (element_levels_[internalId] > 0) {
+        this->allocator->free_allocation(linkLists_[internalId]);
+        linkLists_[internalId] = nullptr;
+    }
+    if (cur_element_count == internalId) {
+        // We're deleting the last internal id, just invalidate data without swapping.
+        memset(data_level0_memory_ + cur_element_count * size_data_per_element_ + offsetLevel0_, 0,
+               size_data_per_element_);
+    } else {
+        SwapLastIdWithDeletedId<has_marked_deleted>(internalId);
+    }
+
+    // If we need to free a complete block and there is at least one block between the
+    // capacity and the size.
+    if (cur_element_count % this->blockSize == 0 &&
+        cur_element_count + this->blockSize <= max_elements_) {
+
+        // Check if the capacity is aligned to block size.
+        size_t extra_space_to_free = max_elements_ % this->blockSize;
+
+        // Remove one block from the capacity.
+        this->resizeIndexInternal(max_elements_ - this->blockSize - extra_space_to_free);
+    }
+}
+
+template <typename DataType, typename DistType>
+void HNSWIndex<DataType, DistType>::removeAndSwapDeletedElement(idType internalId) {
+    removeAndSwap<true>(internalId);
+}
+
+template <typename DataType, typename DistType>
+void HNSWIndex<DataType, DistType>::removeVectorInPlace(const idType element_internal_id) {
 
     vecsim_stl::vector<bool> neighbours_bitmap(this->allocator);
 
-    // go over levels and repair connections
+    // Go over the element's nodes at every level and repair the affected connections.
     size_t element_top_level = element_levels_[element_internal_id];
     for (size_t level = 0; level <= element_top_level; level++) {
-        idType *neighbours = get_linklist_at_level(element_internal_id, level);
-        linkListSize neighbours_count = getListCount(neighbours);
-        // reset the neighbours' bitmap for the current level.
+        idType *neighbours = getNodeNeighborsAtLevel(element_internal_id, level);
+        linkListSize neighbours_count = getNodeNeighborsCount(neighbours);
+        // Reset the neighbours' bitmap for the current level.
         neighbours_bitmap.assign(cur_element_count, false);
-        // store the deleted element's neighbours set in a bitmap for fast access.
+        // Store the deleted element's neighbours set in a bitmap for fast access.
         for (size_t j = 0; j < neighbours_count; j++) {
             neighbours_bitmap[neighbours[j]] = true;
         }
-        // go over the neighbours that also points back to the removed point and make a local
+        // Go over the neighbours that also point back to the removed point and make a local
         // repair.
         for (size_t i = 0; i < neighbours_count; i++) {
             idType neighbour_id = neighbours[i];
-            idType *neighbour_neighbours = get_linklist_at_level(neighbour_id, level);
-            linkListSize neighbour_neighbours_count = getListCount(neighbour_neighbours);
+            idType *neighbour_neighbours = getNodeNeighborsAtLevel(neighbour_id, level);
+            linkListSize neighbour_neighbours_count = getNodeNeighborsCount(neighbour_neighbours);
 
             bool bidirectional_edge = false;
             for (size_t j = 0; j < neighbour_neighbours_count; j++) {
-                // if the edge is bidirectional, do repair for this neighbor
+                // If the edge is bidirectional, do repair for this neighbor.
                 if (neighbour_neighbours[j] == element_internal_id) {
                     bidirectional_edge = true;
                     repairConnectionsForDeletion(element_internal_id, neighbour_id, neighbours,
@@ -1167,184 +1751,160 @@ void HNSWIndex<DataType, DistType>::removeVector(const idType element_internal_i
                 }
             }
 
-            // if this edge is uni-directional, we should remove the element from the neighbor's
+            // If this edge is uni-directional, we should remove the element from the neighbor's
             // incoming edges.
             if (!bidirectional_edge) {
                 auto *neighbour_incoming_edges = getIncomingEdgesPtr(neighbour_id, level);
-                neighbour_incoming_edges->erase(std::find(neighbour_incoming_edges->begin(),
-                                                          neighbour_incoming_edges->end(),
-                                                          element_internal_id));
+                // This should always return true (remove should succeed).
+                removeIdFromList(*neighbour_incoming_edges, element_internal_id);
             }
         }
 
-        // next, go over the rest of incoming edges (the ones that are not bidirectional) and make
+        // Next, go over the rest of incoming edges (the ones that are not bidirectional) and make
         // repairs.
         auto *incoming_edges = getIncomingEdgesPtr(element_internal_id, level);
         for (auto incoming_edge : *incoming_edges) {
-            idType *incoming_node_neighbours = get_linklist_at_level(incoming_edge, level);
+            idType *incoming_node_neighbours = getNodeNeighborsAtLevel(incoming_edge, level);
             repairConnectionsForDeletion(element_internal_id, incoming_edge, neighbours,
                                          incoming_node_neighbours, level, neighbours_bitmap);
         }
-        delete incoming_edges;
     }
+    // Finally, remove the element from the index and make a swap with the last internal id to
+    // avoid fragmentation and reclaim memory when needed.
+    removeAndSwap<false>(element_internal_id);
+}
 
-    // replace the entry point with another one, if we are deleting the current entry point.
-    if (element_internal_id == entrypoint_node_) {
-        assert(element_top_level == maxlevel_);
-        replaceEntryPoint();
-    }
+// Store the new element in the global data structures and keep the new state. In a multithreaded
+// scenario, the index data guard should be held by the caller (exclusive lock).
+template <typename DataType, typename DistType>
+AddVectorCtx HNSWIndex<DataType, DistType>::storeNewElement(labelType label) {
+    AddVectorCtx state{};
 
-    // We can say now that the element was deleted
-    --cur_element_count;
+    // Choose randomly the maximum level in which the new element will be in the index.
+    state.elementMaxLevel = getRandomLevel(mult_);
 
-    // Swap the last id with the deleted one, and invalidate the last id data.
-    if (element_levels_[element_internal_id] > 0) {
-        this->allocator->free_allocation(linkLists_[element_internal_id]);
-        linkLists_[element_internal_id] = nullptr;
-    }
-    if (cur_element_count == element_internal_id) {
-        // we're deleting the last internal id, just invalidate data without swapping.
-        memset(data_level0_memory_ + cur_element_count * size_data_per_element_ + offsetLevel0_, 0,
-               size_data_per_element_);
-    } else {
-        SwapLastIdWithDeletedId(element_internal_id);
+    // Access and update the index global data structures with the new element meta-data.
+    state.newElementId = cur_element_count++;
+    assert(indexCapacity() >= indexSize());
+    // Reset the data (and meta-data) for id=state.newElementId in the index.
+    memset(data_level0_memory_ + state.newElementId * size_data_per_element_ + offsetLevel0_, 0,
+           size_data_per_element_);
+    // We mark id as in process *before* we set it in the label lookup, otherwise we might check
+    // that the label exists with safeCheckIfLabelExistsInIndex and see that the IN_PROCESS flag is
+    // clear.
+    markInProcess(state.newElementId);
+    setVectorId(label, state.newElementId);
+    element_levels_[state.newElementId] = state.elementMaxLevel;
+    // Allocate memory for the links in higher levels and initialize this memory to zeros. The
+    // reason for doing it here is that we might mark this vector as deleted BEFORE we finish its
+    // indexing. In that case, we will collect the incoming edges to this element in every level,
+    // and try to access its link lists in higher levels. Therefore, we allocate it here and
+    // initialize it with zeros, (otherwise we might crash...)
+    if (state.elementMaxLevel > 0) {
+        linkLists_[state.newElementId] =
+            (char *)this->allocator->callocate(size_links_per_element_ * state.elementMaxLevel);
+        if (linkLists_[state.newElementId] == nullptr) {
+            this->log(
+                "Error - allocating memory for links in higher level failed due to low memory");
+            throw std::runtime_error("VecSim index low memory error");
+        }
     }
 
-    // If we need to free a complete block & there is a least one block between the
-    // capacity and the size.
-    if (cur_element_count % this->blockSize == 0 &&
-        cur_element_count + this->blockSize <= max_elements_) {
-
-        // Check if the capacity is aligned to block size.
-        size_t extra_space_to_free = max_elements_ % this->blockSize;
-
-        // Remove one block from the capacity.
-        this->resizeIndexInternal(max_elements_ - this->blockSize - extra_space_to_free);
+    state.currMaxLevel = (int)max_level_;
+    state.currEntryPoint = entrypoint_node_;
+    if (state.elementMaxLevel > state.currMaxLevel) {
+        if (entrypoint_node_ == INVALID_ID && max_level_ != HNSW_INVALID_LEVEL) {
+            throw std::runtime_error("Internal error - inserting the first element to the graph,"
+                                     " but the current max level is not INVALID");
+        }
+        // If the new element's max level is higher than the maximum level that currently exists in
+        // the graph, update the max level and set the new element as the entry point.
+        entrypoint_node_ = state.newElementId;
+        max_level_ = state.elementMaxLevel;
     }
+    return state;
 }
 
 template <typename DataType, typename DistType>
-void HNSWIndex<DataType, DistType>::appendVector(const void *vector_data, const labelType label) {
-    assert(indexCapacity() > indexSize());
-    idType cur_c;
-
-    DataType normalized_blob[this->dim]; // This will be use only if metric == VecSimMetric_Cosine
-    if (this->metric == VecSimMetric_Cosine) {
-        memcpy(normalized_blob, vector_data, this->dim * sizeof(DataType));
-        normalizeVector(normalized_blob, this->dim);
-        vector_data = normalized_blob;
-    }
-
-    {
-#ifdef ENABLE_PARALLELIZATION
-        std::unique_lock<std::mutex> templock_curr(cur_element_count_guard_);
-#endif
-
-        cur_c = cur_element_count++;
-        setVectorId(label, cur_c);
-    }
-#ifdef ENABLE_PARALLELIZATION
-    std::unique_lock<std::mutex> lock_el(link_list_locks_[cur_c]);
-#endif
-    // choose randomly the maximum level in which the new element will be in the index.
-    size_t element_max_level = getRandomLevel(mult_);
-    element_levels_[cur_c] = element_max_level;
-
-#ifdef ENABLE_PARALLELIZATION
-    std::unique_lock<std::mutex> entry_point_lock(global);
-#endif
-    size_t maxlevelcopy = maxlevel_;
-
-#ifdef ENABLE_PARALLELIZATION
-    if (element_max_level <= maxlevelcopy)
-        entry_point_lock.unlock();
-#endif
-    idType currObj = entrypoint_node_;
-
-    memset(data_level0_memory_ + cur_c * size_data_per_element_ + offsetLevel0_, 0,
-           size_data_per_element_);
-
-    // Initialisation of the data and label
-    setExternalLabel(cur_c, label);
-    memcpy(getDataByInternalId(cur_c), vector_data, data_size_);
-
-    if (element_max_level > 0) {
-        linkLists_[cur_c] =
-            (char *)this->allocator->allocate(size_links_per_element_ * element_max_level);
-        if (linkLists_[cur_c] == nullptr)
-            throw std::runtime_error("Not enough memory: addPoint failed to allocate linklist");
-        memset(linkLists_[cur_c], 0, size_links_per_element_ * element_max_level);
+void HNSWIndex<DataType, DistType>::appendVector(const void *vector_data, const labelType label,
+                                                 AddVectorCtx *auxiliaryCtx) {
+
+    // If auxiliaryCtx is not NULL, the index state has already been updated from outside (such as
+    // in tiered index). Also, the synchronization responsibility in this case is on the caller,
+    // otherwise, this function should acquire and release the lock to ensure proper parallelism.
+    AddVectorCtx state{};
+    if (auxiliaryCtx == nullptr) {
+        this->lockIndexDataGuard();
+        if (indexSize() == indexCapacity()) {
+            increaseCapacity();
+        }
+        state = storeNewElement(label);
+        if (state.currMaxLevel >= state.elementMaxLevel) {
+            this->unlockIndexDataGuard();
+        }
+    } else {
+        state = *auxiliaryCtx;
     }
-
-    // this condition only means that we are not inserting the first element.
-    if (entrypoint_node_ != HNSW_INVALID_ID) {
+    // Deconstruct the state variables from the auxiliaryCtx. prev_entry_point and prev_max_level
+    // are the entry point and index max level at the point of time when the element was stored, and
+    // they may (or may not) have changed due to the insertion.
+    auto [new_element_id, element_max_level, prev_entry_point, prev_max_level] = state;
+    // Initialisation of the vector data and its label.
+    setExternalLabel(new_element_id, label);
+    memcpy(getDataByInternalId(new_element_id), vector_data, this->data_size);
+
+    // Start scanning the graph from the current entry point.
+    idType curr_element = prev_entry_point;
+    // This condition only means that we are not inserting the first (non-deleted) element.
+    if (curr_element != INVALID_ID) {
         DistType cur_dist = std::numeric_limits<DistType>::max();
-        if (element_max_level < maxlevelcopy) {
-            cur_dist = this->dist_func(vector_data, getDataByInternalId(currObj), this->dim);
-            for (size_t level = maxlevelcopy; level > element_max_level; level--) {
+        if (element_max_level < prev_max_level) {
+            cur_dist = this->dist_func(vector_data, getDataByInternalId(curr_element), this->dim);
+            for (int level = prev_max_level; level > element_max_level; level--) {
                 // this is done for the levels which are above the max level
                 // to which we are going to insert the new element. We do
                 // a greedy search in the graph starting from the entry point
                 // at each level, and move on with the closest element we can find.
                 // When there is no improvement to do, we take a step down.
-                greedySearchLevel<false>(vector_data, level, currObj, cur_dist);
+                greedySearchLevel<false>(vector_data, level, curr_element, cur_dist);
             }
         }
 
-        auto max_common_level = std::min(element_max_level, maxlevelcopy);
-        if (this->num_marked_deleted) {
-            if (element_max_level >= maxlevelcopy) {
-                // `cur_dist` is not initialized yet.
-                cur_dist = this->dist_func(vector_data, getDataByInternalId(currObj), this->dim);
-            }
-            for (size_t level = max_common_level; (int)level >= 0; level--) {
-
-                candidatesMaxHeap<DistType> top_candidates =
-                    searchLayer<true>(currObj, vector_data, level, ef_construction_);
-                if (top_candidates.empty()) {
-                    // This means that we haven't found any non-marked-deleted candidate in the
-                    // layer.
-
-                    // Get currObj and cur_dist ready for the next iteration.
-                    greedySearchLevel<false>(vector_data, level, currObj, cur_dist);
-                    // Set incoming edges list to empty.
-                    auto ptr = new (this->allocator) vecsim_stl::vector<idType>(this->allocator);
-                    setIncomingEdgesPtr(cur_c, level, ptr);
-
-                } else {
-                    currObj = mutuallyConnectNewElement(cur_c, top_candidates, level);
-                }
-            }
-        } else {
-            for (size_t level = max_common_level; (int)level >= 0; level--) {
-
-                candidatesMaxHeap<DistType> top_candidates =
-                    searchLayer<false>(currObj, vector_data, level, ef_construction_);
-                currObj = mutuallyConnectNewElement(cur_c, top_candidates, level);
-            }
+        auto max_common_level = std::min(element_max_level, prev_max_level);
+        for (int level = max_common_level; (int)level >= 0; level--) {
+            candidatesMaxHeap<DistType> top_candidates =
+                searchLayer<false>(curr_element, vector_data, level, ef_construction_);
+            curr_element = mutuallyConnectNewElement(new_element_id, top_candidates, level);
         }
 
-        // updating the maximum level (holding a global lock)
-        if (element_max_level > maxlevelcopy) {
-            entrypoint_node_ = cur_c;
-            maxlevel_ = element_max_level;
-            // create the incoming edges set for the new levels.
-            for (size_t level_idx = maxlevelcopy + 1; level_idx <= element_max_level; level_idx++) {
+        if (element_max_level > prev_max_level) {
+            // Create the incoming edges sets for the new levels.
+            for (int level_idx = prev_max_level + 1; level_idx <= element_max_level; level_idx++) {
                 auto *incoming_edges =
                     new (this->allocator) vecsim_stl::vector<idType>(this->allocator);
-                setIncomingEdgesPtr(cur_c, level_idx, incoming_edges);
+                setIncomingEdgesPtr(new_element_id, level_idx, incoming_edges);
             }
         }
     } else {
-        // Do nothing for the first element
-        entrypoint_node_ = 0;
-        for (size_t level_idx = maxlevel_ + 1; level_idx <= element_max_level; level_idx++) {
+        // Inserting the first (non-deleted) element to the graph - only need to allocate incoming
+        // neighbors sets without creating any connections.
+        for (int level_idx = 0; level_idx <= element_max_level; level_idx++) {
             auto *incoming_edges =
                 new (this->allocator) vecsim_stl::vector<idType>(this->allocator);
-            setIncomingEdgesPtr(cur_c, level_idx, incoming_edges);
+            setIncomingEdgesPtr(new_element_id, level_idx, incoming_edges);
         }
-        maxlevel_ = element_max_level;
     }
+    unmarkInProcess(new_element_id);
+    if (auxiliaryCtx == nullptr && state.currMaxLevel < state.elementMaxLevel) {
+        // No external auxiliaryCtx, so it's this function's responsibility to release the lock.
+        this->unlockIndexDataGuard();
+    }
+}
+
+template <typename DataType, typename DistType>
+auto HNSWIndex<DataType, DistType>::safeGetEntryPointState() const {
+    std::shared_lock<std::shared_mutex> lock(index_data_guard_);
+    return std::make_pair(entrypoint_node_, max_level_);
 }
 
 template <typename DataType, typename DistType>
@@ -1352,16 +1912,15 @@ idType HNSWIndex<DataType, DistType>::searchBottomLayerEP(const void *query_data
                                                           VecSimQueryResult_Code *rc) const {
     *rc = VecSim_QueryResult_OK;
 
-    if (cur_element_count == 0) {
-        return entrypoint_node_;
-    }
-    idType currObj = entrypoint_node_;
-    DistType cur_dist =
-        this->dist_func(query_data, getDataByInternalId(entrypoint_node_), this->dim);
-    for (size_t level = maxlevel_; level > 0 && currObj != HNSW_INVALID_ID; level--) {
-        greedySearchLevel<true>(query_data, level, currObj, cur_dist, timeoutCtx, rc);
+    auto [curr_element, max_level] = safeGetEntryPointState();
+    if (curr_element == INVALID_ID)
+        return curr_element; // index is empty.
+
+    DistType cur_dist = this->dist_func(query_data, getDataByInternalId(curr_element), this->dim);
+    for (size_t level = max_level; level > 0 && curr_element != INVALID_ID; level--) {
+        greedySearchLevel<true>(query_data, level, curr_element, cur_dist, timeoutCtx, rc);
     }
-    return currObj;
+    return curr_element;
 }
 
 template <typename DataType, typename DistType>
@@ -1419,8 +1978,9 @@ HNSWIndex<DataType, DistType>::searchBottomLayer_WithTimeout(idType ep_id, const
 }
 
 template <typename DataType, typename DistType>
-VecSimQueryResult_List HNSWIndex<DataType, DistType>::topKQuery(const void *query_data, size_t k,
-                                                                VecSimQueryParams *queryParams) {
+VecSimQueryResult_List
+HNSWIndex<DataType, DistType>::topKQuery(const void *query_data, size_t k,
+                                         VecSimQueryParams *queryParams) const {
 
     VecSimQueryResult_List rl = {0};
     this->last_mode = STANDARD_KNN;
@@ -1433,12 +1993,6 @@ VecSimQueryResult_List HNSWIndex<DataType, DistType>::topKQuery(const void *quer
 
     void *timeoutCtx = nullptr;
 
-    DataType normalized_blob[this->dim]; // This will be use only if metric == VecSimMetric_Cosine.
-    if (this->metric == VecSimMetric_Cosine) {
-        memcpy(normalized_blob, query_data, this->dim * sizeof(DataType));
-        normalizeVector(normalized_blob, this->dim);
-        query_data = normalized_blob;
-    }
     // Get original efRuntime and store it.
     size_t ef = ef_;
 
@@ -1452,6 +2006,12 @@ VecSimQueryResult_List HNSWIndex<DataType, DistType>::topKQuery(const void *quer
     idType bottom_layer_ep = searchBottomLayerEP(query_data, timeoutCtx, &rl.code);
     if (VecSim_OK != rl.code) {
         return rl;
+    } else if (bottom_layer_ep == INVALID_ID) {
+        // Although we checked that the index is not empty (cur_element_count != 0), it might be
+        // that another thread deleted all the elements or didn't finish inserting the first element
+        // yet. Anyway, we observed that the index is empty, so we return an empty result list.
+        rl.results = array_new<VecSimQueryResult>(0);
+        return rl;
     }
 
     // We now oun the results heap, we need to free (delete) it when we done
@@ -1546,9 +2106,10 @@ VecSimQueryResult *HNSWIndex<DataType, DistType>::searchRangeBottomLayer_WithTim
 }
 
 template <typename DataType, typename DistType>
-VecSimQueryResult_List HNSWIndex<DataType, DistType>::rangeQuery(const void *query_data,
-                                                                 double radius,
-                                                                 VecSimQueryParams *queryParams) {
+VecSimQueryResult_List
+HNSWIndex<DataType, DistType>::rangeQuery(const void *query_data, double radius,
+                                          VecSimQueryParams *queryParams) const {
+
     VecSimQueryResult_List rl = {0};
     this->last_mode = RANGE_QUERY;
 
@@ -1559,13 +2120,6 @@ VecSimQueryResult_List HNSWIndex<DataType, DistType>::rangeQuery(const void *que
     }
     void *timeoutCtx = nullptr;
 
-    DataType normalized_blob[this->dim]; // This will be use only if metric == VecSimMetric_Cosine
-    if (this->metric == VecSimMetric_Cosine) {
-        memcpy(normalized_blob, query_data, this->dim * sizeof(DataType));
-        normalizeVector(normalized_blob, this->dim);
-        query_data = normalized_blob;
-    }
-
     double epsilon = epsilon_;
     if (queryParams) {
         timeoutCtx = queryParams->timeoutCtx;
@@ -1575,7 +2129,10 @@ VecSimQueryResult_List HNSWIndex<DataType, DistType>::rangeQuery(const void *que
     }
 
     idType bottom_layer_ep = searchBottomLayerEP(query_data, timeoutCtx, &rl.code);
-    if (VecSim_OK != rl.code) {
+    // Although we checked that the index is not empty (cur_element_count != 0), it might be
+    // that another thread deleted all the elements or didn't finish inserting the first element
+    // yet. Anyway, we observed that the index is empty, so we return an empty result list.
+    if (VecSim_OK != rl.code || bottom_layer_ep == INVALID_ID) {
         rl.results = array_new<VecSimQueryResult>(0);
         return rl;
     }
@@ -1588,7 +2145,6 @@ VecSimQueryResult_List HNSWIndex<DataType, DistType>::rangeQuery(const void *que
     else
         rl.results = searchRangeBottomLayer_WithTimeout<false>(bottom_layer_ep, query_data, epsilon,
                                                                radius, timeoutCtx, &rl.code);
-
     return rl;
 }
 
@@ -1596,23 +2152,25 @@ template <typename DataType, typename DistType>
 VecSimIndexInfo HNSWIndex<DataType, DistType>::info() const {
 
     VecSimIndexInfo info;
-    info.algo = VecSimAlgo_HNSWLIB;
-    info.hnswInfo.dim = this->dim;
-    info.hnswInfo.type = this->vecType;
-    info.hnswInfo.isMulti = this->isMulti;
-    info.hnswInfo.metric = this->metric;
-    info.hnswInfo.blockSize = this->blockSize;
+    info.commonInfo = this->getCommonInfo();
+
+    info.commonInfo.basicInfo.algo = VecSimAlgo_HNSWLIB;
     info.hnswInfo.M = this->getM();
     info.hnswInfo.efConstruction = this->getEfConstruction();
     info.hnswInfo.efRuntime = this->getEf();
     info.hnswInfo.epsilon = this->epsilon_;
-    info.hnswInfo.indexSize = this->indexSize();
-    info.hnswInfo.indexLabelCount = this->indexLabelCount();
     info.hnswInfo.max_level = this->getMaxLevel();
     info.hnswInfo.entrypoint = this->getEntryPointLabel();
-    info.hnswInfo.memory = this->getAllocationSize();
-    info.hnswInfo.last_mode = this->last_mode;
     info.hnswInfo.visitedNodesPoolSize = this->visited_nodes_handler_pool.getPoolSize();
+    info.hnswInfo.numberOfMarkedDeletedNodes = this->getNumMarkedDeleted();
+    return info;
+}
+
+template <typename DataType, typename DistType>
+VecSimIndexBasicInfo HNSWIndex<DataType, DistType>::basicInfo() const {
+    VecSimIndexBasicInfo info = this->getBasicInfo();
+    info.algo = VecSimAlgo_HNSWLIB;
+    info.isTiered = false;
     return info;
 }
 
@@ -1620,84 +2178,69 @@ template <typename DataType, typename DistType>
 VecSimInfoIterator *HNSWIndex<DataType, DistType>::infoIterator() const {
     VecSimIndexInfo info = this->info();
     // For readability. Update this number when needed.
-    size_t numberOfInfoFields = 12;
+    size_t numberOfInfoFields = 17;
     VecSimInfoIterator *infoIterator = new VecSimInfoIterator(numberOfInfoFields);
 
-    infoIterator->addInfoField(VecSim_InfoField{
-        .fieldName = VecSimCommonStrings::ALGORITHM_STRING,
-        .fieldType = INFOFIELD_STRING,
-        .fieldValue = {FieldValue{.stringValue = VecSimAlgo_ToString(info.algo)}}});
-    infoIterator->addInfoField(VecSim_InfoField{
-        .fieldName = VecSimCommonStrings::TYPE_STRING,
-        .fieldType = INFOFIELD_STRING,
-        .fieldValue = {FieldValue{.stringValue = VecSimType_ToString(info.hnswInfo.type)}}});
     infoIterator->addInfoField(
-        VecSim_InfoField{.fieldName = VecSimCommonStrings::DIMENSION_STRING,
-                         .fieldType = INFOFIELD_UINT64,
-                         .fieldValue = {FieldValue{.uintegerValue = info.hnswInfo.dim}}});
-    infoIterator->addInfoField(VecSim_InfoField{
-        .fieldName = VecSimCommonStrings::METRIC_STRING,
-        .fieldType = INFOFIELD_STRING,
-        .fieldValue = {FieldValue{.stringValue = VecSimMetric_ToString(info.hnswInfo.metric)}}});
+        VecSim_InfoField{.fieldName = VecSimCommonStrings::ALGORITHM_STRING,
+                         .fieldType = INFOFIELD_STRING,
+                         .fieldValue = {FieldValue{
+                             .stringValue = VecSimAlgo_ToString(info.commonInfo.basicInfo.algo)}}});
+
+    this->addCommonInfoToIterator(infoIterator, info.commonInfo);
 
-    infoIterator->addInfoField(
-        VecSim_InfoField{.fieldName = VecSimCommonStrings::IS_MULTI_STRING,
-                         .fieldType = INFOFIELD_UINT64,
-                         .fieldValue = {FieldValue{.uintegerValue = info.hnswInfo.isMulti}}});
-    infoIterator->addInfoField(
-        VecSim_InfoField{.fieldName = VecSimCommonStrings::INDEX_SIZE_STRING,
-                         .fieldType = INFOFIELD_UINT64,
-                         .fieldValue = {FieldValue{.uintegerValue = info.hnswInfo.indexSize}}});
     infoIterator->addInfoField(VecSim_InfoField{
-        .fieldName = VecSimCommonStrings::INDEX_LABEL_COUNT_STRING,
+        .fieldName = VecSimCommonStrings::BLOCK_SIZE_STRING,
         .fieldType = INFOFIELD_UINT64,
-        .fieldValue = {FieldValue{.uintegerValue = info.hnswInfo.indexLabelCount}}});
+        .fieldValue = {FieldValue{.uintegerValue = info.commonInfo.basicInfo.blockSize}}});
+
     infoIterator->addInfoField(
         VecSim_InfoField{.fieldName = VecSimCommonStrings::HNSW_M_STRING,
                          .fieldType = INFOFIELD_UINT64,
                          .fieldValue = {FieldValue{.uintegerValue = info.hnswInfo.M}}});
+
     infoIterator->addInfoField(VecSim_InfoField{
         .fieldName = VecSimCommonStrings::HNSW_EF_CONSTRUCTION_STRING,
         .fieldType = INFOFIELD_UINT64,
         .fieldValue = {FieldValue{.uintegerValue = info.hnswInfo.efConstruction}}});
+
     infoIterator->addInfoField(
         VecSim_InfoField{.fieldName = VecSimCommonStrings::HNSW_EF_RUNTIME_STRING,
                          .fieldType = INFOFIELD_UINT64,
                          .fieldValue = {FieldValue{.uintegerValue = info.hnswInfo.efRuntime}}});
+
     infoIterator->addInfoField(
         VecSim_InfoField{.fieldName = VecSimCommonStrings::HNSW_MAX_LEVEL,
                          .fieldType = INFOFIELD_UINT64,
                          .fieldValue = {FieldValue{.uintegerValue = info.hnswInfo.max_level}}});
+
     infoIterator->addInfoField(
         VecSim_InfoField{.fieldName = VecSimCommonStrings::HNSW_ENTRYPOINT,
                          .fieldType = INFOFIELD_UINT64,
                          .fieldValue = {FieldValue{.uintegerValue = info.hnswInfo.entrypoint}}});
-    infoIterator->addInfoField(
-        VecSim_InfoField{.fieldName = VecSimCommonStrings::MEMORY_STRING,
-                         .fieldType = INFOFIELD_UINT64,
-                         .fieldValue = {FieldValue{.uintegerValue = info.hnswInfo.memory}}});
-    infoIterator->addInfoField(
-        VecSim_InfoField{.fieldName = VecSimCommonStrings::SEARCH_MODE_STRING,
-                         .fieldType = INFOFIELD_STRING,
-                         .fieldValue = {FieldValue{
-                             .stringValue = VecSimSearchMode_ToString(info.hnswInfo.last_mode)}}});
+
     infoIterator->addInfoField(
         VecSim_InfoField{.fieldName = VecSimCommonStrings::HNSW_EPSILON_STRING,
                          .fieldType = INFOFIELD_FLOAT64,
                          .fieldValue = {FieldValue{.floatingPointValue = info.hnswInfo.epsilon}}});
 
+    infoIterator->addInfoField(VecSim_InfoField{
+        .fieldName = VecSimCommonStrings::HNSW_NUM_MARKED_DELETED,
+        .fieldType = INFOFIELD_UINT64,
+        .fieldValue = {FieldValue{.uintegerValue = info.hnswInfo.numberOfMarkedDeletedNodes}}});
+
     return infoIterator;
 }
 
 template <typename DataType, typename DistType>
 bool HNSWIndex<DataType, DistType>::preferAdHocSearch(size_t subsetSize, size_t k,
-                                                      bool initial_check) {
+                                                      bool initial_check) const {
     // This heuristic is based on sklearn decision tree classifier (with 20 leaves nodes) -
     // see scripts/HNSW_batches_clf.py
     size_t index_size = this->indexSize();
-    if (subsetSize > index_size) {
-        throw std::runtime_error("internal error: subset size cannot be larger than index size");
-    }
+    // Treat a subset size that is too large as if it were the maximum possible size.
+    subsetSize = std::min(subsetSize, index_size);
+
     size_t d = this->dim;
     size_t M = this->getM();
     float r = (index_size == 0) ? 0.0f : (float)(subsetSize) / (float)this->indexLabelCount();
diff --git a/src/VecSim/algorithms/hnsw/hnsw_base_tests_friends.h b/src/VecSim/algorithms/hnsw/hnsw_base_tests_friends.h
index 1365940ee..8e332b7ae 100644
--- a/src/VecSim/algorithms/hnsw/hnsw_base_tests_friends.h
+++ b/src/VecSim/algorithms/hnsw/hnsw_base_tests_friends.h
@@ -14,3 +14,7 @@ INDEX_TEST_FRIEND_CLASS(IndexAllocatorTest_testIncomingEdgesSet_Test)
 INDEX_TEST_FRIEND_CLASS(IndexAllocatorTest_test_hnsw_reclaim_memory_Test)
 INDEX_TEST_FRIEND_CLASS(HNSWTest_markDelete_Test)
 INDEX_TEST_FRIEND_CLASS(HNSWTest_allMarkedDeletedLevel_Test)
+INDEX_TEST_FRIEND_CLASS(HNSWTestParallel)
+INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTestBasic_deleteFromHNSWMultiLevels_Test)
+INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTest_deleteFromHNSWWithRepairJobExec_Test)
+INDEX_TEST_FRIEND_CLASS(HNSWTest_testIncomingEdgesSize_Test)
diff --git a/src/VecSim/algorithms/hnsw/hnsw_batch_iterator.h b/src/VecSim/algorithms/hnsw/hnsw_batch_iterator.h
index c134af926..1523d9835 100644
--- a/src/VecSim/algorithms/hnsw/hnsw_batch_iterator.h
+++ b/src/VecSim/algorithms/hnsw/hnsw_batch_iterator.h
@@ -78,7 +78,7 @@ HNSW_BatchIterator<DataType, DistType>::HNSW_BatchIterator(
 
     this->dist_func = index->getDistFunc();
     this->dim = index->getDim();
-    this->entry_point = index->getEntryPointId();
+    this->entry_point = INVALID_ID; // temporary until we store the entry point to level 0.
     // Use "fresh" tag to mark nodes that were visited along the search in some iteration.
     this->visited_list = index->getVisitedList();
     this->visited_tag = this->visited_list->getFreshTag();
@@ -114,8 +114,9 @@ VecSimQueryResult_Code HNSW_BatchIterator<DataType, DistType>::scanGraphInternal
 
         // Take the current node out of the candidates queue and go over his neighbours.
         candidates.pop();
-        idType *node_links = this->index->get_linklist_at_level(curr_node_id, 0);
-        linkListSize links_num = this->index->getListCount(node_links);
+        this->index->lockNodeLinks(curr_node_id);
+        idType *node_links = this->index->getNodeNeighborsAtLevel(curr_node_id, 0);
+        linkListSize links_num = this->index->getNodeNeighborsCount(node_links);
 
         __builtin_prefetch(visited_list->getElementsTags() + *node_links);
         __builtin_prefetch(index->getDataByInternalId(*node_links));
@@ -135,8 +136,9 @@ VecSimQueryResult_Code HNSW_BatchIterator<DataType, DistType>::scanGraphInternal
             DistType candidate_dist =
                 dist_func(this->getQueryBlob(), (const void *)candidate_data, dim);
             candidates.emplace(candidate_dist, candidate_id);
-            __builtin_prefetch(index->get_linklist_at_level(candidates.top().second, 0));
+            __builtin_prefetch(index->getNodeNeighborsAtLevel(candidates.top().second, 0));
         }
+        this->index->unlockNodeLinks(curr_node_id);
     }
     return VecSim_QueryResult_OK;
 }
@@ -146,7 +148,7 @@ candidatesLabelsMaxHeap<DistType> *
 HNSW_BatchIterator<DataType, DistType>::scanGraph(VecSimQueryResult_Code *rc) {
 
     candidatesLabelsMaxHeap<DistType> *top_candidates = this->index->getNewMaxPriorityQueue();
-    if (this->entry_point == HNSW_INVALID_ID) {
+    if (this->entry_point == INVALID_ID) {
         this->depleted = true;
         return top_candidates;
     }
diff --git a/src/VecSim/algorithms/hnsw/hnsw_multi.h b/src/VecSim/algorithms/hnsw/hnsw_multi.h
index acfb68b69..8bf02f59a 100644
--- a/src/VecSim/algorithms/hnsw/hnsw_multi.h
+++ b/src/VecSim/algorithms/hnsw/hnsw_multi.h
@@ -13,6 +13,8 @@
 template <typename DataType, typename DistType>
 class HNSWIndex_Multi : public HNSWIndex<DataType, DistType> {
 private:
+    // Index global state - this should be guarded by the index_data_guard_ lock in
+    // a multithreaded scenario.
     vecsim_stl::unordered_map<labelType, vecsim_stl::vector<idType>> label_lookup_;
 
 #ifdef BUILD_TESTS
@@ -30,25 +32,40 @@ class HNSWIndex_Multi : public HNSWIndex<DataType, DistType> {
     }
     inline void resizeLabelLookup(size_t new_max_elements) override;
 
+    // Return all the labels in the index - this should be used for computing the number of distinct
+    // labels in a tiered index, and caller should hold the index data guard.
+    inline vecsim_stl::set<labelType> getLabelsSet() const override {
+        vecsim_stl::set<labelType> keys(this->allocator);
+        for (auto &it : label_lookup_) {
+            keys.insert(it.first);
+        }
+        return keys;
+    };
+
+    template <bool Safe>
+    inline double getDistanceFromInternal(labelType label, const void *vector_data) const;
+
 public:
-    HNSWIndex_Multi(const HNSWParams *params, std::shared_ptr<VecSimAllocator> allocator,
+    HNSWIndex_Multi(const HNSWParams *params, const AbstractIndexInitParams &abstractInitParams,
                     size_t random_seed = 100, size_t initial_pool_size = 1)
-        : HNSWIndex<DataType, DistType>(params, allocator, random_seed, initial_pool_size),
-          label_lookup_(this->max_elements_, allocator) {}
+        : HNSWIndex<DataType, DistType>(params, abstractInitParams, random_seed, initial_pool_size),
+          label_lookup_(this->max_elements_, this->allocator) {}
 #ifdef BUILD_TESTS
     // Ctor to be used before loading a serialized index. Can be used from v2 and up.
     HNSWIndex_Multi(std::ifstream &input, const HNSWParams *params,
-                    std::shared_ptr<VecSimAllocator> allocator, Serializer::EncodingVersion version)
-        : HNSWIndex<DataType, DistType>(input, params, allocator, version),
-          label_lookup_(this->max_elements_, allocator) {}
+                    const AbstractIndexInitParams &abstractInitParams,
+                    Serializer::EncodingVersion version)
+        : HNSWIndex<DataType, DistType>(input, params, abstractInitParams, version),
+          label_lookup_(this->max_elements_, this->allocator) {}
 
-    void GetDataByLabel(labelType label, std::vector<std::vector<DataType>> &vectors_output) {
+    void getDataByLabel(labelType label,
+                        std::vector<std::vector<DataType>> &vectors_output) const override {
 
         auto ids = label_lookup_.find(label);
 
         for (idType id : ids->second) {
             auto vec = std::vector<DataType>(this->dim);
-            memcpy(vec.data(), this->getDataByInternalId(id), this->data_size_);
+            memcpy(vec.data(), this->getDataByInternalId(id), this->data_size);
             vectors_output.push_back(vec);
         }
     }
@@ -70,9 +87,16 @@ class HNSWIndex_Multi : public HNSWIndex<DataType, DistType> {
                                           VecSimQueryParams *queryParams) const override;
 
     int deleteVector(labelType label) override;
-    int addVector(const void *vector_data, labelType label, bool overwrite_allowed = true) override;
-    double getDistanceFrom(labelType label, const void *vector_data) const override;
+    int addVector(const void *vector_data, labelType label, void *auxiliaryCtx = nullptr) override;
     inline std::vector<idType> markDelete(labelType label) override;
+    inline bool safeCheckIfLabelExistsInIndex(labelType label,
+                                              bool also_done_processing) const override;
+    double getDistanceFrom(labelType label, const void *vector_data) const override {
+        return getDistanceFromInternal<false>(label, vector_data);
+    }
+    double safeGetDistanceFrom(labelType label, const void *vector_data) const override {
+        return getDistanceFromInternal<true>(label, vector_data);
+    }
 };
 
 /**
@@ -84,36 +108,68 @@ size_t HNSWIndex_Multi<DataType, DistType>::indexLabelCount() const {
     return label_lookup_.size();
 }
 
+/**
+ * helper functions
+ */
+
+// Depending on the value of the Safe template parameter, this function will either return a copy
+// of the argument or a reference to it.
+template <bool Safe, typename Arg>
+constexpr decltype(auto) getCopyOrReference(Arg &&arg) {
+    if constexpr (Safe) {
+        return std::decay_t<Arg>(arg);
+    } else {
+        return (arg);
+    }
+}
+
 template <typename DataType, typename DistType>
-double HNSWIndex_Multi<DataType, DistType>::getDistanceFrom(labelType label,
-                                                            const void *vector_data) const {
+template <bool Safe>
+double HNSWIndex_Multi<DataType, DistType>::getDistanceFromInternal(labelType label,
+                                                                    const void *vector_data) const {
+    DistType dist = INVALID_SCORE;
 
-    auto IDs = this->label_lookup_.find(label);
-    if (IDs == this->label_lookup_.end()) {
-        return INVALID_SCORE;
+    // Check if the label exists in the index, return invalid score if not.
+    if (Safe)
+        this->index_data_guard_.lock_shared();
+    auto it = this->label_lookup_.find(label);
+    if (it == this->label_lookup_.end()) {
+        if (Safe)
+            this->index_data_guard_.unlock_shared();
+        return dist;
     }
 
-    DistType dist = std::numeric_limits<DistType>::infinity();
-    for (auto id : IDs->second) {
-        if (!this->isMarkedDeleted(id)) {
-            DistType d = this->dist_func(this->getDataByInternalId(id), vector_data, this->dim);
-            dist = (dist < d) ? dist : d;
-        }
+    // Get the vector of ids associated with the label.
+    // Get a copy if `Safe` is true, otherwise get a reference.
+    decltype(auto) IDs = getCopyOrReference<Safe>(it->second);
+    if (Safe)
+        this->index_data_guard_.unlock_shared();
+
+    // Iterate over the ids and find the minimum distance.
+    for (auto id : IDs) {
+        DistType d = this->dist_func(this->getDataByInternalId(id), vector_data, this->dim);
+        dist = std::fmin(dist, d);
     }
 
     return dist;
 }
 
-/**
- * helper functions
- */
-
 template <typename DataType, typename DistType>
 void HNSWIndex_Multi<DataType, DistType>::replaceIdOfLabel(labelType label, idType new_id,
                                                            idType old_id) {
     assert(label_lookup_.find(label) != label_lookup_.end());
+    // *Non-trivial code here* - in every iteration we replace the internal id of the previous last
+    // id that has been swapped with the deleted id. Note that if the old and the new replaced ids
+    // both belong to the same label, then we are going to delete the new id later on as well, since
+    // we are currently iterating on this exact array of ids in 'deleteVector'. Hence, the relevant
+    // part of the vector that should be updated is the "tail" that comes after the position of
+    // old_id, while the "head" may contain old occurrences of old_id that are irrelevant for the
+    // future deletions. Therefore, we iterate from end to beginning. For example, assuming we are
+    // deleting a label that contains the only 3 ids that exist in the index. Hence, we would
+    // expect the following scenario w.r.t. the ids array:
+    // [|1, 0, 2] -> [1, |0, 1] -> [1, 0, |0] (where | marks the current position)
     auto &ids = label_lookup_.at(label);
-    for (size_t i = 0; i < ids.size(); i++) {
+    for (int i = ids.size() - 1; i >= 0; i--) {
         if (ids[i] == old_id) {
             ids[i] = new_id;
             return;
@@ -140,7 +196,7 @@ int HNSWIndex_Multi<DataType, DistType>::deleteVector(const labelType label) {
         return ret;
     }
     for (idType id : ids->second) {
-        this->removeVector(id);
+        this->removeVectorInPlace(id);
         ret++;
     }
     label_lookup_.erase(ids);
@@ -149,9 +205,9 @@ int HNSWIndex_Multi<DataType, DistType>::deleteVector(const labelType label) {
 
 template <typename DataType, typename DistType>
 int HNSWIndex_Multi<DataType, DistType>::addVector(const void *vector_data, const labelType label,
-                                                   bool overwrite_allowed) {
+                                                   void *auxiliaryCtx) {
 
-    this->appendVector(vector_data, label);
+    this->appendVector(vector_data, label, (AddVectorCtx *)auxiliaryCtx);
     return 1; // We always add the vector, no overrides in multi.
 }
 
@@ -161,9 +217,6 @@ HNSWIndex_Multi<DataType, DistType>::newBatchIterator(const void *queryBlob,
                                                       VecSimQueryParams *queryParams) const {
     auto queryBlobCopy = this->allocator->allocate(sizeof(DataType) * this->dim);
     memcpy(queryBlobCopy, queryBlob, this->dim * sizeof(DataType));
-    if (this->metric == VecSimMetric_Cosine) {
-        normalizeVector((DataType *)queryBlobCopy, this->dim);
-    }
     // Ownership of queryBlobCopy moves to HNSW_BatchIterator that will free it at the end.
     return new (this->allocator) HNSWMulti_BatchIterator<DataType, DistType>(
         queryBlobCopy, this, queryParams, this->allocator);
@@ -176,6 +229,7 @@ HNSWIndex_Multi<DataType, DistType>::newBatchIterator(const void *queryBlob,
 template <typename DataType, typename DistType>
 std::vector<idType> HNSWIndex_Multi<DataType, DistType>::markDelete(labelType label) {
     std::vector<idType> idsToDelete;
+    std::unique_lock<std::shared_mutex> index_data_lock(this->index_data_guard_);
     auto search = label_lookup_.find(label);
     if (search == label_lookup_.end()) {
         return idsToDelete;
@@ -188,3 +242,23 @@ std::vector<idType> HNSWIndex_Multi<DataType, DistType>::markDelete(labelType la
     label_lookup_.erase(search);
     return idsToDelete;
 }
+
+// Checks, while holding the index data guard, whether `label` exists in the index. If
+// `also_done_processing` is set, every id under the label must also be out of process.
+template <typename DataType, typename DistType>
+inline bool HNSWIndex_Multi<DataType, DistType>::safeCheckIfLabelExistsInIndex(
+    labelType label, bool also_done_processing) const {
+    // Read-only access: a shared lock is enough and does not serialize concurrent readers.
+    std::shared_lock<std::shared_mutex> index_data_lock(this->index_data_guard_);
+    auto search_res = label_lookup_.find(label);
+    if (search_res == label_lookup_.end()) {
+        return false;
+    }
+    if (also_done_processing) {
+        for (auto id : search_res->second) {
+            // A single internal id that is still in process means the label is not ready.
+            if (this->isInProcess(id))
+                return false;
+        }
+    }
+    return true;
+}
diff --git a/src/VecSim/algorithms/hnsw/hnsw_multi_batch_iterator.h b/src/VecSim/algorithms/hnsw/hnsw_multi_batch_iterator.h
index de8ea3fa0..9e45346dd 100644
--- a/src/VecSim/algorithms/hnsw/hnsw_multi_batch_iterator.h
+++ b/src/VecSim/algorithms/hnsw/hnsw_multi_batch_iterator.h
@@ -48,12 +48,9 @@ VecSimQueryResult_List HNSWMulti_BatchIterator<DataType, DistType>::prepareResul
     // Return results from the top candidates heap, put them in reverse order in the batch results
     // array.
     for (int i = (int)(top_candidates->size() - 1); i >= 0; i--) {
-        labelType label = top_candidates->top().second;
-        // TODO: get best score by only checking unvisited vectors under this label.
-        DistType score = this->index->getDistanceFrom(label, this->getQueryBlob());
-        VecSimQueryResult_SetId(rl.results[i], label);
-        VecSimQueryResult_SetScore(rl.results[i], score);
-        this->returned.insert(label);
+        VecSimQueryResult_SetId(rl.results[i], top_candidates->top().second);
+        VecSimQueryResult_SetScore(rl.results[i], top_candidates->top().first);
+        this->returned.insert(top_candidates->top().second);
         top_candidates->pop();
     }
     return rl;
diff --git a/src/VecSim/algorithms/hnsw/hnsw_multi_tests_friends.h b/src/VecSim/algorithms/hnsw/hnsw_multi_tests_friends.h
index 90b3bb09c..ca8a6d880 100644
--- a/src/VecSim/algorithms/hnsw/hnsw_multi_tests_friends.h
+++ b/src/VecSim/algorithms/hnsw/hnsw_multi_tests_friends.h
@@ -12,3 +12,6 @@ INDEX_TEST_FRIEND_CLASS(HNSWMultiTest_test_dynamic_hnsw_info_iterator_Test)
 INDEX_TEST_FRIEND_CLASS(HNSWMultiTest_preferAdHocOptimization_Test)
 INDEX_TEST_FRIEND_CLASS(HNSWMultiTest_testSizeEstimation_Test)
 INDEX_TEST_FRIEND_CLASS(HNSWMultiTest_markDelete_Test)
+INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTest_testSizeEstimation_Test)
+INDEX_TEST_FRIEND_CLASS(HNSWMultiTest_removeVectorWithSwaps_Test)
+INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTest_swapJobBasic_Test)
diff --git a/src/VecSim/algorithms/hnsw/hnsw_serialization_utils.h b/src/VecSim/algorithms/hnsw/hnsw_serialization_utils.h
index c8c73771d..9aa955e16 100644
--- a/src/VecSim/algorithms/hnsw/hnsw_serialization_utils.h
+++ b/src/VecSim/algorithms/hnsw/hnsw_serialization_utils.h
@@ -11,4 +11,5 @@ typedef struct {
     size_t unidirectional_connections;
     size_t min_in_degree;
     size_t max_in_degree;
+    size_t connections_to_repair;
 } HNSWIndexMetaData;
diff --git a/src/VecSim/algorithms/hnsw/hnsw_serializer.h b/src/VecSim/algorithms/hnsw/hnsw_serializer.h
index 684d04f19..956229b61 100644
--- a/src/VecSim/algorithms/hnsw/hnsw_serializer.h
+++ b/src/VecSim/algorithms/hnsw/hnsw_serializer.h
@@ -2,13 +2,13 @@
 
 template <typename DataType, typename DistType>
 HNSWIndex<DataType, DistType>::HNSWIndex(std::ifstream &input, const HNSWParams *params,
-                                         std::shared_ptr<VecSimAllocator> allocator,
-                                         EncodingVersion version)
-    : VecSimIndexAbstract<DistType>(allocator, params->dim, params->type, params->metric,
-                                    params->blockSize, params->multi),
-      Serializer(version), max_elements_(params->initialCapacity), epsilon_(params->epsilon),
-      element_levels_(max_elements_, allocator),
-      visited_nodes_handler_pool(1, max_elements_, allocator) {
+                                         const AbstractIndexInitParams &abstractInitParams,
+                                         Serializer::EncodingVersion version)
+    : VecSimIndexAbstract<DistType>(abstractInitParams), Serializer(version),
+      max_elements_(params->initialCapacity), epsilon_(params->epsilon),
+      element_levels_(max_elements_, this->allocator),
+      visited_nodes_handler_pool(1, max_elements_, this->allocator),
+      element_neighbors_locks_(max_elements_, this->allocator) {
 
     this->restoreIndexFields(input);
     this->fieldsValidation();
@@ -49,7 +49,8 @@ HNSWIndexMetaData HNSWIndex<DataType, DistType>::checkIntegrity() const {
                              .double_connections = HNSW_INVALID_META_DATA,
                              .unidirectional_connections = HNSW_INVALID_META_DATA,
                              .min_in_degree = HNSW_INVALID_META_DATA,
-                             .max_in_degree = HNSW_INVALID_META_DATA};
+                             .max_in_degree = HNSW_INVALID_META_DATA,
+                             .connections_to_repair = 0};
 
     // Save the current memory usage (before we use additional memory for the integrity check).
     res.memory_usage = this->getAllocationSize();
@@ -61,21 +62,25 @@ HNSWIndexMetaData HNSWIndex<DataType, DistType>::checkIntegrity() const {
             num_deleted++;
         }
         for (size_t l = 0; l <= this->element_levels_[i]; l++) {
-            idType *cur_links = this->get_linklist_at_level(i, l);
-            linkListSize size = this->getListCount(cur_links);
+            idType *cur_links = this->getNodeNeighborsAtLevel(i, l);
+            linkListSize size = this->getNodeNeighborsCount(cur_links);
             std::set<idType> s;
             for (unsigned int j = 0; j < size; j++) {
                 // Check if we found an invalid neighbor.
                 if (cur_links[j] >= this->cur_element_count || cur_links[j] == i) {
                     return res;
                 }
+                // If the neighbor has been marked as deleted, this connection should be repaired.
+                if (isMarkedDeleted(cur_links[j])) {
+                    res.connections_to_repair++;
+                }
                 inbound_connections_num[cur_links[j]]++;
                 s.insert(cur_links[j]);
                 connections_checked++;
 
                 // Check if this connection is bidirectional.
-                idType *other_links = this->get_linklist_at_level(cur_links[j], l);
-                linkListSize size_other = this->getListCount(other_links);
+                idType *other_links = this->getNodeNeighborsAtLevel(cur_links[j], l);
+                linkListSize size_other = this->getNodeNeighborsCount(other_links);
                 for (int r = 0; r < size_other; r++) {
                     if (other_links[r] == (idType)i) {
                         double_connections++;
@@ -125,7 +130,7 @@ void HNSWIndex<DataType, DistType>::restoreIndexFields(std::ifstream &input) {
     // epsilon is only restored from v2 up.
 
     // Restore index meta-data
-    readBinaryPOD(input, this->data_size_);
+    readBinaryPOD(input, this->data_size);
     readBinaryPOD(input, this->size_data_per_element_);
     readBinaryPOD(input, this->size_links_per_element_);
     readBinaryPOD(input, this->size_links_level0_);
@@ -147,7 +152,7 @@ void HNSWIndex<DataType, DistType>::restoreIndexFields(std::ifstream &input) {
     } else {
         readBinaryPOD(input, this->num_marked_deleted);
     }
-    readBinaryPOD(input, this->maxlevel_);
+    readBinaryPOD(input, this->max_level_);
     readBinaryPOD(input, this->entrypoint_node_);
 }
 
@@ -294,7 +299,7 @@ void HNSWIndex<DataType, DistType>::saveIndexFields(std::ofstream &output) const
     writeBinaryPOD(output, this->ef_);
 
     // Save index meta-data
-    writeBinaryPOD(output, this->data_size_);
+    writeBinaryPOD(output, this->data_size);
     writeBinaryPOD(output, this->size_data_per_element_);
     writeBinaryPOD(output, this->size_links_per_element_);
     writeBinaryPOD(output, this->size_links_level0_);
@@ -308,7 +313,7 @@ void HNSWIndex<DataType, DistType>::saveIndexFields(std::ofstream &output) const
     // Save index state
     writeBinaryPOD(output, this->cur_element_count);
     writeBinaryPOD(output, this->num_marked_deleted);
-    writeBinaryPOD(output, this->maxlevel_);
+    writeBinaryPOD(output, this->max_level_);
     writeBinaryPOD(output, this->entrypoint_node_);
 }
 
diff --git a/src/VecSim/algorithms/hnsw/hnsw_serializer_declarations.h b/src/VecSim/algorithms/hnsw/hnsw_serializer_declarations.h
index 1393afbe0..a954b11d3 100644
--- a/src/VecSim/algorithms/hnsw/hnsw_serializer_declarations.h
+++ b/src/VecSim/algorithms/hnsw/hnsw_serializer_declarations.h
@@ -3,7 +3,7 @@
 // Serializing and tests functions.
 public:
 HNSWIndex(std::ifstream &input, const HNSWParams *params,
-          std::shared_ptr<VecSimAllocator> allocator, EncodingVersion version);
+          const AbstractIndexInitParams &abstractInitParams, EncodingVersion version);
 
 // Validates the connections between vectors
 HNSWIndexMetaData checkIntegrity() const;
diff --git a/src/VecSim/algorithms/hnsw/hnsw_single.h b/src/VecSim/algorithms/hnsw/hnsw_single.h
index f17896859..42b0f1f19 100644
--- a/src/VecSim/algorithms/hnsw/hnsw_single.h
+++ b/src/VecSim/algorithms/hnsw/hnsw_single.h
@@ -12,6 +12,8 @@
 template <typename DataType, typename DistType>
 class HNSWIndex_Single : public HNSWIndex<DataType, DistType> {
 private:
+    // Index global state - this should be guarded by the index_data_guard_ lock in
+    // multithreaded scenario.
     vecsim_stl::unordered_map<labelType, idType> label_lookup_;
 
 #ifdef BUILD_TESTS
@@ -21,26 +23,31 @@ class HNSWIndex_Single : public HNSWIndex<DataType, DistType> {
     inline void replaceIdOfLabel(labelType label, idType new_id, idType old_id) override;
     inline void setVectorId(labelType label, idType id) override { label_lookup_[label] = id; }
     inline void resizeLabelLookup(size_t new_max_elements) override;
+    inline vecsim_stl::set<labelType> getLabelsSet() const override;
+
+    template <bool Safe>
+    inline double getDistanceFromInternal(labelType label, const void *vector_data) const;
 
 public:
-    HNSWIndex_Single(const HNSWParams *params, std::shared_ptr<VecSimAllocator> allocator,
+    HNSWIndex_Single(const HNSWParams *params, const AbstractIndexInitParams &abstractInitParams,
                      size_t random_seed = 100, size_t initial_pool_size = 1)
-        : HNSWIndex<DataType, DistType>(params, allocator, random_seed, initial_pool_size),
-          label_lookup_(this->max_elements_, allocator) {}
+        : HNSWIndex<DataType, DistType>(params, abstractInitParams, random_seed, initial_pool_size),
+          label_lookup_(this->max_elements_, this->allocator) {}
 #ifdef BUILD_TESTS
     // Ctor to be used before loading a serialized index. Can be used from v2 and up.
     HNSWIndex_Single(std::ifstream &input, const HNSWParams *params,
-                     std::shared_ptr<VecSimAllocator> allocator,
+                     const AbstractIndexInitParams &abstractInitParams,
                      Serializer::EncodingVersion version)
-        : HNSWIndex<DataType, DistType>(input, params, allocator, version),
-          label_lookup_(this->max_elements_, allocator) {}
+        : HNSWIndex<DataType, DistType>(input, params, abstractInitParams, version),
+          label_lookup_(this->max_elements_, this->allocator) {}
 
-    void GetDataByLabel(labelType label, std::vector<std::vector<DataType>> &vectors_output) {
+    void getDataByLabel(labelType label,
+                        std::vector<std::vector<DataType>> &vectors_output) const override {
 
         auto id = label_lookup_.at(label);
 
         auto vec = std::vector<DataType>(this->dim);
-        memcpy(vec.data(), this->getDataByInternalId(id), this->data_size_);
+        memcpy(vec.data(), this->getDataByInternalId(id), this->data_size);
         vectors_output.push_back(vec);
     }
 #endif
@@ -61,9 +68,17 @@ class HNSWIndex_Single : public HNSWIndex<DataType, DistType> {
                                           VecSimQueryParams *queryParams) const override;
 
     int deleteVector(labelType label) override;
-    int addVector(const void *vector_data, labelType label, bool overwrite_allowed = true) override;
-    double getDistanceFrom(labelType label, const void *vector_data) const override;
+    int addVector(const void *vector_data, labelType label, void *auxiliaryCtx = nullptr) override;
     inline std::vector<idType> markDelete(labelType label) override;
+    inline bool safeCheckIfLabelExistsInIndex(labelType label,
+                                              bool also_done_processing = false) const override;
+
+    double getDistanceFrom(labelType label, const void *vector_data) const override {
+        return getDistanceFromInternal<false>(label, vector_data);
+    }
+    double safeGetDistanceFrom(labelType label, const void *vector_data) const override {
+        return getDistanceFromInternal<true>(label, vector_data);
+    }
 };
 
 /**
@@ -75,19 +90,41 @@ size_t HNSWIndex_Single<DataType, DistType>::indexLabelCount() const {
     return label_lookup_.size();
 }
 
+/**
+ * helper functions
+ */
+
+// Return all the labels in the index - this should be used for computing the number of distinct
+// labels in a tiered index, and caller should hold the index data guard.
+template <typename DataType, typename DistType>
+inline vecsim_stl::set<labelType> HNSWIndex_Single<DataType, DistType>::getLabelsSet() const {
+    vecsim_stl::set<labelType> keys(this->allocator);
+    for (auto &it : label_lookup_) {
+        keys.insert(it.first);
+    }
+    return keys;
+};
+
 template <typename DataType, typename DistType>
-double HNSWIndex_Single<DataType, DistType>::getDistanceFrom(labelType label,
-                                                             const void *vector_data) const {
-    auto id = label_lookup_.find(label);
-    if (id == label_lookup_.end() || this->isMarkedDeleted(id->second)) {
+template <bool Safe>
+double
+HNSWIndex_Single<DataType, DistType>::getDistanceFromInternal(labelType label,
+                                                              const void *vector_data) const {
+    if (Safe)
+        this->index_data_guard_.lock_shared();
+
+    auto it = label_lookup_.find(label);
+    if (it == label_lookup_.end()) {
+        if (Safe)
+            this->index_data_guard_.unlock_shared();
         return INVALID_SCORE;
     }
-    return this->dist_func(vector_data, this->getDataByInternalId(id->second), this->dim);
-}
+    idType id = it->second;
+    if (Safe)
+        this->index_data_guard_.unlock_shared();
 
-/**
- * helper functions
- */
+    return this->dist_func(vector_data, this->getDataByInternalId(id), this->dim);
+}
 
 template <typename DataType, typename DistType>
 void HNSWIndex_Single<DataType, DistType>::replaceIdOfLabel(labelType label, idType new_id,
@@ -112,28 +149,26 @@ int HNSWIndex_Single<DataType, DistType>::deleteVector(const labelType label) {
     }
     idType element_internal_id = label_lookup_[label];
     label_lookup_.erase(label);
-    this->removeVector(element_internal_id);
+    this->removeVectorInPlace(element_internal_id);
     return 1;
 }
 
 template <typename DataType, typename DistType>
 int HNSWIndex_Single<DataType, DistType>::addVector(const void *vector_data, const labelType label,
-                                                    bool overwrite_allowed) {
+                                                    void *auxiliaryCtx) {
 
     // Checking if an element with the given label already exists.
     bool label_exists = false;
-    if (label_lookup_.find(label) != label_lookup_.end()) {
-        label_exists = true;
-        if (overwrite_allowed) {
-            // Remove the vector in place if override allowed (in non-async scenario)
+    // Note that it is the caller's responsibility to ensure that this label doesn't exist in the
+    // index and to increase the element count before calling this, if auxiliaryCtx is *not* NULL.
+    if (auxiliaryCtx == nullptr) {
+        if (label_lookup_.find(label) != label_lookup_.end()) {
+            label_exists = true;
+            // The label already exists: remove the old vector in place (non-async scenario).
             deleteVector(label);
-        } else {
-            // If override is not allowed, we don't do anything.
-            return -1;
         }
     }
-
-    this->appendVector(vector_data, label);
+    this->appendVector(vector_data, label, (AddVectorCtx *)auxiliaryCtx);
     // Return the delta in the index size due to the insertion.
     return label_exists ? 0 : 1;
 }
@@ -144,9 +179,7 @@ HNSWIndex_Single<DataType, DistType>::newBatchIterator(const void *queryBlob,
                                                        VecSimQueryParams *queryParams) const {
     auto queryBlobCopy = this->allocator->allocate(sizeof(DataType) * this->dim);
     memcpy(queryBlobCopy, queryBlob, this->dim * sizeof(DataType));
-    if (this->metric == VecSimMetric_Cosine) {
-        normalizeVector((DataType *)queryBlobCopy, this->dim);
-    }
+
     // Ownership of queryBlobCopy moves to HNSW_BatchIterator that will free it at the end.
     return new (this->allocator) HNSWSingle_BatchIterator<DataType, DistType>(
         queryBlobCopy, this, queryParams, this->allocator);
@@ -159,6 +192,7 @@ HNSWIndex_Single<DataType, DistType>::newBatchIterator(const void *queryBlob,
 template <typename DataType, typename DistType>
 std::vector<idType> HNSWIndex_Single<DataType, DistType>::markDelete(labelType label) {
     std::vector<idType> idsToDelete;
+    std::unique_lock<std::shared_mutex> index_data_lock(this->index_data_guard_);
     auto search = label_lookup_.find(label);
     if (search == label_lookup_.end()) {
         return idsToDelete;
@@ -168,3 +202,17 @@ std::vector<idType> HNSWIndex_Single<DataType, DistType>::markDelete(labelType l
     label_lookup_.erase(search);
     return idsToDelete;
 }
+
+// Checks, while holding the index data guard, whether `label` exists in the index. If
+// `also_done_processing` is set, its internal id must also be out of process.
+template <typename DataType, typename DistType>
+inline bool HNSWIndex_Single<DataType, DistType>::safeCheckIfLabelExistsInIndex(
+    labelType label, bool also_done_processing) const {
+    // Read-only access: a shared lock is enough and does not serialize concurrent readers.
+    std::shared_lock<std::shared_mutex> index_data_lock(this->index_data_guard_);
+    auto it = label_lookup_.find(label);
+    if (it == label_lookup_.end()) {
+        return false;
+    }
+    return !also_done_processing || !this->isInProcess(it->second);
+}
diff --git a/src/VecSim/algorithms/hnsw/hnsw_single_tests_friends.h b/src/VecSim/algorithms/hnsw/hnsw_single_tests_friends.h
index 83667fb22..db30d1d7d 100644
--- a/src/VecSim/algorithms/hnsw/hnsw_single_tests_friends.h
+++ b/src/VecSim/algorithms/hnsw/hnsw_single_tests_friends.h
@@ -10,3 +10,6 @@ INDEX_TEST_FRIEND_CLASS(HNSWTest_preferAdHocOptimization_Test)
 INDEX_TEST_FRIEND_CLASS(HNSWTest_testSizeEstimation_Test)
 INDEX_TEST_FRIEND_CLASS(IndexAllocatorTest_testIncomingEdgesSet_Test)
 INDEX_TEST_FRIEND_CLASS(IndexAllocatorTest_test_hnsw_reclaim_memory_Test)
+INDEX_TEST_FRIEND_CLASS(HNSWTestParallel_parallelInsertSearch_Test)
+INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTest_testSizeEstimation_Test)
+INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTest_swapJobBasic_Test)
diff --git a/src/VecSim/algorithms/hnsw/hnsw_tiered.h b/src/VecSim/algorithms/hnsw/hnsw_tiered.h
index 9e895c278..111e94099 100644
--- a/src/VecSim/algorithms/hnsw/hnsw_tiered.h
+++ b/src/VecSim/algorithms/hnsw/hnsw_tiered.h
@@ -1,75 +1,1091 @@
 #pragma once
 
+#include "VecSim/algorithms/brute_force/brute_force_single.h"
 #include "VecSim/vec_sim_tiered_index.h"
 #include "hnsw.h"
-#include "hnsw_factory.h"
+#include "VecSim/index_factories/hnsw_factory.h"
 
 #include <unordered_map>
+/**
+ * Definition of a job that inserts a new vector from the flat buffer into the HNSW index.
+ */
+struct HNSWInsertJob : public AsyncJob {
+    labelType label; // label of the vector to be inserted
+    idType id;       // internal id of the vector in the flat buffer
+
+    HNSWInsertJob(std::shared_ptr<VecSimAllocator> allocator, labelType label_, idType id_,
+                  JobCallback insertCb, VecSimIndex *index_)
+        : AsyncJob(allocator, HNSW_INSERT_VECTOR_JOB, insertCb, index_), label(label_), id(id_) {}
+};
+
+/**
+ * Definition of a job that swaps last id with a deleted id in HNSW Index after delete operation.
+ */
+struct HNSWSwapJob : public VecsimBaseObject {
+    idType deleted_id; // internal id of the deleted element that this job disposes of
+    std::atomic_int
+        pending_repair_jobs_counter; // number of repair jobs left to complete before this job
+                                     // is ready to be executed (atomic counter).
+    HNSWSwapJob(std::shared_ptr<VecSimAllocator> allocator, idType deletedId)
+        : VecsimBaseObject(allocator), deleted_id(deletedId), pending_repair_jobs_counter(0) {}
+    void setRepairJobsNum(long num_repair_jobs) { pending_repair_jobs_counter = num_repair_jobs; }
+    int atomicDecreasePendingJobsNum() {
+        int ret = --pending_repair_jobs_counter;
+        assert(ret >= 0); // validate the value this call produced, not a second (racy) load
+        return ret;
+    }
+};
+
+static const size_t DEFAULT_PENDING_SWAP_JOBS_THRESHOLD = DEFAULT_BLOCK_SIZE;
+static const size_t MAX_PENDING_SWAP_JOBS_THRESHOLD = 100000;
+
+/**
+ * Definition of a job that repairs a certain node's connection in HNSW Index after delete
+ * operation.
+ */
+struct HNSWRepairJob : public AsyncJob {
+    idType node_id;       // internal id of the node to repair
+    unsigned short level; // graph level in which the node's connections are repaired
+    vecsim_stl::vector<HNSWSwapJob *> associatedSwapJobs;
+
+    HNSWRepairJob(std::shared_ptr<VecSimAllocator> allocator, idType id_, unsigned short level_,
+                  JobCallback repairCb, VecSimIndex *index_, HNSWSwapJob *swapJob)
+        : AsyncJob(allocator, HNSW_REPAIR_NODE_CONNECTIONS_JOB, repairCb, index_), node_id(id_),
+          level(level_),
+          // Insert the first swap job from which this repair job was created.
+          associatedSwapJobs(1, swapJob, this->allocator) {}
+    // In case that a repair job is required for deleting another neighbor of the node, save a
+    // reference to the additional swap job.
+    void appendAnotherAssociatedSwapJob(HNSWSwapJob *swapJob) {
+        associatedSwapJobs.push_back(swapJob);
+    }
+};
 
 template <typename DataType, typename DistType>
 class TieredHNSWIndex : public VecSimTieredIndex<DataType, DistType> {
 private:
     /// Mappings from id/label to associated jobs, for invalidating and update ids if necessary.
-    // In MULTI, we can have more than one insert job pending per label
-    std::unordered_map<labelType, std::vector<HNSWInsertJob *>> labelToInsertJobs;
-    std::unordered_map<idType, std::vector<HNSWRepairJob *>> idToRepairJobs;
-    std::unordered_map<idType, HNSWSwapJob *> idToSwapJob;
+    // In MULTI, we can have more than one insert job pending per label.
+    // **This map is protected with the flat buffer lock**
+    vecsim_stl::unordered_map<labelType, vecsim_stl::vector<HNSWInsertJob *>> labelToInsertJobs;
+    vecsim_stl::unordered_map<idType, vecsim_stl::vector<HNSWRepairJob *>> idToRepairJobs;
+    vecsim_stl::unordered_map<idType, HNSWSwapJob *> idToSwapJob;
+
+    // A mapping to hold invalid jobs, so we can dispose them upon index deletion.
+    vecsim_stl::unordered_map<idType, AsyncJob *> invalidJobs;
+    idType currInvalidJobId; // A unique arbitrary identifier for accessing invalid jobs
+    std::mutex invalidJobsLookupGuard;
+
+    // This threshold is tested upon deleting a label from HNSW, and once the number of deleted
+    // vectors reaches this limit, we apply swap jobs *only for vectors that have no more pending
+    // repair jobs*, and are ready to be removed from the graph.
+    size_t pendingSwapJobsThreshold;
+    size_t readySwapJobs;
 
-    // Todo: implement these methods later on
-    void executeInsertJob(HNSWInsertJob *job) {}
-    void executeRepairJob(HNSWRepairJob *job) {}
+    // Protects both the idToRepairJobs lookup and the pending_repair_jobs_counter of the
+    // associated swap jobs.
+    std::mutex idToRepairJobsGuard;
 
-    // To be executed synchronously upon deleting a vector, doesn't require a wrapper.
-    void executeSwapJob(HNSWSwapJob *job) {}
+    void executeInsertJob(HNSWInsertJob *job);
+    void executeRepairJob(HNSWRepairJob *job);
+
+    // To be executed synchronously upon deleting a vector, doesn't require a wrapper. Main HNSW
+    // lock is assumed to be held exclusive here.
+    void executeSwapJob(HNSWSwapJob *job, vecsim_stl::vector<idType> &idsToRemove);
+
+    void executeReadySwapJobs();
 
     // Wrappers static functions to be sent as callbacks upon creating the jobs (since members
     // functions cannot serve as callback, this serve as the "gateway" to the appropriate index).
-    static void executeInsertJobWrapper(HNSWInsertJob *job) {
-        reinterpret_cast<TieredHNSWIndex<DataType, DistType> *>(job->index)->executeInsertJob(job);
-    }
-    static void executeRepairJobWrapper(HNSWRepairJob *job) {
-        reinterpret_cast<TieredHNSWIndex<DataType, DistType> *>(job->index)->executeRepairJob(job);
-    }
+    static void executeInsertJobWrapper(AsyncJob *job);
+    static void executeRepairJobWrapper(AsyncJob *job);
+
+    inline HNSWIndex<DataType, DistType> *getHNSWIndex() const;
+
+    // Helper function for updating the internal id held by pending insert job(s) after a vector
+    // was removed from the flat buffer. Since deleting a vector triggers swapping of the internal
+    // last id with the deleted vector id, the pending insert job(s) that refer to the last id
+    // (prev_id) under the given label are updated to hold the new id (new_id), if any exist.
+    // This should be called while *flat lock is held* (exclusive lock).
+    void updateInsertJobInternalId(idType prev_id, idType new_id, labelType label);
+
+    // Helper function for performing in place mark delete of vector(s) associated with a label
+    // and creating the appropriate repair jobs for the affected connections. This function
+    // acquires the *HNSW shared lock* internally for its entire duration.
+    int deleteLabelFromHNSW(labelType label);
+
+    // Insert a single vector to HNSW. This can be called in both write modes - insert async and
+    // in-place. For the async mode, we have to release the flat index guard that is held for shared
+    // ownership (we do it right after we update the HNSW global data and receive the new state).
+    template <bool releaseFlatGuard>
+    void insertVectorToHNSW(HNSWIndex<DataType, DistType> *hnsw_index, labelType label,
+                            const void *blob);
+
+    // Set an insert/repair job as invalid, put the job pointer in the invalid jobs lookup under
+    // the current available id, increase it and return it (while holding invalidJobsLookupGuard).
+    // Returns the id that the job was stored under (to be set in the job id field).
+    idType setAndSaveInvalidJob(AsyncJob *job);
 
 #ifdef BUILD_TESTS
 #include "VecSim/algorithms/hnsw/hnsw_tiered_tests_friends.h"
 #endif
 
 public:
-    TieredHNSWIndex(HNSWIndex<DataType, DistType> *hnsw_index, TieredIndexParams tieredParams)
-        : VecSimTieredIndex<DataType, DistType>(hnsw_index, tieredParams) {}
-    virtual ~TieredHNSWIndex() = default;
-
-    // TODO: Implement the actual methods instead of these temporary ones.
-    int addVector(const void *blob, labelType label, bool overwrite_allowed) override {
-        return this->index->addVector(blob, label, overwrite_allowed);
-    }
-    int deleteVector(labelType id) override { return this->index->deleteVector(id); }
-    double getDistanceFrom(labelType id, const void *blob) const override {
-        return this->index->getDistanceFrom(id, blob);
-    }
-    size_t indexSize() const override { return this->index->indexSize(); }
-    size_t indexCapacity() const override { return this->index->indexCapacity(); }
-    void increaseCapacity() override { this->index->increaseCapacity(); }
-    size_t indexLabelCount() const override { return this->index->indexLabelCount(); }
-    VecSimQueryResult_List topKQuery(const void *queryBlob, size_t k,
-                                     VecSimQueryParams *queryParams) override {
-        return this->index->topKQuery(queryBlob, k, queryParams);
-    }
-    VecSimQueryResult_List rangeQuery(const void *queryBlob, double radius,
-                                      VecSimQueryParams *queryParams) override {
-        return this->index->rangeQuery(queryBlob, radius, queryParams);
-    }
-    VecSimIndexInfo info() const override { return this->index->info(); }
-    VecSimInfoIterator *infoIterator() const override { return this->index->infoIterator(); }
+    class TieredHNSW_BatchIterator : public VecSimBatchIterator {
+    private:
+        const TieredHNSWIndex<DataType, DistType> *index;
+        VecSimQueryParams *queryParams;
+
+        VecSimQueryResult_List flat_results;
+        VecSimQueryResult_List hnsw_results;
+
+        VecSimBatchIterator *flat_iterator;
+        VecSimBatchIterator *hnsw_iterator;
+
+        // On single value indices, this set holds the IDs of the results that were returned from
+        // the flat buffer.
+        // On multi value indices, this set holds the IDs of all the results that were returned.
+        // The difference between the two cases is that on multi value indices, the same ID can
+        // appear in both indexes and results with different scores, and therefore we can't tell in
+        // advance when to expect a duplicate.
+        // On single value indices, a duplicate may appear in the same batch (and we will handle it
+        // when merging the results), or it may appear in different batches - first from the flat
+        // buffer and then from HNSW - in cases where a better result is found later in HNSW
+        // because of the approximate nature of the algorithm.
+        vecsim_stl::unordered_set<labelType> returned_results_set;
+
+    private:
+        template <bool isMultiValue>
+        inline VecSimQueryResult_List compute_current_batch(size_t n_res);
+        inline void filter_irrelevant_results(VecSimQueryResult_List &);
+
+    public:
+        TieredHNSW_BatchIterator(void *query_vector,
+                                 const TieredHNSWIndex<DataType, DistType> *index,
+                                 VecSimQueryParams *queryParams,
+                                 std::shared_ptr<VecSimAllocator> allocator);
+
+        ~TieredHNSW_BatchIterator();
+
+        VecSimQueryResult_List getNextResults(size_t n_res, VecSimQueryResult_Order order) override;
+
+        bool isDepleted() override;
+
+        void reset() override;
+    };
+
+public:
+    TieredHNSWIndex(HNSWIndex<DataType, DistType> *hnsw_index,
+                    BruteForceIndex<DataType, DistType> *bf_index,
+                    const TieredIndexParams &tieredParams,
+                    std::shared_ptr<VecSimAllocator> allocator);
+    virtual ~TieredHNSWIndex();
+
+    int addVector(const void *blob, labelType label, void *auxiliaryCtx = nullptr) override;
+    int deleteVector(labelType label) override;
+    size_t indexSize() const override;
+    size_t indexLabelCount() const override;
+    size_t indexCapacity() const override;
+    double getDistanceFrom(labelType label, const void *blob) const override;
+    // Do nothing here, each tier (flat buffer and HNSW) should increase capacity for itself when
+    // needed.
+    void increaseCapacity() override {}
+    VecSimIndexInfo info() const override;
+    VecSimIndexBasicInfo basicInfo() const override;
+    VecSimInfoIterator *infoIterator() const override;
     VecSimBatchIterator *newBatchIterator(const void *queryBlob,
                                           VecSimQueryParams *queryParams) const override {
-        return this->index->newBatchIterator(queryBlob, queryParams);
-    }
-    bool preferAdHocSearch(size_t subsetSize, size_t k, bool initial_check) override {
-        return this->index->preferAdHocSearch(subsetSize, k, initial_check);
+        size_t blobSize = this->backendIndex->getDim() * sizeof(DataType);
+        void *queryBlobCopy = this->allocator->allocate(blobSize);
+        memcpy(queryBlobCopy, queryBlob, blobSize);
+        return new (this->allocator)
+            TieredHNSW_BatchIterator(queryBlobCopy, this, queryParams, this->allocator);
     }
     inline void setLastSearchMode(VecSearchMode mode) override {
-        return this->index->setLastSearchMode(mode);
+        return this->backendIndex->setLastSearchMode(mode);
+    }
+};
+
+/**
+ ******************************* Implementation **************************
+ */
+
+/* Helper methods */
+template <typename DataType, typename DistType>
+void TieredHNSWIndex<DataType, DistType>::executeInsertJobWrapper(AsyncJob *job) {
+    auto *insert_job = reinterpret_cast<HNSWInsertJob *>(job);
+    auto *job_index = reinterpret_cast<TieredHNSWIndex<DataType, DistType> *>(insert_job->index);
+    job_index->executeInsertJob(insert_job); // dispatch to the index stored in the job
+    delete job; // the job is disposed of here, right after it has been executed
+}
+
+template <typename DataType, typename DistType>
+void TieredHNSWIndex<DataType, DistType>::executeRepairJobWrapper(AsyncJob *job) {
+    auto *repair_job = reinterpret_cast<HNSWRepairJob *>(job);
+    auto *job_index = reinterpret_cast<TieredHNSWIndex<DataType, DistType> *>(repair_job->index);
+    job_index->executeRepairJob(repair_job); // dispatch to the index stored in the job
+    delete job; // the job is disposed of here, right after it has been executed
+}
+
+template <typename DataType, typename DistType>
+void TieredHNSWIndex<DataType, DistType>::executeSwapJob(HNSWSwapJob *job,
+                                                         vecsim_stl::vector<idType> &idsToRemove) {
+    auto hnsw_index = this->getHNSWIndex();
+    hnsw_index->removeAndSwapDeletedElement(job->deleted_id);
+    // Get the id that was last and has been swapped with the job's deleted id.
+    idType prev_last_id = this->getHNSWIndex()->indexSize();
+
+    // Invalidate repair jobs for the disposed id (if exist), and update the associated swap jobs.
+    if (idToRepairJobs.find(job->deleted_id) != idToRepairJobs.end()) {
+        for (auto &job_it : idToRepairJobs.at(job->deleted_id)) {
+            job_it->node_id = this->setAndSaveInvalidJob(job_it);
+            for (auto &swap_job_it : job_it->associatedSwapJobs) {
+                if (swap_job_it->atomicDecreasePendingJobsNum() == 0) {
+                    readySwapJobs++;
+                }
+            }
+        }
+        idToRepairJobs.erase(job->deleted_id);
+    }
+    // Swap the ids in the pending jobs for the current last id (if exist).
+    if (idToRepairJobs.find(prev_last_id) != idToRepairJobs.end()) {
+        for (auto &job_it : idToRepairJobs.at(prev_last_id)) {
+            job_it->node_id = job->deleted_id;
+        }
+        idToRepairJobs.insert({job->deleted_id, idToRepairJobs.at(prev_last_id)});
+        idToRepairJobs.erase(prev_last_id);
+    }
+    // Update the swap jobs if the last id also needs a swap, otherwise just collect the deleted id
+    // to be removed from the swap jobs.
+    if (prev_last_id != job->deleted_id && idToSwapJob.find(prev_last_id) != idToSwapJob.end() &&
+        std::find(idsToRemove.begin(), idsToRemove.end(), prev_last_id) == idsToRemove.end()) {
+        // Update the prev_last_id pending swap job id after the removal that renamed prev_last_id
+        // to the deleted id.
+        idsToRemove.push_back(prev_last_id);
+        idToSwapJob.at(prev_last_id)->deleted_id = job->deleted_id;
+        idToSwapJob.at(job->deleted_id) = idToSwapJob.at(prev_last_id);
+    } else {
+        idsToRemove.push_back(job->deleted_id);
+    }
+}
+
+template <typename DataType, typename DistType>
+HNSWIndex<DataType, DistType> *TieredHNSWIndex<DataType, DistType>::getHNSWIndex() const {
+    return dynamic_cast<HNSWIndex<DataType, DistType> *>(this->backendIndex);
+}
+
+template <typename DataType, typename DistType>
+void TieredHNSWIndex<DataType, DistType>::executeReadySwapJobs() {
+
+    // If the number of ready swap jobs reached the threshold, go over the swap jobs and execute
+    // every job for which all of its pending repair jobs were executed (otherwise just return).
+    if (readySwapJobs < this->pendingSwapJobsThreshold) {
+        return;
+    }
+    // Execute swap jobs - acquire hnsw write lock.
+    this->mainIndexGuard.lock();
+
+    vecsim_stl::vector<idType> idsToRemove(this->allocator);
+    idsToRemove.reserve(idToSwapJob.size());
+    for (auto &it : idToSwapJob) {
+        auto *swap_job = it.second;
+        if (swap_job->pending_repair_jobs_counter.load() == 0) {
+            // Swap job is ready for execution - execute and delete it.
+            this->executeSwapJob(swap_job, idsToRemove);
+            delete swap_job;
+        }
+    }
+    for (idType id : idsToRemove) {
+        idToSwapJob.erase(id);
+    }
+    readySwapJobs -= idsToRemove.size();
+    this->mainIndexGuard.unlock();
+}
+
+template <typename DataType, typename DistType>
+int TieredHNSWIndex<DataType, DistType>::deleteLabelFromHNSW(labelType label) {
+    auto *hnsw_index = getHNSWIndex();
+    this->mainIndexGuard.lock_shared(); // held until the end of this function
+
+    // Get the required data about the relevant ids to delete.
+    // Internally, this will hold the index data lock.
+    auto internal_ids = hnsw_index->markDelete(label);
+
+    for (size_t i = 0; i < internal_ids.size(); i++) {
+        idType id = internal_ids[i];
+        vecsim_stl::vector<AsyncJob *> repair_jobs(this->allocator);
+        auto *swap_job = new (this->allocator) HNSWSwapJob(this->allocator, id);
+
+        // Go over all the deleted element links in every level and create repair jobs.
+        auto incoming_edges =
+            hnsw_index->safeCollectAllNodeIncomingNeighbors(id, hnsw_index->getElementTopLevel(id));
+
+        // Protect the id->repair_jobs lookup while we update it with the new jobs.
+        this->idToRepairJobsGuard.lock();
+        for (pair<idType, ushort> &node : incoming_edges) {
+            bool repair_job_exists = false;
+            HNSWRepairJob *repair_job = nullptr;
+            if (idToRepairJobs.find(node.first) != idToRepairJobs.end()) {
+                for (auto it : idToRepairJobs.at(node.first)) {
+                    if (it->level == node.second) {
+                        // There is already an existing pending repair job for this node due to
+                        // the deletion of another node - avoid creating another job.
+                        repair_job_exists = true;
+                        repair_job = it;
+                        break;
+                    }
+                }
+            } else {
+                // There are no repair jobs at all for this element, create a new array for it.
+                idToRepairJobs.insert(
+                    {node.first, vecsim_stl::vector<HNSWRepairJob *>(this->allocator)});
+            }
+            if (repair_job_exists) {
+                repair_job->appendAnotherAssociatedSwapJob(swap_job);
+            } else {
+                repair_job =
+                    new (this->allocator) HNSWRepairJob(this->allocator, node.first, node.second,
+                                                        executeRepairJobWrapper, this, swap_job);
+                repair_jobs.emplace_back(repair_job);
+                idToRepairJobs.at(node.first).push_back(repair_job);
+            }
+        }
+        swap_job->setRepairJobsNum(incoming_edges.size());
+        if (incoming_edges.size() == 0) {
+            // No pending repair jobs, so the swap job is ready from the beginning.
+            readySwapJobs++;
+        }
+        this->idToRepairJobsGuard.unlock();
+
+        this->submitJobs(repair_jobs);
+        // Insert the swap job into the swap jobs lookup (for fast update in case that the
+        // node id is changed due to swap job).
+        assert(idToSwapJob.find(id) == idToSwapJob.end());
+        idToSwapJob[id] = swap_job;
+    }
+    this->mainIndexGuard.unlock_shared();
+    return internal_ids.size(); // number of internal ids that were marked as deleted
+}
+
+template <typename DataType, typename DistType>
+void TieredHNSWIndex<DataType, DistType>::updateInsertJobInternalId(idType prev_id, idType new_id,
+                                                                    labelType label) {
+    // Update the pending job id, due to a swap that was caused after the removal of new_id.
+    assert(new_id != INVALID_ID && prev_id != INVALID_ID);
+    auto it = this->labelToInsertJobs.find(label);
+    if (it != this->labelToInsertJobs.end()) {
+        // The label of the swapped last id has pending job(s) - update those that hold prev_id.
+        for (HNSWInsertJob *job_it : it->second) {
+            if (job_it->id == prev_id) {
+                job_it->id = new_id; // the job now refers to the vector by its new id
+            }
+        }
+    }
+}
+
+template <typename DataType, typename DistType>
+template <bool releaseFlatGuard>
+void TieredHNSWIndex<DataType, DistType>::insertVectorToHNSW(
+    HNSWIndex<DataType, DistType> *hnsw_index, labelType label, const void *blob) {
+    // Acquire the index data lock, so we know what is the exact index size at this time. Acquire
+    // the main r/w lock before to avoid deadlocks.
+    AddVectorCtx state = {0};
+    this->mainIndexGuard.lock_shared();
+    hnsw_index->lockIndexDataGuard();
+    // Check if resizing is needed for HNSW index (requires write lock).
+    if (hnsw_index->indexCapacity() == hnsw_index->indexSize()) {
+        // Release both locks, then re-acquire them with the global HNSW lock held exclusively.
+        this->mainIndexGuard.unlock_shared();
+        hnsw_index->unlockIndexDataGuard();
+        this->mainIndexGuard.lock();
+        hnsw_index->lockIndexDataGuard();
+        // Check if resizing is still required (another thread might have done it in the meantime
+        // while we released the shared lock).
+        if (hnsw_index->indexCapacity() == hnsw_index->indexSize()) {
+            hnsw_index->increaseCapacity();
+        }
+        // Hold the index data lock while we store the new element. If the new node's max level is
+        // higher than the current one, hold the lock through the entire insertion to ensure that
+        // graph scans will not occur, as they will try to access the entry point's neighbors.
+        state = hnsw_index->storeNewElement(label);
+        if (releaseFlatGuard) {
+            this->flatIndexGuard.unlock_shared();
+        }
+
+        // If we're still holding the index data guard, we cannot take the main index lock for
+        // shared ownership as it may cause deadlocks, and we also cannot release the main index
+        // lock between, since we cannot allow swap jobs to happen, as they will make the
+        // saved state invalid. Hence, we insert the vector with the current exclusive lock held.
+        if (state.elementMaxLevel <= state.currMaxLevel) {
+            hnsw_index->unlockIndexDataGuard();
+        }
+        // Take the vector from the flat buffer and insert it to HNSW (overwrite should not occur).
+        hnsw_index->addVector(blob, label, &state);
+        if (state.elementMaxLevel > state.currMaxLevel) {
+            hnsw_index->unlockIndexDataGuard();
+        }
+        this->mainIndexGuard.unlock();
+    } else {
+        // Do the same as above except for changing the capacity, but with *shared* lock held:
+        // Hold the index data lock while we store the new element. If the new node's max level is
+        // higher than the current one, hold the lock through the entire insertion to ensure that
+        // graph scans will not occur, as they will try to access the entry point's neighbors.
+        state = hnsw_index->storeNewElement(label);
+        if (releaseFlatGuard) {
+            this->flatIndexGuard.unlock_shared();
+        }
+
+        if (state.elementMaxLevel <= state.currMaxLevel) {
+            hnsw_index->unlockIndexDataGuard();
+        }
+        // Take the vector from the flat buffer and insert it to HNSW (overwrite should not occur).
+        hnsw_index->addVector(blob, label, &state);
+        if (state.elementMaxLevel > state.currMaxLevel) {
+            hnsw_index->unlockIndexDataGuard();
+        }
+        this->mainIndexGuard.unlock_shared();
+    }
+}
+
+template <typename DataType, typename DistType>
+idType TieredHNSWIndex<DataType, DistType>::setAndSaveInvalidJob(AsyncJob *job) {
+    this->invalidJobsLookupGuard.lock();
+    job->isValid = false; // the job executors test this flag before doing any work
+    idType curInvalidId = currInvalidJobId++; // take the next free identifier
+    this->invalidJobs.insert({curInvalidId, job});
+    this->invalidJobsLookupGuard.unlock();
+    return curInvalidId; // the key under which the job is now stored in invalidJobs
+}
+
+/******************** Job's callbacks **********************************/
+template <typename DataType, typename DistType>
+void TieredHNSWIndex<DataType, DistType>::executeInsertJob(HNSWInsertJob *job) {
+    // Note that accessing the job fields should occur with flat index guard held (here and later).
+    this->flatIndexGuard.lock_shared();
+    if (!job->isValid) {
+        this->flatIndexGuard.unlock_shared();
+        // Job has been invalidated in the meantime - nothing to execute, just remove it from the
+        // invalid jobs lookup.
+        this->invalidJobsLookupGuard.lock();
+        this->invalidJobs.erase(job->id);
+        this->invalidJobsLookupGuard.unlock();
+        return;
+    }
+
+    HNSWIndex<DataType, DistType> *hnsw_index = this->getHNSWIndex();
+    // Copy the vector blob from the flat buffer, so we can release the flat lock while we are
+    // indexing the vector into HNSW index.
+    DataType blob_copy[this->frontendIndex->getDim()]; // NOTE(review): VLA - not standard C++
+    memcpy(blob_copy, this->frontendIndex->getDataByInternalId(job->id),
+           this->frontendIndex->getDim() * sizeof(DataType));
+
+    this->insertVectorToHNSW<true>(hnsw_index, job->label, blob_copy); // releases the flat lock
+
+    // Remove the vector and the insert job from the flat buffer.
+    this->flatIndexGuard.lock(); // exclusive this time - the flat buffer is about to change
+    // The job might have been invalidated due to overwrite in the meantime. In this case,
+    // it was already deleted and the job has been evicted. Otherwise, we need to do it now.
+    if (job->isValid) {
+        // Remove the job pointer from the labelToInsertJobs mapping.
+        auto &jobs = labelToInsertJobs.at(job->label);
+        for (size_t i = 0; i < jobs.size(); i++) {
+            if (jobs[i]->id == job->id) {
+                jobs.erase(jobs.begin() + (long)i);
+                break;
+            }
+        }
+        if (labelToInsertJobs.at(job->label).empty()) {
+            labelToInsertJobs.erase(job->label);
+        }
+        // Remove the vector from the flat buffer. This may cause the last vector id to swap with
+        // the deleted id. Hold the label for the last id, so we can later on update its
+        // corresponding job id. Note that after calling deleteVectorById, the last id's label
+        // shouldn't be available, since it is removed from the lookup.
+        labelType last_vec_label =
+            this->frontendIndex->getLabelByInternalId(this->frontendIndex->indexSize() - 1);
+        int deleted = this->frontendIndex->deleteVectorById(job->label, job->id);
+        if (deleted && job->id != this->frontendIndex->indexSize()) {
+            // If the vector removal caused a swap with the last id, update the relevant insert job.
+            this->updateInsertJobInternalId(this->frontendIndex->indexSize(), job->id,
+                                            last_vec_label);
+        }
+    } else {
+        // Remove the current job from the invalid jobs' lookup, as we are about to delete it now.
+        this->invalidJobsLookupGuard.lock();
+        this->invalidJobs.erase(job->id);
+        this->invalidJobsLookupGuard.unlock();
+    }
+    this->flatIndexGuard.unlock();
+}
+
+template <typename DataType, typename DistType>
+void TieredHNSWIndex<DataType, DistType>::executeRepairJob(HNSWRepairJob *job) {
+    // Lock the HNSW shared lock before accessing its internals.
+    this->mainIndexGuard.lock_shared();
+    if (!job->isValid) {
+        this->mainIndexGuard.unlock_shared();
+        // The current node has already been removed and disposed.
+        this->invalidJobsLookupGuard.lock();
+        this->invalidJobs.erase(job->node_id); // node_id holds the invalid job key here
+        this->invalidJobsLookupGuard.unlock();
+        return;
+    }
+    HNSWIndex<DataType, DistType> *hnsw_index = this->getHNSWIndex();
+
+    // Remove this job pointer from the repair jobs lookup BEFORE it has been executed. Had we done
+    // it after executing the repair job, we might have seen that there is a pending repair job for
+    // this node id upon deleting another neighbor of this node, and we may avoid creating another
+    // repair job even though *it has already been executed*.
+    this->idToRepairJobsGuard.lock();
+    auto &repair_jobs = this->idToRepairJobs.at(job->node_id);
+    assert(repair_jobs.size() > 0);
+    if (repair_jobs.size() == 1) {
+        // This was the only pending repair job for this id.
+        this->idToRepairJobs.erase(job->node_id);
+    } else {
+        // There are more pending jobs for the current id, remove just this job from the pending
+        // repair jobs list for this element id by replacing it with the last one (and trim the
+        // last job in the list).
+        auto it = std::find(repair_jobs.begin(), repair_jobs.end(), job);
+        assert(it != repair_jobs.end());
+        *it = repair_jobs.back();
+        repair_jobs.pop_back();
+    }
+    for (auto &it : job->associatedSwapJobs) {
+        if (it->atomicDecreasePendingJobsNum() == 0) {
+            readySwapJobs++; // no repair jobs are pending for this swap job anymore
+        }
+    }
+    this->idToRepairJobsGuard.unlock();
+
+    hnsw_index->repairNodeConnections(job->node_id, job->level);
+
+    this->mainIndexGuard.unlock_shared();
+}
+
+/******************** Index API ****************************************/
+
+template <typename DataType, typename DistType>
+TieredHNSWIndex<DataType, DistType>::TieredHNSWIndex(HNSWIndex<DataType, DistType> *hnsw_index,
+                                                     BruteForceIndex<DataType, DistType> *bf_index,
+                                                     const TieredIndexParams &tiered_index_params,
+                                                     std::shared_ptr<VecSimAllocator> allocator)
+    : VecSimTieredIndex<DataType, DistType>(hnsw_index, bf_index, tiered_index_params, allocator),
+      labelToInsertJobs(this->allocator), idToRepairJobs(this->allocator),
+      idToSwapJob(this->allocator), invalidJobs(this->allocator), currInvalidJobId(0),
+      readySwapJobs(0) {
+    // If the swapJobThreshold param is 0, use the default value; if it exceeds the maximum
+    // allowed, use the maximum value.
+    this->pendingSwapJobsThreshold =
+        tiered_index_params.specificParams.tieredHnswParams.swapJobThreshold == 0
+            ? DEFAULT_PENDING_SWAP_JOBS_THRESHOLD
+            : std::min(tiered_index_params.specificParams.tieredHnswParams.swapJobThreshold,
+                       MAX_PENDING_SWAP_JOBS_THRESHOLD);
+}
+
+template <typename DataType, typename DistType>
+TieredHNSWIndex<DataType, DistType>::~TieredHNSWIndex() {
+    // Delete all the pending insert jobs.
+    for (auto &jobs : this->labelToInsertJobs) {
+        for (auto *job : jobs.second) {
+            delete job;
+        }
+    }
+    // Delete all the pending repair jobs.
+    for (auto &jobs : this->idToRepairJobs) {
+        for (auto *job : jobs.second) {
+            delete job;
+        }
+    }
+    // Delete all the pending swap jobs.
+    for (auto &it : this->idToSwapJob) {
+        delete it.second;
+    }
+    // Delete all the invalidated jobs that are still stored in the invalid jobs lookup.
+    for (auto &it : this->invalidJobs) {
+        delete it.second;
+    }
+}
+
+template <typename DataType, typename DistType>
+size_t TieredHNSWIndex<DataType, DistType>::indexSize() const {
+    this->flatIndexGuard.lock_shared();
+    this->getHNSWIndex()->lockIndexDataGuard();
+    size_t res = this->backendIndex->indexSize() + this->frontendIndex->indexSize();
+    this->getHNSWIndex()->unlockIndexDataGuard();
+    this->flatIndexGuard.unlock_shared();
+    return res;
+}
+
+template <typename DataType, typename DistType>
+size_t TieredHNSWIndex<DataType, DistType>::indexCapacity() const {
+    return this->backendIndex->indexCapacity() + this->frontendIndex->indexCapacity();
+}
+
+template <typename DataType, typename DistType>
+size_t TieredHNSWIndex<DataType, DistType>::indexLabelCount() const {
+    // Compute the union of both labels set in both tiers of the index.
+    this->flatIndexGuard.lock();
+    this->mainIndexGuard.lock();
+    auto flat_labels = this->frontendIndex->getLabelsSet();
+    auto hnsw_labels = this->getHNSWIndex()->getLabelsSet();
+    std::vector<labelType> output;
+    std::set_union(flat_labels.begin(), flat_labels.end(), hnsw_labels.begin(), hnsw_labels.end(),
+                   std::back_inserter(output));
+    this->flatIndexGuard.unlock();
+    this->mainIndexGuard.unlock();
+    return output.size();
+}
+
+template <typename DataType, typename DistType>
+int TieredHNSWIndex<DataType, DistType>::addVector(const void *blob, labelType label,
+                                                   void *auxiliaryCtx) {
+    int ret = 1;
+    auto hnsw_index = this->getHNSWIndex();
+    if (this->getWriteMode() == VecSim_WriteInPlace) {
+        this->mainIndexGuard.lock();
+        // Internally, we may overwrite (delete the previous vector stored under this label), and
+        // may need to increase the capacity when we append the new vector afterwards.
+        ret = hnsw_index->addVector(blob, label);
+        this->mainIndexGuard.unlock();
+        return ret;
+    }
+    if (this->frontendIndex->indexSize() >= this->flatBufferLimit) {
+        // Handle overwrite situation.
+        if (!this->backendIndex->isMultiValue()) {
+            // This will do nothing (and return 0) if this label doesn't exist. Otherwise, it may
+            // remove vector from the flat buffer and/or the HNSW index.
+            ret -= this->deleteVector(label);
+        }
+        if (this->frontendIndex->indexSize() >= this->flatBufferLimit) {
+            // We didn't remove a vector from flat buffer due to overwrite, insert the new vector
+            // directly to HNSW. Since flat buffer guard was not held, no need to release it
+            // internally.
+            this->insertVectorToHNSW<false>(hnsw_index, label, blob);
+            return ret;
+        }
+        // Otherwise, we fall back to the "regular" insertion into the flat buffer
+        // (since it is not full anymore after removing the previous vector stored under the label).
+    }
+    this->flatIndexGuard.lock();
+    idType new_flat_id = this->frontendIndex->indexSize();
+    if (this->frontendIndex->isLabelExists(label) && !this->frontendIndex->isMultiValue()) {
+        // Overwrite the vector and invalidate its only pending job (since we are not in MULTI).
+        auto *old_job = this->labelToInsertJobs.at(label).at(0);
+        old_job->id = this->setAndSaveInvalidJob(old_job);
+        this->labelToInsertJobs.erase(label);
+        ret = 0;
+        // We are going to update the internal id that currently holds the vector associated with
+        // the given label.
+        new_flat_id =
+            dynamic_cast<BruteForceIndex_Single<DataType, DistType> *>(this->frontendIndex)
+                ->getIdOfLabel(label);
+        // If we are adding a new element (rather than updating an existing one) we may need to
+        // increase index capacity.
+    } else if (this->frontendIndex->indexCapacity() == this->frontendIndex->indexSize()) {
+        this->frontendIndex->increaseCapacity();
+    }
+    // If this label already exists, this will do overwrite.
+    this->frontendIndex->addVector(blob, label);
+
+    AsyncJob *new_insert_job = new (this->allocator)
+        HNSWInsertJob(this->allocator, label, new_flat_id, executeInsertJobWrapper, this);
+    // Save a pointer to the job, so that if the vector is overwritten, we'll have an indication.
+    if (this->labelToInsertJobs.find(label) != this->labelToInsertJobs.end()) {
+        // There's already a pending insert job for this label, add another one (without overwrite,
+        // only possible in multi index)
+        assert(this->backendIndex->isMultiValue());
+        this->labelToInsertJobs.at(label).push_back((HNSWInsertJob *)new_insert_job);
+    } else {
+        vecsim_stl::vector<HNSWInsertJob *> new_jobs_vec(1, (HNSWInsertJob *)new_insert_job,
+                                                         this->allocator);
+        this->labelToInsertJobs.insert({label, new_jobs_vec});
+    }
+    this->flatIndexGuard.unlock();
+
+    // Here, a worker might ingest the previous vector that was stored under "label"
+    // (in case of override in non-MULTI index) - so if it's there, we remove it (and create the
+    // required repair jobs), *before* we submit the insert job.
+    if (!this->backendIndex->isMultiValue()) {
+        // If we removed the previous vector from both HNSW and flat in the overwrite process,
+        // we still return 0 (not -1).
+        ret = MAX(ret - this->deleteLabelFromHNSW(label), 0);
+    }
+    // Apply ready swap jobs if number of deleted vectors reached the threshold (under exclusive
+    // lock of the main index guard).
+    this->executeReadySwapJobs();
+
+    // Insert job to the queue and signal the workers' updater.
+    this->submitSingleJob(new_insert_job);
+    return ret;
+}
+
+template <typename DataType, typename DistType>
+int TieredHNSWIndex<DataType, DistType>::deleteVector(labelType label) {
+    int num_deleted_vectors = 0;
+    this->flatIndexGuard.lock_shared();
+    if (this->frontendIndex->isLabelExists(label)) {
+        this->flatIndexGuard.unlock_shared();
+        this->flatIndexGuard.lock();
+        // Check again if the label exists, as it may have been removed while we released the lock.
+        if (this->frontendIndex->isLabelExists(label)) {
+            // Invalidate the pending insert job(s) into HNSW associated with this label
+            auto &insert_jobs = this->labelToInsertJobs.at(label);
+            for (auto *job : insert_jobs) {
+                job->id = this->setAndSaveInvalidJob(job);
+            }
+            num_deleted_vectors += insert_jobs.size();
+            // Remove the pending insert job(s) from the labelToInsertJobs mapping.
+            this->labelToInsertJobs.erase(label);
+            // Go over every id that corresponds to the label and remove it from the flat buffer.
+            // Every delete may cause a swap of the deleted id with the last id, and we return a
+            // mapping from id to the original id that resides in this id after the deletion(s) (see
+            // an example in this function implementation in MULTI index).
+            auto updated_ids = this->frontendIndex->deleteVectorAndGetUpdatedIds(label);
+            for (auto &it : updated_ids) {
+                idType prev_id = it.second.first;
+                labelType updated_vec_label = it.second.second;
+                this->updateInsertJobInternalId(prev_id, it.first, updated_vec_label);
+            }
+        }
+        this->flatIndexGuard.unlock();
+    } else {
+        this->flatIndexGuard.unlock_shared();
+    }
+
+    // Next, check if vector(s) are stored under the given label in HNSW and delete them as well.
+    // Note that we may remove the same vector that has been removed from the flat index, if it was
+    // being ingested at that time.
+    if (this->getWriteMode() == VecSim_WriteAsync) {
+        num_deleted_vectors += this->deleteLabelFromHNSW(label);
+        // Apply ready swap jobs if number of deleted vectors reached the threshold
+        // (under exclusive lock of the main index guard).
+        this->executeReadySwapJobs();
+    } else {
+        // delete in place.
+        this->mainIndexGuard.lock();
+        num_deleted_vectors += this->backendIndex->deleteVector(label);
+        this->mainIndexGuard.unlock();
+    }
+
+    return num_deleted_vectors;
+}
+
+// `getDistanceFrom` returns the minimum distance between the given blob and the vector with the
+// given label. If the label doesn't exist, the distance will be NaN.
+// Therefore, it's better to just call `getDistanceFrom` on both indexes and return the minimum
+// instead of checking if the label exists in each index. We first try to get the distance from the
+// flat buffer, as vectors in the buffer might move to the Main while we're "between" the locks.
+// Behavior for single (regular) index:
+// 1. label doesn't exist in both indexes - return NaN
+// 2. label exists in one of the indexes only - return the distance from that index (which is valid)
+// 3. label exists in both indexes - return the value from the flat buffer (which is valid and equal
+//    to the value from the Main index), saving us from locking the Main index.
+// Behavior for multi index:
+// 1. label doesn't exist in both indexes - return NaN
+// 2. label exists in one of the indexes only - return the distance from that index (which is valid)
+// 3. label exists in both indexes - we may have some of the vectors with the same label in the flat
+//    buffer only and some in the Main index only (and maybe temporal duplications).
+//    So, we get the distance from both indexes and return the minimum.
+template <typename DataType, typename DistType>
+double TieredHNSWIndex<DataType, DistType>::getDistanceFrom(labelType label,
+                                                            const void *blob) const {
+    // Try to get the distance from the flat buffer.
+    // If the label doesn't exist, the distance will be NaN.
+    this->flatIndexGuard.lock_shared();
+    auto flat_dist = this->frontendIndex->getDistanceFrom(label, blob);
+    this->flatIndexGuard.unlock_shared();
+
+    // Optimization. TODO: consider having different implementations for single and multi indexes,
+    // to avoid checking the index type on every query.
+    if (!this->backendIndex->isMultiValue() && !std::isnan(flat_dist)) {
+        // If the index is single value, and we got a valid distance from the flat buffer,
+        // we can return the distance without querying the Main index.
+        return flat_dist;
     }
+
+    // Try to get the distance from the Main index.
+    this->mainIndexGuard.lock_shared();
+    auto hnsw_dist = getHNSWIndex()->safeGetDistanceFrom(label, blob);
+    this->mainIndexGuard.unlock_shared();
+
+    // Return the minimum distance that is not NaN.
+    return std::fmin(flat_dist, hnsw_dist);
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+//  TieredHNSW_BatchIterator                                                                     //
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/******************** Ctor / Dtor *****************/
+
+// Defining special values for the hnsw_iterator field, to indicate if the iterator is uninitialized
+// or depleted when we don't have a valid iterator.
+#define UNINITIALIZED ((VecSimBatchIterator *)0)
+#define DEPLETED      ((VecSimBatchIterator *)1)
+
+template <typename DataType, typename DistType>
+TieredHNSWIndex<DataType, DistType>::TieredHNSW_BatchIterator::TieredHNSW_BatchIterator(
+    void *query_vector, const TieredHNSWIndex<DataType, DistType> *index,
+    VecSimQueryParams *queryParams, std::shared_ptr<VecSimAllocator> allocator)
+    : VecSimBatchIterator(query_vector, queryParams ? queryParams->timeoutCtx : nullptr,
+                          std::move(allocator)),
+      index(index), flat_results({0}), hnsw_results({0}),
+      flat_iterator(this->index->frontendIndex->newBatchIterator(query_vector, queryParams)),
+      hnsw_iterator(UNINITIALIZED), returned_results_set(this->allocator) {
+    // Save a copy of the query params to initialize the HNSW iterator with (on first batch and
+    // first batch after reset).
+    if (queryParams) {
+        this->queryParams =
+            (VecSimQueryParams *)this->allocator->allocate(sizeof(VecSimQueryParams));
+        *this->queryParams = *queryParams;
+    } else {
+        this->queryParams = nullptr;
+    }
+}
+
+template <typename DataType, typename DistType>
+TieredHNSWIndex<DataType, DistType>::TieredHNSW_BatchIterator::~TieredHNSW_BatchIterator() {
+    delete this->flat_iterator;
+
+    if (this->hnsw_iterator != UNINITIALIZED && this->hnsw_iterator != DEPLETED) {
+        delete this->hnsw_iterator;
+        this->index->mainIndexGuard.unlock_shared();
+    }
+
+    this->allocator->free_allocation(this->queryParams);
+
+    VecSimQueryResult_Free(this->flat_results);
+    VecSimQueryResult_Free(this->hnsw_results);
+}
+
+/******************** Implementation **************/
+
+template <typename DataType, typename DistType>
+VecSimQueryResult_List
+TieredHNSWIndex<DataType, DistType>::TieredHNSW_BatchIterator::getNextResults(
+    size_t n_res, VecSimQueryResult_Order order) {
+
+    const bool isMulti = this->index->backendIndex->isMultiValue();
+
+    if (this->hnsw_iterator == UNINITIALIZED) {
+        // First call to getNextResults. The call to the BF iterator will include calculating all
+        // the distances and access the BF index. We take the lock on this call.
+        this->index->flatIndexGuard.lock_shared();
+        this->flat_results = this->flat_iterator->getNextResults(n_res, BY_SCORE_THEN_ID);
+        this->index->flatIndexGuard.unlock_shared();
+        // This is also the only time `getNextResults` on the BF iterator can fail.
+        if (VecSim_OK != flat_results.code) {
+            return flat_results;
+        }
+        // We also take the lock on the main index on the first call to getNextResults, and we hold
+        // it until the iterator is depleted or freed.
+        this->index->mainIndexGuard.lock_shared();
+        this->hnsw_iterator =
+            this->index->backendIndex->newBatchIterator(getQueryBlob(), queryParams);
+        this->hnsw_results = this->hnsw_iterator->getNextResults(n_res, BY_SCORE_THEN_ID);
+        if (this->hnsw_iterator->isDepleted()) {
+            delete this->hnsw_iterator;
+            this->hnsw_iterator = DEPLETED;
+            this->index->mainIndexGuard.unlock_shared();
+        }
+    } else {
+        while (VecSimQueryResult_Len(this->flat_results) < n_res &&
+               !this->flat_iterator->isDepleted()) {
+            auto tail = this->flat_iterator->getNextResults(
+                n_res - VecSimQueryResult_Len(this->flat_results), BY_SCORE_THEN_ID);
+            concat_results(this->flat_results, tail);
+
+            if (!isMulti) {
+                // On single-value indexes, duplicates will never appear in the hnsw results before
+                // they appear in the flat results (at the same time or later if the approximation
+                // misses) so we don't need to try and filter the flat results (and recheck
+                // conditions).
+                break;
+            } else {
+                // On multi-value indexes, the flat results may contain results that are already
+                // returned from the hnsw index. We need to filter them out.
+                filter_irrelevant_results(this->flat_results);
+            }
+        }
+
+        auto code = VecSim_QueryResult_OK;
+        while (VecSimQueryResult_Len(this->hnsw_results) < n_res &&
+               this->hnsw_iterator != DEPLETED && code == VecSim_OK) {
+            auto tail = this->hnsw_iterator->getNextResults(
+                n_res - VecSimQueryResult_Len(this->hnsw_results), BY_SCORE_THEN_ID);
+            code = tail.code; // Set the hnsw_results code to the last `getNextResults` code.
+            // New batch may contain better results than the previous batch, so we need to merge.
+            // We don't expect duplications (hence the <false>), as the iterator guarantees that
+            // no result is returned twice.
+            this->hnsw_results = merge_result_lists<false>(this->hnsw_results, tail, n_res);
+            this->hnsw_results.code = code;
+            filter_irrelevant_results(this->hnsw_results);
+            if (this->hnsw_iterator->isDepleted()) {
+                delete this->hnsw_iterator;
+                this->hnsw_iterator = DEPLETED;
+                this->index->mainIndexGuard.unlock_shared();
+            }
+        }
+    }
+
+    if (VecSim_OK != hnsw_results.code) {
+        return {NULL, hnsw_results.code};
+    }
+
+    VecSimQueryResult_List batch;
+    if (isMulti)
+        batch = compute_current_batch<true>(n_res);
+    else
+        batch = compute_current_batch<false>(n_res);
+
+    if (order == BY_ID) {
+        sort_results_by_id(batch);
+    }
+    size_t batch_len = VecSimQueryResult_Len(batch);
+    this->updateResultsCount(batch_len);
+
+    return batch;
+}
+
+// DISCLAIMER: After the last batch, one of the iterators may report that it is not depleted,
+// while all of its remaining results were already returned from the other iterator.
+// (On single-value indexes, this can happen to the hnsw iterator only, on multi-value
+//  indexes, this can happen to both iterators).
+// The next call to `getNextResults` will return an empty batch, and then the iterators will
+// correctly report that they are depleted.
+template <typename DataType, typename DistType>
+bool TieredHNSWIndex<DataType, DistType>::TieredHNSW_BatchIterator::isDepleted() {
+    return VecSimQueryResult_Len(this->flat_results) == 0 && this->flat_iterator->isDepleted() &&
+           VecSimQueryResult_Len(this->hnsw_results) == 0 && this->hnsw_iterator == DEPLETED;
+}
+
+template <typename DataType, typename DistType>
+void TieredHNSWIndex<DataType, DistType>::TieredHNSW_BatchIterator::reset() {
+    if (this->hnsw_iterator != UNINITIALIZED && this->hnsw_iterator != DEPLETED) {
+        delete this->hnsw_iterator;
+        this->index->mainIndexGuard.unlock_shared();
+    }
+    this->resetResultsCount();
+    this->flat_iterator->reset();
+    this->hnsw_iterator = UNINITIALIZED;
+    VecSimQueryResult_Free(this->flat_results);
+    VecSimQueryResult_Free(this->hnsw_results);
+    this->flat_results = {0};
+    this->hnsw_results = {0};
+    returned_results_set.clear();
+}
+
+/****************** Helper Functions **************/
+
+// Builds the next batch by merging the buffered flat and HNSW results (`n_res` is forwarded to
+// the merge), updates `returned_results_set`, and pops the consumed results off both buffers.
+template <typename DataType, typename DistType>
+template <bool isMultiValue>
+VecSimQueryResult_List
+TieredHNSWIndex<DataType, DistType>::TieredHNSW_BatchIterator::compute_current_batch(size_t n_res) {
+    // Set pointers
+    auto bf_res = this->flat_results.results;
+    auto hnsw_res = this->hnsw_results.results;
+    const auto bf_end = bf_res + VecSimQueryResult_Len(this->flat_results);
+    const auto hnsw_end = hnsw_res + VecSimQueryResult_Len(this->hnsw_results);
+
+    // Merge results
+    // This call will update `hnsw_res` and `bf_res` to point to the end of the merged results.
+    // `isMultiValue` is a compile-time constant, so it is forwarded directly as the template
+    // argument of `merge_results` rather than being branched on at run time.
+    VecSimQueryResult *batch_res =
+        merge_results<isMultiValue>(hnsw_res, hnsw_end, bf_res, bf_end, n_res);
+
+    if (!isMultiValue) {
+        // If we're on a single-value index, update the set of results returned from the FLAT index
+        // before popping them, so that the HNSW index won't return them in later batches.
+        for (auto it = this->flat_results.results; it != bf_res; ++it) {
+            this->returned_results_set.insert(it->id);
+        }
+    } else {
+        // If we're on a multi-value index, update the set of results returned (from `batch_res`)
+        for (size_t i = 0; i < array_len(batch_res); ++i) {
+            this->returned_results_set.insert(batch_res[i].id);
+        }
+    }
+
+    // Update results
+    array_pop_front_n(this->flat_results.results, bf_res - this->flat_results.results);
+    array_pop_front_n(this->hnsw_results.results, hnsw_res - this->hnsw_results.results);
+
+    // clean up the results
+    // On multi-value indexes, one (or both) results lists may contain results that are already
+    // returned from the other list (with a different score). We need to filter them out.
+    if (isMultiValue) {
+        filter_irrelevant_results(this->flat_results);
+        filter_irrelevant_results(this->hnsw_results);
+    }
+
+    // Return current batch
+    return {batch_res, VecSim_QueryResult_OK};
+}
+
+template <typename DataType, typename DistType>
+void TieredHNSWIndex<DataType, DistType>::TieredHNSW_BatchIterator::filter_irrelevant_results(
+    VecSimQueryResult_List &rl) {
+    // Filter out results that were already returned.
+    auto it = rl.results;
+    auto end = it + VecSimQueryResult_Len(rl);
+    // Skip results that were not returned yet
+    while (it != end && this->returned_results_set.count(it->id) == 0) {
+        ++it;
+    }
+    // If none of the results were returned, return
+    if (it == end) {
+        return;
+    }
+    // Mark the current result as the first result to be filtered
+    auto cur_end = it;
+    ++it;
+    // "Append" all results that were not returned from the FLAT index
+    while (it != end) {
+        if (this->returned_results_set.count(it->id) == 0) {
+            *cur_end = *it;
+            ++cur_end;
+        }
+        ++it;
+    }
+    // Update number of results (pop the tail)
+    array_pop_back_n(rl.results, end - cur_end);
+}
+
+template <typename DataType, typename DistType>
+VecSimIndexInfo TieredHNSWIndex<DataType, DistType>::info() const {
+    auto info = VecSimTieredIndex<DataType, DistType>::info();
+
+    HnswTieredInfo hnswTieredInfo = {.pendingSwapJobsThreshold = this->pendingSwapJobsThreshold};
+    info.tieredInfo.specificTieredBackendInfo.hnswTieredInfo = hnswTieredInfo;
+
+    return info;
+}
+
+template <typename DataType, typename DistType>
+VecSimInfoIterator *TieredHNSWIndex<DataType, DistType>::infoIterator() const {
+    VecSimIndexInfo info = this->info();
+    // Get the base tiered fields.
+    auto *infoIterator = VecSimTieredIndex<DataType, DistType>::infoIterator();
+
+    // Tiered HNSW specific param.
+    infoIterator->addInfoField(VecSim_InfoField{
+        .fieldName = VecSimCommonStrings::TIERED_HNSW_SWAP_JOBS_THRESHOLD_STRING,
+        .fieldType = INFOFIELD_UINT64,
+        .fieldValue = {FieldValue{.uintegerValue = info.tieredInfo.specificTieredBackendInfo
+                                                       .hnswTieredInfo.pendingSwapJobsThreshold}}});
+
+    return infoIterator;
+}
+
+template <typename DataType, typename DistType>
+VecSimIndexBasicInfo TieredHNSWIndex<DataType, DistType>::basicInfo() const {
+    // Start from the backend index's basic info (block size included) and mark it as tiered.
+    VecSimIndexBasicInfo info = this->backendIndex->getBasicInfo();
+    info.isTiered = true;
+    info.algo = VecSimAlgo_HNSWLIB;
+    return info;
 };
diff --git a/src/VecSim/algorithms/hnsw/hnsw_tiered_tests_friends.h b/src/VecSim/algorithms/hnsw/hnsw_tiered_tests_friends.h
index 547573928..05aa14d8f 100644
--- a/src/VecSim/algorithms/hnsw/hnsw_tiered_tests_friends.h
+++ b/src/VecSim/algorithms/hnsw/hnsw_tiered_tests_friends.h
@@ -1,2 +1,49 @@
 #include "VecSim/friend_test_decl.h"
+INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTest)
 INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTest_CreateIndexInstance_Test)
+INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTest_addVector_Test)
+INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTest_manageIndexOwnership_Test)
+INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTest_insertJob_Test)
+INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTest_parallelSearch_Test)
+INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTest_parallelInsertSearch_Test)
+INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTest_deleteFromHNSWBasic_Test)
+INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTest_deleteFromHNSWWithRepairJobExec_Test)
+INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTest_manageIndexOwnershipWithPendingJobs_Test)
+INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTest_parallelInsertAdHoc_Test)
+INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTest_deleteVector_Test)
+INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTest_deleteVectorAndRepairAsync_Test)
+INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTest_alternateInsertDeleteAsync_Test)
+INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTest_swapJobBasic_Test)
+INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTest_swapJobBasic2_Test)
+INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTest_deleteVectorsAndSwapSync_Test)
+INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTest_BatchIterator_Test)
+INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTest_BatchIteratorAdvanced_Test)
+INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTest_BatchIteratorSize1_Test)
+INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTest_BatchIteratorReset_Test)
+INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTest_BatchIteratorWithOverlaps_Test)
+INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTest_parallelBatchIteratorSearch_Test)
+INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTest_testInfo_Test)
+INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTest_testInfoIterator_Test)
+INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTest_writeInPlaceMode_Test)
+INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTest_switchWriteModes_Test)
+INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTest_bufferLimit_Test)
+INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTest_bufferLimitAsync_Test)
+INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTest_RangeSearch_Test)
+INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTest_parallelRangeSearch_Test)
+
+INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTestBasic_insertJobAsync_Test)
+INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTestBasic_insertJobAsyncMulti_Test)
+INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTestBasic_KNNSearch_Test)
+INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTestBasic_MergeMulti_Test)
+INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTestBasic_deleteFromHNSWMulti_Test)
+INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTestBasic_deleteFromHNSWMultiLevels_Test)
+INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTestBasic_AdHocSingle_Test)
+INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTestBasic_AdHocMulti_Test)
+INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTestBasic_deleteVectorMulti_Test)
+INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTestBasic_deleteVectorMultiFromFlatAdvanced_Test)
+INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTestBasic_BatchIteratorWithOverlaps_SpacialMultiCases_Test)
+INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTestBasic_deleteVectorMulti_Test)
+INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTestBasic_deleteVectorMultiFromFlatAdvanced_Test)
+INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTestBasic_overwriteVectorBasic_Test)
+INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTestBasic_overwriteVectorAsync_Test)
+INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTestBasic_preferAdHocOptimization_Test)
diff --git a/src/VecSim/algorithms/hnsw/visited_nodes_handler.cpp b/src/VecSim/algorithms/hnsw/visited_nodes_handler.cpp
index 2ef765321..3f4f48ad1 100644
--- a/src/VecSim/algorithms/hnsw/visited_nodes_handler.cpp
+++ b/src/VecSim/algorithms/hnsw/visited_nodes_handler.cpp
@@ -41,11 +41,11 @@ VisitedNodesHandler::~VisitedNodesHandler() { allocator->free_allocation(element
 /**
  * VisitedNodesHandlerPool methods to enable parallel graph scans.
  */
-VisitedNodesHandlerPool::VisitedNodesHandlerPool(int initial_pool_size, int cap,
+VisitedNodesHandlerPool::VisitedNodesHandlerPool(size_t initial_pool_size, int cap,
                                                  const std::shared_ptr<VecSimAllocator> &allocator)
     : VecsimBaseObject(allocator), pool(initial_pool_size, allocator), num_elements(cap),
       total_handlers_in_use(1) {
-    for (int i = 0; i < initial_pool_size; i++)
+    for (size_t i = 0; i < initial_pool_size; i++)
         pool[i] = new (allocator) VisitedNodesHandler(cap, allocator);
 }
 
diff --git a/src/VecSim/algorithms/hnsw/visited_nodes_handler.h b/src/VecSim/algorithms/hnsw/visited_nodes_handler.h
index 7dcdbb97a..bbeae9a2c 100644
--- a/src/VecSim/algorithms/hnsw/visited_nodes_handler.h
+++ b/src/VecSim/algorithms/hnsw/visited_nodes_handler.h
@@ -58,7 +58,7 @@ class VisitedNodesHandlerPool : public VecsimBaseObject {
     unsigned short total_handlers_in_use;
 
 public:
-    VisitedNodesHandlerPool(int initial_pool_size, int cap,
+    VisitedNodesHandlerPool(size_t initial_pool_size, int cap,
                             const std::shared_ptr<VecSimAllocator> &allocator);
 
     VisitedNodesHandler *getAvailableVisitedNodesHandler();
diff --git a/src/VecSim/batch_iterator.h b/src/VecSim/batch_iterator.h
index a14413eb6..0dcba7242 100644
--- a/src/VecSim/batch_iterator.h
+++ b/src/VecSim/batch_iterator.h
@@ -12,6 +12,8 @@
 /**
  * An abstract class for performing search in batches. Every index type should implement its own
  * batch iterator class.
+ * A batch iterator instance is NOT meant to be shared between threads. The iterated index may be
+ * shared, and in that case the iterator should be able to iterate it concurrently and safely.
  */
 struct VecSimBatchIterator : public VecsimBaseObject {
 private:
diff --git a/src/VecSim/index_factories/brute_force_factory.cpp b/src/VecSim/index_factories/brute_force_factory.cpp
new file mode 100644
index 000000000..7cdcf9bbe
--- /dev/null
+++ b/src/VecSim/index_factories/brute_force_factory.cpp
@@ -0,0 +1,96 @@
+/*
+ *Copyright Redis Ltd. 2021 - present
+ *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or
+ *the Server Side Public License v1 (SSPLv1).
+ */
+
+#include "VecSim/index_factories/brute_force_factory.h"
+#include "VecSim/algorithms/brute_force/brute_force.h"
+#include "VecSim/algorithms/brute_force/brute_force_single.h"
+#include "VecSim/algorithms/brute_force/brute_force_multi.h"
+
+namespace BruteForceFactory {
+template <typename DataType, typename DistType = DataType>
+inline VecSimIndex *
+NewIndex_ChooseMultiOrSingle(const BFParams *params,
+                             const AbstractIndexInitParams &abstractInitParams) {
+    // Create the multi-value or single-value brute-force index, depending on params->multi.
+    // The allocator from abstractInitParams is handed to operator new for the index object.
+    if (params->multi)
+        return new (abstractInitParams.allocator)
+            BruteForceIndex_Multi<DataType, DistType>(params, abstractInitParams);
+    else
+        return new (abstractInitParams.allocator)
+            BruteForceIndex_Single<DataType, DistType>(params, abstractInitParams);
+}
+
+static AbstractIndexInitParams NewAbstractInitParams(const VecSimParams *params) {
+    // Collect the generic init settings from the BF params, plus a newly created allocator.
+    const BFParams *bfParams = &params->bfParams;
+    AbstractIndexInitParams abstractInitParams = {.allocator =
+                                                      VecSimAllocator::newVecsimAllocator(),
+                                                  .dim = bfParams->dim,
+                                                  .vecType = bfParams->type,
+                                                  .metric = bfParams->metric,
+                                                  .blockSize = bfParams->blockSize,
+                                                  .multi = bfParams->multi,
+                                                  .logCtx = params->logCtx};
+    return abstractInitParams;
+}
+
+VecSimIndex *NewIndex(const VecSimParams *params) {
+    const BFParams *bfParams = &params->bfParams;
+    // Build the init params (and their allocator) once and pass them straight to the index.
+    return NewIndex(bfParams, NewAbstractInitParams(params));
+}
+
+VecSimIndex *NewIndex(const BFParams *bfparams, const AbstractIndexInitParams &abstractInitParams) {
+    if (bfparams->type == VecSimType_FLOAT32) {
+        return NewIndex_ChooseMultiOrSingle<float>(bfparams, abstractInitParams);
+    } else if (bfparams->type == VecSimType_FLOAT64) {
+        return NewIndex_ChooseMultiOrSingle<double>(bfparams, abstractInitParams);
+    }
+    // Only FLOAT32 (float) and FLOAT64 (double) are handled above; any other type value is
+    // unsupported here, so no index is created.
+    return NULL;
+}
+
+VecSimIndex *NewIndex(const BFParams *bfparams) {
+    VecSimParams params = {.bfParams = *bfparams}; // only bfParams is set explicitly
+    return NewIndex(&params); // delegate to the VecSimParams overload
+}
+
+template <typename DataType, typename DistType = DataType>
+inline size_t EstimateInitialSize_ChooseMultiOrSingle(bool is_multi) {
+    // check if single or multi and return the size of the matching class struct.
+    if (is_multi)
+        return sizeof(BruteForceIndex_Multi<DataType, DistType>);
+    else
+        return sizeof(BruteForceIndex_Single<DataType, DistType>);
+}
+
+size_t EstimateInitialSize(const BFParams *params) {
+    // Per-allocation bookkeeping overhead reported by the allocator.
+    size_t allocations_overhead = VecSimAllocator::getAllocationOverheadSize();
+
+    // Constant part (not affected by parameters).
+    size_t est = sizeof(VecSimAllocator) + allocations_overhead;
+    // Size of the concrete index object for the requested data type and multi/single flavour.
+    if (params->type == VecSimType_FLOAT32) {
+        est += EstimateInitialSize_ChooseMultiOrSingle<float>(params->multi);
+    } else if (params->type == VecSimType_FLOAT64) {
+        est += EstimateInitialSize_ChooseMultiOrSingle<double>(params->multi);
+    }
+    // Parameters related part: one labelType slot per unit of initialCapacity, plus the
+    // overhead of that single allocation; nothing is added when initialCapacity is 0.
+    if (params->initialCapacity) {
+        est += params->initialCapacity * sizeof(labelType) + allocations_overhead;
+    }
+
+    return est;
+}
+
+size_t EstimateElementSize(const BFParams *params) { // vector bytes (dim * type size) + label
+    return params->dim * VecSimType_sizeof(params->type) + sizeof(labelType);
+}
+}; // namespace BruteForceFactory
diff --git a/src/VecSim/algorithms/brute_force/brute_force_factory.h b/src/VecSim/index_factories/brute_force_factory.h
similarity index 66%
rename from src/VecSim/algorithms/brute_force/brute_force_factory.h
rename to src/VecSim/index_factories/brute_force_factory.h
index 37a9c25af..ff1a0ec24 100644
--- a/src/VecSim/algorithms/brute_force/brute_force_factory.h
+++ b/src/VecSim/index_factories/brute_force_factory.h
@@ -12,9 +12,14 @@
 #include "VecSim/vec_sim.h"              //typedef VecSimIndex
 #include "VecSim/vec_sim_common.h"       // BFParams
 #include "VecSim/memory/vecsim_malloc.h" // VecSimAllocator
+#include "VecSim/vec_sim_index.h"
 
 namespace BruteForceFactory {
-VecSimIndex *NewIndex(const BFParams *params, std::shared_ptr<VecSimAllocator> allocator);
+// Overloading the NewIndex function to support different parameters
+VecSimIndex *NewIndex(const VecSimParams *params);
+VecSimIndex *NewIndex(const BFParams *bfparams);
+VecSimIndex *NewIndex(const BFParams *bfparams, const AbstractIndexInitParams &abstractInitParams);
 size_t EstimateInitialSize(const BFParams *params);
 size_t EstimateElementSize(const BFParams *params);
+
 }; // namespace BruteForceFactory
diff --git a/src/VecSim/algorithms/hnsw/hnsw_factory.cpp b/src/VecSim/index_factories/hnsw_factory.cpp
similarity index 61%
rename from src/VecSim/algorithms/hnsw/hnsw_factory.cpp
rename to src/VecSim/index_factories/hnsw_factory.cpp
index 4bd77a84f..98bece820 100644
--- a/src/VecSim/algorithms/hnsw/hnsw_factory.cpp
+++ b/src/VecSim/index_factories/hnsw_factory.cpp
@@ -6,36 +6,58 @@
 
 #include "VecSim/algorithms/hnsw/hnsw_single.h"
 #include "VecSim/algorithms/hnsw/hnsw_multi.h"
-#include "VecSim/algorithms/hnsw/hnsw_factory.h"
+#include "VecSim/index_factories/hnsw_factory.h"
 #include "VecSim/algorithms/hnsw/hnsw.h"
-#include "hnsw_tiered.h"
 
 namespace HNSWFactory {
 
 template <typename DataType, typename DistType = DataType>
 inline HNSWIndex<DataType, DistType> *
-NewIndex_ChooseMultiOrSingle(const HNSWParams *params, std::shared_ptr<VecSimAllocator> allocator) {
+NewIndex_ChooseMultiOrSingle(const HNSWParams *params,
+                             const AbstractIndexInitParams &abstractInitParams) {
     // check if single and return new hnsw_index
     if (params->multi)
-        return new (allocator) HNSWIndex_Multi<DataType, DistType>(params, allocator);
+        return new (abstractInitParams.allocator)
+            HNSWIndex_Multi<DataType, DistType>(params, abstractInitParams);
     else
-        return new (allocator) HNSWIndex_Single<DataType, DistType>(params, allocator);
+        return new (abstractInitParams.allocator)
+            HNSWIndex_Single<DataType, DistType>(params, abstractInitParams);
 }
 
-VecSimIndex *NewIndex(const HNSWParams *params, std::shared_ptr<VecSimAllocator> allocator) {
-    if (params->type == VecSimType_FLOAT32) {
-        return NewIndex_ChooseMultiOrSingle<float>(params, allocator);
-    } else if (params->type == VecSimType_FLOAT64) {
-        return NewIndex_ChooseMultiOrSingle<double>(params, allocator);
+static AbstractIndexInitParams NewAbstractInitParams(const VecSimParams *params) {
+    const HNSWParams *hnswParams = &params->hnswParams; // source of the generic settings
+    AbstractIndexInitParams abstractInitParams = {.allocator =
+                                                      VecSimAllocator::newVecsimAllocator(),
+                                                  .dim = hnswParams->dim,
+                                                  .vecType = hnswParams->type,
+                                                  .metric = hnswParams->metric,
+                                                  .blockSize = hnswParams->blockSize,
+                                                  .multi = hnswParams->multi,
+                                                  .logCtx = params->logCtx};
+    return abstractInitParams; // carries a newly created allocator
+}
+
+VecSimIndex *NewIndex(const VecSimParams *params) {
+    const HNSWParams *hnswParams = &params->hnswParams;
+    AbstractIndexInitParams abstractInitParams = NewAbstractInitParams(params);
+    if (hnswParams->type == VecSimType_FLOAT32) {
+        return NewIndex_ChooseMultiOrSingle<float>(hnswParams, abstractInitParams);
+    } else if (hnswParams->type == VecSimType_FLOAT64) {
+        return NewIndex_ChooseMultiOrSingle<double>(hnswParams, abstractInitParams);
     }
 
     // If we got here something is wrong.
     return NULL;
 }
 
+VecSimIndex *NewIndex(const HNSWParams *params) {
+    VecSimParams vecSimParams = {.hnswParams = *params}; // only hnswParams is set explicitly
+    return NewIndex(&vecSimParams); // delegate to the VecSimParams overload
+}
+
 template <typename DataType, typename DistType = DataType>
 inline size_t EstimateInitialSize_ChooseMultiOrSingle(bool is_multi) {
-    // check if single and return new bf_index
+    // check if single or multi and return the size of the matching class struct.
     if (is_multi)
         return sizeof(HNSWIndex_Multi<DataType, DistType>);
     else
@@ -45,7 +67,9 @@ inline size_t EstimateInitialSize_ChooseMultiOrSingle(bool is_multi) {
 size_t EstimateInitialSize(const HNSWParams *params) {
     size_t M = (params->M) ? params->M : HNSW_DEFAULT_M;
 
-    size_t est = sizeof(VecSimAllocator) + sizeof(size_t);
+    size_t allocations_overhead = VecSimAllocator::getAllocationOverheadSize();
+
+    size_t est = sizeof(VecSimAllocator) + allocations_overhead;
     if (params->type == VecSimType_FLOAT32) {
         est += EstimateInitialSize_ChooseMultiOrSingle<float>(params->multi);
     } else if (params->type == VecSimType_FLOAT64) {
@@ -53,36 +77,41 @@ size_t EstimateInitialSize(const HNSWParams *params) {
     }
 
     // Account for the visited nodes pool (assume that it holds one pointer to a handler).
-    est += sizeof(VisitedNodesHandler) + sizeof(size_t);
+    est += sizeof(VisitedNodesHandler) + allocations_overhead;
     // The visited nodes pool inner vector buffer (contains one pointer).
-    est += sizeof(void *) + sizeof(size_t);
-    est += sizeof(tag_t) * params->initialCapacity + sizeof(size_t); // visited nodes array
+    est += sizeof(void *) + allocations_overhead;
+    est += sizeof(tag_t) * params->initialCapacity + allocations_overhead; // visited nodes array
 
     // Implicit allocation calls - allocates memory + a header only with positive capacity.
     if (params->initialCapacity) {
-        est += sizeof(size_t) * params->initialCapacity + sizeof(size_t); // element level
+        est += sizeof(size_t) * params->initialCapacity + allocations_overhead; // element level
         est += sizeof(size_t) * params->initialCapacity +
-               sizeof(size_t); // Labels lookup hash table buckets.
+               allocations_overhead; // Labels lookup hash table buckets.
+        est +=
+            sizeof(std::mutex) * params->initialCapacity + allocations_overhead; // lock per vector
     }
 
     // Explicit allocation calls - always allocate a header.
-    est += sizeof(void *) * params->initialCapacity + sizeof(size_t); // link lists (for levels > 0)
+    est += sizeof(void *) * params->initialCapacity +
+           allocations_overhead; // link lists (for levels > 0)
 
     size_t size_links_level0 =
         sizeof(elementFlags) + sizeof(linkListSize) + M * 2 * sizeof(idType) + sizeof(void *);
     size_t size_total_data_per_element =
         size_links_level0 + params->dim * VecSimType_sizeof(params->type) + sizeof(labelType);
-    est += params->initialCapacity * size_total_data_per_element + sizeof(size_t);
+    est += params->initialCapacity * size_total_data_per_element + allocations_overhead;
 
     return est;
 }
 
 size_t EstimateElementSize(const HNSWParams *params) {
+    size_t allocations_overhead = VecSimAllocator::getAllocationOverheadSize();
+
     size_t M = (params->M) ? params->M : HNSW_DEFAULT_M;
     size_t size_links_level0 = sizeof(linkListSize) + M * 2 * sizeof(idType) + sizeof(void *) +
-                               sizeof(vecsim_stl::vector<idType>);
+                               sizeof(vecsim_stl::vector<idType>) + allocations_overhead;
     size_t size_links_higher_level = sizeof(linkListSize) + M * sizeof(idType) + sizeof(void *) +
-                                     sizeof(vecsim_stl::vector<idType>);
+                                     sizeof(vecsim_stl::vector<idType>) + allocations_overhead;
     // The Expectancy for the random variable which is the number of levels per element equals
     // 1/ln(M). Since the max_level is rounded to the "floor" integer, the actual average number
     // of levels is lower (intuitively, we "loose" a level every time the random generated number
@@ -96,25 +125,34 @@ size_t EstimateElementSize(const HNSWParams *params) {
                                          sizeof(labelType);
 
     size_t size_label_lookup_node;
+    std::shared_ptr<VecSimAllocator> allocator = VecSimAllocator::newVecsimAllocator();
+
     if (params->multi) {
-        // For each new insertion (of a new label), we add a new node to the label_lookup_ map,
-        // and a new element to the vector in the map. These two allocations both results in a new
-        // allocation and therefore another VecSimAllocator::allocation_header_size.
-        size_label_lookup_node =
-            sizeof(vecsim_stl::unordered_map<labelType, vecsim_stl::vector<idType>>::value_type) +
-            sizeof(size_t) + sizeof(vecsim_stl::vector<idType>::value_type) + sizeof(size_t);
+        auto dummy_lookup =
+            vecsim_stl::unordered_map<labelType, vecsim_stl::vector<idType>>(1, allocator);
+        size_t memory_before = allocator->getAllocationSize();
+        // For each new insertion (of a new label), we add a new node to the label_lookup_ map.
+        dummy_lookup.emplace(0, vecsim_stl::vector<idType>{allocator});
+        // In addition, a new element to the vector in the map.
+        dummy_lookup.at(0).push_back(0);
+        size_t memory_after = allocator->getAllocationSize();
+        // The difference covers both the new map node and the vector's first element.
+
+        size_label_lookup_node = memory_after - memory_before;
     } else {
-        // For each new insertion (of a new label), we add a new node to the label_lookup_ map. This
-        // results in a new allocation and therefore another VecSimAllocator::allocation_header_size
-        // plus an internal pointer
-        size_label_lookup_node = sizeof(vecsim_stl::unordered_map<labelType, idType>::value_type) +
-                                 sizeof(size_t) + sizeof(size_t);
+        auto dummy_lookup = vecsim_stl::unordered_map<size_t, unsigned int>(1, allocator);
+        size_t memory_before = allocator->getAllocationSize();
+        // For each new insertion (of a new label), we add a new node to the label_lookup_ map.
+        dummy_lookup.insert({1, 1}); // Insert a dummy {key, value} element pair.
+        size_t memory_after = allocator->getAllocationSize();
+        size_label_lookup_node = memory_after - memory_before;
     }
 
     // 1 entry in visited nodes + 1 entry in element levels + (approximately) 1 bucket in labels
     // lookup hash map.
     size_t size_meta_data =
         sizeof(tag_t) + sizeof(size_t) + sizeof(size_t) + size_label_lookup_node;
+    size_t size_lock = sizeof(std::mutex);
 
     /* Disclaimer: we are neglecting two additional factors that consume memory:
      * 1. The overall bucket size in labels_lookup hash table is usually higher than the number of
@@ -123,38 +161,23 @@ size_t EstimateElementSize(const HNSWParams *params) {
      * 2. The incoming edges that aren't bidirectional are stored in a dynamic array
      * (vecsim_stl::vector) Those edges' memory *is omitted completely* from this estimation.
      */
-    return size_meta_data + size_total_data_per_element;
-}
-
-VecSimIndex *NewTieredIndex(const TieredHNSWParams *params,
-                            std::shared_ptr<VecSimAllocator> allocator) {
-    if (params->hnswParams.type == VecSimType_FLOAT32) {
-        auto *hnsw_index =
-            NewIndex_ChooseMultiOrSingle<float, float>(&params->hnswParams, allocator);
-        return new (allocator) TieredHNSWIndex<float, float>(hnsw_index, params->tieredParams);
-    } else if (params->hnswParams.type == VecSimType_FLOAT64) {
-        auto *hnsw_index =
-            NewIndex_ChooseMultiOrSingle<double, double>(&params->hnswParams, allocator);
-        return new (allocator) TieredHNSWIndex<double, double>(hnsw_index, params->tieredParams);
-    } else {
-        return nullptr; // Invalid type
-    }
+    return size_meta_data + size_total_data_per_element + size_lock;
 }
 
 #ifdef BUILD_TESTS
 
 template <typename DataType, typename DistType = DataType>
 inline VecSimIndex *NewIndex_ChooseMultiOrSingle(std::ifstream &input, const HNSWParams *params,
-                                                 std::shared_ptr<VecSimAllocator> allocator,
+                                                 const AbstractIndexInitParams &abstractInitParams,
                                                  Serializer::EncodingVersion version) {
     HNSWIndex<DataType, DistType> *index = nullptr;
     // check if single and call the ctor that loads index information from file.
     if (params->multi)
-        index =
-            new (allocator) HNSWIndex_Multi<DataType, DistType>(input, params, allocator, version);
+        index = new (abstractInitParams.allocator)
+            HNSWIndex_Multi<DataType, DistType>(input, params, abstractInitParams, version);
     else
-        index =
-            new (allocator) HNSWIndex_Single<DataType, DistType>(input, params, allocator, version);
+        index = new (abstractInitParams.allocator)
+            HNSWIndex_Single<DataType, DistType>(input, params, abstractInitParams, version);
 
     index->restoreGraph(input);
 
@@ -214,12 +237,12 @@ VecSimIndex *NewIndex(const std::string &location, const HNSWParams *v1_params)
     }
     Serializer::readBinaryPOD(input, params.initialCapacity);
 
-    std::shared_ptr<VecSimAllocator> allocator = VecSimAllocator::newVecsimAllocator();
-
+    VecSimParams vecsimParams = {.algo = VecSimAlgo_HNSWLIB, .hnswParams = params};
+    AbstractIndexInitParams abstractInitParams = NewAbstractInitParams(&vecsimParams);
     if (params.type == VecSimType_FLOAT32) {
-        return NewIndex_ChooseMultiOrSingle<float>(input, &params, allocator, version);
+        return NewIndex_ChooseMultiOrSingle<float>(input, &params, abstractInitParams, version);
     } else if (params.type == VecSimType_FLOAT64) {
-        return NewIndex_ChooseMultiOrSingle<double>(input, &params, allocator, version);
+        return NewIndex_ChooseMultiOrSingle<double>(input, &params, abstractInitParams, version);
     } else {
         throw std::runtime_error("Cannot load index: bad index data type");
     }
diff --git a/src/VecSim/algorithms/hnsw/hnsw_factory.h b/src/VecSim/index_factories/hnsw_factory.h
similarity index 81%
rename from src/VecSim/algorithms/hnsw/hnsw_factory.h
rename to src/VecSim/index_factories/hnsw_factory.h
index ef865c6d4..435ed688e 100644
--- a/src/VecSim/algorithms/hnsw/hnsw_factory.h
+++ b/src/VecSim/index_factories/hnsw_factory.h
@@ -12,12 +12,12 @@
 #include "VecSim/vec_sim.h"              //typedef VecSimIndex
 #include "VecSim/vec_sim_common.h"       // HNSWParams
 #include "VecSim/memory/vecsim_malloc.h" // VecSimAllocator
+#include "VecSim/vec_sim_index.h"
 
 namespace HNSWFactory {
 
-VecSimIndex *NewIndex(const HNSWParams *params, std::shared_ptr<VecSimAllocator> allocator);
-VecSimIndex *NewTieredIndex(const TieredHNSWParams *params,
-                            std::shared_ptr<VecSimAllocator> allocator);
+VecSimIndex *NewIndex(const VecSimParams *params);
+VecSimIndex *NewIndex(const HNSWParams *params);
 size_t EstimateInitialSize(const HNSWParams *params);
 size_t EstimateElementSize(const HNSWParams *params);
 
diff --git a/src/VecSim/index_factories/index_factory.cpp b/src/VecSim/index_factories/index_factory.cpp
new file mode 100644
index 000000000..93f429778
--- /dev/null
+++ b/src/VecSim/index_factories/index_factory.cpp
@@ -0,0 +1,63 @@
+/*
+ *Copyright Redis Ltd. 2021 - present
+ *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or
+ *the Server Side Public License v1 (SSPLv1).
+ */
+
+#include "index_factory.h"
+#include "hnsw_factory.h"
+#include "brute_force_factory.h"
+#include "tiered_factory.h"
+#include "VecSim/vec_sim_index.h"
+
+namespace VecSimFactory {
+VecSimIndex *NewIndex(const VecSimParams *params) {
+    // Each algorithm-specific factory creates its own allocator, so none is created here.
+    VecSimIndex *index = nullptr;
+    try {
+        switch (params->algo) {
+        case VecSimAlgo_HNSWLIB: {
+            index = HNSWFactory::NewIndex(params);
+            break;
+        }
+
+        case VecSimAlgo_BF: {
+            index = BruteForceFactory::NewIndex(params);
+            break;
+        }
+        case VecSimAlgo_TIERED: {
+            index = TieredFactory::NewIndex(&params->tieredParams);
+            break;
+        }
+        }
+    } catch (...) {
+        // Index will delete itself. For now, do nothing; the caller receives a null index.
+    }
+    return index;
+}
+
+size_t EstimateInitialSize(const VecSimParams *params) {
+    switch (params->algo) { // forward to the factory that matches the requested algorithm
+    case VecSimAlgo_HNSWLIB:
+        return HNSWFactory::EstimateInitialSize(&params->hnswParams);
+    case VecSimAlgo_BF:
+        return BruteForceFactory::EstimateInitialSize(&params->bfParams);
+    case VecSimAlgo_TIERED:
+        return TieredFactory::EstimateInitialSize(&params->tieredParams);
+    }
+    return -1; // unrecognized algo; -1 converts to the maximum size_t value
+}
+
+size_t EstimateElementSize(const VecSimParams *params) {
+    switch (params->algo) { // forward to the factory that matches the requested algorithm
+    case VecSimAlgo_HNSWLIB:
+        return HNSWFactory::EstimateElementSize(&params->hnswParams);
+    case VecSimAlgo_BF:
+        return BruteForceFactory::EstimateElementSize(&params->bfParams);
+    case VecSimAlgo_TIERED:
+        return TieredFactory::EstimateElementSize(&params->tieredParams);
+    }
+    return -1; // unrecognized algo; -1 converts to the maximum size_t value
+}
+
+} // namespace VecSimFactory
diff --git a/src/VecSim/index_factories/index_factory.h b/src/VecSim/index_factories/index_factory.h
new file mode 100644
index 000000000..53c691b07
--- /dev/null
+++ b/src/VecSim/index_factories/index_factory.h
@@ -0,0 +1,16 @@
+/*
+ *Copyright Redis Ltd. 2021 - present
+ *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or
+ *the Server Side Public License v1 (SSPLv1).
+ */
+#pragma once
+
+#include "VecSim/vec_sim.h"
+#include "VecSim/vec_sim_common.h"
+#include "VecSim/memory/vecsim_malloc.h"
+
+namespace VecSimFactory {
+VecSimIndex *NewIndex(const VecSimParams *params);
+size_t EstimateInitialSize(const VecSimParams *params);
+size_t EstimateElementSize(const VecSimParams *params);
+}; // namespace VecSimFactory
diff --git a/src/VecSim/index_factories/tiered_factory.cpp b/src/VecSim/index_factories/tiered_factory.cpp
new file mode 100644
index 000000000..b47ddbd12
--- /dev/null
+++ b/src/VecSim/index_factories/tiered_factory.cpp
@@ -0,0 +1,116 @@
+/*
+ *Copyright Redis Ltd. 2021 - present
+ *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or
+ *the Server Side Public License v1 (SSPLv1).
+ */
+
+#include "VecSim/index_factories/tiered_factory.h"
+#include "VecSim/index_factories/hnsw_factory.h"
+#include "VecSim/index_factories/brute_force_factory.h"
+
+#include "VecSim/algorithms/hnsw/hnsw_tiered.h"
+
+namespace TieredFactory {
+
+namespace TieredHNSWFactory {
+template <typename DataType, typename DistType = DataType>
+inline VecSimIndex *NewIndex(const TieredIndexParams *params) {
+    // Build the HNSW backend and a brute-force frontend, then wrap both in a tiered index.
+    // initialize hnsw index; static_cast performs a proper downcast from VecSimIndex *
+    auto *hnsw_index = static_cast<HNSWIndex<DataType, DistType> *>(
+        HNSWFactory::NewIndex(params->primaryIndexParams));
+    // initialize brute force index, mirroring the HNSW type, dim, metric, multi and block size
+
+    BFParams bf_params = {.type = params->primaryIndexParams->hnswParams.type,
+                          .dim = params->primaryIndexParams->hnswParams.dim,
+                          .metric = params->primaryIndexParams->hnswParams.metric,
+                          .multi = params->primaryIndexParams->hnswParams.multi,
+                          .blockSize = params->primaryIndexParams->hnswParams.blockSize};
+
+    std::shared_ptr<VecSimAllocator> flat_allocator = VecSimAllocator::newVecsimAllocator();
+    AbstractIndexInitParams abstractInitParams = {.allocator = flat_allocator,
+                                                  .dim = bf_params.dim,
+                                                  .vecType = bf_params.type,
+                                                  .metric = bf_params.metric,
+                                                  .blockSize = bf_params.blockSize,
+                                                  .multi = bf_params.multi,
+                                                  .logCtx = params->primaryIndexParams->logCtx};
+    auto frontendIndex = static_cast<BruteForceIndex<DataType, DistType> *>(
+        BruteForceFactory::NewIndex(&bf_params, abstractInitParams));
+
+    // Create new tiered hnsw index (the management layer gets an allocator of its own)
+    std::shared_ptr<VecSimAllocator> management_layer_allocator =
+        VecSimAllocator::newVecsimAllocator();
+
+    return new (management_layer_allocator) TieredHNSWIndex<DataType, DistType>(
+        hnsw_index, frontendIndex, *params, management_layer_allocator);
+}
+
+inline size_t EstimateInitialSize(const TieredIndexParams *params, BFParams &bf_params_output) {
+    HNSWParams hnsw_params = params->primaryIndexParams->hnswParams;
+    // Estimates the HNSW index plus the tiered management layer; no brute-force size is added.
+    // Start from the size estimation of the primary (HNSW) sub index.
+    size_t est = HNSWFactory::EstimateInitialSize(&hnsw_params);
+
+    // Management layer allocator overhead.
+    size_t allocations_overhead = VecSimAllocator::getAllocationOverheadSize();
+    est += sizeof(VecSimAllocator) + allocations_overhead;
+
+    // Size of the TieredHNSWIndex struct.
+    if (hnsw_params.type == VecSimType_FLOAT32) {
+        est += sizeof(TieredHNSWIndex<float, float>);
+    } else if (hnsw_params.type == VecSimType_FLOAT64) {
+        est += sizeof(TieredHNSWIndex<double, double>);
+    }
+    bf_params_output.type = hnsw_params.type; // out-param: only type and multi are filled in
+    bf_params_output.multi = hnsw_params.multi;
+
+    return est;
+}
+
+VecSimIndex *NewIndex(const TieredIndexParams *params) {
+    // Dispatch on the primary HNSW index's vector type to the matching template instantiation.
+    VecSimType type = params->primaryIndexParams->hnswParams.type;
+    if (type == VecSimType_FLOAT32) {
+        return TieredHNSWFactory::NewIndex<float>(params);
+    } else if (type == VecSimType_FLOAT64) {
+        return TieredHNSWFactory::NewIndex<double>(params);
+    }
+    return nullptr; // Invalid type.
+}
+} // namespace TieredHNSWFactory
+
+VecSimIndex *NewIndex(const TieredIndexParams *params) {
+    // Only an HNSW primary index is handled; dispatch on its vector type.
+    if (params->primaryIndexParams->algo == VecSimAlgo_HNSWLIB) {
+        VecSimType type = params->primaryIndexParams->hnswParams.type;
+        if (type == VecSimType_FLOAT32) {
+            return TieredHNSWFactory::NewIndex<float>(params);
+        } else if (type == VecSimType_FLOAT64) {
+            return TieredHNSWFactory::NewIndex<double>(params);
+        }
+    }
+    return nullptr; // Invalid algorithm or type.
+}
+size_t EstimateInitialSize(const TieredIndexParams *params) {
+    // Sum of the primary-index/management estimate and the brute-force buffer estimate.
+    size_t est = 0;
+    // bf_params is filled by the HNSW estimator (type and multi only); else it stays zeroed.
+    BFParams bf_params{};
+    if (params->primaryIndexParams->algo == VecSimAlgo_HNSWLIB) {
+        est += TieredHNSWFactory::EstimateInitialSize(params, bf_params);
+    }
+
+    est += BruteForceFactory::EstimateInitialSize(&bf_params);
+    return est;
+}
+
+size_t EstimateElementSize(const TieredIndexParams *params) {
+    size_t est = 0; // stays 0 when the primary index is not HNSW
+    if (params->primaryIndexParams->algo == VecSimAlgo_HNSWLIB) {
+        est = HNSWFactory::EstimateElementSize(&params->primaryIndexParams->hnswParams);
+    }
+    return est;
+}
+
+}; // namespace TieredFactory
diff --git a/src/VecSim/index_factories/tiered_factory.h b/src/VecSim/index_factories/tiered_factory.h
new file mode 100644
index 000000000..65e2817cb
--- /dev/null
+++ b/src/VecSim/index_factories/tiered_factory.h
@@ -0,0 +1,23 @@
+/*
+ *Copyright Redis Ltd. 2021 - present
+ *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or
+ *the Server Side Public License v1 (SSPLv1).
+ */
+#pragma once
+
+#include "VecSim/vec_sim.h"
+#include "VecSim/vec_sim_common.h"
+#include "VecSim/memory/vecsim_malloc.h"
+#include "VecSim/vec_sim_index.h"
+
+namespace TieredFactory {
+
+VecSimIndex *NewIndex(const TieredIndexParams *params);
+
+// The size estimation is the sum of the buffer (brute force) and main index initial sizes
+// estimations, plus the tiered index class size. Note it does not include the size of internal
+// containers such as the job queue, as those depend on the user implementation.
+size_t EstimateInitialSize(const TieredIndexParams *params);
+size_t EstimateElementSize(const TieredIndexParams *params);
+
+}; // namespace TieredFactory
diff --git a/src/VecSim/info_iterator.h b/src/VecSim/info_iterator.h
index 2cc3d79ad..bda6f0081 100644
--- a/src/VecSim/info_iterator.h
+++ b/src/VecSim/info_iterator.h
@@ -21,14 +21,16 @@ typedef enum {
     INFOFIELD_STRING,
     INFOFIELD_INT64,
     INFOFIELD_UINT64,
-    INFOFIELD_FLOAT64
+    INFOFIELD_FLOAT64,
+    INFOFIELD_ITERATOR
 } VecSim_InfoFieldType;
 
 typedef union {
-    double floatingPointValue; // Floating point value. 64 bits float.
-    int64_t integerValue;      // Integer value. Signed 64 bits integer.
-    u_int64_t uintegerValue;   // Unsigned value. Unsigned 64 buts integer.
-    const char *stringValue;   // String value.
+    double floatingPointValue;         // Floating point value. 64 bits float.
+    int64_t integerValue;              // Integer value. Signed 64 bits integer.
+    u_int64_t uintegerValue;           // Unsigned value. Unsigned 64 bits integer.
+    const char *stringValue;           // String value.
+    VecSimInfoIterator *iteratorValue; // Iterator value.
 } FieldValue;
 
 /**
diff --git a/src/VecSim/info_iterator_struct.h b/src/VecSim/info_iterator_struct.h
index 6d0f6ec26..d7a2c2d29 100644
--- a/src/VecSim/info_iterator_struct.h
+++ b/src/VecSim/info_iterator_struct.h
@@ -27,5 +27,12 @@ struct VecSimInfoIterator {
 
     inline size_t numberOfFields() { return array_len(this->fields); }
 
-    virtual ~VecSimInfoIterator() { array_free(this->fields); }
+    virtual ~VecSimInfoIterator() { // deletes nested iterators, then frees the fields array
+        for (size_t i = 0; i < array_len(this->fields); i++) {
+            if (this->fields[i].fieldType == INFOFIELD_ITERATOR) {
+                delete this->fields[i].fieldValue.iteratorValue; // free the nested iterator
+            }
+        }
+        array_free(this->fields); // release the fields array itself
+    }
 };
diff --git a/src/VecSim/memory/vecsim_base.cpp b/src/VecSim/memory/vecsim_base.cpp
index 269aa855d..a1dd4ae2f 100644
--- a/src/VecSim/memory/vecsim_base.cpp
+++ b/src/VecSim/memory/vecsim_base.cpp
@@ -48,4 +48,4 @@ void operator delete[](void *p, size_t size, std::shared_ptr<VecSimAllocator> al
     allocator->deallocate(p, size);
 }
 
-std::shared_ptr<VecSimAllocator> VecsimBaseObject::getAllocator() { return this->allocator; }
+std::shared_ptr<VecSimAllocator> VecsimBaseObject::getAllocator() const { return this->allocator; }
diff --git a/src/VecSim/memory/vecsim_base.h b/src/VecSim/memory/vecsim_base.h
index f5226481b..1f8eda612 100644
--- a/src/VecSim/memory/vecsim_base.h
+++ b/src/VecSim/memory/vecsim_base.h
@@ -27,8 +27,10 @@ struct VecsimBaseObject {
     static void operator delete(void *p, size_t size, std::shared_ptr<VecSimAllocator> allocator);
     static void operator delete[](void *p, size_t size, std::shared_ptr<VecSimAllocator> allocator);
 
-    std::shared_ptr<VecSimAllocator> getAllocator();
-    inline int64_t getAllocationSize() const { return this->allocator->getAllocationSize(); }
+    std::shared_ptr<VecSimAllocator> getAllocator() const;
+    virtual inline uint64_t getAllocationSize() const {
+        return this->allocator->getAllocationSize();
+    }
 
     virtual ~VecsimBaseObject() {}
 };
diff --git a/src/VecSim/memory/vecsim_malloc.cpp b/src/VecSim/memory/vecsim_malloc.cpp
index 075c104b7..377ed01b1 100644
--- a/src/VecSim/memory/vecsim_malloc.cpp
+++ b/src/VecSim/memory/vecsim_malloc.cpp
@@ -77,4 +77,4 @@ void *VecSimAllocator::operator new[](size_t size) { return vecsim_malloc(size);
 void VecSimAllocator::operator delete(void *p, size_t size) { vecsim_free(p); }
 void VecSimAllocator::operator delete[](void *p, size_t size) { vecsim_free(p); }
 
-int64_t VecSimAllocator::getAllocationSize() const { return this->allocated; }
+uint64_t VecSimAllocator::getAllocationSize() const { return this->allocated; }
diff --git a/src/VecSim/memory/vecsim_malloc.h b/src/VecSim/memory/vecsim_malloc.h
index 0a2b9278d..ccdfd354f 100644
--- a/src/VecSim/memory/vecsim_malloc.h
+++ b/src/VecSim/memory/vecsim_malloc.h
@@ -40,7 +40,7 @@ struct VecSimAllocator {
     void operator delete(void *p, size_t size);
     void operator delete[](void *p, size_t size);
 
-    int64_t getAllocationSize() const;
+    uint64_t getAllocationSize() const;
     inline friend bool operator==(const VecSimAllocator &a, const VecSimAllocator &b) {
         return a.allocated == b.allocated;
     }
@@ -51,6 +51,8 @@ struct VecSimAllocator {
 
     static void setMemoryFunctions(VecSimMemoryFunctions memFunctions);
 
+    static size_t getAllocationOverheadSize() { return allocation_header_size; }
+
 private:
     // Retrive the original requested allocation size. Required for remalloc.
     inline size_t getPointerAllocationSize(void *p) { return *(((size_t *)p) - 1); }
diff --git a/src/VecSim/query_result_struct.h b/src/VecSim/query_result_struct.h
index 8450997a9..2814e8c8a 100644
--- a/src/VecSim/query_result_struct.h
+++ b/src/VecSim/query_result_struct.h
@@ -9,9 +9,8 @@
 #include <cstdlib>
 #include <limits>
 
-#define INVALID_ID UINT_MAX
 // Use the "not a number" value to represent invalid score. This is for distinguishing the invalid
-// scroe from "inf" score (which is valid).
+// score from "inf" score (which is valid).
 #define INVALID_SCORE std::numeric_limits<double>::quiet_NaN()
 
 /**
diff --git a/src/VecSim/query_results.cpp b/src/VecSim/query_results.cpp
index 198e5a6d7..bb947e1fe 100644
--- a/src/VecSim/query_results.cpp
+++ b/src/VecSim/query_results.cpp
@@ -8,6 +8,7 @@
 #include "VecSim/utils/arr_cpp.h"
 #include "VecSim/vec_sim.h"
 #include "VecSim/batch_iterator.h"
+#include <assert.h>
 
 struct VecSimQueryResult_Iterator {
     VecSimQueryResult *results_arr;
@@ -84,6 +85,8 @@ extern "C" void VecSimQueryResult_IteratorReset(VecSimQueryResult_Iterator *iter
 /********************** batch iterator API ***************************/
 VecSimQueryResult_List VecSimBatchIterator_Next(VecSimBatchIterator *iterator, size_t n_results,
                                                 VecSimQueryResult_Order order) {
+    assert((order == BY_ID || order == BY_SCORE) &&
+           "Possible order values are only 'BY_ID' or 'BY_SCORE'");
     return iterator->getNextResults(n_results, order);
 }
 
diff --git a/src/VecSim/query_results.h b/src/VecSim/query_results.h
index 8174af7b6..ec4a8b999 100644
--- a/src/VecSim/query_results.h
+++ b/src/VecSim/query_results.h
@@ -16,7 +16,7 @@ extern "C" {
 #endif
 
 // The possible ordering for results that return from a query
-typedef enum { BY_SCORE, BY_ID } VecSimQueryResult_Order;
+typedef enum { BY_SCORE, BY_ID, BY_SCORE_THEN_ID } VecSimQueryResult_Order;
 
 /**
  * @brief A single query result. This is an opaque object from which a user can get the result
diff --git a/src/VecSim/utils/arr_cpp.h b/src/VecSim/utils/arr_cpp.h
index 79627ab36..801262e4c 100644
--- a/src/VecSim/utils/arr_cpp.h
+++ b/src/VecSim/utils/arr_cpp.h
@@ -62,6 +62,14 @@ T *array_append(T *arr, T val) {
     return arr;
 }
 
+template <typename T>
+T *array_concat(T *arr, T *other) {
+    arr = array_ensure_cap(arr, array_len(arr) + array_len(other));
+    memcpy(arr + array_len(arr), other, array_len(other) * sizeof(T));
+    array_hdr(arr)->len += array_len(other);
+    return arr;
+}
+
 template <typename T>
 size_t array_len(T *arr) {
     return arr ? array_hdr(arr)->len : 0;
@@ -72,3 +80,23 @@ void array_free(T *arr) {
     array_hdr_t<T> *arr_hdr = array_hdr(arr);
     vecsim_free(arr_hdr);
 }
+
+#define array_pop_front(arr) array_pop_front_n(arr, 1)
+
+template <typename T>
+void array_pop_front_n(T *arr, size_t n) {
+    array_hdr_t<T> *arr_hdr = array_hdr(arr);
+    n = MIN(n, arr_hdr->len);
+    arr_hdr->len -= n;
+    // mem* functions can handle a pointer to the next element after the end of the array (n = len),
+    // if the number of bytes to copy is 0.
+    memmove(arr, arr + n, arr_hdr->len * sizeof(T));
+}
+
+#define array_pop_back(arr) array_pop_back_n(arr, 1)
+
+template <typename T>
+void array_pop_back_n(T *arr, size_t n) {
+    array_hdr_t<T> *arr_hdr = array_hdr(arr);
+    arr_hdr->len -= MIN(n, arr_hdr->len);
+}
diff --git a/src/VecSim/utils/query_result_utils.h b/src/VecSim/utils/query_result_utils.h
new file mode 100644
index 000000000..958158960
--- /dev/null
+++ b/src/VecSim/utils/query_result_utils.h
@@ -0,0 +1,148 @@
+/*
+ *Copyright Redis Ltd. 2021 - present
+ *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or
+ *the Server Side Public License v1 (SSPLv1).
+ */
+
+#pragma once
+
+#include "VecSim/query_result_struct.h"
+#include <VecSim/utils/vec_utils.h>
+#include "arr_cpp.h"
+#include <unordered_set>
+
+// Append the current result to the merged results, unless `withSet` is set and its id was already
+// added. Also updates the id set and the remaining limit, and always advances `cur_res`.
+template <bool withSet>
+inline void maybe_append(VecSimQueryResult *&results, VecSimQueryResult *&cur_res,
+                         std::unordered_set<size_t> &ids, size_t &limit) {
+    // In a single line, checks (only if a check is needed) if we already inserted the current id to
+    // the merged results, add it to the set if not, and returns its conclusion.
+    if (!withSet || ids.insert(cur_res->id).second) {
+        results = array_append(results, *cur_res); // keep the (possibly moved) array pointer
+        limit--;
+    }
+    cur_res++;
+}
+
+// Assumes that the arrays are sorted by score firstly and by id secondarily.
+// By the end of the function, the first and second referenced pointers will point to the first
+// element that was not merged (in each array), or to the end of the array if it was merged
+// completely.
+template <bool withSet>
+VecSimQueryResult *merge_results(VecSimQueryResult *&first, const VecSimQueryResult *first_end,
+                                 VecSimQueryResult *&second, const VecSimQueryResult *second_end,
+                                 size_t limit) {
+    // Allocate the merged results array with the minimum size needed.
+    // Min of the limit and the sum of the lengths of the two arrays.
+    VecSimQueryResult *results = array_new<VecSimQueryResult>(
+        std::min(limit, (size_t)(first_end - first) + (size_t)(second_end - second)));
+    // Will hold the ids of the results we've already added to the merged results.
+    // Will be used only if withSet is true.
+    std::unordered_set<size_t> ids;
+    auto &cur_first = first;
+    auto &cur_second = second;
+
+    while (limit && cur_first != first_end && cur_second != second_end) {
+        int cmp = cmpVecSimQueryResultByScoreThenId(cur_first, cur_second);
+        if (cmp > 0) {
+            maybe_append<withSet>(results, cur_second, ids, limit);
+        } else if (cmp < 0) {
+            maybe_append<withSet>(results, cur_first, ids, limit);
+        } else {
+            // Even if `withSet` is true, we encountered an exact duplicate, so we know that this id
+            // didn't appear before in either array, and it won't appear again in either array, so
+            // we can add it to the merged results, and skip adding it to the set.
+            array_append(results, *cur_first);
+            cur_first++;
+            cur_second++;
+            limit--;
+        }
+    }
+
+    // If we didn't exit the loop because of the limit, at least one of the arrays is empty.
+    // We can try appending the rest of the other array.
+    if (limit != 0) {
+        if (cur_first == first_end) {
+            while (limit && cur_second != second_end) {
+                maybe_append<withSet>(results, cur_second, ids, limit);
+            }
+        } else {
+            while (limit && cur_first != first_end) {
+                maybe_append<withSet>(results, cur_first, ids, limit);
+            }
+        }
+    }
+
+    return results;
+}
+
+// Assumes that the arrays are sorted by score firstly and by id secondarily.
+template <bool withSet>
+VecSimQueryResult_List merge_result_lists(VecSimQueryResult_List first,
+                                          VecSimQueryResult_List second, size_t limit) {
+
+    VecSimQueryResult *cur_first = first.results;
+    VecSimQueryResult *cur_second = second.results;
+    const auto first_end = cur_first + VecSimQueryResult_Len(first);
+    const auto second_end = cur_second + VecSimQueryResult_Len(second);
+
+    auto results = merge_results<withSet>(cur_first, first_end, cur_second, second_end, limit);
+    VecSimQueryResult_List mergedResults{.results = results, .code = VecSim_QueryResult_OK};
+
+    VecSimQueryResult_Free(first);
+    VecSimQueryResult_Free(second);
+    return mergedResults;
+}
+
+// Concatenate the results of two queries into the results of the first query, consuming the second.
+static inline void concat_results(VecSimQueryResult_List &first, VecSimQueryResult_List &second) {
+    auto &dst = first.results;
+    auto &src = second.results;
+
+    dst = array_concat(dst, src);
+    VecSimQueryResult_Free(second);
+}
+
+// Sorts the results by id and removes duplicates.
+// Assumes that a result can appear at most twice in the results list.
+// The results array is truncated in place to the number of unique results; nothing is returned.
+template <bool IsMulti>
+void filter_results_by_id(VecSimQueryResult_List results) {
+    if (VecSimQueryResult_Len(results) < 2) {
+        return;
+    }
+    sort_results_by_id(results);
+
+    size_t i, cur_end;
+    for (i = 0, cur_end = 0; i < VecSimQueryResult_Len(results) - 1; i++, cur_end++) {
+        const VecSimQueryResult *cur_res = results.results + i;
+        const VecSimQueryResult *next_res = cur_res + 1;
+        if (VecSimQueryResult_GetId(cur_res) == VecSimQueryResult_GetId(next_res)) {
+            if (IsMulti) {
+                // On multi value index, scores might be different and we want to keep the lower
+                // score.
+                if (VecSimQueryResult_GetScore(cur_res) < VecSimQueryResult_GetScore(next_res)) {
+                    results.results[cur_end] = *cur_res;
+                } else {
+                    results.results[cur_end] = *next_res;
+                }
+            } else {
+                // On single value index, scores are the same so we can keep any of the results.
+                results.results[cur_end] = *cur_res;
+            }
+            // Assuming every id can appear at most twice, we can skip the next comparison between
+            // the current and the next result.
+            i++;
+        } else {
+            results.results[cur_end] = *cur_res;
+        }
+    }
+    // If the last result is unique, we need to add it to the results.
+    if (i == VecSimQueryResult_Len(results) - 1) {
+        results.results[cur_end] = results.results[i];
+        // Logically, we should increment cur_end and i here, but we don't need to because it won't
+        // affect the rest of the function.
+    }
+    array_pop_back_n(results.results, i - cur_end);
+}
diff --git a/src/VecSim/utils/updatable_heap.h b/src/VecSim/utils/updatable_heap.h
index 13a5f46ea..b9bb74b4c 100644
--- a/src/VecSim/utils/updatable_heap.h
+++ b/src/VecSim/utils/updatable_heap.h
@@ -42,6 +42,9 @@ class updatable_max_heap : public abstract_priority_queue<Priority, Value> {
     inline void pop() override;
     inline const std::pair<Priority, Value> top() const override;
     inline size_t size() const override;
+
+private:
+    inline auto top_ptr() const;
 };
 
 template <typename Priority, typename Value>
@@ -64,15 +67,26 @@ bool updatable_max_heap<Priority, Value>::empty() const {
 }
 
 template <typename Priority, typename Value>
-const std::pair<Priority, Value> updatable_max_heap<Priority, Value>::top() const {
+auto updatable_max_heap<Priority, Value>::top_ptr() const {
     // The `.begin()` of "priorityToValue" is the max priority element.
     auto x = priorityToValue.begin();
+    // x has the max priority, but there might be multiple values with the same priority. Among
+    // those, we need to pick the one with the highest value as well.
+    auto [begin, end] = priorityToValue.equal_range(x->first);
+    auto y = std::max_element(begin, end,
+                              [](const auto &a, const auto &b) { return a.second < b.second; });
+    return y;
+}
+
+template <typename Priority, typename Value>
+const std::pair<Priority, Value> updatable_max_heap<Priority, Value>::top() const {
+    auto x = top_ptr();
     return *x;
 }
 
 template <typename Priority, typename Value>
 void updatable_max_heap<Priority, Value>::pop() {
-    auto to_remove = priorityToValue.begin();
+    auto to_remove = top_ptr();
     valueToPriority.erase(to_remove->second);
     priorityToValue.erase(to_remove);
 }
diff --git a/src/VecSim/utils/vec_utils.cpp b/src/VecSim/utils/vec_utils.cpp
index add8b3d4e..80b38060d 100644
--- a/src/VecSim/utils/vec_utils.cpp
+++ b/src/VecSim/utils/vec_utils.cpp
@@ -7,7 +7,6 @@
 #include "vec_utils.h"
 #include "VecSim/query_result_struct.h"
 #include <cmath>
-#include <cassert>
 #include <cerrno>
 #include <climits>
 #include <float.h>
@@ -20,6 +19,7 @@ typedef int (*__compar_fn_t)(const void *, const void *);
 const char *VecSimCommonStrings::ALGORITHM_STRING = "ALGORITHM";
 const char *VecSimCommonStrings::FLAT_STRING = "FLAT";
 const char *VecSimCommonStrings::HNSW_STRING = "HNSW";
+const char *VecSimCommonStrings::TIERED_STRING = "TIERED";
 
 const char *VecSimCommonStrings::TYPE_STRING = "TYPE";
 const char *VecSimCommonStrings::FLOAT32_STRING = "FLOAT32";
@@ -44,22 +44,20 @@ const char *VecSimCommonStrings::HNSW_EF_CONSTRUCTION_STRING = "EF_CONSTRUCTION"
 const char *VecSimCommonStrings::HNSW_EPSILON_STRING = "EPSILON";
 const char *VecSimCommonStrings::HNSW_MAX_LEVEL = "MAX_LEVEL";
 const char *VecSimCommonStrings::HNSW_ENTRYPOINT = "ENTRYPOINT";
+const char *VecSimCommonStrings::HNSW_NUM_MARKED_DELETED = "NUMBER_OF_MARKED_DELETED";
 
 const char *VecSimCommonStrings::BLOCK_SIZE_STRING = "BLOCK_SIZE";
 const char *VecSimCommonStrings::SEARCH_MODE_STRING = "LAST_SEARCH_MODE";
 const char *VecSimCommonStrings::HYBRID_POLICY_STRING = "HYBRID_POLICY";
 const char *VecSimCommonStrings::BATCH_SIZE_STRING = "BATCH_SIZE";
 
-int cmpVecSimQueryResultById(const VecSimQueryResult *res1, const VecSimQueryResult *res2) {
-    return (int)(VecSimQueryResult_GetId(res1) - VecSimQueryResult_GetId(res2));
-}
-
-int cmpVecSimQueryResultByScore(const VecSimQueryResult *res1, const VecSimQueryResult *res2) {
-    assert(!std::isnan(VecSimQueryResult_GetScore(res1)) &&
-           !std::isnan(VecSimQueryResult_GetScore(res2)));
-    // Compare doubles
-    return (VecSimQueryResult_GetScore(res1) - VecSimQueryResult_GetScore(res2)) >= 0.0 ? 1 : -1;
-}
+const char *VecSimCommonStrings::TIERED_MANAGEMENT_MEMORY_STRING = "MANAGEMENT_LAYER_MEMORY";
+const char *VecSimCommonStrings::TIERED_BACKGROUND_INDEXING_STRING = "BACKGROUND_INDEXING";
+const char *VecSimCommonStrings::TIERED_BUFFER_LIMIT_STRING = "TIERED_BUFFER_LIMIT";
+const char *VecSimCommonStrings::FRONTEND_INDEX_STRING = "FRONTEND_INDEX";
+const char *VecSimCommonStrings::BACKEND_INDEX_STRING = "BACKEND_INDEX";
+const char *VecSimCommonStrings::TIERED_HNSW_SWAP_JOBS_THRESHOLD_STRING =
+    "TIERED_HNSW_SWAP_JOBS_THRESHOLD";
 
 void sort_results_by_id(VecSimQueryResult_List rl) {
     qsort(rl.results, VecSimQueryResult_Len(rl), sizeof(VecSimQueryResult),
@@ -71,6 +69,22 @@ void sort_results_by_score(VecSimQueryResult_List rl) {
           (__compar_fn_t)cmpVecSimQueryResultByScore);
 }
 
+void sort_results_by_score_then_id(VecSimQueryResult_List rl) {
+    qsort(rl.results, VecSimQueryResult_Len(rl), sizeof(VecSimQueryResult),
+          (__compar_fn_t)cmpVecSimQueryResultByScoreThenId);
+}
+
+void sort_results(VecSimQueryResult_List rl, VecSimQueryResult_Order order) {
+    switch (order) {
+    case BY_ID:
+        return sort_results_by_id(rl);
+    case BY_SCORE:
+        return sort_results_by_score(rl);
+    case BY_SCORE_THEN_ID:
+        return sort_results_by_score_then_id(rl);
+    }
+}
+
 VecSimResolveCode validate_positive_integer_param(VecSimRawParam rawParam, long long *val) {
     char *ep; // For checking that strtoll used all rawParam.valLen chars.
     errno = 0;
@@ -103,11 +117,11 @@ const char *VecSimAlgo_ToString(VecSimAlgo vecsimAlgo) {
         return VecSimCommonStrings::FLAT_STRING;
     case VecSimAlgo_HNSWLIB:
         return VecSimCommonStrings::HNSW_STRING;
-    default:
-        return NULL;
+    case VecSimAlgo_TIERED:
+        return VecSimCommonStrings::TIERED_STRING;
     }
+    return NULL;
 }
-
 const char *VecSimType_ToString(VecSimType vecsimType) {
     switch (vecsimType) {
     case VecSimType_FLOAT32:
@@ -118,9 +132,8 @@ const char *VecSimType_ToString(VecSimType vecsimType) {
         return VecSimCommonStrings::INT32_STRING;
     case VecSimType_INT64:
         return VecSimCommonStrings::INT64_STRING;
-    default:
-        return NULL;
     }
+    return NULL;
 }
 
 const char *VecSimMetric_ToString(VecSimMetric vecsimMetric) {
@@ -131,9 +144,8 @@ const char *VecSimMetric_ToString(VecSimMetric vecsimMetric) {
         return "IP";
     case VecSimMetric_L2:
         return "L2";
-    default:
-        return NULL;
     }
+    return NULL;
 }
 
 const char *VecSimSearchMode_ToString(VecSearchMode vecsimSearchMode) {
@@ -150,9 +162,8 @@ const char *VecSimSearchMode_ToString(VecSearchMode vecsimSearchMode) {
         return "HYBRID_BATCHES_TO_ADHOC_BF";
     case RANGE_QUERY:
         return "RANGE_QUERY";
-    default:
-        return NULL;
     }
+    return NULL;
 }
 
 size_t VecSimType_sizeof(VecSimType type) {
diff --git a/src/VecSim/utils/vec_utils.h b/src/VecSim/utils/vec_utils.h
index 37cdb03c9..0f5d05f63 100644
--- a/src/VecSim/utils/vec_utils.h
+++ b/src/VecSim/utils/vec_utils.h
@@ -10,6 +10,7 @@
 #include "VecSim/vec_sim_common.h"
 #include <VecSim/query_results.h>
 #include <utility>
+#include <cassert>
 #include <cmath> //sqrt
 
 template <typename dist_t>
@@ -25,6 +26,7 @@ struct VecSimCommonStrings {
     static const char *ALGORITHM_STRING;
     static const char *FLAT_STRING;
     static const char *HNSW_STRING;
+    static const char *TIERED_STRING;
 
     static const char *TYPE_STRING;
     static const char *FLOAT32_STRING;
@@ -49,17 +51,49 @@ struct VecSimCommonStrings {
     static const char *HNSW_EPSILON_STRING;
     static const char *HNSW_MAX_LEVEL;
     static const char *HNSW_ENTRYPOINT;
+    static const char *HNSW_NUM_MARKED_DELETED;
+    // static const char *HNSW_VISITED_NODES_POOL_SIZE_STRING;
 
     static const char *BLOCK_SIZE_STRING;
     static const char *SEARCH_MODE_STRING;
     static const char *HYBRID_POLICY_STRING;
     static const char *BATCH_SIZE_STRING;
+
+    static const char *TIERED_MANAGEMENT_MEMORY_STRING;
+    static const char *TIERED_BACKGROUND_INDEXING_STRING;
+    static const char *TIERED_BUFFER_LIMIT_STRING;
+    static const char *FRONTEND_INDEX_STRING;
+    static const char *BACKEND_INDEX_STRING;
+    static const char *TIERED_HNSW_SWAP_JOBS_THRESHOLD_STRING;
 };
 
+inline int cmpVecSimQueryResultById(const VecSimQueryResult *res1, const VecSimQueryResult *res2) {
+    const auto id1 = VecSimQueryResult_GetId(res1), id2 = VecSimQueryResult_GetId(res2);
+    return (id1 > id2) - (id1 < id2); // 3-way compare; (int)(id1 - id2) could flip sign or hit 0.
+}
+inline int cmpVecSimQueryResultByScore(const VecSimQueryResult *res1,
+                                       const VecSimQueryResult *res2) {
+    assert(!std::isnan(VecSimQueryResult_GetScore(res1)) &&
+           !std::isnan(VecSimQueryResult_GetScore(res2)));
+    // Compare doubles
+    return (VecSimQueryResult_GetScore(res1) - VecSimQueryResult_GetScore(res2)) >= 0.0 ? 1 : -1;
+}
+
+inline int cmpVecSimQueryResultByScoreThenId(const VecSimQueryResult *res1,
+                                             const VecSimQueryResult *res2) {
+    return (VecSimQueryResult_GetScore(res1) != VecSimQueryResult_GetScore(res2))
+               ? cmpVecSimQueryResultByScore(res1, res2)
+               : cmpVecSimQueryResultById(res1, res2);
+}
+
 void sort_results_by_id(VecSimQueryResult_List results);
 
 void sort_results_by_score(VecSimQueryResult_List results);
 
+void sort_results_by_score_then_id(VecSimQueryResult_List results);
+
+void sort_results(VecSimQueryResult_List results, VecSimQueryResult_Order order);
+
 VecSimResolveCode validate_positive_integer_param(VecSimRawParam rawParam, long long *val);
 
 VecSimResolveCode validate_positive_double_param(VecSimRawParam rawParam, double *val);
@@ -81,7 +115,7 @@ void normalizeVector(DataType *input_vector, size_t dim) {
     double sum = 0;
 
     for (size_t i = 0; i < dim; i++) {
-        sum += input_vector[i] * input_vector[i];
+        sum += (double)input_vector[i] * (double)input_vector[i];
     }
     DataType norm = sqrt(sum);
 
@@ -89,3 +123,12 @@ void normalizeVector(DataType *input_vector, size_t dim) {
         input_vector[i] = input_vector[i] / norm;
     }
 }
+
+typedef void (*normalizeVector_f)(void *input_vector, size_t dim);
+
+static inline void normalizeVectorFloat(void *input_vector, size_t dim) {
+    normalizeVector(static_cast<float *>(input_vector), dim);
+}
+static inline void normalizeVectorDouble(void *input_vector, size_t dim) {
+    normalizeVector(static_cast<double *>(input_vector), dim);
+}
diff --git a/src/VecSim/vec_sim.cpp b/src/VecSim/vec_sim.cpp
index 38bbfea11..206314e94 100644
--- a/src/VecSim/vec_sim.cpp
+++ b/src/VecSim/vec_sim.cpp
@@ -7,12 +7,10 @@
 #include "VecSim/vec_sim.h"
 #include "VecSim/query_results.h"
 #include "VecSim/query_result_struct.h"
-#include "VecSim/algorithms/brute_force/brute_force.h"
-#include "VecSim/algorithms/hnsw/hnsw.h"
 #include "VecSim/utils/vec_utils.h"
 #include "VecSim/utils/arr_cpp.h"
-#include "VecSim/algorithms/brute_force/brute_force_factory.h"
-#include "VecSim/algorithms/hnsw/hnsw_factory.h"
+#include "VecSim/index_factories/index_factory.h"
+#include "VecSim/vec_sim_index.h"
 #include <cassert>
 #include "memory.h"
 
@@ -20,6 +18,12 @@ extern "C" void VecSim_SetTimeoutCallbackFunction(timeoutCallbackFunction callba
     VecSimIndex::setTimeoutCallbackFunction(callback);
 }
 
+extern "C" void VecSim_SetLogCallbackFunction(logCallbackFunction callback) {
+    VecSimIndex::setLogCallbackFunction(callback);
+}
+
+extern "C" void VecSim_SetWriteMode(VecSimWriteMode mode) { VecSimIndex::setWriteMode(mode); }
+
 static VecSimResolveCode _ResolveParams_EFRuntime(VecSimAlgo index_type, VecSimRawParam rparam,
                                                   VecSimQueryParams *qparams,
                                                   VecsimQueryType query_type) {
@@ -100,62 +104,30 @@ static VecSimResolveCode _ResolveParams_HybridPolicy(VecSimRawParam rparam,
 }
 
 extern "C" VecSimIndex *VecSimIndex_New(const VecSimParams *params) {
-    VecSimIndex *index = NULL;
-    std::shared_ptr<VecSimAllocator> allocator = VecSimAllocator::newVecsimAllocator();
-    try {
-        switch (params->algo) {
-        case VecSimAlgo_HNSWLIB:
-            index = HNSWFactory::NewIndex(&params->hnswParams, allocator);
-            break;
-        case VecSimAlgo_BF:
-            index = BruteForceFactory::NewIndex(&params->bfParams, allocator);
-            break;
-        }
-    } catch (...) {
-        // Index will delete itself. For now, do nothing.
-    }
-    return index;
+    return VecSimFactory::NewIndex(params);
 }
 
 extern "C" size_t VecSimIndex_EstimateInitialSize(const VecSimParams *params) {
-    switch (params->algo) {
-    case VecSimAlgo_HNSWLIB:
-        return HNSWFactory::EstimateInitialSize(&params->hnswParams);
-    case VecSimAlgo_BF:
-        return BruteForceFactory::EstimateInitialSize(&params->bfParams);
-    }
-    return -1;
+    return VecSimFactory::EstimateInitialSize(params);
 }
 
-extern "C" int VecSimIndex_AddVector(VecSimIndex *index, const void *blob, size_t id) {
-    int64_t before = index->getAllocationSize();
+extern "C" int VecSimIndex_AddVector(VecSimIndex *index, const void *blob, size_t label) {
     if (index->indexSize() == index->indexCapacity()) {
         index->increaseCapacity();
     }
-    index->addVector(blob, id, true);
-    int64_t after = index->getAllocationSize();
-    return after - before;
+    return index->addVectorWrapper(blob, label);
 }
 
-extern "C" int VecSimIndex_DeleteVector(VecSimIndex *index, size_t id) {
-    int64_t before = index->getAllocationSize();
-    index->deleteVector(id);
-    int64_t after = index->getAllocationSize();
-    return after - before;
+extern "C" int VecSimIndex_DeleteVector(VecSimIndex *index, size_t label) {
+    return index->deleteVector(label);
 }
 
-extern "C" double VecSimIndex_GetDistanceFrom(VecSimIndex *index, size_t id, const void *blob) {
-    return index->getDistanceFrom(id, blob);
+extern "C" double VecSimIndex_GetDistanceFrom(VecSimIndex *index, size_t label, const void *blob) {
+    return index->getDistanceFrom(label, blob);
 }
 
 extern "C" size_t VecSimIndex_EstimateElementSize(const VecSimParams *params) {
-    switch (params->algo) {
-    case VecSimAlgo_HNSWLIB:
-        return HNSWFactory::EstimateElementSize(&params->hnswParams);
-    case VecSimAlgo_BF:
-        return BruteForceFactory::EstimateElementSize(&params->bfParams);
-    }
-    return -1;
+    return VecSimFactory::EstimateElementSize(params);
 }
 
 extern "C" void VecSim_Normalize(void *blob, size_t dim, VecSimType type) {
@@ -175,7 +147,8 @@ extern "C" VecSimResolveCode VecSimIndex_ResolveParams(VecSimIndex *index, VecSi
     if (!qparams || (!rparams && (paramNum != 0))) {
         return VecSimParamResolverErr_NullParam;
     }
-    VecSimAlgo index_type = index->info().algo;
+    VecSimAlgo index_type = index->basicInfo().algo;
+
     bzero(qparams, sizeof(VecSimQueryParams));
     auto res = VecSimParamResolver_OK;
     for (int i = 0; i < paramNum; i++) {
@@ -225,7 +198,7 @@ extern "C" VecSimQueryResult_List VecSimIndex_TopKQuery(VecSimIndex *index, cons
     assert((order == BY_ID || order == BY_SCORE) &&
            "Possible order values are only 'BY_ID' or 'BY_SCORE'");
     VecSimQueryResult_List results;
-    results = index->topKQuery(queryBlob, k, queryParams);
+    results = index->topKQueryWrapper(queryBlob, k, queryParams);
 
     if (order == BY_ID) {
         sort_results_by_id(results);
@@ -243,14 +216,7 @@ extern "C" VecSimQueryResult_List VecSimIndex_RangeQuery(VecSimIndex *index, con
     if (radius < 0) {
         throw std::runtime_error("radius must be non-negative");
     }
-    VecSimQueryResult_List results = index->rangeQuery(queryBlob, radius, queryParams);
-
-    if (order == BY_SCORE) {
-        sort_results_by_score(results);
-    } else {
-        sort_results_by_id(results);
-    }
-    return results;
+    return index->rangeQueryWrapper(queryBlob, radius, queryParams, order);
 }
 
 extern "C" void VecSimIndex_Free(VecSimIndex *index) {
@@ -265,9 +231,13 @@ extern "C" VecSimInfoIterator *VecSimIndex_InfoIterator(VecSimIndex *index) {
     return index->infoIterator();
 }
 
+extern "C" VecSimIndexBasicInfo VecSimIndex_BasicInfo(VecSimIndex *index) {
+    return index->basicInfo();
+}
+
 extern "C" VecSimBatchIterator *VecSimBatchIterator_New(VecSimIndex *index, const void *queryBlob,
                                                         VecSimQueryParams *queryParams) {
-    return index->newBatchIterator(queryBlob, queryParams);
+    return index->newBatchIteratorWrapper(queryBlob, queryParams);
 }
 
 extern "C" void VecSim_SetMemoryFunctions(VecSimMemoryFunctions memoryfunctions) {
diff --git a/src/VecSim/vec_sim.h b/src/VecSim/vec_sim.h
index 936b06cec..8300304b6 100644
--- a/src/VecSim/vec_sim.h
+++ b/src/VecSim/vec_sim.h
@@ -52,18 +52,18 @@ void VecSimIndex_Free(VecSimIndex *index);
  * @param index the index to which the vector is added.
  * @param blob binary representation of the vector. Blob size should match the index data type and
  * dimension.
- * @param id the id of the added vector
- * @return always returns true
+ * @param label the label of the added vector
+ * @return the number of new vectors inserted (1 for new insertion, 0 for override).
  */
-int VecSimIndex_AddVector(VecSimIndex *index, const void *blob, size_t id);
+int VecSimIndex_AddVector(VecSimIndex *index, const void *blob, size_t label);
 
 /**
  * @brief Remove a vector from an index.
  * @param index the index from which the vector is removed.
- * @param id the id of the removed vector
- * @return always returns true
+ * @param label the label of the removed vector
+ * @return the number of vectors removed (0 if the label was not found)
  */
-int VecSimIndex_DeleteVector(VecSimIndex *index, size_t id);
+int VecSimIndex_DeleteVector(VecSimIndex *index, size_t label);
 
 /**
  * @brief Calculate the distance of a vector from an index to a vector. This function assumes that
@@ -71,13 +71,13 @@ int VecSimIndex_DeleteVector(VecSimIndex *index, size_t id);
  * index's distance metric is cosine, the vector is already normalized.
  * @param index the index from which the first vector is located, and that defines the distance
  * metric.
- * @param id the id of the vector in the index.
+ * @param label the label of the vector in the index.
  * @param blob binary representation of the second vector. Blob size should match the index data
  * type and dimension, and pre-normalized if needed.
  * @return The distance (according to the index's distance metric) between `blob` and the vector
- * with id `id`.
+ * with label `label`.
  */
-double VecSimIndex_GetDistanceFrom(VecSimIndex *index, size_t id, const void *blob);
+double VecSimIndex_GetDistanceFrom(VecSimIndex *index, size_t label, const void *blob);
 
 /**
  * @brief normalize the vector blob in place.
@@ -148,6 +148,13 @@ VecSimQueryResult_List VecSimIndex_RangeQuery(VecSimIndex *index, const void *qu
  */
 VecSimIndexInfo VecSimIndex_Info(VecSimIndex *index);
 
+/**
+ * @brief Return basic immutable index information.
+ * @param index the index to return its info.
+ * @return Index basic meta-data.
+ */
+VecSimIndexBasicInfo VecSimIndex_BasicInfo(VecSimIndex *index);
+
 /**
  * @brief Returns an info iterator for generic reply purposes.
  *
@@ -198,6 +205,21 @@ void VecSim_SetMemoryFunctions(VecSimMemoryFunctions memoryfunctions);
  */
 void VecSim_SetTimeoutCallbackFunction(timeoutCallbackFunction callback);
 
+/**
+ * @brief Allow 3rd party log callback to be used for logging.
+ *
+ * @param callback logCallbackFunction function. should get void* and return void.
+ */
+void VecSim_SetLogCallbackFunction(logCallbackFunction callback);
+
+/**
+ * @brief Allow 3rd party to set the write mode for tiered index - async insert/delete using
+ * background jobs, or insert/delete inplace.
+ *
+ * @param mode VecSimWriteMode the mode in which we add/remove vectors (async or in-place).
+ */
+void VecSim_SetWriteMode(VecSimWriteMode mode);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/VecSim/vec_sim_common.h b/src/VecSim/vec_sim_common.h
index c9e72fb19..dda8e0355 100644
--- a/src/VecSim/vec_sim_common.h
+++ b/src/VecSim/vec_sim_common.h
@@ -11,6 +11,11 @@ extern "C" {
 #endif
 #include <stddef.h>
 #include <stdint.h>
+#include <stdbool.h>
+
+// Common definitions
+#define INVALID_ID    UINT_MAX
+#define INVALID_LABEL SIZE_MAX
 
 // HNSW default parameters
 #define HNSW_DEFAULT_M       16
@@ -19,6 +24,10 @@ extern "C" {
 #define HNSW_DEFAULT_EPSILON 0.01
 #define DEFAULT_BLOCK_SIZE   1024
 
+#define HNSW_INVALID_LEVEL SIZE_MAX
+#define INVALID_JOB_ID     UINT_MAX
+#define INVALID_INFO       UINT_MAX
+
 // Datatypes for indexing.
 typedef enum {
     VecSimType_FLOAT32,
@@ -28,14 +37,11 @@ typedef enum {
 } VecSimType;
 
 // Algorithm type/library.
-typedef enum { VecSimAlgo_BF, VecSimAlgo_HNSWLIB } VecSimAlgo;
+typedef enum { VecSimAlgo_BF, VecSimAlgo_HNSWLIB, VecSimAlgo_TIERED } VecSimAlgo;
 
 // Distance metric
 typedef enum { VecSimMetric_L2, VecSimMetric_IP, VecSimMetric_Cosine } VecSimMetric;
 
-// Vectors flags (for marking a specific vector)
-typedef enum { DELETE_MARK = 0x01 } Flags;
-
 typedef size_t labelType;
 typedef unsigned int idType;
 
@@ -66,17 +72,23 @@ typedef enum {
     VecSimParamResolverErr_InvalidPolicy_AdHoc_With_EfRuntime
 } VecSimResolveCode;
 
+typedef struct AsyncJob AsyncJob; // forward declaration.
+
+// Write async/sync mode
+typedef enum { VecSim_WriteAsync, VecSim_WriteInPlace } VecSimWriteMode;
+
 /**
  * Callback signatures for asynchronous tiered index.
  */
-typedef int (*SubmitCB)(void *job_queue, void **jobs, size_t jobs_len);
-typedef int (*UpdateMemoryCB)(void *memory_ctx, size_t memory);
-typedef void (*JobCallback)(void *);
+typedef void (*JobCallback)(AsyncJob *);
+typedef int (*SubmitCB)(void *job_queue, void *index_ctx, AsyncJob **jobs, JobCallback *CBs,
+                        size_t jobs_len);
 
 /**
  * @brief Index initialization parameters.
  *
  */
+typedef struct VecSimParams VecSimParams;
 typedef struct {
     VecSimType type;     // Datatype to index.
     size_t dim;          // Vector's dimension.
@@ -99,28 +111,34 @@ typedef struct {
     size_t blockSize;
 } BFParams;
 
-// A struct that contains the common tiered index params.
-typedef struct {
-    void *jobQueue;             // External queue that holds the jobs.
-    SubmitCB submitCb;          // A callback that submits an array of jobs into a given jobQueue.
-    void *memoryCtx;            // External context that stores the index memory consumption.
-    UpdateMemoryCB UpdateMemCb; // A callback that updates the memoryCtx
-                                // with a given memory (number).
-} TieredIndexParams;
-
+// A struct that contains HNSW tiered index specific params.
 typedef struct {
-    HNSWParams hnswParams;
-    TieredIndexParams tieredParams;
+    size_t swapJobThreshold; // The minimum number of swap jobs to accumulate before applying
+                             // all the ready swap jobs in a batch.
 } TieredHNSWParams;
 
+// A struct that contains the common tiered index params.
 typedef struct {
+    void *jobQueue;         // External queue that holds the jobs.
+    void *jobQueueCtx;      // External context to be sent to the submit callback.
+    SubmitCB submitCb;      // A callback that submits an array of jobs into a given jobQueue.
+    size_t flatBufferLimit; // Maximum size allowed for the flat buffer. If flat buffer is full, use
+                            // in-place insertion.
+    VecSimParams *primaryIndexParams; // Parameters to initialize the index.
+    union {
+        TieredHNSWParams tieredHnswParams;
+    } specificParams;
+} TieredIndexParams;
+
+struct VecSimParams {
     VecSimAlgo algo; // Algorithm to use.
     union {
         HNSWParams hnswParams;
         BFParams bfParams;
-        TieredHNSWParams tieredHNSWParams;
+        TieredIndexParams tieredParams;
     };
-} VecSimParams;
+    void *logCtx; // External context that stores the index log.
+};
 
 /**
  * The specific job types in use (to be extended in the future by demand)
@@ -133,47 +151,6 @@ typedef enum {
     INVALID_JOB // to indicate that finding a JobType >= INVALID_JOB is an error
 } JobType;
 
-/**
- * Definition of generic job structure for asynchronous tiered index.
- */
-typedef struct AsyncJob {
-    JobType jobType;
-    JobCallback Execute; // A callback that receives a job as its input and executes the job.
-} AsyncJob;
-
-/**
- * Definition of a job that inserts a new vector from flat into HNSW Index.
- */
-typedef struct HNSWInsertJob {
-    AsyncJob base;
-    void *index;
-    labelType label;
-    idType id;
-} HNSWInsertJob;
-
-/**
- * Definition of a job that swaps last id with a deleted id in HNSW Index after delete operation.
- */
-typedef struct HNSWSwapJob {
-    AsyncJob base;
-    void *index;
-    idType deleted_id;
-    long pending_repair_jobs_counter; // number of repair jobs left to complete before this job
-                                      // is ready to be executed (atomic counter).
-} HNSWSwapJob;
-
-/**
- * Definition of a job that repairs a certain node's connection in HNSW Index after delete
- * operation.
- */
-typedef struct HNSWRepairJob {
-    AsyncJob base;
-    void *index;
-    idType node_id;
-    unsigned short level;
-    HNSWSwapJob *assosiated_swap_job;
-} HNSWRepairJob;
-
 typedef struct {
     size_t efRuntime; // EF parameter for HNSW graph accuracy/latency for search.
     double epsilon;   // Epsilon parameter for HNSW graph accuracy/latency for range search.
@@ -216,43 +193,77 @@ typedef struct {
                       // to get it from the parameters resolve function.
 } VecSimQueryParams;
 
+/**
+ * Index info that is static and immutable (cannot be changed over time)
+ */
+typedef struct {
+    VecSimAlgo algo;     // Algorithm being used.
+    size_t blockSize;    // Brute force algorithm vector block (mini matrix) size
+    VecSimMetric metric; // Index distance metric
+    VecSimType type;     // Datatype the index holds.
+    bool isMulti;        // Determines if the index should multi-index or not.
+    size_t dim;          // Vector size (dimension).
+
+    bool isTiered; // Whether the index is tiered.
+} VecSimIndexBasicInfo;
+
+typedef struct {
+    VecSimIndexBasicInfo basicInfo; // Index immutable meta-data.
+    size_t indexSize;               // Current count of vectors.
+    size_t indexLabelCount;         // Current unique count of labels.
+    uint64_t memory;                // Index memory consumption.
+    VecSearchMode last_mode;        // The mode in which the last query ran.
+} CommonInfo;
+
+typedef struct {
+    size_t M;              // Number of allowed edges per node in graph.
+    size_t efConstruction; // EF parameter for HNSW graph accuracy/latency for indexing.
+    size_t efRuntime;      // EF parameter for HNSW graph accuracy/latency for search.
+    double epsilon;        // Epsilon parameter for HNSW graph accuracy/latency for range search.
+    size_t max_level;      // Number of graph levels.
+    size_t entrypoint;     // Entrypoint vector label.
+    size_t visitedNodesPoolSize;       // The max number of parallel graph scans so far.
+    size_t numberOfMarkedDeletedNodes; // The number of nodes that are marked as deleted.
+} hnswInfoStruct;
+
+typedef struct {
+    char dummy; // For not having this as an empty struct, can be removed after we extend this.
+} bfInfoStruct;
+
+typedef struct HnswTieredInfo {
+    size_t pendingSwapJobsThreshold;
+} HnswTieredInfo;
+
+typedef struct {
+
+    // Since we cannot recursively have a struct that contains itself, we need this workaround.
+    union {
+        hnswInfoStruct hnswInfo;
+    } backendInfo; // The backend index info.
+    union {
+        HnswTieredInfo hnswTieredInfo;
+    } specificTieredBackendInfo;   // Info relevant for tiered index with a specific backend.
+    CommonInfo backendCommonInfo;  // Common info of the backend index.
+    CommonInfo frontendCommonInfo; // Common info of the frontend index.
+    bfInfoStruct bfInfo;           // The brute force index info.
+
+    uint64_t management_layer_memory; // Memory consumption of the management layer.
+    bool backgroundIndexing;          // Determines if the index is currently being indexed in the
+                                      // background.
+    size_t bufferLimit;               // Maximum number of vectors allowed in the flat buffer.
+} tieredInfoStruct;
+
 /**
  * @brief Index information. Mainly used for debug/testing.
  *
  */
 typedef struct {
+    CommonInfo commonInfo;
     union {
-        struct {
-            size_t indexSize;       // Current count of vectors.
-            size_t indexLabelCount; // Current unique count of labels.
-            size_t blockSize;       // Sets the amount to grow when resizing
-            size_t M;               // Number of allowed edges per node in graph.
-            size_t efConstruction;  // EF parameter for HNSW graph accuracy/latency for indexing.
-            size_t efRuntime;       // EF parameter for HNSW graph accuracy/latency for search.
-            double epsilon;   // Epsilon parameter for HNSW graph accuracy/latency for range search.
-            size_t max_level; // Number of graph levels.
-            size_t entrypoint;           // Entrypoint vector label.
-            VecSimMetric metric;         // Index distance metric
-            uint64_t memory;             // Index memory consumption.
-            VecSimType type;             // Datatype the index holds.
-            bool isMulti;                // Determines if the index should multi-index or not.
-            size_t dim;                  // Vector size (dimension).
-            VecSearchMode last_mode;     // The mode in which the last query ran.
-            size_t visitedNodesPoolSize; // The max number of parallel graph scans so far.
-        } hnswInfo;
-        struct {
-            size_t indexSize;        // Current count of vectors.
-            size_t indexLabelCount;  // Current unique count of labels.
-            size_t blockSize;        // Brute force algorithm vector block (mini matrix) size
-            VecSimMetric metric;     // Index distance metric
-            uint64_t memory;         // Index memory consumption.
-            VecSimType type;         // Datatype the index holds.
-            bool isMulti;            // Determines if the index should multi-index or not.
-            size_t dim;              // Vector size (dimension).
-            VecSearchMode last_mode; // The mode in which the last query ran.
-        } bfInfo;
+        bfInfoStruct bfInfo;
+        hnswInfoStruct hnswInfo;
+        tieredInfoStruct tieredInfo;
     };
-    VecSimAlgo algo; // Algorithm being used.
 } VecSimIndexInfo;
 
 // Memory function declarations.
@@ -280,6 +291,13 @@ typedef struct {
  */
 typedef int (*timeoutCallbackFunction)(void *ctx);
 
+/**
+ * @brief A callback type for passing a 3rd party logging function to Vecsimlib.
+ * @param ctx some generic context to pass to the function
+ * @param message the message to log
+ */
+typedef void (*logCallbackFunction)(void *ctx, const char *message);
+
 typedef enum {
     VecSim_QueryResult_OK = VecSim_OK,
     VecSim_QueryResult_TimedOut,
diff --git a/src/VecSim/vec_sim_index.h b/src/VecSim/vec_sim_index.h
index 7e33b69ad..3f8748c81 100644
--- a/src/VecSim/vec_sim_index.h
+++ b/src/VecSim/vec_sim_index.h
@@ -17,6 +17,27 @@
 
 using spaces::dist_func_t;
 
+/**
+ * @brief Struct for initializing an abstract index class.
+ *
+ * @param allocator The allocator to use for the index.
+ * @param dim The dimension of the vectors in the index.
+ * @param vecType The type of the vectors in the index.
+ * @param metric The metric to use in the index.
+ * @param blockSize The block size to use in the index.
+ * @param multi Determines if the index should multi-index or not.
+ * @param logCtx The context to use for logging.
+ */
+struct AbstractIndexInitParams {
+    std::shared_ptr<VecSimAllocator> allocator;
+    size_t dim;
+    VecSimType vecType;
+    VecSimMetric metric;
+    size_t blockSize;
+    bool multi;
+    void *logCtx;
+};
+
 /**
  * @brief Abstract C++ class for vector index, delete and lookup
  *
@@ -26,26 +47,47 @@ struct VecSimIndexAbstract : public VecSimIndexInterface {
 protected:
     size_t dim;          // Vector's dimension.
     VecSimType vecType;  // Datatype to index.
+    size_t data_size;    // Vector size in bytes
     VecSimMetric metric; // Distance metric to use in the index.
     size_t blockSize;    // Index's vector block size (determines by how many vectors to resize when
                          // resizing)
     dist_func_t<DistType>
-        dist_func;           // Index's distance function. Chosen by the type, metric and dimension.
-    VecSearchMode last_mode; // The last search mode in RediSearch (used for debug/testing).
-    bool isMulti;            // Determines if the index should multi-index or not.
+        dist_func; // Index's distance function. Chosen by the type, metric and dimension.
+    mutable VecSearchMode last_mode; // The last search mode in RediSearch (used for debug/testing).
+    bool isMulti;                    // Determines if the index should multi-index or not.
+    void *logCallbackCtx;            // Context for the log callback.
+
+    /**
+     * @brief Get the common info object
+     *
+     * @return CommonInfo
+     */
+    CommonInfo getCommonInfo() const {
+        CommonInfo info;
+        info.basicInfo = this->getBasicInfo();
+        info.last_mode = this->last_mode;
+        info.memory = this->getAllocationSize();
+        info.indexSize = this->indexSize();
+        info.indexLabelCount = this->indexLabelCount();
+        return info;
+    }
+
+    normalizeVector_f normalize_func; // A pointer to a normalization function of specific type.
 
 public:
     /**
      * @brief Construct a new Vec Sim Index object
      *
      */
-    VecSimIndexAbstract(std::shared_ptr<VecSimAllocator> allocator, size_t dim, VecSimType vecType,
-                        VecSimMetric metric, size_t blockSize, bool multi)
-        : VecSimIndexInterface(allocator), dim(dim), vecType(vecType), metric(metric),
-          blockSize(blockSize ? blockSize : DEFAULT_BLOCK_SIZE), last_mode(EMPTY_MODE),
-          isMulti(multi) {
+    VecSimIndexAbstract(const AbstractIndexInitParams &params)
+        : VecSimIndexInterface(params.allocator), dim(params.dim), vecType(params.vecType),
+          data_size(dim * VecSimType_sizeof(vecType)), metric(params.metric),
+          blockSize(params.blockSize ? params.blockSize : DEFAULT_BLOCK_SIZE),
+          last_mode(EMPTY_MODE), isMulti(params.multi), logCallbackCtx(params.logCtx) {
         assert(VecSimType_sizeof(vecType));
         spaces::SetDistFunc(metric, dim, &dist_func);
+        normalize_func =
+            vecType == VecSimType_FLOAT32 ? normalizeVectorFloat : normalizeVectorDouble;
     }
 
     /**
@@ -60,4 +102,129 @@ struct VecSimIndexAbstract : public VecSimIndexInterface {
     inline bool isMultiValue() const { return isMulti; }
     inline VecSimType getType() const { return vecType; }
     inline VecSimMetric getMetric() const { return metric; }
+    inline size_t getDataSize() const { return data_size; }
+
+    virtual VecSimQueryResult_List rangeQuery(const void *queryBlob, double radius,
+                                              VecSimQueryParams *queryParams) const = 0;
+    VecSimQueryResult_List rangeQuery(const void *queryBlob, double radius,
+                                      VecSimQueryParams *queryParams,
+                                      VecSimQueryResult_Order order) const override {
+        auto results = rangeQuery(queryBlob, radius, queryParams);
+        sort_results(results, order);
+        return results;
+    }
+
+    void log(const char *fmt, ...) const {
+        if (VecSimIndexInterface::logCallback) {
+            // Format the message and call the callback
+            va_list args;
+            va_start(args, fmt);
+            int len = vsnprintf(NULL, 0, fmt, args);
+            va_end(args);
+            char *buf = new char[len + 1];
+            va_start(args, fmt);
+            vsnprintf(buf, len + 1, fmt, args);
+            va_end(args);
+            logCallback(this->logCallbackCtx, buf);
+            delete[] buf;
+        }
+    }
+
+    // Adds all common info to the info iterator, besides the block size (currently 8 fields).
+    void addCommonInfoToIterator(VecSimInfoIterator *infoIterator, const CommonInfo &info) const {
+        infoIterator->addInfoField(VecSim_InfoField{
+            .fieldName = VecSimCommonStrings::TYPE_STRING,
+            .fieldType = INFOFIELD_STRING,
+            .fieldValue = {FieldValue{.stringValue = VecSimType_ToString(info.basicInfo.type)}}});
+        infoIterator->addInfoField(
+            VecSim_InfoField{.fieldName = VecSimCommonStrings::DIMENSION_STRING,
+                             .fieldType = INFOFIELD_UINT64,
+                             .fieldValue = {FieldValue{.uintegerValue = info.basicInfo.dim}}});
+        infoIterator->addInfoField(
+            VecSim_InfoField{.fieldName = VecSimCommonStrings::METRIC_STRING,
+                             .fieldType = INFOFIELD_STRING,
+                             .fieldValue = {FieldValue{
+                                 .stringValue = VecSimMetric_ToString(info.basicInfo.metric)}}});
+        infoIterator->addInfoField(
+            VecSim_InfoField{.fieldName = VecSimCommonStrings::IS_MULTI_STRING,
+                             .fieldType = INFOFIELD_UINT64,
+                             .fieldValue = {FieldValue{.uintegerValue = info.basicInfo.isMulti}}});
+        infoIterator->addInfoField(
+            VecSim_InfoField{.fieldName = VecSimCommonStrings::INDEX_SIZE_STRING,
+                             .fieldType = INFOFIELD_UINT64,
+                             .fieldValue = {FieldValue{.uintegerValue = info.indexSize}}});
+        infoIterator->addInfoField(
+            VecSim_InfoField{.fieldName = VecSimCommonStrings::INDEX_LABEL_COUNT_STRING,
+                             .fieldType = INFOFIELD_UINT64,
+                             .fieldValue = {FieldValue{.uintegerValue = info.indexLabelCount}}});
+        infoIterator->addInfoField(
+            VecSim_InfoField{.fieldName = VecSimCommonStrings::MEMORY_STRING,
+                             .fieldType = INFOFIELD_UINT64,
+                             .fieldValue = {FieldValue{.uintegerValue = info.memory}}});
+        infoIterator->addInfoField(VecSim_InfoField{
+            .fieldName = VecSimCommonStrings::SEARCH_MODE_STRING,
+            .fieldType = INFOFIELD_STRING,
+            .fieldValue = {FieldValue{.stringValue = VecSimSearchMode_ToString(info.last_mode)}}});
+    }
+    const void *processBlob(const void *original_blob, void *processed_blob) const {
+        // if the metric is cosine, we need to normalize
+        if (this->metric == VecSimMetric_Cosine) {
+            // copy original blob to the output blob
+            memcpy(processed_blob, original_blob, this->data_size);
+            // normalize the copy in place
+            normalize_func(processed_blob, this->dim);
+
+            return processed_blob;
+        }
+
+        // Else no process is needed, return the original blob
+        return original_blob;
+    }
+
+    /**
+     * @brief Get the basic static info object
+     *
+     * @return basicInfo
+     */
+    VecSimIndexBasicInfo getBasicInfo() const {
+        VecSimIndexBasicInfo info{.blockSize = this->blockSize,
+                                  .metric = this->metric,
+                                  .type = this->vecType,
+                                  .isMulti = this->isMulti,
+                                  .dim = this->dim};
+        return info;
+    }
+
+protected:
+    virtual int addVectorWrapper(const void *blob, labelType label, void *auxiliaryCtx) override {
+        char processed_blob[this->data_size];
+        const void *vector_to_add = processBlob(blob, processed_blob);
+
+        return this->addVector(vector_to_add, label, auxiliaryCtx);
+    }
+
+    virtual VecSimQueryResult_List topKQueryWrapper(const void *queryBlob, size_t k,
+                                                    VecSimQueryParams *queryParams) const override {
+        char processed_blob[this->data_size];
+        const void *query_to_send = processBlob(queryBlob, processed_blob);
+
+        return this->topKQuery(query_to_send, k, queryParams);
+    }
+
+    virtual VecSimQueryResult_List rangeQueryWrapper(const void *queryBlob, double radius,
+                                                     VecSimQueryParams *queryParams,
+                                                     VecSimQueryResult_Order order) const override {
+        char processed_blob[this->data_size];
+        const void *query_to_send = processBlob(queryBlob, processed_blob);
+
+        return this->rangeQuery(query_to_send, radius, queryParams, order);
+    }
+
+    virtual VecSimBatchIterator *
+    newBatchIteratorWrapper(const void *queryBlob, VecSimQueryParams *queryParams) const override {
+        char processed_blob[this->data_size];
+        const void *query_to_send = processBlob(queryBlob, processed_blob);
+
+        return this->newBatchIterator(query_to_send, queryParams);
+    }
 };
diff --git a/src/VecSim/vec_sim_interface.cpp b/src/VecSim/vec_sim_interface.cpp
index 41bbc5bd9..b1953b023 100644
--- a/src/VecSim/vec_sim_interface.cpp
+++ b/src/VecSim/vec_sim_interface.cpp
@@ -5,5 +5,12 @@
  */
 
 #include "VecSim/vec_sim_interface.h"
+#include <cstdarg>
+#include <iostream>
+
+// Print log messages to stdout
+void Vecsim_Log(void *ctx, const char *message) { std::cout << message << std::endl; }
 
 timeoutCallbackFunction VecSimIndexInterface::timeoutCallback = [](void *ctx) { return 0; };
+logCallbackFunction VecSimIndexInterface::logCallback = Vecsim_Log;
+VecSimWriteMode VecSimIndexInterface::asyncWriteMode = VecSim_WriteAsync;
diff --git a/src/VecSim/vec_sim_interface.h b/src/VecSim/vec_sim_interface.h
index 151349eb9..873fdfc33 100644
--- a/src/VecSim/vec_sim_interface.h
+++ b/src/VecSim/vec_sim_interface.h
@@ -12,6 +12,7 @@
 
 #include <stddef.h>
 #include <stdexcept>
+#include <cstdarg>
 /**
  * @brief Abstract C++ class for vector index, delete and lookup
  *
@@ -32,18 +33,32 @@ struct VecSimIndexInterface : public VecsimBaseObject {
      */
     virtual ~VecSimIndexInterface() {}
 
+    /**
+     * @brief This Function prepares the blob before sending it to addVector.
+     * @param blob binary representation of the vector. Blob size should match the index data type
+     * and dimension. (for example, if the distance metric is cosine,
+     * it will call addVector with the normalized blob)
+     * It is assumed that the index saves its own copy of the blob.
+     */
+    virtual int addVectorWrapper(const void *blob, labelType label,
+                                 void *auxiliaryCtx = nullptr) = 0;
+
     /**
      * @brief Add a vector blob and its id to the index.
      *
      * @param blob binary representation of the vector. Blob size should match the index data type
-     * and dimension.
+     * and dimension. It is assumed that the blob has already been processed
+     *  (for example, if the distance metric is cosine, the blob is already *normalized*)
      * @param label the label of the added vector.
-     * @param overwriteAllowed if true and id already exists in the index, overwrite it. Otherwise,
-     * ignore the new vector.
-     * @return the number of new vectors inserted (1 for new insertion, 0 for override), or -1
-     * in case that override is not allowed and label already exists.
+     * @param auxiliaryCtx if this is not the main index (but a layer in a tiered index for example)
+     * we pass a state of the index to be used internally. Otherwise, if auxiliaryCtx is null,
+     * just perform a "vanilla" insertion of a new vector.
+     * In addition, if id is not given, and this label already exists overwrite it. Otherwise,
+     * it's the caller main index responsibility to validate that the new label and id are
+     * appropriate.
+     * @return the number of new vectors inserted (1 for new insertion, 0 for override).
      */
-    virtual int addVector(const void *blob, labelType label, bool overwriteAllowed = true) = 0;
+    virtual int addVector(const void *blob, labelType label, void *auxiliaryCtx = nullptr) = 0;
 
     /**
      * @brief Remove a vector from an index.
@@ -66,7 +81,7 @@ struct VecSimIndexInterface : public VecsimBaseObject {
     virtual double getDistanceFrom(labelType id, const void *blob) const = 0;
 
     /**
-     * @brief Return the number of vectors in the index using its SizeFn.
+     * @brief Return the number of vectors in the index (including ones that are marked as deleted).
      *
      * @return index size.
      */
@@ -85,31 +100,52 @@ struct VecSimIndexInterface : public VecsimBaseObject {
     virtual void increaseCapacity() = 0;
 
     /**
-     * @brief Return the number of unique labels in the index using its SizeFn.
+     * @brief Return the number of unique labels in the index (which are not deleted).
      *
      * @return index label count.
      */
     virtual size_t indexLabelCount() const = 0;
 
     /**
-     * @brief Search for the k closest vectors to a given vector in the index.
-     *
+     * @brief This Function prepares the blob before sending it to topKQuery.
      * @param queryBlob binary representation of the query vector. Blob size should match the index
      * data type and dimension.
-     * @param k the number of "nearest neighbours" to return (upper bound).
+     * for example, if the distance metric is cosine, it will call topKQuery with the normalized
+     * blob.
+     */
+    virtual VecSimQueryResult_List topKQueryWrapper(const void *queryBlob, size_t k,
+                                                    VecSimQueryParams *queryParams) const = 0;
+
+    /**
+     * @brief Search for the k closest vectors to a given vector in the index.
+     * @param queryBlob binary representation of the query vector. Blob size should match the index
+     * data type and dimension. It is assumed that the queryBlob has been already processed
+     *  (for example, if the distance metric is cosine, the blob is already *normalized*)
+     * @param k the number of "nearest neighbors" to return (upper bound).
      * @param queryParams run time params for the search, which are algorithm-specific.
      * @return An opaque object the represents a list of results. User can access the id and score
      * (which is the distance according to the index metric) of every result through
      * VecSimQueryResult_Iterator.
      */
     virtual VecSimQueryResult_List topKQuery(const void *queryBlob, size_t k,
-                                             VecSimQueryParams *queryParams) = 0;
+                                             VecSimQueryParams *queryParams) const = 0;
 
+    /**
+     * @brief This Function prepares the blob before sending it to rangeQuery.
+     * @param queryBlob binary representation of the query vector. Blob size should match the index
+     * data type and dimension.
+     * for example, if the distance metric is cosine, it will call rangeQuery with the normalized
+     * blob.
+     */
+    virtual VecSimQueryResult_List rangeQueryWrapper(const void *queryBlob, double radius,
+                                                     VecSimQueryParams *queryParams,
+                                                     VecSimQueryResult_Order order) const = 0;
     /**
      * @brief Search for the vectors that are in a given range in the index with respect to a given
      * vector. The results can be ordered by their score or id.
      * @param queryBlob binary representation of the query vector. Blob size should match the index
-     * data type and dimension.
+     * data type and dimension. It is assumed that the queryBlob has been already processed
+     *  (for example, if the distance metric is cosine, the blob is already *normalized*)
      * @param radius the radius around the query vector to search vectors within it.
      * @param queryParams run time params for the search, which are algorithm-specific.
      * @param order the criterion to sort the results list by it. Options are by score, or by id.
@@ -118,15 +154,25 @@ struct VecSimIndexInterface : public VecsimBaseObject {
      * VecSimQueryResult_Iterator.
      */
     virtual VecSimQueryResult_List rangeQuery(const void *queryBlob, double radius,
-                                              VecSimQueryParams *queryParams) = 0;
+                                              VecSimQueryParams *queryParams,
+                                              VecSimQueryResult_Order order) const = 0;
 
     /**
      * @brief Return index information.
      *
-     * @return Index general and specific meta-data.
+     * @return Index general and specific meta-data. Note that this operation might
+     * be time consuming (especially for tiered index where computing label count requires
+     * locking and going over the labels sets). So this should be used carefully.
      */
     virtual VecSimIndexInfo info() const = 0;
 
+    /**
+     * @brief Return index static information.
+     *
+     * @return Index general and specific meta-data (for quick and lock-less data retrieval).
+     */
+    virtual VecSimIndexBasicInfo basicInfo() const = 0;
+
     /**
      * @brief Returns an index information in an iterable structure.
      *
@@ -143,9 +189,16 @@ struct VecSimIndexInterface : public VecsimBaseObject {
      * type and dimension.
      * @return Fresh batch iterator
      */
+    virtual VecSimBatchIterator *newBatchIteratorWrapper(const void *queryBlob,
+                                                         VecSimQueryParams *queryParams) const = 0;
+
+    /**
+     * @brief Implemented by the inheriting index and called by newBatchIteratorWrapper.
+     * @param queryBlob is a processed vector (for example, if the distance metric is cosine,
+     * queryBlob is already *normalized* )
+     */
     virtual VecSimBatchIterator *newBatchIterator(const void *queryBlob,
                                                   VecSimQueryParams *queryParams) const = 0;
-
     /**
      * @brief Return True if heuristics says that it is better to use ad-hoc brute-force
      * search over the index instead of using batch iterator.
@@ -159,7 +212,7 @@ struct VecSimIndexInterface : public VecsimBaseObject {
      * creating the hybrid iterator), or after running batches.
      */
 
-    virtual bool preferAdHocSearch(size_t subsetSize, size_t k, bool initial_check) = 0;
+    virtual bool preferAdHocSearch(size_t subsetSize, size_t k, bool initial_check) const = 0;
 
     /**
      * @brief Set the latest search mode in the index data (for info/debugging).
@@ -176,4 +229,20 @@ struct VecSimIndexInterface : public VecsimBaseObject {
     inline static void setTimeoutCallbackFunction(timeoutCallbackFunction callback) {
         VecSimIndexInterface::timeoutCallback = callback;
     }
+
+    static logCallbackFunction logCallback;
+    inline static void setLogCallbackFunction(logCallbackFunction callback) {
+        VecSimIndexInterface::logCallback = callback;
+    }
+
+    /**
+     * @brief Allow 3rd party to set the write mode for tiered index - async insert/delete using
+     * background jobs, or insert/delete inplace.
+     *
+     * @param mode VecSimWriteMode the mode in which we add/remove vectors (async or in-place).
+     */
+    static VecSimWriteMode asyncWriteMode;
+    inline static void setWriteMode(VecSimWriteMode mode) {
+        VecSimIndexInterface::asyncWriteMode = mode;
+    }
 };
diff --git a/src/VecSim/vec_sim_tiered_index.h b/src/VecSim/vec_sim_tiered_index.h
index c2dd5dcf7..9023f3567 100644
--- a/src/VecSim/vec_sim_tiered_index.h
+++ b/src/VecSim/vec_sim_tiered_index.h
@@ -2,40 +2,341 @@
 
 #include "vec_sim_index.h"
 #include "algorithms/brute_force/brute_force.h"
+#include "VecSim/batch_iterator.h"
+#include "VecSim/utils/query_result_utils.h"
 
 #include <shared_mutex>
 
+/**
+ * Generic job structure for the asynchronous tiered index: a job type, its callback and index.
+ */
+struct AsyncJob : public VecsimBaseObject {
+    JobType jobType;     // The job type given at construction.
+    JobCallback Execute; // A callback that receives a job as its input and executes the job.
+    VecSimIndex *index;  // The index pointer given at construction.
+    bool isValid;        // Always initialized to true by the constructor.
+    // Stores the given type, callback and index pointer, and marks the new job as valid.
+    AsyncJob(std::shared_ptr<VecSimAllocator> allocator, JobType type, JobCallback callback,
+             VecSimIndex *index_ref)
+        : VecsimBaseObject(allocator), jobType(type), Execute(callback), index(index_ref),
+          isValid(true) {}
+};
+
+// All read operations (including KNN, range, batch iterators and get-distance-from) are guaranteed
+// to consider all vectors that were added to the index before the query was submitted. The results
+// may include vectors that were added after the query was submitted, with no guarantees.
 template <typename DataType, typename DistType>
 class VecSimTieredIndex : public VecSimIndexInterface {
 protected:
-    BruteForceIndex<DataType, DistType> *flatBuffer;
-    VecSimIndexAbstract<DistType> *index;
+    VecSimIndexAbstract<DistType> *backendIndex;
+    BruteForceIndex<DataType, DistType> *frontendIndex;
 
     void *jobQueue;
+    void *jobQueueCtx; // External context to be sent to the submit callback.
     SubmitCB SubmitJobsToQueue;
 
-    void *memoryCtx;
-    UpdateMemoryCB UpdateIndexMemory;
+    mutable std::shared_mutex flatIndexGuard;
+    mutable std::shared_mutex mainIndexGuard;
 
-    // Consider putting these in the derived class instead. Also - see if we should use
-    // std::shared_mutex
-    std::shared_mutex flatIndexGuard;
-    std::shared_mutex mainIndexGuard;
+    size_t flatBufferLimit;
+
+    void submitSingleJob(AsyncJob *job) {
+        this->SubmitJobsToQueue(this->jobQueue, this->jobQueueCtx, &job, &job->Execute, 1);
+    }
+
+    void submitJobs(vecsim_stl::vector<AsyncJob *> &jobs) {
+        vecsim_stl::vector<JobCallback> callbacks(jobs.size(), this->allocator);
+        for (size_t i = 0; i < jobs.size(); i++) {
+            callbacks[i] = jobs[i]->Execute;
+        }
+        this->SubmitJobsToQueue(this->jobQueue, this->jobQueueCtx, jobs.data(), callbacks.data(),
+                                jobs.size());
+    }
 
 public:
-    VecSimTieredIndex(VecSimIndexAbstract<DistType> *index_, TieredIndexParams tieredParams)
-        : VecSimIndexInterface(index_->getAllocator()), index(index_),
-          jobQueue(tieredParams.jobQueue), SubmitJobsToQueue(tieredParams.submitCb),
-          memoryCtx(tieredParams.memoryCtx), UpdateIndexMemory(tieredParams.UpdateMemCb) {
-        BFParams bf_params = {.type = index_->getType(),
-                              .dim = index_->getDim(),
-                              .metric = index_->getMetric(),
-                              .multi = index_->isMultiValue()};
-        flatBuffer = static_cast<BruteForceIndex<DataType, DistType> *>(
-            BruteForceFactory::NewIndex(&bf_params, index->getAllocator()));
-    }
-    ~VecSimTieredIndex() {
-        VecSimIndex_Free(index);
-        VecSimIndex_Free(flatBuffer);
+    VecSimTieredIndex(VecSimIndexAbstract<DistType> *backendIndex_,
+                      BruteForceIndex<DataType, DistType> *frontendIndex_,
+                      TieredIndexParams tieredParams, std::shared_ptr<VecSimAllocator> allocator)
+        : VecSimIndexInterface(allocator), backendIndex(backendIndex_),
+          frontendIndex(frontendIndex_), jobQueue(tieredParams.jobQueue),
+          jobQueueCtx(tieredParams.jobQueueCtx), SubmitJobsToQueue(tieredParams.submitCb),
+          flatBufferLimit(tieredParams.flatBufferLimit) {}
+
+    virtual ~VecSimTieredIndex() {
+        VecSimIndex_Free(backendIndex);
+        VecSimIndex_Free(frontendIndex);
+    }
+
+    VecSimQueryResult_List topKQuery(const void *queryBlob, size_t k,
+                                     VecSimQueryParams *queryParams) const override;
+
+    VecSimQueryResult_List rangeQuery(const void *queryBlob, double radius,
+                                      VecSimQueryParams *queryParams,
+                                      VecSimQueryResult_Order order) const override;
+
+    virtual inline uint64_t getAllocationSize() const override {
+        return this->allocator->getAllocationSize() + this->backendIndex->getAllocationSize() +
+               this->frontendIndex->getAllocationSize();
+    }
+
+    virtual VecSimIndexInfo info() const override;
+    virtual VecSimInfoIterator *infoIterator() const override;
+
+    bool preferAdHocSearch(size_t subsetSize, size_t k, bool initial_check) const override {
+        // For now, decide according to the bigger index.
+        return this->backendIndex->indexSize() > this->frontendIndex->indexSize()
+                   ? this->backendIndex->preferAdHocSearch(subsetSize, k, initial_check)
+                   : this->frontendIndex->preferAdHocSearch(subsetSize, k, initial_check);
+    }
+
+    // Return the current state of the global write mode (async/in-place).
+    static VecSimWriteMode getWriteMode() { return VecSimIndexInterface::asyncWriteMode; }
+#ifdef BUILD_TESTS
+    inline VecSimIndexAbstract<DistType> *getFlatbufferIndex() { return this->frontendIndex; }
+#endif
+
+private:
+    virtual int addVectorWrapper(const void *blob, labelType label, void *auxiliaryCtx) override {
+        // Will be used only if a processing stage is needed
+        char processed_blob[this->backendIndex->getDataSize()];
+        const void *vector_to_add = this->backendIndex->processBlob(blob, processed_blob);
+        return this->addVector(vector_to_add, label, auxiliaryCtx);
+    }
+
+    virtual VecSimQueryResult_List topKQueryWrapper(const void *queryBlob, size_t k,
+                                                    VecSimQueryParams *queryParams) const override {
+        // Will be used only if a processing stage is needed
+        char processed_blob[this->backendIndex->getDataSize()];
+        const void *query_to_send = this->backendIndex->processBlob(queryBlob, processed_blob);
+        return this->topKQuery(query_to_send, k, queryParams);
+    }
+
+    virtual VecSimQueryResult_List rangeQueryWrapper(const void *queryBlob, double radius,
+                                                     VecSimQueryParams *queryParams,
+                                                     VecSimQueryResult_Order order) const override {
+        // Will be used only if a processing stage is needed
+        char processed_blob[this->backendIndex->getDataSize()];
+        const void *query_to_send = this->backendIndex->processBlob(queryBlob, processed_blob);
+
+        return this->rangeQuery(query_to_send, radius, queryParams, order);
+    }
+
+    virtual VecSimBatchIterator *
+    newBatchIteratorWrapper(const void *queryBlob, VecSimQueryParams *queryParams) const override {
+        // Will be used only if a processing stage is needed
+        char processed_blob[this->backendIndex->getDataSize()];
+        const void *query_to_send = this->backendIndex->processBlob(queryBlob, processed_blob);
+
+        return this->newBatchIterator(query_to_send, queryParams);
+    }
+};
+
+template <typename DataType, typename DistType>
+VecSimQueryResult_List
+VecSimTieredIndex<DataType, DistType>::topKQuery(const void *queryBlob, size_t k,
+                                                 VecSimQueryParams *queryParams) const {
+    this->flatIndexGuard.lock_shared();
+    // Overview: query each tier under its own shared lock (never both at once), then merge.
+    // If the flat buffer is empty, we can simply query the main index.
+    if (this->frontendIndex->indexSize() == 0) {
+        // Release the flat lock and acquire the main lock.
+        this->flatIndexGuard.unlock_shared();
+
+        // Query the main index under its shared lock, release the lock and return the results.
+        this->mainIndexGuard.lock_shared();
+        auto res = this->backendIndex->topKQuery(queryBlob, k, queryParams);
+        this->mainIndexGuard.unlock_shared();
+
+        return res;
+    } else {
+        // The flat buffer is not empty: query it first, then release its lock.
+        auto flat_results = this->frontendIndex->topKQuery(queryBlob, k, queryParams);
+        this->flatIndexGuard.unlock_shared();
+
+        // If the query failed (currently only on timeout), return the error code.
+        if (flat_results.code != VecSim_QueryResult_OK) {
+            assert(flat_results.results == nullptr);
+            return flat_results;
+        }
+
+        // Lock the main index and query it.
+        this->mainIndexGuard.lock_shared();
+        auto main_results = this->backendIndex->topKQuery(queryBlob, k, queryParams);
+        this->mainIndexGuard.unlock_shared();
+
+        // If the query failed (currently only on timeout), return the error code.
+        if (main_results.code != VecSim_QueryResult_OK) {
+            // Free the flat results, since only the main index's error is returned.
+            VecSimQueryResult_Free(flat_results);
+
+            assert(main_results.results == nullptr);
+            return main_results;
+        }
+
+        // Merge both lists into at most k results, avoiding duplicates (multi/single variant).
+        if (this->backendIndex->isMultiValue()) {
+            return merge_result_lists<true>(main_results, flat_results, k);
+        } else {
+            return merge_result_lists<false>(main_results, flat_results, k);
+        }
+    }
+}
+
+template <typename DataType, typename DistType>
+VecSimQueryResult_List
+VecSimTieredIndex<DataType, DistType>::rangeQuery(const void *queryBlob, double radius,
+                                                  VecSimQueryParams *queryParams,
+                                                  VecSimQueryResult_Order order) const {
+    this->flatIndexGuard.lock_shared();
+    // Overview: query each tier under its own shared lock (never both at once), then combine.
+    // If the flat buffer is empty, we can simply query the main index.
+    if (this->frontendIndex->indexSize() == 0) {
+        // Release the flat lock and acquire the main lock.
+        this->flatIndexGuard.unlock_shared();
+
+        // Query the main index under its shared lock and release the lock before sorting.
+        this->mainIndexGuard.lock_shared();
+        auto res = this->backendIndex->rangeQuery(queryBlob, radius, queryParams);
+        this->mainIndexGuard.unlock_shared();
+
+        // We could have passed the order to the main index, but sorting here keeps the work
+        // outside of the locked section.
+        sort_results(res, order);
+        return res;
+    } else {
+        // The flat buffer is not empty: query it first, then release its lock.
+        auto flat_results = this->frontendIndex->rangeQuery(queryBlob, radius, queryParams);
+        this->flatIndexGuard.unlock_shared();
+
+        // If the query failed (currently only on timeout), return the error code and the partial
+        // results.
+        if (flat_results.code != VecSim_QueryResult_OK) {
+            return flat_results;
+        }
+
+        // Lock the main index and query it.
+        this->mainIndexGuard.lock_shared();
+        auto main_results = this->backendIndex->rangeQuery(queryBlob, radius, queryParams);
+        this->mainIndexGuard.unlock_shared();
+
+        // Merge the results and return, avoiding duplicates.
+        // At this point, the return code of the FLAT index is OK, and the return code of the MAIN
+        // index is either OK or TIMEOUT. Make sure to return the return code of the MAIN index.
+        if (BY_SCORE == order) {
+            sort_results_by_score_then_id(main_results);
+            sort_results_by_score_then_id(flat_results);
+
+            // Keep the return code of the main index.
+            auto code = main_results.code;
+
+            // Merge the sorted results with no limit (-1): all the results are valid.
+            VecSimQueryResult_List ret;
+            if (this->backendIndex->isMultiValue()) {
+                ret = merge_result_lists<true>(main_results, flat_results, -1);
+            } else {
+                ret = merge_result_lists<false>(main_results, flat_results, -1);
+            }
+            // Restore the return code and return.
+            ret.code = code;
+            return ret;
+
+        } else { // BY_ID
+            // Notice that we don't modify the return code of the main index in any step.
+            concat_results(main_results, flat_results);
+            if (this->backendIndex->isMultiValue()) {
+                filter_results_by_id<true>(main_results);
+            } else {
+                filter_results_by_id<false>(main_results);
+            }
+            return main_results;
+        }
+    }
+}
+
+template <typename DataType, typename DistType>
+VecSimIndexInfo VecSimTieredIndex<DataType, DistType>::info() const {
+    VecSimIndexInfo info;
+    this->flatIndexGuard.lock();
+    VecSimIndexInfo frontendInfo = this->frontendIndex->info();
+    this->flatIndexGuard.unlock();
+
+    this->mainIndexGuard.lock();
+    VecSimIndexInfo backendInfo = this->backendIndex->info();
+    this->mainIndexGuard.unlock();
+
+    info.commonInfo.indexLabelCount = this->indexLabelCount();
+    info.commonInfo.indexSize =
+        frontendInfo.commonInfo.indexSize + backendInfo.commonInfo.indexSize;
+    info.commonInfo.memory = this->getAllocationSize();
+    info.commonInfo.last_mode = backendInfo.commonInfo.last_mode;
+
+    VecSimIndexBasicInfo basic_info{.algo = backendInfo.commonInfo.basicInfo.algo,
+                                    .blockSize = backendInfo.commonInfo.basicInfo.blockSize,
+                                    .metric = backendInfo.commonInfo.basicInfo.metric,
+                                    .type = backendInfo.commonInfo.basicInfo.type,
+                                    .isMulti = this->backendIndex->isMultiValue(),
+                                    .dim = backendInfo.commonInfo.basicInfo.dim,
+                                    .isTiered = true};
+    info.commonInfo.basicInfo = basic_info;
+
+    switch (backendInfo.commonInfo.basicInfo.algo) {
+    case VecSimAlgo_HNSWLIB:
+        info.tieredInfo.backendInfo.hnswInfo = backendInfo.hnswInfo;
+        break;
+    case VecSimAlgo_BF:
+    case VecSimAlgo_TIERED:
+        assert(false && "Invalid backend algorithm");
     }
+
+    info.tieredInfo.backendCommonInfo = backendInfo.commonInfo;
+    // For now, this is hard coded to FLAT
+    info.tieredInfo.frontendCommonInfo = frontendInfo.commonInfo;
+    info.tieredInfo.bfInfo = frontendInfo.bfInfo;
+
+    info.tieredInfo.backgroundIndexing = frontendInfo.commonInfo.indexSize > 0;
+    info.tieredInfo.management_layer_memory = this->allocator->getAllocationSize();
+    info.tieredInfo.bufferLimit = this->flatBufferLimit;
+    return info;
+}
+
+template <typename DataType, typename DistType>
+VecSimInfoIterator *VecSimTieredIndex<DataType, DistType>::infoIterator() const {
+    VecSimIndexInfo info = this->info();
+    // For readability. Update this number when needed.
+    size_t numberOfInfoFields = 14;
+    auto *infoIterator = new VecSimInfoIterator(numberOfInfoFields);
+
+    // Set tiered explicitly as algo name for root iterator.
+    infoIterator->addInfoField(VecSim_InfoField{
+        .fieldName = VecSimCommonStrings::ALGORITHM_STRING,
+        .fieldType = INFOFIELD_STRING,
+        .fieldValue = {FieldValue{.stringValue = VecSimCommonStrings::TIERED_STRING}}});
+
+    this->backendIndex->addCommonInfoToIterator(infoIterator, info.commonInfo);
+
+    infoIterator->addInfoField(VecSim_InfoField{
+        .fieldName = VecSimCommonStrings::TIERED_MANAGEMENT_MEMORY_STRING,
+        .fieldType = INFOFIELD_UINT64,
+        .fieldValue = {FieldValue{.uintegerValue = info.tieredInfo.management_layer_memory}}});
+
+    infoIterator->addInfoField(VecSim_InfoField{
+        .fieldName = VecSimCommonStrings::TIERED_BACKGROUND_INDEXING_STRING,
+        .fieldType = INFOFIELD_UINT64,
+        .fieldValue = {FieldValue{.uintegerValue = info.tieredInfo.backgroundIndexing}}});
+
+    infoIterator->addInfoField(
+        VecSim_InfoField{.fieldName = VecSimCommonStrings::TIERED_BUFFER_LIMIT_STRING,
+                         .fieldType = INFOFIELD_UINT64,
+                         .fieldValue = {FieldValue{.uintegerValue = info.tieredInfo.bufferLimit}}});
+
+    infoIterator->addInfoField(VecSim_InfoField{
+        .fieldName = VecSimCommonStrings::FRONTEND_INDEX_STRING,
+        .fieldType = INFOFIELD_ITERATOR,
+        .fieldValue = {FieldValue{.iteratorValue = this->frontendIndex->infoIterator()}}});
+
+    infoIterator->addInfoField(VecSim_InfoField{
+        .fieldName = VecSimCommonStrings::BACKEND_INDEX_STRING,
+        .fieldType = INFOFIELD_ITERATOR,
+        .fieldValue = {FieldValue{.iteratorValue = this->backendIndex->infoIterator()}}});
+    return infoIterator;
 };
diff --git a/src/VecSim/version.h b/src/VecSim/version.h
index 01cf05413..7fba1cb0a 100644
--- a/src/VecSim/version.h
+++ b/src/VecSim/version.h
@@ -6,6 +6,6 @@
 
 #pragma once
 
-#define VSS_VERSION_MAJOR 99
-#define VSS_VERSION_MINOR 99
-#define VSS_VERSION_PATCH 99
+#define VSS_VERSION_MAJOR 0
+#define VSS_VERSION_MINOR 7
+#define VSS_VERSION_PATCH 1
diff --git a/src/python_bindings/Mybytearray.py b/src/python_bindings/Mybytearray.py
deleted file mode 100644
index 2a391aee6..000000000
--- a/src/python_bindings/Mybytearray.py
+++ /dev/null
@@ -1,3 +0,0 @@
-
-def create_bytearray(np_arr):
-        return bytearray(np_arr)
diff --git a/src/python_bindings/bindings.cpp b/src/python_bindings/bindings.cpp
index f794f31de..ecff84d18 100644
--- a/src/python_bindings/bindings.cpp
+++ b/src/python_bindings/bindings.cpp
@@ -6,45 +6,56 @@
 
 #include "VecSim/vec_sim.h"
 #include "VecSim/algorithms/hnsw/hnsw.h"
-#include "VecSim/algorithms/hnsw/hnsw_factory.h"
+#include "VecSim/index_factories/hnsw_factory.h"
 #include "VecSim/batch_iterator.h"
 
 #include "pybind11/pybind11.h"
 #include "pybind11/numpy.h"
 #include "pybind11/stl.h"
 #include <cstring>
+#include <thread>
+#include <VecSim/algorithms/hnsw/hnsw_single.h>
+#include <VecSim/algorithms/brute_force/brute_force_single.h>
+#include "tiered_index_mock.h"
 
 namespace py = pybind11;
-
-// Helper function that iterates query results and wrap them in python object -
-// a tuple of two lists: (ids, scores)
-py::object wrap_results(VecSimQueryResult_List res, size_t len) {
-    size_t *data_numpy_l = new size_t[len];
-    double *data_numpy_d = new double[len];
-    VecSimQueryResult_Iterator *iterator = VecSimQueryResult_List_GetIterator(res);
-    int res_ind = 0;
-    while (VecSimQueryResult_IteratorHasNext(iterator)) {
-        VecSimQueryResult *item = VecSimQueryResult_IteratorNext(iterator);
-        int id = VecSimQueryResult_GetId(item);
-        double score = VecSimQueryResult_GetScore(item);
-        data_numpy_d[res_ind] = score;
-        data_numpy_l[res_ind++] = id;
+using namespace tiered_index_mock;
+
+// Helper function that iterates query results and wrap them in python numpy object -
+// a tuple of two 2D arrays: (labels, distances)
+py::object wrap_results(VecSimQueryResult_List *res, size_t num_res, size_t num_queries = 1) {
+    auto *data_numpy_l = new long[num_res * num_queries];
+    auto *data_numpy_d = new double[num_res * num_queries];
+    // Default "padding" for the entries that will stay empty (when KNN returns fewer than k
+    // results, or when a range query returns fewer results than the maximum in the batch, which
+    // determines the arrays' shape).
+    std::fill_n(data_numpy_l, num_res * num_queries, -1);
+    std::fill_n(data_numpy_d, num_res * num_queries, -1.0);
+
+    for (size_t i = 0; i < num_queries; i++) {
+        VecSimQueryResult_Iterator *iterator = VecSimQueryResult_List_GetIterator(res[i]);
+        size_t res_ind = i * num_res;
+        while (VecSimQueryResult_IteratorHasNext(iterator)) {
+            VecSimQueryResult *item = VecSimQueryResult_IteratorNext(iterator);
+            data_numpy_d[res_ind] = VecSimQueryResult_GetScore(item);
+            data_numpy_l[res_ind++] = (long)VecSimQueryResult_GetId(item);
+        }
+        VecSimQueryResult_IteratorFree(iterator);
+        VecSimQueryResult_Free(res[i]);
     }
-    VecSimQueryResult_IteratorFree(iterator);
-    VecSimQueryResult_Free(res);
 
-    py::capsule free_when_done_l(data_numpy_l, [](void *f) { delete[] f; });
-    py::capsule free_when_done_d(data_numpy_d, [](void *f) { delete[] f; });
+    py::capsule free_when_done_l(data_numpy_l, [](void *labels) { delete[](long *) labels; });
+    py::capsule free_when_done_d(data_numpy_d, [](void *dists) { delete[](double *) dists; });
     return py::make_tuple(
-        py::array_t<size_t>(
-            {(size_t)1, len},                       // shape
-            {len * sizeof(size_t), sizeof(size_t)}, // C-style contiguous strides for double
-            data_numpy_l,                           // the data pointer
+        py::array_t<long>(
+            {(size_t)num_queries, num_res},         // shape
+            {num_res * sizeof(long), sizeof(long)}, // C-style contiguous strides for long
+            data_numpy_l,                           // the data pointer (labels array)
             free_when_done_l),
         py::array_t<double>(
-            {(size_t)1, len},                       // shape
-            {len * sizeof(double), sizeof(double)}, // C-style contiguous strides for double
-            data_numpy_d,                           // the data pointer
+            {(size_t)num_queries, num_res},             // shape
+            {num_res * sizeof(double), sizeof(double)}, // C-style contiguous strides for double
+            data_numpy_d,                               // the data pointer (distances array)
             free_when_done_d));
 }
 
@@ -63,86 +74,178 @@ class PyBatchIterator {
     bool hasNext() { return VecSimBatchIterator_HasNext(batchIterator.get()); }
 
     py::object getNextResults(size_t n_res, VecSimQueryResult_Order order) {
-        VecSimQueryResult_List results =
-            VecSimBatchIterator_Next(batchIterator.get(), n_res, order);
+        VecSimQueryResult_List results;
+        {
+            // We create this object inside the scope to enable parallel execution of the batch
+            // iterator from different Python threads.
+            py::gil_scoped_release py_gil;
+            results = VecSimBatchIterator_Next(batchIterator.get(), n_res, order);
+        }
         // The number of results may be lower than n_res, if there are less than n_res remaining
         // vectors in the index that hadn't been returned yet.
         size_t actual_n_res = VecSimQueryResult_Len(results);
-        return wrap_results(results, actual_n_res);
+        return wrap_results(&results, actual_n_res);
     }
     void reset() { VecSimBatchIterator_Reset(batchIterator.get()); }
     virtual ~PyBatchIterator() {}
 };
+
 // @input or @query arguments are a py::object object. (numpy arrays are acceptable)
+class PyVecSimIndex {
+private:
+    template <typename DataType>
+    inline py::object rawVectorsAsNumpy(labelType label, size_t dim) {
+        std::vector<std::vector<DataType>> vectors;
+        if (index->basicInfo().algo == VecSimAlgo_BF) {
+            reinterpret_cast<BruteForceIndex<DataType, DataType> *>(this->index.get())
+                ->getDataByLabel(label, vectors);
+        } else {
+            // index is HNSW
+            reinterpret_cast<HNSWIndex<DataType, DataType> *>(this->index.get())
+                ->getDataByLabel(label, vectors);
+        }
+        size_t n_vectors = vectors.size();
+        auto *data_numpy = new DataType[n_vectors * dim];
+        // Copy the vector blobs into one contiguous array of data, and free the original buffer
+        // afterwards.
+        for (size_t i = 0; i < n_vectors; i++) {
+            memcpy(data_numpy + i * dim, vectors[i].data(), dim * sizeof(DataType));
+        }
 
-// To convert input or query to a pointer use input_to_blob(input)
-// For example:
-// VecSimIndex_AddVector(index, input_to_blob(input), id);
+        py::capsule free_when_done(data_numpy,
+                                   [](void *vector_data) { delete[](DataType *) vector_data; });
+        return py::array_t<DataType>(
+            {n_vectors, dim}, // shape
+            {dim * sizeof(DataType),
+             sizeof(DataType)}, // C-style contiguous strides for the data type
+            data_numpy,         // the data pointer
+            free_when_done);
+    }
+
+protected:
+    std::shared_ptr<VecSimIndex> index;
+
+    inline VecSimQueryResult_List searchKnnInternal(const char *query, size_t k,
+                                                    VecSimQueryParams *query_params) {
+        return VecSimIndex_TopKQuery(index.get(), query, k, query_params, BY_SCORE);
+    }
+
+    inline void addVectorInternal(const char *vector_data, size_t id) {
+        VecSimIndex_AddVector(index.get(), vector_data, id);
+    }
+
+    inline VecSimQueryResult_List searchRangeInternal(const char *query, double radius,
+                                                      VecSimQueryParams *query_params) {
+        return VecSimIndex_RangeQuery(index.get(), query, radius, query_params, BY_SCORE);
+    }
 
-class PyVecSimIndex {
 public:
-    PyVecSimIndex()
-        : create_bytearray(
-              py::module::import("src.python_bindings.Mybytearray").attr("create_bytearray")) {}
+    PyVecSimIndex() = default;
 
-    PyVecSimIndex(const VecSimParams &params)
-        : create_bytearray(
-              py::module::import("src.python_bindings.Mybytearray").attr("create_bytearray")) {
+    explicit PyVecSimIndex(const VecSimParams &params) {
         index = std::shared_ptr<VecSimIndex>(VecSimIndex_New(&params), VecSimIndex_Free);
     }
 
-    void addVector(py::object input, size_t id) {
-        VecSimIndex_AddVector(index.get(), input_to_blob(input), id);
+    void addVector(const py::object &input, size_t id) {
+        py::array vector_data(input);
+        py::gil_scoped_release py_gil;
+        addVectorInternal((const char *)vector_data.data(0), id);
     }
+
     void deleteVector(size_t id) { VecSimIndex_DeleteVector(index.get(), id); }
 
-    py::object knn(py::object input, size_t k, VecSimQueryParams *query_params) {
-        VecSimQueryResult_List res =
-            VecSimIndex_TopKQuery(index.get(), input_to_blob(input), k, query_params, BY_SCORE);
-        if (VecSimQueryResult_Len(res) != k) {
-            throw std::runtime_error("Cannot return the results in a contiguous 2D array. Probably "
-                                     "ef or M is too small");
+    py::object knn(const py::object &input, size_t k, VecSimQueryParams *query_params) {
+        py::array query(input);
+        VecSimQueryResult_List res;
+        {
+            py::gil_scoped_release py_gil;
+            res = searchKnnInternal((const char *)query.data(0), k, query_params);
         }
-        return wrap_results(res, k);
+        return wrap_results(&res, k);
     }
 
-    py::object range(py::object input, double radius, VecSimQueryParams *query_params) {
-        VecSimQueryResult_List res = VecSimIndex_RangeQuery(index.get(), input_to_blob(input),
-                                                            radius, query_params, BY_SCORE);
-        return wrap_results(res, VecSimQueryResult_Len(res));
+    py::object range(const py::object &input, double radius, VecSimQueryParams *query_params) {
+        py::array query(input);
+        VecSimQueryResult_List res;
+        {
+            py::gil_scoped_release py_gil;
+            res = searchRangeInternal((const char *)query.data(0), radius, query_params);
+        }
+        return wrap_results(&res, VecSimQueryResult_Len(res));
     }
 
     size_t indexSize() { return VecSimIndex_IndexSize(index.get()); }
 
-    PyBatchIterator createBatchIterator(py::object input, VecSimQueryParams *query_params) {
+    size_t indexMemory() { return this->index->getAllocationSize(); }
+
+    PyBatchIterator createBatchIterator(const py::object &input, VecSimQueryParams *query_params) {
+        py::array query(input);
         return PyBatchIterator(
-            index, VecSimBatchIterator_New(index.get(), input_to_blob(input), query_params));
+            index, VecSimBatchIterator_New(index.get(), (const char *)query.data(0), query_params));
     }
 
-    virtual ~PyVecSimIndex() {} // Delete function was given to the shared pointer object
+    py::object getVector(labelType label) {
+        VecSimIndexInfo info = index->info();
+        size_t dim = info.commonInfo.basicInfo.dim;
+        if (info.commonInfo.basicInfo.type == VecSimType_FLOAT32) {
+            return rawVectorsAsNumpy<float>(label, dim);
+        } else if (info.commonInfo.basicInfo.type == VecSimType_FLOAT64) {
+            return rawVectorsAsNumpy<double>(label, dim);
+        } else {
+            throw std::runtime_error("Invalid vector data type");
+        }
+    }
 
-protected:
-    std::shared_ptr<VecSimIndex> index;
+    virtual ~PyVecSimIndex() = default; // Delete function was given to the shared pointer object
+};
 
+class PyHNSWLibIndex : public PyVecSimIndex {
 private:
-    // save the bytearray to keep its pointer valid
-    py::bytearray tmp_bytearray;
-    const py::function create_bytearray;
-    const char *input_to_blob(py::object input) {
-        tmp_bytearray = create_bytearray(input);
-        return PyByteArray_AS_STRING(tmp_bytearray.ptr());
+    template <typename search_param_t> // size_t/double for KNN/range queries.
+    using QueryFunc =
+        std::function<VecSimQueryResult_List(const char *, search_param_t, VecSimQueryParams *)>;
+
+    template <typename search_param_t> // size_t/double for KNN / range queries.
+    void runParallelQueries(const py::array &queries, size_t n_queries, search_param_t param,
+                            VecSimQueryParams *query_params, int n_threads,
+                            QueryFunc<search_param_t> queryFunc, VecSimQueryResult_List *results) {
+        // Non-positive n_threads: use the hardware thread count, or 1 if it is reported as 0.
+        if (n_threads <= 0) {
+            const unsigned hw_threads = std::thread::hardware_concurrency();
+            n_threads = hw_threads > 0 ? (int)hw_threads : 1;
+        }
+        std::atomic_size_t global_counter(0);
+        // Capture `queries` by reference: copying a py::array in a worker would need the GIL.
+        auto parallel_search = [&]() {
+            while (true) {
+                size_t ind = global_counter.fetch_add(1);
+                if (ind >= n_queries) {
+                    break;
+                }
+                results[ind] = queryFunc((const char *)queries.data(ind), param, query_params);
+            }
+        };
+        std::vector<std::thread> thread_objs((size_t)n_threads);
+        {
+            // Release python GIL while threads are running.
+            py::gil_scoped_release py_gil;
+            for (auto &thread_obj : thread_objs) {
+                thread_obj = std::thread(parallel_search);
+            }
+            for (auto &thread_obj : thread_objs) {
+                thread_obj.join();
+            }
+        }
+    }
-};
 
-class PyHNSWLibIndex : public PyVecSimIndex {
 public:
-    PyHNSWLibIndex(const HNSWParams &hnsw_params) {
+    explicit PyHNSWLibIndex(const HNSWParams &hnsw_params) {
         VecSimParams params = {.algo = VecSimAlgo_HNSWLIB, .hnswParams = hnsw_params};
         this->index = std::shared_ptr<VecSimIndex>(VecSimIndex_New(&params), VecSimIndex_Free);
     }
 
     // @params is required only in V1.
-    PyHNSWLibIndex(const std::string &location, const HNSWParams *hnsw_params = nullptr) {
+    explicit PyHNSWLibIndex(const std::string &location, const HNSWParams *hnsw_params = nullptr) {
         this->index = std::shared_ptr<VecSimIndex>(HNSWFactory::NewIndex(location, hnsw_params),
                                                    VecSimIndex_Free);
     }
@@ -155,11 +258,205 @@ class PyHNSWLibIndex : public PyVecSimIndex {
         auto *hnsw = reinterpret_cast<HNSWIndex<float, float> *>(index.get());
         hnsw->saveIndex(location);
     }
+    py::object searchKnnParallel(const py::object &input, size_t k, VecSimQueryParams *query_params,
+                                 int n_threads) {
+        // Top-k search for every row of the 2D `input` array, split across `n_threads` threads.
+        py::array queries(input);
+        if (queries.ndim() != 2) {
+            throw std::runtime_error("Input queries array must be 2D array");
+        }
+        size_t n_queries = queries.shape(0);
+        std::function<VecSimQueryResult_List(const char *, size_t, VecSimQueryParams *)>
+            searchKnnWrapper([this](const char *query_, size_t k_,
+                                    VecSimQueryParams *query_params_) -> VecSimQueryResult_List {
+                return this->searchKnnInternal(query_, k_, query_params_);
+            });
+        std::vector<VecSimQueryResult_List> results(n_queries); // Heap-backed, not a stack VLA.
+        runParallelQueries<size_t>(queries, n_queries, k, query_params, n_threads, searchKnnWrapper,
+                                   results.data());
+        return wrap_results(results.data(), k, n_queries);
+    }
+    py::object searchRangeParallel(const py::object &input, double radius,
+                                   VecSimQueryParams *query_params, int n_threads) {
+        // Range search for every row of the 2D `input` array, split across `n_threads` threads.
+        py::array queries(input);
+        if (queries.ndim() != 2) {
+            throw std::runtime_error("Input queries array must be 2D array");
+        }
+        size_t n_queries = queries.shape(0);
+        std::function<VecSimQueryResult_List(const char *, double, VecSimQueryParams *)>
+            searchRangeWrapper([this](const char *query_, double radius_,
+                                      VecSimQueryParams *query_params_) -> VecSimQueryResult_List {
+                return this->searchRangeInternal(query_, radius_, query_params_);
+            });
+        std::vector<VecSimQueryResult_List> results(n_queries); // Heap-backed, not a stack VLA.
+        runParallelQueries<double>(queries, n_queries, radius, query_params, n_threads,
+                                   searchRangeWrapper, results.data());
+        size_t max_results_num = 1;
+        for (size_t i = 0; i < n_queries; i++) {
+            size_t res_len = VecSimQueryResult_Len(results[i]);
+            max_results_num = res_len > max_results_num ? res_len : max_results_num;
+        }
+        // We return 2D numpy array of results (labels and distances), use padding of "-1" in the
+        // empty entries of the matrices.
+        return wrap_results(results.data(), max_results_num, n_queries);
+    }
+
+    void addVectorsParallel(const py::object &input, const py::object &vectors_labels,
+                            int n_threads) {
+        // Insert every row of the 2D `input` array under the matching entry of `vectors_labels`.
+        py::array vectors_data(input);
+        py::array_t<labelType, py::array::c_style | py::array::forcecast> labels(vectors_labels);
+        if (vectors_data.ndim() != 2) {
+            throw std::runtime_error("Input vectors data array must be 2D array");
+        }
+        if (labels.ndim() != 1) {
+            throw std::runtime_error("Input vectors labels array must be 1D array");
+        }
+        if (vectors_data.shape(0) != labels.shape(0)) {
+            throw std::runtime_error(
+                "The first dim of vectors data and labels arrays must be equal");
+        }
+        size_t n_vectors = vectors_data.shape(0);
+        // Use number of hardware cores as default number of threads, unless specified otherwise.
+        if (n_threads <= 0) {
+            n_threads = (int)std::thread::hardware_concurrency();
+        }
+        // hardware_concurrency() may report 0; always keep one worker so the vectors get inserted.
+        n_threads = n_threads > 0 ? n_threads : 1;
+
+        std::atomic_size_t next_id(0);
+        // Capture the arrays by reference: copying a py::array touches the Python reference count,
+        // which is not safe once the GIL is released below.
+        auto parallel_insert = [&]() {
+            for (size_t row = next_id.fetch_add(1); row < n_vectors; row = next_id.fetch_add(1)) {
+                this->addVectorInternal((const char *)vectors_data.data(row), labels.at(row));
+            }
+        };
+
+        std::vector<std::thread> thread_objs;
+        thread_objs.reserve(n_threads);
+        {
+            // Release python GIL while threads are running.
+            py::gil_scoped_release py_gil;
+            for (int i = 0; i < n_threads; i++) {
+                thread_objs.emplace_back(parallel_insert);
+            }
+            for (auto &worker : thread_objs) {
+                worker.join();
+            }
+        }
+    }
+
+    bool checkIntegrity() {
+        switch (VecSimIndex_Info(this->index.get()).commonInfo.basicInfo.type) { // Element type.
+        case VecSimType_FLOAT32:
+            return reinterpret_cast<HNSWIndex<float, float> *>(this->index.get())
+                ->checkIntegrity()
+                .valid_state;
+        case VecSimType_FLOAT64:
+            return reinterpret_cast<HNSWIndex<double, double> *>(this->index.get())
+                ->checkIntegrity()
+                .valid_state;
+        default:
+            throw std::runtime_error("Invalid index data type");
+        }
+    }
+};
+
+// Base class for the Python tiered indexes: owns the job queue handed to the tiered index and
+// starts the worker threads that consume it.
+class PyTieredIndex : public PyVecSimIndex {
+private:
+    VecSimIndexAbstract<float> *getFlatBuffer() {
+        return reinterpret_cast<VecSimTieredIndex<float, float> *>(this->index.get())
+            ->getFlatbufferIndex();
+    }
+
+protected:
+    JobQueue jobQueue;       // External queue that holds the jobs.
+    IndexExtCtx jobQueueCtx; // External context to be sent to the submit callback.
+    SubmitCB submitCb;       // A callback that submits an array of jobs into a given jobQueue.
+    size_t flatBufferLimit; // Maximum size allowed for the flat buffer. If flat buffer is full, use
+                            // in-place insertion.
+    bool run_thread;                              // Workers keep running while this is true.
+    std::bitset<MAX_POOL_SIZE> executions_status; // Bit i is set while worker i runs a job.
+
+    TieredIndexParams TieredIndexParams_Init() {
+        TieredIndexParams ret = {
+            .jobQueue = &this->jobQueue,
+            .jobQueueCtx = &this->jobQueueCtx,
+            .submitCb = this->submitCb,
+            .flatBufferLimit = this->flatBufferLimit,
+        };
+
+        return ret;
+    }
+
+public:
+    explicit PyTieredIndex(size_t BufferLimit = 3000000)
+        : submitCb(submit_callback), flatBufferLimit(BufferLimit), run_thread(true) {
+        // Start the workers; each one gets references to this object's flag, bitset and queue.
+        for (size_t i = 0; i < THREAD_POOL_SIZE; i++) {
+            ThreadParams params(run_thread, executions_status, i, jobQueue);
+            thread_pool.emplace_back(thread_main_loop, params);
+        }
+    }
+
+    // Pure virtual so this class stays abstract; the definition follows the class body.
+    virtual ~PyTieredIndex() = 0;
+
+    void WaitForIndex(size_t waiting_duration = 10) {
+        // Poll until no job is queued and no worker thread is in the middle of executing one.
+        const auto poll_interval = std::chrono::milliseconds(waiting_duration);
+        while (true) {
+            std::this_thread::sleep_for(poll_interval);
+            // Hold the queue lock only for the check itself, never while sleeping, so that
+            // workers and job submitters are not blocked in the meantime.
+            std::unique_lock<std::mutex> lock(queue_guard);
+            if (jobQueue.empty() && executions_status.count() == 0) {
+                return;
+            }
+        }
+    }
+
+    size_t getFlatIndexSize() { return getFlatBuffer()->indexLabelCount(); }
+
+    static size_t GetThreadsNum() { return THREAD_POOL_SIZE; }
+
+    size_t getBufferLimit() { return flatBufferLimit; }
+};
+// Drain the job queue and join the worker threads before the members are destroyed.
+PyTieredIndex::~PyTieredIndex() { thread_pool_terminate(jobQueue, run_thread); }
+class PyTiered_HNSWIndex : public PyTieredIndex {
+public:
+    explicit PyTiered_HNSWIndex(const HNSWParams &hnsw_params,
+                                const TieredHNSWParams &tiered_hnsw_params) {
+        // Builds a tiered index whose primary (backend) index is an HNSW index.
+        // Create primaryIndexParams and specific params for hnsw tiered index.
+        VecSimParams primary_index_params = {.algo = VecSimAlgo_HNSWLIB, .hnswParams = hnsw_params};
+
+        // Create TieredIndexParams pre-filled with this object's job queue, context and callback.
+        TieredIndexParams tiered_params = TieredIndexParams_Init();
+
+        tiered_params.primaryIndexParams = &primary_index_params;
+        tiered_params.specificParams.tieredHnswParams = tiered_hnsw_params;
+
+        // Create VecSimParams for TieredIndexParams.
+        VecSimParams params = {.algo = VecSimAlgo_TIERED, .tieredParams = tiered_params};
+
+        this->index = std::shared_ptr<VecSimIndex>(VecSimIndex_New(&params), VecSimIndex_Free);
+        // Set the created tiered index in the index external context.
+        this->jobQueueCtx.index_strong_ref = this->index;
+    }
+    size_t HNSWLabelCount() { // Label count taken from the tiered info's backend section.
+        return this->index->info().tieredInfo.backendCommonInfo.indexLabelCount;
+    }
 };
 
 class PyBFIndex : public PyVecSimIndex {
 public:
-    PyBFIndex(const BFParams &bf_params) {
+    explicit PyBFIndex(const BFParams &bf_params) {
         VecSimParams params = {.algo = VecSimAlgo_BF, .bfParams = bf_params};
         this->index = std::shared_ptr<VecSimIndex>(VecSimIndex_New(&params), VecSimIndex_Free);
     }
@@ -210,6 +507,10 @@ PYBIND11_MODULE(VecSim, m) {
         .def_readwrite("initialCapacity", &BFParams::initialCapacity)
         .def_readwrite("blockSize", &BFParams::blockSize);
 
+    py::class_<TieredHNSWParams>(m, "TieredHNSWParams")
+        .def(py::init())
+        .def_readwrite("swapJobThreshold", &TieredHNSWParams::swapJobThreshold);
+
     py::class_<VecSimParams>(m, "VecSimParams")
         .def(py::init())
         .def_readwrite("algo", &VecSimParams::algo)
@@ -236,8 +537,10 @@ PYBIND11_MODULE(VecSim, m) {
         .def("range_query", &PyVecSimIndex::range, py::arg("vector"), py::arg("radius"),
              py::arg("query_param") = nullptr)
         .def("index_size", &PyVecSimIndex::indexSize)
+        .def("index_memory", &PyVecSimIndex::indexMemory)
         .def("create_batch_iterator", &PyVecSimIndex::createBatchIterator, py::arg("query_blob"),
-             py::arg("query_param") = nullptr);
+             py::arg("query_param") = nullptr)
+        .def("get_vector", &PyVecSimIndex::getVector);
 
     py::class_<PyHNSWLibIndex, PyVecSimIndex>(m, "HNSWIndex")
         .def(py::init([](const HNSWParams &params) { return new PyHNSWLibIndex(params); }),
@@ -247,7 +550,28 @@ PYBIND11_MODULE(VecSim, m) {
              }),
              py::arg("location"), py::arg("params") = nullptr)
         .def("set_ef", &PyHNSWLibIndex::setDefaultEf)
-        .def("save_index", &PyHNSWLibIndex::saveIndex);
+        .def("save_index", &PyHNSWLibIndex::saveIndex)
+        .def("knn_parallel", &PyHNSWLibIndex::searchKnnParallel, py::arg("queries"), py::arg("k"),
+             py::arg("query_param") = nullptr, py::arg("num_threads") = -1)
+        .def("add_vector_parallel", &PyHNSWLibIndex::addVectorsParallel, py::arg("vectors"),
+             py::arg("labels"), py::arg("num_threads") = -1)
+        .def("check_integrity", &PyHNSWLibIndex::checkIntegrity)
+        .def("range_parallel", &PyHNSWLibIndex::searchRangeParallel, py::arg("queries"),
+             py::arg("radius"), py::arg("query_param") = nullptr, py::arg("num_threads") = -1);
+
+    py::class_<PyTieredIndex, PyVecSimIndex>(m, "TieredIndex")
+        .def("wait_for_index", &PyTiered_HNSWIndex::WaitForIndex, py::arg("waiting_duration") = 10)
+        .def("get_curr_bf_size", &PyTiered_HNSWIndex::getFlatIndexSize)
+        .def("get_buffer_limit", &PyTiered_HNSWIndex::getBufferLimit)
+        .def_static("get_threads_num", &PyTieredIndex::GetThreadsNum);
+
+    py::class_<PyTiered_HNSWIndex, PyTieredIndex>(m, "Tiered_HNSWIndex")
+        .def(
+            py::init([](const HNSWParams &hnsw_params, const TieredHNSWParams &tiered_hnsw_params) {
+                return new PyTiered_HNSWIndex(hnsw_params, tiered_hnsw_params);
+            }),
+            py::arg("hnsw_params"), py::arg("tiered_hnsw_params"))
+        .def("hnsw_label_count", &PyTiered_HNSWIndex::HNSWLabelCount);
 
     py::class_<PyBFIndex, PyVecSimIndex>(m, "BFIndex")
         .def(py::init([](const BFParams &params) { return new PyBFIndex(params); }),
diff --git a/src/python_bindings/tiered_index_mock.h b/src/python_bindings/tiered_index_mock.h
new file mode 100644
index 000000000..7f2204e06
--- /dev/null
+++ b/src/python_bindings/tiered_index_mock.h
@@ -0,0 +1,129 @@
+/*
+ *Copyright Redis Ltd. 2021 - present
+ *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or
+ *the Server Side Public License v1 (SSPLv1).
+ */
+
+#pragma once
+
+#include <thread>
+#include <condition_variable>
+#include <bitset>
+
+#include "VecSim/vec_sim.h"
+#include "VecSim/algorithms/hnsw/hnsw_tiered.h"
+#include "pybind11/pybind11.h"
+
+namespace tiered_index_mock {
+
+std::mutex queue_guard;               // Locked around job-queue accesses.
+std::condition_variable queue_cond;   // Signalled on job submission and on pool termination.
+std::vector<std::thread> thread_pool; // Worker threads; joined by thread_pool_terminate().
+struct RefManagedJob {
+    AsyncJob *job;                             // The job to execute.
+    std::weak_ptr<VecSimIndex> index_weak_ref; // Locked by the worker before running the job.
+};
+
+using JobQueue = std::queue<RefManagedJob>;
+int submit_callback(void *job_queue, void *index_ctx, AsyncJob **jobs, JobCallback *CBs,
+                    size_t len);
+
+struct IndexExtCtx {
+    std::shared_ptr<VecSimIndex> index_strong_ref; // Source of the weak refs stored with jobs.
+    ~IndexExtCtx() {}
+};
+
+static const size_t MAX_POOL_SIZE = 16; // Upper bound on workers; also the status-bitset width.
+static const size_t THREAD_POOL_SIZE = MIN(MAX_POOL_SIZE, std::thread::hardware_concurrency());
+
+class ThreadParams {
+public:
+    bool &run_thread; // Shared stop flag: the worker loop runs while it is true.
+    // A bit set to help determine whether all jobs are done by checking
+    // that the job queue is empty and that all the bits are 0.
+    // Each thread is associated with a bit position in the bit set.
+    // The thread's corresponding bit should be set before the job is popped
+    // from the queue and the execution starts.
+    // We turn the bit off after the execute callback returns to mark the job is done.
+    std::bitset<MAX_POOL_SIZE> &executions_status;
+    const unsigned int thread_index; // This thread's bit position in executions_status.
+    JobQueue &jobQ;                  // The queue this thread pops jobs from.
+    ThreadParams(bool &run_thread, std::bitset<MAX_POOL_SIZE> &executions_status,
+                 const unsigned int thread_index, JobQueue &jobQ)
+        : run_thread(run_thread), executions_status(executions_status), thread_index(thread_index),
+          jobQ(jobQ) {}
+
+    ThreadParams(const ThreadParams &other) = default;
+};
+
+void inline MarkExecuteInProcess(std::bitset<MAX_POOL_SIZE> &executions_status,
+                                 size_t thread_index) {
+    executions_status.set(thread_index); // Turn the thread's bit on: a job is in progress.
+}
+
+void inline MarkExecuteDone(std::bitset<MAX_POOL_SIZE> &executions_status, size_t thread_index) {
+    executions_status.reset(thread_index); // Turn the thread's bit off: the job has finished.
+}
+
+void thread_main_loop(ThreadParams params) {
+    while (params.run_thread) {
+        std::unique_lock<std::mutex> lock(queue_guard);
+        // Wake up and acquire the lock (atomically) ONLY if the job queue is not empty at that
+        // point, or if the thread should not run anymore (and quit in that case).
+        queue_cond.wait(lock, [&params]() { return !(params.jobQ.empty()) || !params.run_thread; });
+        if (!params.run_thread)
+            return;
+        auto managed_job = params.jobQ.front();
+        MarkExecuteInProcess(params.executions_status, params.thread_index);
+        params.jobQ.pop();
+        lock.unlock();
+        // Upgrade the index weak reference to a strong ref while we run the job over the index.
+        if (auto temp_ref = managed_job.index_weak_ref.lock()) {
+            managed_job.job->Execute(managed_job.job);
+        }
+        // Clear this thread's bit even if the index is gone, or WaitForIndex would spin forever.
+        MarkExecuteDone(params.executions_status, params.thread_index);
+    }
+}
+
+int submit_callback(void *job_queue, void *index_ctx, AsyncJob **jobs, JobCallback *CBs,
+                    size_t len) {
+    // Push `len` jobs to the queue, each paired with a weak reference to the index it targets.
+    auto *queue = static_cast<JobQueue *>(job_queue);
+    auto *ext_ctx = reinterpret_cast<IndexExtCtx *>(index_ctx);
+    {
+        std::unique_lock<std::mutex> lock(queue_guard);
+        for (size_t i = 0; i < len; i++) {
+            queue->push(RefManagedJob{.job = jobs[i], .index_weak_ref = ext_ctx->index_strong_ref});
+        }
+    }
+    // Notify outside the lock: one waiter for a single job, every waiter otherwise.
+    if (len == 1) {
+        queue_cond.notify_one();
+    } else {
+        queue_cond.notify_all();
+    }
+    return VecSim_OK;
+}
+
+// thread_main_loop (above) is the main loop for background worker threads that execute the jobs
+// from the job queue. run_thread is used as a signal to the thread that indicates whether it
+// should keep running or stop and terminate the thread.
+
+void thread_pool_terminate(JobQueue &jobQ, bool &run_thread) {
+    // Check every 10 ms if queue is empty, and if so, terminate the threads loop.
+    while (true) {
+        std::this_thread::sleep_for(std::chrono::milliseconds(10));
+        std::unique_lock<std::mutex> lock(queue_guard);
+        if (jobQ.empty()) {
+            run_thread = false;      // Workers check this flag in thread_main_loop and return.
+            queue_cond.notify_all(); // Wake the workers blocked in queue_cond.wait().
+            break;
+        }
+    }
+    for (size_t i = 0; i < THREAD_POOL_SIZE; i++) { // Wait for the worker threads to exit.
+        thread_pool[i].join();
+    }
+    thread_pool.clear();
+}
+} // namespace tiered_index_mock
diff --git a/tests/benchmark/bm_batch_iterator.h b/tests/benchmark/bm_batch_iterator.h
index 746fbf0c0..930f2ade5 100644
--- a/tests/benchmark/bm_batch_iterator.h
+++ b/tests/benchmark/bm_batch_iterator.h
@@ -55,7 +55,7 @@ void BM_BatchIterator<index_type_t>::RunBatchedSearch_HNSW(benchmark::State &st,
     }
     st.PauseTiming();
     // Update the memory delta as a result of using the batch iterator.
-    size_t curr_memory = VecSimIndex_Info(INDICES.at(VecSimAlgo_HNSWLIB)).hnswInfo.memory;
+    size_t curr_memory = VecSimIndex_Info(INDICES.at(VecSimAlgo_HNSWLIB)).commonInfo.memory;
     memory_delta += (double)(curr_memory - index_memory);
     VecSimBatchIterator_Free(batchIterator);
 
@@ -76,7 +76,7 @@ void BM_BatchIterator<index_type_t>::BF_FixedBatchSize(benchmark::State &st) {
     size_t batch_size = st.range(0);
     size_t num_batches = st.range(1);
     size_t iter = 0;
-    size_t index_memory = VecSimIndex_Info(INDICES[VecSimAlgo_BF]).bfInfo.memory;
+    size_t index_memory = VecSimIndex_Info(INDICES[VecSimAlgo_BF]).commonInfo.memory;
     double memory_delta = 0.0;
 
     for (auto _ : st) {
@@ -91,7 +91,7 @@ void BM_BatchIterator<index_type_t>::BF_FixedBatchSize(benchmark::State &st) {
                 break;
             }
         }
-        size_t curr_memory = VecSimIndex_Info(INDICES[VecSimAlgo_BF]).bfInfo.memory;
+        size_t curr_memory = VecSimIndex_Info(INDICES[VecSimAlgo_BF]).commonInfo.memory;
         memory_delta += (double)(curr_memory - index_memory);
         VecSimBatchIterator_Free(batchIterator);
         iter++;
@@ -159,7 +159,7 @@ void BM_BatchIterator<index_type_t>::HNSW_FixedBatchSize(benchmark::State &st) {
     size_t total_res_num = num_batches * batch_size;
     size_t iter = 0;
     size_t correct = 0;
-    size_t index_memory = VecSimIndex_Info(INDICES[VecSimAlgo_HNSWLIB]).hnswInfo.memory;
+    size_t index_memory = VecSimIndex_Info(INDICES[VecSimAlgo_HNSWLIB]).commonInfo.memory;
     double memory_delta = 0.0;
 
     for (auto _ : st) {
@@ -178,7 +178,7 @@ void BM_BatchIterator<index_type_t>::HNSW_VariableBatchSize(benchmark::State &st
     size_t total_res_num;
     size_t iter = 0;
     size_t correct = 0;
-    size_t index_memory = VecSimIndex_Info(INDICES[VecSimAlgo_HNSWLIB]).hnswInfo.memory;
+    size_t index_memory = VecSimIndex_Info(INDICES[VecSimAlgo_HNSWLIB]).commonInfo.memory;
     double memory_delta = 0.0;
 
     for (auto _ : st) {
@@ -197,7 +197,7 @@ void BM_BatchIterator<index_type_t>::HNSW_BatchesToAdhocBF(benchmark::State &st)
     size_t total_res_num;
     size_t iter = 0;
     size_t correct = 0;
-    size_t index_memory = VecSimIndex_Info(INDICES[VecSimAlgo_HNSWLIB]).hnswInfo.memory;
+    size_t index_memory = VecSimIndex_Info(INDICES[VecSimAlgo_HNSWLIB]).commonInfo.memory;
     double memory_delta = 0.0;
 
     for (auto _ : st) {
diff --git a/tests/benchmark/bm_common.h b/tests/benchmark/bm_common.h
index 8b6a4f6a3..ee98f3445 100644
--- a/tests/benchmark/bm_common.h
+++ b/tests/benchmark/bm_common.h
@@ -55,7 +55,7 @@ void BM_VecSimCommon<index_type_t>::Memory_FLAT(benchmark::State &st, unsigned s
         // Do nothing...
     }
     st.counters["memory"] =
-        (double)VecSimIndex_Info(INDICES[VecSimAlgo_BF + index_offset]).bfInfo.memory;
+        (double)VecSimIndex_Info(INDICES[VecSimAlgo_BF + index_offset]).commonInfo.memory;
 }
 template <typename index_type_t>
 void BM_VecSimCommon<index_type_t>::Memory_HNSW(benchmark::State &st, unsigned short index_offset) {
@@ -64,7 +64,7 @@ void BM_VecSimCommon<index_type_t>::Memory_HNSW(benchmark::State &st, unsigned s
         // Do nothing...
     }
     st.counters["memory"] =
-        (double)VecSimIndex_Info(INDICES[VecSimAlgo_HNSWLIB + index_offset]).hnswInfo.memory;
+        (double)VecSimIndex_Info(INDICES[VecSimAlgo_HNSWLIB + index_offset]).commonInfo.memory;
 }
 
 // TopK search BM
diff --git a/tests/benchmark/bm_updated_index.h b/tests/benchmark/bm_updated_index.h
index 7c70eff55..47828ba25 100644
--- a/tests/benchmark/bm_updated_index.h
+++ b/tests/benchmark/bm_updated_index.h
@@ -6,7 +6,7 @@
 #include "VecSim/vec_sim.h"
 #include "VecSim/query_results.h"
 #include "VecSim/utils/serializer.h"
-#include "VecSim/algorithms/hnsw/hnsw_factory.h"
+#include "VecSim/index_factories/hnsw_factory.h"
 #include "bm_common.h"
 
 /**************************************
diff --git a/tests/benchmark/bm_vecsim_basics.h b/tests/benchmark/bm_vecsim_basics.h
index d55955162..02a97c1ae 100644
--- a/tests/benchmark/bm_vecsim_basics.h
+++ b/tests/benchmark/bm_vecsim_basics.h
@@ -31,49 +31,50 @@ class BM_VecSimBasics : public BM_VecSimCommon<index_type_t> {
 template <typename index_type_t>
 void BM_VecSimBasics<index_type_t>::AddLabel(benchmark::State &st) {
 
+    auto index = INDICES[st.range(0)];
     size_t index_size = N_VECTORS;
-    size_t initial_label_count = (INDICES[st.range(0)])->indexLabelCount();
+    size_t initial_label_count = index->indexLabelCount();
 
     // In a single vector per label index, index size should equal label count.
     size_t vec_per_label = index_size % initial_label_count == 0
                                ? index_size / initial_label_count
                                : index_size / initial_label_count + 1;
-    size_t memory_delta = 0;
     labelType label = initial_label_count;
     size_t added_vec_count = 0;
 
+    size_t memory_delta = index->getAllocationSize();
     // Add a new label from the test set in every iteration.
     for (auto _ : st) {
         // Add one label
         for (labelType vec = 0; vec < vec_per_label; ++vec) {
-            memory_delta += VecSimIndex_AddVector(
-                INDICES[st.range(0)], QUERIES[added_vec_count % N_QUERIES].data(), label);
+            VecSimIndex_AddVector(index, QUERIES[added_vec_count % N_QUERIES].data(), label);
         }
         added_vec_count += vec_per_label;
         label++;
     }
+    memory_delta = index->getAllocationSize() - memory_delta;
 
     st.counters["memory_per_vector"] = (double)memory_delta / (double)added_vec_count;
     st.counters["vectors_per_label"] = vec_per_label;
 
-    assert(VecSimIndex_IndexSize(INDICES[st.range(0)]) == N_VECTORS + added_vec_count);
+    assert(VecSimIndex_IndexSize(index) == N_VECTORS + added_vec_count);
 
     // Clean-up all the new vectors to restore the index size to its original value.
     // Note we loop over the new labels and not the internal ids. This way in multi indices BM all
     // the new vectors added under the same label will be removed in one call.
-    size_t new_label_count = (INDICES[st.range(0)])->indexLabelCount();
+    size_t new_label_count = index->indexLabelCount();
     for (size_t label = initial_label_count; label < new_label_count; label++) {
-        VecSimIndex_DeleteVector(INDICES[st.range(0)], label);
+        VecSimIndex_DeleteVector(index, label);
     }
 
-    assert(VecSimIndex_IndexSize(INDICES[st.range(0)]) == N_VECTORS);
+    assert(VecSimIndex_IndexSize(index) == N_VECTORS);
 }
 template <typename index_type_t>
 template <typename algo_t>
 void BM_VecSimBasics<index_type_t>::DeleteLabel(algo_t *index, benchmark::State &st) {
     // Remove a different vector in every execution.
     size_t label_to_remove = 0;
-    double memory_delta = 0;
+    double memory_delta, memory_before = index->getAllocationSize();
     size_t removed_vectors_count = 0;
     std::vector<LabelData> removed_labels_data;
 
@@ -81,7 +82,7 @@ void BM_VecSimBasics<index_type_t>::DeleteLabel(algo_t *index, benchmark::State
         st.PauseTiming();
         LabelData data(0);
         // Get label id(s) data.
-        index->GetDataByLabel(label_to_remove, data);
+        index->getDataByLabel(label_to_remove, data);
 
         removed_labels_data.push_back(data);
 
@@ -89,9 +90,9 @@ void BM_VecSimBasics<index_type_t>::DeleteLabel(algo_t *index, benchmark::State
         st.ResumeTiming();
 
         // Delete label
-        auto delta = (double)VecSimIndex_DeleteVector(index, label_to_remove++);
-        memory_delta += delta;
+        VecSimIndex_DeleteVector(index, label_to_remove++);
     }
+    memory_delta = index->getAllocationSize() - memory_before;
 
     // Avg. memory delta per vector equals the total memory delta divided by the number
     // of deleted vectors.
diff --git a/tests/benchmark/bm_vecsim_general.h b/tests/benchmark/bm_vecsim_general.h
index 8b0e3d492..ae3b04632 100644
--- a/tests/benchmark/bm_vecsim_general.h
+++ b/tests/benchmark/bm_vecsim_general.h
@@ -14,7 +14,7 @@
 #include "VecSim/utils/arr_cpp.h"
 #include "VecSim/algorithms/brute_force/brute_force.h"
 #include "VecSim/algorithms/hnsw/hnsw.h"
-#include "VecSim/algorithms/hnsw/hnsw_factory.h"
+#include "VecSim/index_factories/hnsw_factory.h"
 #include "bm_definitions.h"
 
 // This class includes every static data member that is:
diff --git a/tests/flow/common.py b/tests/flow/common.py
index 8e6fe6eb0..3231cec22 100644
--- a/tests/flow/common.py
+++ b/tests/flow/common.py
@@ -5,5 +5,53 @@
 from VecSim import *
 import numpy as np
 from scipy import spatial
-from  numpy.testing import assert_allclose
+from numpy.testing import assert_allclose
 import time
+import math
+
+def create_hnsw_params(dim, num_elements, metric, data_type, ef_construction=200, m=16, ef_runtime=10, epsilon=0.01,
+                      is_multi=False):
+    """Return an HNSWParams object populated from the given arguments."""
+    # Attribute name on HNSWParams -> value to store, in the order the fields are assigned.
+    field_values = {
+        'dim': dim, 'metric': metric, 'type': data_type,
+        'M': m, 'efConstruction': ef_construction, 'initialCapacity': num_elements,
+        'efRuntime': ef_runtime, 'epsilon': epsilon, 'multi': is_multi,
+    }
+
+    hnsw_params = HNSWParams()
+    for field_name, field_value in field_values.items():
+        setattr(hnsw_params, field_name, field_value)
+
+    return hnsw_params
+# Helper function for creating an index, uses the default HNSW parameters if not specified.
+def create_hnsw_index(dim, num_elements, metric, data_type, ef_construction=200, m=16, ef_runtime=10, epsilon=0.01,
+                      is_multi=False):
+    """Create an HNSWIndex configured by create_hnsw_params().
+
+    All arguments are forwarded unchanged to create_hnsw_params(), so both helpers share the
+    same defaults (ef_construction=200, m=16, ef_runtime=10, epsilon=0.01, is_multi=False).
+
+    Returns:
+        The newly created HNSWIndex.
+    """
+    # Reuse the params helper instead of repeating the field-by-field assignments.
+    hnsw_params = create_hnsw_params(dim, num_elements, metric, data_type, ef_construction, m,
+                                     ef_runtime, epsilon, is_multi)
+
+    return HNSWIndex(hnsw_params)
+
+
+# Amdahl's law: speedup expected on n_threads when expected_parallel_rate of the code is parallel
+def expected_speedup(expected_parallel_rate, n_threads):
+    return 1 / (1 - expected_parallel_rate + expected_parallel_rate / n_threads)
+
+def bytes_to_mega(bytes, ndigits=3):  # Convert a byte count to units of 10**6 bytes, rounded.
+    return round(bytes / 10 ** 6, ndigits)
+
+def round_(f_value, ndigits = 2):  # Round f_value to ndigits decimal places (2 by default).
+    return round(f_value, ndigits)
+
+
+def round_ms(f_value, ndigits=2):  # Scale by 1000 (presumably seconds -> ms), then round.
+    return round(1000 * f_value, ndigits)
diff --git a/tests/flow/test_hnswlib.py b/tests/flow/test_hnsw.py
similarity index 73%
rename from tests/flow/test_hnswlib.py
rename to tests/flow/test_hnsw.py
index c4ec9b2e6..214e33c9f 100644
--- a/tests/flow/test_hnswlib.py
+++ b/tests/flow/test_hnsw.py
@@ -1,36 +1,25 @@
 # Copyright Redis Ltd. 2021 - present
 # Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or
 # the Server Side Public License v1 (SSPLv1).
-
+import concurrent
+import math
+import multiprocessing
 import os
+import time
 from common import *
 import hnswlib
 
+
 # compare results with the original version of hnswlib - do not use elements deletion.
 def test_sanity_hnswlib_index_L2():
     dim = 16
     num_elements = 10000
     space = 'l2'
-    M=16
+    M = 16
     efConstruction = 100
-
     efRuntime = 10
 
-    params = VecSimParams()
-    hnswparams = HNSWParams()
-
-    params.algo = VecSimAlgo_HNSWLIB
-
-    hnswparams.dim = dim
-    hnswparams.metric = VecSimMetric_L2
-    hnswparams.type = VecSimType_FLOAT32
-    hnswparams.M = M
-    hnswparams.efConstruction = efConstruction
-    hnswparams.initialCapacity = num_elements
-    hnswparams.efRuntime=efRuntime
-
-    params.hnswParams = hnswparams
-    index = VecSimIndex(params)
+    index = create_hnsw_index(dim, num_elements, VecSimMetric_L2, VecSimType_FLOAT32, efConstruction, M, efRuntime)
 
     p = hnswlib.Index(space=space, dim=dim)
     p.init_index(max_elements=num_elements, ef_construction=efConstruction, M=M)
@@ -44,34 +33,19 @@ def test_sanity_hnswlib_index_L2():
     query_data = np.float32(np.random.random((1, dim)))
     hnswlib_labels, hnswlib_distances = p.knn_query(query_data, k=10)
     redis_labels, redis_distances = index.knn_query(query_data, 10)
-    assert_allclose(hnswlib_labels, redis_labels,  rtol=1e-5, atol=0)
-    assert_allclose(hnswlib_distances, redis_distances,  rtol=1e-5, atol=0)
+    assert_allclose(hnswlib_labels, redis_labels, rtol=1e-5, atol=0)
+    assert_allclose(hnswlib_distances, redis_distances, rtol=1e-5, atol=0)
 
 
 def test_sanity_hnswlib_index_cosine():
     dim = 16
     num_elements = 10000
     space = 'cosine'
-    M=16
+    M = 16
     efConstruction = 100
-
     efRuntime = 10
 
-    params = VecSimParams()
-    hnswparams = HNSWParams()
-
-    params.algo = VecSimAlgo_HNSWLIB
-
-    hnswparams.dim = dim
-    hnswparams.metric = VecSimMetric_Cosine
-    hnswparams.type = VecSimType_FLOAT32
-    hnswparams.M = M
-    hnswparams.efConstruction = efConstruction
-    hnswparams.initialCapacity = num_elements
-    hnswparams.efRuntime=efRuntime
-
-    params.hnswParams = hnswparams
-    index = VecSimIndex(params)
+    index = create_hnsw_index(dim, num_elements, VecSimMetric_Cosine, VecSimType_FLOAT32, efConstruction, M, efRuntime)
 
     p = hnswlib.Index(space=space, dim=dim)
     p.init_index(max_elements=num_elements, ef_construction=efConstruction, M=M)
@@ -85,8 +59,8 @@ def test_sanity_hnswlib_index_cosine():
     query_data = np.float32(np.random.random((1, dim)))
     hnswlib_labels, hnswlib_distances = p.knn_query(query_data, k=10)
     redis_labels, redis_distances = index.knn_query(query_data, 10)
-    assert_allclose(hnswlib_labels, redis_labels,  rtol=1e-5, atol=0)
-    assert_allclose(hnswlib_distances, redis_distances,  rtol=1e-5, atol=0)
+    assert_allclose(hnswlib_labels, redis_labels, rtol=1e-5, atol=0)
+    assert_allclose(hnswlib_distances, redis_distances, rtol=1e-5, atol=0)
 
 
 # Validate correctness of delete implementation comparing the brute force search. We test the search recall which is not
@@ -99,19 +73,10 @@ def test_recall_for_hnswlib_index_with_deletion():
     efConstruction = 100
 
     num_queries = 10
-    k=10
+    k = 10
     efRuntime = 0
 
-    hnswparams = HNSWParams()
-    hnswparams.M = M
-    hnswparams.efConstruction = efConstruction
-    hnswparams.initialCapacity = num_elements
-    hnswparams.efRuntime = efRuntime
-    hnswparams.dim = dim
-    hnswparams.type = VecSimType_FLOAT32
-    hnswparams.metric = VecSimMetric_L2
-
-    hnsw_index = HNSWIndex(hnswparams)
+    hnsw_index = create_hnsw_index(dim, num_elements, VecSimMetric_L2, VecSimType_FLOAT32, efConstruction, M, efRuntime)
 
     data = np.float32(np.random.random((num_elements, dim)))
     vectors = []
@@ -139,13 +104,13 @@ def test_recall_for_hnswlib_index_with_deletion():
         for label in hnswlib_labels[0]:
             for correct_label in keys:
                 if label == correct_label:
-                    correct+=1
+                    correct += 1
                     break
 
     # Measure recall
-    recall = float(correct)/(k*num_queries)
+    recall = float(correct) / (k * num_queries)
     print("\nrecall is: \n", recall)
-    assert(recall > 0.9)
+    assert (recall > 0.9)
 
 
 def test_batch_iterator():
@@ -156,16 +121,7 @@ def test_batch_iterator():
     efRuntime = 180
     num_queries = 10
 
-    hnswparams = HNSWParams()
-    hnswparams.M = M
-    hnswparams.efConstruction = efConstruction
-    hnswparams.initialCapacity = num_elements
-    hnswparams.efRuntime = efRuntime
-    hnswparams.dim = dim
-    hnswparams.type = VecSimType_FLOAT32
-    hnswparams.metric = VecSimMetric_L2
-
-    hnsw_index = HNSWIndex(hnswparams)
+    hnsw_index = create_hnsw_index(dim, num_elements, VecSimMetric_L2, VecSimType_FLOAT32, efConstruction, M, efRuntime)
 
     # Add 100k random vectors to the index
     rng = np.random.default_rng(seed=47)
@@ -181,17 +137,17 @@ def test_batch_iterator():
     labels_first_batch, distances_first_batch = batch_iterator.get_next_results(10, BY_ID)
     for i, _ in enumerate(labels_first_batch[0][:-1]):
         # Assert sorting by id
-        assert(labels_first_batch[0][i] < labels_first_batch[0][i+1])
+        assert (labels_first_batch[0][i] < labels_first_batch[0][i + 1])
 
     labels_second_batch, distances_second_batch = batch_iterator.get_next_results(10, BY_SCORE)
     should_have_return_in_first_batch = []
     for i, dist in enumerate(distances_second_batch[0][:-1]):
         # Assert sorting by score
-        assert(distances_second_batch[0][i] < distances_second_batch[0][i+1])
+        assert (distances_second_batch[0][i] < distances_second_batch[0][i + 1])
         # Assert that every distance in the second batch is higher than any distance of the first batch
         if len(distances_first_batch[0][np.where(distances_first_batch[0] > dist)]) != 0:
             should_have_return_in_first_batch.append(dist)
-    assert(len(should_have_return_in_first_batch) <= 2)
+    assert (len(should_have_return_in_first_batch) <= 2)
 
     # Verify that runtime args are sent properly to the batch iterator.
     query_params = VecSimQueryParams()
@@ -232,11 +188,12 @@ def test_batch_iterator():
                 keys = [key for _, key in dists[:returned_results_num]]
                 correct += len(set(accumulated_labels).intersection(set(keys)))
                 break
-        assert iterations == np.ceil(total_res/batch_size)
+        assert iterations == np.ceil(total_res / batch_size)
         recall = float(correct) / total_res
         assert recall >= 0.89
         total_recall += recall
-    print(f'\nAvg recall for {total_res} results in index of size {num_elements} with dim={dim} is: ', total_recall/num_queries)
+    print(f'\nAvg recall for {total_res} results in index of size {num_elements} with dim={dim} is: ',
+          total_recall / num_queries)
 
     # Run again a single query in batches until it is depleted.
     batch_iterator = hnsw_index.create_batch_iterator(query_data[0])
@@ -249,7 +206,7 @@ def test_batch_iterator():
         # Verify that we got new scores in each iteration.
         assert len(accumulated_labels.intersection(set(labels[0]))) == 0
         accumulated_labels = accumulated_labels.union(set(labels[0]))
-    assert len(accumulated_labels) >= 0.95*num_elements
+    assert len(accumulated_labels) >= 0.95 * num_elements
     print("Overall results returned:", len(accumulated_labels), "in", iterations, "iterations")
 
 
@@ -258,25 +215,15 @@ def test_serialization():
     num_elements = 10000
     M = 16
     efConstruction = 100
+    data_type = VecSimType_FLOAT32
 
     num_queries = 10
     k = 10
     efRuntime = 50
 
-    data_type = VecSimType_FLOAT32
-
-    hnswparams = HNSWParams()
-    hnswparams.M = M
-    hnswparams.efConstruction = efConstruction
-    hnswparams.initialCapacity = num_elements
-    hnswparams.dim = dim
-    hnswparams.type = data_type
-    hnswparams.metric = VecSimMetric_L2
-
-    hnsw_index = HNSWIndex(hnswparams)
+    hnsw_index = create_hnsw_index(dim, num_elements, VecSimMetric_L2, data_type, efConstruction, M, efRuntime)
     hnsw_index.set_ef(efRuntime)
 
-
     data = np.float32(np.random.random((num_elements, dim)))
     vectors = []
     for i, vector in enumerate(data):
@@ -301,11 +248,11 @@ def test_serialization():
                     correct += 1
                     break
     # Measure recall
-    recall = float(correct)/(k*num_queries)
+    recall = float(correct) / (k * num_queries)
     print("\nrecall is: \n", recall)
 
     # Persist, delete and restore index.
-    file_name = os.getcwd()+"/dump"
+    file_name = os.getcwd() + "/dump"
     hnsw_index.save_index(file_name)
 
     new_hnsw_index = HNSWIndex(file_name)
@@ -324,30 +271,18 @@ def test_serialization():
                     break
 
     # Compare recall after reloading the index
-    recall_after = float(correct_after)/(k*num_queries)
+    recall_after = float(correct_after) / (k * num_queries)
     print("\nrecall after is: \n", recall_after)
     assert recall == recall_after
 
+
 def test_range_query():
     dim = 100
     num_elements = 100000
     epsilon = 0.01
 
-    params = VecSimParams()
-    hnswparams = HNSWParams()
-
-    params.algo = VecSimAlgo_HNSWLIB
-
-    hnswparams.dim = dim
-    hnswparams.metric = VecSimMetric_L2
-    hnswparams.type = VecSimType_FLOAT32
-    hnswparams.M = 32
-    hnswparams.efConstruction = 200
-    hnswparams.initialCapacity = num_elements
-    hnswparams.epsilon = epsilon
-
-    params.hnswParams = hnswparams
-    index = VecSimIndex(params)
+    index = create_hnsw_index(dim, num_elements, VecSimMetric_L2, VecSimType_FLOAT32, ef_construction=200, m=32,
+                                   epsilon=epsilon)
 
     np.random.seed(47)
     data = np.float32(np.random.random((num_elements, dim)))
@@ -372,14 +307,15 @@ def test_range_query():
         dists = sorted([(key, spatial.distance.sqeuclidean(query_data.flat, vec)) for key, vec in vectors])
         actual_results = [(key, dist) for key, dist in dists if dist <= radius]
 
-        print(f'\nlookup time for {num_elements} vectors with dim={dim} took {end - start} seconds with epsilon={epsilon_rt},'
-              f' got {res_num} results, which are {res_num/len(actual_results)} of the entire results in the range.')
+        print(
+            f'\nlookup time for {num_elements} vectors with dim={dim} took {end - start} seconds with epsilon={epsilon_rt},'
+            f' got {res_num} results, which are {res_num / len(actual_results)} of the entire results in the range.')
 
         # Compare the number of vectors that are actually within the range to the returned results.
         assert np.all(np.isin(hnsw_labels, np.array([label for label, _ in actual_results])))
 
         assert max(hnsw_distances[0]) <= radius
-        recalls[epsilon_rt] = res_num/len(actual_results)
+        recalls[epsilon_rt] = res_num / len(actual_results)
 
     # Expect higher recalls for higher epsilon values.
     assert recalls[0.001] <= recalls[0.01] <= recalls[0.1]
@@ -395,24 +331,14 @@ def test_recall_for_hnsw_multi_value():
     num_per_label = 16
     M = 16
     efConstruction = 100
-
     num_queries = 10
-    k=10
+    k = 10
     efRuntime = 0
 
     num_elements = num_labels * num_per_label
 
-    hnswparams = HNSWParams()
-    hnswparams.M = M
-    hnswparams.efConstruction = efConstruction
-    hnswparams.initialCapacity = num_elements
-    hnswparams.efRuntime = efRuntime
-    hnswparams.dim = dim
-    hnswparams.type = VecSimType_FLOAT32
-    hnswparams.metric = VecSimMetric_Cosine
-    hnswparams.multi = True
-
-    hnsw_index = HNSWIndex(hnswparams)
+    hnsw_index = create_hnsw_index(dim, num_elements, VecSimMetric_Cosine, VecSimType_FLOAT32, efConstruction, M,
+                                   efRuntime, is_multi=True)
 
     data = np.float32(np.random.random((num_labels, dim)))
     vectors = []
@@ -427,7 +353,7 @@ def test_recall_for_hnsw_multi_value():
     correct = 0
     for target_vector in query_data:
         hnswlib_labels, hnswlib_distances = hnsw_index.knn_query(target_vector, 10)
-        assert(len(hnswlib_labels[0]) == len(np.unique(hnswlib_labels[0])))
+        assert (len(hnswlib_labels[0]) == len(np.unique(hnswlib_labels[0])))
 
         # sort distances of every vector from the target vector and get actual k nearest vectors
         dists = {}
@@ -435,7 +361,8 @@ def test_recall_for_hnsw_multi_value():
             # Setting or updating the score for each label. If it's the first time we calculate a score for a label,
             # dists.get(key, 3) will return 3, which is more than a Cosine score can be,
             # so we will choose the actual score the first time.
-            dists[key] = min(spatial.distance.cosine(target_vector, vec), dists.get(key, 3)) # cosine distance is always <= 2
+            dists[key] = min(spatial.distance.cosine(target_vector, vec),
+                             dists.get(key, 3))  # cosine distance is always <= 2
 
         dists = list(dists.items())
         dists = sorted(dists, key=lambda pair: pair[1])[:k]
@@ -444,40 +371,24 @@ def test_recall_for_hnsw_multi_value():
         for label in hnswlib_labels[0]:
             for correct_label in keys:
                 if label == correct_label:
-                    correct+=1
+                    correct += 1
                     break
 
     # Measure recall
-    recall = float(correct)/(k*num_queries)
+    recall = float(correct) / (k * num_queries)
     print("\nrecall is: \n", recall)
-    assert(recall > 0.9)
+    assert (recall > 0.9)
 
 
 def test_multi_range_query():
     dim = 100
     num_labels = 20000
     per_label = 5
-
     epsilon = 0.01
-
     num_elements = num_labels * per_label
 
-    params = VecSimParams()
-    hnswparams = HNSWParams()
-
-    params.algo = VecSimAlgo_HNSWLIB
-
-    hnswparams.dim = dim
-    hnswparams.metric = VecSimMetric_L2
-    hnswparams.multi = True
-    hnswparams.type = VecSimType_FLOAT32
-    hnswparams.M = 32
-    hnswparams.efConstruction = 200
-    hnswparams.initialCapacity = num_elements
-    hnswparams.epsilon = epsilon
-
-    params.hnswParams = hnswparams
-    index = VecSimIndex(params)
+    index = create_hnsw_index(dim, num_elements, VecSimMetric_L2, VecSimType_FLOAT32, ef_construction=200, m=32,
+                              epsilon=epsilon, is_multi=True)
 
     np.random.seed(47)
     data = np.float32(np.random.random((num_labels, per_label, dim)))
@@ -500,16 +411,17 @@ def test_multi_range_query():
     dists = sorted(dists, key=lambda pair: pair[1])
     keys = [key for key, dist in dists if dist <= radius]
 
-    for epsilon_rt in [0.001, 0.01, 0.1]:                   
-        query_params = VecSimQueryParams()                  
-        query_params.hnswRuntimeParams.epsilon = epsilon_rt                 
-        start = time.time()                 
-        hnsw_labels, hnsw_distances = index.range_query(query_data, radius=radius, query_param=query_params)                    
+    for epsilon_rt in [0.001, 0.01, 0.1]:
+        query_params = VecSimQueryParams()
+        query_params.hnswRuntimeParams.epsilon = epsilon_rt
+        start = time.time()
+        hnsw_labels, hnsw_distances = index.range_query(query_data, radius=radius, query_param=query_params)
         end = time.time()
         res_num = len(hnsw_labels[0])
 
-        print(f'\nlookup time for ({num_labels} X {per_label}) vectors with dim={dim} took {end - start} seconds with epsilon={epsilon_rt},'
-              f' got {res_num} results, which are {res_num/len(keys)} of the entire results in the range.')
+        print(
+            f'\nlookup time for ({num_labels} X {per_label}) vectors with dim={dim} took {end - start} seconds with epsilon={epsilon_rt},'
+            f' got {res_num} results, which are {res_num / len(keys)} of the entire results in the range.')
 
         # Compare the number of vectors that are actually within the range to the returned results.
         assert np.all(np.isin(hnsw_labels, np.array(keys)))
@@ -518,7 +430,7 @@ def test_multi_range_query():
         assert len(hnsw_labels[0]) == len(np.unique(hnsw_labels[0]))
 
         assert max(hnsw_distances[0]) <= radius
-        recalls[epsilon_rt] = res_num/len(keys)
+        recalls[epsilon_rt] = res_num / len(keys)
 
     # Expect higher recalls for higher epsilon values.
     assert recalls[0.001] <= recalls[0.01] <= recalls[0.1]
diff --git a/tests/flow/test_hnsw_parallel.py b/tests/flow/test_hnsw_parallel.py
new file mode 100644
index 000000000..c6e05c41e
--- /dev/null
+++ b/tests/flow/test_hnsw_parallel.py
@@ -0,0 +1,514 @@
+# Copyright Redis Ltd. 2021 - present
+# Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or
+# the Server Side Public License v1 (SSPLv1).
+import concurrent
+import os
+import threading
+from concurrent.futures import ThreadPoolExecutor, wait
+
+from common import *
+
+# Baseline fixture for the parallel tests: an HNSW index that is built by inserting vectors one by one, and a flat
+# index that holds the same vectors, which supplies the ground truth that the parallel operations are compared to.
+class TestIndex:
+    __test__ = False  # a helper rather than a test case (the "Test" prefix would make pytest try to collect it)
+
+    def __init__(self, dim_, num_elements_, metric_, data_type_, multi_=False, ef_runtime=200):
+        self.dim = dim_
+        self.num_elements = num_elements_
+        self.metric = metric_
+        self.data_type = data_type_
+        self.multi = multi_
+        self.hnsw_index = create_hnsw_index(dim_, num_elements_, metric_, data_type_,
+                                            ef_runtime=ef_runtime, is_multi=multi_)
+
+        # Flat index with the same properties; its block size and initial capacity are the number of elements.
+        bf_params = BFParams()
+        bf_params.initialCapacity = num_elements_
+        bf_params.blockSize = num_elements_
+        bf_params.dim = dim_
+        bf_params.type = data_type_
+        bf_params.metric = metric_
+        bf_params.multi = multi_
+        self.bf_index = BFIndex(bf_params)
+
+        np.random.seed(47)  # seed numpy's global generator, which is used for creating the vectors
+        self.data = None  # The inserted vectors, set by the insert_random_vectors* methods
+        self.total_res_bf = []  # Save the ground truth results
+        self.sequential_insert_time = 0  # Total time took to insert vectors to the index one by one
+        self.vectors_per_label = 1
+
+    def insert_random_vectors(self):
+        # Create num_elements random vectors (as float32 for a FLOAT32 index), and add each one to both indexes.
+        raw = np.random.random((self.num_elements, self.dim))
+        self.data = np.float32(raw) if self.data_type == VecSimType_FLOAT32 else raw
+        self.sequential_insert_time = 0
+        for label, vector in enumerate(self.data):
+            start = time.time()
+            self.hnsw_index.add_vector(vector, label)
+            self.sequential_insert_time += time.time() - start  # only the HNSW insertion is measured
+            self.bf_index.add_vector(vector, label)
+
+    def insert_random_vectors_multi(self, vec_per_label):
+        # Create vec_per_label random vectors for every label, and add each one to both indexes. The data is shaped
+        # by the given argument (not by the module-level per_label), so it always matches self.vectors_per_label.
+        self.vectors_per_label = vec_per_label
+        raw = np.random.random((self.num_elements // vec_per_label, vec_per_label, self.dim))
+        self.data = np.float32(raw) if self.data_type == VecSimType_FLOAT32 else raw
+        self.sequential_insert_time = 0
+        for label, vectors in enumerate(self.data):
+            for vector in vectors:
+                start = time.time()
+                self.hnsw_index.add_vector(vector, label)
+                self.sequential_insert_time += time.time() - start
+                self.bf_index.add_vector(vector, label)
+
+    def compute_ground_truth_knn(self, query_data, k):
+        # Rebuilt upon every call, so it will be aligned with the given query data. For every query, keep the
+        # labels (first item of the flat index reply) of its single results row.
+        self.total_res_bf = [self.bf_index.knn_query(query, k)[0][0] for query in query_data]
+
+    def compute_ground_truth_range(self, query_data, radius):
+        # Rebuilt upon every call, so it will be aligned with the given query data (labels only, as in the KNN case).
+        ground_truth = [self.bf_index.range_query(query, radius)[0][0] for query in query_data]
+        self.total_res_bf = ground_truth
+
+
+# Global test params, used by the tests below and for building the two baseline indexes
+dim = 32
+num_elements = 100000
+metric = VecSimMetric_L2
+data_type = VecSimType_FLOAT32
+per_label = 5  # number of vectors per label in the multi value index
+
+print("Creating test indexes...")  # module-level code: the baseline indexes are built when this file is imported
+g_test_index = TestIndex(dim, num_elements, metric, data_type)
+g_test_index.insert_random_vectors()
+
+g_test_index_multi = TestIndex(dim, num_elements, metric, data_type, multi_=True)
+g_test_index_multi.insert_random_vectors_multi(per_label)
+
+
+# Amdahl's law: the speedup to expect from n_threads threads when expected_parallel_rate of the run time is parallel
+def expected_speedup(expected_parallel_rate, n_threads):
+    return 1 / (expected_parallel_rate / n_threads + (1 - expected_parallel_rate))
+
+
+def test_parallel_search():
+    # Compare knn_parallel (one call for all the queries) with one by one knn_query calls: recall and run time.
+    k = 10
+    num_queries = 10000
+    n_threads = min(os.cpu_count() or 1, 8)  # os.cpu_count() returns None when the count cannot be determined
+    expected_parallel_rate = 0.9  # we expect that at least 90% of the insert/search time will be executed in parallel
+
+    # Sequential search as the baseline
+    query_data = np.float32(np.random.random((num_queries, dim)))
+    g_test_index.compute_ground_truth_knn(query_data, k)
+    total_search_time = 0
+    total_correct = 0
+    for i, query in enumerate(query_data):
+        start = time.time()
+        res_labels, _ = g_test_index.hnsw_index.knn_query(query, k)
+        total_search_time += time.time() - start
+        total_correct += len(set(res_labels[0]).intersection(set(g_test_index.total_res_bf[i])))
+
+    print(f"Running sequential search, got {total_correct / (k * num_queries)} recall on {num_queries} queries,"
+          f" and {num_queries/total_search_time} query per seconds")
+
+    start = time.time()
+    res_labels, _ = g_test_index.hnsw_index.knn_parallel(query_data, k, num_threads=n_threads)
+    total_search_time_parallel = time.time() - start
+    total_correct_parallel = 0
+    for i in range(num_queries):
+        total_correct_parallel += len(set(g_test_index.total_res_bf[i]).intersection(set(res_labels[i])))
+
+    print(f"Running parallel search, got {total_correct_parallel / (k * num_queries)} recall on {num_queries} queries,"
+          f" and {num_queries / total_search_time_parallel} query per seconds")
+    print(f"Got {total_search_time / total_search_time_parallel} times improvement in runtime using {n_threads} threads\n")
+
+    # Validate that the parallel search recall is the same as the sequential search recall.
+    assert total_correct_parallel == total_correct
+    # Validate that the parallel run managed to achieve at least the expected speedup in total runtime.
+    assert total_search_time / total_search_time_parallel > expected_speedup(expected_parallel_rate, n_threads)
+
+
+def test_parallel_insert():
+    # Build an index with add_vector_parallel, and compare it (insert time, content, recall) to the baseline index.
+    k = 10
+    num_queries = 10000
+    n_threads = min(os.cpu_count() or 1, 8)  # os.cpu_count() returns None when the count cannot be determined
+    expected_parallel_rate = 0.9  # we expect that at least 90% of the insert/search time will be executed in parallel
+
+    print(f"Inserting {num_elements} vectors of dim {dim} into HNSW sequentially took"
+          f" {g_test_index.sequential_insert_time} seconds")
+
+    parallel_index = create_hnsw_index(g_test_index.dim, g_test_index.num_elements, g_test_index.metric,
+                                       g_test_index.data_type, ef_runtime=200)
+    start = time.time()
+    parallel_index.add_vector_parallel(g_test_index.data, np.arange(num_elements), n_threads)
+    parallel_insert_time = time.time() - start
+    assert parallel_index.index_size() == num_elements
+    assert parallel_index.check_integrity()
+    # Validate that the parallel index contains the same vectors as the sequential one.
+    for label in range(num_elements):
+        assert_allclose(g_test_index.hnsw_index.get_vector(label), parallel_index.get_vector(label))
+
+    print(f"Inserting {num_elements} vectors of dim {dim} into HNSW in parallel took {parallel_insert_time} seconds")
+    print(f"Got {g_test_index.sequential_insert_time/parallel_insert_time} times improvement using {n_threads} threads\n")
+    assert g_test_index.sequential_insert_time/parallel_insert_time > expected_speedup(expected_parallel_rate, n_threads)
+
+    query_data = np.float32(np.random.random((num_queries, dim)))
+    g_test_index.compute_ground_truth_knn(query_data, k)
+
+    # Run search over the baseline hnsw index (that was created by inserting vectors one by one).
+    start = time.time()
+    res_labels, _ = g_test_index.hnsw_index.knn_parallel(query_data, k, num_threads=n_threads)
+    total_search_time = time.time() - start
+    total_correct = 0
+    for i in range(num_queries):
+        total_correct += len(set(g_test_index.total_res_bf[i]).intersection(set(res_labels[i])))
+    print(f"Running parallel search over an index that was built by inserting vectors one by one, got"
+          f" {total_correct / (k * num_queries)} recall on {num_queries} queries,"
+          f" with {num_queries/total_search_time} query per second")
+
+    # Run search with parallel index and assert that similar recall achieved.
+    start = time.time()
+    res_labels, _ = parallel_index.knn_parallel(query_data, k, num_threads=n_threads)
+    total_search_time_parallel = time.time() - start
+
+    total_correct_parallel = 0
+    for i in range(num_queries):
+        total_correct_parallel += len(set(g_test_index.total_res_bf[i]).intersection(set(res_labels[i])))
+    print(f"Running parallel search on index that was created using parallel insert, got "
+          f"{total_correct_parallel / (k * num_queries)} recall on {num_queries} queries, and"
+          f" {num_queries/total_search_time_parallel} query per second")
+    assert total_correct_parallel >= total_correct * 0.95  # 0.95 is an arbitrary threshold
+
+
+def test_parallel_insert_search():
+    # Run a batch of KNN queries while vectors are still being inserted to the index by other threads.
+    k = 10
+    num_queries = 10000
+    n_threads = min(os.cpu_count() or 1, 8)  # os.cpu_count() returns None when the count cannot be determined
+    query_data = np.float32(np.random.random((num_queries, dim)))
+    g_test_index.compute_ground_truth_knn(query_data, k)
+
+    # Insert vectors to the index and search in parallel, using half of the threads (at least one) for each.
+    parallel_index = create_hnsw_index(g_test_index.dim, g_test_index.num_elements, g_test_index.metric,
+                                       g_test_index.data_type, ef_runtime=200)
+
+    def insert_vectors():
+        parallel_index.add_vector_parallel(g_test_index.data, np.arange(num_elements), num_threads=max(1, n_threads // 2))
+
+    res_labels_g = np.zeros((num_queries, k))  # placeholder, replaced by the results that the query thread gets
+
+    def run_queries():
+        nonlocal res_labels_g
+        res_labels_g, _ = parallel_index.knn_parallel(query_data, k, num_threads=max(1, n_threads // 2))
+
+    t_insert = threading.Thread(target=insert_vectors)
+    t_query = threading.Thread(target=run_queries)
+    print("Running KNN queries in parallel to inserting vectors to the index, start running queries after more 50% of"
+          " the vectors are indexed")
+    t_insert.start()
+    # Wait until half of the vectors are indexed (or the insert thread is done, e.g. upon failure), then run queries
+    while t_insert.is_alive() and parallel_index.index_size() < num_elements / 2:
+        time.sleep(0.5)
+    t_query.start()
+    for t in (t_insert, t_query):
+        t.join()
+    assert parallel_index.index_size() == num_elements  # fail here if the insert thread did not index everything
+    # Measure recall - expect to get increased recall over time, since vectors are being inserted while queries
+    # are running, and the ground truth is measured compared to the index that contains all the elements.
+    chunk_size = int(num_queries/5)
+    total_correct_prev_chunk = 0
+    for i in range(0, num_queries, chunk_size):
+        total_correct_cur_chunk = 0
+        for j in range(i, i+chunk_size):
+            total_correct_cur_chunk += len(set(g_test_index.total_res_bf[j]).intersection(set(res_labels_g[j])))
+        assert total_correct_cur_chunk >= total_correct_prev_chunk
+        total_correct_prev_chunk = total_correct_cur_chunk
+        print(f"Recall for chunk {int(i/chunk_size)+1}/{int(num_queries/chunk_size)} of queries is:"
+              f" {total_correct_cur_chunk/(k*chunk_size)}")
+
+
+def test_parallel_with_range():
+    # Compare range_parallel (one call for all the queries) with one by one range_query calls over the same index.
+    num_queries = 10000
+    radius = 3.0
+    n_threads = min(os.cpu_count() or 1, 8)  # os.cpu_count() returns None when the count cannot be determined
+    PADDING_LABEL = -1  # used for padding empty labels entries in a single query results
+    expected_parallel_rate = 0.9  # we expect that at least 90% of the insert/search time will be executed in parallel
+
+    query_data = np.float32(np.random.random((num_queries, dim)))
+    g_test_index.compute_ground_truth_range(query_data, radius)
+
+    # Run serial range queries
+    total_search_time = 0
+    # Sum over the queries of: (number of results returned by HNSW) / (total number of vectors in the range).
+    overall_intersection_rate = 0
+    total_results = 0
+    for i, query in enumerate(query_data):
+        start = time.time()
+        res_labels_range, res_distances_range = g_test_index.hnsw_index.range_query(query, radius)
+        total_search_time += time.time() - start
+        assert set(res_labels_range[0]).issubset(set(g_test_index.total_res_bf[i]))
+        total_results += g_test_index.total_res_bf[i].size
+        overall_intersection_rate += res_labels_range[0].size / g_test_index.total_res_bf[i].size \
+            if g_test_index.total_res_bf[i].size > 0 else 1
+    print(f"Range queries - running {num_queries} queries sequentially, average number of results is:"
+          f" {total_results/num_queries} and HNSW success rate is: {overall_intersection_rate/num_queries}."
+          f" query per seconds: {num_queries/total_search_time}")
+
+    # Run range queries in parallel. NOTE(review): n_threads is not passed here, yet the speedup is judged by it.
+    start = time.time()
+    hnsw_labels_range_parallel, _ = g_test_index.hnsw_index.range_parallel(query_data, radius=radius)
+    total_range_query_parallel_time = time.time() - start
+    overall_intersection_rate_parallel = 0
+    for i in range(num_queries):
+        query_results_set = set(hnsw_labels_range_parallel[i])
+        query_results_set.discard(PADDING_LABEL)  # remove the irrelevant padding values
+        assert query_results_set.issubset(set(g_test_index.total_res_bf[i]))
+        overall_intersection_rate_parallel += len(query_results_set) / g_test_index.total_res_bf[i].size \
+            if g_test_index.total_res_bf[i].size > 0 else 1
+    print(f"Running the same {num_queries} queries in parallel, query per seconds is"
+          f" {num_queries/total_range_query_parallel_time}, and intersection rate is: "
+          f"{overall_intersection_rate_parallel/num_queries}")
+    assert overall_intersection_rate_parallel == overall_intersection_rate
+    print(f"Got improvement of {total_search_time/total_range_query_parallel_time} times using {n_threads} threads\n")
+    assert total_search_time/total_range_query_parallel_time >= expected_speedup(expected_parallel_rate, n_threads)
+
+
+def test_parallel_insert_multi():
+    # Build a multi value index with add_vector_parallel, and compare it to the baseline multi value index.
+    k = 10
+    num_labels = int(g_test_index_multi.num_elements / g_test_index_multi.vectors_per_label)
+    num_queries = 10000
+    n_threads = min(os.cpu_count() or 1, 8)  # os.cpu_count() returns None when the count cannot be determined
+    expected_parallel_rate = 0.85  # we expect that at least 85% of the insert/search time will be executed in parallel
+
+    print(f"Inserting {num_elements} vectors of dim {dim} into multi-HNSW ({per_label} vectors per label) sequentially"
+          f" took {g_test_index_multi.sequential_insert_time} seconds")
+
+    parallel_multi_index = create_hnsw_index(g_test_index_multi.dim, g_test_index_multi.num_elements,
+                                             g_test_index_multi.metric, g_test_index_multi.data_type, ef_runtime=200,
+                                             is_multi=True)
+
+    # Insert vectors to multi index in parallel
+    data = g_test_index_multi.data.reshape(num_elements, dim)
+    labels = np.concatenate([[i]*g_test_index_multi.vectors_per_label for i in range(num_labels)])
+    start = time.time()
+    parallel_multi_index.add_vector_parallel(data, labels, n_threads)
+    parallel_insert_time = time.time() - start
+    assert parallel_multi_index.index_size() == num_elements
+    assert parallel_multi_index.check_integrity()
+    # Validate that the parallel index contains the same vectors as the sequential one. vectors are not necessarily
+    # at the same order, so we flatten the array and check that elements are set equal.
+    for label in range(num_labels):
+        vectors_s = g_test_index_multi.hnsw_index.get_vector(label)
+        vectors_p = parallel_multi_index.get_vector(label)
+        assert vectors_s.shape == vectors_p.shape
+        assert set(vectors_s.flatten()) == set(vectors_p.flatten())
+
+    print(f"Inserting {num_elements} vectors of dim {dim} into multi-HNSW in parallel ({per_label} vectors per label)"
+          f" took {parallel_insert_time} seconds")
+    print(f"Got {g_test_index_multi.sequential_insert_time/parallel_insert_time} times improvement using {n_threads} threads\n")
+    assert g_test_index_multi.sequential_insert_time/parallel_insert_time > \
+           expected_speedup(expected_parallel_rate, n_threads)
+
+    # Run queries over the multi-index
+    query_data = np.float32(np.random.random((num_queries, dim)))
+    g_test_index_multi.compute_ground_truth_knn(query_data, k)
+    # Run search over the baseline hnsw index (that was created by inserting vectors one by one).
+    total_search_time = 0
+    total_correct = 0
+    for i, query in enumerate(query_data):
+        start = time.time()
+        res_labels, _ = g_test_index_multi.hnsw_index.knn_query(query, k)
+        total_search_time += time.time() - start
+        total_correct += len(set(res_labels[0]).intersection(g_test_index_multi.total_res_bf[i]))
+    print(f"Running search over baseline multi index, got {total_correct / (k * num_queries)} recall on {num_queries}"
+          f" queries, and {num_queries/total_search_time} query per second")
+
+    # Run search with parallel index and assert that similar recall achieved.
+    start = time.time()
+    res_labels_parallel, res_dists_parallel = parallel_multi_index.knn_parallel(query_data, k, num_threads=n_threads)
+    total_search_time_parallel = time.time() - start
+    total_correct_parallel = 0
+    for res_labels, ground_truth in zip(res_labels_parallel, g_test_index_multi.total_res_bf):
+        total_correct_parallel += len(set(res_labels).intersection(set(ground_truth)))
+
+    print(f"Running parallel search over multi index built in parallel, got {total_correct_parallel / (k * num_queries)}"
+          f" recall on {num_queries} queries, and {num_queries/total_search_time_parallel} query per second")
+    print(f"Got {total_search_time / total_search_time_parallel} times improvement in runtime using"
+          f" {n_threads} threads\n")
+    assert total_correct_parallel >= total_correct * 0.95  # 0.95 is an arbitrary threshold
+    assert total_search_time/total_search_time_parallel >= expected_speedup(expected_parallel_rate, n_threads)
+
+
+def test_parallel_multi_insert_search():
+    # Run a batch of KNN queries while vectors are still being inserted to the multi value index by other threads.
+    k = 10
+    num_queries = 10000
+    n_threads = min(os.cpu_count() or 1, 8)  # os.cpu_count() returns None when the count cannot be determined
+    num_labels = int(g_test_index_multi.num_elements / g_test_index_multi.vectors_per_label)
+    query_data = np.float32(np.random.random((num_queries, dim)))
+    g_test_index_multi.compute_ground_truth_knn(query_data, k)
+
+    # Insert vectors to the index and search in parallel, using half of the threads (at least one) for each.
+    parallel_multi_index = create_hnsw_index(g_test_index_multi.dim, g_test_index_multi.num_elements,
+                                             g_test_index_multi.metric, g_test_index_multi.data_type, ef_runtime=200,
+                                             is_multi=True)
+
+    data = g_test_index_multi.data.reshape(num_elements, dim)
+    labels = np.concatenate([[i]*g_test_index_multi.vectors_per_label for i in range(num_labels)])
+
+    def insert_vectors():
+        parallel_multi_index.add_vector_parallel(data, labels, num_threads=max(1, n_threads // 2))
+
+    res_labels_g = np.zeros((num_queries, k))  # placeholder, replaced by the results that the query thread gets
+
+    def run_queries():
+        nonlocal res_labels_g
+        res_labels_g, _ = parallel_multi_index.knn_parallel(query_data, k, num_threads=max(1, n_threads // 2))
+
+    t_insert = threading.Thread(target=insert_vectors)
+    t_query = threading.Thread(target=run_queries)
+    print("Running KNN queries in parallel to inserting vectors to the multi index, start running queries after more"
+          " 50% of the vectors are indexed")
+    t_insert.start()
+    # Wait until half of the vectors are indexed (or the insert thread is done, e.g. upon failure), then run queries
+    while t_insert.is_alive() and parallel_multi_index.index_size() < num_elements / 2:
+        time.sleep(0.5)
+    t_query.start()
+    for t in (t_insert, t_query):
+        t.join()
+    assert parallel_multi_index.index_size() == num_elements  # fail here if the insert thread did not index everything
+    # Measure recall - expect to get increased recall over time, since vectors are being inserted while queries
+    # are running, and the ground truth is measured compared to the index that contains all the elements.
+    chunk_size = int(num_queries/5)
+    total_correct_prev_chunk = 0
+    for i in range(0, num_queries, chunk_size):
+        total_correct_cur_chunk = 0
+        for j in range(i, i+chunk_size):
+            total_correct_cur_chunk += len(set(g_test_index_multi.total_res_bf[j]).intersection(set(res_labels_g[j])))
+        assert total_correct_cur_chunk >= total_correct_prev_chunk
+        total_correct_prev_chunk = total_correct_cur_chunk
+        print(f"Recall for queries' chunk {int(i/chunk_size)+1}/{int(num_queries/chunk_size)} is:"
+              f" {total_correct_cur_chunk/(k*chunk_size)}")
+
+
+def test_parallel_batch_search():
+    num_queries = 10000
+    batch_size = 100
+    n_batches = 5
+    n_threads = min(os.cpu_count(), 8)
+    expected_parallel_rate = 0.85  # we expect that at least 85% of the insert/search time will be executed in parallel
+
+    # Sequential batched search as the baseline
+    query_data = np.float32(np.random.random((num_queries, dim)))
+    g_test_index.compute_ground_truth_knn(query_data, batch_size*n_batches)
+    total_search_time = 0
+    total_correct = 0
+    for i, query in enumerate(query_data):
+        start = time.time()
+        batch_iterator = g_test_index.hnsw_index.create_batch_iterator(query)
+        # Collect all the results from all batches
+        res_labels = set()
+        for _ in range(n_batches):
+            res_labels = res_labels.union(set(batch_iterator.get_next_results(batch_size, BY_SCORE)[0][0]))
+
+        total_search_time += time.time() - start
+        total_correct += len(res_labels.intersection(set(g_test_index.total_res_bf[i])))
+
+    print(f"Running sequential batched search of {n_batches} batches of size {batch_size}, over {num_queries} queries,"
+          f" got recall of {total_correct/(n_batches*batch_size*num_queries)} and "
+          f" {num_queries/total_search_time} query per second")
+    # Maps query index -> set of labels collected from all batches; each worker writes only its own key.
+    total_results_parallel = {}
+
+    def run_batched_search(query_, query_ind):
+        batch_iterator_ = g_test_index.hnsw_index.create_batch_iterator(query_)
+        res_labels_ = set()
+        for _ in range(n_batches):
+            res_labels_ = res_labels_.union(set(batch_iterator_.get_next_results(batch_size, BY_SCORE)[0][0]))
+        total_results_parallel[query_ind] = res_labels_
+
+    start = time.time()
+    with ThreadPoolExecutor(max_workers=n_threads) as executor:
+        futures = [executor.submit(run_batched_search, q, i) for i, q in enumerate(query_data)]
+        done, not_done = wait(futures, return_when=concurrent.futures.ALL_COMPLETED)
+    total_search_time_parallel = time.time() - start
+    assert len(done) == num_queries and len(not_done) == 0
+
+    total_correct_parallel = 0
+    for i in range(num_queries):
+        total_correct_parallel += len(set(g_test_index.total_res_bf[i]).intersection(total_results_parallel[i]))
+
+    print(f"Running parallel batched search of {n_batches} batches of size {batch_size}, over {num_queries} queries,"
+          f" got recall of {total_correct_parallel/(n_batches*batch_size*num_queries)} and "
+          f" {num_queries/total_search_time_parallel} query per second")
+    print(f"Got {total_search_time / total_search_time_parallel} times improvement in runtime using "
+          f"{n_threads} threads\n")
+
+    # Validate that the parallel search found exactly as many correct results as the sequential search.
+    assert total_correct_parallel == total_correct
+    # Validate that the measured speedup exceeds expected_speedup() for this parallel rate and number of threads.
+    assert total_search_time / total_search_time_parallel > expected_speedup(expected_parallel_rate, n_threads)
+
+
+def test_parallel_insert_batch_search():
+    num_queries = 10000
+    batch_size = 100
+    n_batches = 5
+    n_threads = min(os.cpu_count(), 8)
+
+    # Insert vectors to the index and search in parallel.
+    parallel_index = create_hnsw_index(g_test_index.dim, g_test_index.num_elements, g_test_index.metric,
+                                       g_test_index.data_type, ef_runtime=200)
+
+    query_data = np.float32(np.random.random((num_queries, dim)))
+    g_test_index.compute_ground_truth_knn(query_data, n_batches*batch_size)
+
+    total_results_parallel = {}
+
+    def run_batched_search(query_, query_ind):
+        # The shared dict is only mutated (never rebound) and each query writes its own key.
+        batch_iterator_ = parallel_index.create_batch_iterator(query_)
+        res_labels_ = set()
+        for _ in range(n_batches):
+            res_labels_ = res_labels_.union(set(batch_iterator_.get_next_results(batch_size, BY_SCORE)[0][0]))
+        total_results_parallel[query_ind] = res_labels_
+
+    def insert_vectors():
+        parallel_index.add_vector_parallel(g_test_index.data, range(num_elements), num_threads=int(n_threads/2))
+
+    t_insert = threading.Thread(target=insert_vectors)
+    print("Running batched search in parallel to inserting vectors to the index, start running queries after more"
+          " than 50% of the vectors are indexed")
+    t_insert.start()
+    # Wait until half of the index is indexed, then start run queries
+    while parallel_index.index_size() < num_elements / 2:
+        time.sleep(0.5)
+
+    with ThreadPoolExecutor(max_workers=int(n_threads/2)) as executor:
+        futures = [executor.submit(run_batched_search, q, i) for i, q in enumerate(query_data)]
+        done, not_done = wait(futures, return_when=concurrent.futures.ALL_COMPLETED)
+    assert len(done) == num_queries and len(not_done) == 0
+
+    t_insert.join()
+    assert parallel_index.index_size() == num_elements
+    assert parallel_index.check_integrity()
+
+    # Measure recall - expect to get increased recall over time, since vectors are being inserted while queries
+    # are running, and the ground truth is measured compared to the index that contains all the elements.
+    chunk_size = int(num_queries/5)
+    total_correct_prev_chunk = 0
+    for i in range(0, num_queries, chunk_size):
+        total_correct_cur_chunk = 0
+        for j in range(i, i+chunk_size):
+            total_correct_cur_chunk += len(set(g_test_index.total_res_bf[j]).intersection(total_results_parallel[j]))
+        assert total_correct_cur_chunk >= total_correct_prev_chunk
+        total_correct_prev_chunk = total_correct_cur_chunk
+        print(f"Recall for chunk {int(i/chunk_size)+1}/{int(num_queries/chunk_size)} of queries is:"
+              f" {total_correct_cur_chunk/(batch_size*n_batches*chunk_size)}")
diff --git a/tests/flow/test_hnsw_tiered.py b/tests/flow/test_hnsw_tiered.py
new file mode 100644
index 000000000..f27f08a32
--- /dev/null
+++ b/tests/flow/test_hnsw_tiered.py
@@ -0,0 +1,568 @@
+# Copyright Redis Ltd. 2021 - present
+# Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or
+# the Server Side Public License v1 (SSPLv1).
+import time
+from common import *
+
+
+# swap_job_threshold = 0 means use the default swap_job_threshold defined in hnsw_tiered.h
+def create_tiered_hnsw_params(swap_job_threshold = 0):
+    params = TieredHNSWParams()
+    params.swapJobThreshold = swap_job_threshold  # 0 selects the default threshold
+    return params
+
+class IndexCtx:
+    def __init__(self, data_size = 10000, 
+                 dim = 16,                 
+                 M = 16, 
+                 ef_c = 512, 
+                 ef_r = 20, 
+                 metric = VecSimMetric_Cosine, 
+                 data_type = VecSimType_FLOAT32, 
+                 is_multi = False, 
+                 num_per_label = 1,
+                 swap_job_threshold = 0):
+        self.num_vectors = data_size
+        self.dim = dim
+        self.M = M
+        self.efConstruction = ef_c
+        self.efRuntime = ef_r 
+        self.metric = metric
+        self.data_type = data_type
+        self.is_multi = is_multi
+        self.num_per_label = num_per_label
+
+        # Generate data.
+        self.num_labels = int(self.num_vectors/num_per_label)
+        # Fixed seed, shared by the data generated here and the queries from generate_queries().
+        self.rng = np.random.default_rng(seed=47)
+        # Shape is (num_labels, num_per_label, dim) for a multi-value index, (num_labels, dim) otherwise.
+        data_shape = (self.num_labels, num_per_label, self.dim) if is_multi else (self.num_labels, self.dim)
+        data = self.rng.random(data_shape) 
+        self.data = np.float32(data) if self.data_type == VecSimType_FLOAT32 else data
+
+        # The same HNSW parameters are used for the tiered index and for the plain HNSW indexes built below.
+        self.hnsw_params = create_hnsw_params(dim = self.dim, 
+                                              num_elements = self.num_vectors, 
+                                              metric = self.metric,
+                                              data_type = self.data_type,
+                                              ef_construction = ef_c,
+                                              m = M,
+                                              ef_runtime = ef_r,
+                                              is_multi = self.is_multi)
+        self.tiered_hnsw_params = create_tiered_hnsw_params(swap_job_threshold)
+        # The tiered index is created here; flat and plain HNSW indexes are created on demand by the methods.
+        self.tiered_index = Tiered_HNSWIndex(self.hnsw_params, self.tiered_hnsw_params)
+    # Adds every vector under its label; returns (start time, total time spent in add_vector, end time).
+    def populate_index_multi(self, index):
+        start = time.time()
+        duration = 0
+        for label, vectors in enumerate(self.data):
+            for vector in vectors:
+                start_add = time.time()
+                index.add_vector(vector, label)
+                duration += time.time() - start_add
+        end = time.time()
+        return (start, duration, end)
+    # Adds all of self.data to `index` (delegating to populate_index_multi when is_multi); same return tuple.
+    def populate_index(self, index):
+        if self.is_multi:
+            return self.populate_index_multi(index)
+        start = time.time()
+        duration = 0
+        for label, vector in enumerate(self.data):
+            start_add = time.time()
+            index.add_vector(vector, label)
+            duration += time.time() - start_add
+        end = time.time()
+        return (start, duration, end)
+    # Builds a brute-force index with the same dim/type/metric/multi settings and fills it with self.data.
+    def init_and_populate_flat_index(self):
+        bfparams = BFParams()
+        bfparams.initialCapacity = self.num_vectors
+        bfparams.dim = self.dim
+        bfparams.type = self.data_type
+        bfparams.metric = self.metric
+        bfparams.multi = self.is_multi
+        self.flat_index = BFIndex(bfparams)
+        
+        self.populate_index(self.flat_index)
+        
+        return self.flat_index
+    
+    def create_hnsw_index(self):
+        return HNSWIndex(self.hnsw_params)
+    
+    def init_and_populate_hnsw_index(self):
+        hnsw_index = HNSWIndex(self.hnsw_params)
+        self.hnsw_index = hnsw_index
+        
+        self.populate_index(hnsw_index)
+        return hnsw_index
+    # Draws `num_queries` random query vectors of size dim from the seeded generator (float32 when applicable).
+    def generate_queries(self, num_queries):
+        queries = self.rng.random((num_queries, self.dim)) 
+        return np.float32(queries) if self.data_type == VecSimType_FLOAT32 else queries
+    # Raw size of all vectors (4 bytes per element for FLOAT32, 8 otherwise), converted by bytes_to_mega.
+    def get_vectors_memory_size(self):
+        data_type_size = 4 if self.data_type == VecSimType_FLOAT32 else 8
+        return bytes_to_mega(self.num_vectors * self.dim * data_type_size)
+           
+def create_tiered_index(is_multi: bool, num_per_label = 1):
+    indices_ctx = IndexCtx(data_size=50000, is_multi=is_multi, num_per_label=num_per_label)
+    num_labels = indices_ctx.num_labels
+
+    threads_num = TieredIndex.get_threads_num()
+
+    # The HNSW label count is checked against the number of labels, while the log messages report
+    # the number of inserted vectors (the two differ when is_multi is set and num_per_label > 1).
+    num_vectors = indices_ctx.num_vectors
+
+    # Create the tiered index
+    index = indices_ctx.tiered_index
+
+    _, bf_dur, end_add_time = indices_ctx.populate_index(index)
+
+    index.wait_for_index()
+    tiered_index_time = bf_dur + time.time() - end_add_time
+
+    assert index.hnsw_label_count() == num_labels
+
+    # Measure insertion to tiered index.
+    print(f"Insert {num_vectors} vectors into the flat buffer took {round_ms(bf_dur)} ms")
+    print(f"Total time for inserting vectors to the tiered index and indexing them into HNSW using {threads_num} threads took {round_ms(tiered_index_time)} ms")
+
+    # Measure total memory of the tiered index.
+    tiered_memory = bytes_to_mega(index.index_memory())
+
+    print(f"total memory of tiered index = {tiered_memory} MB")
+
+    hnsw_index = HNSWIndex(indices_ctx.hnsw_params)
+    _, hnsw_index_time, _ = indices_ctx.populate_index(hnsw_index)
+
+    print(f"Insert {num_vectors} vectors directly to HNSW index (one by one) took {round_(hnsw_index_time)} s")
+    hnsw_memory = bytes_to_mega(hnsw_index.index_memory())
+    print(f"total memory of hnsw index = {hnsw_memory} MB")
+
+    # The index memory should be at least as the total memory of the vectors.
+    assert hnsw_memory > indices_ctx.get_vectors_memory_size()
+
+    # Tiered index memory should be greater than HNSW index memory.
+    assert tiered_memory > hnsw_memory
+    execution_time_ratio = hnsw_index_time / tiered_index_time
+    print(f"with {threads_num} threads, insertion runtime is {round_(execution_time_ratio)} times better \n")
+    
+
+def search_insert(is_multi: bool, num_per_label = 1):
+    data_size = 100000
+    indices_ctx = IndexCtx(data_size=data_size, is_multi=is_multi, num_per_label=num_per_label)
+    index = indices_ctx.tiered_index
+
+    num_labels = indices_ctx.num_labels
+    print(f"Insert total of {data_size} vectors of dim = {indices_ctx.dim}, "
+          f"{num_per_label} vectors in each label. Total labels = {num_labels}")
+
+    query_data = indices_ctx.generate_queries(num_queries=1)
+
+    # Add vectors to the flat index.
+    bf_index = indices_ctx.init_and_populate_flat_index()
+
+    # Start background insertion to the tiered index.
+    index_start, _, _ = indices_ctx.populate_index(index)
+
+    correct = 0
+    k = 10
+    searches_number = 0
+    print(f"HNSW labels number = {index.hnsw_label_count()}")
+    # run knn query every 1 s. 
+    total_tiered_search_time = 0
+    prev_bf_size = num_labels
+    print("Start running queries while indexing is done in the background")
+    while index.hnsw_label_count() < num_labels:
+        # For each run get the current flat buffer size and the query time.
+        bf_curr_size = index.get_curr_bf_size()
+        query_start = time.time()
+        tiered_labels, _ = index.knn_query(query_data, k)
+        query_dur = time.time() - query_start
+        total_tiered_search_time += query_dur
+
+        print(f"query time = {round_ms(query_dur)} ms")
+
+        # BF size should decrease.
+        print(f"bf size = {bf_curr_size}")
+        assert bf_curr_size < prev_bf_size
+
+        # Run the query also in the bf index to get the ground truth results.
+        bf_labels, _ = bf_index.knn_query(query_data, k)
+        correct += len(np.intersect1d(tiered_labels[0], bf_labels[0]))
+        time.sleep(1)
+        searches_number += 1
+        prev_bf_size = bf_curr_size
+
+    # HNSW labels count updates before the job is done, so we need to wait for the queue to be empty.
+    index.wait_for_index(1)
+    index_dur = time.time() - index_start
+    print(f"Indexing during searching in the tiered index took {round_(index_dur)} s")
+
+    # Measure recall. Fail clearly (instead of dividing by zero) if indexing ended before any query ran.
+    assert searches_number > 0, "indexing finished before any query ran; no recall to measure"
+    recall = float(correct)/(k*searches_number)
+    print("Average recall is:", round_(recall, 3))
+    print("tiered query per seconds: ", round_(searches_number/total_tiered_search_time)) 
+
+
+def test_create_tiered():
+    print("\nTest create tiered hnsw index")
+    create_tiered_index(is_multi=False)  # single-value index: num_per_label keeps its default of 1
+      
+def test_create_multi():
+    print("Test create multi label tiered hnsw index")
+    create_tiered_index(is_multi=True, num_per_label=5)  # multi-value index: 5 vectors share each label
+    
+def test_search_insert():
+    print("\nStart insert & search test")  # plain string: there are no placeholders to format
+    search_insert(is_multi=False)
+    
+def test_search_insert_multi_index():
+    print("\nStart insert & search test for multi index")
+    # Same scenario as the single-value test, with 5 vectors stored under each label.
+    search_insert(is_multi=True, num_per_label=5)
+    
+# In this test we insert the vectors one by one to the tiered index (call wait_for_index after each add vector)
+# We expect to get the same index as if we were inserting the vector to the sync hnsw index.
+# To check that, we perform a knn query with k = vectors number and compare the results' labels
+# to pass the test all the labels and distances should be the same.
+def test_sanity():
+    """Sequential insertion into the tiered index must give the same KNN labels as a plain HNSW index."""
+    indices_ctx = IndexCtx()
+    index = indices_ctx.tiered_index
+    k = indices_ctx.num_labels
+
+    print(f"\nadd {indices_ctx.num_labels} vectors to the tiered index one by one")
+    # Add vectors to the tiered index one by one.
+    for i, vector in enumerate(indices_ctx.data):
+        index.add_vector(vector, i)
+        index.wait_for_index(1)
+
+    assert index.hnsw_label_count() == indices_ctx.num_labels
+
+    # Create hnsw index.
+    hnsw_index = indices_ctx.init_and_populate_hnsw_index()
+
+    query_data = indices_ctx.generate_queries(num_queries=1)
+
+    # Search knn in tiered.
+    tiered_labels, tiered_dist = index.knn_query(query_data, k)
+    # Search knn in hnsw.
+    hnsw_labels, hnsw_dist = hnsw_index.knn_query(query_data, k)
+
+    # Compare position by position, printing every mismatch before failing.
+    has_diff = False
+    for i, hnsw_res_label in enumerate(hnsw_labels[0]):
+        if hnsw_res_label != tiered_labels[0][i]:
+            has_diff = True
+            print(f"hnsw label = {hnsw_res_label}, tiered label = {tiered_labels[0][i]}")
+            print(f"hnsw dist = {hnsw_dist[0][i]}, tiered dist = {tiered_dist[0][i]}")
+
+    assert not has_diff
+    print("hnsw graph is identical to the tiered index graph")
+    
+def test_recall_after_deletion():
+    """Delete every other vector from a tiered and a plain HNSW index, then check the tiered recall."""
+    indices_ctx = IndexCtx(ef_r=30)
+    index = indices_ctx.tiered_index
+    data = indices_ctx.data
+    num_elements = indices_ctx.num_labels
+
+    # Create hnsw index.
+    hnsw_index = indices_ctx.init_and_populate_hnsw_index()
+
+    print(f"\nadd {indices_ctx.num_labels} vectors to the tiered index one by one")
+
+    # Populate tiered index.
+    vectors = []
+    for i, vector in enumerate(data):
+        index.add_vector(vector, i)
+        vectors.append((i, vector))
+
+    index.wait_for_index()
+
+    print("Delete half of the index")
+    # Delete half of the index.
+    for i in range(0, num_elements, 2):
+        index.delete_vector(i)
+        hnsw_index.delete_vector(i)
+
+    # Wait for all repair jobs to be done.
+    index.wait_for_index(5)
+
+    assert index.hnsw_label_count() == (num_elements / 2)
+    assert hnsw_index.index_size() == (num_elements / 2)
+
+    # Create a list of tuples of the vectors that left.
+    vectors = [vectors[i] for i in range(1, num_elements, 2)]
+
+    # Perform queries.
+    num_queries = 10
+    queries = indices_ctx.generate_queries(num_queries=num_queries)
+
+    k = 10
+    correct_tiered = 0
+    correct_hnsw = 0
+
+    # Calculate correct vectors for each index.
+    # We don't expect hnsw and tiered hnsw results to be identical due to the parallel insertion.
+    def calculate_correct(index_labels, keys):
+        # Count how many labels in the first result row are among the true
+        # nearest neighbors given in `keys`.
+        # `keys` is turned into a set so each membership check is O(1)
+        # rather than a scan of the whole list.
+        expected = set(keys)
+        hits = [label for label in index_labels[0] if label in expected]
+        return len(hits)
+
+    for target_vector in queries:
+        tiered_labels, _ = index.knn_query(target_vector, k)
+        hnsw_labels, _ = hnsw_index.knn_query(target_vector, k)
+
+        # Sort distances of every vector from the target vector and get actual k nearest vectors.
+        dists = [(spatial.distance.cosine(target_vector, vec), key) for key, vec in vectors]
+        dists = sorted(dists)
+        keys = [key for _, key in dists[:k]]
+        correct_tiered += calculate_correct(tiered_labels, keys)
+        correct_hnsw += calculate_correct(hnsw_labels, keys)
+
+    # Measure recall.
+    recall_tiered = float(correct_tiered) / (k * num_queries)
+    recall_hnsw = float(correct_hnsw) / (k * num_queries)
+    print("HNSW tiered recall is: \n", recall_tiered)
+    print("HNSW recall is: \n", recall_hnsw)
+    assert (recall_tiered >= 0.9)
+
+
+def test_batch_iterator():
+    num_elements = 100000
+    dim = 100
+    M = 26
+    efConstruction = 180
+    efRuntime = 180
+    metric = VecSimMetric_L2
+    indices_ctx = IndexCtx(data_size=num_elements, 
+                           dim=dim, 
+                           M=M, 
+                           ef_c=efConstruction, 
+                           ef_r=efRuntime, 
+                           metric=metric)
+
+    index = indices_ctx.tiered_index
+    data = indices_ctx.data
+    
+    print(f"\n Test batch iterator in tiered index")
+    
+    vectors = []
+    # Add 100k random vectors to the index.
+    for i, vector in enumerate(data):
+        index.add_vector(vector, i)
+        vectors.append((i, vector))
+    # NOTE(review): no wait_for_index() here, so queries presumably run while indexing is in progress - confirm.
+    # Create a random query vector and create a batch iterator.
+    query_data = indices_ctx.generate_queries(num_queries=1)
+    batch_iterator = index.create_batch_iterator(query_data)
+    batch_size = 10
+    labels_first_batch, distances_first_batch = batch_iterator.get_next_results(batch_size, BY_ID)
+    
+    for i, _ in enumerate(labels_first_batch[0][:-1]):
+        # Assert sorting by id.
+        assert (labels_first_batch[0][i] < labels_first_batch[0][i + 1])
+
+    labels_second_batch, distances_second_batch = batch_iterator.get_next_results(batch_size, BY_SCORE)
+    should_have_return_in_first_batch = []
+    for i, dist in enumerate(distances_second_batch[0][:-1]):
+        # Assert sorting by score.
+        assert (distances_second_batch[0][i] < distances_second_batch[0][i + 1])
+        # Collect second-batch distances that are lower than some first-batch distance (at most 2 are tolerated).
+        if len(distances_first_batch[0][np.where(distances_first_batch[0] > dist)]) != 0:
+            should_have_return_in_first_batch.append(dist)
+    assert (len(should_have_return_in_first_batch) <= 2)
+
+    # Verify that runtime args are sent properly to the batch iterator.
+    query_params = VecSimQueryParams()
+    query_params.hnswRuntimeParams.efRuntime = 5
+    
+    batch_iterator_new = index.create_batch_iterator(query_data, query_params)
+    
+    labels_first_batch_new, distances_first_batch_new = batch_iterator_new.get_next_results(batch_size, BY_ID)
+    # Verify that accuracy is worse with the new lower ef_runtime.
+    assert (sum(labels_first_batch[0]) <= sum(labels_first_batch_new[0]))
+
+    query_params.hnswRuntimeParams.efRuntime = efRuntime  # Restore previous ef_runtime.
+    batch_iterator_new = index.create_batch_iterator(query_data, query_params)
+    labels_first_batch_new, distances_first_batch_new = batch_iterator_new.get_next_results(batch_size, BY_ID)
+    # Verify that results are now the same.
+    assert_allclose(labels_first_batch_new[0], labels_first_batch[0])
+
+    # Reset.
+    batch_iterator.reset()
+
+    # Run in batches of 100 until we reach 1000 results and measure recall.
+    batch_size = 100
+    total_res = 1000
+    total_recall = 0
+    num_queries = 10
+    query_data = indices_ctx.generate_queries(num_queries=num_queries)
+    for target_vector in query_data:
+        correct = 0
+        batch_iterator = index.create_batch_iterator(target_vector)
+        iterations = 0
+        # Sort distances of every vector from the target vector and get the actual order.
+        dists = [(spatial.distance.euclidean(target_vector, vec), key) for key, vec in vectors]
+        dists = sorted(dists)
+        accumulated_labels = []
+        while batch_iterator.has_next():
+            iterations += 1
+            labels, distances = batch_iterator.get_next_results(batch_size, BY_SCORE)
+            accumulated_labels.extend(labels[0])
+            returned_results_num = len(accumulated_labels)
+            if returned_results_num == total_res:
+                keys = [key for _, key in dists[:returned_results_num]]
+                correct += len(set(accumulated_labels).intersection(set(keys)))
+                break
+        assert iterations == np.ceil(total_res / batch_size)
+        recall = float(correct) / total_res
+        assert recall >= 0.89
+        total_recall += recall
+    print(f'\nAvg recall for {total_res} results in index of size {num_elements} with dim={dim} is: ',
+          round_(total_recall / num_queries))
+
+    # Run again a single query in batches until it is depleted.
+    batch_iterator = index.create_batch_iterator(query_data[0])
+    iterations = 0
+    accumulated_labels = set()
+
+    while batch_iterator.has_next():
+        iterations += 1
+        labels, distances = batch_iterator.get_next_results(batch_size, BY_SCORE)
+        # Verify that every batch returns only labels that were not returned by a previous batch.
+        assert len(accumulated_labels.intersection(set(labels[0]))) == 0
+        accumulated_labels = accumulated_labels.union(set(labels[0]))
+    assert len(accumulated_labels) >= 0.95 * num_elements
+    print("Overall results returned:", len(accumulated_labels), "in", iterations, "iterations")
+
+def test_range_query():
+    num_elements = 100000
+    dim = 100
+    efConstruction = 200
+    efRuntime = 10
+    metric = VecSimMetric_L2
+    
+    indices_ctx = IndexCtx(data_size=num_elements, 
+                        dim=dim, 
+                        ef_c=efConstruction, 
+                        ef_r=efRuntime,
+                        metric=metric)
+
+    index = indices_ctx.tiered_index
+    data = indices_ctx.data
+
+    vectors = []
+    for i, vector in enumerate(data):
+        index.add_vector(vector, i)
+        vectors.append((i, vector))
+
+    query_data = indices_ctx.generate_queries(num_queries=1)
+
+    radius = 13.0
+    recalls = {}
+    # The ground truth does not depend on epsilon, so compute it once: labels within `radius` of the query.
+    dists = [(key, spatial.distance.sqeuclidean(query_data.flat, vec)) for key, vec in vectors]
+    actual_labels = np.array([key for key, dist in dists if dist <= radius])
+
+    for epsilon_rt in [0.001, 0.01, 0.1]:
+        query_params = VecSimQueryParams()
+        query_params.hnswRuntimeParams.epsilon = epsilon_rt
+        start = time.time()
+        tiered_labels, tiered_distances = index.range_query(query_data, radius=radius, query_param=query_params)
+        end = time.time()
+        res_num = len(tiered_labels[0])
+
+        print(
+            f'\nlookup time for {num_elements} vectors with dim={dim} took {end - start} seconds with epsilon={epsilon_rt},'
+            f' got {res_num} results, which are {res_num / len(actual_labels)} of the entire results in the range.')
+
+        # Every returned label must be one of the labels that are actually within the range.
+        assert np.all(np.isin(tiered_labels, actual_labels))
+
+        assert max(tiered_distances[0]) <= radius
+        recalls[epsilon_rt] = res_num / len(actual_labels)
+    # NOTE(review): recalls is collected but never asserted here (test_multi_range_query does) - confirm intended.
+    # Expect zero results for radius==0
+    tiered_labels, tiered_distances = index.range_query(query_data, radius=0)
+    assert len(tiered_labels[0]) == 0
+
+def test_multi_range_query():
+    num_labels = 20000
+    per_label = 5
+    num_elements = num_labels * per_label
+    
+    dim = 100
+    efConstruction = 200
+    efRuntime = 10
+    metric = VecSimMetric_L2
+
+    indices_ctx = IndexCtx(data_size=num_elements, 
+                        dim=dim, 
+                        ef_c=efConstruction, 
+                        ef_r=efRuntime,
+                        metric=metric,
+                        is_multi=True,
+                        num_per_label=per_label)
+    
+    index = indices_ctx.tiered_index
+    data = indices_ctx.data
+    
+    vectors = []
+    for label, vecs in enumerate(data):
+        for vector in vecs:
+            index.add_vector(vector, label)
+            vectors.append((label, vector))
+
+    query_data = indices_ctx.generate_queries(num_queries=1)
+
+    radius = 13.0
+    recalls = {}
+    # For every label keep the smallest squared L2 distance between the query and any of the label's vectors.
+    dists = {}
+    for key, vec in vectors:
+        dists[key] = min(spatial.distance.sqeuclidean(query_data.flat, vec), dists.get(key, np.inf))
+
+    dists = list(dists.items())
+    dists = sorted(dists, key=lambda pair: pair[1])
+    keys = [key for key, dist in dists if dist <= radius]
+
+    for epsilon_rt in [0.001, 0.01, 0.1]:
+        query_params = VecSimQueryParams()
+        query_params.hnswRuntimeParams.epsilon = epsilon_rt
+        start = time.time()
+        tiered_labels, tiered_distances = index.range_query(query_data, radius=radius, query_param=query_params)
+        end = time.time()
+        res_num = len(tiered_labels[0])
+
+        print(
+            f'\nlookup time for ({num_labels} X {per_label}) vectors with dim={dim} took {end - start} seconds with epsilon={epsilon_rt},'
+            f' got {res_num} results, which are {res_num / len(keys)} of the entire results in the range.')
+
+        # Every returned label must be one of the labels that are actually within the range.
+        assert np.all(np.isin(tiered_labels, np.array(keys)))
+
+        # Asserts that all the results are unique
+        assert len(tiered_labels[0]) == len(np.unique(tiered_labels[0]))
+
+        assert max(tiered_distances[0]) <= radius
+        recalls[epsilon_rt] = res_num / len(keys)
+
+    # Expect higher recalls for higher epsilon values.
+    assert recalls[0.001] <= recalls[0.01] <= recalls[0.1]
+
+    # Expect zero results for radius==0
+    tiered_labels, tiered_distances = index.range_query(query_data, radius=0)
+    assert len(tiered_labels[0]) == 0
diff --git a/tests/module/memory_test.c b/tests/module/memory_test.c
index 4865f1ba2..1d7c56349 100644
--- a/tests/module/memory_test.c
+++ b/tests/module/memory_test.c
@@ -27,18 +27,7 @@ long long _get_memory_usage(RedisModuleCtx *ctx) {
 // Adds 'amount' vectors to the index. could be 0.
 void _add_vectors(VecSimIndex *index, long long amount) {
     VecSimIndexInfo indexInfo = VecSimIndex_Info(index);
-    size_t dim;
-    switch (indexInfo.algo) {
-    case VecSimAlgo_BF:
-        dim = indexInfo.bfInfo.dim;
-        break;
-    case VecSimAlgo_HNSWLIB:
-        dim = indexInfo.hnswInfo.dim;
-        break;
-
-    default:
-        break;
-    }
+    size_t dim = indexInfo.commonInfo.basicInfo.dim;
     double vec[dim];
     for (int i = 0; i < dim; i++)
         vec[i] = i;
@@ -53,7 +42,7 @@ void _delete_vectors(VecSimIndex *index, long long amount) {
         VecSimIndex_DeleteVector(index, i);
 }
 
-// Creates a generic index, supports Broute Force and HNSW.
+// Creates a generic index, supports Brute Force and HNSW.
 VecSimIndex *_create_index(VecSimAlgo algo) {
 
     VecSimParams param = {0};
@@ -78,6 +67,9 @@ VecSimIndex *_create_index(VecSimAlgo algo) {
         param.hnswParams.metric = VecSimMetric_L2;
         param.hnswParams.multi = false;
         break;
+    // TODO: add memory test for tiered index
+    case VecSimAlgo_TIERED:
+        return NULL;
     }
 
     return VecSimIndex_New(&param);
@@ -120,18 +112,8 @@ int _VecSim_memory_create_check_impl(RedisModuleCtx *ctx, VecSimAlgo algo, long
 
     // Actual test: verify that memory usage known to the server is at least the memory amount used
     // by the index.
-    int64_t memory;
-    switch (indexInfo.algo) {
-    case VecSimAlgo_BF:
-        memory = indexInfo.bfInfo.memory;
-        break;
-    case VecSimAlgo_HNSWLIB:
-        memory = indexInfo.hnswInfo.memory;
-        break;
+    uint64_t memory = indexInfo.commonInfo.memory;
 
-    default:
-        break;
-    }
     if (memory <= endMemory - startMemory)
         RedisModule_ReplyWithSimpleString(ctx, "OK");
     else
diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt
index 40a3ed330..9981e2fd5 100644
--- a/tests/unit/CMakeLists.txt
+++ b/tests/unit/CMakeLists.txt
@@ -21,13 +21,21 @@ set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_LINKER_FLAGS} ${LLVM_LD_FLAGS}")
 
 enable_testing()
 
+option(FP64_TESTS "Build fp 64 tests" OFF)
+
+if(FP64_TESTS)
+	add_definitions(-DFP64_TESTS)
+endif()
+
 add_executable(test_hnsw test_hnsw.cpp test_hnsw_multi.cpp test_hnsw_tiered.cpp test_utils.cpp)
+add_executable(test_hnsw_parallel test_hnsw_parallel.cpp test_utils.cpp)
 add_executable(test_bruteforce test_bruteforce.cpp test_bruteforce_multi.cpp test_utils.cpp)
 add_executable(test_allocator test_allocator.cpp test_utils.cpp)
 add_executable(test_spaces test_spaces.cpp)
 add_executable(test_common test_common.cpp test_utils.cpp)
 
 target_link_libraries(test_hnsw PUBLIC gtest_main VectorSimilarity)
+target_link_libraries(test_hnsw_parallel PUBLIC gtest_main VectorSimilarity)
 target_link_libraries(test_bruteforce PUBLIC gtest_main VectorSimilarity)
 target_link_libraries(test_allocator PUBLIC gtest_main VectorSimilarity)
 target_link_libraries(test_spaces PUBLIC gtest_main VectorSimilarity)
@@ -36,6 +44,7 @@ target_link_libraries(test_common PUBLIC gtest_main VectorSimilarity)
 include(GoogleTest)
 
 gtest_discover_tests(test_hnsw)
+gtest_discover_tests(test_hnsw_parallel)
 gtest_discover_tests(test_bruteforce)
 gtest_discover_tests(test_allocator)
 gtest_discover_tests(test_spaces)
diff --git a/tests/unit/test_allocator.cpp b/tests/unit/test_allocator.cpp
index 8d236b4ef..7fe109762 100644
--- a/tests/unit/test_allocator.cpp
+++ b/tests/unit/test_allocator.cpp
@@ -12,9 +12,9 @@
 #include "VecSim/algorithms/hnsw/hnsw_single.h"
 #include "test_utils.h"
 #include "VecSim/utils/serializer.h"
-#include "VecSim/algorithms/hnsw/hnsw_factory.h"
+#include "VecSim/index_factories/hnsw_factory.h"
 
-const size_t vecsimAllocationOverhead = sizeof(size_t);
+const size_t vecsimAllocationOverhead = VecSimAllocator::getAllocationOverheadSize();
 
 const size_t hashTableNodeSize = getLabelsLookupNodeSize();
 
@@ -88,9 +88,6 @@ class IndexAllocatorTest : public ::testing::Test {};
 TYPED_TEST_SUITE(IndexAllocatorTest, DataTypeSet);
 
 TYPED_TEST(IndexAllocatorTest, test_bf_index_block_size_1) {
-    std::shared_ptr<VecSimAllocator> allocator = VecSimAllocator::newVecsimAllocator();
-    uint64_t expectedAllocationSize = sizeof(VecSimAllocator);
-    ASSERT_EQ(allocator->getAllocationSize(), expectedAllocationSize);
     // Create only the minimal struct.
     size_t dim = 128;
     BFParams params = {.type = TypeParam::get_index_type(),
@@ -98,17 +95,20 @@ TYPED_TEST(IndexAllocatorTest, test_bf_index_block_size_1) {
                        .metric = VecSimMetric_IP,
                        .initialCapacity = 0,
                        .blockSize = 1};
-
+    auto *bfIndex = dynamic_cast<BruteForceIndex_Single<TEST_DATA_T, TEST_DIST_T> *>(
+        BruteForceFactory::NewIndex(&params));
+    auto allocator = bfIndex->getAllocator();
     TEST_DATA_T vec[128] = {};
-    BruteForceIndex_Single<TEST_DATA_T, TEST_DIST_T> *bfIndex =
-        new (allocator) BruteForceIndex_Single<TEST_DATA_T, TEST_DIST_T>(&params, allocator);
+    uint64_t expectedAllocationSize = sizeof(VecSimAllocator);
     expectedAllocationSize +=
         sizeof(BruteForceIndex_Single<TEST_DATA_T, TEST_DIST_T>) + vecsimAllocationOverhead;
     ASSERT_EQ(allocator->getAllocationSize(), expectedAllocationSize);
     VecSimIndexInfo info = bfIndex->info();
-    ASSERT_EQ(allocator->getAllocationSize(), info.bfInfo.memory);
+    ASSERT_EQ(allocator->getAllocationSize(), info.commonInfo.memory);
 
-    int addCommandAllocationDelta = VecSimIndex_AddVector(bfIndex, vec, 1);
+    int before = allocator->getAllocationSize();
+    VecSimIndex_AddVector(bfIndex, vec, 1);
+    int addCommandAllocationDelta = allocator->getAllocationSize() - before;
     int64_t expectedAllocationDelta = 0;
     expectedAllocationDelta +=
         sizeof(labelType) + vecsimAllocationOverhead; // resize idToLabelMapping
@@ -125,13 +125,15 @@ TYPED_TEST(IndexAllocatorTest, test_bf_index_block_size_1) {
     ASSERT_LE(expectedAllocationSize + expectedAllocationDelta, allocator->getAllocationSize());
     ASSERT_LE(expectedAllocationDelta, addCommandAllocationDelta);
     info = bfIndex->info();
-    ASSERT_EQ(allocator->getAllocationSize(), info.bfInfo.memory);
+    ASSERT_EQ(allocator->getAllocationSize(), info.commonInfo.memory);
 
     // Prepare for next assertion test
-    expectedAllocationSize = info.bfInfo.memory;
+    expectedAllocationSize = info.commonInfo.memory;
     expectedAllocationDelta = 0;
 
-    addCommandAllocationDelta = VecSimIndex_AddVector(bfIndex, vec, 2);
+    before = allocator->getAllocationSize();
+    VecSimIndex_AddVector(bfIndex, vec, 2);
+    addCommandAllocationDelta = allocator->getAllocationSize() - before;
     expectedAllocationDelta += sizeof(VectorBlock) + vecsimAllocationOverhead; // New vector block
     expectedAllocationDelta += sizeof(labelType); // resize idToLabelMapping
     expectedAllocationDelta +=
@@ -146,13 +148,15 @@ TYPED_TEST(IndexAllocatorTest, test_bf_index_block_size_1) {
     ASSERT_LE(expectedAllocationSize + expectedAllocationDelta, allocator->getAllocationSize());
     ASSERT_LE(expectedAllocationDelta, addCommandAllocationDelta);
     info = bfIndex->info();
-    ASSERT_EQ(allocator->getAllocationSize(), info.bfInfo.memory);
+    ASSERT_EQ(allocator->getAllocationSize(), info.commonInfo.memory);
 
     // Prepare for next assertion test
-    expectedAllocationSize = info.bfInfo.memory;
+    expectedAllocationSize = info.commonInfo.memory;
     expectedAllocationDelta = 0;
 
-    int deleteCommandAllocationDelta = VecSimIndex_DeleteVector(bfIndex, 2);
+    before = allocator->getAllocationSize();
+    VecSimIndex_DeleteVector(bfIndex, 2);
+    int deleteCommandAllocationDelta = allocator->getAllocationSize() - before;
     expectedAllocationDelta -=
         (sizeof(VectorBlock) + vecsimAllocationOverhead); // Free the vector block
     expectedAllocationDelta -=
@@ -166,17 +170,19 @@ TYPED_TEST(IndexAllocatorTest, test_bf_index_block_size_1) {
     // collection allocate additional structures for their internal implementation.
     ASSERT_EQ(allocator->getAllocationSize(),
               expectedAllocationSize + deleteCommandAllocationDelta);
-    ASSERT_LE(expectedAllocationSize + expectedAllocationDelta, allocator->getAllocationSize());
-    ASSERT_LE(expectedAllocationDelta, deleteCommandAllocationDelta);
+    ASSERT_GE(expectedAllocationSize + expectedAllocationDelta, allocator->getAllocationSize());
+    ASSERT_GE(expectedAllocationDelta, deleteCommandAllocationDelta);
 
     info = bfIndex->info();
-    ASSERT_EQ(allocator->getAllocationSize(), info.bfInfo.memory);
+    ASSERT_EQ(allocator->getAllocationSize(), info.commonInfo.memory);
 
     // Prepare for next assertion test
-    expectedAllocationSize = info.bfInfo.memory;
+    expectedAllocationSize = info.commonInfo.memory;
     expectedAllocationDelta = 0;
 
-    deleteCommandAllocationDelta = VecSimIndex_DeleteVector(bfIndex, 1);
+    before = allocator->getAllocationSize();
+    VecSimIndex_DeleteVector(bfIndex, 1);
+    deleteCommandAllocationDelta = allocator->getAllocationSize() - before;
     expectedAllocationDelta -=
         (sizeof(VectorBlock) + vecsimAllocationOverhead); // Free the vector block
     expectedAllocationDelta -=
@@ -195,14 +201,11 @@ TYPED_TEST(IndexAllocatorTest, test_bf_index_block_size_1) {
     ASSERT_LE(expectedAllocationSize + expectedAllocationDelta, allocator->getAllocationSize());
     ASSERT_LE(expectedAllocationDelta, deleteCommandAllocationDelta);
     info = bfIndex->info();
-    ASSERT_EQ(allocator->getAllocationSize(), info.bfInfo.memory);
+    ASSERT_EQ(allocator->getAllocationSize(), info.commonInfo.memory);
     VecSimIndex_Free(bfIndex);
 }
 
 TYPED_TEST(IndexAllocatorTest, test_hnsw) {
-    std::shared_ptr<VecSimAllocator> allocator = VecSimAllocator::newVecsimAllocator();
-    uint64_t expectedAllocationSize = sizeof(VecSimAllocator);
-    ASSERT_EQ(allocator->getAllocationSize(), expectedAllocationSize);
     size_t d = 128;
 
     // Build with default args
@@ -212,45 +215,59 @@ TYPED_TEST(IndexAllocatorTest, test_hnsw) {
                          .initialCapacity = 0};
 
     TEST_DATA_T vec[128] = {};
-    HNSWIndex_Single<TEST_DATA_T, TEST_DIST_T> *hnswIndex =
-        new (allocator) HNSWIndex_Single<TEST_DATA_T, TEST_DIST_T>(&params, allocator);
+    auto *hnswIndex =
+        dynamic_cast<HNSWIndex_Single<TEST_DATA_T, TEST_DIST_T> *>(HNSWFactory::NewIndex(&params));
+    auto allocator = hnswIndex->getAllocator();
+    uint64_t expectedAllocationSize = sizeof(VecSimAllocator);
+
     expectedAllocationSize +=
         sizeof(HNSWIndex_Single<TEST_DATA_T, TEST_DIST_T>) + vecsimAllocationOverhead;
     ASSERT_GE(allocator->getAllocationSize(), expectedAllocationSize);
     VecSimIndexInfo info = hnswIndex->info();
-    ASSERT_EQ(allocator->getAllocationSize(), info.hnswInfo.memory);
-    expectedAllocationSize = info.hnswInfo.memory;
+    ASSERT_EQ(allocator->getAllocationSize(), info.commonInfo.memory);
+    expectedAllocationSize = info.commonInfo.memory;
 
-    int addCommandAllocationDelta = VecSimIndex_AddVector(hnswIndex, vec, 1);
+    int before = allocator->getAllocationSize();
+    VecSimIndex_AddVector(hnswIndex, vec, 1);
+    int addCommandAllocationDelta = allocator->getAllocationSize() - before;
     ASSERT_EQ(allocator->getAllocationSize(), expectedAllocationSize + addCommandAllocationDelta);
     info = hnswIndex->info();
-    ASSERT_EQ(allocator->getAllocationSize(), info.hnswInfo.memory);
-    expectedAllocationSize = info.hnswInfo.memory;
+    ASSERT_EQ(allocator->getAllocationSize(), info.commonInfo.memory);
+    expectedAllocationSize = info.commonInfo.memory;
+
+    before = allocator->getAllocationSize();
+    VecSimIndex_AddVector(hnswIndex, vec, 2);
+    addCommandAllocationDelta = allocator->getAllocationSize() - before;
 
-    addCommandAllocationDelta = VecSimIndex_AddVector(hnswIndex, vec, 2);
     ASSERT_EQ(allocator->getAllocationSize(), expectedAllocationSize + addCommandAllocationDelta);
     info = hnswIndex->info();
-    ASSERT_EQ(allocator->getAllocationSize(), info.hnswInfo.memory);
+    ASSERT_EQ(allocator->getAllocationSize(), info.commonInfo.memory);
 
-    expectedAllocationSize = info.hnswInfo.memory;
+    expectedAllocationSize = info.commonInfo.memory;
+
+    before = allocator->getAllocationSize();
+    VecSimIndex_DeleteVector(hnswIndex, 2);
+    int deleteCommandAllocationDelta = allocator->getAllocationSize() - before;
 
-    int deleteCommandAllocationDelta = VecSimIndex_DeleteVector(hnswIndex, 2);
     ASSERT_EQ(expectedAllocationSize + deleteCommandAllocationDelta,
               allocator->getAllocationSize());
     info = hnswIndex->info();
-    ASSERT_EQ(allocator->getAllocationSize(), info.hnswInfo.memory);
-    expectedAllocationSize = info.hnswInfo.memory;
+    ASSERT_EQ(allocator->getAllocationSize(), info.commonInfo.memory);
+    expectedAllocationSize = info.commonInfo.memory;
+
+    before = allocator->getAllocationSize();
+    VecSimIndex_DeleteVector(hnswIndex, 1);
+    deleteCommandAllocationDelta = allocator->getAllocationSize() - before;
 
-    deleteCommandAllocationDelta = VecSimIndex_DeleteVector(hnswIndex, 1);
     ASSERT_EQ(expectedAllocationSize + deleteCommandAllocationDelta,
               allocator->getAllocationSize());
     info = hnswIndex->info();
-    ASSERT_EQ(allocator->getAllocationSize(), info.hnswInfo.memory);
+    ASSERT_EQ(allocator->getAllocationSize(), info.commonInfo.memory);
     VecSimIndex_Free(hnswIndex);
 }
 
 TYPED_TEST(IndexAllocatorTest, testIncomingEdgesSet) {
-    std::shared_ptr<VecSimAllocator> allocator = VecSimAllocator::newVecsimAllocator();
+
     size_t d = 2;
 
     // Build index, use small M to simplify the scenario.
@@ -260,14 +277,17 @@ TYPED_TEST(IndexAllocatorTest, testIncomingEdgesSet) {
                          .initialCapacity = 10,
                          .M = 2};
     auto *hnswIndex =
-        new (allocator) HNSWIndex_Single<TEST_DATA_T, TEST_DIST_T>(&params, allocator);
+        dynamic_cast<HNSWIndex_Single<TEST_DATA_T, TEST_DIST_T> *>(HNSWFactory::NewIndex(&params));
+    auto allocator = hnswIndex->getAllocator();
 
     // Add a "dummy" vector - labels_lookup hash table will allocate initial size of buckets here.
     GenerateAndAddVector<TEST_DATA_T>(hnswIndex, d, 0, 0.0);
 
     // Add another vector and validate it's exact memory allocation delta.
     TEST_DATA_T vec1[] = {1.0, 0.0};
-    int allocation_delta = VecSimIndex_AddVector(hnswIndex, vec1, 1);
+    int before = allocator->getAllocationSize();
+    VecSimIndex_AddVector(hnswIndex, vec1, 1);
+    int allocation_delta = allocator->getAllocationSize() - before;
     size_t vec_max_level = hnswIndex->element_levels_[1];
 
     // Expect the creation of an empty incoming edges set in every level (+ the allocator header
@@ -302,7 +322,9 @@ TYPED_TEST(IndexAllocatorTest, testIncomingEdgesSet) {
     // set.
     TEST_DATA_T vec5[] = {0.5f, 0.0f};
     size_t buckets_num_before = hnswIndex->label_lookup_.bucket_count();
-    allocation_delta = VecSimIndex_AddVector(hnswIndex, vec5, 5);
+    before = allocator->getAllocationSize();
+    VecSimIndex_AddVector(hnswIndex, vec5, 5);
+    allocation_delta = allocator->getAllocationSize() - before;
     vec_max_level = hnswIndex->element_levels_[5];
 
     /* Compute the expected allocation delta:
@@ -333,7 +355,6 @@ TYPED_TEST(IndexAllocatorTest, testIncomingEdgesSet) {
 }
 
 TYPED_TEST(IndexAllocatorTest, test_hnsw_reclaim_memory) {
-    std::shared_ptr<VecSimAllocator> allocator = VecSimAllocator::newVecsimAllocator();
     size_t d = 128;
 
     VecSimType type = TypeParam::get_index_type();
@@ -341,8 +362,8 @@ TYPED_TEST(IndexAllocatorTest, test_hnsw_reclaim_memory) {
     // Build HNSW index with default args and initial capacity of zero.
     HNSWParams params = {.type = type, .dim = d, .metric = VecSimMetric_L2, .initialCapacity = 0};
     auto *hnswIndex =
-        new (allocator) HNSWIndex_Single<TEST_DATA_T, TEST_DIST_T>(&params, allocator);
-
+        dynamic_cast<HNSWIndex_Single<TEST_DATA_T, TEST_DIST_T> *>(HNSWFactory::NewIndex(&params));
+    auto allocator = hnswIndex->getAllocator();
     ASSERT_EQ(hnswIndex->indexCapacity(), 0);
     size_t initial_memory_size = allocator->getAllocationSize();
     // labels_lookup and element_levels containers are not allocated at all in some platforms,
@@ -354,12 +375,15 @@ TYPED_TEST(IndexAllocatorTest, test_hnsw_reclaim_memory) {
                                        2 * vecsimAllocationOverhead);
 
     // Add vectors up to the size of a whole block, and calculate the total memory delta.
-    size_t block_size = hnswIndex->info().hnswInfo.blockSize;
-    size_t accumulated_mem_delta = 0;
+    size_t block_size = hnswIndex->info().commonInfo.basicInfo.blockSize;
 
+    size_t accumulated_mem_delta = allocator->getAllocationSize();
     for (size_t i = 0; i < block_size; i++) {
-        accumulated_mem_delta += GenerateAndAddVector<TEST_DATA_T>(hnswIndex, d, i, i);
+        GenerateAndAddVector<TEST_DATA_T>(hnswIndex, d, i, i);
     }
+    // Get the memory delta after adding the block.
+    accumulated_mem_delta = allocator->getAllocationSize() - accumulated_mem_delta;
+
     // Validate that a single block exists.
     ASSERT_EQ(hnswIndex->indexSize(), block_size);
     ASSERT_EQ(hnswIndex->indexCapacity(), block_size);
@@ -370,7 +394,9 @@ TYPED_TEST(IndexAllocatorTest, test_hnsw_reclaim_memory) {
 
     // Add another vector, expect resizing of the index to contain two blocks.
     size_t prev_bucket_count = hnswIndex->label_lookup_.bucket_count();
-    size_t mem_delta = GenerateAndAddVector<TEST_DATA_T>(hnswIndex, d, block_size, block_size);
+    size_t mem_delta = allocator->getAllocationSize();
+    GenerateAndAddVector<TEST_DATA_T>(hnswIndex, d, block_size, block_size);
+    mem_delta = allocator->getAllocationSize() - mem_delta;
 
     ASSERT_EQ(hnswIndex->indexSize(), block_size + 1);
     ASSERT_EQ(hnswIndex->indexCapacity(), 2 * block_size);
@@ -388,9 +414,9 @@ TYPED_TEST(IndexAllocatorTest, test_hnsw_reclaim_memory) {
     // Also account for all the memory allocation caused by the resizing that this vector triggered
     // except for the bucket count of the labels_lookup hash table that is calculated separately.
     size_t size_total_data_per_element = hnswIndex->size_data_per_element_;
-    expected_mem_delta +=
-        (sizeof(tag_t) + sizeof(void *) + sizeof(size_t) + size_total_data_per_element) *
-        block_size;
+    expected_mem_delta += (sizeof(tag_t) + sizeof(void *) + sizeof(size_t) +
+                           size_total_data_per_element + sizeof(std::mutex)) *
+                          block_size;
     expected_mem_delta +=
         (hnswIndex->label_lookup_.bucket_count() - prev_bucket_count) * sizeof(size_t);
 
diff --git a/tests/unit/test_bruteforce.cpp b/tests/unit/test_bruteforce.cpp
index 870485a87..ede0ef6df 100644
--- a/tests/unit/test_bruteforce.cpp
+++ b/tests/unit/test_bruteforce.cpp
@@ -108,7 +108,7 @@ TYPED_TEST(BruteForceTest, brute_force_vector_update_test) {
 
 TYPED_TEST(BruteForceTest, resize_and_align_index) {
     size_t dim = 4;
-    size_t n = 15;
+    size_t n = 14;
     size_t blockSize = 10;
 
     BFParams params = {
@@ -134,24 +134,26 @@ TYPED_TEST(BruteForceTest, resize_and_align_index) {
 
     // Add another vector, since index size equals to the capacity, this should cause resizing
     // (to fit a multiplication of block_size).
-    GenerateAndAddVector<TEST_DATA_T>(index, dim, n + 1);
+    GenerateAndAddVector<TEST_DATA_T>(index, dim, n);
     ASSERT_EQ(VecSimIndex_IndexSize(index), n + 1);
     // Check new capacity size, should be blockSize * 2.
     ASSERT_EQ(bf_index->idToLabelMapping.size(), 2 * blockSize);
+    ASSERT_EQ(bf_index->idToLabelMapping.capacity(), 2 * blockSize);
 
-    // Now size = n + 1 = 16, capacity = 2* bs = 20. Test capacity overflow again
-    // to check that it stays aligned with blocksize.
+    // Now size = n + 1 (= 15), capacity = 2 * bs (= 20). Test capacity overflow again
+    // to check that it stays aligned with block size.
 
     size_t add_vectors_count = 8;
     for (size_t i = 0; i < add_vectors_count; i++) {
         GenerateAndAddVector<TEST_DATA_T>(index, dim, n + 2 + i, i);
     }
 
-    // Size should be n + 1 + 8 = 24.
+    // Size should be n + 1 + 8 (= 23).
     ASSERT_EQ(VecSimIndex_IndexSize(index), n + 1 + add_vectors_count);
 
     // Check new capacity size, should be blockSize * 3.
     ASSERT_EQ(bf_index->idToLabelMapping.size(), 3 * blockSize);
+    ASSERT_EQ(bf_index->idToLabelMapping.capacity(), 3 * blockSize);
 
     VecSimIndex_Free(index);
 }
@@ -170,7 +172,7 @@ TYPED_TEST(BruteForceTest, resize_and_align_index_largeInitialCapacity) {
     BruteForceIndex<TEST_DATA_T, TEST_DIST_T> *bf_index = this->CastToBF(index);
     ASSERT_EQ(VecSimIndex_IndexSize(index), 0);
 
-    // add up to blocksize + 1 = 3 + 1 = 4
+    // Add up to block size + 1 = 3 + 1 = 4
     for (size_t i = 0; i < bs + 1; i++) {
         GenerateAndAddVector<TEST_DATA_T>(index, dim, i, i);
     }
@@ -190,6 +192,7 @@ TYPED_TEST(BruteForceTest, resize_and_align_index_largeInitialCapacity) {
     // 10  - 3 - 10 % 3 (1) = 6
     idToLabelMapping_size = bf_index->idToLabelMapping.size();
     ASSERT_EQ(idToLabelMapping_size, n - bs - n % bs);
+    ASSERT_EQ(idToLabelMapping_size, bf_index->idToLabelMapping.capacity());
 
     // Delete all the vectors to decrease idToLabelMapping size by another bs.
     size_t i = 0;
@@ -198,19 +201,24 @@ TYPED_TEST(BruteForceTest, resize_and_align_index_largeInitialCapacity) {
         ++i;
     }
     ASSERT_EQ(bf_index->idToLabelMapping.size(), bs);
+    ASSERT_EQ(bf_index->idToLabelMapping.capacity(), bs);
+
     // Add and delete a vector to achieve:
     // size % block_size == 0 && size + bs <= idToLabelMapping_size(3).
     // idToLabelMapping_size should be resized to zero.
     GenerateAndAddVector<TEST_DATA_T>(index, dim, 0);
     VecSimIndex_DeleteVector(index, 0);
     ASSERT_EQ(bf_index->idToLabelMapping.size(), 0);
+    ASSERT_EQ(bf_index->idToLabelMapping.capacity(), 0);
 
     // Do it again. This time after adding a vector idToLabelMapping_size is increased by bs.
     // Upon deletion it will be resized to zero again.
     GenerateAndAddVector<TEST_DATA_T>(index, dim, 0);
     ASSERT_EQ(bf_index->idToLabelMapping.size(), bs);
+    ASSERT_EQ(bf_index->idToLabelMapping.capacity(), bs);
     VecSimIndex_DeleteVector(index, 0);
     ASSERT_EQ(bf_index->idToLabelMapping.size(), 0);
+    ASSERT_EQ(bf_index->idToLabelMapping.capacity(), 0);
 
     VecSimIndex_Free(index);
 }
@@ -521,12 +529,12 @@ TYPED_TEST(BruteForceTest, test_bf_info) {
     VecSimIndex *index = this->CreateNewIndex(params);
 
     VecSimIndexInfo info = VecSimIndex_Info(index);
-    ASSERT_EQ(info.algo, VecSimAlgo_BF);
-    ASSERT_EQ(info.bfInfo.dim, d);
-    ASSERT_FALSE(info.bfInfo.isMulti);
+    ASSERT_EQ(info.commonInfo.basicInfo.algo, VecSimAlgo_BF);
+    ASSERT_EQ(info.commonInfo.basicInfo.dim, d);
+    ASSERT_FALSE(info.commonInfo.basicInfo.isMulti);
     // Default args.
-    ASSERT_EQ(info.bfInfo.blockSize, DEFAULT_BLOCK_SIZE);
-    ASSERT_EQ(info.bfInfo.indexSize, 0);
+    ASSERT_EQ(info.commonInfo.basicInfo.blockSize, DEFAULT_BLOCK_SIZE);
+    ASSERT_EQ(info.commonInfo.indexSize, 0);
     VecSimIndex_Free(index);
 
     d = 1280;
@@ -536,12 +544,25 @@ TYPED_TEST(BruteForceTest, test_bf_info) {
     index = this->CreateNewIndex(params);
 
     info = VecSimIndex_Info(index);
-    ASSERT_EQ(info.algo, VecSimAlgo_BF);
-    ASSERT_EQ(info.bfInfo.dim, d);
-    ASSERT_FALSE(info.bfInfo.isMulti);
+    ASSERT_EQ(info.commonInfo.basicInfo.algo, VecSimAlgo_BF);
+    ASSERT_EQ(info.commonInfo.basicInfo.dim, d);
+    ASSERT_FALSE(info.commonInfo.basicInfo.isMulti);
+    ASSERT_FALSE(info.commonInfo.basicInfo.isTiered);
+
     // User args.
-    ASSERT_EQ(info.bfInfo.blockSize, 1);
-    ASSERT_EQ(info.bfInfo.indexSize, 0);
+    ASSERT_EQ(info.commonInfo.basicInfo.blockSize, 1);
+    ASSERT_EQ(info.commonInfo.indexSize, 0);
+
+    // Validate that VecSimIndex_BasicInfo returns the same values for its restricted set of fields.
+    VecSimIndexBasicInfo s_info = VecSimIndex_BasicInfo(index);
+    ASSERT_EQ(info.commonInfo.basicInfo.algo, s_info.algo);
+    ASSERT_EQ(info.commonInfo.basicInfo.dim, s_info.dim);
+    ASSERT_EQ(info.commonInfo.basicInfo.blockSize, s_info.blockSize);
+    ASSERT_EQ(info.commonInfo.basicInfo.type, s_info.type);
+    ASSERT_EQ(info.commonInfo.basicInfo.isMulti, s_info.isMulti);
+    ASSERT_EQ(info.commonInfo.basicInfo.type, s_info.type);
+    ASSERT_EQ(info.commonInfo.basicInfo.isTiered, s_info.isTiered);
+
     VecSimIndex_Free(index);
 }
 
@@ -575,8 +596,8 @@ TYPED_TEST(BruteForceTest, test_dynamic_bf_info_iterator) {
 
     VecSimIndexInfo info = VecSimIndex_Info(index);
     VecSimInfoIterator *infoIter = VecSimIndex_InfoIterator(index);
-    ASSERT_EQ(1, info.bfInfo.blockSize);
-    ASSERT_EQ(0, info.bfInfo.indexSize);
+    ASSERT_EQ(1, info.commonInfo.basicInfo.blockSize);
+    ASSERT_EQ(0, info.commonInfo.indexSize);
     compareFlatIndexInfoToIterator(info, infoIter);
     VecSimInfoIterator_Free(infoIter);
 
@@ -588,7 +609,7 @@ TYPED_TEST(BruteForceTest, test_dynamic_bf_info_iterator) {
     VecSimIndex_AddVector(index, v, 0);
     info = VecSimIndex_Info(index);
     infoIter = VecSimIndex_InfoIterator(index);
-    ASSERT_EQ(1, info.bfInfo.indexSize);
+    ASSERT_EQ(1, info.commonInfo.indexSize);
     compareFlatIndexInfoToIterator(info, infoIter);
     VecSimInfoIterator_Free(infoIter);
 
@@ -596,7 +617,7 @@ TYPED_TEST(BruteForceTest, test_dynamic_bf_info_iterator) {
     VecSimIndex_DeleteVector(index, 0);
     info = VecSimIndex_Info(index);
     infoIter = VecSimIndex_InfoIterator(index);
-    ASSERT_EQ(0, info.bfInfo.indexSize);
+    ASSERT_EQ(0, info.commonInfo.indexSize);
     compareFlatIndexInfoToIterator(info, infoIter);
     VecSimInfoIterator_Free(infoIter);
 
@@ -606,7 +627,7 @@ TYPED_TEST(BruteForceTest, test_dynamic_bf_info_iterator) {
     VecSimQueryResult_Free(res);
     info = VecSimIndex_Info(index);
     infoIter = VecSimIndex_InfoIterator(index);
-    ASSERT_EQ(STANDARD_KNN, info.bfInfo.last_mode);
+    ASSERT_EQ(STANDARD_KNN, info.commonInfo.last_mode);
     compareFlatIndexInfoToIterator(info, infoIter);
     VecSimInfoIterator_Free(infoIter);
 
@@ -614,14 +635,14 @@ TYPED_TEST(BruteForceTest, test_dynamic_bf_info_iterator) {
     VecSimQueryResult_Free(res);
     info = VecSimIndex_Info(index);
     infoIter = VecSimIndex_InfoIterator(index);
-    ASSERT_EQ(RANGE_QUERY, info.bfInfo.last_mode);
+    ASSERT_EQ(RANGE_QUERY, info.commonInfo.last_mode);
     compareFlatIndexInfoToIterator(info, infoIter);
     VecSimInfoIterator_Free(infoIter);
 
     ASSERT_TRUE(VecSimIndex_PreferAdHocSearch(index, 1, 1, true));
     info = VecSimIndex_Info(index);
     infoIter = VecSimIndex_InfoIterator(index);
-    ASSERT_EQ(HYBRID_ADHOC_BF, info.bfInfo.last_mode);
+    ASSERT_EQ(HYBRID_ADHOC_BF, info.commonInfo.last_mode);
     compareFlatIndexInfoToIterator(info, infoIter);
     VecSimInfoIterator_Free(infoIter);
 
@@ -630,7 +651,7 @@ TYPED_TEST(BruteForceTest, test_dynamic_bf_info_iterator) {
     ASSERT_FALSE(VecSimIndex_PreferAdHocSearch(index, 7e3, 1, true));
     info = VecSimIndex_Info(index);
     infoIter = VecSimIndex_InfoIterator(index);
-    ASSERT_EQ(HYBRID_BATCHES, info.bfInfo.last_mode);
+    ASSERT_EQ(HYBRID_BATCHES, info.commonInfo.last_mode);
     compareFlatIndexInfoToIterator(info, infoIter);
     VecSimInfoIterator_Free(infoIter);
 
@@ -639,7 +660,7 @@ TYPED_TEST(BruteForceTest, test_dynamic_bf_info_iterator) {
     ASSERT_TRUE(VecSimIndex_PreferAdHocSearch(index, 1, 1, false));
     info = VecSimIndex_Info(index);
     infoIter = VecSimIndex_InfoIterator(index);
-    ASSERT_EQ(HYBRID_BATCHES_TO_ADHOC_BF, info.bfInfo.last_mode);
+    ASSERT_EQ(HYBRID_BATCHES_TO_ADHOC_BF, info.commonInfo.last_mode);
     compareFlatIndexInfoToIterator(info, infoIter);
     VecSimInfoIterator_Free(infoIter);
 
@@ -659,8 +680,8 @@ TYPED_TEST(BruteForceTest, brute_force_vector_search_test_ip) {
         VecSimIndex *index = this->CreateNewIndex(params);
 
         VecSimIndexInfo info = VecSimIndex_Info(index);
-        ASSERT_EQ(info.algo, VecSimAlgo_BF);
-        ASSERT_EQ(info.bfInfo.blockSize, blocksize);
+        ASSERT_EQ(info.commonInfo.basicInfo.algo, VecSimAlgo_BF);
+        ASSERT_EQ(info.commonInfo.basicInfo.blockSize, blocksize);
 
         for (size_t i = 0; i < n; i++) {
             GenerateAndAddVector<TEST_DATA_T>(index, dim, i, i);
@@ -694,8 +715,8 @@ TYPED_TEST(BruteForceTest, brute_force_vector_search_test_l2) {
         VecSimIndex *index = this->CreateNewIndex(params);
 
         VecSimIndexInfo info = VecSimIndex_Info(index);
-        ASSERT_EQ(info.algo, VecSimAlgo_BF);
-        ASSERT_EQ(info.bfInfo.blockSize, blocksize);
+        ASSERT_EQ(info.commonInfo.basicInfo.algo, VecSimAlgo_BF);
+        ASSERT_EQ(info.commonInfo.basicInfo.blockSize, blocksize);
 
         for (size_t i = 0; i < n; i++) {
             GenerateAndAddVector<TEST_DATA_T>(index, dim, i, i);
@@ -864,16 +885,15 @@ TYPED_TEST(BruteForceTest, brute_force_zero_minimal_capacity) {
 TYPED_TEST(BruteForceTest, brute_force_batch_iterator) {
     size_t dim = 4;
 
-    BFParams params = {
-        .dim = dim, .metric = VecSimMetric_L2, .initialCapacity = 200, .blockSize = 5};
-
-    VecSimIndex *index = this->CreateNewIndex(params);
-
     // run the test twice - for index of size 100, every iteration will run select-based search,
     // as the number of results is 5, which is more than 0.1% of the index size. for index of size
     // 10000, we will run the heap-based search until we return 5000 results, and then switch to
     // select-based search.
     for (size_t n : {100, 10000}) {
+        BFParams params = {
+            .dim = dim, .metric = VecSimMetric_L2, .initialCapacity = n, .blockSize = 5};
+
+        VecSimIndex *index = this->CreateNewIndex(params);
         for (size_t i = 0; i < n; i++) {
             GenerateAndAddVector<TEST_DATA_T>(index, dim, i, i);
         }
@@ -902,23 +922,23 @@ TYPED_TEST(BruteForceTest, brute_force_batch_iterator) {
         }
         ASSERT_EQ(iteration_num, n / n_res);
         VecSimBatchIterator_Free(batchIterator);
+
+        VecSimIndex_Free(index);
     }
-    VecSimIndex_Free(index);
 }
 
 TYPED_TEST(BruteForceTest, brute_force_batch_iterator_non_unique_scores) {
     size_t dim = 4;
 
-    BFParams params = {
-        .dim = dim, .metric = VecSimMetric_L2, .initialCapacity = 200, .blockSize = 5};
-
-    VecSimIndex *index = this->CreateNewIndex(params);
-
     // Run the test twice - for index of size 100, every iteration will run select-based search,
     // as the number of results is 5, which is more than 0.1% of the index size. for index of size
     // 10000, we will run the heap-based search until we return 5000 results, and then switch to
     // select-based search.
     for (size_t n : {100, 10000}) {
+        BFParams params = {
+            .dim = dim, .metric = VecSimMetric_L2, .initialCapacity = n, .blockSize = 5};
+        VecSimIndex *index = this->CreateNewIndex(params);
+
         for (size_t i = 0; i < n; i++) {
             GenerateAndAddVector<TEST_DATA_T>(index, dim, i, i / 10);
         }
@@ -957,8 +977,9 @@ TYPED_TEST(BruteForceTest, brute_force_batch_iterator_non_unique_scores) {
         }
         ASSERT_EQ(iteration_num, n / n_res);
         VecSimBatchIterator_Free(batchIterator);
+
+        VecSimIndex_Free(index);
     }
-    VecSimIndex_Free(index);
 }
 
 TYPED_TEST(BruteForceTest, brute_force_batch_iterator_reset) {
@@ -1238,13 +1259,9 @@ TYPED_TEST(BruteForceTest, preferAdHocOptimization) {
     ASSERT_TRUE(VecSimIndex_PreferAdHocSearch(index, 0, 50, true));
 
     // Corner cases - subset size is greater than index size.
-    try {
-        VecSimIndex_PreferAdHocSearch(index, 1, 50, true);
-        FAIL() << "Expected std::runtime error";
-    } catch (std::runtime_error const &err) {
-        EXPECT_EQ(err.what(),
-                  std::string("internal error: subset size cannot be larger than index size"));
-    }
+    ASSERT_EQ(VecSimIndex_PreferAdHocSearch(index, 42, 50, true),
+              VecSimIndex_PreferAdHocSearch(index, 0, 50, true));
+
     VecSimIndex_Free(index);
 }
 
@@ -1379,7 +1396,8 @@ TYPED_TEST(BruteForceTest, testSizeEstimation) {
 
     estimation = EstimateElementSize(params) * bs;
 
-    actual = GenerateAndAddVector<TEST_DATA_T>(index, dim, 0);
+    GenerateAndAddVector<TEST_DATA_T>(index, dim, 0);
+    actual = index->getAllocationSize() - actual; // get the delta
     ASSERT_GE(estimation * 1.01, actual);
     ASSERT_LE(estimation * 0.99, actual);
 
diff --git a/tests/unit/test_bruteforce_multi.cpp b/tests/unit/test_bruteforce_multi.cpp
index 3b23764dd..1bf0aa4e7 100644
--- a/tests/unit/test_bruteforce_multi.cpp
+++ b/tests/unit/test_bruteforce_multi.cpp
@@ -86,8 +86,8 @@ TYPED_TEST(BruteForceMultiTest, resize_and_align_index) {
     }
 
     VecSimIndexInfo info = VecSimIndex_Info(index);
-    ASSERT_EQ(info.bfInfo.indexSize, n);
-    ASSERT_EQ(info.bfInfo.indexLabelCount, n_labels);
+    ASSERT_EQ(info.commonInfo.indexSize, n);
+    ASSERT_EQ(info.commonInfo.indexLabelCount, n_labels);
     ASSERT_EQ(bf_index->idToLabelMapping.size(), n);
     ASSERT_EQ(bf_index->getVectorBlocks().size(), n / blockSize + 1);
 
@@ -96,8 +96,8 @@ TYPED_TEST(BruteForceMultiTest, resize_and_align_index) {
 
     // This should do nothing
     info = VecSimIndex_Info(index);
-    ASSERT_EQ(info.bfInfo.indexSize, n);
-    ASSERT_EQ(info.bfInfo.indexLabelCount, n_labels);
+    ASSERT_EQ(info.commonInfo.indexSize, n);
+    ASSERT_EQ(info.commonInfo.indexLabelCount, n_labels);
     ASSERT_EQ(bf_index->idToLabelMapping.size(), n);
     ASSERT_EQ(bf_index->getVectorBlocks().size(), n / blockSize + 1);
 
@@ -105,9 +105,9 @@ TYPED_TEST(BruteForceMultiTest, resize_and_align_index) {
     // (to fit a multiplication of block_size).
     GenerateAndAddVector<TEST_DATA_T>(index, dim, 0);
     info = VecSimIndex_Info(index);
-    ASSERT_EQ(info.bfInfo.indexSize, n + 1);
+    ASSERT_EQ(info.commonInfo.indexSize, n + 1);
     // Label count doesn't increase because label 0 already exists
-    ASSERT_EQ(info.bfInfo.indexLabelCount, n_labels);
+    ASSERT_EQ(info.commonInfo.indexLabelCount, n_labels);
     // Check new capacity size, should be blockSize * 2.
     ASSERT_EQ(bf_index->idToLabelMapping.size(), 2 * blockSize);
 
@@ -123,9 +123,9 @@ TYPED_TEST(BruteForceMultiTest, resize_and_align_index) {
     size_t new_n = n + 1 + add_vectors_count;
     info = VecSimIndex_Info(index);
 
-    ASSERT_EQ(info.bfInfo.indexSize, new_n);
+    ASSERT_EQ(info.commonInfo.indexSize, new_n);
     // Label count doesn't increase because label 0 already exists
-    ASSERT_EQ(info.bfInfo.indexLabelCount, n_labels);
+    ASSERT_EQ(info.commonInfo.indexLabelCount, n_labels);
     size_t total_vectors = 0;
     for (auto label_ids : bf_index->labelToIdsLookup) {
         total_vectors += label_ids.second.size();
@@ -196,7 +196,7 @@ TYPED_TEST(BruteForceMultiTest, search_more_than_there_is) {
         GenerateAndAddVector<TEST_DATA_T>(index, dim, i / perLabel, i);
     }
     ASSERT_EQ(VecSimIndex_IndexSize(index), n);
-    ASSERT_EQ(VecSimIndex_Info(index).bfInfo.indexLabelCount, n_labels);
+    ASSERT_EQ(VecSimIndex_Info(index).commonInfo.indexLabelCount, n_labels);
 
     TEST_DATA_T query[] = {0, 0, 0, 0};
     VecSimQueryResult_List res = VecSimIndex_TopKQuery(index, query, k, nullptr, BY_SCORE);
@@ -288,7 +288,7 @@ TYPED_TEST(BruteForceMultiTest, find_better_score) {
         }
     }
     ASSERT_EQ(VecSimIndex_IndexSize(index), n);
-    ASSERT_EQ(VecSimIndex_Info(index).bfInfo.indexLabelCount, n_labels);
+    ASSERT_EQ(VecSimIndex_Info(index).commonInfo.indexLabelCount, n_labels);
 
     auto verify_res = [&](size_t id, double score, size_t index) {
         ASSERT_EQ(id, k - index - 1);
@@ -316,7 +316,7 @@ TYPED_TEST(BruteForceMultiTest, find_better_score_after_pop) {
         GenerateAndAddVector<TEST_DATA_T>(index, dim, i % n_labels, n - i);
     }
     ASSERT_EQ(VecSimIndex_IndexSize(index), n);
-    ASSERT_EQ(VecSimIndex_Info(index).bfInfo.indexLabelCount, n_labels);
+    ASSERT_EQ(VecSimIndex_Info(index).commonInfo.indexLabelCount, n_labels);
 
     TEST_DATA_T query[] = {0, 0, 0, 0};
     auto verify_res = [&](size_t id, double score, size_t index) {
@@ -487,12 +487,12 @@ TYPED_TEST(BruteForceMultiTest, test_bf_info) {
     VecSimIndex *index = this->CreateNewIndex(params);
 
     VecSimIndexInfo info = VecSimIndex_Info(index);
-    ASSERT_EQ(info.algo, VecSimAlgo_BF);
-    ASSERT_EQ(info.bfInfo.dim, d);
-    ASSERT_TRUE(info.bfInfo.isMulti);
+    ASSERT_EQ(info.commonInfo.basicInfo.algo, VecSimAlgo_BF);
+    ASSERT_EQ(info.commonInfo.basicInfo.dim, d);
+    ASSERT_TRUE(info.commonInfo.basicInfo.isMulti);
     // Default args.
-    ASSERT_EQ(info.bfInfo.blockSize, DEFAULT_BLOCK_SIZE);
-    ASSERT_EQ(info.bfInfo.indexSize, 0);
+    ASSERT_EQ(info.commonInfo.basicInfo.blockSize, DEFAULT_BLOCK_SIZE);
+    ASSERT_EQ(info.commonInfo.indexSize, 0);
     VecSimIndex_Free(index);
 
     d = 1280;
@@ -501,12 +501,12 @@ TYPED_TEST(BruteForceMultiTest, test_bf_info) {
     index = this->CreateNewIndex(params);
 
     info = VecSimIndex_Info(index);
-    ASSERT_EQ(info.algo, VecSimAlgo_BF);
-    ASSERT_EQ(info.bfInfo.dim, d);
-    ASSERT_TRUE(info.bfInfo.isMulti);
+    ASSERT_EQ(info.commonInfo.basicInfo.algo, VecSimAlgo_BF);
+    ASSERT_EQ(info.commonInfo.basicInfo.dim, d);
+    ASSERT_TRUE(info.commonInfo.basicInfo.isMulti);
     // User args.
-    ASSERT_EQ(info.bfInfo.blockSize, 1);
-    ASSERT_EQ(info.bfInfo.indexSize, 0);
+    ASSERT_EQ(info.commonInfo.basicInfo.blockSize, 1);
+    ASSERT_EQ(info.commonInfo.indexSize, 0);
     VecSimIndex_Free(index);
 }
 
@@ -539,8 +539,8 @@ TYPED_TEST(BruteForceMultiTest, test_dynamic_bf_info_iterator) {
 
     VecSimIndexInfo info = VecSimIndex_Info(index);
     VecSimInfoIterator *infoIter = VecSimIndex_InfoIterator(index);
-    ASSERT_EQ(1, info.bfInfo.blockSize);
-    ASSERT_EQ(0, info.bfInfo.indexSize);
+    ASSERT_EQ(1, info.commonInfo.basicInfo.blockSize);
+    ASSERT_EQ(0, info.commonInfo.indexSize);
     compareFlatIndexInfoToIterator(info, infoIter);
     VecSimInfoIterator_Free(infoIter);
 
@@ -556,8 +556,8 @@ TYPED_TEST(BruteForceMultiTest, test_dynamic_bf_info_iterator) {
     VecSimIndex_AddVector(index, v, 1);
     info = VecSimIndex_Info(index);
     infoIter = VecSimIndex_InfoIterator(index);
-    ASSERT_EQ(4, info.bfInfo.indexSize);
-    ASSERT_EQ(2, info.bfInfo.indexLabelCount);
+    ASSERT_EQ(4, info.commonInfo.indexSize);
+    ASSERT_EQ(2, info.commonInfo.indexLabelCount);
     compareFlatIndexInfoToIterator(info, infoIter);
     VecSimInfoIterator_Free(infoIter);
 
@@ -565,8 +565,8 @@ TYPED_TEST(BruteForceMultiTest, test_dynamic_bf_info_iterator) {
     VecSimIndex_DeleteVector(index, 0);
     info = VecSimIndex_Info(index);
     infoIter = VecSimIndex_InfoIterator(index);
-    ASSERT_EQ(2, info.bfInfo.indexSize);
-    ASSERT_EQ(1, info.bfInfo.indexLabelCount);
+    ASSERT_EQ(2, info.commonInfo.indexSize);
+    ASSERT_EQ(1, info.commonInfo.indexLabelCount);
     compareFlatIndexInfoToIterator(info, infoIter);
     VecSimInfoIterator_Free(infoIter);
 
@@ -576,7 +576,7 @@ TYPED_TEST(BruteForceMultiTest, test_dynamic_bf_info_iterator) {
     VecSimQueryResult_Free(res);
     info = VecSimIndex_Info(index);
     infoIter = VecSimIndex_InfoIterator(index);
-    ASSERT_EQ(STANDARD_KNN, info.bfInfo.last_mode);
+    ASSERT_EQ(STANDARD_KNN, info.commonInfo.last_mode);
     compareFlatIndexInfoToIterator(info, infoIter);
     VecSimInfoIterator_Free(infoIter);
 
@@ -584,14 +584,14 @@ TYPED_TEST(BruteForceMultiTest, test_dynamic_bf_info_iterator) {
     VecSimQueryResult_Free(res);
     info = VecSimIndex_Info(index);
     infoIter = VecSimIndex_InfoIterator(index);
-    ASSERT_EQ(RANGE_QUERY, info.bfInfo.last_mode);
+    ASSERT_EQ(RANGE_QUERY, info.commonInfo.last_mode);
     compareFlatIndexInfoToIterator(info, infoIter);
     VecSimInfoIterator_Free(infoIter);
 
     ASSERT_TRUE(VecSimIndex_PreferAdHocSearch(index, 1, 1, true));
     info = VecSimIndex_Info(index);
     infoIter = VecSimIndex_InfoIterator(index);
-    ASSERT_EQ(HYBRID_ADHOC_BF, info.bfInfo.last_mode);
+    ASSERT_EQ(HYBRID_ADHOC_BF, info.commonInfo.last_mode);
     compareFlatIndexInfoToIterator(info, infoIter);
     VecSimInfoIterator_Free(infoIter);
 
@@ -603,7 +603,7 @@ TYPED_TEST(BruteForceMultiTest, test_dynamic_bf_info_iterator) {
     ASSERT_FALSE(VecSimIndex_PreferAdHocSearch(index, 7e3, 1, true));
     info = VecSimIndex_Info(index);
     infoIter = VecSimIndex_InfoIterator(index);
-    ASSERT_EQ(HYBRID_BATCHES, info.bfInfo.last_mode);
+    ASSERT_EQ(HYBRID_BATCHES, info.commonInfo.last_mode);
     compareFlatIndexInfoToIterator(info, infoIter);
     VecSimInfoIterator_Free(infoIter);
 
@@ -612,7 +612,7 @@ TYPED_TEST(BruteForceMultiTest, test_dynamic_bf_info_iterator) {
     ASSERT_TRUE(VecSimIndex_PreferAdHocSearch(index, 1, 1, false));
     info = VecSimIndex_Info(index);
     infoIter = VecSimIndex_InfoIterator(index);
-    ASSERT_EQ(HYBRID_BATCHES_TO_ADHOC_BF, info.bfInfo.last_mode);
+    ASSERT_EQ(HYBRID_BATCHES_TO_ADHOC_BF, info.commonInfo.last_mode);
     compareFlatIndexInfoToIterator(info, infoIter);
     VecSimInfoIterator_Free(infoIter);
 
@@ -633,8 +633,8 @@ TYPED_TEST(BruteForceMultiTest, vector_search_test_l2) {
         VecSimIndex *index = this->CreateNewIndex(params);
 
         VecSimIndexInfo info = VecSimIndex_Info(index);
-        ASSERT_EQ(info.algo, VecSimAlgo_BF);
-        ASSERT_EQ(info.bfInfo.blockSize, blocksize);
+        ASSERT_EQ(info.commonInfo.basicInfo.algo, VecSimAlgo_BF);
+        ASSERT_EQ(info.commonInfo.basicInfo.blockSize, blocksize);
 
         for (size_t i = 0; i < n; i++) {
             GenerateAndAddVector<TEST_DATA_T>(index, dim, i, i);
@@ -710,6 +710,50 @@ TYPED_TEST(BruteForceMultiTest, search_empty_index) {
     VecSimIndex_Free(index);
 }
 
+TYPED_TEST(BruteForceMultiTest, removeVectorWithSwaps) {
+    size_t dim = 4;
+    size_t n = 6;
+
+    BFParams params = {.dim = dim, .metric = VecSimMetric_L2};
+    auto *index = this->CastToBF_Multi(this->CreateNewIndex(params));
+
+    // Insert 3 vectors under each of two different labels, so that we will have:
+    // {first_label->[0,1,3], second_label->[2,4,5]}
+    labelType first_label = 1;
+    labelType second_label = 2;
+
+    GenerateAndAddVector<TEST_DATA_T>(index, dim, first_label);
+    GenerateAndAddVector<TEST_DATA_T>(index, dim, first_label);
+    GenerateAndAddVector<TEST_DATA_T>(index, dim, second_label);
+    GenerateAndAddVector<TEST_DATA_T>(index, dim, first_label);
+    GenerateAndAddVector<TEST_DATA_T>(index, dim, second_label);
+    GenerateAndAddVector<TEST_DATA_T>(index, dim, second_label);
+    ASSERT_EQ(VecSimIndex_IndexSize(index), n);
+
+    // Artificially reorder the internal ids to test that we make the right changes
+    // when we have an id that appears twice in the array upon deleting the ids one by one.
+    ASSERT_EQ(index->labelToIdsLookup.at(second_label).size(), n / 2);
+    index->labelToIdsLookup.at(second_label)[0] = 4;
+    index->labelToIdsLookup.at(second_label)[1] = 2;
+    index->labelToIdsLookup.at(second_label)[2] = 5;
+
+    // Expect that the ids array of the second label will behave as follows:
+    // [|4, 2, 5] -> [4, |2, 4] -> [4, 2, |2] (where | marks the current position).
+    ASSERT_EQ(index->deleteVector(second_label), n / 2);
+    ASSERT_EQ(VecSimIndex_IndexSize(index), n / 2);
+
+    // Check that the internal ids of the first label are as expected.
+    auto ids = index->labelToIdsLookup.at(first_label);
+    ASSERT_EQ(ids.size(), n / 2);
+    ASSERT_TRUE(std::find(ids.begin(), ids.end(), 0) != ids.end());
+    ASSERT_TRUE(std::find(ids.begin(), ids.end(), 1) != ids.end());
+    ASSERT_TRUE(std::find(ids.begin(), ids.end(), 2) != ids.end());
+    ASSERT_EQ(index->deleteVector(first_label), n / 2);
+    ASSERT_EQ(VecSimIndex_IndexSize(index), 0);
+
+    VecSimIndex_Free(index);
+}
+
 TYPED_TEST(BruteForceMultiTest, remove_vector_after_replacing_block) {
     size_t dim = 4;
     size_t bs = 2;
@@ -747,7 +791,7 @@ TYPED_TEST(BruteForceMultiTest, remove_vector_after_replacing_block) {
     VecSimIndex_DeleteVector(index, 3);
 
     ASSERT_EQ(VecSimIndex_IndexSize(index), 3);
-    ASSERT_EQ(VecSimIndex_Info(index).bfInfo.indexLabelCount, 2);
+    ASSERT_EQ(VecSimIndex_Info(index).commonInfo.indexLabelCount, 2);
     auto bf_index = this->CastToBF_Multi(index);
     ASSERT_EQ(bf_index->getVectorLabel(0), 1);
     ASSERT_EQ(bf_index->getVectorLabel(1), 2);
@@ -768,22 +812,20 @@ TYPED_TEST(BruteForceMultiTest, batch_iterator) {
     size_t dim = 4;
     size_t perLabel = 5;
 
-    BFParams params = {
-        .dim = dim, .metric = VecSimMetric_L2, .initialCapacity = 200, .blockSize = 7};
-
-    VecSimIndex *index = this->CreateNewIndex(params);
-
     // run the test twice - for index of size 100, every iteration will run select-based search,
     // as the number of results is 5, which is more than 0.1% of the index size. for index of size
     // 10000, we will run the heap-based search until we return 5000 results, and then switch to
     // select-based search.
     for (size_t m : {100, 10000}) {
         size_t n = m * perLabel;
+        BFParams params = {
+            .dim = dim, .metric = VecSimMetric_L2, .initialCapacity = n, .blockSize = 7};
+        VecSimIndex *index = this->CreateNewIndex(params);
         for (size_t i = 0; i < n; i++) {
             GenerateAndAddVector<TEST_DATA_T>(index, dim, i / perLabel, i);
         }
         ASSERT_EQ(VecSimIndex_IndexSize(index), n);
-        ASSERT_EQ(VecSimIndex_Info(index).bfInfo.indexLabelCount, m);
+        ASSERT_EQ(VecSimIndex_Info(index).commonInfo.indexLabelCount, m);
 
         // Query for (n,n,...,n) vector (recall that n is the largest id in te index).
         TEST_DATA_T query[dim];
@@ -808,30 +850,27 @@ TYPED_TEST(BruteForceMultiTest, batch_iterator) {
         ASSERT_EQ(iteration_num, m / n_res);
         VecSimBatchIterator_Free(batchIterator);
         ASSERT_EQ(VecSimIndex_IndexSize(index), n);
-        ASSERT_EQ(VecSimIndex_Info(index).bfInfo.indexLabelCount, m);
+        ASSERT_EQ(VecSimIndex_Info(index).commonInfo.indexLabelCount, m);
+
         // Cleanup before next round.
-        for (size_t i = 0; i < m; i++) {
-            VecSimIndex_DeleteVector(index, i);
-        }
+        VecSimIndex_Free(index);
     }
-    VecSimIndex_Free(index);
 }
 
 TYPED_TEST(BruteForceMultiTest, brute_force_batch_iterator_non_unique_scores) {
     size_t dim = 4;
     size_t perLabel = 5;
 
-    BFParams params = {
-        .dim = dim, .metric = VecSimMetric_L2, .initialCapacity = 200, .blockSize = 5};
-
-    VecSimIndex *index = this->CreateNewIndex(params);
-
     // Run the test twice - for index of size 100, every iteration will run select-based search,
-    // as the number of results is 5, which is more than 0.1% of the index size. for index of size
+    // as the number of results is 5, which is more than 0.1% of the index size. For index of size
     // 10000, we will run the heap-based search until we return 5000 results, and then switch to
     // select-based search.
     for (size_t m : {100, 10000}) {
         size_t n = m * perLabel;
+        BFParams params = {
+            .dim = dim, .metric = VecSimMetric_L2, .initialCapacity = n, .blockSize = m / 2};
+
+        VecSimIndex *index = this->CreateNewIndex(params);
         for (size_t i = 0; i < n; i++) {
             GenerateAndAddVector<TEST_DATA_T>(index, dim, i / perLabel, i / (10 * perLabel));
         }
@@ -871,14 +910,13 @@ TYPED_TEST(BruteForceMultiTest, brute_force_batch_iterator_non_unique_scores) {
         ASSERT_EQ(iteration_num, m / n_res);
         VecSimBatchIterator_Free(batchIterator);
         ASSERT_EQ(VecSimIndex_IndexSize(index), n);
-        ASSERT_EQ(VecSimIndex_Info(index).bfInfo.indexLabelCount, m);
+        ASSERT_EQ(VecSimIndex_Info(index).commonInfo.indexLabelCount, m);
+
         // Cleanup before next round.
-        for (size_t i = 0; i < m; i++) {
-            VecSimIndex_DeleteVector(index, i);
-        }
+        VecSimIndex_Free(index);
     }
-    VecSimIndex_Free(index);
 }
+
 TYPED_TEST(BruteForceMultiTest, batch_iterator_validate_scores) {
     size_t dim = 4;
     size_t perLabel = 10;
@@ -922,7 +960,7 @@ TYPED_TEST(BruteForceMultiTest, batch_iterator_validate_scores) {
     ASSERT_EQ(iteration_num, n_labels / n_res);
     VecSimBatchIterator_Free(batchIterator);
     ASSERT_EQ(VecSimIndex_IndexSize(index), n_labels * perLabel);
-    ASSERT_EQ(VecSimIndex_Info(index).bfInfo.indexLabelCount, n_labels);
+    ASSERT_EQ(VecSimIndex_Info(index).commonInfo.indexLabelCount, n_labels);
 
     VecSimIndex_Free(index);
 }
@@ -1082,7 +1120,8 @@ TYPED_TEST(BruteForceMultiTest, testSizeEstimation) {
     ASSERT_EQ(estimation, actual);
 
     estimation = EstimateElementSize(params) * bs;
-    actual = GenerateAndAddVector<TEST_DATA_T>(index, dim, 0);
+    GenerateAndAddVector<TEST_DATA_T>(index, dim, 0);
+    actual = index->getAllocationSize() - actual; // get the delta
 
     ASSERT_GE(estimation * 1.01, actual);
     ASSERT_LE(estimation * 0.99, actual);
diff --git a/tests/unit/test_common.cpp b/tests/unit/test_common.cpp
index 231d1beb7..d81ce84c9 100644
--- a/tests/unit/test_common.cpp
+++ b/tests/unit/test_common.cpp
@@ -14,12 +14,13 @@
 #include "VecSim/utils/serializer.h"
 #include "VecSim/utils/vecsim_results_container.h"
 #include "VecSim/algorithms/hnsw/hnsw.h"
-#include "VecSim/algorithms/hnsw/hnsw_factory.h"
+#include "VecSim/index_factories/hnsw_factory.h"
 
 #include <cstdlib>
 #include <limits>
 #include <cmath>
 #include <random>
+#include <cstdarg>
 
 template <typename index_type_t>
 class CommonIndexTest : public ::testing::Test {};
@@ -204,25 +205,29 @@ TYPED_TEST(UtilsTests, Max_Updatable_Heap) {
     ASSERT_TRUE(heap.empty());
 
     // Inserting data with the same priority
-    heap.emplace(priorities[SECOND], 1);
-    heap.emplace(priorities[FIRST], 55);
-    heap.emplace(priorities[SECOND], 3);
     heap.emplace(priorities[SECOND], 2);
+    heap.emplace(priorities[FIRST], 1);
+    heap.emplace(priorities[SECOND], 4);
+    heap.emplace(priorities[SECOND], 3);
 
     ASSERT_EQ(heap.size(), 4);
     ASSERT_FALSE(heap.empty());
-    p = {priorities[FIRST], 55};
+    p = {priorities[FIRST], 1};
     ASSERT_TRUE(heap.top() == p);
 
-    heap.emplace(priorities[THIRD], 55); // Update priority
+    heap.emplace(priorities[THIRD], 1); // Update priority
 
     ASSERT_EQ(heap.size(), 4); // Same size after update
     ASSERT_FALSE(heap.empty());
 
     // Make sure each pop deletes a single element, even if some have the same priority.
+    // Also, make sure the elements are popped in the correct order (highest priority first, and on
+    // a tie - the element with the highest value).
     size_t len = heap.size();
     for (size_t i = len; i > 0; i--) {
         ASSERT_EQ(heap.size(), i);
+        ASSERT_EQ(heap.top().second, i);
+        ASSERT_EQ(heap.top().first, i == 1 ? priorities[THIRD] : priorities[SECOND]);
         ASSERT_FALSE(heap.empty());
         heap.pop();
     }
@@ -411,3 +416,36 @@ TEST_F(SerializerTest, HNSWSerialzer) {
     ASSERT_EXCEPTION_MESSAGE(HNSWFactory::NewIndex(this->file_name), std::runtime_error,
                              "Cannot load index: bad algorithm type");
 }
+
+struct logCtx {
+public:
+    std::vector<std::string> logBuffer;
+    std::string prefix;
+};
+
+void test_log_impl(void *ctx, const char *message) {
+    logCtx *log = (logCtx *)ctx;
+    std::string msg = log->prefix + message;
+    log->logBuffer.push_back(msg);
+}
+
+TEST(CommonAPITest, testlog) {
+
+    logCtx log;
+    log.prefix = "test log prefix: ";
+
+    BFParams bfParams = {.dim = 1, .metric = VecSimMetric_L2, .initialCapacity = 0, .blockSize = 5};
+    VecSimParams params = {.algo = VecSimAlgo_BF, .bfParams = bfParams, .logCtx = &log};
+    auto *index =
+        dynamic_cast<BruteForceIndex<float, float> *>(BruteForceFactory::NewIndex(&params));
+    VecSim_SetLogCallbackFunction(test_log_impl);
+
+    index->log("test log message no fmt");
+    index->log("test log message %s %s", "with", "args");
+
+    ASSERT_EQ(log.logBuffer.size(), 2);
+    ASSERT_EQ(log.logBuffer[0], "test log prefix: test log message no fmt");
+    ASSERT_EQ(log.logBuffer[1], "test log prefix: test log message with args");
+
+    VecSimIndex_Free(index);
+}
diff --git a/tests/unit/test_hnsw.cpp b/tests/unit/test_hnsw.cpp
index 69cd38c34..e7925dbab 100644
--- a/tests/unit/test_hnsw.cpp
+++ b/tests/unit/test_hnsw.cpp
@@ -7,11 +7,10 @@
 #include "gtest/gtest.h"
 #include "VecSim/vec_sim.h"
 #include "VecSim/algorithms/hnsw/hnsw_single.h"
-#include "VecSim/algorithms/hnsw/hnsw_factory.h"
+#include "VecSim/index_factories/hnsw_factory.h"
 #include "test_utils.h"
 #include "VecSim/utils/serializer.h"
 #include "VecSim/query_result_struct.h"
-#include <climits>
 #include <unistd.h>
 #include <random>
 #include <thread>
@@ -505,16 +504,16 @@ TYPED_TEST(HNSWTest, test_hnsw_info) {
     VecSimIndex *index = this->CreateNewIndex(params);
 
     VecSimIndexInfo info = VecSimIndex_Info(index);
-    ASSERT_EQ(info.algo, VecSimAlgo_HNSWLIB);
-    ASSERT_EQ(info.hnswInfo.dim, d);
+    ASSERT_EQ(info.commonInfo.basicInfo.algo, VecSimAlgo_HNSWLIB);
+    ASSERT_EQ(info.commonInfo.basicInfo.dim, d);
     // Default args.
-    ASSERT_FALSE(info.hnswInfo.isMulti);
-    ASSERT_EQ(info.hnswInfo.blockSize, DEFAULT_BLOCK_SIZE);
+    ASSERT_FALSE(info.commonInfo.basicInfo.isMulti);
+    ASSERT_EQ(info.commonInfo.basicInfo.blockSize, DEFAULT_BLOCK_SIZE);
     ASSERT_EQ(info.hnswInfo.M, HNSW_DEFAULT_M);
     ASSERT_EQ(info.hnswInfo.efConstruction, HNSW_DEFAULT_EF_C);
     ASSERT_EQ(info.hnswInfo.efRuntime, HNSW_DEFAULT_EF_RT);
     ASSERT_DOUBLE_EQ(info.hnswInfo.epsilon, HNSW_DEFAULT_EPSILON);
-    ASSERT_EQ(info.hnswInfo.type, params.type);
+    ASSERT_EQ(info.commonInfo.basicInfo.type, params.type);
     VecSimIndex_Free(index);
 
     d = 1280;
@@ -525,16 +524,28 @@ TYPED_TEST(HNSWTest, test_hnsw_info) {
 
     index = this->CreateNewIndex(params);
     info = VecSimIndex_Info(index);
-    ASSERT_EQ(info.algo, VecSimAlgo_HNSWLIB);
-    ASSERT_EQ(info.hnswInfo.dim, d);
+    ASSERT_EQ(info.commonInfo.basicInfo.algo, VecSimAlgo_HNSWLIB);
+    ASSERT_EQ(info.commonInfo.basicInfo.dim, d);
     // User args.
-    ASSERT_FALSE(info.hnswInfo.isMulti);
-    ASSERT_EQ(info.hnswInfo.blockSize, bs);
+    ASSERT_FALSE(info.commonInfo.basicInfo.isMulti);
+    ASSERT_EQ(info.commonInfo.basicInfo.blockSize, bs);
     ASSERT_EQ(info.hnswInfo.efConstruction, 1000);
     ASSERT_EQ(info.hnswInfo.M, 200);
     ASSERT_EQ(info.hnswInfo.efRuntime, 500);
     ASSERT_EQ(info.hnswInfo.epsilon, 0.005);
-    ASSERT_EQ(info.hnswInfo.type, params.type);
+    ASSERT_EQ(info.commonInfo.basicInfo.type, params.type);
+    ASSERT_FALSE(info.commonInfo.basicInfo.isTiered);
+
+    // Validate that the basic (static) info API returns the same restricted info as well.
+    VecSimIndexBasicInfo s_info = VecSimIndex_BasicInfo(index);
+    ASSERT_EQ(info.commonInfo.basicInfo.algo, s_info.algo);
+    ASSERT_EQ(info.commonInfo.basicInfo.dim, s_info.dim);
+    ASSERT_EQ(info.commonInfo.basicInfo.blockSize, s_info.blockSize);
+    ASSERT_EQ(info.commonInfo.basicInfo.type, s_info.type);
+    ASSERT_EQ(info.commonInfo.basicInfo.isMulti, s_info.isMulti);
+    ASSERT_EQ(info.commonInfo.basicInfo.type, s_info.type);
+    ASSERT_EQ(info.commonInfo.basicInfo.isTiered, s_info.isTiered);
+
     VecSimIndex_Free(index);
 }
 
@@ -578,10 +589,10 @@ TYPED_TEST(HNSWTest, test_dynamic_hnsw_info_iterator) {
     ASSERT_EQ(250, info.hnswInfo.efConstruction);
     ASSERT_EQ(400, info.hnswInfo.efRuntime);
     ASSERT_EQ(0.004, info.hnswInfo.epsilon);
-    ASSERT_EQ(0, info.hnswInfo.indexSize);
+    ASSERT_EQ(0, info.commonInfo.indexSize);
     ASSERT_EQ(-1, info.hnswInfo.max_level);
     ASSERT_EQ(-1, info.hnswInfo.entrypoint);
-    ASSERT_EQ(params.type, info.hnswInfo.type);
+    ASSERT_EQ(params.type, info.commonInfo.basicInfo.type);
     compareHNSWIndexInfoToIterator(info, infoIter);
     VecSimInfoIterator_Free(infoIter);
 
@@ -593,7 +604,7 @@ TYPED_TEST(HNSWTest, test_dynamic_hnsw_info_iterator) {
     VecSimIndex_AddVector(index, v, 1);
     info = VecSimIndex_Info(index);
     infoIter = VecSimIndex_InfoIterator(index);
-    ASSERT_EQ(1, info.hnswInfo.indexSize);
+    ASSERT_EQ(1, info.commonInfo.indexSize);
     ASSERT_EQ(1, info.hnswInfo.entrypoint);
     ASSERT_GE(1, info.hnswInfo.max_level);
     compareHNSWIndexInfoToIterator(info, infoIter);
@@ -603,7 +614,7 @@ TYPED_TEST(HNSWTest, test_dynamic_hnsw_info_iterator) {
     VecSimIndex_DeleteVector(index, 1);
     info = VecSimIndex_Info(index);
     infoIter = VecSimIndex_InfoIterator(index);
-    ASSERT_EQ(0, info.bfInfo.indexSize);
+    ASSERT_EQ(0, info.commonInfo.indexSize);
     compareHNSWIndexInfoToIterator(info, infoIter);
     VecSimInfoIterator_Free(infoIter);
 
@@ -613,7 +624,7 @@ TYPED_TEST(HNSWTest, test_dynamic_hnsw_info_iterator) {
     VecSimQueryResult_Free(res);
     info = VecSimIndex_Info(index);
     infoIter = VecSimIndex_InfoIterator(index);
-    ASSERT_EQ(STANDARD_KNN, info.hnswInfo.last_mode);
+    ASSERT_EQ(STANDARD_KNN, info.commonInfo.last_mode);
     compareHNSWIndexInfoToIterator(info, infoIter);
     VecSimInfoIterator_Free(infoIter);
 
@@ -621,14 +632,14 @@ TYPED_TEST(HNSWTest, test_dynamic_hnsw_info_iterator) {
     VecSimQueryResult_Free(res);
     info = VecSimIndex_Info(index);
     infoIter = VecSimIndex_InfoIterator(index);
-    ASSERT_EQ(RANGE_QUERY, info.hnswInfo.last_mode);
+    ASSERT_EQ(RANGE_QUERY, info.commonInfo.last_mode);
     compareHNSWIndexInfoToIterator(info, infoIter);
     VecSimInfoIterator_Free(infoIter);
 
     ASSERT_TRUE(VecSimIndex_PreferAdHocSearch(index, 1, 1, true));
     info = VecSimIndex_Info(index);
     infoIter = VecSimIndex_InfoIterator(index);
-    ASSERT_EQ(HYBRID_ADHOC_BF, info.hnswInfo.last_mode);
+    ASSERT_EQ(HYBRID_ADHOC_BF, info.commonInfo.last_mode);
     compareHNSWIndexInfoToIterator(info, infoIter);
     VecSimInfoIterator_Free(infoIter);
 
@@ -642,7 +653,7 @@ TYPED_TEST(HNSWTest, test_dynamic_hnsw_info_iterator) {
     ASSERT_FALSE(VecSimIndex_PreferAdHocSearch(index, 10, 1, true));
     info = VecSimIndex_Info(index);
     infoIter = VecSimIndex_InfoIterator(index);
-    ASSERT_EQ(HYBRID_BATCHES, info.hnswInfo.last_mode);
+    ASSERT_EQ(HYBRID_BATCHES, info.commonInfo.last_mode);
     compareHNSWIndexInfoToIterator(info, infoIter);
     VecSimInfoIterator_Free(infoIter);
 
@@ -651,7 +662,7 @@ TYPED_TEST(HNSWTest, test_dynamic_hnsw_info_iterator) {
     ASSERT_TRUE(VecSimIndex_PreferAdHocSearch(index, 1, 10, false));
     info = VecSimIndex_Info(index);
     infoIter = VecSimIndex_InfoIterator(index);
-    ASSERT_EQ(HYBRID_BATCHES_TO_ADHOC_BF, info.hnswInfo.last_mode);
+    ASSERT_EQ(HYBRID_BATCHES_TO_ADHOC_BF, info.commonInfo.last_mode);
     compareHNSWIndexInfoToIterator(info, infoIter);
     VecSimInfoIterator_Free(infoIter);
 
@@ -923,7 +934,7 @@ TYPED_TEST(HNSWTest, hnsw_delete_entry_point) {
 
     VecSimIndexInfo info = VecSimIndex_Info(index);
 
-    while (info.hnswInfo.indexSize > 0) {
+    while (info.commonInfo.indexSize > 0) {
         ASSERT_NO_THROW(VecSimIndex_DeleteVector(index, info.hnswInfo.entrypoint));
         info = VecSimIndex_Info(index);
     }
@@ -953,11 +964,6 @@ TYPED_TEST(HNSWTest, hnsw_override) {
     }
     ASSERT_EQ(VecSimIndex_IndexSize(index), n);
 
-    // Try to override when overriding is not allowed.
-    TEST_DATA_T vec[dim];
-    GenerateVector<TEST_DATA_T>(vec, dim, n);
-    ASSERT_EQ(this->CastToHNSW_Single(index)->addVector(vec, 0, false), -1);
-
     // Insert again 300 vectors, the first 100 will be overwritten (deleted first).
     n = 300;
     for (size_t i = 0; i < n; i++) {
@@ -1121,9 +1127,9 @@ TYPED_TEST(HNSWTest, hnsw_batch_iterator_batch_size_1) {
 
 TYPED_TEST(HNSWTest, hnsw_batch_iterator_advanced) {
     size_t dim = 4;
-    size_t n = 1000;
+    size_t n = 500;
     size_t M = 8;
-    size_t ef = 1000;
+    size_t ef = n;
 
     HNSWParams params = {.dim = dim,
                          .metric = VecSimMetric_L2,
@@ -1184,8 +1190,11 @@ TYPED_TEST(HNSWTest, hnsw_batch_iterator_advanced) {
         if (iteration_num <= n / n_res) {
             runBatchIteratorSearchTest(batchIterator, n_res, verify_res, BY_ID);
         } else {
-            // In the last iteration there are n%iteration_num (=6) results left to return.
-            expected_ids.erase(expected_ids.begin()); // remove the first id
+            // In the last iteration there are only n % n_res results left to return.
+            // Remove the first ids, which won't be returned because we'd exceed the index size.
+            for (size_t i = 0; i < n_res - n % n_res; i++) {
+                expected_ids.erase(expected_ids.begin());
+            }
             runBatchIteratorSearchTest(batchIterator, n_res, verify_res, BY_ID, n % n_res);
         }
     }
@@ -1509,13 +1518,9 @@ TYPED_TEST(HNSWTest, preferAdHocOptimization) {
     ASSERT_TRUE(VecSimIndex_PreferAdHocSearch(index, 0, 50, true));
 
     // Corner cases - subset size is greater than index size.
-    try {
-        VecSimIndex_PreferAdHocSearch(index, 1, 50, true);
-        FAIL() << "Expected std::runtime error";
-    } catch (std::runtime_error const &err) {
-        EXPECT_EQ(err.what(),
-                  std::string("internal error: subset size cannot be larger than index size"));
-    }
+    ASSERT_EQ(VecSimIndex_PreferAdHocSearch(index, 42, 50, true),
+              VecSimIndex_PreferAdHocSearch(index, 0, 50, true));
+
     VecSimIndex_Free(index);
 }
 
@@ -1573,7 +1578,7 @@ TYPED_TEST(HNSWTest, testCosine) {
 
 TYPED_TEST(HNSWTest, testSizeEstimation) {
     size_t dim = 128;
-    size_t n = 1000;
+    size_t n = DEFAULT_BLOCK_SIZE;
     size_t bs = DEFAULT_BLOCK_SIZE;
     size_t M = 32;
 
@@ -1594,19 +1599,23 @@ TYPED_TEST(HNSWTest, testSizeEstimation) {
 
     ASSERT_EQ(estimation, actual);
 
-    for (size_t i = 0; i < n; i++) {
+    for (size_t i = 0; i < bs; i++) {
         GenerateAndAddVector<TEST_DATA_T>(index, dim, i, i);
     }
 
     // Estimate the memory delta of adding a full new block.
-    estimation = EstimateElementSize(params) * (bs % n + bs);
+    estimation = EstimateElementSize(params) * bs;
 
-    actual = 0;
+    // Note that we are adding vectors with ascending values. This causes the number of
+    // unidirectional edges (incoming edges), which are not taken into account in
+    // EstimateElementSize, to be zero.
+    actual = index->getAllocationSize();
     for (size_t i = 0; i < bs; i++) {
-        actual += GenerateAndAddVector<TEST_DATA_T>(index, dim, n + i, i);
+        GenerateAndAddVector<TEST_DATA_T>(index, dim, bs + i, bs + i);
     }
-    ASSERT_GE(estimation * 1.01, actual);
-    ASSERT_LE(estimation * 0.99, actual);
+    actual = index->getAllocationSize() - actual;
+    ASSERT_GE(estimation * 1.02, actual);
+    ASSERT_LE(estimation * 0.98, actual);
 
     VecSimIndex_Free(index);
 }
@@ -1636,6 +1645,135 @@ TYPED_TEST(HNSWTest, testInitialSizeEstimation_No_InitialCapacity) {
     VecSimIndex_Free(index);
 }
 
+TYPED_TEST(HNSWTest, testIncomingEdgesSize) {
+    size_t dim = 4;
+    size_t n = DEFAULT_BLOCK_SIZE;
+    size_t bs = DEFAULT_BLOCK_SIZE;
+    size_t efC = n;
+
+    // In this test we add identical vectors.
+    // Expected results:
+    // In level 0: each node has room for 2 * M neighbors. As we insert identical vectors, due to
+    // the secondary sorting by id in the top candidates heap, the M top candidate nodes for each
+    // new node will be the nodes whose ids are the maximal. These nodes will be mutually connected
+    // to the new node, in addition to their existing neighbors (there will be no unidirectional
+    // edges). For higher levels (if the max level of the new node is higher than 0): for each
+    // level, if there are already M + 1 nodes on this level, the new node will be connected to the
+    // last M existing nodes. As the capacity of the existing nodes is full,
+    // these M last nodes will have to remove one of their neighbors, so one
+    // unidirectional edge for each node (total of M) will be added. Eventually, we expect that
+    // each vector at this level will have M incoming edges, except the first M vectors and the
+    // last M vectors. The M vectors whose ids are minimal at some level should end up with
+    // 0, 1, ..., M-1 incoming edges respectively. The last M vectors will behave in a similar way
+    // (the vector with maximum id at a level should have 0 incoming edges, the one whose id is the
+    // second last should have 1, etc.).
+
+    for (size_t M : {2, 4, 16, 32}) {
+        HNSWParams params = {.dim = dim,
+                             .metric = VecSimMetric_L2,
+                             .initialCapacity = n,
+                             .blockSize = bs,
+                             .M = M,
+                             .efConstruction = efC};
+        VecSimIndex *index = this->CreateNewIndex(params);
+
+        auto hnsw_index = this->CastToHNSW(index);
+        size_t inc_edges0 = 0;
+
+        // Calculate expected allocations overhead after adding n vectors
+        size_t allocations_overhead = VecSimAllocator::getAllocationOverheadSize();
+
+        // Metadata per node at higher levels (level 0 metadata was already allocated at index
+        // construction).
+        size_t size_links_higher_level = sizeof(linkListSize) + M * sizeof(idType) + sizeof(void *);
+
+        size_t size_label_lookup_node = getLabelsLookupNodeSize();
+
+        size_t initial_memory = index->getAllocationSize();
+        size_t metadata_overhead_estimation = 0;
+
+        // Count the number of nodes at each level of the graph.
+        std::vector<size_t> nodes_per_level_hist(100, 0);
+        for (size_t i = 0; i < n; i++) {
+
+            GenerateAndAddVector<TEST_DATA_T>(index, dim, i);
+
+            size_t elem_level = hnsw_index->element_levels_[i];
+
+            size_t high_levels_memory =
+                elem_level ? size_links_higher_level * elem_level + allocations_overhead : 0;
+            metadata_overhead_estimation += size_label_lookup_node + high_levels_memory;
+            for (size_t j = 0; j <= elem_level; j++) {
+                nodes_per_level_hist[j] += 1;
+            }
+        }
+
+        size_t incoming_edges_total_count = 0;
+        std::vector<size_t> incoming_per_level_hist(hnsw_index->max_level_ + 1, 0);
+
+        size_t incoming_edges_memory_overhead = 0;
+        for (size_t level = 0; level <= hnsw_index->max_level_; level++) {
+            size_t curr_visited_at_level_hist = 0;
+            for (size_t id = 0; id < n; id++) {
+                if (hnsw_index->element_levels_[id] >= level) {
+                    // we expect to generate a new incoming edges vector for each new node at each
+                    // level.
+                    incoming_edges_memory_overhead +=
+                        sizeof(vecsim_stl::vector<idType>) + allocations_overhead;
+                    curr_visited_at_level_hist += 1;
+                    size_t curr_idx_at_level = curr_visited_at_level_hist - 1;
+                    // The index of the vector at the current level counting backwards.
+                    size_t curr_reverse_idx_at_level =
+                        nodes_per_level_hist[level] - curr_visited_at_level_hist;
+                    auto incoming_edges = hnsw_index->getIncomingEdgesPtr(id, level);
+                    incoming_edges->shrink_to_fit();
+                    size_t incoming_edges_count = incoming_edges->size();
+
+                    // At level 0, or when there are at most M + 1 nodes at this level, or for the
+                    // first and last node of the level, no incoming edges are expected.
+                    if (level == 0 || nodes_per_level_hist[level] <= M + 1 ||
+                        curr_idx_at_level == 0 || curr_reverse_idx_at_level == 0) {
+                        ASSERT_EQ(incoming_edges_count, 0);
+                        continue;
+                    }
+                    if (curr_idx_at_level < M) { // this is one of the first M nodes
+                        ASSERT_EQ(incoming_edges_count, curr_idx_at_level);
+                    } else if (curr_reverse_idx_at_level < M) { // this is one of the last M nodes
+                        ASSERT_EQ(incoming_edges_count, curr_reverse_idx_at_level);
+                    } else {
+                        ASSERT_EQ(incoming_edges_count, M);
+                    }
+                    incoming_edges_total_count += incoming_edges_count;
+                    incoming_per_level_hist[level] += incoming_edges_count;
+                    // The first insertion to the incoming edges vector causes another allocation.
+                    incoming_edges_memory_overhead +=
+                        incoming_edges_count * sizeof(idType) + allocations_overhead;
+                }
+            }
+            // each node (except the first M and the last M) should have the same number of incoming
+            // edges.
+            if (level == 0 || nodes_per_level_hist[level] <= 2 * M) {
+                continue;
+            }
+            // Sum the number of incoming edges of the first or last M nodes
+            // (0 + 1 + ... + M - 1)
+            size_t edge_nodes_incoming_edges_sum = (M * (M - 1)) / 2;
+            size_t expected_incoming_edges =
+                (nodes_per_level_hist[level] - 2 * M) * M + edge_nodes_incoming_edges_sum * 2;
+            ASSERT_EQ(incoming_per_level_hist[level], expected_incoming_edges);
+        }
+
+        size_t total_estimation = metadata_overhead_estimation + incoming_edges_memory_overhead;
+        size_t add_vectors_memory_delta = index->getAllocationSize() - initial_memory;
+
+        ASSERT_EQ(total_estimation, add_vectors_memory_delta);
+        ASSERT_EQ(hnsw_index->checkIntegrity().unidirectional_connections,
+                  incoming_edges_total_count);
+
+        VecSimIndex_Free(index);
+    }
+}
+
 TYPED_TEST(HNSWTest, testTimeoutReturn) {
     size_t dim = 4;
     VecSimQueryResult_List rl;
@@ -1904,15 +2042,15 @@ TYPED_TEST(HNSWTest, HNSWSerialization_v2) {
 
         // Fetch info after saving, as memory size change during saving.
         VecSimIndexInfo info = VecSimIndex_Info(index);
-        ASSERT_EQ(info.algo, VecSimAlgo_HNSWLIB);
+        ASSERT_EQ(info.commonInfo.basicInfo.algo, VecSimAlgo_HNSWLIB);
         ASSERT_EQ(info.hnswInfo.M, M);
         ASSERT_EQ(info.hnswInfo.efConstruction, ef);
         ASSERT_EQ(info.hnswInfo.efRuntime, ef);
-        ASSERT_EQ(info.hnswInfo.indexSize, n);
-        ASSERT_EQ(info.hnswInfo.metric, VecSimMetric_L2);
-        ASSERT_EQ(info.hnswInfo.type, TypeParam::get_index_type());
-        ASSERT_EQ(info.hnswInfo.dim, dim);
-        ASSERT_EQ(info.hnswInfo.indexLabelCount, n_labels[i]);
+        ASSERT_EQ(info.commonInfo.indexSize, n);
+        ASSERT_EQ(info.commonInfo.basicInfo.metric, VecSimMetric_L2);
+        ASSERT_EQ(info.commonInfo.basicInfo.type, TypeParam::get_index_type());
+        ASSERT_EQ(info.commonInfo.basicInfo.dim, dim);
+        ASSERT_EQ(info.commonInfo.indexLabelCount, n_labels[i]);
 
         VecSimIndex_Free(index);
 
@@ -1924,17 +2062,17 @@ TYPED_TEST(HNSWTest, HNSWSerialization_v2) {
         ASSERT_TRUE(serialized_hnsw_index->checkIntegrity().valid_state);
 
         VecSimIndexInfo info2 = VecSimIndex_Info(serialized_index);
-        ASSERT_EQ(info2.algo, VecSimAlgo_HNSWLIB);
+        ASSERT_EQ(info2.commonInfo.basicInfo.algo, VecSimAlgo_HNSWLIB);
         ASSERT_EQ(info2.hnswInfo.M, M);
-        ASSERT_EQ(info2.hnswInfo.isMulti, is_multi[i]);
-        ASSERT_EQ(info2.hnswInfo.blockSize, blockSize);
+        ASSERT_EQ(info2.commonInfo.basicInfo.isMulti, is_multi[i]);
+        ASSERT_EQ(info2.commonInfo.basicInfo.blockSize, blockSize);
         ASSERT_EQ(info2.hnswInfo.efConstruction, ef);
         ASSERT_EQ(info2.hnswInfo.efRuntime, ef);
-        ASSERT_EQ(info2.hnswInfo.indexSize, n);
-        ASSERT_EQ(info2.hnswInfo.metric, VecSimMetric_L2);
-        ASSERT_EQ(info2.hnswInfo.type, TypeParam::get_index_type());
-        ASSERT_EQ(info2.hnswInfo.dim, dim);
-        ASSERT_EQ(info2.hnswInfo.indexLabelCount, n_labels[i]);
+        ASSERT_EQ(info2.commonInfo.indexSize, n);
+        ASSERT_EQ(info2.commonInfo.basicInfo.metric, VecSimMetric_L2);
+        ASSERT_EQ(info2.commonInfo.basicInfo.type, TypeParam::get_index_type());
+        ASSERT_EQ(info2.commonInfo.basicInfo.dim, dim);
+        ASSERT_EQ(info2.commonInfo.indexLabelCount, n_labels[i]);
         ASSERT_EQ(info2.hnswInfo.epsilon, epsilon);
 
         // Check the functionality of the loaded index.
@@ -1995,15 +2133,15 @@ TYPED_TEST(HNSWTest, LoadHNSWSerialized_v1) {
         ASSERT_TRUE(serialized_hnsw_index->checkIntegrity().valid_state);
 
         VecSimIndexInfo info2 = VecSimIndex_Info(serialized_index);
-        ASSERT_EQ(info2.algo, VecSimAlgo_HNSWLIB);
+        ASSERT_EQ(info2.commonInfo.basicInfo.algo, VecSimAlgo_HNSWLIB);
         // Check that M is taken from file and not from @params.
         ASSERT_EQ(info2.hnswInfo.M, M_serialized);
         ASSERT_NE(info2.hnswInfo.M, M_param);
 
-        ASSERT_EQ(info2.hnswInfo.isMulti, is_multi[i]);
+        ASSERT_EQ(info2.commonInfo.basicInfo.isMulti, is_multi[i]);
 
         // Check it was initalized with the default blockSize value.
-        ASSERT_EQ(info2.hnswInfo.blockSize, DEFAULT_BLOCK_SIZE);
+        ASSERT_EQ(info2.commonInfo.basicInfo.blockSize, DEFAULT_BLOCK_SIZE);
 
         // Check that ef is taken from file and not from @params.
         ASSERT_EQ(info2.hnswInfo.efConstruction, ef_serialized);
@@ -2011,11 +2149,11 @@ TYPED_TEST(HNSWTest, LoadHNSWSerialized_v1) {
         ASSERT_NE(info2.hnswInfo.efRuntime, ef_param);
         ASSERT_NE(info2.hnswInfo.efConstruction, ef_param);
 
-        ASSERT_EQ(info2.hnswInfo.indexSize, n);
-        ASSERT_EQ(info2.hnswInfo.metric, VecSimMetric_L2);
-        ASSERT_EQ(info2.hnswInfo.type, TypeParam::get_index_type());
-        ASSERT_EQ(info2.hnswInfo.dim, dim);
-        ASSERT_EQ(info2.hnswInfo.indexLabelCount, n_labels[i]);
+        ASSERT_EQ(info2.commonInfo.indexSize, n);
+        ASSERT_EQ(info2.commonInfo.basicInfo.metric, VecSimMetric_L2);
+        ASSERT_EQ(info2.commonInfo.basicInfo.type, TypeParam::get_index_type());
+        ASSERT_EQ(info2.commonInfo.basicInfo.dim, dim);
+        ASSERT_EQ(info2.commonInfo.indexLabelCount, n_labels[i]);
         // Check it was initalized with the default epsilon value.
         ASSERT_EQ(info2.hnswInfo.epsilon, HNSW_DEFAULT_EPSILON);
 
@@ -2090,8 +2228,8 @@ TYPED_TEST(HNSWTest, markDelete) {
     // Add a new vector, make sure it has no link to a deleted vector
     GenerateAndAddVector<TEST_DATA_T>(index, dim, n, n);
     for (size_t level = 0; level <= this->CastToHNSW(index)->element_levels_[n]; level++) {
-        idType *neighbors = this->CastToHNSW(index)->get_linklist_at_level(n, level);
-        linkListSize size = this->CastToHNSW(index)->getListCount(neighbors);
+        idType *neighbors = this->CastToHNSW(index)->getNodeNeighborsAtLevel(n, level);
+        linkListSize size = this->CastToHNSW(index)->getNodeNeighborsCount(neighbors);
         for (size_t idx = 0; idx < size; idx++) {
             ASSERT_TRUE(neighbors[idx] % 2 != ep_reminder)
                 << "Got a link to " << neighbors[idx] << " on level " << level;
@@ -2162,173 +2300,49 @@ TYPED_TEST(HNSWTest, allMarkedDeletedLevel) {
     VecSimIndex_Free(index);
 }
 
-TYPED_TEST(HNSWTest, parallelSearchKnn) {
-    size_t n = 1000;
-    size_t k = 11;
-    size_t dim = 4;
+TYPED_TEST(HNSWTest, repairNodeConnectionsBasic) {
+    size_t dim = 8;
+    size_t n = dim;
+    size_t M = 8;
 
-    HNSWParams params = {.dim = dim,
-                         .metric = VecSimMetric_L2,
-                         .initialCapacity = n,
-                         .M = 16,
-                         .efConstruction = 200};
+    HNSWParams params = {.dim = dim, .metric = VecSimMetric_L2, .M = M};
     VecSimIndex *index = this->CreateNewIndex(params);
+    auto *hnsw_index = this->CastToHNSW(index);
 
+    // Add 8 vectors, expect to get a full graph in level 0 (all node pairs are connected).
+    TEST_DATA_T vec[] = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
     for (size_t i = 0; i < n; i++) {
-        GenerateAndAddVector<TEST_DATA_T>(index, dim, i, i);
-    }
-    ASSERT_EQ(VecSimIndex_IndexSize(index), n);
-
-    std::atomic_int successful_searches(0);
-    // Run parallel searches where every searching thread expects to get different labels as results
-    // (determined by the thread id), which are labels in the range [50+myID-5, 50+myID+5].
-    auto parallel_search = [&](int myID) {
-        TEST_DATA_T query_val = 50 + myID;
-        TEST_DATA_T query[dim];
-        GenerateVector<TEST_DATA_T>(query, dim, query_val);
-        auto verify_res = [&](size_t id, double score, size_t res_index) {
-            // We expect to get the results with increasing order of the distance between the res
-            // label and the query val (query_val, query_val-1, query_val+1, query_val-2,
-            // query_val+2, ...) The score is the L2 distance between the vectors that correspond
-            // the ids.
-            size_t diff_id = (id > query_val) ? (id - query_val) : (query_val - id);
-            ASSERT_EQ(diff_id, (res_index + 1) / 2);
-            ASSERT_EQ(score, (dim * (diff_id * diff_id)));
-        };
-        runTopKSearchTest(index, query, k, verify_res);
-        successful_searches++;
-    };
-
-    size_t memory_before = index->info().hnswInfo.memory;
-    size_t n_threads = 16;
-    std::thread thread_objs[n_threads];
-    for (size_t i = 0; i < n_threads; i++) {
-        thread_objs[i] = std::thread(parallel_search, i);
-    }
-    for (size_t i = 0; i < n_threads; i++) {
-        thread_objs[i].join();
+        vec[i] = 1.0;
+        VecSimIndex_AddVector(index, vec, i);
+        vec[i] = 0.0;
     }
-    ASSERT_EQ(successful_searches, n_threads);
-    // Make sure that we properly update the allocator atomically during the searches. The expected
-    // Memory delta should only be the visited nodes handler added to the pool. Note that the
-    // initial pool size is 1, so we subtract 1 from the current pool size to get the delta.
-    size_t expected_memory = memory_before + (index->info().hnswInfo.visitedNodesPoolSize - 1) *
-                                                 (sizeof(VisitedNodesHandler) + sizeof(tag_t) * n +
-                                                  2 * sizeof(size_t) + sizeof(void *));
-    ASSERT_EQ(expected_memory, index->info().hnswInfo.memory);
-
-    VecSimIndex_Free(index);
-}
-
-TYPED_TEST(HNSWTest, parallelSearchCombined) {
-    size_t n = 1000;
-    size_t k = 11;
-    size_t dim = 4;
-
-    HNSWParams params = {.dim = dim,
-                         .metric = VecSimMetric_L2,
-                         .initialCapacity = n,
-                         .M = 16,
-                         .efConstruction = 200};
-    VecSimIndex *index = this->CreateNewIndex(params);
-
     for (size_t i = 0; i < n; i++) {
-        GenerateAndAddVector<TEST_DATA_T>(index, dim, i, i);
+        ASSERT_EQ(hnsw_index->getNodeNeighborsCount(hnsw_index->getNodeNeighborsAtLevel(i, 0)),
+                  n - 1);
     }
-    ASSERT_EQ(VecSimIndex_IndexSize(index), n);
-
-    std::atomic_int successful_searches(0);
-
-    /* Run parallel searches of three kinds: KNN, range, and batched search. */
-
-    // In knn, we expect to get different labels as results (determined by the thread id), which are
-    // labels in the range [50+myID-5, 50+myID+5].
-    auto parallel_knn_search = [&](int myID) {
-        TEST_DATA_T query_val = 50 + myID;
-        TEST_DATA_T query[dim];
-        GenerateVector<TEST_DATA_T>(query, dim, query_val);
-        auto verify_res = [&](size_t id, double score, size_t res_index) {
-            // We expect to get the results with increasing order of the distance between the res
-            // label and the query val (query_val, query_val-1, query_val+1, query_val-2,
-            // query_val+2, ...) The score is the L2 distance between the vectors that correspond
-            // the ids.
-            size_t diff_id = std::abs(id - query_val);
-            ASSERT_EQ(diff_id, (res_index + 1) / 2);
-            ASSERT_EQ(score, (dim * (diff_id * diff_id)));
-        };
-        runTopKSearchTest(index, query, k, verify_res);
-        successful_searches++;
-    };
-
-    auto parallel_range_search = [&](int myID) {
-        TEST_DATA_T pivot_id = 100 + myID;
-        TEST_DATA_T query[dim];
-        GenerateVector<TEST_DATA_T>(query, dim, pivot_id);
-        auto verify_res_by_score = [&](size_t id, double score, size_t res_index) {
-            size_t diff_id = std::abs(id - pivot_id);
-            ASSERT_EQ(diff_id, (res_index + 1) / 2);
-            ASSERT_EQ(score, dim * (diff_id * diff_id));
-        };
-        uint expected_num_results = 11;
-        // To get 11 results in the range [pivot_id-5, pivot_id+5], set the radius as the L2 score
-        // in the boundaries.
-        double radius = (double)dim * pow((double)expected_num_results / 2, 2);
-        runRangeQueryTest(index, query, radius, verify_res_by_score, expected_num_results,
-                          BY_SCORE);
-        successful_searches++;
-    };
 
-    auto parallel_batched_search = [&](int myID) {
-        TEST_DATA_T query[dim];
-        GenerateVector<TEST_DATA_T>(query, dim, n);
+    // Mark element 0 as deleted, and repair all of its neighbors.
+    hnsw_index->markDelete(0);
+    ASSERT_EQ(this->CastToHNSW(index)->checkIntegrity().connections_to_repair, n - 1);
+    for (size_t i = 1; i < n; i++) {
+        hnsw_index->repairNodeConnections(i, 0);
+        // After the repair, expect it to have all nodes except for element 0 as neighbors.
+        ASSERT_EQ(hnsw_index->getNodeNeighborsCount(hnsw_index->getNodeNeighborsAtLevel(i, 0)),
+                  n - 2);
+    }
 
-        VecSimBatchIterator *batchIterator = VecSimBatchIterator_New(index, query, nullptr);
-        size_t iteration_num = 0;
+    // Mark elements 1 and 2 as deleted.
+    hnsw_index->markDelete(1);
+    hnsw_index->markDelete(2);
+    for (size_t i = 3; i < n; i++) {
+        hnsw_index->repairNodeConnections(i, 0);
+        // After the repair, expect it to have all nodes except for elements 0-2 as neighbors.
+        ASSERT_EQ(hnsw_index->getNodeNeighborsCount(hnsw_index->getNodeNeighborsAtLevel(i, 0)),
+                  n - 4);
+    }
 
-        // Get the 5 vectors whose ids are the maximal among those that hasn't been returned yet
-        // in every iteration. The results order should be sorted by their score (distance from the
-        // query vector), which means sorted from the largest id to the lowest.
-        // Run different number of iterations for every thread id.
-        size_t total_iterations = myID;
-        size_t n_res = 5;
-        while (VecSimBatchIterator_HasNext(batchIterator) && iteration_num < total_iterations) {
-            std::vector<size_t> expected_ids(n_res);
-            for (size_t i = 0; i < n_res; i++) {
-                expected_ids[i] = (n - iteration_num * n_res - i - 1);
-            }
-            auto verify_res = [&](size_t id, double score, size_t res_index) {
-                ASSERT_TRUE(expected_ids[res_index] == id);
-            };
-            runBatchIteratorSearchTest(batchIterator, n_res, verify_res);
-            iteration_num++;
-        }
-        ASSERT_EQ(iteration_num, total_iterations);
-        VecSimBatchIterator_Free(batchIterator);
-        successful_searches++;
-    };
+    // For completeness, we also check index integrity.
+    ASSERT_TRUE(this->CastToHNSW(index)->checkIntegrity().valid_state);
 
-    size_t n_threads = 15;
-    std::thread thread_objs[n_threads];
-    size_t memory_before = index->info().hnswInfo.memory;
-    for (size_t i = 0; i < n_threads; i++) {
-        if (i % 3 == 0) {
-            thread_objs[i] = std::thread(parallel_knn_search, i);
-        } else if (i % 3 == 1) {
-            thread_objs[i] = std::thread(parallel_range_search, i);
-        } else {
-            thread_objs[i] = std::thread(parallel_batched_search, i);
-        }
-    }
-    for (size_t i = 0; i < n_threads; i++) {
-        thread_objs[i].join();
-    }
-    ASSERT_EQ(successful_searches, n_threads);
-    // Make sure that we properly update the allocator atomically during the searches.
-    // Memory delta should only be the visited nodes handler added to the pool. Note that the
-    // initial pool size is 1, so we subtract 1 from the current pool size to get the delta.
-    size_t expected_memory = memory_before + (index->info().hnswInfo.visitedNodesPoolSize - 1) *
-                                                 (sizeof(VisitedNodesHandler) + sizeof(tag_t) * n +
-                                                  2 * sizeof(size_t) + sizeof(void *));
-    ASSERT_EQ(expected_memory, index->info().hnswInfo.memory);
     VecSimIndex_Free(index);
 }
diff --git a/tests/unit/test_hnsw_multi.cpp b/tests/unit/test_hnsw_multi.cpp
index bcc6138d0..eb87e4f24 100644
--- a/tests/unit/test_hnsw_multi.cpp
+++ b/tests/unit/test_hnsw_multi.cpp
@@ -11,8 +11,6 @@
 #include "VecSim/algorithms/hnsw/hnsw_multi.h"
 #include <cmath>
 #include <map>
-#include <thread>
-#include <atomic>
 
 template <typename index_type_t>
 class HNSWMultiTest : public ::testing::Test {
@@ -357,11 +355,11 @@ TYPED_TEST(HNSWMultiTest, test_hnsw_info) {
     VecSimIndex *index = this->CreateNewIndex(params);
 
     VecSimIndexInfo info = VecSimIndex_Info(index);
-    ASSERT_EQ(info.algo, VecSimAlgo_HNSWLIB);
-    ASSERT_EQ(info.hnswInfo.dim, d);
-    ASSERT_TRUE(info.hnswInfo.isMulti);
+    ASSERT_EQ(info.commonInfo.basicInfo.algo, VecSimAlgo_HNSWLIB);
+    ASSERT_EQ(info.commonInfo.basicInfo.dim, d);
+    ASSERT_TRUE(info.commonInfo.basicInfo.isMulti);
     // Default args.
-    ASSERT_EQ(info.hnswInfo.blockSize, DEFAULT_BLOCK_SIZE);
+    ASSERT_EQ(info.commonInfo.basicInfo.blockSize, DEFAULT_BLOCK_SIZE);
     ASSERT_EQ(info.hnswInfo.M, HNSW_DEFAULT_M);
     ASSERT_EQ(info.hnswInfo.efConstruction, HNSW_DEFAULT_EF_C);
     ASSERT_EQ(info.hnswInfo.efRuntime, HNSW_DEFAULT_EF_RT);
@@ -383,11 +381,11 @@ TYPED_TEST(HNSWMultiTest, test_hnsw_info) {
 
     index = this->CreateNewIndex(params);
     info = VecSimIndex_Info(index);
-    ASSERT_EQ(info.algo, VecSimAlgo_HNSWLIB);
-    ASSERT_EQ(info.hnswInfo.dim, d);
-    ASSERT_TRUE(info.hnswInfo.isMulti);
+    ASSERT_EQ(info.commonInfo.basicInfo.algo, VecSimAlgo_HNSWLIB);
+    ASSERT_EQ(info.commonInfo.basicInfo.dim, d);
+    ASSERT_TRUE(info.commonInfo.basicInfo.isMulti);
     // User args.
-    ASSERT_EQ(info.hnswInfo.blockSize, bs);
+    ASSERT_EQ(info.commonInfo.basicInfo.blockSize, bs);
     ASSERT_EQ(info.hnswInfo.M, M);
     ASSERT_EQ(info.hnswInfo.efConstruction, ef_C);
     ASSERT_EQ(info.hnswInfo.efRuntime, ef_RT);
@@ -433,7 +431,7 @@ TYPED_TEST(HNSWMultiTest, test_dynamic_hnsw_info_iterator) {
     ASSERT_EQ(250, info.hnswInfo.efConstruction);
     ASSERT_EQ(400, info.hnswInfo.efRuntime);
     ASSERT_EQ(0.004, info.hnswInfo.epsilon);
-    ASSERT_EQ(0, info.hnswInfo.indexSize);
+    ASSERT_EQ(0, info.commonInfo.indexSize);
     ASSERT_EQ(-1, info.hnswInfo.max_level);
     ASSERT_EQ(-1, info.hnswInfo.entrypoint);
     compareHNSWIndexInfoToIterator(info, infoIter);
@@ -450,8 +448,8 @@ TYPED_TEST(HNSWMultiTest, test_dynamic_hnsw_info_iterator) {
     VecSimIndex_AddVector(index, v, 1);
     info = VecSimIndex_Info(index);
     infoIter = VecSimIndex_InfoIterator(index);
-    ASSERT_EQ(4, info.hnswInfo.indexSize);
-    ASSERT_EQ(2, info.hnswInfo.indexLabelCount);
+    ASSERT_EQ(4, info.commonInfo.indexSize);
+    ASSERT_EQ(2, info.commonInfo.indexLabelCount);
     ASSERT_GE(1, info.hnswInfo.max_level);
     ASSERT_EQ(0, info.hnswInfo.entrypoint);
     compareHNSWIndexInfoToIterator(info, infoIter);
@@ -461,8 +459,8 @@ TYPED_TEST(HNSWMultiTest, test_dynamic_hnsw_info_iterator) {
     VecSimIndex_DeleteVector(index, 0);
     info = VecSimIndex_Info(index);
     infoIter = VecSimIndex_InfoIterator(index);
-    ASSERT_EQ(2, info.hnswInfo.indexSize);
-    ASSERT_EQ(1, info.hnswInfo.indexLabelCount);
+    ASSERT_EQ(2, info.commonInfo.indexSize);
+    ASSERT_EQ(1, info.commonInfo.indexLabelCount);
     compareHNSWIndexInfoToIterator(info, infoIter);
     VecSimInfoIterator_Free(infoIter);
 
@@ -472,7 +470,7 @@ TYPED_TEST(HNSWMultiTest, test_dynamic_hnsw_info_iterator) {
     VecSimQueryResult_Free(res);
     info = VecSimIndex_Info(index);
     infoIter = VecSimIndex_InfoIterator(index);
-    ASSERT_EQ(STANDARD_KNN, info.hnswInfo.last_mode);
+    ASSERT_EQ(STANDARD_KNN, info.commonInfo.last_mode);
     compareHNSWIndexInfoToIterator(info, infoIter);
     VecSimInfoIterator_Free(infoIter);
 
@@ -480,14 +478,14 @@ TYPED_TEST(HNSWMultiTest, test_dynamic_hnsw_info_iterator) {
     VecSimQueryResult_Free(res);
     info = VecSimIndex_Info(index);
     infoIter = VecSimIndex_InfoIterator(index);
-    ASSERT_EQ(RANGE_QUERY, info.hnswInfo.last_mode);
+    ASSERT_EQ(RANGE_QUERY, info.commonInfo.last_mode);
     compareHNSWIndexInfoToIterator(info, infoIter);
     VecSimInfoIterator_Free(infoIter);
 
     ASSERT_TRUE(VecSimIndex_PreferAdHocSearch(index, 1, 1, true));
     info = VecSimIndex_Info(index);
     infoIter = VecSimIndex_InfoIterator(index);
-    ASSERT_EQ(HYBRID_ADHOC_BF, info.hnswInfo.last_mode);
+    ASSERT_EQ(HYBRID_ADHOC_BF, info.commonInfo.last_mode);
     compareHNSWIndexInfoToIterator(info, infoIter);
     VecSimInfoIterator_Free(infoIter);
 
@@ -501,7 +499,7 @@ TYPED_TEST(HNSWMultiTest, test_dynamic_hnsw_info_iterator) {
     ASSERT_FALSE(VecSimIndex_PreferAdHocSearch(index, 10, 1, true));
     info = VecSimIndex_Info(index);
     infoIter = VecSimIndex_InfoIterator(index);
-    ASSERT_EQ(HYBRID_BATCHES, info.hnswInfo.last_mode);
+    ASSERT_EQ(HYBRID_BATCHES, info.commonInfo.last_mode);
     compareHNSWIndexInfoToIterator(info, infoIter);
     VecSimInfoIterator_Free(infoIter);
 
@@ -510,7 +508,7 @@ TYPED_TEST(HNSWMultiTest, test_dynamic_hnsw_info_iterator) {
     ASSERT_TRUE(VecSimIndex_PreferAdHocSearch(index, 1, 10, false));
     info = VecSimIndex_Info(index);
     infoIter = VecSimIndex_InfoIterator(index);
-    ASSERT_EQ(HYBRID_BATCHES_TO_ADHOC_BF, info.hnswInfo.last_mode);
+    ASSERT_EQ(HYBRID_BATCHES_TO_ADHOC_BF, info.commonInfo.last_mode);
     compareHNSWIndexInfoToIterator(info, infoIter);
     VecSimInfoIterator_Free(infoIter);
 
@@ -629,13 +627,9 @@ TYPED_TEST(HNSWMultiTest, preferAdHocOptimization) {
     ASSERT_TRUE(VecSimIndex_PreferAdHocSearch(index, 0, 50, true));
 
     // Corner cases - subset size is greater than index size.
-    try {
-        VecSimIndex_PreferAdHocSearch(index, 1, 50, true);
-        FAIL() << "Expected std::runtime error";
-    } catch (std::runtime_error const &err) {
-        EXPECT_EQ(err.what(),
-                  std::string("internal error: subset size cannot be larger than index size"));
-    }
+    ASSERT_EQ(VecSimIndex_PreferAdHocSearch(index, 42, 50, true),
+              VecSimIndex_PreferAdHocSearch(index, 0, 50, true));
+
     VecSimIndex_Free(index);
 }
 TYPED_TEST(HNSWMultiTest, search_empty_index) {
@@ -686,6 +680,52 @@ TYPED_TEST(HNSWMultiTest, search_empty_index) {
     VecSimIndex_Free(index);
 }
 
+TYPED_TEST(HNSWMultiTest, removeVectorWithSwaps) {
+    size_t dim = 4;
+    size_t n = 6;
+
+    HNSWParams params = {.dim = dim, .metric = VecSimMetric_L2};
+    auto *index = this->CastToHNSW_Multi(this->CreateNewIndex(params));
+
+    // Insert 3 vectors under two different labels, so that we will have:
+    // {first_label->[0,1,3], second_label->[2,4,5]}
+    labelType first_label = 1;
+    labelType second_label = 2;
+
+    GenerateAndAddVector<TEST_DATA_T>(index, dim, first_label);
+    GenerateAndAddVector<TEST_DATA_T>(index, dim, first_label);
+    GenerateAndAddVector<TEST_DATA_T>(index, dim, second_label);
+    GenerateAndAddVector<TEST_DATA_T>(index, dim, first_label);
+    GenerateAndAddVector<TEST_DATA_T>(index, dim, second_label);
+    GenerateAndAddVector<TEST_DATA_T>(index, dim, second_label);
+    ASSERT_EQ(VecSimIndex_IndexSize(index), n);
+
+    // Artificially reorder the internal ids to test that we make the right changes
+    // when we have an id that appears twice in the array upon deleting the ids one by one.
+    ASSERT_EQ(index->label_lookup_.at(second_label).size(), n / 2);
+    index->label_lookup_.at(second_label)[0] = 4;
+    index->label_lookup_.at(second_label)[1] = 2;
+    index->label_lookup_.at(second_label)[2] = 5;
+
+    // Expect that the ids array of the second label will behave as follows:
+    // [|4, 2, 5] -> [4, |2, 4] -> [4, 2, |2] (where | marks the current position).
+    index->deleteVector(second_label);
+    ASSERT_EQ(index->indexLabelCount(), 1);
+    ASSERT_EQ(VecSimIndex_IndexSize(index), n / 2);
+
+    // Check that the internal ids of the first label are as expected.
+    auto ids = index->label_lookup_.at(first_label);
+    ASSERT_EQ(ids.size(), n / 2);
+    ASSERT_TRUE(std::find(ids.begin(), ids.end(), 0) != ids.end());
+    ASSERT_TRUE(std::find(ids.begin(), ids.end(), 1) != ids.end());
+    ASSERT_TRUE(std::find(ids.begin(), ids.end(), 2) != ids.end());
+    index->deleteVector(first_label);
+    ASSERT_EQ(index->indexLabelCount(), 0);
+    ASSERT_EQ(VecSimIndex_IndexSize(index), 0);
+
+    VecSimIndex_Free(index);
+}
+
 TYPED_TEST(HNSWMultiTest, remove_vector_after_replacing_block) {
     size_t dim = 4;
     size_t bs = 2;
@@ -814,7 +854,7 @@ TYPED_TEST(HNSWMultiTest, hnsw_get_distance) {
 
 TYPED_TEST(HNSWMultiTest, testSizeEstimation) {
     size_t dim = 128;
-    size_t n_labels = 1000;
+    size_t n_labels = DEFAULT_BLOCK_SIZE;
     size_t perLabel = 1;
     size_t bs = DEFAULT_BLOCK_SIZE;
     size_t M = 32;
@@ -838,19 +878,23 @@ TYPED_TEST(HNSWMultiTest, testSizeEstimation) {
 
     ASSERT_EQ(estimation, actual);
 
-    for (size_t i = 0; i < n; i++) {
-        GenerateAndAddVector<TEST_DATA_T>(index, dim, i % n_labels, i);
+    for (size_t i = 0; i < bs; i++) {
+        GenerateAndAddVector<TEST_DATA_T>(index, dim, bs, i);
     }
 
     // Estimate the memory delta of adding a full new block.
-    estimation = EstimateElementSize(params) * (bs % n + bs);
+    estimation = EstimateElementSize(params) * (bs);
 
-    actual = 0;
+    // Note that we are adding vectors with ascending values. This causes the number of
+    // double connections, which are not taken into account in EstimateElementSize,
+    // to be zero.
+    actual = index->getAllocationSize();
     for (size_t i = 0; i < bs; i++) {
-        actual += GenerateAndAddVector<TEST_DATA_T>(index, dim, n + i, i);
+        GenerateAndAddVector<TEST_DATA_T>(index, dim, bs + i, bs + i);
     }
-    ASSERT_GE(estimation * 1.01, actual);
-    ASSERT_LE(estimation * 0.99, actual);
+    actual = index->getAllocationSize() - actual;
+    ASSERT_GE(estimation * 1.02, actual);
+    ASSERT_LE(estimation * 0.98, actual);
 
     VecSimIndex_Free(index);
 }
@@ -1199,7 +1243,7 @@ TYPED_TEST(HNSWMultiTest, hnsw_delete_entry_point) {
 
     VecSimIndexInfo info = VecSimIndex_Info(index);
 
-    while (info.hnswInfo.indexSize > 0) {
+    while (info.commonInfo.indexSize > 0) {
         ASSERT_NO_THROW(VecSimIndex_DeleteVector(index, info.hnswInfo.entrypoint));
         info = VecSimIndex_Info(index);
     }
@@ -1362,9 +1406,9 @@ TYPED_TEST(HNSWMultiTest, hnsw_batch_iterator_batch_size_1) {
 TYPED_TEST(HNSWMultiTest, hnsw_batch_iterator_advanced) {
     size_t dim = 4;
     size_t M = 8;
-    size_t ef = 1000;
-    size_t n_labels = 1000;
+    size_t n_labels = 500;
     size_t perLabel = 5;
+    size_t ef = n_labels;
 
     size_t n = n_labels * perLabel;
 
@@ -1428,8 +1472,11 @@ TYPED_TEST(HNSWMultiTest, hnsw_batch_iterator_advanced) {
         if (iteration_num <= n_labels / n_res) {
             runBatchIteratorSearchTest(batchIterator, n_res, verify_res, BY_ID);
         } else {
-            // In the last iteration there are n%iteration_num (=6) results left to return.
-            expected_ids.erase(expected_ids.begin()); // remove the first id
+            // In the last iteration there are n_labels%n_res results left to return.
+            // Remove the first ids that aren't going to be returned since we pass the index size.
+            for (size_t i = 0; i < n_res - n_labels % n_res; i++) {
+                expected_ids.erase(expected_ids.begin());
+            }
             runBatchIteratorSearchTest(batchIterator, n_res, verify_res, BY_ID, n_labels % n_res);
         }
     }
@@ -1442,6 +1489,7 @@ TYPED_TEST(HNSWMultiTest, hnsw_batch_iterator_advanced) {
     VecSimBatchIterator_Free(batchIterator);
     VecSimIndex_Free(index);
 }
+
 TYPED_TEST(HNSWMultiTest, MultiBatchIteratorHeapLogic) {
     size_t n = 4;
     size_t n_labels = 3;
@@ -1704,8 +1752,8 @@ TYPED_TEST(HNSWMultiTest, markDelete) {
     // This value is very close to a deleted vector
     GenerateAndAddVector<TEST_DATA_T>(index, dim, n, n - per_label + 1);
     for (size_t level = 0; level <= this->CastToHNSW_Multi(index)->element_levels_[n]; level++) {
-        idType *neighbors = this->CastToHNSW(index)->get_linklist_at_level(n, level);
-        linkListSize size = this->CastToHNSW(index)->getListCount(neighbors);
+        idType *neighbors = this->CastToHNSW(index)->getNodeNeighborsAtLevel(n, level);
+        linkListSize size = this->CastToHNSW(index)->getNodeNeighborsCount(neighbors);
         for (size_t idx = 0; idx < size; idx++) {
             ASSERT_TRUE((neighbors[idx] / per_label) % 2 != ep_reminder)
                 << "Got a link to " << neighbors[idx] << " on level " << level;
@@ -1759,46 +1807,3 @@ TYPED_TEST(HNSWMultiTest, markDelete) {
     VecSimBatchIterator_Free(batchIterator);
     VecSimIndex_Free(index);
 }
-
-TYPED_TEST(HNSWMultiTest, parallelSearch) {
-    size_t dim = 4;
-    size_t n = 1000;
-    size_t n_labels = 100;
-    size_t k = 11;
-
-    HNSWParams params = {.dim = dim, .metric = VecSimMetric_L2, .initialCapacity = n};
-    VecSimIndex *index = this->CreateNewIndex(params);
-
-    for (size_t i = 0; i < n; i++) {
-        GenerateAndAddVector<TEST_DATA_T>(index, dim, i % n_labels, i);
-    }
-    ASSERT_EQ(VecSimIndex_IndexSize(index), n);
-    ASSERT_EQ(index->indexLabelCount(), n_labels);
-
-    std::atomic_int successful_searches(0);
-    // Run parallel searches where every searching thread expects to get different label as results
-    // (determined by the thread id), which are labels in the range [50+myID-5, 50+myID+5].
-    auto parallel_search = [&](int myID) {
-        TEST_DATA_T query_val = 50 + myID;
-        TEST_DATA_T query[dim];
-        GenerateVector<TEST_DATA_T>(query, dim, query_val);
-        auto verify_res = [&](size_t id, double score, size_t res_index) {
-            size_t diff_id = (id > query_val) ? (id - query_val) : (query_val - id);
-            ASSERT_EQ(diff_id, (res_index + 1) / 2);
-            ASSERT_EQ(score, (dim * ((res_index + 1) / 2) * ((res_index + 1) / 2)));
-        };
-        runTopKSearchTest(index, query, k, verify_res);
-        successful_searches++;
-    };
-
-    size_t n_threads = 16;
-    std::thread thread_objs[n_threads];
-    for (size_t i = 0; i < n_threads; i++) {
-        thread_objs[i] = std::thread(parallel_search, i);
-    }
-    for (size_t i = 0; i < n_threads; i++) {
-        thread_objs[i].join();
-    }
-    ASSERT_EQ(successful_searches, n_threads);
-    VecSimIndex_Free(index);
-}
diff --git a/tests/unit/test_hnsw_parallel.cpp b/tests/unit/test_hnsw_parallel.cpp
new file mode 100644
index 000000000..5ce271659
--- /dev/null
+++ b/tests/unit/test_hnsw_parallel.cpp
@@ -0,0 +1,777 @@
+/*
+ *Copyright Redis Ltd. 2021 - present
+ *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or
+ *the Server Side Public License v1 (SSPLv1).
+ */
+
+#include "gtest/gtest.h"
+#include "VecSim/vec_sim.h"
+#include "VecSim/algorithms/hnsw/hnsw_single.h"
+#include "test_utils.h"
+#include "VecSim/query_result_struct.h"
+#include <unistd.h>
+#include <random>
+#include <thread>
+#include <atomic>
+
+// Helper macro: the closest even number that is <= x. Note that x is evaluated twice.
+#define FLOOR_EVEN(x) ((x) - ((x)&1))
+
+template <typename index_type_t>
+class HNSWTestParallel : public ::testing::Test {
+public:
+    using data_t = typename index_type_t::data_t;
+    using dist_t = typename index_type_t::dist_t;
+
+protected:
+    VecSimIndex *CreateNewIndex(HNSWParams &params, bool is_multi = false) {
+        return test_utils::CreateNewIndex(params, index_type_t::get_index_type(), is_multi);
+    }
+    HNSWIndex<data_t, dist_t> *CastToHNSW(VecSimIndex *index) {
+        return reinterpret_cast<HNSWIndex<data_t, dist_t> *>(index);
+    }
+    HNSWIndex_Single<data_t, dist_t> *CastToHNSW_Single(VecSimIndex *index) {
+        return reinterpret_cast<HNSWIndex_Single<data_t, dist_t> *>(index);
+    }
+
+    /* Helper method for testing repair jobs:
+     * For every node marked as deleted, walk its levels (from 0 up to its top level) and append
+     * a {node id, level} job to jobQ for every node that should be repaired because of it.
+     */
+    void CollectRepairJobs(HNSWIndex<data_t, dist_t> *hnsw_index,
+                           std::vector<pair<idType, size_t>> &jobQ) {
+        size_t n = hnsw_index->indexSize();
+        for (idType element_id = 0; element_id < n; element_id++) {
+            if (!hnsw_index->isMarkedDeleted(element_id)) {
+                continue;
+            }
+            size_t element_top_level = hnsw_index->element_levels_[element_id];
+
+            for (size_t level = 0; level <= element_top_level; level++) {
+                idType *node_neighbours = hnsw_index->getNodeNeighborsAtLevel(element_id, level);
+                auto neighbours_count = hnsw_index->getNodeNeighborsCount(node_neighbours);
+
+                // Go over the neighbours of the deleted element in this level.
+                for (size_t i = 0; i < neighbours_count; i++) {
+                    idType cur_neighbor = node_neighbours[i];
+                    auto *neighbour_neighbours =
+                        hnsw_index->getNodeNeighborsAtLevel(cur_neighbor, level);
+                    auto neighbor_neighbours_count =
+                        hnsw_index->getNodeNeighborsCount(neighbour_neighbours);
+                    for (size_t j = 0; j < neighbor_neighbours_count; j++) {
+                        // If the edge is bidirectional, queue a repair job for this neighbor.
+                        if (neighbour_neighbours[j] == element_id) {
+                            jobQ.emplace_back(cur_neighbor, level);
+                            break;
+                        }
+                    }
+                }
+                // Next, go over the rest of incoming edges (the ones that are not bidirectional)
+                // and queue a repair job for each of them.
+                auto *incoming_edges = hnsw_index->getIncomingEdgesPtr(element_id, level);
+                for (auto incoming_edge : *incoming_edges) {
+                    jobQ.emplace_back(incoming_edge, level);
+                }
+            }
+        }
+    }
+};
+
+// DataTypeSet, TEST_DATA_T and TEST_DIST_T are defined in test_utils.h
+
+TYPED_TEST_SUITE(HNSWTestParallel, DataTypeSet);
+
+TYPED_TEST(HNSWTestParallel, parallelSearchKnn) {
+    size_t n = 20000;
+    size_t k = 11;
+    size_t dim = 45;
+
+    HNSWParams params = {.dim = dim,
+                         .metric = VecSimMetric_L2,
+                         .initialCapacity = n,
+                         .M = 64,
+                         .efConstruction = 200,
+                         .efRuntime = n};
+    VecSimIndex *index = this->CreateNewIndex(params);
+
+    for (size_t i = 0; i < n; i++) {
+        GenerateAndAddVector<TEST_DATA_T>(index, dim, i, i);
+    }
+    ASSERT_EQ(VecSimIndex_IndexSize(index), n);
+
+    size_t n_threads = std::clamp(std::thread::hardware_concurrency(), 1u, 8u); // never 0
+    std::atomic_int successful_searches(0);
+    // Save the number of tasks done by thread i in the i-th entry.
+    std::vector<size_t> completed_tasks(n_threads, 0);
+
+    // Run parallel searches where every searching thread expects to get different labels as results
+    // (determined by the thread id), which are labels in the range [50+myID-5, 50+myID+5].
+    auto parallel_search = [&](int myID) {
+        completed_tasks[myID]++;
+        TEST_DATA_T query_val = 50 + myID;
+        TEST_DATA_T query[dim];
+        GenerateVector<TEST_DATA_T>(query, dim, query_val);
+        auto verify_res = [&](size_t id, double score, size_t res_index) {
+            // We expect to get the results with increasing order of the distance between the res
+            // label and the query val (query_val, query_val-1, query_val+1, query_val-2,
+            // query_val+2, ...). The score is the L2 distance between the vectors that correspond
+            // to the ids.
+            size_t diff_id = (id > query_val) ? (id - query_val) : (query_val - id);
+            ASSERT_EQ(diff_id, (res_index + 1) / 2);
+            ASSERT_EQ(score, (dim * (diff_id * diff_id)));
+        };
+        runTopKSearchTest(index, query, k, verify_res);
+        successful_searches++;
+    };
+
+    size_t memory_before = index->info().commonInfo.memory;
+    std::thread thread_objs[n_threads];
+    for (size_t i = 0; i < n_threads; i++) {
+        thread_objs[i] = std::thread(parallel_search, i);
+    }
+    for (size_t i = 0; i < n_threads; i++) {
+        thread_objs[i].join();
+    }
+    ASSERT_EQ(successful_searches, n_threads);
+
+    // Validate that every thread executed a single job.
+    ASSERT_EQ(*std::min_element(completed_tasks.begin(), completed_tasks.end()), 1);
+    ASSERT_EQ(*std::max_element(completed_tasks.begin(), completed_tasks.end()), 1);
+    // Make sure that we properly update the allocator atomically during the searches. The expected
+    // memory delta should only be the visited nodes handlers that were added to the pool.
+    size_t expected_memory = memory_before + (index->info().hnswInfo.visitedNodesPoolSize - 1) *
+                                                 (sizeof(VisitedNodesHandler) + sizeof(tag_t) * n +
+                                                  2 * sizeof(size_t) + sizeof(void *));
+    ASSERT_EQ(expected_memory, index->info().commonInfo.memory);
+
+    VecSimIndex_Free(index);
+}
+
+TYPED_TEST(HNSWTestParallel, parallelSearchKNNMulti) {
+    size_t dim = 45;
+    size_t n = 20000;
+    size_t n_labels = 1000;
+    size_t k = 11;
+
+    HNSWParams params = {
+        .dim = dim, .metric = VecSimMetric_L2, .initialCapacity = n, .M = 64, .efRuntime = n};
+    VecSimIndex *index = this->CreateNewIndex(params, true);
+
+    for (size_t i = 0; i < n; i++) {
+        GenerateAndAddVector<TEST_DATA_T>(index, dim, i % n_labels, i);
+    }
+    ASSERT_EQ(VecSimIndex_IndexSize(index), n);
+    ASSERT_EQ(index->indexLabelCount(), n_labels);
+
+    size_t n_threads = std::clamp(std::thread::hardware_concurrency(), 1u, 8u); // never 0
+    std::atomic_int successful_searches(0);
+    // Save the number of tasks done by thread i in the i-th entry.
+    std::vector<size_t> completed_tasks(n_threads, 0);
+
+    // Run parallel searches where every searching thread expects to get different labels as results
+    // (determined by the thread id), which are labels in the range [50+myID-5, 50+myID+5].
+    auto parallel_search = [&](int myID) {
+        completed_tasks[myID]++;
+        TEST_DATA_T query_val = 50 + myID;
+        TEST_DATA_T query[dim];
+        GenerateVector<TEST_DATA_T>(query, dim, query_val);
+        auto verify_res = [&](size_t id, double score, size_t res_index) {
+            size_t diff_id = (id > query_val) ? (id - query_val) : (query_val - id);
+            ASSERT_EQ(diff_id, (res_index + 1) / 2);
+            ASSERT_EQ(score, (dim * ((res_index + 1) / 2) * ((res_index + 1) / 2)));
+        };
+        runTopKSearchTest(index, query, k, verify_res);
+        successful_searches++;
+    };
+
+    std::thread thread_objs[n_threads];
+    for (size_t i = 0; i < n_threads; i++) {
+        thread_objs[i] = std::thread(parallel_search, i);
+    }
+    for (size_t i = 0; i < n_threads; i++) {
+        thread_objs[i].join();
+    }
+    ASSERT_EQ(successful_searches, n_threads);
+    // Validate that every thread executed a single job.
+    ASSERT_EQ(*std::min_element(completed_tasks.begin(), completed_tasks.end()), 1);
+    ASSERT_EQ(*std::max_element(completed_tasks.begin(), completed_tasks.end()), 1);
+
+    VecSimIndex_Free(index);
+}
+
+TYPED_TEST(HNSWTestParallel, parallelSearchCombined) {
+    size_t n = 10000;
+    size_t k = 11;
+    size_t dim = 64;
+
+    HNSWParams params = {.dim = dim,
+                         .metric = VecSimMetric_L2,
+                         .initialCapacity = n,
+                         .M = 64,
+                         .efConstruction = 200,
+                         .efRuntime = n};
+    VecSimIndex *index = this->CreateNewIndex(params);
+
+    for (size_t i = 0; i < n; i++) {
+        GenerateAndAddVector<TEST_DATA_T>(index, dim, i, i);
+    }
+    ASSERT_EQ(VecSimIndex_IndexSize(index), n);
+
+    size_t n_threads = std::clamp(std::thread::hardware_concurrency(), 1u, 15u); // never 0
+    std::atomic_int successful_searches(0);
+    // Save the number of tasks done by thread i in the i-th entry.
+    std::vector<size_t> completed_tasks(n_threads, 0);
+
+    /* Run parallel searches of three kinds: KNN, range, and batched search. */
+
+    // In knn, we expect to get different labels as results (determined by the thread id), which are
+    // labels in the range [50+myID-5, 50+myID+5].
+    auto parallel_knn_search = [&](int myID) {
+        completed_tasks[myID]++;
+        TEST_DATA_T query_val = 50 + myID;
+        TEST_DATA_T query[dim];
+        GenerateVector<TEST_DATA_T>(query, dim, query_val);
+        auto verify_res = [&](size_t id, double score, size_t res_index) {
+            // We expect to get the results with increasing order of the distance between the res
+            // label and the query val (query_val, query_val-1, query_val+1, query_val-2,
+            // query_val+2, ...). The score is the L2 distance between the vectors that correspond
+            // to the ids.
+            size_t diff_id = std::abs(id - query_val);
+            ASSERT_EQ(diff_id, (res_index + 1) / 2);
+            ASSERT_EQ(score, (dim * (diff_id * diff_id)));
+        };
+        runTopKSearchTest(index, query, k, verify_res);
+        successful_searches++;
+    };
+
+    auto parallel_range_search = [&](int myID) {
+        completed_tasks[myID]++;
+        TEST_DATA_T pivot_id = 100 + myID;
+        TEST_DATA_T query[dim];
+        GenerateVector<TEST_DATA_T>(query, dim, pivot_id);
+        auto verify_res_by_score = [&](size_t id, double score, size_t res_index) {
+            size_t diff_id = std::abs(id - pivot_id);
+            ASSERT_EQ(diff_id, (res_index + 1) / 2);
+            ASSERT_EQ(score, dim * (diff_id * diff_id));
+        };
+        size_t expected_num_results = 11;
+        // To get 11 results in the range [pivot_id-5, pivot_id+5], set the radius to an L2 score
+        // that lies between the scores of the 5th and the 6th closest ids on each side.
+        double radius = (double)dim * pow((double)expected_num_results / 2, 2);
+        runRangeQueryTest(index, query, radius, verify_res_by_score, expected_num_results,
+                          BY_SCORE);
+        successful_searches++;
+    };
+
+    auto parallel_batched_search = [&](int myID) {
+        completed_tasks[myID]++;
+        TEST_DATA_T query[dim];
+        GenerateVector<TEST_DATA_T>(query, dim, n);
+
+        VecSimBatchIterator *batchIterator = VecSimBatchIterator_New(index, query, nullptr);
+        size_t iteration_num = 0;
+
+        // Get the 5 vectors whose ids are the maximal among those that haven't been returned yet
+        // in every iteration. The results order should be sorted by their score (distance from the
+        // query vector), which means sorted from the largest id to the lowest.
+        // Run a different number of iterations for every thread id.
+        size_t total_iterations = myID;
+        size_t n_res = 5;
+        while (VecSimBatchIterator_HasNext(batchIterator) && iteration_num < total_iterations) {
+            std::vector<size_t> expected_ids(n_res);
+            for (size_t i = 0; i < n_res; i++) {
+                expected_ids[i] = (n - iteration_num * n_res - i - 1);
+            }
+            auto verify_res = [&](size_t id, double score, size_t res_index) {
+                ASSERT_TRUE(expected_ids[res_index] == id);
+            };
+            runBatchIteratorSearchTest(batchIterator, n_res, verify_res);
+            iteration_num++;
+        }
+        ASSERT_EQ(iteration_num, total_iterations);
+        VecSimBatchIterator_Free(batchIterator);
+        successful_searches++;
+    };
+
+    std::thread thread_objs[n_threads];
+    size_t memory_before = index->info().commonInfo.memory;
+    for (size_t i = 0; i < n_threads; i++) {
+        if (i % 3 == 0) {
+            thread_objs[i] = std::thread(parallel_knn_search, i);
+        } else if (i % 3 == 1) {
+            thread_objs[i] = std::thread(parallel_range_search, i);
+        } else {
+            thread_objs[i] = std::thread(parallel_batched_search, i);
+        }
+    }
+    for (size_t i = 0; i < n_threads; i++) {
+        thread_objs[i].join();
+    }
+    ASSERT_EQ(successful_searches, n_threads);
+    // Validate that every thread executed a single job.
+    ASSERT_EQ(*std::min_element(completed_tasks.begin(), completed_tasks.end()), 1);
+    ASSERT_EQ(*std::max_element(completed_tasks.begin(), completed_tasks.end()), 1);
+
+    // Make sure that we properly update the allocator atomically during the searches.
+    // Memory delta should only be the visited nodes handlers that were added to the pool.
+    size_t expected_memory = memory_before + (index->info().hnswInfo.visitedNodesPoolSize - 1) *
+                                                 (sizeof(VisitedNodesHandler) + sizeof(tag_t) * n +
+                                                  2 * sizeof(size_t) + sizeof(void *));
+    ASSERT_EQ(expected_memory, index->info().commonInfo.memory);
+    VecSimIndex_Free(index);
+}
+
+TYPED_TEST(HNSWTestParallel, parallelInsert) {
+    size_t n = 10000;
+    size_t k = 11;
+    size_t dim = 32;
+
+    HNSWParams params = {.dim = dim,
+                         .metric = VecSimMetric_L2,
+                         .initialCapacity = n,
+                         .M = 16,
+                         .efConstruction = 200};
+
+    VecSimIndex *parallel_index = this->CreateNewIndex(params);
+    size_t n_threads = 10;
+
+    // Save the number of tasks done by thread i in the i-th entry.
+    std::vector<size_t> completed_tasks(n_threads, 0);
+
+    auto parallel_insert = [&](int myID) {
+        for (labelType label = myID; label < n; label += n_threads) {
+            completed_tasks[myID]++;
+            GenerateAndAddVector<TEST_DATA_T>(parallel_index, dim, label, label);
+        }
+    };
+    std::thread thread_objs[n_threads];
+    for (size_t i = 0; i < n_threads; i++) {
+        thread_objs[i] = std::thread(parallel_insert, i);
+    }
+    for (size_t i = 0; i < n_threads; i++) {
+        thread_objs[i].join();
+    }
+    ASSERT_EQ(VecSimIndex_IndexSize(parallel_index), n);
+    // Validate that the insertions were spread evenly: every thread executed n/n_threads jobs.
+    ASSERT_EQ(*std::min_element(completed_tasks.begin(), completed_tasks.end()), n / n_threads);
+    ASSERT_EQ(*std::max_element(completed_tasks.begin(), completed_tasks.end()),
+              ceil((double)n / n_threads));
+
+    TEST_DATA_T query[dim];
+    GenerateVector<TEST_DATA_T>(query, dim, (TEST_DATA_T)n / 2);
+    auto verify_res = [&](size_t id, double score, size_t res_index) {
+        // We expect to get the results with increasing order of the distance between the res
+        // label and the query val (n/2, n/2-1, n/2+1, n/2-2, n/2+2, ...). The score is the L2
+        // distance between the vectors that correspond to the ids.
+        size_t diff_id = std::abs(int(id - n / 2));
+        ASSERT_EQ(diff_id, (res_index + 1) / 2);
+        ASSERT_EQ(score, (dim * (diff_id * diff_id)));
+    };
+    runTopKSearchTest(parallel_index, query, k, verify_res);
+    VecSimIndex_Free(parallel_index);
+}
+
+TYPED_TEST(HNSWTestParallel, parallelInsertMulti) {
+    size_t n = 10000;
+    size_t n_labels = 1000;
+    size_t per_label = n / n_labels;
+    size_t k = 11;
+    size_t dim = 32;
+
+    HNSWParams params = {.dim = dim,
+                         .metric = VecSimMetric_L2,
+                         .initialCapacity = n,
+                         .M = 16,
+                         .efConstruction = 200};
+
+    VecSimIndex *parallel_index = this->CreateNewIndex(params, true);
+    size_t n_threads = 10;
+
+    // Save the number of tasks done by thread i in the i-th entry.
+    std::vector<size_t> completed_tasks(n_threads, 0);
+    auto parallel_insert = [&](int myID) {
+        for (size_t i = myID; i < n; i += n_threads) {
+            completed_tasks[myID]++;
+            GenerateAndAddVector<TEST_DATA_T>(parallel_index, dim, i % n_labels, i);
+        }
+    };
+    std::thread thread_objs[n_threads];
+    for (size_t i = 0; i < n_threads; i++) {
+        thread_objs[i] = std::thread(parallel_insert, i);
+    }
+    for (size_t i = 0; i < n_threads; i++) {
+        thread_objs[i].join();
+    }
+    ASSERT_EQ(VecSimIndex_IndexSize(parallel_index), n);
+    // Validate that the insertions were spread evenly: every thread executed n/n_threads jobs.
+    ASSERT_EQ(*std::min_element(completed_tasks.begin(), completed_tasks.end()), n / n_threads);
+    ASSERT_EQ(*std::max_element(completed_tasks.begin(), completed_tasks.end()),
+              ceil((double)n / n_threads));
+
+    TEST_DATA_T query[dim];
+    TEST_DATA_T query_val = (TEST_DATA_T)n / 2 + 10;
+    GenerateVector<TEST_DATA_T>(query, dim, (TEST_DATA_T)query_val);
+    auto verify_res = [&](size_t id, double score, size_t res_index) {
+        // We expect to get the results with increasing order of the distance between the res
+        // label and query_val%n_labels (that is ids 10, 9, 11, ... for the current arguments).
+        // The score is the L2 distance between the vectors that correspond to the ids.
+        size_t diff_id = std::abs(int(id - (size_t)query_val % n_labels));
+        ASSERT_EQ(diff_id, (res_index + 1) / 2);
+        ASSERT_EQ(score, (dim * (diff_id * diff_id)));
+    };
+    runTopKSearchTest(parallel_index, query, k, verify_res);
+    VecSimIndex_Free(parallel_index);
+}
+
+TYPED_TEST(HNSWTestParallel, parallelInsertSearch) {
+    size_t n = 10000;
+    size_t k = 11;
+    size_t dim = 32;
+
+    HNSWParams params = {.dim = dim,
+                         .metric = VecSimMetric_L2,
+                         .initialCapacity = n,
+                         .M = 64,
+                         .efConstruction = 200,
+                         .efRuntime = n};
+
+    for (bool is_multi : {true, false}) {
+        VecSimIndex *parallel_index = this->CreateNewIndex(params, is_multi);
+        // hardware_concurrency() may return 0 (unknown): use an even count of at least 2.
+        size_t n_threads = FLOOR_EVEN(std::clamp(std::thread::hardware_concurrency(), 2u, 10u));
+        // Save the number of tasks done by thread i in the i-th entry.
+        std::vector<size_t> completed_tasks(n_threads, 0);
+
+        auto parallel_insert = [&](int myID) {
+            for (labelType label = myID; label < n; label += n_threads / 2) {
+                completed_tasks[myID]++;
+                GenerateAndAddVector<TEST_DATA_T>(parallel_index, dim, label, label);
+            }
+        };
+
+        TEST_DATA_T query_val = (TEST_DATA_T)n / 4;
+        std::atomic_int successful_searches(0);
+        auto parallel_knn_search = [&](int myID) {
+            completed_tasks[myID]++;
+            // Make sure we are still indexing in parallel to the search (<90% of n was indexed).
+            ASSERT_LT(VecSimIndex_IndexSize(parallel_index), 0.9 * n);
+            TEST_DATA_T query[dim];
+            GenerateVector<TEST_DATA_T>(query, dim, query_val);
+            auto verify_res = [&](size_t id, double score, size_t res_index) {
+                // We expect to get the results with increasing order of the distance between the
+                // res label and the query val (n/4, n/4-1, n/4+1, n/4-2, n/4+2, ...). The score is
+                // the L2 distance between the vectors that correspond to the ids.
+                size_t diff_id = std::abs(int(id - query_val));
+                ASSERT_EQ(diff_id, (res_index + 1) / 2);
+                ASSERT_EQ(score, (dim * (diff_id * diff_id)));
+            };
+            runTopKSearchTest(parallel_index, query, k, verify_res);
+            successful_searches++;
+        };
+
+        auto hnsw_index = this->CastToHNSW(parallel_index);
+        std::thread thread_objs[n_threads];
+        for (size_t i = 0; i < n_threads; i++) {
+            if (i < n_threads / 2) {
+                thread_objs[i] = std::thread(parallel_insert, i);
+            } else {
+                // Busy-wait before spawning each search thread, until the vectors of the query
+                // results are done being indexed.
+                bool wait_for_results = true;
+                while (wait_for_results) {
+                    wait_for_results = false;
+                    for (labelType res_label = query_val - k / 2; res_label <= query_val + k / 2;
+                         res_label++) {
+                        if (!hnsw_index->safeCheckIfLabelExistsInIndex(res_label, true)) {
+                            wait_for_results = true;
+                            break; // results are not ready yet, restart the check.
+                        }
+                    }
+                }
+                thread_objs[i] = std::thread(parallel_knn_search, i);
+            }
+        }
+        for (size_t i = 0; i < n_threads; i++) {
+            thread_objs[i].join();
+        }
+        ASSERT_EQ(VecSimIndex_IndexSize(parallel_index), n);
+        ASSERT_EQ(successful_searches, ceil(double(n_threads) / 2));
+        // Validate that every insertion thread executed n/(n_threads/2) jobs.
+        ASSERT_EQ(
+            *std::min_element(completed_tasks.begin(), completed_tasks.begin() + n_threads / 2),
+            n / (n_threads / 2));
+        ASSERT_EQ(
+            *std::max_element(completed_tasks.begin(), completed_tasks.begin() + n_threads / 2),
+            ceil((double)n / (n_threads / 2)));
+        // Validate that every search thread executed a single job.
+        ASSERT_EQ(*std::min_element(completed_tasks.begin() + n_threads / 2, completed_tasks.end()),
+                  1);
+        ASSERT_EQ(*std::max_element(completed_tasks.begin() + n_threads / 2, completed_tasks.end()),
+                  1);
+        VecSimIndex_Free(parallel_index);
+    }
+}
+
+TYPED_TEST(HNSWTestParallel, parallelRepairs) {
+    size_t n = 1000;
+    size_t dim = 32;
+
+    HNSWParams params = {.dim = dim, .metric = VecSimMetric_L2, .initialCapacity = n};
+
+    auto *hnsw_index = this->CastToHNSW(this->CreateNewIndex(params));
+    size_t n_threads = std::clamp(std::thread::hardware_concurrency(), 1u, 10u); // never 0
+    // Save the number of tasks done by thread i in the i-th entry.
+    std::vector<size_t> completed_tasks(n_threads, 0);
+
+    // Create some random vectors and insert them to the index.
+    std::srand(10); // seed the pseudo random generator with an arbitrary (fixed) seed.
+    for (size_t i = 0; i < n; i++) {
+        TEST_DATA_T vector[dim];
+        for (size_t j = 0; j < dim; j++) {
+            vector[j] = std::rand() / (TEST_DATA_T)RAND_MAX;
+        }
+        VecSimIndex_AddVector(hnsw_index, vector, i);
+    }
+    ASSERT_EQ(VecSimIndex_IndexSize(hnsw_index), n);
+
+    // Queue of repair jobs, each job is represented as {id, level}
+    auto jobQ = std::vector<pair<idType, size_t>>();
+
+    // Mark every vector with an even id as deleted.
+    for (size_t element_id = 0; element_id < n; element_id += 2) {
+        hnsw_index->markDelete(element_id);
+    }
+    ASSERT_EQ(hnsw_index->getNumMarkedDeleted(), n / 2);
+    // Every deleted node should have at least 2 connections to repair.
+    auto report = hnsw_index->checkIntegrity();
+    ASSERT_GE(report.connections_to_repair, n);
+
+    this->CollectRepairJobs(hnsw_index, jobQ);
+    size_t n_jobs = jobQ.size();
+    ASSERT_EQ(report.connections_to_repair, n_jobs);
+
+    auto executeRepairJobs = [&](int myID) {
+        for (size_t i = myID; i < n_jobs; i += n_threads) {
+            auto job = jobQ[i];
+            hnsw_index->repairNodeConnections(job.first, job.second); // {element_id, level}
+            completed_tasks[myID]++;
+        }
+    };
+
+    std::thread thread_objs[n_threads];
+    for (size_t i = 0; i < n_threads; i++) {
+        thread_objs[i] = std::thread(executeRepairJobs, i);
+    }
+    for (size_t i = 0; i < n_threads; i++) {
+        thread_objs[i].join();
+    }
+    // Check index integrity, also make sure that no node is pointing to a deleted node.
+    report = hnsw_index->checkIntegrity();
+    ASSERT_TRUE(report.valid_state);
+    ASSERT_EQ(report.connections_to_repair, 0);
+
+    // Validate that the tasks are spread among the threads uniformly.
+    ASSERT_EQ(*std::min_element(completed_tasks.begin(), completed_tasks.end()),
+              floorf((float)n_jobs / n_threads));
+    ASSERT_EQ(*std::max_element(completed_tasks.begin(), completed_tasks.end()),
+              ceilf((float)n_jobs / n_threads));
+    VecSimIndex_Free(hnsw_index);
+}
+
+TYPED_TEST(HNSWTestParallel, parallelRepairSearch) {
+    size_t n = 10000;
+    size_t k = 10;
+    size_t dim = 32;
+
+    HNSWParams params = {
+        .dim = dim, .metric = VecSimMetric_L2, .initialCapacity = n, .efRuntime = n};
+
+    auto *hnsw_index = this->CastToHNSW(this->CreateNewIndex(params));
+    size_t n_threads = MIN(10, FLOOR_EVEN(std::thread::hardware_concurrency()));
+    // Save the number of tasks done by thread i in the i-th entry.
+    std::vector<size_t> completed_tasks(n_threads, 0);
+
+    for (size_t i = 0; i < n; i++) {
+        GenerateAndAddVector<TEST_DATA_T>(hnsw_index, dim, i, i);
+    }
+    ASSERT_EQ(VecSimIndex_IndexSize(hnsw_index), n);
+
+    // Queue of repair jobs, each job is represented as {id, level}
+    auto jobQ = std::vector<pair<idType, size_t>>();
+
+    for (size_t element_id = 0; element_id < n; element_id += 2) {
+        hnsw_index->markDelete(element_id);
+    }
+    ASSERT_EQ(hnsw_index->getNumMarkedDeleted(), n / 2);
+    // Every deleted node i should have at least 2 connections to repair (to i-1 and i+1), except
+    // for 0 and n-1 that have at least one connection to repair.
+    ASSERT_GE(hnsw_index->checkIntegrity().connections_to_repair, n - 2);
+
+    // Collect all the nodes that require repair due to the deletions, from top level down.
+    this->CollectRepairJobs(hnsw_index, jobQ);
+    size_t n_jobs = jobQ.size();
+
+    auto executeRepairJobs = [&](int myID) {
+        for (size_t i = myID; i < n_jobs; i += n_threads / 2) {
+            auto job = jobQ[i];
+            hnsw_index->repairNodeConnections(job.first, job.second); // {element_id, level}
+            completed_tasks[myID]++;
+        }
+    };
+
+    bool run_queries = true; // NOTE(review): plain bool shared across threads, not atomic.
+    auto parallel_knn_search = [&](int myID) {
+        TEST_DATA_T query_val = (TEST_DATA_T)n / 4 + 2 * myID;
+        TEST_DATA_T query[dim];
+        GenerateVector<TEST_DATA_T>(query, dim, query_val);
+        auto verify_res = [&](size_t id, double score, size_t res_index) {
+            // We expect to get the results with increasing order of the distance between the
+            // res label and the query val and only odd labels (query_val-1, query_val+1,
+            // query_val-3, query_val+3, ...) The score is the L2 distance between the vectors that
+            // correspond to the ids.
+            size_t diff_id = std::abs(int(id - query_val));
+            ASSERT_EQ(diff_id, res_index + (1 - res_index % 2));
+            ASSERT_EQ(score, (dim * (diff_id * diff_id)));
+        };
+        do {
+            runTopKSearchTest(hnsw_index, query, k, verify_res);
+            completed_tasks[myID]++;
+        } while (run_queries);
+    };
+
+    std::thread thread_objs[n_threads];
+    // Run queries, expect to get only non-deleted vectors as results.
+    for (size_t i = n_threads / 2; i < n_threads; i++) {
+        thread_objs[i] = std::thread(parallel_knn_search, i);
+    }
+
+    // Run the repair jobs.
+    for (size_t i = 0; i < n_threads / 2; i++) {
+        thread_objs[i] = std::thread(executeRepairJobs, i);
+    }
+    for (size_t i = 0; i < n_threads / 2; i++) {
+        thread_objs[i].join();
+    }
+    // Once all the repair jobs are done, signal the query threads to finish.
+    run_queries = false;
+    for (size_t i = n_threads / 2; i < n_threads; i++) {
+        thread_objs[i].join();
+    }
+
+    // Check index integrity, also make sure that no node is pointing to a deleted node.
+    auto report = hnsw_index->checkIntegrity();
+    ASSERT_TRUE(report.valid_state);
+    ASSERT_EQ(report.connections_to_repair, 0);
+
+    // Validate that every search thread ran at least one job.
+    ASSERT_GE(*std::min_element(completed_tasks.begin() + n_threads / 2, completed_tasks.end()), 1);
+    // Validate that the repair tasks are spread among the threads uniformly.
+    ASSERT_EQ(*std::min_element(completed_tasks.begin(), completed_tasks.begin() + n_threads / 2),
+              floorf((float)n_jobs / (n_threads / 2.0)));
+    ASSERT_EQ(*std::max_element(completed_tasks.begin(), completed_tasks.begin() + n_threads / 2),
+              ceilf((float)n_jobs / (n_threads / 2.0)));
+    VecSimIndex_Free(hnsw_index);
+}
+
+TYPED_TEST(HNSWTestParallel, parallelRepairInsert) {
+    size_t n = 1000;
+    size_t k = 11;
+    size_t dim = 32;
+
+    HNSWParams params = {
+        .dim = dim, .metric = VecSimMetric_L2, .initialCapacity = n, .efRuntime = n};
+
+    auto *hnsw_index = this->CastToHNSW(this->CreateNewIndex(params));
+    size_t n_threads = MIN(8, FLOOR_EVEN(std::thread::hardware_concurrency()));
+    // Save the number of tasks done by thread i in the i-th entry.
+    std::vector<size_t> completed_tasks(n_threads, 0);
+
+    // Insert n/2 vectors to the index.
+    for (size_t i = 0; i < n / 2; i++) {
+        GenerateAndAddVector<TEST_DATA_T>(hnsw_index, dim, i, i);
+    }
+    ASSERT_EQ(VecSimIndex_IndexSize(hnsw_index), n / 2);
+
+    // Queue of repair jobs, each job is represented as {id, level}
+    auto jobQ = std::vector<pair<idType, size_t>>();
+    for (size_t element_id = 0; element_id < n / 2; element_id += 2) {
+        hnsw_index->markDelete(element_id);
+    }
+    ASSERT_EQ(hnsw_index->getNumMarkedDeleted(), n / 4);
+    // Every deleted node i should have at least 2 connections to repair (to i-1 and i+1), except
+    // for 0 that has at least one connection to repair.
+    ASSERT_GE(hnsw_index->checkIntegrity().connections_to_repair, n / 2 - 1);
+
+    // Collect all the nodes that require repair due to the deletions, from top level down.
+    this->CollectRepairJobs(hnsw_index, jobQ);
+    size_t n_jobs = jobQ.size();
+
+    auto executeRepairJobs = [&](int myID) {
+        for (size_t i = myID - n_threads / 2; i < n_jobs; i += n_threads / 2) {
+            auto job = jobQ[i];
+            hnsw_index->repairNodeConnections(job.first, job.second); // {element_id, level}
+            completed_tasks[myID]++;
+        }
+    };
+
+    auto parallel_insert = [&](int myID) {
+        // Reinsert the even ids that were deleted, and n/4 more even ids.
+        for (labelType label = 2 * myID; label < n; label += n_threads) {
+            completed_tasks[myID]++;
+            GenerateAndAddVector<TEST_DATA_T>(hnsw_index, dim, label, label);
+        }
+    };
+
+    std::thread thread_objs[n_threads];
+
+    // Insert n/2 new vectors while we repair connections.
+    for (size_t i = 0; i < n_threads / 2; i++) {
+        thread_objs[i] = std::thread(parallel_insert, i);
+    }
+    for (size_t i = n_threads / 2; i < n_threads; i++) {
+        thread_objs[i] = std::thread(executeRepairJobs, i);
+    }
+    for (size_t i = 0; i < n_threads; i++) {
+        thread_objs[i].join();
+    }
+    // Check index integrity, also make sure that no node is pointing to a deleted node.
+    ASSERT_EQ(hnsw_index->indexSize(), n);
+    auto report = hnsw_index->checkIntegrity();
+    ASSERT_TRUE(report.valid_state);
+    ASSERT_EQ(report.connections_to_repair, 0);
+
+    // Validate that the repair tasks are spread among the threads uniformly.
+    ASSERT_EQ(*std::min_element(completed_tasks.begin() + n_threads / 2, completed_tasks.end()),
+              floorf((float)n_jobs / (n_threads / 2.0)));
+    ASSERT_EQ(*std::max_element(completed_tasks.begin() + n_threads / 2, completed_tasks.end()),
+              ceilf((float)n_jobs / (n_threads / 2.0)));
+
+    // Run queries to validate the index new state.
+    TEST_DATA_T query[dim];
+    // Around 3n/4 we only have even-numbered vectors.
+    size_t query_val = 3 * n / 4;
+    GenerateVector<TEST_DATA_T>(query, dim, query_val);
+    auto verify_res_even = [&](size_t id, double score, size_t res_index) {
+        // We expect to get the results with increasing order of the distance between the
+        // res label and the query val (3n/4, 3n/4 - 2, 3n/4 + 2, 3n/4 - 4, 3n/4 + 4, ...) The score
+        // is the L2 distance between the vectors that correspond to the ids.
+        size_t diff_id = std::abs(int(id - query_val));
+        ASSERT_EQ(diff_id, res_index % 2 ? res_index + 1 : res_index);
+        ASSERT_EQ(score, (dim * (diff_id * diff_id)));
+    };
+    runTopKSearchTest(hnsw_index, query, k, verify_res_even);
+
+    // Around n/4 we should have all vectors (even and odd).
+    query_val = n / 4;
+    GenerateVector<TEST_DATA_T>(query, dim, query_val);
+    auto verify_res = [&](size_t id, double score, size_t res_index) {
+        // We expect to get the results with increasing order of the distance between the
+        // res label and the query val (n/4, n/4 - 1, n/4 + 1, n/4 - 2, n/4 + 2, ...) The score
+        // is the L2 distance between the vectors that correspond to the ids.
+        size_t diff_id = std::abs(int(id - query_val));
+        ASSERT_EQ(diff_id, (res_index + 1) / 2);
+        ASSERT_EQ(score, (dim * (diff_id * diff_id)));
+    };
+    runTopKSearchTest(hnsw_index, query, k, verify_res);
+    VecSimIndex_Free(hnsw_index);
+}
diff --git a/tests/unit/test_hnsw_tiered.cpp b/tests/unit/test_hnsw_tiered.cpp
index 92d0d82bf..341748c83 100644
--- a/tests/unit/test_hnsw_tiered.cpp
+++ b/tests/unit/test_hnsw_tiered.cpp
@@ -1,88 +1,3567 @@
+#include "VecSim/index_factories/tiered_factory.h"
 #include "VecSim/algorithms/hnsw/hnsw_tiered.h"
-#include "VecSim/algorithms/hnsw/hnsw_factory.h"
+#include "VecSim/algorithms/hnsw/hnsw_single.h"
+#include "VecSim/algorithms/hnsw/hnsw_multi.h"
+#include <string>
+#include <array>
+
 #include "test_utils.h"
 
+#include <thread>
+
 using namespace tiered_index_mock;
 
+// Runs the test for all combinations of data type (float/double) and label type (single/multi).
+
 template <typename index_type_t>
-class HNSWTieredIndexTest : public ::testing::Test {};
+class HNSWTieredIndexTest : public ::testing::Test {
+public:
+    using data_t = typename index_type_t::data_t;
+    using dist_t = typename index_type_t::dist_t;
+
+protected:
+    HNSWIndex<data_t, dist_t> *CastToHNSW(VecSimIndex *index) {
+        auto tiered_index = reinterpret_cast<TieredHNSWIndex<data_t, dist_t> *>(index);
+        return tiered_index->getHNSWIndex();
+    }
+    TieredHNSWIndex<data_t, dist_t> *CreateTieredHNSWIndex(VecSimParams &hnsw_params,
+                                                           JobQueue *jobQ, IndexExtCtx *ctx,
+                                                           size_t swap_job_threshold = 0,
+                                                           size_t flat_buffer_limit = SIZE_MAX) {
+        TieredIndexParams tiered_params = {
+            .jobQueue = jobQ,
+            .jobQueueCtx = ctx,
+            .submitCb = submit_callback,
+            .flatBufferLimit = flat_buffer_limit,
+            .primaryIndexParams = &hnsw_params,
+            .specificParams = {TieredHNSWParams{.swapJobThreshold = swap_job_threshold}}};
+        auto *tiered_index = reinterpret_cast<TieredHNSWIndex<data_t, dist_t> *>(
+            TieredFactory::NewIndex(&tiered_params));
+
+        // Set the created tiered index in the index external context (it will take ownership over
+        // the index, and we'll need to release the ctx at the end of the test).
+        ctx->index_strong_ref.reset(tiered_index);
+        return tiered_index;
+    }
+};
+
+TYPED_TEST_SUITE(HNSWTieredIndexTest, DataTypeSetExtended);
 
-TYPED_TEST_SUITE(HNSWTieredIndexTest, DataTypeSet);
+// Runs the test for each data type (float/double). The label type should be set explicitly
+// in the test.
+
+template <typename index_type_t>
+class HNSWTieredIndexTestBasic : public HNSWTieredIndexTest<index_type_t> {};
+TYPED_TEST_SUITE(HNSWTieredIndexTestBasic, DataTypeSet);
 
 TYPED_TEST(HNSWTieredIndexTest, CreateIndexInstance) {
-    std::shared_ptr<VecSimAllocator> allocator = VecSimAllocator::newVecsimAllocator();
+    // Create TieredHNSW index instance with a mock queue.
+    HNSWParams params = {.type = TypeParam::get_index_type(),
+                         .dim = 4,
+                         .metric = VecSimMetric_L2,
+                         .multi = TypeParam::isMulti()};
+    VecSimParams hnsw_params = CreateParams(params);
+    auto jobQ = JobQueue();
+    auto jobQueueCtx = new IndexExtCtx();
+    auto *tiered_index = this->CreateTieredHNSWIndex(hnsw_params, &jobQ, jobQueueCtx);
+
+    // Get the allocator from the tiered index.
+    auto allocator = tiered_index->getAllocator();
+
+    // Add a vector to the flat index.
+    TEST_DATA_T vector[tiered_index->backendIndex->getDim()];
+    GenerateVector<TEST_DATA_T>(vector, tiered_index->backendIndex->getDim());
+    labelType vector_label = 1;
+    VecSimIndex_AddVector(tiered_index->frontendIndex, vector, vector_label);
+
+    // Create a mock job that inserts some vector into the HNSW index.
+    auto insert_to_index = [](AsyncJob *job) {
+        auto *my_insert_job = reinterpret_cast<HNSWInsertJob *>(job);
+        auto my_index =
+            reinterpret_cast<TieredHNSWIndex<TEST_DATA_T, TEST_DIST_T> *>(my_insert_job->index);
+
+        // Move the vector from the temp flat index into the HNSW index.
+        // Note that we access the vector via its internal id since in index of type MULTI,
+        // this is the only way to do so (knowing the label is not enough...)
+        VecSimIndex_AddVector(my_index->backendIndex,
+                              my_index->frontendIndex->getDataByInternalId(my_insert_job->id),
+                              my_insert_job->label);
+        // TODO: enable deleting vectors by internal id for the case of moving a single vector
+        //  from the flat buffer in MULTI.
+        VecSimIndex_DeleteVector(my_index->frontendIndex, my_insert_job->label);
+        auto it = my_index->labelToInsertJobs.at(my_insert_job->label).begin();
+        ASSERT_EQ(job, *it); // Assert pointer equality
+        // Here we update labelToInsertJobs mapping, as we expect that for every insert job
+        // there will be a corresponding item in the map.
+        my_index->labelToInsertJobs.at(my_insert_job->label).erase(it);
+        delete job;
+    };
+
+    auto job = new (allocator)
+        HNSWInsertJob(tiered_index->allocator, vector_label, 0, insert_to_index, tiered_index);
+    auto jobs_vec = vecsim_stl::vector<HNSWInsertJob *>(1, job, allocator);
+    tiered_index->labelToInsertJobs.insert({vector_label, jobs_vec});
+
+    // Wrap this job with an array and submit the jobs to the queue.
+    // TODO: in the future this should be part of the tiered index "add_vector" flow, and
+    //  we can replace this to avoid the breaking of the abstraction.
+    tiered_index->submitSingleJob((AsyncJob *)job);
+    ASSERT_EQ(jobQ.size(), 1);
+
+    // Execute the job from the queue and validate that the index was updated properly.
+    thread_iteration(jobQ);
+    ASSERT_EQ(tiered_index->indexSize(), 1);
+    ASSERT_EQ(tiered_index->getDistanceFrom(1, vector), 0);
+    ASSERT_EQ(tiered_index->frontendIndex->indexSize(), 0);
+    ASSERT_EQ(tiered_index->labelToInsertJobs.at(vector_label).size(), 0);
+
+    // This will delete the index. The allocator is "fix to the point" in the body of the test,
+    // so it is safe to delete the index with `delete` and not with VecSimIndex_Free.
+    delete jobQueueCtx;
+}
+
+TYPED_TEST(HNSWTieredIndexTest, testSizeEstimation) {
+    size_t dim = 128;
+    size_t n = DEFAULT_BLOCK_SIZE;
+    size_t M = 32;
+    size_t bs = DEFAULT_BLOCK_SIZE;
+    bool isMulti = TypeParam::isMulti();
+
+    HNSWParams hnsw_params = {.type = TypeParam::get_index_type(),
+                              .dim = dim,
+                              .metric = VecSimMetric_L2,
+                              .multi = isMulti,
+                              .initialCapacity = n,
+                              .M = M};
+    VecSimParams vecsim_hnsw_params = CreateParams(hnsw_params);
+
+    auto jobQ = JobQueue();
+    auto index_ctx = new IndexExtCtx();
+    TieredIndexParams tiered_params = {.jobQueue = &jobQ,
+                                       .jobQueueCtx = index_ctx,
+                                       .submitCb = submit_callback,
+                                       .flatBufferLimit = SIZE_MAX,
+                                       .primaryIndexParams = &vecsim_hnsw_params};
+    VecSimParams params = CreateParams(tiered_params);
+    auto *index = VecSimIndex_New(&params);
+    index_ctx->index_strong_ref.reset(index);
+    auto allocator = index->getAllocator();
+
+    size_t initial_size_estimation = VecSimIndex_EstimateInitialSize(&params);
+
+    // labels_lookup hash table has additional memory, since STL implementation chooses "an
+    // appropriate prime number" higher than n as the number of allocated buckets (for n=1000, 1031
+    // buckets are created)
+    auto hnsw_index = this->CastToHNSW(index);
+    if (isMulti == false) {
+        auto hnsw = reinterpret_cast<HNSWIndex_Single<TEST_DATA_T, TEST_DIST_T> *>(hnsw_index);
+        initial_size_estimation += (hnsw->label_lookup_.bucket_count() - n) * sizeof(size_t);
+    } else { // if it's a multi-value index, cast to HNSWIndex_Multi
+        auto hnsw = reinterpret_cast<HNSWIndex_Multi<TEST_DATA_T, TEST_DIST_T> *>(hnsw_index);
+        initial_size_estimation += (hnsw->label_lookup_.bucket_count() - n) * sizeof(size_t);
+    }
+
+    ASSERT_EQ(initial_size_estimation, index->getAllocationSize());
+
+    // Add vectors up to initial capacity (initial capacity == block size).
+    for (size_t i = 0; i < n; i++) {
+        GenerateAndAddVector<TEST_DATA_T>(index, dim, i, i);
+        thread_iteration(jobQ);
+    }
+
+    // Estimate memory delta for filling up the first block and adding another block.
+    size_t estimation = VecSimIndex_EstimateElementSize(&params) * bs;
+
+    size_t memory_before = index->getAllocationSize();
+
+    // Note we are adding vectors with ascending values. This causes the number of
+    // incoming edges, which are not taken into account in EstimateElementSize,
+    // to be zero.
+    for (size_t i = 0; i < bs; i++) {
+        GenerateAndAddVector<TEST_DATA_T>(index, dim, i + bs, i + bs);
+        thread_iteration(jobQ);
+    }
+
+    size_t delta = index->getAllocationSize() - memory_before;
+
+    // Flat index should be empty, hence the index size includes only hnsw size.
+    ASSERT_EQ(index->indexSize(), hnsw_index->indexSize());
+    // We added 2 * bs vectors
+    ASSERT_EQ(index->indexSize(), 2 * bs);
+    // Which is the current maximum capacity.
+    ASSERT_EQ(index->indexSize(), index->indexCapacity());
+    ASSERT_EQ(index->indexSize(), hnsw_index->indexCapacity());
+
+    ASSERT_GE(estimation * 1.02, delta);
+    ASSERT_LE(estimation * 0.98, delta);
 
+    delete index_ctx;
+}
+
+TYPED_TEST(HNSWTieredIndexTest, addVector) {
     // Create TieredHNSW index instance with a mock queue.
-    for (auto is_multi : {true, false}) {
-        HNSWParams params = {.type = TypeParam::get_index_type(),
-                             .dim = 4,
-                             .metric = VecSimMetric_L2,
-                             .multi = is_multi};
-        auto *jobQ = new JobQueue();
-        size_t memory_ctx = 0;
-        TieredIndexParams tiered_params = {.jobQueue = jobQ,
-                                           .submitCb = submit_callback,
-                                           .memoryCtx = &memory_ctx,
-                                           .UpdateMemCb = update_mem_callback};
-        TieredHNSWParams tiered_hnsw_params = {.hnswParams = params, .tieredParams = tiered_params};
-        auto *tiered_index = reinterpret_cast<TieredHNSWIndex<TEST_DATA_T, TEST_DIST_T> *>(
-            HNSWFactory::NewTieredIndex(&tiered_hnsw_params, allocator));
-
-        // Add a vector to the flat index.
-        TEST_DATA_T vector[tiered_index->index->getDim()];
-        GenerateVector<TEST_DATA_T>(vector, tiered_index->index->getDim());
-        labelType vector_label = 1;
-        VecSimIndex_AddVector(tiered_index->flatBuffer, vector, vector_label);
-
-        // Create a mock job that inserts some vector into the HNSW index.
-        auto insert_to_index = [](void *job) {
-            auto *my_insert_job = reinterpret_cast<HNSWInsertJob *>(job);
-            auto my_index =
-                reinterpret_cast<TieredHNSWIndex<TEST_DATA_T, TEST_DIST_T> *>(my_insert_job->index);
-
-            // Move the vector from the temp flat index into the HNSW index.
-            // Note that we access the vector via its internal id since in index of type MULTI,
-            // this is the only way to do so (knowing the label is not enough...)
-            VecSimIndex_AddVector(my_index,
-                                  my_index->flatBuffer->getDataByInternalId(my_insert_job->id),
-                                  my_insert_job->label);
-            // TODO: enable deleting vectors by internal id for the case of moving a single vector
-            //  from the flat buffer in MULTI.
-            VecSimIndex_DeleteVector(my_index->flatBuffer, my_insert_job->label);
-            auto it = my_index->labelToInsertJobs[my_insert_job->label].begin();
-            ASSERT_EQ(job, *it); // Assert pointers equation
-            // Here we update labelToInsertJobs mapping, as we except that for every insert job
-            // there will be a corresponding item in the map.
-            my_index->labelToInsertJobs[my_insert_job->label].erase(it);
-            my_index->UpdateIndexMemory(my_index->memoryCtx,
-                                        my_index->getAllocator()->getAllocationSize());
+    size_t dim = 4;
+    bool isMulti = TypeParam::isMulti();
+    HNSWParams params = {.type = TypeParam::get_index_type(),
+                         .dim = dim,
+                         .metric = VecSimMetric_L2,
+                         .multi = isMulti};
+    VecSimParams hnsw_params = CreateParams(params);
+    auto jobQ = JobQueue();
+    auto index_ctx = new IndexExtCtx();
+    TieredIndexParams tiered_params = {.jobQueue = &jobQ,
+                                       .jobQueueCtx = index_ctx,
+                                       .submitCb = submit_callback,
+                                       .flatBufferLimit = SIZE_MAX,
+                                       .primaryIndexParams = &hnsw_params};
+    auto *tiered_index = reinterpret_cast<TieredHNSWIndex<TEST_DATA_T, TEST_DIST_T> *>(
+        TieredFactory::NewIndex(&tiered_params));
+    // Get the allocator from the tiered index.
+    auto allocator = tiered_index->getAllocator();
+    // Set the created tiered index in the index external context.
+    index_ctx->index_strong_ref.reset(tiered_index);
+
+    BFParams bf_params = {.type = TypeParam::get_index_type(),
+                          .dim = dim,
+                          .metric = VecSimMetric_L2,
+                          .multi = isMulti};
+
+    // Validate that memory upon creating the tiered index is as expected (no more than 2%
+    // above the expected, since in different platforms there are some minor additional
+    // allocations).
+    size_t expected_mem = TieredFactory::EstimateInitialSize(&tiered_params);
+    ASSERT_LE(expected_mem, tiered_index->getAllocationSize());
+    ASSERT_GE(expected_mem * 1.02, tiered_index->getAllocationSize());
+
+    // Create a vector and add it to the tiered index.
+    labelType vec_label = 1;
+    TEST_DATA_T vector[dim];
+    GenerateVector<TEST_DATA_T>(vector, dim, vec_label);
+    VecSimIndex_AddVector(tiered_index, vector, vec_label);
+    // Validate that the vector was inserted to the flat buffer properly.
+    ASSERT_EQ(tiered_index->indexSize(), 1);
+    ASSERT_EQ(tiered_index->backendIndex->indexSize(), 0);
+    ASSERT_EQ(tiered_index->frontendIndex->indexSize(), 1);
+    ASSERT_EQ(tiered_index->frontendIndex->indexCapacity(), DEFAULT_BLOCK_SIZE);
+    ASSERT_EQ(tiered_index->indexCapacity(), DEFAULT_BLOCK_SIZE);
+    ASSERT_EQ(tiered_index->frontendIndex->getDistanceFrom(vec_label, vector), 0);
+    // Validate that the job was created properly
+    ASSERT_EQ(tiered_index->labelToInsertJobs.at(vec_label).size(), 1);
+    ASSERT_EQ(tiered_index->labelToInsertJobs.at(vec_label)[0]->label, vec_label);
+    ASSERT_EQ(tiered_index->labelToInsertJobs.at(vec_label)[0]->id, 0);
+
+    // Account for the allocation of a new block due to the vector insertion.
+    expected_mem += (BruteForceFactory::EstimateElementSize(&bf_params)) * DEFAULT_BLOCK_SIZE;
+    // Account for the memory that was allocated in the labelToId map (approx.)
+    expected_mem += sizeof(vecsim_stl::unordered_map<labelType, idType>::value_type) +
+                    sizeof(void *) + sizeof(size_t);
+    // Account for the memory that was allocated in the labelToInsertJobs map (approx.)
+    expected_mem +=
+        sizeof(
+            vecsim_stl::unordered_map<labelType, vecsim_stl::vector<HNSWInsertJob *>>::value_type) +
+        sizeof(void *) + sizeof(size_t);
+    // Account for the inner buffer of the std::vector<HNSWInsertJob *> in the map.
+    expected_mem += sizeof(void *) + sizeof(size_t);
+    // Account for the insert job that was created.
+    expected_mem += sizeof(HNSWInsertJob) + sizeof(size_t);
+    ASSERT_GE(expected_mem * 1.02, tiered_index->getAllocationSize());
+    ASSERT_LE(expected_mem, tiered_index->getAllocationSize());
+
+    if (isMulti) {
+        // Add another vector under the same label (create another insert job)
+        VecSimIndex_AddVector(tiered_index, vector, vec_label);
+        ASSERT_EQ(tiered_index->indexSize(), 2);
+        ASSERT_EQ(tiered_index->indexLabelCount(), 1);
+        ASSERT_EQ(tiered_index->backendIndex->indexSize(), 0);
+        ASSERT_EQ(tiered_index->frontendIndex->indexSize(), 2);
+        // Validate that the second job was created properly
+        ASSERT_EQ(tiered_index->labelToInsertJobs.at(vec_label).size(), 2);
+        ASSERT_EQ(tiered_index->labelToInsertJobs.at(vec_label)[1]->label, vec_label);
+        ASSERT_EQ(tiered_index->labelToInsertJobs.at(vec_label)[1]->id, 1);
+    }
+
+    delete index_ctx;
+}
+
+TYPED_TEST(HNSWTieredIndexTest, manageIndexOwnership) {
+
+    // Create TieredHNSW index instance with a mock queue.
+    size_t dim = 4;
+    HNSWParams params = {.type = TypeParam::get_index_type(),
+                         .dim = dim,
+                         .metric = VecSimMetric_L2,
+                         .multi = TypeParam::isMulti()};
+    VecSimParams hnsw_params = CreateParams(params);
+    auto jobQ = JobQueue();
+    auto *index_ctx = new IndexExtCtx();
+    auto *tiered_index = this->CreateTieredHNSWIndex(hnsw_params, &jobQ, index_ctx);
+
+    // Get the allocator from the tiered index.
+    auto allocator = tiered_index->getAllocator();
+
+    EXPECT_EQ(index_ctx->index_strong_ref.use_count(), 1);
+    size_t initial_mem = allocator->getAllocationSize();
+
+    // Create a dummy job callback that inserts one vector into the underlying HNSW index.
+    auto dummy_job = [](AsyncJob *job) {
+        auto *my_index = reinterpret_cast<TieredHNSWIndex<TEST_DATA_T, TEST_DIST_T> *>(job->index);
+        std::this_thread::sleep_for(std::chrono::milliseconds(1000));
+        size_t dim = 4;
+        TEST_DATA_T vector[dim];
+        GenerateVector<TEST_DATA_T>(vector, dim);
+        if (my_index->backendIndex->indexCapacity() == my_index->backendIndex->indexSize()) {
+            my_index->backendIndex->increaseCapacity();
+        }
+        my_index->backendIndex->addVector(vector, my_index->backendIndex->indexSize());
+    };
+
+    std::atomic_int successful_executions(0);
+    auto job1 =
+        new (allocator) AsyncJob(allocator, HNSW_INSERT_VECTOR_JOB, dummy_job, tiered_index);
+    auto job2 =
+        new (allocator) AsyncJob(allocator, HNSW_INSERT_VECTOR_JOB, dummy_job, tiered_index);
+
+    // Wrap this job with an array and submit the jobs to the queue.
+    tiered_index->submitSingleJob(job1);
+    tiered_index->submitSingleJob(job2);
+    ASSERT_EQ(jobQ.size(), 2);
+
+    // Execute the job from the queue asynchronously, delete the index in the meantime.
+    auto run_fn = [&jobQ, &successful_executions]() {
+        // Create a temporary strong reference of the index from the weak reference that the
+        // job holds, to ensure that the index is not deleted while the job is running.
+        if (auto temp_ref = jobQ.front().index_weak_ref.lock()) {
+            // At this point we wish to validate that we have both the index strong ref (stored
+            // in index_ctx) and the weak ref owned by the job (that we currently promoted).
+            EXPECT_EQ(jobQ.front().index_weak_ref.use_count(), 2);
+
+            jobQ.front().job->Execute(jobQ.front().job);
+            successful_executions++;
+        }
+        jobQ.kick();
+    };
+    std::thread t1(run_fn);
+    std::this_thread::sleep_for(std::chrono::milliseconds(100));
+    // Delete the index while the job is still running, to ensure that the weak ref protects
+    // the index.
+    delete index_ctx;
+    EXPECT_EQ(jobQ.front().index_weak_ref.use_count(), 1);
+    t1.join();
+    // Expect that the first job will succeed.
+    ASSERT_EQ(successful_executions, 1);
+
+    // The second job should not run, since the weak reference is not supposed to become a
+    // strong reference now.
+    ASSERT_EQ(jobQ.size(), 1);
+    ASSERT_EQ(jobQ.front().index_weak_ref.use_count(), 0);
+    std::thread t2(run_fn);
+    t2.join();
+    // Expect that the second job is not successful.
+    ASSERT_EQ(successful_executions, 1);
+}
+
+TYPED_TEST(HNSWTieredIndexTest, insertJob) {
+    // Create TieredHNSW index instance with a mock queue.
+    size_t dim = 4;
+    HNSWParams params = {.type = TypeParam::get_index_type(),
+                         .dim = dim,
+                         .metric = VecSimMetric_L2,
+                         .multi = TypeParam::isMulti()};
+    VecSimParams hnsw_params = CreateParams(params);
+    auto jobQ = JobQueue();
+    auto index_ctx = new IndexExtCtx();
+
+    auto *tiered_index = this->CreateTieredHNSWIndex(hnsw_params, &jobQ, index_ctx);
+    auto allocator = tiered_index->getAllocator();
+
+    // Create a vector and add it to the tiered index.
+    labelType vec_label = 1;
+    TEST_DATA_T vector[dim];
+    GenerateVector<TEST_DATA_T>(vector, dim, vec_label);
+    VecSimIndex_AddVector(tiered_index, vector, vec_label);
+    ASSERT_EQ(tiered_index->indexSize(), 1);
+    ASSERT_EQ(tiered_index->frontendIndex->indexSize(), 1);
+
+    // Execute the insert job manually (in a synchronous manner).
+    ASSERT_EQ(jobQ.size(), 1);
+    auto *insertion_job = reinterpret_cast<HNSWInsertJob *>(jobQ.front().job);
+    ASSERT_EQ(insertion_job->label, vec_label);
+    ASSERT_EQ(insertion_job->id, 0);
+    ASSERT_EQ(insertion_job->jobType, HNSW_INSERT_VECTOR_JOB);
+
+    thread_iteration(jobQ);
+    ASSERT_EQ(tiered_index->indexSize(), 1);
+    ASSERT_EQ(tiered_index->frontendIndex->indexSize(), 0);
+    ASSERT_EQ(tiered_index->backendIndex->indexSize(), 1);
+    // The HNSW index should have allocated a single block, while the flat index should have
+    // released its block (its capacity drops back to zero).
+    ASSERT_EQ(tiered_index->backendIndex->indexCapacity(), DEFAULT_BLOCK_SIZE);
+    ASSERT_EQ(tiered_index->indexCapacity(), DEFAULT_BLOCK_SIZE);
+    ASSERT_EQ(tiered_index->frontendIndex->indexCapacity(), 0);
+    ASSERT_EQ(tiered_index->backendIndex->getDistanceFrom(vec_label, vector), 0);
+    // After the execution, the job should be removed from the labelToInsertJobs mapping.
+    ASSERT_EQ(tiered_index->labelToInsertJobs.size(), 0);
+    delete index_ctx;
+}
+
+TYPED_TEST(HNSWTieredIndexTestBasic, insertJobAsync) {
+    // Create TieredHNSW index instance with a mock queue.
+    size_t dim = 4;
+    size_t n = 5000;
+    HNSWParams params = {
+        .type = TypeParam::get_index_type(), .dim = dim, .metric = VecSimMetric_L2, .multi = false};
+    VecSimParams hnsw_params = CreateParams(params);
+    auto jobQ = JobQueue();
+    auto index_ctx = new IndexExtCtx();
+
+    auto *tiered_index = this->CreateTieredHNSWIndex(hnsw_params, &jobQ, index_ctx);
+    auto allocator = tiered_index->getAllocator();
+
+    // Launch the BG threads loop that takes jobs from the queue and executes them.
+    bool run_thread = true; // NOTE(review): plain bool handed to worker threads via std::ref.
+    for (size_t i = 0; i < THREAD_POOL_SIZE; i++) {
+        thread_pool.emplace_back(thread_main_loop, std::ref(jobQ), std::ref(run_thread));
+    }
+
+    // Insert the vectors (label i holds the vector generated from the value i).
+    for (size_t i = 0; i < n; i++) {
+        GenerateAndAddVector<TEST_DATA_T>(tiered_index, dim, i, i);
+    }
+
+    thread_pool_join(jobQ, run_thread);
+    ASSERT_EQ(tiered_index->indexSize(), n);
+    ASSERT_EQ(tiered_index->backendIndex->indexSize(), n);
+    ASSERT_EQ(tiered_index->frontendIndex->indexSize(), 0);
+    ASSERT_EQ(tiered_index->labelToInsertJobs.size(), 0);
+    ASSERT_EQ(jobQ.size(), 0);
+    // Verify that the vectors were inserted to HNSW as expected.
+    for (size_t i = 0; i < n; i++) {
+        TEST_DATA_T expected_vector[dim];
+        GenerateVector<TEST_DATA_T>(expected_vector, dim, i);
+        ASSERT_EQ(tiered_index->backendIndex->getDistanceFrom(i, expected_vector), 0);
+    }
+
+    delete index_ctx;
+}
+
+TYPED_TEST(HNSWTieredIndexTestBasic, insertJobAsyncMulti) {
+    // Create TieredHNSW index instance with a mock queue.
+    size_t dim = 4;
+    size_t n = 5000;
+    HNSWParams params = {
+        .type = TypeParam::get_index_type(), .dim = dim, .metric = VecSimMetric_L2, .multi = true};
+    VecSimParams hnsw_params = CreateParams(params);
+    size_t per_label = 5;
+    auto jobQ = JobQueue();
+    auto index_ctx = new IndexExtCtx();
+
+    auto *tiered_index = this->CreateTieredHNSWIndex(hnsw_params, &jobQ, index_ctx);
+    auto allocator = tiered_index->getAllocator();
+
+    // Launch the BG threads loop that takes jobs from the queue and executes them.
+    bool run_thread = true;
+    for (size_t i = 0; i < THREAD_POOL_SIZE; i++) {
+        thread_pool.emplace_back(thread_main_loop, std::ref(jobQ), std::ref(run_thread));
+    }
+
+    // Create and insert vectors, store them in this contiguous array.
+    TEST_DATA_T vectors[n * dim];
+    for (size_t i = 0; i < n / per_label; i++) {
+        for (size_t j = 0; j < per_label; j++) {
+            GenerateVector<TEST_DATA_T>(vectors + i * dim * per_label + j * dim, dim,
+                                        i * per_label + j);
+            tiered_index->addVector(vectors + i * dim * per_label + j * dim, i);
+        }
+    }
+
+    thread_pool_join(jobQ, run_thread);
+    EXPECT_EQ(tiered_index->backendIndex->indexSize(), n);
+    EXPECT_EQ(tiered_index->indexLabelCount(), n / per_label);
+    EXPECT_EQ(tiered_index->frontendIndex->indexSize(), 0);
+    EXPECT_EQ(tiered_index->labelToInsertJobs.size(), 0);
+    EXPECT_EQ(jobQ.size(), 0);
+    // Verify that the vectors were inserted to HNSW as expected
+    for (size_t i = 0; i < n / per_label; i++) {
+        for (size_t j = 0; j < per_label; j++) {
+            // The distance from every vector that is stored under the label i should be zero
+            EXPECT_EQ(tiered_index->backendIndex->getDistanceFrom(i, vectors + i * per_label * dim +
+                                                                         j * dim),
+                      0);
+        }
+    }
+    // Cleanup.
+    delete index_ctx;
+}
+
+TYPED_TEST(HNSWTieredIndexTestBasic, KNNSearch) {
+    size_t dim = 4;
+    size_t k = 10;
+
+    size_t n = k * 3;
+
+    // Create TieredHNSW index instance with a mock queue.
+    HNSWParams params = {
+        .type = TypeParam::get_index_type(),
+        .dim = dim,
+        .metric = VecSimMetric_L2,
+    };
+    VecSimParams hnsw_params = CreateParams(params);
+    auto jobQ = JobQueue();
+    auto index_ctx = new IndexExtCtx();
+    size_t cur_memory_usage;
+
+    auto *tiered_index = this->CreateTieredHNSWIndex(hnsw_params, &jobQ, index_ctx);
+    auto allocator = tiered_index->getAllocator();
+    EXPECT_EQ(index_ctx->index_strong_ref.use_count(), 1);
+
+    auto hnsw_index = tiered_index->backendIndex;
+    auto flat_index = tiered_index->frontendIndex;
+
+    TEST_DATA_T query_0[dim];
+    GenerateVector<TEST_DATA_T>(query_0, dim, 0);
+    TEST_DATA_T query_1mid[dim];
+    GenerateVector<TEST_DATA_T>(query_1mid, dim, n / 3);
+    TEST_DATA_T query_2mid[dim];
+    GenerateVector<TEST_DATA_T>(query_2mid, dim, n * 2 / 3);
+    TEST_DATA_T query_n[dim];
+    GenerateVector<TEST_DATA_T>(query_n, dim, n - 1);
+
+    // Search for vectors when the index is empty.
+    runTopKSearchTest(tiered_index, query_0, k, nullptr);
+
+    // Define the verification functions.
+    auto ver_res_0 = [&](size_t id, double score, size_t index) {
+        ASSERT_EQ(id, index);
+        ASSERT_DOUBLE_EQ(score, dim * id * id);
+    };
+
+    auto ver_res_1mid = [&](size_t id, double score, size_t index) {
+        ASSERT_EQ(std::abs(int(id - query_1mid[0])), (index + 1) / 2);
+        ASSERT_DOUBLE_EQ(score, dim * pow((index + 1) / 2, 2));
+    };
+
+    auto ver_res_2mid = [&](size_t id, double score, size_t index) {
+        ASSERT_EQ(std::abs(int(id - query_2mid[0])), (index + 1) / 2);
+        ASSERT_DOUBLE_EQ(score, dim * pow((index + 1) / 2, 2));
+    };
+
+    auto ver_res_n = [&](size_t id, double score, size_t index) {
+        ASSERT_EQ(id, n - 1 - index);
+        ASSERT_DOUBLE_EQ(score, dim * index * index);
+    };
+
+    // Insert n/2 vectors to the main index.
+    for (size_t i = 0; i < n / 2; i++) {
+        GenerateAndAddVector<TEST_DATA_T>(hnsw_index, dim, i, i);
+    }
+    ASSERT_EQ(tiered_index->indexSize(), n / 2);
+    ASSERT_EQ(tiered_index->indexSize(), hnsw_index->indexSize());
+
+    // Search for k vectors with the flat index empty.
+    cur_memory_usage = allocator->getAllocationSize();
+    runTopKSearchTest(tiered_index, query_0, k, ver_res_0);
+    runTopKSearchTest(tiered_index, query_1mid, k, ver_res_1mid);
+    ASSERT_EQ(allocator->getAllocationSize(), cur_memory_usage);
+
+    // Insert n/2 vectors to the flat index.
+    for (size_t i = n / 2; i < n; i++) {
+        GenerateAndAddVector<TEST_DATA_T>(flat_index, dim, i, i);
+    }
+    ASSERT_EQ(tiered_index->indexSize(), n);
+    ASSERT_EQ(tiered_index->indexSize(), hnsw_index->indexSize() + flat_index->indexSize());
+
+    cur_memory_usage = allocator->getAllocationSize();
+    // Search for k vectors so all the vectors will be from the main index.
+    runTopKSearchTest(tiered_index, query_0, k, ver_res_0);
+    // Search for k vectors so all the vectors will be from the flat index.
+    runTopKSearchTest(tiered_index, query_n, k, ver_res_n);
+    // Search for k so some of the results will be from the main and some from the flat index.
+    runTopKSearchTest(tiered_index, query_1mid, k, ver_res_1mid);
+    runTopKSearchTest(tiered_index, query_2mid, k, ver_res_2mid);
+    // Memory usage should not change.
+    ASSERT_EQ(allocator->getAllocationSize(), cur_memory_usage);
+
+    // Add some overlapping vectors to the main and flat index.
+    // adding directly to the underlying indexes to avoid jobs logic.
+    // The main index will have vectors 0 - 2n/3 and the flat index will have vectors n/3 - n
+    for (size_t i = n / 3; i < n / 2; i++) {
+        GenerateAndAddVector<TEST_DATA_T>(flat_index, dim, i, i);
+    }
+    for (size_t i = n / 2; i < n * 2 / 3; i++) {
+        GenerateAndAddVector<TEST_DATA_T>(hnsw_index, dim, i, i);
+    }
+
+    cur_memory_usage = allocator->getAllocationSize();
+    // Search for k vectors so all the vectors will be from the main index.
+    runTopKSearchTest(tiered_index, query_0, k, ver_res_0);
+    // Search for k vectors so all the vectors will be from the flat index.
+    runTopKSearchTest(tiered_index, query_n, k, ver_res_n);
+    // Search for k so some of the results will be from the main and some from the flat index.
+    runTopKSearchTest(tiered_index, query_1mid, k, ver_res_1mid);
+    runTopKSearchTest(tiered_index, query_2mid, k, ver_res_2mid);
+    // Memory usage should not change.
+    ASSERT_EQ(allocator->getAllocationSize(), cur_memory_usage);
+
+    // More edge cases:
+
+    // Search for more vectors than the index size.
+    k = n + 1;
+    runTopKSearchTest(tiered_index, query_0, k, n, ver_res_0);
+    runTopKSearchTest(tiered_index, query_n, k, n, ver_res_n);
+
+    // Search for fewer vectors than the index size, but more than the flat and main index sizes.
+    k = n * 5 / 6;
+    runTopKSearchTest(tiered_index, query_0, k, ver_res_0);
+    runTopKSearchTest(tiered_index, query_n, k, ver_res_n);
+
+    // Memory usage should not change.
+    ASSERT_EQ(allocator->getAllocationSize(), cur_memory_usage);
+
+    // Search for more vectors than the main index size, but not more than the flat index size.
+    for (size_t i = n / 2; i < n * 2 / 3; i++) {
+        VecSimIndex_DeleteVector(hnsw_index, i);
+    }
+    ASSERT_EQ(flat_index->indexSize(), n * 2 / 3);
+    ASSERT_EQ(hnsw_index->indexSize(), n / 2);
+    k = n * 2 / 3;
+    cur_memory_usage = allocator->getAllocationSize();
+    runTopKSearchTest(tiered_index, query_0, k, ver_res_0);
+    runTopKSearchTest(tiered_index, query_n, k, ver_res_n);
+    runTopKSearchTest(tiered_index, query_1mid, k, ver_res_1mid);
+    runTopKSearchTest(tiered_index, query_2mid, k, ver_res_2mid);
+    // Memory usage should not change.
+    ASSERT_EQ(allocator->getAllocationSize(), cur_memory_usage);
+
+    // Search for more vectors than the flat index size, but less than the main index size.
+    for (size_t i = n / 2; i < n; i++) {
+        VecSimIndex_DeleteVector(flat_index, i);
+    }
+    ASSERT_EQ(flat_index->indexSize(), n / 6);
+    ASSERT_EQ(hnsw_index->indexSize(), n / 2);
+    k = n / 4;
+    cur_memory_usage = allocator->getAllocationSize();
+    runTopKSearchTest(tiered_index, query_0, k, ver_res_0);
+    runTopKSearchTest(tiered_index, query_1mid, k, ver_res_1mid);
+    // Memory usage should not change.
+    ASSERT_EQ(allocator->getAllocationSize(), cur_memory_usage);
+
+    // Search for vectors when the flat index is not empty but the main index is empty.
+    for (size_t i = 0; i < n * 2 / 3; i++) {
+        VecSimIndex_DeleteVector(hnsw_index, i);
+        GenerateAndAddVector<TEST_DATA_T>(flat_index, dim, i, i);
+    }
+    ASSERT_EQ(flat_index->indexSize(), n * 2 / 3);
+    ASSERT_EQ(hnsw_index->indexSize(), 0);
+    k = n / 3;
+    cur_memory_usage = allocator->getAllocationSize();
+    runTopKSearchTest(tiered_index, query_0, k, ver_res_0);
+    runTopKSearchTest(tiered_index, query_1mid, k, ver_res_1mid);
+    // Memory usage should not change.
+    ASSERT_EQ(allocator->getAllocationSize(), cur_memory_usage);
+
+    // // // // // // // // // // // //
+    // Check behavior upon timeout.  //
+    // // // // // // // // // // // //
+
+    VecSimQueryResult_List res;
+    // Add a vector to the HNSW index so there will be a reason to query it.
+    GenerateAndAddVector<TEST_DATA_T>(hnsw_index, dim, n, n);
+
+    // Set timeout callback to always return 1 (will fail while querying the flat buffer).
+    VecSim_SetTimeoutCallbackFunction([](void *ctx) { return 1; }); // Always times out
+
+    res = VecSimIndex_TopKQuery(tiered_index, query_0, k, nullptr, BY_SCORE);
+    ASSERT_EQ(res.results, nullptr);
+    ASSERT_EQ(res.code, VecSim_QueryResult_TimedOut);
+
+    // Set timeout callback to return 1 after `checks_in_flat` checks (fails in the HNSW index).
+    // Brute-force index checks for timeout after each vector.
+    size_t checks_in_flat = flat_index->indexSize();
+    VecSimQueryParams qparams = {.timeoutCtx = &checks_in_flat};
+    VecSim_SetTimeoutCallbackFunction([](void *ctx) {
+        auto count = static_cast<size_t *>(ctx);
+        if (*count == 0) {
+            return 1;
+        }
+        (*count)--;
+        return 0;
+    });
+    res = VecSimIndex_TopKQuery(tiered_index, query_0, k, &qparams, BY_SCORE);
+    ASSERT_EQ(res.results, nullptr);
+    ASSERT_EQ(res.code, VecSim_QueryResult_TimedOut);
+    // Make sure we didn't get the timeout in the flat index.
+    checks_in_flat = flat_index->indexSize(); // Reset the counter.
+    res = VecSimIndex_TopKQuery(flat_index, query_0, k, &qparams, BY_SCORE);
+    ASSERT_EQ(res.code, VecSim_QueryResult_OK);
+    VecSimQueryResult_Free(res);
+
+    // Clean up.
+    VecSim_SetTimeoutCallbackFunction([](void *ctx) { return 0; });
+    delete index_ctx;
+}
+
+TYPED_TEST(HNSWTieredIndexTest, parallelSearch) {
+    size_t dim = 4;
+    size_t k = 10;
+    size_t n = 2000;
+    bool isMulti = TypeParam::isMulti();
+
+    // Create TieredHNSW index instance with a mock queue.
+    HNSWParams params = {
+        .type = TypeParam::get_index_type(),
+        .dim = dim,
+        .metric = VecSimMetric_L2,
+        .multi = isMulti,
+        .efRuntime = n,
+    };
+    VecSimParams hnsw_params = CreateParams(params);
+    auto jobQ = JobQueue();
+    auto index_ctx = new IndexExtCtx();
+
+    auto *tiered_index = this->CreateTieredHNSWIndex(hnsw_params, &jobQ, index_ctx);
+    auto allocator = tiered_index->getAllocator();
+    EXPECT_EQ(index_ctx->index_strong_ref.use_count(), 1);
+
+    std::atomic_int successful_searches(0);
+    auto parallel_knn_search = [](AsyncJob *job) {
+        auto *search_job = reinterpret_cast<SearchJobMock *>(job);
+        size_t k = search_job->k;
+        size_t dim = search_job->dim;
+        auto query = search_job->query;
+
+        auto verify_res = [&](size_t id, double score, size_t res_index) {
+            TEST_DATA_T element = *(TEST_DATA_T *)query;
+            ASSERT_EQ(std::abs(id - element), (res_index + 1) / 2);
+            ASSERT_EQ(score, dim * (id - element) * (id - element));
         };
+        runTopKSearchTest(job->index, query, k, verify_res);
+        search_job->successful_searches++;
+        delete job;
+    };
+
+    size_t per_label = isMulti ? 10 : 1;
+    size_t n_labels = n / per_label;
+
+    // Fill the job queue with insert and search jobs, while filling the flat index, before
+    // initializing the thread pool.
+    for (size_t i = 0; i < n; i++) {
+        // Insert a vector to the flat index and add a job to insert it to the main index.
+        GenerateAndAddVector<TEST_DATA_T>(tiered_index, dim, i % n_labels, i);
+
+        // Add a search job. Make sure the query element is between k and n_labels - k.
+        auto query = (TEST_DATA_T *)allocator->allocate(dim * sizeof(TEST_DATA_T));
+        GenerateVector<TEST_DATA_T>(query, dim, (i % (n_labels - (2 * k))) + k);
+        auto search_job = new (allocator) SearchJobMock(
+            allocator, parallel_knn_search, tiered_index, query, k, n, dim, successful_searches);
+        tiered_index->submitSingleJob(search_job);
+    }
+
+    EXPECT_EQ(tiered_index->indexSize(), n);
+    EXPECT_EQ(tiered_index->indexLabelCount(), n_labels);
+    EXPECT_EQ(tiered_index->labelToInsertJobs.size(), n_labels);
+    for (auto &it : tiered_index->labelToInsertJobs) {
+        EXPECT_EQ(it.second.size(), per_label);
+    }
+    EXPECT_EQ(tiered_index->frontendIndex->indexSize(), n);
+    EXPECT_EQ(tiered_index->backendIndex->indexSize(), 0);
+
+    // Launch the BG threads loop that takes jobs from the queue and executes them.
+    // All the vectors are already in the tiered index, so we expect to find the expected
+    // results from the get-go.
+    bool run_thread = true;
+    for (size_t i = 0; i < THREAD_POOL_SIZE; i++) {
+        thread_pool.emplace_back(thread_main_loop, std::ref(jobQ), std::ref(run_thread));
+    }
+
+    thread_pool_join(jobQ, run_thread);
+
+    EXPECT_EQ(tiered_index->backendIndex->indexSize(), n);
+    EXPECT_EQ(tiered_index->backendIndex->indexLabelCount(), n_labels);
+    EXPECT_EQ(tiered_index->frontendIndex->indexSize(), 0);
+    EXPECT_EQ(tiered_index->labelToInsertJobs.size(), 0);
+    EXPECT_EQ(successful_searches, n);
+    EXPECT_EQ(jobQ.size(), 0);
+
+    // Cleanup.
+    delete index_ctx;
+}
+
+TYPED_TEST(HNSWTieredIndexTest, parallelInsertSearch) {
+    size_t dim = 4;
+    size_t k = 10;
+    size_t n = 3000;
+
+    size_t block_size = n / 100;
+
+    bool isMulti = TypeParam::isMulti();
+
+    // Create TieredHNSW index instance with a mock queue.
+    size_t n_labels = isMulti ? n / 25 : n;
+    HNSWParams params = {
+        .type = TypeParam::get_index_type(),
+        .dim = dim,
+        .metric = VecSimMetric_L2,
+        .multi = isMulti,
+        .blockSize = block_size,
+    };
+    VecSimParams hnsw_params = CreateParams(params);
+    auto jobQ = JobQueue();
+    auto index_ctx = new IndexExtCtx();
+
+    auto *tiered_index = this->CreateTieredHNSWIndex(hnsw_params, &jobQ, index_ctx);
+    auto allocator = tiered_index->getAllocator();
+    EXPECT_EQ(index_ctx->index_strong_ref.use_count(), 1);
+
+    // Launch the BG threads loop that takes jobs from the queue and executes them.
+    // Save the number of tasks done by thread i in the i-th entry.
+    std::vector<size_t> completed_tasks(THREAD_POOL_SIZE, 0);
+    bool run_thread = true;
+    for (size_t i = 0; i < THREAD_POOL_SIZE; i++) {
+        thread_pool.emplace_back(thread_main_loop, std::ref(jobQ), std::ref(run_thread));
+    }
+    std::atomic_int successful_searches(0);
+
+    auto parallel_knn_search = [](AsyncJob *job) {
+        auto *search_job = reinterpret_cast<SearchJobMock *>(job);
+        size_t k = search_job->k;
+        auto query = search_job->query;
+        // In this test we don't care about the results, just that the search doesn't crash
+        // and returns the correct number of valid results.
+        auto verify_res = [&](size_t id, double score, size_t res_index) {};
+        runTopKSearchTest(job->index, query, k, verify_res);
+        search_job->successful_searches++;
+        delete job;
+    };
+
+    // Insert vectors in parallel to search.
+    for (size_t i = 0; i < n; i++) {
+        GenerateAndAddVector<TEST_DATA_T>(tiered_index, dim, i % n_labels, i);
+        auto query = (TEST_DATA_T *)allocator->allocate(dim * sizeof(TEST_DATA_T));
+        GenerateVector<TEST_DATA_T>(query, dim, (TEST_DATA_T)n / 4 + (i % 1000) * M_PI);
+        auto search_job = new (allocator) SearchJobMock(
+            allocator, parallel_knn_search, tiered_index, query, k, n, dim, successful_searches);
+        tiered_index->submitSingleJob(search_job);
+    }
+
+    thread_pool_join(jobQ, run_thread);
+
+    EXPECT_EQ(successful_searches, n);
+    EXPECT_EQ(tiered_index->backendIndex->indexSize(), n);
+    EXPECT_EQ(tiered_index->backendIndex->indexLabelCount(), n_labels);
+    EXPECT_EQ(tiered_index->frontendIndex->indexSize(), 0);
+    EXPECT_EQ(tiered_index->labelToInsertJobs.size(), 0);
+    EXPECT_EQ(jobQ.size(), 0);
+    // Cleanup.
+    delete index_ctx;
+}
+
+TYPED_TEST(HNSWTieredIndexTestBasic, MergeMulti) {
+    size_t dim = 4;
+
+    // Create TieredHNSW index instance with a mock queue.
+    HNSWParams params = {
+        .type = TypeParam::get_index_type(),
+        .dim = dim,
+        .metric = VecSimMetric_L2,
+        .multi = true,
+    };
+    VecSimParams hnsw_params = CreateParams(params);
+    auto jobQ = JobQueue();
+    auto *index_ctx = new IndexExtCtx();
+
+    auto *tiered_index = this->CreateTieredHNSWIndex(hnsw_params, &jobQ, index_ctx);
+    auto allocator = tiered_index->getAllocator();
+
+    auto hnsw_index = tiered_index->backendIndex;
+    auto flat_index = tiered_index->frontendIndex;
+
+    // Insert vectors with label 0 to HNSW only.
+    GenerateAndAddVector<TEST_DATA_T>(hnsw_index, dim, 0, 0);
+    GenerateAndAddVector<TEST_DATA_T>(hnsw_index, dim, 0, 1);
+    GenerateAndAddVector<TEST_DATA_T>(hnsw_index, dim, 0, 2);
+    // Insert vectors with label 1 to flat buffer only.
+    GenerateAndAddVector<TEST_DATA_T>(flat_index, dim, 1, 0);
+    GenerateAndAddVector<TEST_DATA_T>(flat_index, dim, 1, 1);
+    GenerateAndAddVector<TEST_DATA_T>(flat_index, dim, 1, 2);
+    // Insert DIFFERENT vectors with label 2 to both HNSW and flat buffer.
+    GenerateAndAddVector<TEST_DATA_T>(hnsw_index, dim, 2, 0);
+    GenerateAndAddVector<TEST_DATA_T>(flat_index, dim, 2, 1);
+
+    TEST_DATA_T query[dim];
+    GenerateVector<TEST_DATA_T>(query, dim, 0);
+
+    // Search in the tiered index for more vectors than it has. Merging the results from the two
+    // indexes should result in a list of unique vectors, even if the scores of the duplicates are
+    // different.
+    runTopKSearchTest(tiered_index, query, 5, 3, [](size_t _, double __, size_t ___) {});
+
+    delete index_ctx;
+}
+
+TYPED_TEST(HNSWTieredIndexTest, deleteFromHNSWBasic) {
+    // Create TieredHNSW index instance with a mock queue.
+    size_t dim = 4;
+    bool isMulti = TypeParam::isMulti();
+
+    HNSWParams params = {.type = TypeParam::get_index_type(),
+                         .dim = dim,
+                         .metric = VecSimMetric_L2,
+                         .multi = isMulti};
+    VecSimParams hnsw_params = CreateParams(params);
+
+    auto jobQ = JobQueue();
+    auto index_ctx = new IndexExtCtx();
+
+    auto *tiered_index = this->CreateTieredHNSWIndex(hnsw_params, &jobQ, index_ctx);
+    auto allocator = tiered_index->getAllocator();
+
+    // Delete a non existing label.
+    ASSERT_EQ(tiered_index->deleteLabelFromHNSW(0), 0);
+    ASSERT_EQ(jobQ.size(), 0);
+
+    // Insert one vector to HNSW and then delete it (it should have no neighbors to repair).
+    GenerateAndAddVector<TEST_DATA_T>(tiered_index->backendIndex, dim, 0);
+    ASSERT_EQ(tiered_index->deleteLabelFromHNSW(0), 1);
+    ASSERT_EQ(jobQ.size(), 0);
+
+    // Add another vector and remove it. Since the other vector in the index is marked deleted,
+    // this vector should have no neighbors, and again, no neighbors to repair.
+    GenerateAndAddVector<TEST_DATA_T>(tiered_index->backendIndex, dim, 1, 1);
+    ASSERT_EQ(tiered_index->deleteLabelFromHNSW(1), 1);
+    ASSERT_EQ(jobQ.size(), 0);
+
+    // Add two vectors to backendIndex and delete one; expect one repair job to be created.
+    GenerateAndAddVector<TEST_DATA_T>(tiered_index->backendIndex, dim, 2, 2);
+    GenerateAndAddVector<TEST_DATA_T>(tiered_index->backendIndex, dim, 3, 3);
+    ASSERT_EQ(tiered_index->deleteLabelFromHNSW(3), 1);
+
+    // The first job should be a repair job of the first inserted non-deleted node id (2)
+    // in level 0.
+    ASSERT_EQ(jobQ.size(), 1);
+    ASSERT_EQ(jobQ.front().job->jobType, HNSW_REPAIR_NODE_CONNECTIONS_JOB);
+    ASSERT_EQ(((HNSWRepairJob *)(jobQ.front().job))->node_id, 2);
+    ASSERT_EQ(((HNSWRepairJob *)(jobQ.front().job))->level, 0);
+    ASSERT_EQ(tiered_index->idToRepairJobs.size(), 1);
+    ASSERT_GE(tiered_index->idToRepairJobs.at(2).size(), 1);
+    ASSERT_EQ(tiered_index->idToRepairJobs.at(2)[0]->associatedSwapJobs.size(), 1);
+    ASSERT_EQ(tiered_index->idToRepairJobs.at(2)[0]->associatedSwapJobs[0]->deleted_id, 3);
+
+    ASSERT_EQ(tiered_index->indexSize(), 4);
+    ASSERT_EQ(tiered_index->getHNSWIndex()->getNumMarkedDeleted(), 3);
+    ASSERT_EQ(tiered_index->idToSwapJob.size(), 3);
+
+    delete index_ctx;
+}
+
+TYPED_TEST(HNSWTieredIndexTestBasic, deleteFromHNSWMulti) {
+    // Create TieredHNSW index instance with a mock queue.
+    size_t dim = 4;
+
+    HNSWParams params = {
+        .type = TypeParam::get_index_type(), .dim = dim, .metric = VecSimMetric_L2, .multi = true};
+    VecSimParams hnsw_params = CreateParams(params);
+
+    auto jobQ = JobQueue();
+    auto index_ctx = new IndexExtCtx();
+
+    auto *tiered_index = this->CreateTieredHNSWIndex(hnsw_params, &jobQ, index_ctx);
+    auto allocator = tiered_index->getAllocator();
+
+    // Add two vectors and delete one, expect that at least one repair job will be created.
+    GenerateAndAddVector<TEST_DATA_T>(tiered_index->backendIndex, dim, 0, 0);
+    GenerateAndAddVector<TEST_DATA_T>(tiered_index->backendIndex, dim, 1, 1);
+    ASSERT_EQ(tiered_index->deleteLabelFromHNSW(0), 1);
+    ASSERT_EQ(tiered_index->idToRepairJobs.size(), 1);
+    ASSERT_EQ(tiered_index->idToRepairJobs.at(1).size(), 1);
+    ASSERT_EQ(tiered_index->idToRepairJobs.at(1)[0]->associatedSwapJobs.size(), 1);
+    ASSERT_EQ(tiered_index->idToRepairJobs.at(1)[0]->associatedSwapJobs[0]->deleted_id, 0);
+    ASSERT_EQ(((HNSWRepairJob *)(jobQ.front().job))->node_id, 1);
+    ASSERT_EQ(((HNSWRepairJob *)(jobQ.front().job))->level, 0);
+    jobQ.pop();
+
+    // Insert another vector under the label (1) that has not been deleted.
+    GenerateAndAddVector<TEST_DATA_T>(tiered_index->backendIndex, dim, 1, 2);
+
+    // Expect to see both ids stored under this label being deleted (1 and 2), and have both
+    // ids need repair (as the connection between the two vectors is mutual). However, 1 has
+    // also an outgoing edge to its other (deleted) neighbor (0), so there will be no new
+    // repair job created for 1, since the previous repair job is expected to have both 0 and 2 in
+    // its associated swap jobs. Also, there is an edge 0->1 which will be repaired as well.
+    ASSERT_EQ(tiered_index->deleteLabelFromHNSW(1), 2);
+    ASSERT_EQ(jobQ.size(), 2);
+    ASSERT_EQ(((HNSWRepairJob *)(jobQ.front().job))->node_id, 0);
+    ASSERT_EQ(((HNSWRepairJob *)(jobQ.front().job))->level, 0);
+    jobQ.pop();
+    ASSERT_EQ(((HNSWRepairJob *)(jobQ.front().job))->node_id, 2);
+    ASSERT_EQ(((HNSWRepairJob *)(jobQ.front().job))->level, 0);
+    jobQ.pop();
+    // No new job for deleting 1->2 edge, just another associated swap job for the existing repair
+    // job of 1 (in addition to 0, we have 2).
+    ASSERT_EQ(tiered_index->idToRepairJobs.size(), 3);
+    ASSERT_EQ(tiered_index->idToRepairJobs.at(1).size(), 1);
+    ASSERT_EQ(tiered_index->idToRepairJobs.at(1)[0]->associatedSwapJobs.size(), 2);
+    ASSERT_EQ(tiered_index->idToRepairJobs.at(1)[0]->associatedSwapJobs[1]->deleted_id, 2);
+
+    ASSERT_EQ(tiered_index->idToRepairJobs.at(0).size(), 1);
+    ASSERT_EQ(tiered_index->idToRepairJobs.at(0)[0]->associatedSwapJobs.size(), 1);
+    ASSERT_EQ(tiered_index->idToRepairJobs.at(0)[0]->associatedSwapJobs[0]->deleted_id, 1);
+
+    ASSERT_EQ(tiered_index->idToRepairJobs.at(2).size(), 1);
+    ASSERT_EQ(tiered_index->idToRepairJobs.at(2)[0]->associatedSwapJobs.size(), 1);
+    ASSERT_EQ(tiered_index->idToRepairJobs.at(2)[0]->associatedSwapJobs[0]->deleted_id, 1);
 
-        HNSWInsertJob job = {
-            .base = AsyncJob{.jobType = HNSW_INSERT_VECTOR_JOB, .Execute = insert_to_index},
-            .index = tiered_index,
-            .label = vector_label};
-        tiered_index->labelToInsertJobs[vector_label].push_back(&job);
-
-        // Wrap this job with an array and submit the jobs to the queue.
-        // TODO: in the future this should be part of the tiered index "add_vector" flow, and
-        //  we can replace this to avoid the breaking of the abstraction.
-        auto **jobs = array_new<AsyncJob *>(1);
-        jobs = array_append(jobs, (AsyncJob *)&job);
-        tiered_index->SubmitJobsToQueue(tiered_index->jobQueue, (void **)jobs, 1);
-        ASSERT_EQ(jobQ->size(), 1);
-
-        // Execute the job from the queue and validate that the index was updated properly.
-        reinterpret_cast<AsyncJob *>(jobQ->front())->Execute(jobQ->front());
-        ASSERT_EQ(tiered_index->indexSize(), 1);
-        ASSERT_EQ(tiered_index->getDistanceFrom(1, vector), 0);
-        ASSERT_EQ(memory_ctx, tiered_index->getAllocator()->getAllocationSize());
-        ASSERT_EQ(tiered_index->flatBuffer->indexSize(), 0);
-        ASSERT_EQ(tiered_index->labelToInsertJobs[vector_label].size(), 0);
-
-        // Cleanup.
-        delete jobQ;
-        array_free(jobs);
-        VecSimIndex_Free(tiered_index);
+    ASSERT_EQ(tiered_index->idToSwapJob.size(), 3);
+
+    delete index_ctx;
+}
+
+TYPED_TEST(HNSWTieredIndexTestBasic, deleteFromHNSWMultiLevels) {
+    // Create TieredHNSW index instance with a mock queue.
+    size_t dim = 4;
+
+    HNSWParams params = {
+        .type = TypeParam::get_index_type(), .dim = dim, .metric = VecSimMetric_L2, .multi = false};
+    VecSimParams hnsw_params = CreateParams(params);
+    auto jobQ = JobQueue();
+    auto index_ctx = new IndexExtCtx();
+
+    auto *tiered_index = this->CreateTieredHNSWIndex(hnsw_params, &jobQ, index_ctx);
+    auto allocator = tiered_index->getAllocator();
+
+    // Test that repair jobs are created for multiple levels.
+    size_t num_elements_with_multiple_levels = 0;
+    int vec_id = -1;
+    do {
+        vec_id++;
+        GenerateAndAddVector<TEST_DATA_T>(tiered_index->backendIndex, dim, vec_id, vec_id);
+        if (tiered_index->getHNSWIndex()->element_levels_[vec_id] > 0) {
+            num_elements_with_multiple_levels++;
+        }
+    } while (num_elements_with_multiple_levels < 2);
+
+    // Delete the last inserted vector, which is in level 1.
+    ASSERT_EQ(tiered_index->deleteLabelFromHNSW(vec_id), 1);
+    ASSERT_EQ(tiered_index->getHNSWIndex()->element_levels_[vec_id], 1);
+    // This should be an array of length 1.
+    auto *level_one_neighbors = tiered_index->getHNSWIndex()->getNodeNeighborsAtLevel(vec_id, 1);
+    ASSERT_EQ(tiered_index->getHNSWIndex()->getNodeNeighborsCount(level_one_neighbors), 1);
+
+    size_t num_repair_jobs = jobQ.size();
+    // There should be at least two nodes to repair, the neighbors of vec_id in levels 0 and 1.
+    ASSERT_GE(num_repair_jobs, 2);
+    while (jobQ.size() > 1) {
+        // First we should have jobs for repairing nodes in level 0.
+        ASSERT_EQ(((HNSWRepairJob *)(jobQ.front().job))->level, 0);
+        jobQ.pop();
+    }
+
+    // The last job should be repairing the single neighbor in level 1.
+    ASSERT_EQ(((HNSWRepairJob *)(jobQ.front().job))->level, 1);
+    ASSERT_EQ(((HNSWRepairJob *)(jobQ.front().job))->node_id, *level_one_neighbors);
+
+    delete index_ctx;
+}
+
+TYPED_TEST(HNSWTieredIndexTest, deleteFromHNSWWithRepairJobExec) {
+    // Create TieredHNSW index instance with a mock queue.
+    size_t n = 1000;
+    size_t dim = 4;
+    bool isMulti = TypeParam::isMulti();
+
+    HNSWParams params = {.type = TypeParam::get_index_type(),
+                         .dim = dim,
+                         .metric = VecSimMetric_L2,
+                         .multi = isMulti,
+                         .M = 4};
+    VecSimParams hnsw_params = CreateParams(params);
+    auto jobQ = JobQueue();
+    auto index_ctx = new IndexExtCtx();
+
+    auto *tiered_index = this->CreateTieredHNSWIndex(hnsw_params, &jobQ, index_ctx);
+    auto allocator = tiered_index->getAllocator();
+
+    for (size_t i = 0; i < n; i++) {
+        GenerateAndAddVector(tiered_index->getHNSWIndex(), dim, i, i);
+    }
+
+    // Delete vectors one by one and run the resulting repair jobs.
+    while (tiered_index->getHNSWIndex()->getNumMarkedDeleted() < n) {
+        // Choose the current entry point each time (it should be modified after the deletion).
+        idType ep = tiered_index->getHNSWIndex()->safeGetEntryPointState().first;
+        auto ep_level = tiered_index->getHNSWIndex()->getMaxLevel();
+        auto incoming_neighbors =
+            tiered_index->getHNSWIndex()->safeCollectAllNodeIncomingNeighbors(ep, ep_level);
+        ASSERT_EQ(tiered_index->deleteLabelFromHNSW(ep), 1);
+        ASSERT_EQ(jobQ.size(), incoming_neighbors.size());
+        ASSERT_EQ(tiered_index->getHNSWIndex()->checkIntegrity().connections_to_repair,
+                  jobQ.size());
+        ASSERT_NE(tiered_index->getHNSWIndex()->safeGetEntryPointState().first, ep);
+
+        // Execute synchronously all the repair jobs for the current deletion.
+        while (!jobQ.empty()) {
+            idType repair_node_id = ((HNSWRepairJob *)(jobQ.front().job))->node_id;
+            auto repair_node_level = ((HNSWRepairJob *)(jobQ.front().job))->level;
+            auto orig_neighbors = tiered_index->getHNSWIndex()->getNodeNeighborsAtLevel(
+                repair_node_id, repair_node_level);
+
+            tiered_index->getHNSWIndex()->repairNodeConnections(repair_node_id, repair_node_level);
+            auto new_neighbors = tiered_index->getHNSWIndex()->getNodeNeighborsAtLevel(
+                repair_node_id, repair_node_level);
+            size_t new_neighbors_count =
+                tiered_index->getHNSWIndex()->getNodeNeighborsCount(new_neighbors);
+            // This makes sure that the deleted node is no longer in the neighbors set of the
+            // repaired node.
+            ASSERT_TRUE(std::find(new_neighbors, new_neighbors + new_neighbors_count, ep) ==
+                        new_neighbors + new_neighbors_count);
+            // Remove the job from the id -> repair_jobs lookup, so we won't think that it is
+            // still pending and avoid creating new jobs for nodes that have already been repaired
+            // as they were pointing to deleted elements.
+            tiered_index->idToRepairJobs.erase(repair_node_id);
+            jobQ.kick();
+        }
+        ASSERT_EQ(tiered_index->getHNSWIndex()->checkIntegrity().connections_to_repair, 0);
     }
+    delete index_ctx;
+}
+
+TYPED_TEST(HNSWTieredIndexTest, manageIndexOwnershipWithPendingJobs) {
+    // Create a TieredHNSW index with a mock queue; index_ctx holds the only strong ref to it.
+    size_t dim = 4;
+    bool isMulti = TypeParam::isMulti();
+
+    HNSWParams params = {.type = TypeParam::get_index_type(),
+                         .dim = dim,
+                         .metric = VecSimMetric_L2,
+                         .multi = isMulti};
+    VecSimParams hnsw_params = CreateParams(params);
+    auto jobQ = JobQueue();
+    auto *index_ctx = new IndexExtCtx();
+
+    auto *tiered_index = this->CreateTieredHNSWIndex(hnsw_params, &jobQ, index_ctx);
+    auto allocator = tiered_index->getAllocator();
+    EXPECT_EQ(index_ctx->index_strong_ref.use_count(), 1);
+
+    // Add a vector through the tiered index, which creates a pending insert job.
+    GenerateAndAddVector<TEST_DATA_T>(tiered_index, dim, 0);
+    ASSERT_EQ(tiered_index->labelToInsertJobs.size(), 1);
+
+    // Delete the index before the job is executed (this would delete the pending job as well).
+    EXPECT_EQ(jobQ.size(), 1);
+    EXPECT_EQ(jobQ.front().index_weak_ref.use_count(), 1);
+    delete index_ctx;
+    EXPECT_EQ(jobQ.size(), 1);
+    EXPECT_EQ(jobQ.front().index_weak_ref.use_count(), 0);
+    jobQ.pop();
+
+    // Recreate the index with a new ctx, this time to exercise a pending repair job.
+    index_ctx = new IndexExtCtx();
+    tiered_index = this->CreateTieredHNSWIndex(hnsw_params, &jobQ, index_ctx);
+    allocator = tiered_index->getAllocator();
+    EXPECT_EQ(index_ctx->index_strong_ref.use_count(), 1);
+
+    // Add two vectors directly to HNSW, and remove one vector to create a repair job.
+    GenerateAndAddVector<TEST_DATA_T>(tiered_index->backendIndex, dim, 0, 0);
+    GenerateAndAddVector<TEST_DATA_T>(tiered_index->backendIndex, dim, 1, 1);
+    ASSERT_EQ(tiered_index->deleteLabelFromHNSW(0), 1);
+    ASSERT_EQ(tiered_index->idToRepairJobs.size(), 1);
+
+    // Delete the index before the job is executed (this would delete the pending job as well).
+    EXPECT_EQ(jobQ.size(), 1);
+    EXPECT_EQ(jobQ.front().index_weak_ref.use_count(), 1);
+    delete index_ctx;
+    EXPECT_EQ(jobQ.size(), 1);
+    EXPECT_EQ(jobQ.front().index_weak_ref.use_count(), 0);
+}
+
+TYPED_TEST(HNSWTieredIndexTestBasic, AdHocSingle) {
+    size_t dim = 4;
+
+    // Create TieredHNSW index instance with a mock queue.
+    HNSWParams params = {
+        .type = TypeParam::get_index_type(),
+        .dim = dim,
+        .metric = VecSimMetric_L2,
+    };
+    VecSimParams hnsw_params = CreateParams(params);
+    auto jobQ = JobQueue();
+    auto index_ctx = new IndexExtCtx();
+
+    auto *tiered_index = this->CreateTieredHNSWIndex(hnsw_params, &jobQ, index_ctx);
+    auto allocator = tiered_index->getAllocator();
+
+    auto hnsw_index = tiered_index->backendIndex;
+    auto flat_index = tiered_index->frontendIndex;
+
+    TEST_DATA_T vec1[dim];
+    GenerateVector<TEST_DATA_T>(vec1, dim, 1);
+    TEST_DATA_T vec2[dim];
+    GenerateVector<TEST_DATA_T>(vec2, dim, 2);
+    TEST_DATA_T vec3[dim];
+    GenerateVector<TEST_DATA_T>(vec3, dim, 3);
+    TEST_DATA_T vec4[dim];
+    GenerateVector<TEST_DATA_T>(vec4, dim, 4);
+
+    // Insert vectors directly into the underlying tiers (HNSW backend / flat frontend).
+    VecSimIndex_AddVector(hnsw_index, vec1, 1); // vec1 is inserted to HNSW only.
+    VecSimIndex_AddVector(flat_index, vec2, 2); // vec2 is inserted to flat only.
+
+    // vec3 is inserted to both HNSW and flat, simulating a vector that was inserted
+    // to HNSW and not yet removed from flat.
+    VecSimIndex_AddVector(hnsw_index, vec3, 3);
+    VecSimIndex_AddVector(flat_index, vec3, 3);
+
+    // vec4 is not inserted to any index, simulating a non-existing vector.
+
+    // Record the allocation size before querying, to verify below that queries leave it unchanged.
+    size_t cur_memory_usage = allocator->getAllocationSize();
+
+    ASSERT_EQ(VecSimIndex_GetDistanceFrom(tiered_index, 1, vec1), 0);
+    ASSERT_EQ(VecSimIndex_GetDistanceFrom(tiered_index, 2, vec2), 0);
+    ASSERT_EQ(VecSimIndex_GetDistanceFrom(tiered_index, 3, vec3), 0);
+    ASSERT_TRUE(std::isnan(VecSimIndex_GetDistanceFrom(tiered_index, 4, vec4)));
+
+    ASSERT_EQ(cur_memory_usage, allocator->getAllocationSize());
+    delete index_ctx;
+}
+
+TYPED_TEST(HNSWTieredIndexTestBasic, AdHocMulti) {
+    size_t dim = 4;
+
+    // Create a multi-value TieredHNSW index instance with a mock queue.
+    HNSWParams params = {
+        .type = TypeParam::get_index_type(),
+        .dim = dim,
+        .metric = VecSimMetric_L2,
+        .multi = true,
+    };
+    VecSimParams hnsw_params = CreateParams(params);
+    auto jobQ = JobQueue();
+    auto index_ctx = new IndexExtCtx();
+
+    auto *tiered_index = this->CreateTieredHNSWIndex(hnsw_params, &jobQ, index_ctx);
+
+    auto hnsw_index = tiered_index->backendIndex;
+    auto flat_index = tiered_index->frontendIndex;
+    auto allocator = tiered_index->getAllocator();
+
+    TEST_DATA_T cur_element = 1;
+
+    // vec1_* are inserted to HNSW only.
+    TEST_DATA_T vec1_1[dim];
+    GenerateVector<TEST_DATA_T>(vec1_1, dim, cur_element++);
+    TEST_DATA_T vec1_2[dim];
+    GenerateVector<TEST_DATA_T>(vec1_2, dim, cur_element++);
+    TEST_DATA_T vec1_3[dim];
+    GenerateVector<TEST_DATA_T>(vec1_3, dim, cur_element++);
+
+    // vec2_* are inserted to flat only.
+    TEST_DATA_T vec2_1[dim];
+    GenerateVector<TEST_DATA_T>(vec2_1, dim, cur_element++);
+    TEST_DATA_T vec2_2[dim];
+    GenerateVector<TEST_DATA_T>(vec2_2, dim, cur_element++);
+    TEST_DATA_T vec2_3[dim];
+    GenerateVector<TEST_DATA_T>(vec2_3, dim, cur_element++);
+
+    // vec3_* are split between the tiers: vec3_1 and vec3_3 go to HNSW only, vec3_2 to flat only.
+    TEST_DATA_T vec3_1[dim];
+    GenerateVector<TEST_DATA_T>(vec3_1, dim, cur_element++);
+    TEST_DATA_T vec3_2[dim];
+    GenerateVector<TEST_DATA_T>(vec3_2, dim, cur_element++);
+    TEST_DATA_T vec3_3[dim];
+    GenerateVector<TEST_DATA_T>(vec3_3, dim, cur_element++);
+
+    // vec4_* span both tiers with overlap: vec4_1 in HNSW, vec4_2 in both, vec4_3 in flat.
+    TEST_DATA_T vec4_1[dim];
+    GenerateVector<TEST_DATA_T>(vec4_1, dim, cur_element++);
+    TEST_DATA_T vec4_2[dim];
+    GenerateVector<TEST_DATA_T>(vec4_2, dim, cur_element++);
+    TEST_DATA_T vec4_3[dim];
+    GenerateVector<TEST_DATA_T>(vec4_3, dim, cur_element++);
+
+    // vec5 is not inserted to any index, simulating a non-existing vector.
+    TEST_DATA_T vec5[dim];
+    GenerateVector<TEST_DATA_T>(vec5, dim, cur_element++);
+
+    // Insert the vectors directly into the HNSW (backend) and flat (frontend) indexes.
+    VecSimIndex_AddVector(hnsw_index, vec1_1, 1);
+    VecSimIndex_AddVector(hnsw_index, vec1_2, 1);
+    VecSimIndex_AddVector(hnsw_index, vec1_3, 1);
+
+    VecSimIndex_AddVector(flat_index, vec2_1, 2);
+    VecSimIndex_AddVector(flat_index, vec2_2, 2);
+    VecSimIndex_AddVector(flat_index, vec2_3, 2);
+
+    VecSimIndex_AddVector(hnsw_index, vec3_1, 3);
+    VecSimIndex_AddVector(flat_index, vec3_2, 3);
+    VecSimIndex_AddVector(hnsw_index, vec3_3, 3);
+
+    VecSimIndex_AddVector(hnsw_index, vec4_1, 4);
+    VecSimIndex_AddVector(hnsw_index, vec4_2, 4);
+    VecSimIndex_AddVector(flat_index, vec4_2, 4);
+    VecSimIndex_AddVector(flat_index, vec4_3, 4);
+
+    // Label 5 (vec5) is deliberately never added to either index.
+
+    // Record the allocation size before querying, to verify below that queries leave it unchanged.
+    size_t cur_memory_usage = allocator->getAllocationSize();
+
+    // Distance from any vector to its label should be 0.
+    ASSERT_EQ(VecSimIndex_GetDistanceFrom(tiered_index, 1, vec1_1), 0);
+    ASSERT_EQ(VecSimIndex_GetDistanceFrom(tiered_index, 1, vec1_2), 0);
+    ASSERT_EQ(VecSimIndex_GetDistanceFrom(tiered_index, 1, vec1_3), 0);
+    ASSERT_EQ(VecSimIndex_GetDistanceFrom(tiered_index, 2, vec2_1), 0);
+    ASSERT_EQ(VecSimIndex_GetDistanceFrom(tiered_index, 2, vec2_2), 0);
+    ASSERT_EQ(VecSimIndex_GetDistanceFrom(tiered_index, 2, vec2_3), 0);
+    ASSERT_EQ(VecSimIndex_GetDistanceFrom(tiered_index, 3, vec3_1), 0);
+    ASSERT_EQ(VecSimIndex_GetDistanceFrom(tiered_index, 3, vec3_2), 0);
+    ASSERT_EQ(VecSimIndex_GetDistanceFrom(tiered_index, 3, vec3_3), 0);
+    ASSERT_EQ(VecSimIndex_GetDistanceFrom(tiered_index, 4, vec4_1), 0);
+    ASSERT_EQ(VecSimIndex_GetDistanceFrom(tiered_index, 4, vec4_2), 0);
+    ASSERT_EQ(VecSimIndex_GetDistanceFrom(tiered_index, 4, vec4_3), 0);
+    // Distance from a non-existing label should be NaN.
+    ASSERT_TRUE(std::isnan(VecSimIndex_GetDistanceFrom(tiered_index, 5, vec5)));
+
+    ASSERT_EQ(cur_memory_usage, allocator->getAllocationSize());
+    delete index_ctx;
+}
+
+TYPED_TEST(HNSWTieredIndexTest, parallelInsertAdHoc) {
+    size_t dim = 4;
+    size_t n = 1000;
+
+    size_t block_size = n / 100;
+    bool isMulti = TypeParam::isMulti();
+
+    // Create TieredHNSW index instance with a mock queue.
+    size_t n_labels = isMulti ? n / 50 : n;
+    HNSWParams params = {
+        .type = TypeParam::get_index_type(),
+        .dim = dim,
+        .metric = VecSimMetric_L2,
+        .multi = isMulti,
+        .blockSize = block_size,
+    };
+    VecSimParams hnsw_params = CreateParams(params);
+    auto jobQ = JobQueue();
+    auto index_ctx = new IndexExtCtx();
+
+    auto *tiered_index = this->CreateTieredHNSWIndex(hnsw_params, &jobQ, index_ctx);
+    auto allocator = tiered_index->getAllocator();
+    EXPECT_EQ(index_ctx->index_strong_ref.use_count(), 1);
+
+    // Launch the BG threads loop that takes jobs from the queue and executes them.
+    bool run_thread = true;
+    for (size_t i = 0; i < THREAD_POOL_SIZE; i++) {
+        thread_pool.emplace_back(thread_main_loop, std::ref(jobQ), std::ref(run_thread));
+    }
+    std::atomic_int successful_searches(0);
+
+    auto parallel_adhoc_search = [](AsyncJob *job) {
+        auto *search_job = reinterpret_cast<SearchJobMock *>(job);
+        auto query = search_job->query;
+        size_t element = *(TEST_DATA_T *)query;
+        size_t label = element % search_job->n;
+        bool isMulti =
+            reinterpret_cast<TieredHNSWIndex<TEST_DATA_T, TEST_DIST_T> *>(search_job->index)
+                ->backendIndex->isMultiValue();
+
+        ASSERT_EQ(0, VecSimIndex_GetDistanceFrom(search_job->index, label, query));
+
+        search_job->successful_searches++;
+        delete job;
+    };
+
+    // Insert vectors while submitting, for each one, an ad-hoc search job for the same vector.
+    for (size_t i = 0; i < n; i++) {
+        GenerateAndAddVector<TEST_DATA_T>(tiered_index, dim, i % n_labels, i);
+        auto query = (TEST_DATA_T *)allocator->allocate(dim * sizeof(TEST_DATA_T));
+        GenerateVector<TEST_DATA_T>(query, dim, i);
+        auto search_job =
+            new (allocator) SearchJobMock(allocator, parallel_adhoc_search, tiered_index, query, 1,
+                                          n_labels, dim, successful_searches);
+        tiered_index->submitSingleJob(search_job);
+    }
+
+    thread_pool_join(jobQ, run_thread);
+
+    EXPECT_EQ(successful_searches, n);
+    EXPECT_EQ(tiered_index->backendIndex->indexSize(), n);
+    EXPECT_EQ(tiered_index->backendIndex->indexLabelCount(), n_labels);
+    EXPECT_EQ(tiered_index->frontendIndex->indexSize(), 0);
+    EXPECT_EQ(tiered_index->labelToInsertJobs.size(), 0);
+    EXPECT_EQ(jobQ.size(), 0);
+
+    delete index_ctx;
+}
+
+TYPED_TEST(HNSWTieredIndexTest, deleteVector) {
+    // Create TieredHNSW index instance with a mock queue.
+    size_t dim = 4;
+    HNSWParams params = {.type = TypeParam::get_index_type(),
+                         .dim = dim,
+                         .metric = VecSimMetric_L2,
+                         .multi = TypeParam::isMulti()};
+    VecSimParams hnsw_params = CreateParams(params);
+    auto jobQ = JobQueue();
+    auto index_ctx = new IndexExtCtx();
+
+    auto *tiered_index = this->CreateTieredHNSWIndex(hnsw_params, &jobQ, index_ctx);
+    auto allocator = tiered_index->getAllocator();
+
+    labelType vec_label = 0;
+    // Delete from an empty index; nothing should be removed.
+    ASSERT_EQ(tiered_index->deleteVector(vec_label), 0);
+
+    // Create a vector and add it to the tiered index (expect it to go into the flat buffer).
+    TEST_DATA_T vector[dim];
+    GenerateVector<TEST_DATA_T>(vector, dim, vec_label);
+    VecSimIndex_AddVector(tiered_index, vector, vec_label);
+    ASSERT_EQ(tiered_index->indexSize(), 1);
+    ASSERT_EQ(tiered_index->frontendIndex->indexSize(), 1);
+
+    // Expect to have one pending insert job.
+    ASSERT_EQ(tiered_index->labelToInsertJobs.size(), 1);
+    auto *job = tiered_index->labelToInsertJobs.at(vec_label).back();
+
+    // Remove the vector from the flat buffer.
+    ASSERT_EQ(tiered_index->deleteVector(vec_label), 1);
+    ASSERT_EQ(tiered_index->indexSize(), 0);
+    ASSERT_EQ(tiered_index->frontendIndex->indexSize(), 0);
+    ASSERT_EQ(tiered_index->labelToInsertJobs.size(), 0);
+    // The insert job should become invalid, and executing it should do nothing.
+    ASSERT_EQ(job->isValid, false);
+    ASSERT_EQ(reinterpret_cast<HNSWInsertJob *>(job)->id, 0);
+    thread_iteration(jobQ);
+    ASSERT_EQ(tiered_index->backendIndex->indexSize(), 0);
+
+    // Add the same vector directly to HNSW in the tiered index.
+    VecSimIndex_AddVector(tiered_index->backendIndex, vector, vec_label);
+    ASSERT_EQ(tiered_index->indexSize(), 1);
+    ASSERT_EQ(tiered_index->backendIndex->indexSize(), 1);
+
+    // Remove from the main (HNSW) index; the vector is only marked deleted, so indexSize stays 1.
+    ASSERT_EQ(tiered_index->deleteVector(vec_label), 1);
+    ASSERT_EQ(tiered_index->indexLabelCount(), 0);
+    ASSERT_EQ(tiered_index->indexSize(), 1);
+    ASSERT_EQ(tiered_index->getHNSWIndex()->getNumMarkedDeleted(), 1);
+
+    // Re-insert a deleted label with a different vector.
+    TEST_DATA_T new_vec_val = 2.0;
+    GenerateVector<TEST_DATA_T>(vector, dim, new_vec_val);
+    VecSimIndex_AddVector(tiered_index, vector, vec_label);
+    ASSERT_EQ(tiered_index->indexSize(), 2);
+    ASSERT_EQ(tiered_index->frontendIndex->indexSize(), 1);
+
+    // Move the vector to HNSW by executing the insert job.
+    thread_iteration(jobQ);
+    ASSERT_EQ(tiered_index->indexLabelCount(), 1);
+    ASSERT_EQ(tiered_index->backendIndex->indexSize(), 2);
+    // Check that the distance from the deleted vector (of zeros) to the label is the distance
+    // to the new vector (L2 distance), i.e. the marked-deleted vector is ignored.
+    TEST_DATA_T deleted_vector[dim];
+    GenerateVector<TEST_DATA_T>(deleted_vector, dim, 0);
+    ASSERT_EQ(tiered_index->backendIndex->getDistanceFrom(vec_label, deleted_vector),
+              dim * pow(new_vec_val, 2));
+
+    delete index_ctx;
+}
+
+TYPED_TEST(HNSWTieredIndexTestBasic, deleteVectorMulti) {
+    // Create a multi-value TieredHNSW index instance with a mock queue.
+    size_t dim = 4;
+    HNSWParams params = {
+        .type = TypeParam::get_index_type(), .dim = dim, .metric = VecSimMetric_L2, .multi = true};
+    VecSimParams hnsw_params = CreateParams(params);
+    auto jobQ = JobQueue();
+    auto index_ctx = new IndexExtCtx();
+
+    auto *tiered_index = this->CreateTieredHNSWIndex(hnsw_params, &jobQ, index_ctx);
+    auto allocator = tiered_index->getAllocator();
+
+    // Test some more scenarios that are relevant only for multi value index.
+    labelType vec_label = 0;
+    labelType other_vec_val = 2.0;
+    idType invalidJobsCounter = 0;
+    // Create a vector and add it directly to HNSW in the tiered index.
+    TEST_DATA_T vector[dim];
+    GenerateVector<TEST_DATA_T>(vector, dim, vec_label);
+    VecSimIndex_AddVector(tiered_index->backendIndex, vector, vec_label);
+    ASSERT_EQ(tiered_index->indexSize(), 1);
+    ASSERT_EQ(tiered_index->backendIndex->indexSize(), 1);
+
+    // Test deleting a label for which one of its vectors is in the flat index while the
+    // second one is in HNSW.
+    GenerateAndAddVector<TEST_DATA_T>(tiered_index, dim, vec_label, other_vec_val);
+    ASSERT_EQ(tiered_index->indexLabelCount(), 1);
+    ASSERT_EQ(tiered_index->indexSize(), 2);
+    ASSERT_EQ(tiered_index->deleteVector(vec_label), 2);
+    ASSERT_EQ(tiered_index->indexLabelCount(), 0);
+    ASSERT_EQ(tiered_index->getHNSWIndex()->getNumMarkedDeleted(), 1);
+    ASSERT_EQ(jobQ.front().job->isValid, false);
+    ASSERT_EQ(reinterpret_cast<HNSWInsertJob *>(jobQ.front().job)->id, invalidJobsCounter++);
+    thread_iteration(jobQ);
+    ASSERT_EQ(jobQ.size(), 0);
+
+    // Test deleting a label for which both of its vectors are in the flat index.
+    GenerateAndAddVector<TEST_DATA_T>(tiered_index, dim, vec_label, vec_label);
+    GenerateAndAddVector<TEST_DATA_T>(tiered_index, dim, vec_label, other_vec_val);
+    ASSERT_EQ(tiered_index->indexLabelCount(), 1);
+    ASSERT_EQ(tiered_index->frontendIndex->indexLabelCount(), 1);
+    ASSERT_EQ(tiered_index->frontendIndex->indexSize(), 2);
+    ASSERT_EQ(tiered_index->indexSize(), 3);
+    ASSERT_EQ(tiered_index->deleteVector(vec_label), 2);
+    ASSERT_EQ(tiered_index->indexLabelCount(), 0);
+    ASSERT_EQ(jobQ.front().job->isValid, false);
+    ASSERT_EQ(reinterpret_cast<HNSWInsertJob *>(jobQ.front().job)->id, invalidJobsCounter++);
+    thread_iteration(jobQ);
+    ASSERT_EQ(jobQ.front().job->isValid, false);
+    ASSERT_EQ(reinterpret_cast<HNSWInsertJob *>(jobQ.front().job)->id, invalidJobsCounter++);
+    thread_iteration(jobQ);
+    ASSERT_EQ(jobQ.size(), 0);
+
+    // Test deleting a label for which both of its vectors are in the HNSW index.
+    GenerateAndAddVector<TEST_DATA_T>(tiered_index, dim, vec_label, vec_label);
+    GenerateAndAddVector<TEST_DATA_T>(tiered_index, dim, vec_label, other_vec_val);
+    thread_iteration(jobQ);
+    thread_iteration(jobQ);
+    ASSERT_EQ(tiered_index->indexLabelCount(), 1);
+    ASSERT_EQ(tiered_index->frontendIndex->indexSize(), 0);
+    ASSERT_EQ(tiered_index->backendIndex->indexSize(), 3);
+    ASSERT_EQ(tiered_index->backendIndex->indexLabelCount(), 1);
+    ASSERT_EQ(tiered_index->deleteVector(vec_label), 2);
+    ASSERT_EQ(tiered_index->backendIndex->indexLabelCount(), 0);
+    ASSERT_EQ(tiered_index->getHNSWIndex()->getNumMarkedDeleted(), 3);
+
+    // Expect to see two repair jobs - one for each deleted vector internal id.
+    ASSERT_EQ(jobQ.size(), 2);
+    ASSERT_EQ(jobQ.front().job->jobType, HNSW_REPAIR_NODE_CONNECTIONS_JOB);
+    ASSERT_EQ(reinterpret_cast<HNSWRepairJob *>(jobQ.front().job)->node_id, 2);
+    thread_iteration(jobQ);
+    ASSERT_EQ(jobQ.front().job->jobType, HNSW_REPAIR_NODE_CONNECTIONS_JOB);
+    ASSERT_EQ(reinterpret_cast<HNSWRepairJob *>(jobQ.front().job)->node_id, 1);
+    thread_iteration(jobQ);
+
+    delete index_ctx;
+}
+
+TYPED_TEST(HNSWTieredIndexTestBasic, deleteVectorMultiFromFlatAdvanced) {
+
+    // Create a multi-value TieredHNSW index instance with a mock queue.
+    size_t dim = 4;
+    HNSWParams params = {
+        .type = TypeParam::get_index_type(), .dim = dim, .metric = VecSimMetric_L2, .multi = true};
+    VecSimParams hnsw_params = CreateParams(params);
+    auto jobQ = JobQueue();
+    auto index_ctx = new IndexExtCtx();
+
+    auto *tiered_index = this->CreateTieredHNSWIndex(hnsw_params, &jobQ, index_ctx);
+    auto allocator = tiered_index->getAllocator();
+
+    // Insert vectors to flat buffer under two distinct labels, so that ids 0, 2 will be associated
+    // with the first label, and ids 1, 3, 4 will be associated with the second label.
+    labelType vec_label_first = 0;
+    labelType vec_label_second = 1;
+    GenerateAndAddVector<TEST_DATA_T>(tiered_index, dim, vec_label_first);
+    GenerateAndAddVector<TEST_DATA_T>(tiered_index, dim, vec_label_second);
+    GenerateAndAddVector<TEST_DATA_T>(tiered_index, dim, vec_label_first);
+    GenerateAndAddVector<TEST_DATA_T>(tiered_index, dim, vec_label_second);
+    GenerateAndAddVector<TEST_DATA_T>(tiered_index, dim, vec_label_second);
+
+    // Remove the second label, expect to see that id 1 will hold id 2 eventually.
+    ASSERT_EQ(tiered_index->labelToInsertJobs.erase(vec_label_second), 1);
+    auto updated_ids = tiered_index->frontendIndex->deleteVectorAndGetUpdatedIds(vec_label_second);
+    ASSERT_EQ(updated_ids.size(), 1);
+    ASSERT_EQ(updated_ids.at(1).first, 2);
+    for (auto &it : updated_ids) {
+        tiered_index->updateInsertJobInternalId(it.second.first, it.first, it.second.second);
+    }
+    ASSERT_EQ(tiered_index->labelToInsertJobs.size(), 1);
+    ASSERT_EQ(tiered_index->labelToInsertJobs.at(vec_label_first).size(), 2);
+    ASSERT_EQ(tiered_index->labelToInsertJobs.at(vec_label_first)[0]->label, vec_label_first);
+    ASSERT_EQ(tiered_index->labelToInsertJobs.at(vec_label_first)[0]->id, 0);
+    ASSERT_EQ(tiered_index->labelToInsertJobs.at(vec_label_first)[1]->label, vec_label_first);
+    ASSERT_EQ(tiered_index->labelToInsertJobs.at(vec_label_first)[1]->id, 1);
+
+    ASSERT_EQ(tiered_index->indexLabelCount(), 1);
+    ASSERT_EQ(tiered_index->indexSize(), 2);
+
+    // Remove the first label, expect an empty set of updated ids and an empty index.
+    updated_ids = tiered_index->frontendIndex->deleteVectorAndGetUpdatedIds(vec_label_first);
+    ASSERT_EQ(updated_ids.size(), 0);
+    ASSERT_EQ(tiered_index->indexSize(), 0);
+    tiered_index->labelToInsertJobs.clear();
+
+    // Insert vectors to flat buffer under two distinct labels, so that ids 0, 3 will be associated
+    // with the first label, and ids 1, 2, 4 will be associated with the second label. This should
+    // test the case of multiple moves once we delete the second label:
+    // {1->4} => {1->4, 2->3} => {1->3}
+    GenerateAndAddVector<TEST_DATA_T>(tiered_index, dim, vec_label_first);
+    GenerateAndAddVector<TEST_DATA_T>(tiered_index, dim, vec_label_second);
+    GenerateAndAddVector<TEST_DATA_T>(tiered_index, dim, vec_label_second);
+    GenerateAndAddVector<TEST_DATA_T>(tiered_index, dim, vec_label_first);
+    GenerateAndAddVector<TEST_DATA_T>(tiered_index, dim, vec_label_second);
+    ASSERT_EQ(tiered_index->labelToInsertJobs.erase(vec_label_second), 1);
+    updated_ids = tiered_index->frontendIndex->deleteVectorAndGetUpdatedIds(vec_label_second);
+    ASSERT_EQ(updated_ids.size(), 1);
+    ASSERT_EQ(updated_ids.at(1).first, 3);
+    for (auto &it : updated_ids) {
+        tiered_index->updateInsertJobInternalId(it.second.first, it.first, it.second.second);
+    }
+    ASSERT_EQ(tiered_index->labelToInsertJobs.size(), 1);
+    ASSERT_EQ(tiered_index->labelToInsertJobs.at(vec_label_first).size(), 2);
+    ASSERT_EQ(tiered_index->labelToInsertJobs.at(vec_label_first)[0]->label, vec_label_first);
+    ASSERT_EQ(tiered_index->labelToInsertJobs.at(vec_label_first)[0]->id, 0);
+    ASSERT_EQ(tiered_index->labelToInsertJobs.at(vec_label_first)[1]->label, vec_label_first);
+    ASSERT_EQ(tiered_index->labelToInsertJobs.at(vec_label_first)[1]->id, 1);
+    ASSERT_EQ(tiered_index->indexLabelCount(), 1);
+    ASSERT_EQ(tiered_index->indexSize(), 2);
+    tiered_index->labelToInsertJobs.clear();
+
+    // Drain the jobs that are still in the queue before destroying the index.
+    while (!jobQ.empty()) {
+        jobQ.kick();
+    }
+    delete index_ctx;
+}
+
+TYPED_TEST(HNSWTieredIndexTest, deleteVectorAndRepairAsync) {
+    // Create TieredHNSW index instance with a mock queue, once per swap-jobs threshold value.
+    size_t dim = 4;
+    size_t n = 1000;
+    for (size_t maxSwapJobs : {(int)n + 1, 10, 1}) {
+        HNSWParams params = {.type = TypeParam::get_index_type(),
+                             .dim = dim,
+                             .metric = VecSimMetric_L2,
+                             .multi = TypeParam::isMulti(),
+                             .blockSize = 100};
+        VecSimParams hnsw_params = CreateParams(params);
+        auto jobQ = JobQueue();
+        auto index_ctx = new IndexExtCtx();
+        auto *tiered_index =
+            this->CreateTieredHNSWIndex(hnsw_params, &jobQ, index_ctx, maxSwapJobs);
+        auto allocator = tiered_index->getAllocator();
+
+        size_t per_label = TypeParam::isMulti() ? 50 : 1;
+        size_t n_labels = n / per_label;
+
+        // Launch the BG threads loop that takes jobs from the queue and executes them.
+        bool run_thread = true;
+        for (size_t i = 0; i < THREAD_POOL_SIZE; i++) {
+            thread_pool.emplace_back(thread_main_loop, std::ref(jobQ), std::ref(run_thread));
+        }
+
+        // Create and insert vectors one by one, then delete them one by one.
+        std::srand(10); // seed the pseudo-random generator with an arbitrary fixed value.
+        for (size_t i = 0; i < n; i++) {
+            TEST_DATA_T vector[dim];
+            for (size_t j = 0; j < dim; j++) {
+                vector[j] = std::rand() / (TEST_DATA_T)RAND_MAX;
+            }
+            VecSimIndex_AddVector(tiered_index, vector, i % n_labels);
+        }
+        // While a thread is ingesting a vector into HNSW, a vector may appear in both indexes
+        // (hence it will be counted twice in the index size calculation).
+        size_t index_size = tiered_index->indexSize();
+        EXPECT_GE(index_size, n);
+        EXPECT_LE(index_size, n + THREAD_POOL_SIZE);
+        EXPECT_EQ(tiered_index->indexLabelCount(), n_labels);
+        for (size_t i = 0; i < n_labels; i++) {
+            // Every vector associated with the label may appear in flat/HNSW index or in both if
+            // it is just being ingested.
+            int num_deleted = tiered_index->deleteVector(i);
+            EXPECT_GE(num_deleted, per_label);
+            EXPECT_LE(num_deleted, MIN(2 * per_label, per_label + THREAD_POOL_SIZE));
+            EXPECT_EQ(tiered_index->deleteVector(i), 0); // delete already deleted label
+        }
+        EXPECT_EQ(tiered_index->indexLabelCount(), 0);
+
+        thread_pool_join(jobQ, run_thread);
+
+        EXPECT_EQ(tiered_index->getHNSWIndex()->checkIntegrity().connections_to_repair, 0);
+        EXPECT_EQ(tiered_index->getHNSWIndex()->safeGetEntryPointState().first, INVALID_ID);
+        // Verify that we have no pending jobs.
+        EXPECT_EQ(tiered_index->labelToInsertJobs.size(), 0);
+        EXPECT_EQ(tiered_index->idToRepairJobs.size(), 0);
+        for (auto &it : tiered_index->idToSwapJob) {
+            EXPECT_EQ(it.second->pending_repair_jobs_counter.load(), 0);
+        }
+        // Delete another vector to trigger swapping of vectors that hadn't been swapped yet.
+        // If the number of swap jobs is lower than the threshold, none of them are going to be
+        // executed, but otherwise, ALL of them should be executed.
+        size_t pending_swap_jobs = tiered_index->idToSwapJob.size();
+        EXPECT_EQ(tiered_index->deleteVector(0), 0);
+        if (pending_swap_jobs > maxSwapJobs) {
+            ASSERT_EQ(tiered_index->idToSwapJob.size(), 0);
+            EXPECT_EQ(tiered_index->indexSize(), 0);
+        } else {
+            ASSERT_LE(tiered_index->idToSwapJob.size(), maxSwapJobs);
+            ASSERT_EQ(tiered_index->idToSwapJob.size(), tiered_index->indexSize());
+        }
+        EXPECT_EQ(tiered_index->getHNSWIndex()->getNumMarkedDeleted(), tiered_index->indexSize());
+
+        delete index_ctx;
+    }
+}
+
+TYPED_TEST(HNSWTieredIndexTest, alternateInsertDeleteAsync) {
+    // Create TieredHNSW index instance with a mock queue, per swap-jobs threshold and M value.
+    size_t dim = 16;
+    size_t n = 1000;
+    for (size_t maxSwapJobs : {(int)n + 1, 10, 1}) {
+        for (size_t M : {2, 16}) {
+            HNSWParams params = {.type = TypeParam::get_index_type(),
+                                 .dim = dim,
+                                 .metric = VecSimMetric_L2,
+                                 .multi = TypeParam::isMulti(),
+                                 .blockSize = 100,
+                                 .M = M};
+            VecSimParams hnsw_params = CreateParams(params);
+            auto jobQ = JobQueue();
+            auto index_ctx = new IndexExtCtx();
+
+            auto *tiered_index =
+                this->CreateTieredHNSWIndex(hnsw_params, &jobQ, index_ctx, maxSwapJobs);
+            auto allocator = tiered_index->getAllocator();
+
+            size_t per_label = TypeParam::isMulti() ? 5 : 1;
+            size_t n_labels = n / per_label;
+
+            // Launch the BG threads loop that takes jobs from the queue and executes them.
+            bool run_thread = true;
+            for (size_t i = 0; i < THREAD_POOL_SIZE; i++) {
+                thread_pool.emplace_back(thread_main_loop, std::ref(jobQ), std::ref(run_thread));
+            }
+
+            // Create and insert a batch of batch_size vectors, then delete them right after.
+            size_t batch_size = 5;
+            std::srand(10); // seed the pseudo-random generator with an arbitrary fixed value.
+            for (size_t i = 0; i < n / batch_size; i++) {
+                for (size_t l = 0; l < batch_size; l++) {
+                    TEST_DATA_T vector[dim];
+                    for (size_t j = 0; j < dim; j++) {
+                        vector[j] = std::rand() / (TEST_DATA_T)RAND_MAX;
+                    }
+                    tiered_index->addVector(vector, (i * batch_size + l) / per_label);
+                }
+                for (size_t l = 0; l < batch_size / per_label; l++) {
+                    // Every vector associated with the label may appear in flat/HNSW index or in
+                    // both if it is just being ingested.
+                    int num_deleted = tiered_index->deleteVector(i * batch_size / per_label + l);
+                    EXPECT_GE(num_deleted, per_label);
+                    EXPECT_LE(num_deleted, 2 * per_label);
+                }
+            }
+            // Vectors are deleted from flat buffer in place (in HNSW they are only marked as
+            // deleted).
+            EXPECT_GE(tiered_index->frontendIndex->indexSize(), 0);
+            EXPECT_EQ(tiered_index->indexLabelCount(), 0);
+
+            thread_pool_join(jobQ, run_thread);
+
+            EXPECT_EQ(tiered_index->getHNSWIndex()->checkIntegrity().connections_to_repair, 0);
+            EXPECT_EQ(tiered_index->getHNSWIndex()->safeGetEntryPointState().first, INVALID_ID);
+            // Verify that we have no pending jobs.
+            EXPECT_EQ(tiered_index->labelToInsertJobs.size(), 0);
+            EXPECT_EQ(tiered_index->idToRepairJobs.size(), 0);
+            for (auto &it : tiered_index->idToSwapJob) {
+                EXPECT_EQ(it.second->pending_repair_jobs_counter.load(), 0);
+            }
+            // Delete another vector to trigger swapping of vectors that hadn't been swapped yet.
+            // If the number of swap jobs is lower than the threshold, none of them are going to be
+            // executed, but otherwise, ALL of them should be executed.
+            size_t pending_swap_jobs = tiered_index->idToSwapJob.size();
+            EXPECT_EQ(tiered_index->deleteVector(0), 0);
+            ASSERT_LE(tiered_index->idToSwapJob.size(), maxSwapJobs);
+            ASSERT_EQ(tiered_index->idToSwapJob.size(), tiered_index->indexSize());
+
+            delete index_ctx;
+        }
+    }
+}
+
+TYPED_TEST(HNSWTieredIndexTest, swapJobBasic) {
+    // Check pendingSwapJobsThreshold initialization and a basic swap flow, using a mock queue.
+    size_t dim = 4;
+    HNSWParams params = {.type = TypeParam::get_index_type(),
+                         .dim = dim,
+                         .metric = VecSimMetric_L2,
+                         .multi = TypeParam::isMulti()};
+    VecSimParams hnsw_params = CreateParams(params);
+    auto jobQ = JobQueue();
+    auto index_ctx = new IndexExtCtx();
+
+    auto *tiered_index = this->CreateTieredHNSWIndex(hnsw_params, &jobQ, index_ctx);
+    auto allocator = tiered_index->getAllocator();
+
+    // Test initialization of the pendingSwapJobsThreshold value (default, above max, and 1).
+    ASSERT_EQ(tiered_index->pendingSwapJobsThreshold, DEFAULT_PENDING_SWAP_JOBS_THRESHOLD);
+    index_ctx->index_strong_ref.reset();
+
+    tiered_index = this->CreateTieredHNSWIndex(hnsw_params, &jobQ, index_ctx,
+                                               MAX_PENDING_SWAP_JOBS_THRESHOLD + 1);
+    allocator = tiered_index->getAllocator();
+    ASSERT_EQ(tiered_index->pendingSwapJobsThreshold, MAX_PENDING_SWAP_JOBS_THRESHOLD);
+    index_ctx->index_strong_ref.reset();
+
+    tiered_index = this->CreateTieredHNSWIndex(hnsw_params, &jobQ, index_ctx, 1);
+    ASSERT_EQ(tiered_index->pendingSwapJobsThreshold, 1);
+
+    allocator = tiered_index->getAllocator();
+
+    // Call reserve for the unordered maps that are going to be used, since upon initialization they
+    // consume 0 memory, but after insertion and deletion they will consume a minimal amount of
+    // memory (that is equivalent to the memory consumption upon reserving 0 buckets).
+    tiered_index->idToRepairJobs.reserve(0);
+    tiered_index->idToSwapJob.reserve(0);
+    TypeParam::isMulti() ? reinterpret_cast<HNSWIndex_Multi<TEST_DATA_T, TEST_DIST_T> *>(
+                               tiered_index->getHNSWIndex())
+                               ->label_lookup_.reserve(0)
+                         : reinterpret_cast<HNSWIndex_Single<TEST_DATA_T, TEST_DIST_T> *>(
+                               tiered_index->getHNSWIndex())
+                               ->label_lookup_.reserve(0);
+
+    size_t initial_mem = tiered_index->getAllocationSize();
+
+    GenerateAndAddVector<TEST_DATA_T>(tiered_index->backendIndex, dim, 0, 0);
+    GenerateAndAddVector<TEST_DATA_T>(tiered_index->backendIndex, dim, 1, 1);
+    EXPECT_EQ(tiered_index->indexLabelCount(), 2);
+    EXPECT_EQ(tiered_index->indexSize(), 2);
+    // Delete both vectors.
+    EXPECT_EQ(tiered_index->deleteVector(0), 1);
+    EXPECT_EQ(tiered_index->deleteVector(1), 1);
+    EXPECT_EQ(tiered_index->idToSwapJob.size(), 2);
+    EXPECT_EQ(tiered_index->getHNSWIndex()->getNumMarkedDeleted(), 2);
+    // Expect to have pending repair jobs, so that the swap jobs cannot be executed yet - for each
+    // deleted vector there should be a single repair job.
+    EXPECT_EQ(jobQ.size(), 2);
+    EXPECT_EQ(tiered_index->idToSwapJob.at(0)->pending_repair_jobs_counter.load(), 1);
+    EXPECT_EQ(tiered_index->idToSwapJob.at(1)->pending_repair_jobs_counter.load(), 1);
+    thread_iteration(jobQ);
+    thread_iteration(jobQ);
+    EXPECT_EQ(tiered_index->idToSwapJob.size(), 2);
+    // Insert another vector and remove it; expect it to have no neighbors.
+    // The threshold was set to 1, so now we expect that all the deleted vectors (which have no
+    // pending repair jobs) will be swapped.
+    GenerateAndAddVector<TEST_DATA_T>(tiered_index->backendIndex, dim, 2, 2);
+    EXPECT_EQ(tiered_index->deleteVector(2), 1);
+    EXPECT_EQ(tiered_index->idToSwapJob.size(), 0);
+    EXPECT_EQ(tiered_index->indexSize(), 0);
+    EXPECT_EQ(tiered_index->getHNSWIndex()->getNumMarkedDeleted(), 0);
+    EXPECT_EQ(jobQ.size(), 0);
+
+    // Manually reserve 0 buckets in the hash tables so that memory would be as it was before we
+    // started inserting vectors.
+    tiered_index->idToRepairJobs.reserve(0);
+    tiered_index->idToSwapJob.reserve(0);
+    // Call this just to trigger an update of the memory context (label 0 no longer exists).
+    EXPECT_EQ(tiered_index->deleteVector(0), 0);
+
+    EXPECT_EQ(tiered_index->getAllocationSize(), initial_mem);
+
+    delete index_ctx;
+    // VecSimAllocator::allocation_header_size = size_t, this should be the only memory that we
+    // account for at this point.
+    EXPECT_EQ(allocator->getAllocationSize(), sizeof(size_t));
+}
+
+TYPED_TEST(HNSWTieredIndexTest, swapJobBasic2) {
+    // Check the interplay of swap jobs and (in)valid repair jobs, using a mock queue.
+    size_t dim = 4;
+    HNSWParams params = {.type = TypeParam::get_index_type(),
+                         .dim = dim,
+                         .metric = VecSimMetric_L2,
+                         .multi = TypeParam::isMulti()};
+    VecSimParams hnsw_params = CreateParams(params);
+    auto jobQ = JobQueue();
+    auto index_ctx = new IndexExtCtx();
+
+    auto *tiered_index = this->CreateTieredHNSWIndex(hnsw_params, &jobQ, index_ctx, 1);
+    ASSERT_EQ(tiered_index->pendingSwapJobsThreshold, 1);
+    auto allocator = tiered_index->getAllocator(); // not referenced below in this test
+
+    // Insert 3 vectors, expect to have a fully connected graph.
+    idType invalid_jobs_counter = 0;
+    GenerateAndAddVector<TEST_DATA_T>(tiered_index->backendIndex, dim, 0, 0);
+    GenerateAndAddVector<TEST_DATA_T>(tiered_index->backendIndex, dim, 1, 1);
+    GenerateAndAddVector<TEST_DATA_T>(tiered_index->backendIndex, dim, 2, 2);
+    // Delete 0, expect to have two repair jobs pending for 1 and 2, and execute them.
+    EXPECT_EQ(tiered_index->deleteVector(0), 1);
+    EXPECT_EQ(jobQ.size(), 2);
+    ASSERT_EQ(jobQ.front().job->jobType, HNSW_REPAIR_NODE_CONNECTIONS_JOB);
+    thread_iteration(jobQ);
+    EXPECT_EQ(tiered_index->idToSwapJob.at(0)->pending_repair_jobs_counter.load(), 1);
+    ASSERT_EQ(jobQ.front().job->jobType, HNSW_REPAIR_NODE_CONNECTIONS_JOB);
+    thread_iteration(jobQ);
+    EXPECT_EQ(tiered_index->idToSwapJob.at(0)->pending_repair_jobs_counter.load(), 0);
+    // Delete 2, expect to create two repair jobs pending for 0 and 1. Also, expect that the swap
+    // job for 0 will be executed, so that 2 and 0 are swapped. Then, we should have only 1
+    // pending repair job for the "new" 0 - for deleting the old 1->2, while the second job for
+    // deleting the old 0->2 is invalid and reduced from the pending repair jobs counter.
+    EXPECT_EQ(tiered_index->deleteVector(2), 1);
+    EXPECT_EQ(tiered_index->indexSize(), 2);
+    EXPECT_EQ(tiered_index->getHNSWIndex()->getNumMarkedDeleted(), 1);
+
+    EXPECT_EQ(tiered_index->idToSwapJob.at(0)->pending_repair_jobs_counter.load(), 1);
+    EXPECT_EQ(jobQ.size(), 2);
+    // The first repair job should remove 1->0 (originally was 1->2).
+    ASSERT_EQ(jobQ.front().job->jobType, HNSW_REPAIR_NODE_CONNECTIONS_JOB);
+    ASSERT_EQ(reinterpret_cast<HNSWRepairJob *>(jobQ.front().job)->node_id, 1);
+    ASSERT_EQ(
+        reinterpret_cast<HNSWRepairJob *>(jobQ.front().job)->associatedSwapJobs[0]->deleted_id, 0);
+    thread_iteration(jobQ);
+    EXPECT_EQ(tiered_index->idToSwapJob.at(0)->pending_repair_jobs_counter.load(), 0);
+    // The second repair job is invalid due to the removal of (the original) 0.
+    ASSERT_EQ(jobQ.front().job->jobType, HNSW_REPAIR_NODE_CONNECTIONS_JOB);
+    ASSERT_EQ(jobQ.front().job->isValid, false);
+    ASSERT_EQ(reinterpret_cast<HNSWRepairJob *>(jobQ.front().job)->node_id, invalid_jobs_counter++);
+    ASSERT_EQ(
+        reinterpret_cast<HNSWRepairJob *>(jobQ.front().job)->associatedSwapJobs[0]->deleted_id, 0);
+    thread_iteration(jobQ);
+    // Delete 1, that should still have a 0->1 edge that should be repaired. This should cause
+    // the swap and removal of 0 (that has no more pending jobs at that point) - so that 1 would
+    // get id 0, and then the new 0 should have no pending repair jobs.
+    EXPECT_EQ(tiered_index->deleteVector(1), 1);
+    EXPECT_EQ(jobQ.size(), 1);
+    EXPECT_EQ(tiered_index->idToSwapJob.size(), 1);
+    EXPECT_EQ(tiered_index->idToSwapJob.at(0)->deleted_id, 0);
+    EXPECT_EQ(tiered_index->idToSwapJob.at(0)->pending_repair_jobs_counter.load(), 0);
+    // The repair job is invalid due to the removal of (the previous) 0.
+    ASSERT_EQ(jobQ.front().job->jobType, HNSW_REPAIR_NODE_CONNECTIONS_JOB);
+    ASSERT_EQ(jobQ.front().job->isValid, false);
+    ASSERT_EQ(reinterpret_cast<HNSWRepairJob *>(jobQ.front().job)->node_id, invalid_jobs_counter);
+    ASSERT_EQ(
+        reinterpret_cast<HNSWRepairJob *>(jobQ.front().job)->associatedSwapJobs[0]->deleted_id, 0);
+    thread_iteration(jobQ);
+    EXPECT_EQ(tiered_index->indexSize(), 1);
+    EXPECT_EQ(tiered_index->getHNSWIndex()->getNumMarkedDeleted(), 1);
+    EXPECT_EQ(tiered_index->getHNSWIndex()->safeGetEntryPointState().first, INVALID_ID);
+
+    // Call delete again, this should only trigger the swap and removal of 1
+    // (which has already been marked as deleted).
+    EXPECT_EQ(tiered_index->deleteVector(1), 0);
+    EXPECT_EQ(tiered_index->indexSize(), 0);
+    EXPECT_EQ(tiered_index->getHNSWIndex()->getNumMarkedDeleted(), 0);
+
+    delete index_ctx;
+}
+
+// A set of named (idx, n) predicates that determine whether a vector should be inserted to the
+// HNSW index (returns true) or to the flat index (returns false). The name describes the split.
+inline constexpr std::array<std::pair<std::string_view, bool (*)(size_t, size_t)>, 11> lambdas = {{
+    {"100% HNSW,   0% FLAT ", [](size_t idx, size_t n) -> bool { return 1; }},
+    {" 50% HNSW,  50% FLAT ", [](size_t idx, size_t n) -> bool { return idx % 2; }},
+    {"  0% HNSW, 100% FLAT ", [](size_t idx, size_t n) -> bool { return 0; }},
+    {" 90% HNSW,  10% FLAT ", [](size_t idx, size_t n) -> bool { return idx % 10; }},
+    {" 10% HNSW,  90% FLAT ", [](size_t idx, size_t n) -> bool { return !(idx % 10); }},
+    {" 99% HNSW,   1% FLAT ", [](size_t idx, size_t n) -> bool { return idx % 100; }},
+    {"  1% HNSW,  99% FLAT ", [](size_t idx, size_t n) -> bool { return !(idx % 100); }},
+    {"first 10% are in HNSW", [](size_t idx, size_t n) -> bool { return idx < (n / 10); }},
+    {"first 10% are in FLAT", [](size_t idx, size_t n) -> bool { return idx >= (n / 10); }},
+    {" last 10% are in FLAT", [](size_t idx, size_t n) -> bool { return idx < (9 * n / 10); }},
+    {" last 10% are in HNSW", [](size_t idx, size_t n) -> bool { return idx >= (9 * n / 10); }},
+}};
+
+TYPED_TEST(HNSWTieredIndexTest, BatchIterator) {
+    const size_t d = 4; // const, so that `query[d]` below is a regular array and not a VLA
+    size_t M = 8; // NOTE(review): not referenced anywhere in this test
+    size_t ef = 20;
+    size_t n = 1000;
+
+    size_t per_label = TypeParam::isMulti() ? 10 : 1;
+    size_t n_labels = n / per_label;
+
+    // Parameters for the TieredHNSW index instances (created per decider, with a mock queue).
+    HNSWParams hnsw_params = {
+        .type = TypeParam::get_index_type(),
+        .dim = d,
+        .metric = VecSimMetric_L2,
+        .multi = TypeParam::isMulti(),
+        .initialCapacity = n,
+        .efConstruction = ef,
+        .efRuntime = ef,
+    };
+    VecSimParams params = CreateParams(hnsw_params);
+
+    // for (auto &[decider_name, decider] : lambdas) { // TODO: not supported by clang < 16
+    for (auto &lambda : lambdas) {
+        // manually deconstruct the pair to avoid the clang error
+        auto &decider_name = lambda.first;
+        auto &decider = lambda.second;
+
+        auto jobQ = JobQueue();
+        auto index_ctx = new IndexExtCtx();
+        auto *tiered_index = this->CreateTieredHNSWIndex(params, &jobQ, index_ctx);
+        auto allocator = tiered_index->getAllocator();
+
+        auto *hnsw = tiered_index->backendIndex;
+        auto *flat = tiered_index->frontendIndex;
+
+        // For every i, add the vector (i,i,i,i) under the label i % n_labels.
+        for (size_t i = 0; i < n; i++) {
+            auto cur = decider(i, n) ? hnsw : flat;
+            GenerateAndAddVector<TEST_DATA_T>(cur, d, i % n_labels, i);
+        }
+        ASSERT_EQ(VecSimIndex_IndexSize(tiered_index), n) << decider_name;
+
+        // Query for (n,n,n,n) vector (recall that n-1 is the largest id in the index).
+        TEST_DATA_T query[d];
+        GenerateVector<TEST_DATA_T>(query, d, n);
+
+        VecSimBatchIterator *batchIterator = VecSimBatchIterator_New(tiered_index, query, nullptr);
+        size_t iteration_num = 0;
+
+        // Get the 5 vectors whose ids are the maximal among those that haven't been returned yet
+        // in every iteration. The results order should be sorted by their score (distance from
+        // the query vector), which means sorted from the largest id to the lowest.
+        size_t n_res = 5;
+        while (VecSimBatchIterator_HasNext(batchIterator)) {
+            std::vector<size_t> expected_ids(n_res);
+            for (size_t i = 0; i < n_res; i++) {
+                expected_ids[i] = (n - iteration_num * n_res - i - 1) % n_labels;
+            }
+            auto verify_res = [&](size_t id, double score, size_t index) {
+                ASSERT_EQ(expected_ids[index], id) << decider_name;
+            };
+            runBatchIteratorSearchTest(batchIterator, n_res, verify_res);
+            iteration_num++;
+        }
+        ASSERT_EQ(iteration_num, n_labels / n_res) << decider_name;
+        VecSimBatchIterator_Free(batchIterator);
+
+        // Free the index.
+        delete index_ctx;
+    }
+}
+
+TYPED_TEST(HNSWTieredIndexTest, BatchIteratorReset) {
+    const size_t d = 4; // const, so that `query[d]` below is a regular array and not a VLA
+    size_t M = 8; // NOTE(review): not referenced anywhere in this test
+    size_t ef = 20;
+    size_t n = 1000;
+
+    size_t per_label = TypeParam::isMulti() ? 10 : 1;
+    size_t n_labels = n / per_label;
+
+    // Parameters for the TieredHNSW index instances (created per decider, with a mock queue).
+    HNSWParams hnsw_params = {
+        .type = TypeParam::get_index_type(),
+        .dim = d,
+        .metric = VecSimMetric_L2,
+        .multi = TypeParam::isMulti(),
+        .initialCapacity = n,
+        .efConstruction = ef,
+        .efRuntime = ef,
+    };
+    VecSimParams params = CreateParams(hnsw_params);
+
+    // for (auto &[decider_name, decider] : lambdas) { // TODO: not supported by clang < 16
+    for (auto &lambda : lambdas) {
+        // manually deconstruct the pair to avoid the clang error
+        auto &decider_name = lambda.first;
+        auto &decider = lambda.second;
+
+        auto jobQ = JobQueue();
+        auto index_ctx = new IndexExtCtx();
+        auto *tiered_index = this->CreateTieredHNSWIndex(params, &jobQ, index_ctx);
+        auto allocator = tiered_index->getAllocator();
+
+        auto *hnsw = tiered_index->backendIndex;
+        auto *flat = tiered_index->frontendIndex;
+
+        // For every i, add the vector (i,i,i,i) under the label i % n_labels.
+        for (size_t i = 0; i < n; i++) {
+            auto cur = decider(i, n) ? hnsw : flat;
+            GenerateAndAddVector<TEST_DATA_T>(cur, d, i % n_labels, i);
+        }
+        ASSERT_EQ(VecSimIndex_IndexSize(tiered_index), n) << decider_name;
+
+        // Query for (n,n,n,n) vector (recall that n-1 is the largest id in the index).
+        TEST_DATA_T query[d];
+        GenerateVector<TEST_DATA_T>(query, d, n);
+
+        VecSimBatchIterator *batchIterator = VecSimBatchIterator_New(tiered_index, query, nullptr);
+        ASSERT_NO_FATAL_FAILURE(VecSimBatchIterator_Reset(batchIterator));
+
+        // Get the 100 vectors whose ids are the maximal among those that haven't been returned yet,
+        // in every iteration. Run this flow 3 times, resetting the iterator after each run.
+        size_t n_res = 100;
+        size_t re_runs = 3;
+
+        for (size_t take = 0; take < re_runs; take++) {
+            size_t iteration_num = 0;
+            while (VecSimBatchIterator_HasNext(batchIterator)) {
+                std::vector<size_t> expected_ids(n_res);
+                for (size_t i = 0; i < n_res; i++) {
+                    expected_ids[i] = (n - iteration_num * n_res - i - 1) % n_labels;
+                }
+                auto verify_res = [&](size_t id, double score, size_t index) {
+                    ASSERT_EQ(expected_ids[index], id) << decider_name;
+                };
+                runBatchIteratorSearchTest(batchIterator, n_res, verify_res, BY_SCORE);
+                iteration_num++;
+            }
+            ASSERT_EQ(iteration_num, n_labels / n_res) << decider_name;
+            VecSimBatchIterator_Reset(batchIterator);
+        }
+
+        // Try resetting the iterator before it is depleted (after 6 batches of 10 results).
+        n_res = 10;
+        for (size_t take = 0; take < re_runs; take++) {
+            size_t iteration_num = 0;
+            do {
+                ASSERT_TRUE(VecSimBatchIterator_HasNext(batchIterator)) << decider_name;
+                std::vector<size_t> expected_ids(n_res);
+                for (size_t i = 0; i < n_res; i++) {
+                    expected_ids[i] = (n - iteration_num * n_res - i - 1) % n_labels;
+                }
+                auto verify_res = [&](size_t id, double score, size_t index) {
+                    ASSERT_EQ(expected_ids[index], id) << decider_name;
+                };
+                runBatchIteratorSearchTest(batchIterator, n_res, verify_res, BY_SCORE);
+            } while (5 > iteration_num++);
+            VecSimBatchIterator_Reset(batchIterator);
+        }
+        VecSimBatchIterator_Free(batchIterator);
+
+        // Free the index.
+        delete index_ctx;
+    }
+}
+
+TYPED_TEST(HNSWTieredIndexTest, BatchIteratorSize1) {
+    const size_t d = 4; // const, so that `query[d]` below is a regular array and not a VLA
+    size_t M = 8; // NOTE(review): not referenced anywhere in this test
+    size_t ef = 20;
+    size_t n = 1000;
+
+    size_t per_label = TypeParam::isMulti() ? 10 : 1;
+    size_t n_labels = n / per_label;
+
+    // Parameters for the TieredHNSW index instances (created per decider, with a mock queue).
+    HNSWParams hnsw_params = {
+        .type = TypeParam::get_index_type(),
+        .dim = d,
+        .metric = VecSimMetric_L2,
+        .multi = TypeParam::isMulti(),
+        .initialCapacity = n,
+        .efConstruction = ef,
+        .efRuntime = ef,
+    };
+    VecSimParams params = CreateParams(hnsw_params);
+
+    // for (auto &[decider_name, decider] : lambdas) { // TODO: not supported by clang < 16
+    for (auto &lambda : lambdas) {
+        // manually deconstruct the pair to avoid the clang error
+        auto &decider_name = lambda.first;
+        auto &decider = lambda.second;
+
+        auto jobQ = JobQueue();
+        auto index_ctx = new IndexExtCtx();
+        auto *tiered_index = this->CreateTieredHNSWIndex(params, &jobQ, index_ctx);
+        auto allocator = tiered_index->getAllocator();
+
+        auto *hnsw = tiered_index->backendIndex;
+        auto *flat = tiered_index->frontendIndex;
+
+        // For every i, add the vector (i,i,i,i) under the label `n_labels - (i % n_labels)`.
+        for (size_t i = 0; i < n; i++) {
+            auto cur = decider(i, n) ? hnsw : flat;
+            GenerateAndAddVector<TEST_DATA_T>(cur, d, n_labels - (i % n_labels), i);
+        }
+        ASSERT_EQ(VecSimIndex_IndexSize(tiered_index), n) << decider_name;
+
+        // Query for (n,n,n,n) vector (recall that n-1 is the largest id in the index).
+        TEST_DATA_T query[d];
+        GenerateVector<TEST_DATA_T>(query, d, n);
+
+        VecSimBatchIterator *batchIterator = VecSimBatchIterator_New(tiered_index, query, nullptr);
+
+        size_t iteration_num = 0;
+        size_t n_res = 1, expected_n_res = 1;
+        while (VecSimBatchIterator_HasNext(batchIterator)) {
+            iteration_num++;
+            // Expect to get results in ascending order of labels - which is the order of the
+            // distance from the query vector. Get one result in every iteration.
+            auto verify_res = [&](size_t id, double score, size_t index) {
+                ASSERT_EQ(id, iteration_num) << decider_name;
+            };
+            runBatchIteratorSearchTest(batchIterator, n_res, verify_res, BY_SCORE, expected_n_res);
+        }
+
+        ASSERT_EQ(iteration_num, n_labels) << decider_name;
+        VecSimBatchIterator_Free(batchIterator);
+
+        // Free the index.
+        delete index_ctx;
+    }
+}
+
+TYPED_TEST(HNSWTieredIndexTest, BatchIteratorAdvanced) {
+    const size_t d = 4; // const, so that `query[d]` below is a regular array and not a VLA
+    size_t M = 8; // NOTE(review): not referenced anywhere in this test
+    size_t ef = 1000;
+    size_t n = 1000;
+
+    size_t per_label = TypeParam::isMulti() ? 10 : 1;
+    size_t n_labels = n / per_label;
+
+    // Parameters for the TieredHNSW index instances (created per decider, with a mock queue).
+    HNSWParams hnsw_params = {
+        .type = TypeParam::get_index_type(),
+        .dim = d,
+        .metric = VecSimMetric_L2,
+        .multi = TypeParam::isMulti(),
+        .initialCapacity = n,
+        .efConstruction = ef,
+    };
+    VecSimParams params = CreateParams(hnsw_params);
+    HNSWRuntimeParams hnswRuntimeParams = {.efRuntime = ef};
+    VecSimQueryParams query_params = CreateQueryParams(hnswRuntimeParams);
+
+    // for (auto &[decider_name, decider] : lambdas) { // TODO: not supported by clang < 16
+    for (auto &lambda : lambdas) {
+        // manually deconstruct the pair to avoid the clang error
+        auto &decider_name = lambda.first;
+        auto &decider = lambda.second;
+
+        auto jobQ = JobQueue();
+        auto index_ctx = new IndexExtCtx();
+        auto *tiered_index = this->CreateTieredHNSWIndex(params, &jobQ, index_ctx);
+        auto allocator = tiered_index->getAllocator();
+
+        auto *hnsw = tiered_index->backendIndex;
+        auto *flat = tiered_index->frontendIndex;
+
+        TEST_DATA_T query[d];
+        GenerateVector<TEST_DATA_T>(query, d, n);
+
+        VecSimBatchIterator *batchIterator =
+            VecSimBatchIterator_New(tiered_index, query, &query_params);
+
+        // Try to get results even though there are no vectors in the index.
+        VecSimQueryResult_List res = VecSimBatchIterator_Next(batchIterator, 10, BY_SCORE);
+        ASSERT_EQ(VecSimQueryResult_Len(res), 0) << decider_name;
+        VecSimQueryResult_Free(res);
+        ASSERT_FALSE(VecSimBatchIterator_HasNext(batchIterator)) << decider_name;
+
+        // Insert one label and query again. The internal id will be 0.
+        for (size_t j = 0; j < per_label; j++) {
+            GenerateAndAddVector<TEST_DATA_T>(decider(n_labels, n) ? hnsw : flat, d, n_labels,
+                                              n - j);
+        }
+        VecSimBatchIterator_Reset(batchIterator);
+        res = VecSimBatchIterator_Next(batchIterator, 10, BY_SCORE);
+        ASSERT_EQ(VecSimQueryResult_Len(res), 1) << decider_name;
+        VecSimQueryResult_Free(res);
+        ASSERT_FALSE(VecSimBatchIterator_HasNext(batchIterator)) << decider_name;
+        VecSimBatchIterator_Free(batchIterator);
+
+        // Insert the rest of the vectors to the index and re-create the batch iterator.
+        for (size_t i = 1; i < n_labels; i++) {
+            auto cur = decider(i, n) ? hnsw : flat;
+            for (size_t j = 1; j <= per_label; j++) {
+                GenerateAndAddVector<TEST_DATA_T>(cur, d, i, (i - 1) * per_label + j);
+            }
+        }
+        ASSERT_EQ(VecSimIndex_IndexSize(tiered_index), n) << decider_name;
+        batchIterator = VecSimBatchIterator_New(tiered_index, query, &query_params);
+
+        // Try to get 0 results.
+        res = VecSimBatchIterator_Next(batchIterator, 0, BY_SCORE);
+        ASSERT_EQ(VecSimQueryResult_Len(res), 0) << decider_name;
+        VecSimQueryResult_Free(res);
+
+        // n_res does not divide into ef or vice versa - expect leftovers between the graph scans.
+        size_t n_res = 7;
+        size_t iteration_num = 0;
+
+        while (VecSimBatchIterator_HasNext(batchIterator)) {
+            iteration_num++;
+            std::vector<size_t> expected_ids;
+            // We ask to get the results sorted by ID in a specific batch (in ascending order), but
+            // in every iteration the ids should be lower than the previous one, according to the
+            // distance from the query.
+            for (size_t i = 1; i <= n_res; i++) {
+                expected_ids.push_back(n_labels - iteration_num * n_res + i);
+            }
+            auto verify_res = [&](size_t id, double score, size_t index) {
+                ASSERT_EQ(expected_ids[index], id) << decider_name;
+            };
+            if (iteration_num <= n_labels / n_res) {
+                runBatchIteratorSearchTest(batchIterator, n_res, verify_res, BY_ID);
+            } else {
+                // In the last iteration there are `n_labels % n_res` results left to return.
+                size_t n_left = n_labels % n_res;
+                // Remove the first `n_res - n_left` ids from the expected ids.
+                while (expected_ids.size() > n_left) {
+                    expected_ids.erase(expected_ids.begin());
+                }
+                runBatchIteratorSearchTest(batchIterator, n_res, verify_res, BY_ID, n_left);
+            }
+        }
+        ASSERT_EQ(iteration_num, n_labels / n_res + 1) << decider_name;
+        // Try to get more results even though there are none left.
+        res = VecSimBatchIterator_Next(batchIterator, 1, BY_SCORE);
+        ASSERT_EQ(VecSimQueryResult_Len(res), 0) << decider_name;
+        VecSimQueryResult_Free(res);
+
+        VecSimBatchIterator_Free(batchIterator);
+
+        // Free the index.
+        delete index_ctx;
+    }
+}
+
+TYPED_TEST(HNSWTieredIndexTest, BatchIteratorWithOverlaps) {
+    const size_t d = 4; // const, so that `query[d]` below is a regular array and not a VLA
+    size_t M = 8; // NOTE(review): not referenced anywhere in this test
+    size_t ef = 20;
+    size_t n = 1000;
+
+    size_t per_label = TypeParam::isMulti() ? 10 : 1;
+    size_t n_labels = n / per_label;
+
+    // Parameters for the TieredHNSW index instances (created per decider, with a mock queue).
+    HNSWParams hnsw_params = {
+        .type = TypeParam::get_index_type(),
+        .dim = d,
+        .metric = VecSimMetric_L2,
+        .multi = TypeParam::isMulti(),
+        .initialCapacity = n,
+        .efConstruction = ef,
+        .efRuntime = ef,
+    };
+    VecSimParams params = CreateParams(hnsw_params);
+
+    // for (auto &[decider_name, decider] : lambdas) { // TODO: not supported by clang < 16
+    for (auto &lambda : lambdas) {
+        // manually deconstruct the pair to avoid the clang error
+        auto &decider_name = lambda.first;
+        auto &decider = lambda.second;
+
+        auto jobQ = JobQueue();
+        auto index_ctx = new IndexExtCtx();
+        auto *tiered_index = this->CreateTieredHNSWIndex(params, &jobQ, index_ctx);
+        auto allocator = tiered_index->getAllocator();
+
+        auto *hnsw = tiered_index->backendIndex;
+        auto *flat = tiered_index->frontendIndex;
+
+        // For every i, add the vector (i,i,i,i) under the label i % n_labels.
+        size_t flat_count = 0;
+        for (size_t i = 0; i < n; i++) {
+            auto cur = decider(i, n) ? hnsw : flat;
+            GenerateAndAddVector<TEST_DATA_T>(cur, d, i % n_labels, i);
+            if (cur == flat) {
+                flat_count++;
+                // Add 10% of the vectors in FLAT to HNSW as well.
+                if (flat_count % 10 == 0) {
+                    GenerateAndAddVector<TEST_DATA_T>(hnsw, d, i % n_labels, i);
+                }
+            }
+        }
+        // The index size should be 100-110% of n.
+        ASSERT_LE(VecSimIndex_IndexSize(tiered_index), n * 1.1) << decider_name;
+        ASSERT_GE(VecSimIndex_IndexSize(tiered_index), n) << decider_name;
+        // The number of unique labels should be n_labels.
+        ASSERT_EQ(tiered_index->indexLabelCount(), n_labels) << decider_name;
+
+        // Query for (n,n,n,n) vector (recall that n-1 is the largest id in the index).
+        TEST_DATA_T query[d];
+        GenerateVector<TEST_DATA_T>(query, d, n);
+
+        VecSimBatchIterator *batchIterator = VecSimBatchIterator_New(tiered_index, query, nullptr);
+        size_t iteration_num = 0;
+
+        // Get the 5 vectors whose ids are the maximal among those that haven't been returned yet
+        // in every iteration. The results order should be sorted by their score (distance from
+        // the query vector), which means sorted from the largest id to the lowest.
+        size_t n_res = 5;
+        size_t n_expected = n_res;
+        size_t excessive_iterations = 0;
+        while (VecSimBatchIterator_HasNext(batchIterator)) {
+            if (iteration_num * n_res == n_labels) {
+                // In some cases, the batch iterator may report that it has more results to return,
+                // but it's actually not true and the next call to `VecSimBatchIterator_Next` will
+                // return 0 results. This is safe because we don't guarantee how many results the
+                // batch iterator will return, and a similar scenario can happen when checking
+                // `VecSimBatchIterator_HasNext` on an empty index for the first time (before the
+                // first call to `VecSimBatchIterator_Next`). We check that this scenario doesn't
+                // happen more than once.
+                ASSERT_EQ(excessive_iterations, 0) << decider_name;
+                excessive_iterations = 1;
+                n_expected = 0;
+            }
+            std::vector<size_t> expected_ids(n_expected);
+            for (size_t i = 0; i < n_expected; i++) {
+                expected_ids[i] = (n - iteration_num * n_expected - i - 1) % n_labels;
+            }
+            auto verify_res = [&](size_t id, double score, size_t index) {
+                ASSERT_EQ(expected_ids[index], id) << decider_name;
+            };
+            runBatchIteratorSearchTest(batchIterator, n_res, verify_res, BY_SCORE, n_expected);
+            iteration_num++;
+        }
+        ASSERT_EQ(iteration_num - excessive_iterations, n_labels / n_res)
+            << decider_name << "\nHad excessive iterations: " << (excessive_iterations != 0);
+        VecSimBatchIterator_Free(batchIterator);
+
+        // Free the index.
+        delete index_ctx;
+    }
+}
+
+TYPED_TEST(HNSWTieredIndexTestBasic, BatchIteratorWithOverlaps_SpacialMultiCases) {
+    const size_t d = 4; // const, so that `query[d]` below is a regular array and not a VLA
+
+    std::shared_ptr<VecSimAllocator> allocator;
+    TieredHNSWIndex<TEST_DATA_T, TEST_DIST_T> *tiered_index;
+    VecSimIndex *hnsw, *flat;
+    TEST_DATA_T query[d];
+    VecSimBatchIterator *iterator;
+    VecSimQueryResult_List batch;
+
+    // Parameters for the (multi-value) TieredHNSW index instances, created with a mock queue.
+    HNSWParams hnsw_params = {
+        .type = TypeParam::get_index_type(),
+        .dim = d,
+        .metric = VecSimMetric_L2,
+        .multi = true,
+    };
+    VecSimParams params = CreateParams(hnsw_params);
+    auto jobQ = JobQueue();
+    auto index_ctx = new IndexExtCtx();
+
+    auto L2 = [&](size_t element) { return element * element * d; };
+
+    // TEST 1:
+    // first batch contains duplicates with different scores.
+    tiered_index = this->CreateTieredHNSWIndex(params, &jobQ, index_ctx);
+    allocator = tiered_index->getAllocator();
+    hnsw = tiered_index->backendIndex;
+    flat = tiered_index->frontendIndex;
+
+    GenerateAndAddVector<TEST_DATA_T>(flat, d, 0, 0);
+    GenerateAndAddVector<TEST_DATA_T>(flat, d, 1, 1);
+    GenerateAndAddVector<TEST_DATA_T>(flat, d, 2, 2);
+
+    GenerateAndAddVector<TEST_DATA_T>(hnsw, d, 1, 3);
+    GenerateAndAddVector<TEST_DATA_T>(hnsw, d, 0, 4);
+    GenerateAndAddVector<TEST_DATA_T>(hnsw, d, 3, 5);
+
+    ASSERT_EQ(tiered_index->indexLabelCount(), 4);
+
+    GenerateVector<TEST_DATA_T>(query, d, 0);
+    iterator = VecSimBatchIterator_New(tiered_index, query, nullptr);
+
+    // batch size is 3 (the size of each index). Internally the tiered batch iterator will have to
+    // handle the duplicates with different scores.
+    ASSERT_TRUE(VecSimBatchIterator_HasNext(iterator));
+    batch = VecSimBatchIterator_Next(iterator, 3, BY_SCORE);
+    ASSERT_EQ(VecSimQueryResult_Len(batch), 3);
+    ASSERT_EQ(VecSimQueryResult_GetId(batch.results + 0), 0);
+    ASSERT_EQ(VecSimQueryResult_GetScore(batch.results + 0), L2(0));
+    ASSERT_EQ(VecSimQueryResult_GetId(batch.results + 1), 1);
+    ASSERT_EQ(VecSimQueryResult_GetScore(batch.results + 1), L2(1));
+    ASSERT_EQ(VecSimQueryResult_GetId(batch.results + 2), 2);
+    ASSERT_EQ(VecSimQueryResult_GetScore(batch.results + 2), L2(2));
+    VecSimQueryResult_Free(batch);
+
+    // we have 1 more label in the index. we expect the tiered batch iterator to return it only and
+    // filter out the duplicates.
+    ASSERT_TRUE(VecSimBatchIterator_HasNext(iterator));
+    batch = VecSimBatchIterator_Next(iterator, 2, BY_SCORE);
+    ASSERT_EQ(VecSimQueryResult_Len(batch), 1);
+    ASSERT_EQ(VecSimQueryResult_GetId(batch.results + 0), 3);
+    ASSERT_EQ(VecSimQueryResult_GetScore(batch.results + 0), L2(5));
+    ASSERT_FALSE(VecSimBatchIterator_HasNext(iterator));
+    VecSimQueryResult_Free(batch);
+    // TEST 1 clean up.
+    VecSimBatchIterator_Free(iterator);
+    delete index_ctx;
+
+    // TEST 2:
+    // second batch contains duplicates (different scores) from the first batch.
+    index_ctx = new IndexExtCtx();
+    tiered_index = this->CreateTieredHNSWIndex(params, &jobQ, index_ctx);
+    allocator = tiered_index->getAllocator();
+    hnsw = tiered_index->backendIndex;
+    flat = tiered_index->frontendIndex;
+
+    GenerateAndAddVector<TEST_DATA_T>(hnsw, d, 0, 0);
+    GenerateAndAddVector<TEST_DATA_T>(hnsw, d, 1, 1);
+    GenerateAndAddVector<TEST_DATA_T>(hnsw, d, 2, 2);
+    GenerateAndAddVector<TEST_DATA_T>(hnsw, d, 3, 3);
+
+    GenerateAndAddVector<TEST_DATA_T>(flat, d, 2, 0);
+    GenerateAndAddVector<TEST_DATA_T>(flat, d, 3, 1);
+    GenerateAndAddVector<TEST_DATA_T>(flat, d, 0, 2);
+    GenerateAndAddVector<TEST_DATA_T>(flat, d, 1, 3);
+
+    ASSERT_EQ(tiered_index->indexLabelCount(), 4);
+
+    iterator = VecSimBatchIterator_New(tiered_index, query, nullptr);
+
+    // ask for 2 results. The internal batch iterators will return 2 results: hnsw - [0, 1], flat -
+    // [2, 3] so there are no duplicates.
+    ASSERT_TRUE(VecSimBatchIterator_HasNext(iterator));
+    batch = VecSimBatchIterator_Next(iterator, 2, BY_SCORE);
+    ASSERT_EQ(VecSimQueryResult_Len(batch), 2);
+    ASSERT_EQ(VecSimQueryResult_GetId(batch.results + 0), 0);
+    ASSERT_EQ(VecSimQueryResult_GetScore(batch.results + 0), L2(0));
+    ASSERT_EQ(VecSimQueryResult_GetId(batch.results + 1), 2);
+    ASSERT_EQ(VecSimQueryResult_GetScore(batch.results + 1), L2(0));
+    VecSimQueryResult_Free(batch);
+
+    // first batch contained 1 result from each index, so there is one leftover from each iterator.
+    // Asking for 3 results will return additional 2 results from each iterator and the tiered batch
+    // iterator will have to handle the duplicates that each iterator returned (both labels that
+    // were returned in the first batch and duplicates in the current batch).
+    ASSERT_TRUE(VecSimBatchIterator_HasNext(iterator));
+    batch = VecSimBatchIterator_Next(iterator, 3, BY_SCORE);
+    ASSERT_EQ(VecSimQueryResult_Len(batch), 2);
+    ASSERT_EQ(VecSimQueryResult_GetId(batch.results + 0), 1);
+    ASSERT_EQ(VecSimQueryResult_GetScore(batch.results + 0), L2(1));
+    ASSERT_EQ(VecSimQueryResult_GetId(batch.results + 1), 3);
+    ASSERT_EQ(VecSimQueryResult_GetScore(batch.results + 1), L2(1));
+    ASSERT_FALSE(VecSimBatchIterator_HasNext(iterator));
+    VecSimQueryResult_Free(batch);
+    // TEST 2 clean up.
+    VecSimBatchIterator_Free(iterator);
+
+    delete index_ctx;
+}
+
+TYPED_TEST(HNSWTieredIndexTest, parallelBatchIteratorSearch) {
+    size_t dim = 4;
+    size_t ef = 500;
+    size_t n = 1000;
+    size_t n_res_min = 3;  // minimum number of results to return per batch
+    size_t n_res_max = 15; // exclusive upper bound on the number of results per batch
+    bool isMulti = TypeParam::isMulti();
+
+    size_t per_label = isMulti ? 5 : 1;
+    size_t n_labels = n / per_label;
+
+    // Create TieredHNSW index instance with a mock queue; jobs run once the threads start below.
+    HNSWParams params = {
+        .type = TypeParam::get_index_type(),
+        .dim = dim,
+        .metric = VecSimMetric_L2,
+        .multi = isMulti,
+        .efRuntime = ef,
+    };
+    VecSimParams hnsw_params = CreateParams(params);
+    auto jobQ = JobQueue();
+    auto index_ctx = new IndexExtCtx();
+
+    auto *tiered_index = this->CreateTieredHNSWIndex(hnsw_params, &jobQ, index_ctx);
+    auto allocator = tiered_index->getAllocator();
+
+    std::atomic_int successful_searches(0); // bumped by every search job that runs to its end
+    auto parallel_10_batches = [](AsyncJob *job) {
+        auto *search_job = reinterpret_cast<SearchJobMock *>(job);
+        const size_t res_per_batch = search_job->k;
+        const size_t dim = search_job->dim;
+        const auto query = search_job->query;
+
+        size_t iteration = 0;
+        auto verify_res = [&](size_t id, double score, size_t res_index) {
+            TEST_DATA_T element = *(TEST_DATA_T *)query;
+            res_index += iteration * res_per_batch; // rank of this result across all batches so far
+            ASSERT_EQ(std::abs(id - element), (res_index + 1) / 2);
+            ASSERT_EQ(score, dim * (id - element) * (id - element));
+        };
+
+        // Run up to 10 batches of search (fewer if the iterator runs out of results).
+        auto tiered_iterator = VecSimBatchIterator_New(search_job->index, query, nullptr);
+        do {
+            runBatchIteratorSearchTest(tiered_iterator, res_per_batch, verify_res);
+        } while (++iteration < 10 && VecSimBatchIterator_HasNext(tiered_iterator));
+
+        VecSimBatchIterator_Free(tiered_iterator);
+        search_job->successful_searches++;
+        delete job;
+    };
+
+    // Fill the job queue with insert and batch-search jobs, while filling the flat index, before
+    // initializing the thread pool.
+    for (size_t i = 0; i < n; i++) {
+        // Insert a vector to the flat index and add a job to insert it to the main index.
+        GenerateAndAddVector<TEST_DATA_T>(tiered_index, dim, i % n_labels, i);
+
+        // Add a search job that asks for 10 batches of `cur_res_per_batch` results each.
+        size_t cur_res_per_batch = i % (n_res_max - n_res_min) + n_res_min;
+        size_t n_res = cur_res_per_batch * 10;
+        auto query = (TEST_DATA_T *)allocator->allocate(dim * sizeof(TEST_DATA_T));
+        // make sure there are `n_res / 2` vectors in the index in each "side" of the query vector.
+        GenerateVector<TEST_DATA_T>(query, dim, (i % (n_labels - n_res)) + (n_res / 2));
+        auto search_job =
+            new (allocator) SearchJobMock(allocator, parallel_10_batches, tiered_index, query,
+                                          cur_res_per_batch, n, dim, successful_searches);
+        tiered_index->submitSingleJob(search_job);
+    }
+
+    EXPECT_EQ(tiered_index->indexSize(), n);
+    EXPECT_EQ(tiered_index->indexLabelCount(), n_labels);
+    EXPECT_EQ(tiered_index->labelToInsertJobs.size(), n_labels);
+    for (auto &it : tiered_index->labelToInsertJobs) {
+        EXPECT_EQ(it.second.size(), per_label);
+    }
+    EXPECT_EQ(tiered_index->frontendIndex->indexSize(), n);
+    EXPECT_EQ(tiered_index->backendIndex->indexSize(), 0);
+
+    // Launch the BG threads loop that takes jobs from the queue and executes them.
+    // All the vectors are already in the tiered index, so we expect to find the expected
+    // results from the get-go.
+    bool run_thread = true;
+    for (size_t i = 0; i < THREAD_POOL_SIZE; i++) {
+        thread_pool.emplace_back(thread_main_loop, std::ref(jobQ), std::ref(run_thread));
+    }
+
+    thread_pool_join(jobQ, run_thread);
+
+    EXPECT_EQ(tiered_index->backendIndex->indexSize(), n);
+    EXPECT_EQ(tiered_index->backendIndex->indexLabelCount(), n_labels);
+    EXPECT_EQ(tiered_index->frontendIndex->indexSize(), 0);
+    EXPECT_EQ(tiered_index->labelToInsertJobs.size(), 0);
+    EXPECT_EQ(successful_searches, n);
+    EXPECT_EQ(jobQ.size(), 0);
+
+    // Cleanup.
+    delete index_ctx;
+}
+
+TYPED_TEST(HNSWTieredIndexTestBasic, overwriteVectorBasic) {
+    // Single-value tiered index over a mock job queue, with a pending-swap-jobs threshold of 1.
+    size_t dim = 4;
+    size_t n = 1000;
+    HNSWParams hnsw_cfg = {
+        .type = TypeParam::get_index_type(), .dim = dim, .metric = VecSimMetric_L2, .multi = false};
+    VecSimParams index_params = CreateParams(hnsw_cfg);
+    auto job_queue = JobQueue();
+    auto ext_ctx = new IndexExtCtx();
+
+    auto *tiered = this->CreateTieredHNSWIndex(index_params, &job_queue, ext_ctx, 1);
+    auto allocator = tiered->getAllocator();
+
+    TEST_DATA_T fill = 1.0;
+    GenerateAndAddVector<TEST_DATA_T>(tiered, dim, 0, fill);
+    // Replace label 0 while its vector still resides in the flat buffer.
+    fill = 2.0;
+    TEST_DATA_T second_vec[] = {fill, fill, fill, fill};
+    ASSERT_EQ(tiered->addVector(second_vec, 0), 0);
+    ASSERT_EQ(tiered->indexLabelCount(), 1);
+    ASSERT_EQ(tiered->indexSize(), 1);
+    ASSERT_EQ(tiered->frontendIndex->indexSize(), 1);
+    ASSERT_EQ(tiered->getDistanceFrom(0, second_vec), 0);
+
+    // Expect two queued jobs: the invalidated original insert, then the pending insert.
+    ASSERT_EQ(tiered->labelToInsertJobs.at(0).size(), 1);
+    auto *pending_insert_job = tiered->labelToInsertJobs.at(0)[0];
+    ASSERT_EQ(job_queue.size(), 2);
+    auto *stale_job = job_queue.front().job;
+    ASSERT_EQ(stale_job->jobType, HNSW_INSERT_VECTOR_JOB);
+    ASSERT_FALSE(stale_job->isValid);
+    ASSERT_EQ(reinterpret_cast<HNSWInsertJob *>(stale_job)->label, 0);
+    ASSERT_EQ(reinterpret_cast<HNSWInsertJob *>(stale_job)->id, 0);
+    thread_iteration(job_queue);
+
+    auto *head_job = job_queue.front().job;
+    ASSERT_EQ(head_job->jobType, HNSW_INSERT_VECTOR_JOB);
+    ASSERT_EQ(reinterpret_cast<HNSWInsertJob *>(head_job)->label, 0);
+    ASSERT_EQ(reinterpret_cast<HNSWInsertJob *>(head_job)->id, 0);
+    ASSERT_EQ(reinterpret_cast<HNSWInsertJob *>(head_job), pending_insert_job);
+
+    // Move the vector into HNSW, then overwrite it a second time.
+    thread_iteration(job_queue);
+    ASSERT_EQ(tiered->backendIndex->indexSize(), 1);
+    ASSERT_EQ(tiered->frontendIndex->indexSize(), 0);
+    fill = 3.0;
+    TEST_DATA_T third_vec[] = {fill, fill, fill, fill};
+    ASSERT_EQ(tiered->addVector(third_vec, 0), 0);
+    ASSERT_EQ(tiered->indexLabelCount(), 1);
+    // Threshold is 1 and swap jobs run before insert jobs, so the old HNSW entry is gone already.
+    ASSERT_EQ(tiered->backendIndex->indexSize(), 0);
+    ASSERT_EQ(tiered->frontendIndex->indexSize(), 1);
+    ASSERT_EQ(tiered->getDistanceFrom(0, third_vec), 0);
+
+    // Run the pending insert job so the latest vector lands in HNSW.
+    thread_iteration(job_queue);
+    ASSERT_EQ(tiered->backendIndex->indexSize(), 1);
+    ASSERT_EQ(tiered->frontendIndex->indexSize(), 0);
+    ASSERT_EQ(tiered->indexLabelCount(), 1);
+    ASSERT_EQ(tiered->getDistanceFrom(0, third_vec), 0);
+
+    delete ext_ctx;
+}
+
+TYPED_TEST(HNSWTieredIndexTestBasic, overwriteVectorAsync) {
+    // Overwrite random labels while background threads ingest vectors, then validate the graph.
+    const size_t dim = 4; // const: makes `vector[dim]` below a standard array rather than a VLA
+    size_t n = 1000;
+    HNSWParams params = {
+        .type = TypeParam::get_index_type(), .dim = dim, .metric = VecSimMetric_L2, .multi = false};
+    VecSimParams hnsw_params = CreateParams(params);
+    for (size_t maxSwapJobs : {n + 1, size_t{1}}) { // swap-jobs threshold: above n, then 1
+        auto jobQ = JobQueue();
+        auto index_ctx = new IndexExtCtx();
+
+        auto *tiered_index =
+            this->CreateTieredHNSWIndex(hnsw_params, &jobQ, index_ctx, maxSwapJobs);
+        auto allocator = tiered_index->getAllocator();
+
+        // Launch the BG threads loop that takes jobs from the queue and executes them.
+        bool run_thread = true;
+        for (size_t i = 0; i < THREAD_POOL_SIZE; i++) {
+            thread_pool.emplace_back(thread_main_loop, std::ref(jobQ), std::ref(run_thread));
+        }
+
+        // Insert n vectors, then overwrite random labels, while the threads run in the background.
+        std::srand(10); // seed the pseudo-random generator with an arbitrary fixed value.
+        for (size_t i = 0; i < n; i++) {
+            TEST_DATA_T vector[dim];
+            for (size_t j = 0; j < dim; j++) {
+                vector[j] = std::rand() / (TEST_DATA_T)RAND_MAX;
+            }
+            tiered_index->addVector(vector, i);
+        }
+        EXPECT_EQ(tiered_index->indexLabelCount(), n);
+
+        size_t num_overwrites = 1000;
+        for (size_t i = 0; i < num_overwrites; i++) {
+            size_t label_to_overwrite = std::rand() % n;
+            TEST_DATA_T vector[dim];
+            for (size_t j = 0; j < dim; j++) {
+                vector[j] = std::rand() / (TEST_DATA_T)RAND_MAX;
+            }
+            EXPECT_EQ(tiered_index->addVector(vector, label_to_overwrite), 0);
+        }
+
+        thread_pool_join(jobQ, run_thread);
+
+        EXPECT_EQ(tiered_index->indexSize() - tiered_index->getHNSWIndex()->getNumMarkedDeleted(),
+                  n);
+        EXPECT_EQ(tiered_index->frontendIndex->indexSize(), 0);
+        EXPECT_EQ(tiered_index->indexLabelCount(), n);
+        auto report = tiered_index->getHNSWIndex()->checkIntegrity();
+        EXPECT_EQ(report.connections_to_repair, 0);
+        EXPECT_EQ(report.valid_state, true);
+
+        delete index_ctx;
+    }
+}
+
+TYPED_TEST(HNSWTieredIndexTest, testInfo) {
+    // Check info() of a tiered index (swap threshold 1, buffer limit 1000) as vectors come and go.
+    size_t dim = 4;
+    size_t n = 1000;
+    HNSWParams params = {.type = TypeParam::get_index_type(),
+                         .dim = dim,
+                         .metric = VecSimMetric_L2,
+                         .multi = TypeParam::isMulti()};
+    VecSimParams hnsw_params = CreateParams(params);
+    auto jobQ = JobQueue();
+    auto index_ctx = new IndexExtCtx();
+    auto *tiered_index = this->CreateTieredHNSWIndex(hnsw_params, &jobQ, index_ctx, 1, 1000);
+    auto allocator = tiered_index->getAllocator();
+
+    VecSimIndexInfo info = tiered_index->info();
+    EXPECT_EQ(info.commonInfo.basicInfo.algo, VecSimAlgo_HNSWLIB);
+    EXPECT_EQ(info.commonInfo.indexSize, 0);
+    EXPECT_EQ(info.commonInfo.indexLabelCount, 0);
+    EXPECT_EQ(info.commonInfo.memory, tiered_index->getAllocationSize());
+    EXPECT_EQ(info.commonInfo.basicInfo.isMulti, TypeParam::isMulti());
+    EXPECT_EQ(info.commonInfo.basicInfo.dim, dim);
+    EXPECT_EQ(info.commonInfo.basicInfo.metric, VecSimMetric_L2);
+    EXPECT_EQ(info.commonInfo.basicInfo.type, TypeParam::get_index_type());
+    EXPECT_EQ(info.commonInfo.basicInfo.blockSize, DEFAULT_BLOCK_SIZE);
+    VecSimIndexInfo frontendIndexInfo = tiered_index->frontendIndex->info();
+    VecSimIndexInfo backendIndexInfo = tiered_index->backendIndex->info();
+
+    compareCommonInfo(info.tieredInfo.frontendCommonInfo, frontendIndexInfo.commonInfo);
+    compareFlatInfo(info.tieredInfo.bfInfo, frontendIndexInfo.bfInfo);
+    compareCommonInfo(info.tieredInfo.backendCommonInfo, backendIndexInfo.commonInfo);
+    compareHNSWInfo(info.tieredInfo.backendInfo.hnswInfo, backendIndexInfo.hnswInfo);
+
+    EXPECT_EQ(info.commonInfo.memory, info.tieredInfo.management_layer_memory +
+                                          backendIndexInfo.commonInfo.memory +
+                                          frontendIndexInfo.commonInfo.memory);
+    EXPECT_EQ(info.tieredInfo.backgroundIndexing, false);
+    EXPECT_EQ(info.tieredInfo.bufferLimit, 1000);
+    EXPECT_EQ(info.tieredInfo.specificTieredBackendInfo.hnswTieredInfo.pendingSwapJobsThreshold, 1);
+
+    // Validate that the basic (static) info agrees with the full info on every field checked here.
+    VecSimIndexBasicInfo s_info = VecSimIndex_BasicInfo(tiered_index);
+    ASSERT_EQ(info.commonInfo.basicInfo.algo, s_info.algo);
+    ASSERT_EQ(info.commonInfo.basicInfo.dim, s_info.dim);
+    ASSERT_EQ(info.commonInfo.basicInfo.blockSize, s_info.blockSize);
+    ASSERT_EQ(info.commonInfo.basicInfo.type, s_info.type);
+    ASSERT_EQ(info.commonInfo.basicInfo.isMulti, s_info.isMulti);
+    ASSERT_EQ(info.commonInfo.basicInfo.metric, s_info.metric);
+    ASSERT_EQ(info.commonInfo.basicInfo.isTiered, s_info.isTiered);
+
+    GenerateAndAddVector(tiered_index, dim, 1, 1); // expected to sit in the flat buffer for now
+    info = tiered_index->info();
+
+    EXPECT_EQ(info.commonInfo.indexSize, 1);
+    EXPECT_EQ(info.commonInfo.indexLabelCount, 1);
+    EXPECT_EQ(info.tieredInfo.backendCommonInfo.indexSize, 0);
+    EXPECT_EQ(info.tieredInfo.backendCommonInfo.indexLabelCount, 0);
+    EXPECT_EQ(info.tieredInfo.frontendCommonInfo.indexSize, 1);
+    EXPECT_EQ(info.tieredInfo.frontendCommonInfo.indexLabelCount, 1);
+    EXPECT_EQ(info.commonInfo.memory, info.tieredInfo.management_layer_memory +
+                                          info.tieredInfo.backendCommonInfo.memory +
+                                          info.tieredInfo.frontendCommonInfo.memory);
+    EXPECT_EQ(info.tieredInfo.backgroundIndexing, true);
+
+    thread_iteration(jobQ); // run the queued job; the vector is expected to move into HNSW
+    info = tiered_index->info();
+
+    EXPECT_EQ(info.commonInfo.indexSize, 1);
+    EXPECT_EQ(info.commonInfo.indexLabelCount, 1);
+    EXPECT_EQ(info.tieredInfo.backendCommonInfo.indexSize, 1);
+    EXPECT_EQ(info.tieredInfo.backendCommonInfo.indexLabelCount, 1);
+    EXPECT_EQ(info.tieredInfo.frontendCommonInfo.indexSize, 0);
+    EXPECT_EQ(info.tieredInfo.frontendCommonInfo.indexLabelCount, 0);
+    EXPECT_EQ(info.commonInfo.memory, info.tieredInfo.management_layer_memory +
+                                          info.tieredInfo.backendCommonInfo.memory +
+                                          info.tieredInfo.frontendCommonInfo.memory);
+    EXPECT_EQ(info.tieredInfo.backgroundIndexing, false);
+
+    if (TypeParam::isMulti()) {
+        GenerateAndAddVector(tiered_index, dim, 1, 1); // second vector under the same label
+        info = tiered_index->info();
+
+        EXPECT_EQ(info.commonInfo.indexSize, 2);
+        EXPECT_EQ(info.commonInfo.indexLabelCount, 1);
+        EXPECT_EQ(info.tieredInfo.backendCommonInfo.indexSize, 1);
+        EXPECT_EQ(info.tieredInfo.backendCommonInfo.indexLabelCount, 1);
+        EXPECT_EQ(info.tieredInfo.frontendCommonInfo.indexSize, 1);
+        EXPECT_EQ(info.tieredInfo.frontendCommonInfo.indexLabelCount, 1);
+        EXPECT_EQ(info.commonInfo.memory, info.tieredInfo.management_layer_memory +
+                                              info.tieredInfo.backendCommonInfo.memory +
+                                              info.tieredInfo.frontendCommonInfo.memory);
+        EXPECT_EQ(info.tieredInfo.backgroundIndexing, true);
+    }
+
+    VecSimIndex_DeleteVector(tiered_index, 1); // every vector of label 1 is expected to disappear
+    info = tiered_index->info();
+
+    EXPECT_EQ(info.commonInfo.indexSize, 0);
+    EXPECT_EQ(info.commonInfo.indexLabelCount, 0);
+    EXPECT_EQ(info.tieredInfo.backendCommonInfo.indexSize, 0);
+    EXPECT_EQ(info.tieredInfo.backendCommonInfo.indexLabelCount, 0);
+    EXPECT_EQ(info.tieredInfo.frontendCommonInfo.indexSize, 0);
+    EXPECT_EQ(info.tieredInfo.frontendCommonInfo.indexLabelCount, 0);
+    EXPECT_EQ(info.commonInfo.memory, info.tieredInfo.management_layer_memory +
+                                          info.tieredInfo.backendCommonInfo.memory +
+                                          info.tieredInfo.frontendCommonInfo.memory);
+    EXPECT_EQ(info.tieredInfo.backgroundIndexing, false);
+
+    delete index_ctx;
+}
+
+TYPED_TEST(HNSWTieredIndexTest, testInfoIterator) {
+    // Walk the info iterator of a tiered index and match every field against info().
+    size_t dim = 4;
+    size_t n = 1000;
+    HNSWParams hnsw_cfg = {.type = TypeParam::get_index_type(),
+                           .dim = dim,
+                           .metric = VecSimMetric_L2,
+                           .multi = TypeParam::isMulti()};
+
+    VecSimParams index_params = CreateParams(hnsw_cfg);
+    auto job_queue = JobQueue();
+    auto ext_ctx = new IndexExtCtx();
+
+    auto *tiered = this->CreateTieredHNSWIndex(index_params, &job_queue, ext_ctx, 1);
+    auto allocator = tiered->getAllocator();
+
+    GenerateAndAddVector(tiered, dim, 1, 1);
+    VecSimIndexInfo info = tiered->info();
+    VecSimIndexInfo flat_info = tiered->frontendIndex->info();
+    VecSimIndexInfo hnsw_info = tiered->backendIndex->info();
+
+    VecSimInfoIterator *info_it = tiered->infoIterator();
+    EXPECT_EQ(info_it->numberOfFields(), 15);
+
+    while (info_it->hasNext()) {
+        VecSim_InfoField *field = VecSimInfoIterator_NextField(info_it);
+        auto is_field = [field](const char *expected_name) {
+            return strcmp(field->fieldName, expected_name) == 0;
+        };
+
+        if (is_field(VecSimCommonStrings::ALGORITHM_STRING)) {
+            // Algorithm: reported as tiered.
+            ASSERT_EQ(field->fieldType, INFOFIELD_STRING);
+            ASSERT_STREQ(field->fieldValue.stringValue, VecSimCommonStrings::TIERED_STRING);
+        } else if (is_field(VecSimCommonStrings::TYPE_STRING)) {
+            // Vector data type.
+            ASSERT_EQ(field->fieldType, INFOFIELD_STRING);
+            ASSERT_STREQ(field->fieldValue.stringValue,
+                         VecSimType_ToString(info.commonInfo.basicInfo.type));
+        } else if (is_field(VecSimCommonStrings::DIMENSION_STRING)) {
+            // Vector dimension.
+            ASSERT_EQ(field->fieldType, INFOFIELD_UINT64);
+            ASSERT_EQ(field->fieldValue.uintegerValue, info.commonInfo.basicInfo.dim);
+        } else if (is_field(VecSimCommonStrings::METRIC_STRING)) {
+            // Distance metric.
+            ASSERT_EQ(field->fieldType, INFOFIELD_STRING);
+            ASSERT_STREQ(field->fieldValue.stringValue,
+                         VecSimMetric_ToString(info.commonInfo.basicInfo.metric));
+        } else if (is_field(VecSimCommonStrings::SEARCH_MODE_STRING)) {
+            // Last search mode.
+            ASSERT_EQ(field->fieldType, INFOFIELD_STRING);
+            ASSERT_STREQ(field->fieldValue.stringValue,
+                         VecSimSearchMode_ToString(info.commonInfo.last_mode));
+        } else if (is_field(VecSimCommonStrings::INDEX_SIZE_STRING)) {
+            // Number of vectors.
+            ASSERT_EQ(field->fieldType, INFOFIELD_UINT64);
+            ASSERT_EQ(field->fieldValue.uintegerValue, info.commonInfo.indexSize);
+        } else if (is_field(VecSimCommonStrings::INDEX_LABEL_COUNT_STRING)) {
+            // Number of labels.
+            ASSERT_EQ(field->fieldType, INFOFIELD_UINT64);
+            ASSERT_EQ(field->fieldValue.uintegerValue, info.commonInfo.indexLabelCount);
+        } else if (is_field(VecSimCommonStrings::IS_MULTI_STRING)) {
+            // Multi-value flag.
+            ASSERT_EQ(field->fieldType, INFOFIELD_UINT64);
+            ASSERT_EQ(field->fieldValue.uintegerValue, info.commonInfo.basicInfo.isMulti);
+        } else if (is_field(VecSimCommonStrings::MEMORY_STRING)) {
+            // Total memory.
+            ASSERT_EQ(field->fieldType, INFOFIELD_UINT64);
+            ASSERT_EQ(field->fieldValue.uintegerValue, info.commonInfo.memory);
+        } else if (is_field(VecSimCommonStrings::TIERED_MANAGEMENT_MEMORY_STRING)) {
+            ASSERT_EQ(field->fieldType, INFOFIELD_UINT64);
+            ASSERT_EQ(field->fieldValue.uintegerValue, info.tieredInfo.management_layer_memory);
+        } else if (is_field(VecSimCommonStrings::TIERED_BACKGROUND_INDEXING_STRING)) {
+            ASSERT_EQ(field->fieldType, INFOFIELD_UINT64);
+            ASSERT_EQ(field->fieldValue.uintegerValue, info.tieredInfo.backgroundIndexing);
+        } else if (is_field(VecSimCommonStrings::FRONTEND_INDEX_STRING)) {
+            ASSERT_EQ(field->fieldType, INFOFIELD_ITERATOR);
+            compareFlatIndexInfoToIterator(flat_info, field->fieldValue.iteratorValue);
+        } else if (is_field(VecSimCommonStrings::BACKEND_INDEX_STRING)) {
+            ASSERT_EQ(field->fieldType, INFOFIELD_ITERATOR);
+            compareHNSWIndexInfoToIterator(hnsw_info, field->fieldValue.iteratorValue);
+        } else if (is_field(VecSimCommonStrings::TIERED_BUFFER_LIMIT_STRING)) {
+            ASSERT_EQ(field->fieldType, INFOFIELD_UINT64);
+            ASSERT_EQ(field->fieldValue.uintegerValue, info.tieredInfo.bufferLimit);
+        } else if (is_field(VecSimCommonStrings::TIERED_HNSW_SWAP_JOBS_THRESHOLD_STRING)) {
+            ASSERT_EQ(field->fieldType, INFOFIELD_UINT64);
+            ASSERT_EQ(
+                field->fieldValue.uintegerValue,
+                info.tieredInfo.specificTieredBackendInfo.hnswTieredInfo.pendingSwapJobsThreshold);
+        } else {
+            FAIL();
+        }
+    }
+    VecSimInfoIterator_Free(info_it);
+    delete ext_ctx;
+}
+
+TYPED_TEST(HNSWTieredIndexTest, writeInPlaceMode) {
+    // In VecSim_WriteInPlace mode, add / overwrite / delete are expected to act on HNSW directly.
+    size_t dim = 4;
+
+    HNSWParams params = {.type = TypeParam::get_index_type(),
+                         .dim = dim,
+                         .metric = VecSimMetric_L2,
+                         .multi = TypeParam::isMulti()};
+    VecSimParams hnsw_params = CreateParams(params);
+    auto jobQ = JobQueue();
+    auto index_ctx = new IndexExtCtx();
+
+    auto *tiered_index = this->CreateTieredHNSWIndex(hnsw_params, &jobQ, index_ctx);
+    auto allocator = tiered_index->getAllocator();
+
+    VecSim_SetWriteMode(VecSim_WriteInPlace);
+    // Validate that the vector was inserted directly to the HNSW index.
+    labelType vec_label = 0;
+    GenerateAndAddVector<TEST_DATA_T>(tiered_index, dim, vec_label, 0);
+    ASSERT_EQ(tiered_index->backendIndex->indexSize(), 1);
+    ASSERT_EQ(tiered_index->frontendIndex->indexSize(), 0);
+    ASSERT_EQ(tiered_index->labelToInsertJobs.size(), 0);
+
+    // Overwrite in place - only in single-value mode.
+    if (!TypeParam::isMulti()) {
+        TEST_DATA_T overwritten_vec[] = {1, 1, 1, 1};
+        tiered_index->addVector(overwritten_vec, vec_label);
+        ASSERT_EQ(tiered_index->backendIndex->indexSize(), 1);
+        ASSERT_EQ(tiered_index->frontendIndex->indexSize(), 0);
+        ASSERT_EQ(tiered_index->labelToInsertJobs.size(), 0);
+        ASSERT_EQ(tiered_index->getDistanceFrom(vec_label, overwritten_vec), 0);
+    }
+
+    // Validate that the vector is removed in place (nothing is left marked as deleted).
+    tiered_index->deleteVector(vec_label);
+    VecSim_SetWriteMode(VecSim_WriteAsync); // restore: the mode is not per-index (assumed global)
+    ASSERT_EQ(tiered_index->backendIndex->indexSize(), 0);
+    ASSERT_EQ(tiered_index->getHNSWIndex()->getNumMarkedDeleted(), 0);
+    delete index_ctx;
+}
+
+TYPED_TEST(HNSWTieredIndexTest, switchWriteModes) {
+    // Alternate between async and in-place writes while background threads drain the mock queue.
+    const size_t dim = 4; // const, so the `vector[dim]` buffers below are not VLAs
+    size_t n = 500;
+    HNSWParams params = {.type = TypeParam::get_index_type(),
+                         .dim = dim,
+                         .metric = VecSimMetric_L2,
+                         .multi = TypeParam::isMulti(),
+                         .M = 32,
+                         .efRuntime = 2 * n};
+    VecSimParams hnsw_params = CreateParams(params);
+    auto jobQ = JobQueue();
+    auto index_ctx = new IndexExtCtx();
+
+    auto *tiered_index = this->CreateTieredHNSWIndex(hnsw_params, &jobQ, index_ctx);
+    auto allocator = tiered_index->getAllocator();
+    VecSim_SetWriteMode(VecSim_WriteAsync);
+
+    // Launch the BG threads loop that takes jobs from the queue and executes them.
+    bool run_thread = true;
+    for (size_t i = 0; i < THREAD_POOL_SIZE; i++) {
+        thread_pool.emplace_back(thread_main_loop, std::ref(jobQ), std::ref(run_thread));
+    }
+
+    // Create and insert vectors one by one async.
+    size_t per_label = TypeParam::isMulti() ? 5 : 1;
+    size_t n_labels = n / per_label;
+    std::srand(10); // seed the pseudo-random generator with an arbitrary fixed value.
+    for (size_t i = 0; i < n; i++) {
+        TEST_DATA_T vector[dim];
+        for (size_t j = 0; j < dim; j++) {
+            vector[j] = std::rand() / (TEST_DATA_T)RAND_MAX;
+        }
+        VecSimIndex_AddVector(tiered_index, vector, i % n_labels);
+    }
+
+    // Insert another n more vectors INPLACE, while the previous vectors are still being indexed.
+    VecSim_SetWriteMode(VecSim_WriteInPlace);
+    EXPECT_LE(tiered_index->backendIndex->indexSize(), n);
+    for (size_t i = 0; i < n; i++) {
+        TEST_DATA_T vector[dim];
+        for (size_t j = 0; j < dim; j++) {
+            vector[j] = std::rand() / (TEST_DATA_T)RAND_MAX;
+        }
+        VecSimIndex_AddVector(tiered_index, vector, i % n_labels + n_labels);
+    }
+    thread_pool_join(jobQ, run_thread);
+    EXPECT_EQ(tiered_index->backendIndex->indexSize(), 2 * n);
+
+    // Now delete the last n inserted vectors of the index using async jobs.
+    VecSim_SetWriteMode(VecSim_WriteAsync);
+    run_thread = true;
+    for (size_t i = 0; i < THREAD_POOL_SIZE; i++) {
+        thread_pool.emplace_back(thread_main_loop, std::ref(jobQ), std::ref(run_thread));
+    }
+    for (size_t i = 0; i < n_labels; i++) {
+        VecSimIndex_DeleteVector(tiered_index, n_labels + i);
+    }
+    // At this point, repair jobs should be executed in the background.
+    EXPECT_EQ(tiered_index->getHNSWIndex()->getNumMarkedDeleted(), n);
+
+    // Insert INPLACE another n vector (instead of the ones that were deleted).
+    VecSim_SetWriteMode(VecSim_WriteInPlace);
+    // Run twice, at first run we insert non-existing labels, in the second run we overwrite them
+    // (for single-value index only).
+    for (auto overwrite : {0, 1}) {
+        for (size_t i = 0; i < n; i++) {
+            TEST_DATA_T vector[dim];
+            for (size_t j = 0; j < dim; j++) {
+                vector[j] = std::rand() / (TEST_DATA_T)RAND_MAX;
+            }
+            EXPECT_EQ(tiered_index->addVector(vector, i % n_labels + n_labels),
+                      TypeParam::isMulti() ? 1 : 1 - overwrite);
+            // Run a query: the first result must be the vector just inserted (score 0), and on the
+            // first pass every returned label must be <= n_labels + i.
+            auto ver_res = [&](size_t label, double score, size_t index) {
+                if (index == 0) {
+                    EXPECT_EQ(label, i % n_labels + n_labels);
+                    EXPECT_DOUBLE_EQ(score, 0);
+                }
+                if (!overwrite) {
+                    ASSERT_LE(label, i + n_labels);
+                }
+            };
+            runTopKSearchTest(tiered_index, vector, 10, ver_res);
+        }
+    }
+
+    thread_pool_join(jobQ, run_thread);
+    VecSim_SetWriteMode(VecSim_WriteAsync); // don't leave in-place mode set for later tests
+    ASSERT_EQ(tiered_index->frontendIndex->indexSize(), 0);
+    ASSERT_EQ(tiered_index->backendIndex->indexLabelCount(), 2 * n_labels);
+    delete index_ctx;
+}
+
+TYPED_TEST(HNSWTieredIndexTest, bufferLimit) {
+    // Exercise the flat buffer limit of a tiered index that is backed by a mock queue.
+    size_t dim = 4;
+    HNSWParams hnsw_cfg = {.type = TypeParam::get_index_type(),
+                           .dim = dim,
+                           .metric = VecSimMetric_L2,
+                           .multi = TypeParam::isMulti()};
+    VecSimParams index_params = CreateParams(hnsw_cfg);
+    auto job_queue = JobQueue();
+    auto ext_ctx = new IndexExtCtx();
+
+    // Start with a buffer limit of 0, so the first vector is expected to bypass the flat buffer.
+    auto *tiered = this->CreateTieredHNSWIndex(index_params, &job_queue, ext_ctx,
+                                               DEFAULT_PENDING_SWAP_JOBS_THRESHOLD, 0);
+    auto allocator = tiered->getAllocator();
+
+    GenerateAndAddVector<TEST_DATA_T>(tiered, dim, 0);
+    ASSERT_EQ(tiered->backendIndex->indexSize(), 1);
+    ASSERT_EQ(tiered->frontendIndex->indexSize(), 0);
+    ASSERT_EQ(tiered->labelToInsertJobs.size(), 0);
+
+    // Raise the limit to 1: the next vector should be kept in the flat buffer.
+    tiered->flatBufferLimit = 1;
+    labelType buffered_label = 1;
+    GenerateAndAddVector<TEST_DATA_T>(tiered, dim, buffered_label, 0); // vector is [0,0,0,0]
+    ASSERT_EQ(tiered->backendIndex->indexSize(), 1);
+    ASSERT_EQ(tiered->frontendIndex->indexSize(), 1);
+    ASSERT_EQ(tiered->labelToInsertJobs.size(), 1);
+
+    // Single-value only: an overwrite swaps the buffered vector for the new one in the buffer.
+    if (!TypeParam::isMulti()) {
+        TEST_DATA_T replacement[] = {1, 1, 1, 1};
+        ASSERT_EQ(tiered->addVector(replacement, buffered_label), 0);
+        ASSERT_EQ(tiered->backendIndex->indexSize(), 1);
+        ASSERT_EQ(tiered->frontendIndex->indexSize(), 1);
+        ASSERT_EQ(tiered->labelToInsertJobs.size(), 1);
+        ASSERT_EQ(tiered->getDistanceFrom(buffered_label, replacement), 0);
+        // The queue head must be the insert job that the overwrite invalidated.
+        auto *stale_job = job_queue.front().job;
+        ASSERT_FALSE(stale_job->isValid);
+        ASSERT_EQ(reinterpret_cast<HNSWInsertJob *>(stale_job)->id, 0);
+        job_queue.pop();
+    }
+
+    // The flat buffer is at its limit now, so one more vector is expected to be inserted directly
+    // into the HNSW index.
+    labelType direct_label = 2;
+    GenerateAndAddVector<TEST_DATA_T>(tiered, dim, direct_label, 0); // vector is [0,0,0,0]
+    ASSERT_EQ(tiered->backendIndex->indexSize(), 2);
+    ASSERT_EQ(tiered->frontendIndex->indexSize(), 1);
+    ASSERT_EQ(tiered->labelToInsertJobs.size(), 1);
+    ASSERT_EQ(tiered->indexLabelCount(), 3);
+
+    // Single-value only: overwriting a label stored in HNSW marks the old vector as deleted and
+    // inserts the new one directly into HNSW too.
+    if (!TypeParam::isMulti()) {
+        TEST_DATA_T replacement[] = {1, 1, 1, 1};
+        ASSERT_EQ(tiered->addVector(replacement, direct_label), 0);
+        ASSERT_EQ(tiered->backendIndex->indexSize(), 3);
+        ASSERT_EQ(tiered->getHNSWIndex()->getNumMarkedDeleted(), 1);
+        ASSERT_EQ(tiered->frontendIndex->indexSize(), 1);
+        ASSERT_EQ(tiered->labelToInsertJobs.size(), 1);
+        ASSERT_EQ(tiered->indexLabelCount(), 3);
+        ASSERT_EQ(tiered->getDistanceFrom(direct_label, replacement), 0);
+    }
+    delete ext_ctx;
+}
+
+TYPED_TEST(HNSWTieredIndexTest, bufferLimitAsync) {
+    // Create TieredHNSW index instance with a mock queue.
+    size_t dim = 4;
+    size_t n = 500;
+    HNSWParams params = {.type = TypeParam::get_index_type(),
+                         .dim = dim,
+                         .metric = VecSimMetric_L2,
+                         .multi = TypeParam::isMulti(),
+                         .M = 64};
+
+    VecSimParams hnsw_params = CreateParams(params);
+    auto jobQ = JobQueue();
+    auto index_ctx = new IndexExtCtx();
+
+    // Create tiered index with buffer limit set to 100.
+    size_t flat_buffer_limit = 100;
+    auto *tiered_index = this->CreateTieredHNSWIndex(
+        hnsw_params, &jobQ, index_ctx, DEFAULT_PENDING_SWAP_JOBS_THRESHOLD, flat_buffer_limit);
+    auto allocator = tiered_index->getAllocator();
+    // Launch the BG threads loop that takes jobs from the queue and executes them.
+    bool run_thread = true;
+    for (size_t i = 0; i < THREAD_POOL_SIZE; i++) {
+        thread_pool.emplace_back(thread_main_loop, std::ref(jobQ), std::ref(run_thread));
+    }
+
+    // Create and insert vectors one by one async. At some point, buffer limit gets full and vectors
+    // are inserted directly to HNSW.
+    size_t per_label = TypeParam::isMulti() ? 5 : 1;
+    size_t n_labels = n / per_label;
+    std::srand(10); // Seed the pseudo-random generator with an arbitrary fixed value.
+    // Run twice, at first run we insert non-existing labels, in the second run we overwrite them
+    // (for single-value index only).
+    for (auto overwrite : {0, 1}) {
+        for (size_t i = 0; i < n; i++) {
+            TEST_DATA_T vector[dim];
+            for (size_t j = 0; j < dim; j++) {
+                vector[j] = std::rand() / (TEST_DATA_T)RAND_MAX;
+            }
+            EXPECT_EQ(tiered_index->addVector(vector, i % n_labels),
+                      TypeParam::isMulti() ? 1 : 1 - overwrite);
+            EXPECT_LE(tiered_index->frontendIndex->indexSize(), flat_buffer_limit);
+        }
+    }
+
+    thread_pool_join(jobQ, run_thread);
+    EXPECT_EQ(tiered_index->backendIndex->indexSize(), 2 * n);
+    EXPECT_EQ(tiered_index->indexLabelCount(), n_labels);
+
+    delete index_ctx;
+}
+
+TYPED_TEST(HNSWTieredIndexTest, RangeSearch) {
+    size_t dim = 4;
+    size_t k = 11;
+    size_t per_label = TypeParam::isMulti() ? 5 : 1;
+
+    size_t n_labels = k * 3;
+    size_t n = n_labels * per_label;
+
+    auto edge_delta = (k - 0.8) * per_label;
+    auto mid_delta = edge_delta / 2;
+    // `range` for querying the "edges" of the index and get k results.
+    double range = dim * edge_delta * edge_delta; // L2 distance.
+    // `half_range` for querying a point in the "middle" of the index and get k results around it.
+    double half_range = dim * mid_delta * mid_delta; // L2 distance.
+
+    // Create TieredHNSW index instance with a mock queue.
+    HNSWParams params = {
+        .type = TypeParam::get_index_type(),
+        .dim = dim,
+        .metric = VecSimMetric_L2,
+        .multi = TypeParam::isMulti(),
+        .epsilon = 3.0 * per_label,
+    };
+    VecSimParams hnsw_params = CreateParams(params);
+    auto jobQ = JobQueue();
+    auto index_ctx = new IndexExtCtx();
+    size_t cur_memory_usage;
+
+    auto *tiered_index = this->CreateTieredHNSWIndex(hnsw_params, &jobQ, index_ctx);
+    auto allocator = tiered_index->getAllocator();
+    ASSERT_EQ(index_ctx->index_strong_ref.use_count(), 1);
+
+    auto hnsw_index = tiered_index->backendIndex;
+    auto flat_index = tiered_index->frontendIndex;
+
+    TEST_DATA_T query_0[dim];
+    GenerateVector<TEST_DATA_T>(query_0, dim, 0);
+    TEST_DATA_T query_1mid[dim];
+    GenerateVector<TEST_DATA_T>(query_1mid, dim, n / 3);
+    TEST_DATA_T query_2mid[dim];
+    GenerateVector<TEST_DATA_T>(query_2mid, dim, n * 2 / 3);
+    TEST_DATA_T query_n[dim];
+    GenerateVector<TEST_DATA_T>(query_n, dim, n - 1);
+
+    // Search for vectors when the index is empty.
+    runRangeQueryTest(tiered_index, query_0, range, nullptr, 0);
+
+    // Define the verification functions.
+    auto ver_res_0 = [&](size_t id, double score, size_t index) {
+        EXPECT_EQ(id, index);
+        // The expected score is the distance to the first vector of `id` label.
+        auto element = id * per_label;
+        EXPECT_DOUBLE_EQ(score, dim * element * element);
+    };
+
+    auto ver_res_1mid_by_id = [&](size_t id, double score, size_t index) {
+        size_t q_id = query_1mid[0] / per_label;
+        size_t mod = query_1mid[0] - q_id * per_label;
+        // In single value mode, `per_label` is always 1 and `mod` is always 0, so the following
+        // branching is simply `expected_score = abs(id - q_id)`.
+        // In multi value mode, for ids higher than the query id, the score is the distance to the
+        // first vector of `id` label, and for ids lower than the query id, the score is the
+        // distance to the last vector of `id` label. `mod` is the distance to the first vector of
+        // `q_id` label.
+        double expected_score = 0;
+        if (id > q_id) {
+            expected_score = (id - q_id) * per_label - mod;
+        } else if (id < q_id) {
+            expected_score = (q_id - id) * per_label - (per_label - mod - 1);
+        }
+        expected_score = expected_score * expected_score * dim;
+        EXPECT_DOUBLE_EQ(score, expected_score);
+    };
+
+    auto ver_res_2mid_by_id = [&](size_t id, double score, size_t index) {
+        size_t q_id = query_2mid[0] / per_label;
+        size_t mod = query_2mid[0] - q_id * per_label;
+        // In single value mode, `per_label` is always 1 and `mod` is always 0, so the following
+        // branching is simply `expected_score = abs(id - q_id)`.
+        // In multi value mode, for ids higher than the query id, the score is the distance to the
+        // first vector of `id` label, and for ids lower than the query id, the score is the
+        // distance to the last vector of `id` label. `mod` is the distance to the first vector of
+        // `q_id` label.
+        double expected_score = 0;
+        if (id > q_id) {
+            expected_score = (id - q_id) * per_label - mod;
+        } else if (id < q_id) {
+            expected_score = (q_id - id) * per_label - (per_label - mod - 1);
+        }
+        expected_score = expected_score * expected_score * dim;
+        EXPECT_DOUBLE_EQ(score, expected_score);
+    };
+
+    auto ver_res_1mid_by_score = [&](size_t id, double score, size_t index) {
+        size_t q_id = query_1mid[0] / per_label;
+        EXPECT_EQ(std::abs(int(id - q_id)), (index + 1) / 2);
+        ver_res_1mid_by_id(id, score, index);
+    };
+
+    auto ver_res_2mid_by_score = [&](size_t id, double score, size_t index) {
+        size_t q_id = query_2mid[0] / per_label;
+        EXPECT_EQ(std::abs(int(id - q_id)), (index + 1) / 2);
+        ver_res_2mid_by_id(id, score, index);
+    };
+
+    auto ver_res_n = [&](size_t id, double score, size_t index) {
+        EXPECT_EQ(id, n_labels - 1 - index);
+        auto element = index * per_label;
+        EXPECT_DOUBLE_EQ(score, dim * element * element);
+    };
+
+    // Insert n/2 vectors to the main index.
+    for (size_t i = 0; i < (n + 1) / 2; i++) {
+        GenerateAndAddVector<TEST_DATA_T>(hnsw_index, dim, i / per_label, i);
+    }
+    ASSERT_EQ(tiered_index->indexSize(), (n + 1) / 2);
+    ASSERT_EQ(tiered_index->indexSize(), hnsw_index->indexSize());
+
+    // Search for `range` with the flat index empty.
+    cur_memory_usage = allocator->getAllocationSize();
+    runRangeQueryTest(tiered_index, query_0, range, ver_res_0, k, BY_ID);
+    runRangeQueryTest(tiered_index, query_0, range, ver_res_0, k, BY_SCORE);
+    runRangeQueryTest(tiered_index, query_1mid, half_range, ver_res_1mid_by_score, k, BY_SCORE);
+    runRangeQueryTest(tiered_index, query_1mid, half_range, ver_res_1mid_by_id, k, BY_ID);
+    ASSERT_EQ(allocator->getAllocationSize(), cur_memory_usage);
+
+    // Insert n/2 vectors to the flat index.
+    for (size_t i = (n + 1) / 2; i < n; i++) {
+        GenerateAndAddVector<TEST_DATA_T>(flat_index, dim, i / per_label, i);
+    }
+    ASSERT_EQ(tiered_index->indexSize(), n);
+    ASSERT_EQ(tiered_index->indexSize(), hnsw_index->indexSize() + flat_index->indexSize());
+
+    cur_memory_usage = allocator->getAllocationSize();
+    // Search for `range` so all the vectors will be from the HNSW index.
+    runRangeQueryTest(tiered_index, query_0, range, ver_res_0, k, BY_ID);
+    runRangeQueryTest(tiered_index, query_0, range, ver_res_0, k, BY_SCORE);
+    // Search for `range` so all the vectors will be from the flat index.
+    runRangeQueryTest(tiered_index, query_n, range, ver_res_n, k, BY_SCORE);
+    // Search for `range` so some of the results will be from the main and some from the flat index.
+    runRangeQueryTest(tiered_index, query_1mid, half_range, ver_res_1mid_by_score, k, BY_SCORE);
+    runRangeQueryTest(tiered_index, query_2mid, half_range, ver_res_2mid_by_score, k, BY_SCORE);
+    runRangeQueryTest(tiered_index, query_1mid, half_range, ver_res_1mid_by_id, k, BY_ID);
+    runRangeQueryTest(tiered_index, query_2mid, half_range, ver_res_2mid_by_id, k, BY_ID);
+    // Memory usage should not change.
+    ASSERT_EQ(allocator->getAllocationSize(), cur_memory_usage);
+
+    // Add some overlapping vectors to the main and flat index.
+    // adding directly to the underlying indexes to avoid jobs logic.
+    // The main index will have vectors 0 - 2n/3 and the flat index will have vectors n/3 - n
+    for (size_t i = n / 3; i < n / 2; i++) {
+        GenerateAndAddVector<TEST_DATA_T>(flat_index, dim, i / per_label, i);
+    }
+    for (size_t i = n / 2; i < n * 2 / 3; i++) {
+        GenerateAndAddVector<TEST_DATA_T>(hnsw_index, dim, i / per_label, i);
+    }
+
+    cur_memory_usage = allocator->getAllocationSize();
+    // Search for `range` so all the vectors will be from the main index.
+    runRangeQueryTest(tiered_index, query_0, range, ver_res_0, k, BY_ID);
+    runRangeQueryTest(tiered_index, query_0, range, ver_res_0, k, BY_SCORE);
+    // Search for `range` so all the vectors will be from the flat index.
+    runRangeQueryTest(tiered_index, query_n, range, ver_res_n, k, BY_SCORE);
+    // Search for `range` so some of the results will be from the main and some from the flat index.
+    runRangeQueryTest(tiered_index, query_1mid, half_range, ver_res_1mid_by_score, k, BY_SCORE);
+    runRangeQueryTest(tiered_index, query_2mid, half_range, ver_res_2mid_by_score, k, BY_SCORE);
+    runRangeQueryTest(tiered_index, query_1mid, half_range, ver_res_1mid_by_id, k, BY_ID);
+    runRangeQueryTest(tiered_index, query_2mid, half_range, ver_res_2mid_by_id, k, BY_ID);
+    // Memory usage should not change.
+    ASSERT_EQ(allocator->getAllocationSize(), cur_memory_usage);
+
+    // // // // // // // // // // // //
+    // Check behavior upon timeout.  //
+    // // // // // // // // // // // //
+
+    VecSimQueryResult_List res;
+    // Add a vector to the HNSW index so there will be a reason to query it.
+    GenerateAndAddVector<TEST_DATA_T>(hnsw_index, dim, n, n);
+
+    // Set timeout callback to always return 1 (will fail while querying the flat buffer).
+    VecSim_SetTimeoutCallbackFunction([](void *ctx) { return 1; }); // Always times out
+
+    res = VecSimIndex_RangeQuery(tiered_index, query_0, range, nullptr, BY_ID);
+    ASSERT_EQ(res.code, VecSim_QueryResult_TimedOut);
+    VecSimQueryResult_Free(res);
+
+    // Set timeout callback to return 1 after `checks_in_flat` checks (fails while querying HNSW).
+    // Brute-force index checks for timeout after each vector.
+    size_t checks_in_flat = flat_index->indexSize();
+    VecSimQueryParams qparams = {.timeoutCtx = &checks_in_flat};
+    VecSim_SetTimeoutCallbackFunction([](void *ctx) {
+        auto count = static_cast<size_t *>(ctx);
+        if (*count == 0) {
+            return 1;
+        }
+        (*count)--;
+        return 0;
+    });
+    res = VecSimIndex_RangeQuery(tiered_index, query_0, range, &qparams, BY_SCORE);
+    ASSERT_EQ(res.code, VecSim_QueryResult_TimedOut);
+    VecSimQueryResult_Free(res);
+    // Make sure we didn't get the timeout in the flat index.
+    checks_in_flat = flat_index->indexSize(); // Reset the counter.
+    res = VecSimIndex_RangeQuery(flat_index, query_0, range, &qparams, BY_SCORE);
+    ASSERT_EQ(res.code, VecSim_QueryResult_OK);
+    VecSimQueryResult_Free(res);
+
+    // Check again with BY_ID.
+    checks_in_flat = flat_index->indexSize(); // Reset the counter.
+    res = VecSimIndex_RangeQuery(tiered_index, query_0, range, &qparams, BY_ID);
+    ASSERT_EQ(res.code, VecSim_QueryResult_TimedOut);
+    VecSimQueryResult_Free(res);
+    // Make sure we didn't get the timeout in the flat index.
+    checks_in_flat = flat_index->indexSize(); // Reset the counter.
+    res = VecSimIndex_RangeQuery(flat_index, query_0, range, &qparams, BY_ID);
+    ASSERT_EQ(res.code, VecSim_QueryResult_OK);
+    VecSimQueryResult_Free(res);
+
+    // Clean up.
+    VecSim_SetTimeoutCallbackFunction([](void *ctx) { return 0; });
+    delete index_ctx;
+}
+
+TYPED_TEST(HNSWTieredIndexTest, parallelRangeSearch) {
+    size_t dim = 4;
+    size_t k = 11;
+    size_t n = 1000;
+    bool isMulti = TypeParam::isMulti();
+
+    size_t per_label = isMulti ? 10 : 1;
+    size_t n_labels = n / per_label;
+
+    // Create TieredHNSW index instance with a mock queue.
+    HNSWParams params = {
+        .type = TypeParam::get_index_type(),
+        .dim = dim,
+        .metric = VecSimMetric_L2,
+        .multi = isMulti,
+        .epsilon = double(dim * k * k),
+    };
+    VecSimParams hnsw_params = CreateParams(params);
+    auto jobQ = JobQueue();
+    auto index_ctx = new IndexExtCtx();
+
+    auto *tiered_index = this->CreateTieredHNSWIndex(hnsw_params, &jobQ, index_ctx);
+    auto allocator = tiered_index->getAllocator();
+    EXPECT_EQ(index_ctx->index_strong_ref.use_count(), 1);
+
+    std::atomic_int successful_searches(0);
+    auto parallel_range_search = [](AsyncJob *job) {
+        auto *search_job = reinterpret_cast<SearchJobMock *>(job);
+        size_t k = search_job->k;
+        size_t dim = search_job->dim;
+        // The range that will get us k results.
+        double range = dim * ((k - 0.5) / 2) * ((k - 0.5) / 2); // L2 distance.
+        auto query = search_job->query;
+
+        auto verify_res = [&](size_t id, double score, size_t res_index) {
+            TEST_DATA_T element = *(TEST_DATA_T *)query;
+            ASSERT_EQ(std::abs(id - element), (res_index + 1) / 2);
+            ASSERT_EQ(score, dim * (id - element) * (id - element));
+        };
+        runRangeQueryTest(job->index, query, range, verify_res, k, BY_SCORE);
+        search_job->successful_searches++;
+        delete job;
+    };
+
+    // Fill the job queue with insert and search jobs, while filling the flat index, before
+    // initializing the thread pool.
+    for (size_t i = 0; i < n; i++) {
+        // Insert a vector to the flat index and add a job to insert it to the main index.
+        GenerateAndAddVector<TEST_DATA_T>(tiered_index, dim, i % n_labels, i);
+
+        // Add a search job. Make sure the query element is between k and n_labels - k.
+        auto query = (TEST_DATA_T *)allocator->allocate(dim * sizeof(TEST_DATA_T));
+        GenerateVector<TEST_DATA_T>(query, dim, ((n - i) % (n_labels - (2 * k))) + k);
+        auto search_job = new (allocator) SearchJobMock(
+            allocator, parallel_range_search, tiered_index, query, k, n, dim, successful_searches);
+        tiered_index->submitSingleJob(search_job);
+    }
+
+    EXPECT_EQ(tiered_index->indexSize(), n);
+    EXPECT_EQ(tiered_index->indexLabelCount(), n_labels);
+    EXPECT_EQ(tiered_index->labelToInsertJobs.size(), n_labels);
+    for (auto &it : tiered_index->labelToInsertJobs) {
+        EXPECT_EQ(it.second.size(), per_label);
+    }
+    EXPECT_EQ(tiered_index->frontendIndex->indexSize(), n);
+    EXPECT_EQ(tiered_index->backendIndex->indexSize(), 0);
+
+    // Launch the BG threads loop that takes jobs from the queue and executes them.
+    // All the vectors are already in the tiered index, so we expect to find the expected
+    // results from the get-go.
+    bool run_thread = true;
+    for (size_t i = 0; i < THREAD_POOL_SIZE; i++) {
+        thread_pool.emplace_back(thread_main_loop, std::ref(jobQ), std::ref(run_thread));
+    }
+
+    thread_pool_join(jobQ, run_thread);
+
+    EXPECT_EQ(tiered_index->backendIndex->indexSize(), n);
+    EXPECT_EQ(tiered_index->backendIndex->indexLabelCount(), n_labels);
+    EXPECT_EQ(tiered_index->frontendIndex->indexSize(), 0);
+    EXPECT_EQ(tiered_index->labelToInsertJobs.size(), 0);
+    EXPECT_EQ(successful_searches, n);
+    EXPECT_EQ(jobQ.size(), 0);
+
+    // Cleanup.
+    delete index_ctx;
+}
+
+TYPED_TEST(HNSWTieredIndexTestBasic, preferAdHocOptimization) {
+    size_t dim = 4;
+
+    HNSWParams params = {
+        .type = TypeParam::get_index_type(),
+        .dim = dim,
+        .metric = VecSimMetric_L2,
+    };
+    VecSimParams hnsw_params = CreateParams(params);
+    auto jobQ = JobQueue();
+    auto index_ctx = new IndexExtCtx();
+
+    // Create tiered index with the default settings (no explicit buffer limit is passed).
+    auto *tiered_index = this->CreateTieredHNSWIndex(hnsw_params, &jobQ, index_ctx);
+    auto allocator = tiered_index->getAllocator();
+
+    auto hnsw = tiered_index->backendIndex;
+    auto flat = tiered_index->frontendIndex;
+
+    // Insert 5 vectors to the main index.
+    for (size_t i = 0; i < 5; i++) {
+        GenerateAndAddVector<TEST_DATA_T>(hnsw, dim, i, i);
+    }
+    // Sanity check. Should make the same choice as HNSW.
+    ASSERT_EQ(tiered_index->preferAdHocSearch(5, 5, true), hnsw->preferAdHocSearch(5, 5, true));
+
+    // Insert 6 vectors to the flat index.
+    for (size_t i = 0; i < 6; i++) {
+        GenerateAndAddVector<TEST_DATA_T>(flat, dim, i, i);
+    }
+    // Sanity check. Should make the same choice as flat, as it has more vectors.
+    ASSERT_EQ(tiered_index->preferAdHocSearch(5, 5, true), flat->preferAdHocSearch(5, 5, true));
+
+    // Check for preference of tiered with subset (10) smaller than the tiered index size (11),
+    // but larger than any of the underlying indexes.
+    ASSERT_NO_THROW(tiered_index->preferAdHocSearch(10, 5, false));
+
+    // Cleanup.
+    delete index_ctx;
 }
diff --git a/tests/unit/test_utils.cpp b/tests/unit/test_utils.cpp
index cf2909ac6..441cb96df 100644
--- a/tests/unit/test_utils.cpp
+++ b/tests/unit/test_utils.cpp
@@ -44,11 +44,26 @@ VecSimQueryParams CreateQueryParams(const HNSWRuntimeParams &RuntimeParams) {
     return QueryParams;
 }
 
-void runTopKSearchTest(VecSimIndex *index, const void *query, size_t k,
+static bool is_async_index(VecSimIndex *index) {
+    return dynamic_cast<VecSimTieredIndex<float, float> *>(index) != nullptr ||
+           dynamic_cast<VecSimTieredIndex<double, double> *>(index) != nullptr;
+}
+
+void runTopKSearchTest(VecSimIndex *index, const void *query, size_t k, size_t expected_num_res,
                        std::function<void(size_t, double, size_t)> ResCB, VecSimQueryParams *params,
                        VecSimQueryResult_Order order) {
     VecSimQueryResult_List res = VecSimIndex_TopKQuery(index, query, k, params, order);
-    ASSERT_EQ(VecSimQueryResult_Len(res), k);
+    if (is_async_index(index)) {
+        // Async index may return more or less than the expected number of results,
+        // depending on the number of results that were available at the time of the query.
+        // We can estimate the number of results that should be returned to be roughly
+        // `expected_num_res` +- number of threads in the pool of the job queue.
+
+        // For now, let's only check that the number of results is not greater than k.
+        ASSERT_LE(VecSimQueryResult_Len(res), k);
+    } else {
+        ASSERT_EQ(VecSimQueryResult_Len(res), expected_num_res);
+    }
     ASSERT_TRUE(allUniqueResults(res));
     VecSimQueryResult_Iterator *iterator = VecSimQueryResult_List_GetIterator(res);
     int res_ind = 0;
@@ -58,11 +73,17 @@ void runTopKSearchTest(VecSimIndex *index, const void *query, size_t k,
         double score = VecSimQueryResult_GetScore(item);
         ResCB(id, score, res_ind++);
     }
-    ASSERT_EQ(res_ind, k);
     VecSimQueryResult_IteratorFree(iterator);
     VecSimQueryResult_Free(res);
 }
 
+void runTopKSearchTest(VecSimIndex *index, const void *query, size_t k,
+                       std::function<void(size_t, double, size_t)> ResCB, VecSimQueryParams *params,
+                       VecSimQueryResult_Order order) {
+    size_t expected_num_res = std::min(VecSimIndex_IndexSize(index), k);
+    runTopKSearchTest(index, query, k, expected_num_res, ResCB, params, order);
+}
+
 /*
  * helper function to run batch search iteration, and iterate over the results. ResCB is a callback
  * that takes the id, score and index of a result, and performs test-specific logic for each.
@@ -88,157 +109,192 @@ void runBatchIteratorSearchTest(VecSimBatchIterator *batch_iterator, size_t n_re
     VecSimQueryResult_Free(res);
 }
 
+void compareCommonInfo(CommonInfo info1, CommonInfo info2) {
+    ASSERT_EQ(info1.basicInfo.dim, info2.basicInfo.dim);
+    ASSERT_EQ(info1.basicInfo.metric, info2.basicInfo.metric);
+    ASSERT_EQ(info1.indexSize, info2.indexSize);
+    ASSERT_EQ(info1.basicInfo.type, info2.basicInfo.type);
+    ASSERT_EQ(info1.memory, info2.memory);
+    ASSERT_EQ(info1.basicInfo.blockSize, info2.basicInfo.blockSize);
+    ASSERT_EQ(info1.basicInfo.isMulti, info2.basicInfo.isMulti);
+    ASSERT_EQ(info1.last_mode, info2.last_mode);
+    ASSERT_EQ(info1.indexLabelCount, info2.indexLabelCount);
+}
+void compareFlatInfo(bfInfoStruct info1, bfInfoStruct info2) {}
+
+void compareHNSWInfo(hnswInfoStruct info1, hnswInfoStruct info2) {
+    ASSERT_EQ(info1.efConstruction, info2.efConstruction);
+    ASSERT_EQ(info1.efRuntime, info2.efRuntime);
+    ASSERT_EQ(info1.entrypoint, info2.entrypoint);
+    ASSERT_EQ(info1.epsilon, info2.epsilon);
+    ASSERT_EQ(info1.M, info2.M);
+    ASSERT_EQ(info1.max_level, info2.max_level);
+    ASSERT_EQ(info1.visitedNodesPoolSize, info2.visitedNodesPoolSize);
+}
+
+/*
+ * helper function to run range query and iterate over the results. ResCB is a callback that takes
+ * the id, score and index of a result, and performs test-specific logic for each.
+ */
+void runRangeQueryTest(VecSimIndex *index, const void *query, double radius,
+                       const std::function<void(size_t, double, size_t)> &ResCB,
+                       size_t expected_res_num, VecSimQueryResult_Order order,
+                       VecSimQueryParams *params) {
+    VecSimQueryResult_List res =
+        VecSimIndex_RangeQuery(index, (const void *)query, radius, params, order);
+    EXPECT_EQ(VecSimQueryResult_Len(res), expected_res_num);
+    EXPECT_TRUE(allUniqueResults(res));
+    VecSimQueryResult_Iterator *iterator = VecSimQueryResult_List_GetIterator(res);
+    int res_ind = 0;
+    while (VecSimQueryResult_IteratorHasNext(iterator)) {
+        VecSimQueryResult *item = VecSimQueryResult_IteratorNext(iterator);
+        int id = (int)VecSimQueryResult_GetId(item);
+        double score = VecSimQueryResult_GetScore(item);
+        ResCB(id, score, res_ind++);
+    }
+    EXPECT_EQ(res_ind, expected_res_num);
+    VecSimQueryResult_IteratorFree(iterator);
+    VecSimQueryResult_Free(res);
+}
+
 void compareFlatIndexInfoToIterator(VecSimIndexInfo info, VecSimInfoIterator *infoIter) {
     ASSERT_EQ(10, VecSimInfoIterator_NumberOfFields(infoIter));
     while (VecSimInfoIterator_HasNextField(infoIter)) {
-        VecSim_InfoField *infoFiled = VecSimInfoIterator_NextField(infoIter);
-        if (!strcmp(infoFiled->fieldName, VecSimCommonStrings::ALGORITHM_STRING)) {
+        VecSim_InfoField *infoField = VecSimInfoIterator_NextField(infoIter);
+        if (!strcmp(infoField->fieldName, VecSimCommonStrings::ALGORITHM_STRING)) {
             // Algorithm type.
-            ASSERT_EQ(infoFiled->fieldType, INFOFIELD_STRING);
-            ASSERT_STREQ(infoFiled->fieldValue.stringValue, VecSimAlgo_ToString(info.algo));
-        } else if (!strcmp(infoFiled->fieldName, VecSimCommonStrings::TYPE_STRING)) {
+            ASSERT_EQ(infoField->fieldType, INFOFIELD_STRING);
+            ASSERT_STREQ(infoField->fieldValue.stringValue,
+                         VecSimAlgo_ToString(info.commonInfo.basicInfo.algo));
+        } else if (!strcmp(infoField->fieldName, VecSimCommonStrings::TYPE_STRING)) {
             // Vector type.
-            ASSERT_EQ(infoFiled->fieldType, INFOFIELD_STRING);
-            ASSERT_STREQ(infoFiled->fieldValue.stringValue, VecSimType_ToString(info.bfInfo.type));
-        } else if (!strcmp(infoFiled->fieldName, VecSimCommonStrings::DIMENSION_STRING)) {
+            ASSERT_EQ(infoField->fieldType, INFOFIELD_STRING);
+            ASSERT_STREQ(infoField->fieldValue.stringValue,
+                         VecSimType_ToString(info.commonInfo.basicInfo.type));
+        } else if (!strcmp(infoField->fieldName, VecSimCommonStrings::DIMENSION_STRING)) {
             // Vector dimension.
-            ASSERT_EQ(infoFiled->fieldType, INFOFIELD_UINT64);
-            ASSERT_EQ(infoFiled->fieldValue.uintegerValue, info.bfInfo.dim);
-        } else if (!strcmp(infoFiled->fieldName, VecSimCommonStrings::METRIC_STRING)) {
+            ASSERT_EQ(infoField->fieldType, INFOFIELD_UINT64);
+            ASSERT_EQ(infoField->fieldValue.uintegerValue, info.commonInfo.basicInfo.dim);
+        } else if (!strcmp(infoField->fieldName, VecSimCommonStrings::METRIC_STRING)) {
             // Metric.
-            ASSERT_EQ(infoFiled->fieldType, INFOFIELD_STRING);
-            ASSERT_STREQ(infoFiled->fieldValue.stringValue,
-                         VecSimMetric_ToString(info.bfInfo.metric));
-        } else if (!strcmp(infoFiled->fieldName, VecSimCommonStrings::SEARCH_MODE_STRING)) {
+            ASSERT_EQ(infoField->fieldType, INFOFIELD_STRING);
+            ASSERT_STREQ(infoField->fieldValue.stringValue,
+                         VecSimMetric_ToString(info.commonInfo.basicInfo.metric));
+        } else if (!strcmp(infoField->fieldName, VecSimCommonStrings::SEARCH_MODE_STRING)) {
             // Search mode.
-            ASSERT_EQ(infoFiled->fieldType, INFOFIELD_STRING);
-            ASSERT_STREQ(infoFiled->fieldValue.stringValue,
-                         VecSimSearchMode_ToString(info.bfInfo.last_mode));
-        } else if (!strcmp(infoFiled->fieldName, VecSimCommonStrings::INDEX_SIZE_STRING)) {
+            ASSERT_EQ(infoField->fieldType, INFOFIELD_STRING);
+            ASSERT_STREQ(infoField->fieldValue.stringValue,
+                         VecSimSearchMode_ToString(info.commonInfo.last_mode));
+        } else if (!strcmp(infoField->fieldName, VecSimCommonStrings::INDEX_SIZE_STRING)) {
             // Index size.
-            ASSERT_EQ(infoFiled->fieldType, INFOFIELD_UINT64);
-            ASSERT_EQ(infoFiled->fieldValue.uintegerValue, info.bfInfo.indexSize);
-        } else if (!strcmp(infoFiled->fieldName, VecSimCommonStrings::INDEX_LABEL_COUNT_STRING)) {
+            ASSERT_EQ(infoField->fieldType, INFOFIELD_UINT64);
+            ASSERT_EQ(infoField->fieldValue.uintegerValue, info.commonInfo.indexSize);
+        } else if (!strcmp(infoField->fieldName, VecSimCommonStrings::INDEX_LABEL_COUNT_STRING)) {
             // Index label count.
-            ASSERT_EQ(infoFiled->fieldType, INFOFIELD_UINT64);
-            ASSERT_EQ(infoFiled->fieldValue.uintegerValue, info.bfInfo.indexLabelCount);
-        } else if (!strcmp(infoFiled->fieldName, VecSimCommonStrings::IS_MULTI_STRING)) {
+            ASSERT_EQ(infoField->fieldType, INFOFIELD_UINT64);
+            ASSERT_EQ(infoField->fieldValue.uintegerValue, info.commonInfo.indexLabelCount);
+        } else if (!strcmp(infoField->fieldName, VecSimCommonStrings::IS_MULTI_STRING)) {
             // Is the index multi value.
-            ASSERT_EQ(infoFiled->fieldType, INFOFIELD_UINT64);
-            ASSERT_EQ(infoFiled->fieldValue.uintegerValue, info.bfInfo.isMulti);
-        } else if (!strcmp(infoFiled->fieldName, VecSimCommonStrings::BLOCK_SIZE_STRING)) {
+            ASSERT_EQ(infoField->fieldType, INFOFIELD_UINT64);
+            ASSERT_EQ(infoField->fieldValue.uintegerValue, info.commonInfo.basicInfo.isMulti);
+        } else if (!strcmp(infoField->fieldName, VecSimCommonStrings::BLOCK_SIZE_STRING)) {
             // Block size.
-            ASSERT_EQ(infoFiled->fieldType, INFOFIELD_UINT64);
-            ASSERT_EQ(infoFiled->fieldValue.uintegerValue, info.bfInfo.blockSize);
-        } else if (!strcmp(infoFiled->fieldName, VecSimCommonStrings::MEMORY_STRING)) {
+            ASSERT_EQ(infoField->fieldType, INFOFIELD_UINT64);
+            ASSERT_EQ(infoField->fieldValue.uintegerValue, info.commonInfo.basicInfo.blockSize);
+        } else if (!strcmp(infoField->fieldName, VecSimCommonStrings::MEMORY_STRING)) {
             // Memory.
-            ASSERT_EQ(infoFiled->fieldType, INFOFIELD_UINT64);
-            ASSERT_EQ(infoFiled->fieldValue.uintegerValue, info.bfInfo.memory);
+            ASSERT_EQ(infoField->fieldType, INFOFIELD_UINT64);
+            ASSERT_EQ(infoField->fieldValue.uintegerValue, info.commonInfo.memory);
         } else {
-            ASSERT_TRUE(false);
+            FAIL();
         }
     }
 }
 
 void compareHNSWIndexInfoToIterator(VecSimIndexInfo info, VecSimInfoIterator *infoIter) {
-    ASSERT_EQ(15, VecSimInfoIterator_NumberOfFields(infoIter));
+    ASSERT_EQ(17, VecSimInfoIterator_NumberOfFields(infoIter));
     while (VecSimInfoIterator_HasNextField(infoIter)) {
-        VecSim_InfoField *infoFiled = VecSimInfoIterator_NextField(infoIter);
-        if (!strcmp(infoFiled->fieldName, VecSimCommonStrings::ALGORITHM_STRING)) {
+        VecSim_InfoField *infoField = VecSimInfoIterator_NextField(infoIter);
+        if (!strcmp(infoField->fieldName, VecSimCommonStrings::ALGORITHM_STRING)) {
             // Algorithm type.
-            ASSERT_EQ(infoFiled->fieldType, INFOFIELD_STRING);
-            ASSERT_STREQ(infoFiled->fieldValue.stringValue, VecSimAlgo_ToString(info.algo));
-        } else if (!strcmp(infoFiled->fieldName, VecSimCommonStrings::TYPE_STRING)) {
+            ASSERT_EQ(infoField->fieldType, INFOFIELD_STRING);
+            ASSERT_STREQ(infoField->fieldValue.stringValue,
+                         VecSimAlgo_ToString(info.commonInfo.basicInfo.algo));
+        } else if (!strcmp(infoField->fieldName, VecSimCommonStrings::TYPE_STRING)) {
             // Vector type.
-            ASSERT_EQ(infoFiled->fieldType, INFOFIELD_STRING);
-            ASSERT_STREQ(infoFiled->fieldValue.stringValue,
-                         VecSimType_ToString(info.hnswInfo.type));
-        } else if (!strcmp(infoFiled->fieldName, VecSimCommonStrings::DIMENSION_STRING)) {
+            ASSERT_EQ(infoField->fieldType, INFOFIELD_STRING);
+            ASSERT_STREQ(infoField->fieldValue.stringValue,
+                         VecSimType_ToString(info.commonInfo.basicInfo.type));
+        } else if (!strcmp(infoField->fieldName, VecSimCommonStrings::DIMENSION_STRING)) {
             // Vector dimension.
-            ASSERT_EQ(infoFiled->fieldType, INFOFIELD_UINT64);
-            ASSERT_EQ(infoFiled->fieldValue.uintegerValue, info.hnswInfo.dim);
-        } else if (!strcmp(infoFiled->fieldName, VecSimCommonStrings::METRIC_STRING)) {
+            ASSERT_EQ(infoField->fieldType, INFOFIELD_UINT64);
+            ASSERT_EQ(infoField->fieldValue.uintegerValue, info.commonInfo.basicInfo.dim);
+        } else if (!strcmp(infoField->fieldName, VecSimCommonStrings::METRIC_STRING)) {
             // Metric.
-            ASSERT_EQ(infoFiled->fieldType, INFOFIELD_STRING);
-            ASSERT_STREQ(infoFiled->fieldValue.stringValue,
-                         VecSimMetric_ToString(info.hnswInfo.metric));
-        } else if (!strcmp(infoFiled->fieldName, VecSimCommonStrings::SEARCH_MODE_STRING)) {
+            ASSERT_EQ(infoField->fieldType, INFOFIELD_STRING);
+            ASSERT_STREQ(infoField->fieldValue.stringValue,
+                         VecSimMetric_ToString(info.commonInfo.basicInfo.metric));
+        } else if (!strcmp(infoField->fieldName, VecSimCommonStrings::SEARCH_MODE_STRING)) {
             // Search mode.
-            ASSERT_EQ(infoFiled->fieldType, INFOFIELD_STRING);
-            ASSERT_STREQ(infoFiled->fieldValue.stringValue,
-                         VecSimSearchMode_ToString(info.hnswInfo.last_mode));
-        } else if (!strcmp(infoFiled->fieldName, VecSimCommonStrings::INDEX_SIZE_STRING)) {
+            ASSERT_EQ(infoField->fieldType, INFOFIELD_STRING);
+            ASSERT_STREQ(infoField->fieldValue.stringValue,
+                         VecSimSearchMode_ToString(info.commonInfo.last_mode));
+        } else if (!strcmp(infoField->fieldName, VecSimCommonStrings::INDEX_SIZE_STRING)) {
             // Index size.
-            ASSERT_EQ(infoFiled->fieldType, INFOFIELD_UINT64);
-            ASSERT_EQ(infoFiled->fieldValue.uintegerValue, info.hnswInfo.indexSize);
-        } else if (!strcmp(infoFiled->fieldName, VecSimCommonStrings::INDEX_LABEL_COUNT_STRING)) {
+            ASSERT_EQ(infoField->fieldType, INFOFIELD_UINT64);
+            ASSERT_EQ(infoField->fieldValue.uintegerValue, info.commonInfo.indexSize);
+        } else if (!strcmp(infoField->fieldName, VecSimCommonStrings::INDEX_LABEL_COUNT_STRING)) {
             // Index label count.
-            ASSERT_EQ(infoFiled->fieldType, INFOFIELD_UINT64);
-            ASSERT_EQ(infoFiled->fieldValue.uintegerValue, info.hnswInfo.indexLabelCount);
-        } else if (!strcmp(infoFiled->fieldName, VecSimCommonStrings::IS_MULTI_STRING)) {
+            ASSERT_EQ(infoField->fieldType, INFOFIELD_UINT64);
+            ASSERT_EQ(infoField->fieldValue.uintegerValue, info.commonInfo.indexLabelCount);
+        } else if (!strcmp(infoField->fieldName, VecSimCommonStrings::IS_MULTI_STRING)) {
             // Is the index multi value.
-            ASSERT_EQ(infoFiled->fieldType, INFOFIELD_UINT64);
-            ASSERT_EQ(infoFiled->fieldValue.uintegerValue, info.hnswInfo.isMulti);
-        } else if (!strcmp(infoFiled->fieldName,
+            ASSERT_EQ(infoField->fieldType, INFOFIELD_UINT64);
+            ASSERT_EQ(infoField->fieldValue.uintegerValue, info.commonInfo.basicInfo.isMulti);
+        } else if (!strcmp(infoField->fieldName,
                            VecSimCommonStrings::HNSW_EF_CONSTRUCTION_STRING)) {
             // EF construction.
-            ASSERT_EQ(infoFiled->fieldType, INFOFIELD_UINT64);
-            ASSERT_EQ(infoFiled->fieldValue.uintegerValue, info.hnswInfo.efConstruction);
-        } else if (!strcmp(infoFiled->fieldName, VecSimCommonStrings::HNSW_EF_RUNTIME_STRING)) {
+            ASSERT_EQ(infoField->fieldType, INFOFIELD_UINT64);
+            ASSERT_EQ(infoField->fieldValue.uintegerValue, info.hnswInfo.efConstruction);
+        } else if (!strcmp(infoField->fieldName, VecSimCommonStrings::HNSW_EF_RUNTIME_STRING)) {
             // EF runtime.
-            ASSERT_EQ(infoFiled->fieldType, INFOFIELD_UINT64);
-            ASSERT_EQ(infoFiled->fieldValue.uintegerValue, info.hnswInfo.efRuntime);
-        } else if (!strcmp(infoFiled->fieldName, VecSimCommonStrings::HNSW_EPSILON_STRING)) {
+            ASSERT_EQ(infoField->fieldType, INFOFIELD_UINT64);
+            ASSERT_EQ(infoField->fieldValue.uintegerValue, info.hnswInfo.efRuntime);
+        } else if (!strcmp(infoField->fieldName, VecSimCommonStrings::HNSW_EPSILON_STRING)) {
             // Epsilon.
-            ASSERT_EQ(infoFiled->fieldType, INFOFIELD_FLOAT64);
-            ASSERT_EQ(infoFiled->fieldValue.floatingPointValue, info.hnswInfo.epsilon);
-        } else if (!strcmp(infoFiled->fieldName, VecSimCommonStrings::HNSW_M_STRING)) {
+            ASSERT_EQ(infoField->fieldType, INFOFIELD_FLOAT64);
+            ASSERT_EQ(infoField->fieldValue.floatingPointValue, info.hnswInfo.epsilon);
+        } else if (!strcmp(infoField->fieldName, VecSimCommonStrings::HNSW_M_STRING)) {
             // M.
-            ASSERT_EQ(infoFiled->fieldType, INFOFIELD_UINT64);
-            ASSERT_EQ(infoFiled->fieldValue.uintegerValue, info.hnswInfo.M);
-        } else if (!strcmp(infoFiled->fieldName, VecSimCommonStrings::HNSW_MAX_LEVEL)) {
+            ASSERT_EQ(infoField->fieldType, INFOFIELD_UINT64);
+            ASSERT_EQ(infoField->fieldValue.uintegerValue, info.hnswInfo.M);
+        } else if (!strcmp(infoField->fieldName, VecSimCommonStrings::HNSW_MAX_LEVEL)) {
             // Levels.
-            ASSERT_EQ(infoFiled->fieldType, INFOFIELD_UINT64);
-            ASSERT_EQ(infoFiled->fieldValue.uintegerValue, info.hnswInfo.max_level);
-        } else if (!strcmp(infoFiled->fieldName, VecSimCommonStrings::HNSW_ENTRYPOINT)) {
+            ASSERT_EQ(infoField->fieldType, INFOFIELD_UINT64);
+            ASSERT_EQ(infoField->fieldValue.uintegerValue, info.hnswInfo.max_level);
+        } else if (!strcmp(infoField->fieldName, VecSimCommonStrings::HNSW_ENTRYPOINT)) {
             // Entrypoint.
-            ASSERT_EQ(infoFiled->fieldType, INFOFIELD_UINT64);
-            ASSERT_EQ(infoFiled->fieldValue.uintegerValue, info.hnswInfo.entrypoint);
-        } else if (!strcmp(infoFiled->fieldName, VecSimCommonStrings::MEMORY_STRING)) {
+            ASSERT_EQ(infoField->fieldType, INFOFIELD_UINT64);
+            ASSERT_EQ(infoField->fieldValue.uintegerValue, info.hnswInfo.entrypoint);
+        } else if (!strcmp(infoField->fieldName, VecSimCommonStrings::MEMORY_STRING)) {
             // Memory.
-            ASSERT_EQ(infoFiled->fieldType, INFOFIELD_UINT64);
-            ASSERT_EQ(infoFiled->fieldValue.uintegerValue, info.hnswInfo.memory);
+            ASSERT_EQ(infoField->fieldType, INFOFIELD_UINT64);
+            ASSERT_EQ(infoField->fieldValue.uintegerValue, info.commonInfo.memory);
+        } else if (!strcmp(infoField->fieldName, VecSimCommonStrings::BLOCK_SIZE_STRING)) {
+            // Block size.
+            ASSERT_EQ(infoField->fieldType, INFOFIELD_UINT64);
+            ASSERT_EQ(infoField->fieldValue.uintegerValue, info.commonInfo.basicInfo.blockSize);
+        } else if (!strcmp(infoField->fieldName, VecSimCommonStrings::HNSW_NUM_MARKED_DELETED)) {
+            // Number of marked deleted.
+            ASSERT_EQ(infoField->fieldType, INFOFIELD_UINT64);
+            ASSERT_EQ(infoField->fieldValue.uintegerValue,
+                      info.hnswInfo.numberOfMarkedDeletedNodes);
         } else {
-            ASSERT_TRUE(false);
+            FAIL();
         }
     }
 }
 
-/*
- * helper function to run range query and iterate over the results. ResCB is a callback that takes
- * the id, score and index of a result, and performs test-specific logic for each.
- */
-void runRangeQueryTest(VecSimIndex *index, const void *query, double radius,
-                       const std::function<void(size_t, double, size_t)> &ResCB,
-                       size_t expected_res_num, VecSimQueryResult_Order order,
-                       VecSimQueryParams *params) {
-    VecSimQueryResult_List res =
-        VecSimIndex_RangeQuery(index, (const void *)query, radius, params, order);
-    ASSERT_EQ(VecSimQueryResult_Len(res), expected_res_num);
-    ASSERT_TRUE(allUniqueResults(res));
-    VecSimQueryResult_Iterator *iterator = VecSimQueryResult_List_GetIterator(res);
-    int res_ind = 0;
-    while (VecSimQueryResult_IteratorHasNext(iterator)) {
-        VecSimQueryResult *item = VecSimQueryResult_IteratorNext(iterator);
-        int id = (int)VecSimQueryResult_GetId(item);
-        double score = VecSimQueryResult_GetScore(item);
-        ResCB(id, score, res_ind++);
-    }
-    ASSERT_EQ(res_ind, expected_res_num);
-    VecSimQueryResult_IteratorFree(iterator);
-    VecSimQueryResult_Free(res);
-}
-
 size_t getLabelsLookupNodeSize() {
     std::shared_ptr<VecSimAllocator> allocator = VecSimAllocator::newVecsimAllocator();
     auto dummy_lookup = vecsim_stl::unordered_map<size_t, unsigned int>(1, allocator);
@@ -252,14 +308,71 @@ size_t getLabelsLookupNodeSize() {
  * Mock callbacks for testing async tiered index. We use a simple std::queue to simulate the job
  * queue.
  */
-int tiered_index_mock::submit_callback(void *job_queue, void **jobs, size_t len) {
-    for (size_t i = 0; i < len; i++) {
-        static_cast<JobQueue *>(job_queue)->push(jobs[i]);
+
+std::mutex tiered_index_mock::queue_guard;
+std::condition_variable tiered_index_mock::queue_cond;
+std::vector<std::thread> tiered_index_mock::thread_pool;
+
+int tiered_index_mock::submit_callback(void *job_queue, void *index_ctx, AsyncJob **jobs,
+                                       JobCallback *CBs, size_t len) {
+    {
+        std::unique_lock<std::mutex> lock(queue_guard);
+        for (size_t i = 0; i < len; i++) {
+            // Wrap the job with a struct that contains a weak reference to the related index.
+            auto owned_job = RefManagedJob{
+                .job = jobs[i],
+                .index_weak_ref = reinterpret_cast<IndexExtCtx *>(index_ctx)->index_strong_ref};
+            static_cast<JobQueue *>(job_queue)->push(owned_job);
+        }
+    }
+    if (len == 1) {
+        queue_cond.notify_one();
+    } else {
+        queue_cond.notify_all();
     }
     return VecSim_OK;
 }
 
-int tiered_index_mock::update_mem_callback(void *mem_ctx, size_t mem) {
-    *(size_t *)mem_ctx = mem;
-    return VecSim_OK;
+// If `run_thread` is null, treat it as `true`.
+void tiered_index_mock::thread_iteration(JobQueue &jobQ, bool *run_thread) {
+    std::unique_lock<std::mutex> lock(queue_guard);
+    // Wake up and acquire the lock (atomically) ONLY if the job queue is not empty at that
+    // point, or if the thread should not run anymore (and quit in that case).
+    queue_cond.wait(
+        lock, [&jobQ, &run_thread]() { return !jobQ.empty() || (run_thread && !*run_thread); });
+    if (run_thread && !*run_thread)
+        return;
+    auto managed_job = jobQ.front();
+    jobQ.pop();
+    lock.unlock();
+    // Upgrade the index weak reference to a strong ref while we run the job over the index.
+    if (auto temp_ref = managed_job.index_weak_ref.lock()) {
+        managed_job.job->Execute(managed_job.job);
+    }
+}
+
+// Main loop for background worker threads that execute the jobs from the job queue.
+// run_thread is used as a signal to the thread that indicates whether it should keep running or
+// stop and terminate.
+void tiered_index_mock::thread_main_loop(JobQueue &jobQ, bool &run_thread) {
+    while (run_thread) {
+        thread_iteration(jobQ, &run_thread);
+    }
+}
+
+void tiered_index_mock::thread_pool_join(JobQueue &jobQ, bool &run_thread) {
+    // Check every 10 ms if the queue is empty, and if so, terminate the threads' loop.
+    while (true) {
+        std::this_thread::sleep_for(std::chrono::milliseconds(10));
+        std::unique_lock<std::mutex> lock(queue_guard);
+        if (jobQ.empty()) {
+            run_thread = false;
+            queue_cond.notify_all();
+            break;
+        }
+    }
+    for (size_t i = 0; i < THREAD_POOL_SIZE; i++) {
+        thread_pool[i].join();
+    }
+    thread_pool.clear();
 }
diff --git a/tests/unit/test_utils.h b/tests/unit/test_utils.h
index 821dc9705..7f88640e8 100644
--- a/tests/unit/test_utils.h
+++ b/tests/unit/test_utils.h
@@ -9,6 +9,7 @@
 #include <functional>
 #include <cmath>
 #include <exception>
+#include <thread>
 
 #include "VecSim/vec_sim.h"
 #include "VecSim/algorithms/hnsw/hnsw_tiered.h"
@@ -25,8 +26,26 @@ struct IndexType {
 #define TEST_DATA_T typename TypeParam::data_t
 #define TEST_DIST_T typename TypeParam::dist_t
 
-using DataTypeSet =
-    ::testing::Types<IndexType<VecSimType_FLOAT32, float>, IndexType<VecSimType_FLOAT64, double>>;
+using DataTypeSet = ::testing::Types<IndexType<VecSimType_FLOAT32, float>
+#ifdef FP64_TESTS
+                                     ,
+                                     IndexType<VecSimType_FLOAT64, double>
+#endif
+                                     >;
+
+// Define index type for tests that can be automatically generated for single and multi.
+template <VecSimType type, bool IsMulti, typename DataType, typename DistType = DataType>
+struct IndexTypeExtended {
+    static VecSimType get_index_type() { return type; }
+    static bool isMulti() { return IsMulti; }
+    typedef DataType data_t;
+    typedef DistType dist_t;
+};
+
+using DataTypeSetExtended = ::testing::Types<IndexTypeExtended<VecSimType_FLOAT32, false, float>,
+                                             IndexTypeExtended<VecSimType_FLOAT32, true, float>,
+                                             IndexTypeExtended<VecSimType_FLOAT64, false, double>,
+                                             IndexTypeExtended<VecSimType_FLOAT64, true, double>>;
 
 template <typename data_t>
 static void GenerateVector(data_t *output, size_t dim, data_t value = 1.0) {
@@ -52,6 +71,11 @@ inline VecSimParams CreateParams(const BFParams &bf_params) {
     return params;
 }
 
+inline VecSimParams CreateParams(const TieredIndexParams &tiered_params) {
+    VecSimParams params{.algo = VecSimAlgo_TIERED, .tieredParams = tiered_params};
+    return params;
+}
+
 namespace test_utils {
 template <typename IndexParams>
 inline VecSimIndex *CreateNewIndex(IndexParams &index_params, VecSimType type,
@@ -82,6 +106,10 @@ VecSimQueryParams CreateQueryParams(const HNSWRuntimeParams &RuntimeParams);
 inline void ASSERT_TYPE_EQ(double arg1, double arg2) { ASSERT_DOUBLE_EQ(arg1, arg2); }
 
 inline void ASSERT_TYPE_EQ(float arg1, float arg2) { ASSERT_FLOAT_EQ(arg1, arg2); }
+void runTopKSearchTest(VecSimIndex *index, const void *query, size_t k, size_t expected_res_num,
+                       std::function<void(size_t, double, size_t)> ResCB,
+                       VecSimQueryParams *params = nullptr,
+                       VecSimQueryResult_Order order = BY_SCORE);
 void runTopKSearchTest(VecSimIndex *index, const void *query, size_t k,
                        std::function<void(size_t, double, size_t)> ResCB,
                        VecSimQueryParams *params = nullptr,
@@ -92,6 +120,10 @@ void runBatchIteratorSearchTest(VecSimBatchIterator *batch_iterator, size_t n_re
                                 VecSimQueryResult_Order order = BY_SCORE,
                                 size_t expected_n_res = -1);
 
+void compareCommonInfo(CommonInfo info1, CommonInfo info2);
+void compareFlatInfo(bfInfoStruct info1, bfInfoStruct info2);
+void compareHNSWInfo(hnswInfoStruct info1, hnswInfoStruct info2);
+
 void compareFlatIndexInfoToIterator(VecSimIndexInfo info, VecSimInfoIterator *infoIter);
 
 void compareHNSWIndexInfoToIterator(VecSimIndexInfo info, VecSimInfoIterator *infoIter);
@@ -129,7 +161,51 @@ inline double GetInfVal(VecSimType type) {
     }
 
 namespace tiered_index_mock {
-using JobQueue = std::queue<void *>;
-int submit_callback(void *job_queue, void **jobs, size_t len);
-int update_mem_callback(void *mem_ctx, size_t mem);
+
+typedef struct RefManagedJob {
+    AsyncJob *job;
+    std::weak_ptr<VecSimIndex> index_weak_ref;
+} RefManagedJob;
+
+struct SearchJobMock : public AsyncJob {
+    void *query; // The query vector. ownership is passed to the job in the constructor.
+    size_t k;    // The number of results to return.
+    size_t n;    // The number of vectors in the index (might be useful for the mock)
+    size_t dim;  // The dimension of the vectors in the index (might be useful for the mock)
+    std::atomic_int &successful_searches; // A reference to a shared counter that counts the number
+                                          // of successful searches.
+    SearchJobMock(std::shared_ptr<VecSimAllocator> allocator, JobCallback searchCB,
+                  VecSimIndex *index_, void *query_, size_t k_, size_t n_, size_t dim_,
+                  std::atomic_int &successful_searches_)
+        : AsyncJob(allocator, HNSW_SEARCH_JOB, searchCB, index_), query(query_), k(k_), n(n_),
+          dim(dim_), successful_searches(successful_searches_) {}
+    ~SearchJobMock() { this->allocator->free_allocation(query); }
+};
+
+struct JobQueue : public std::queue<RefManagedJob> {
+    // Pops and destroys the job at the front of the queue.
+    inline void kick() {
+        delete this->front().job;
+        this->pop();
+    }
+};
+
+int submit_callback(void *job_queue, void *index_ctx, AsyncJob **jobs, JobCallback *CBs,
+                    size_t jobs_len);
+
+typedef struct IndexExtCtx {
+    std::shared_ptr<VecSimIndex> index_strong_ref;
+} IndexExtCtx;
+
+static const size_t THREAD_POOL_SIZE = MIN(16, std::thread::hardware_concurrency());
+extern std::vector<std::thread> thread_pool;
+extern std::mutex queue_guard;
+extern std::condition_variable queue_cond;
+
+// A single iteration of the thread main loop.
+void thread_iteration(JobQueue &jobQ, bool *run_thread = nullptr);
+void thread_main_loop(JobQueue &jobQ, bool &run_thread);
+
+void thread_pool_join(JobQueue &jobQ, bool &run_thread);
+
 } // namespace tiered_index_mock