Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Raw vectors data layer in HNSW + move to base class #523

Open
wants to merge 12 commits into
base: main
Choose a base branch
from
25 changes: 12 additions & 13 deletions src/VecSim/algorithms/brute_force/brute_force.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@ template <typename DataType, typename DistType>
class BruteForceIndex : public VecSimIndexAbstract<DataType, DistType> {
protected:
vecsim_stl::vector<labelType> idToLabelMapping;
RawDataContainer *vectors;
idType count;

public:
Expand All @@ -41,7 +40,9 @@ class BruteForceIndex : public VecSimIndexAbstract<DataType, DistType> {
size_t indexSize() const override;
size_t indexCapacity() const override;
std::unique_ptr<RawDataContainer::Iterator> getVectorsIterator() const;
DataType *getDataByInternalId(idType id) const { return (DataType *)vectors->getElement(id); }
DataType *getDataByInternalId(idType id) const {
return (DataType *)this->vectors->getElement(id);
}
VecSimQueryReply *topKQuery(const void *queryBlob, size_t k,
VecSimQueryParams *queryParams) const override;
VecSimQueryReply *rangeQuery(const void *queryBlob, double radius,
Expand All @@ -54,7 +55,7 @@ class BruteForceIndex : public VecSimIndexAbstract<DataType, DistType> {
bool preferAdHocSearch(size_t subsetSize, size_t k, bool initial_check) const override;
labelType getVectorLabel(idType id) const { return idToLabelMapping.at(id); }

const RawDataContainer *getVectorsContainer() const { return vectors; }
const RawDataContainer *getVectorsContainer() const { return this->vectors; }

const labelType getLabelByInternalId(idType internal_id) const {
return idToLabelMapping.at(internal_id);
Expand All @@ -71,7 +72,7 @@ class BruteForceIndex : public VecSimIndexAbstract<DataType, DistType> {
// without duplicates in tiered index). Caller should hold the flat buffer lock for read.
virtual vecsim_stl::set<labelType> getLabelsSet() const = 0;

virtual ~BruteForceIndex() { delete vectors; }
virtual ~BruteForceIndex() = default;
#ifdef BUILD_TESTS
/**
* @brief Used for testing - store vector(s) data associated with a given label. This function
Expand Down Expand Up @@ -147,8 +148,6 @@ BruteForceIndex<DataType, DistType>::BruteForceIndex(
: VecSimIndexAbstract<DataType, DistType>(abstractInitParams, components),
idToLabelMapping(this->allocator), count(0) {
assert(VecSimType_sizeof(this->vecType) == sizeof(DataType));
vectors = new (this->allocator)
DataBlocksContainer(this->blockSize, this->dataSize, this->allocator, this->alignment);
}

/******************** Implementation **************/
Expand All @@ -164,7 +163,7 @@ void BruteForceIndex<DataType, DistType>::appendVector(const void *vector_data,
growByBlock();
}
// add vector data to vector raw data container
vectors->addElement(processed_blob.get(), id);
this->vectors->addElement(processed_blob.get(), id);

// add label to idToLabelMapping
setVectorLabel(id, label);
Expand Down Expand Up @@ -193,10 +192,10 @@ void BruteForceIndex<DataType, DistType>::removeVector(idType id_to_delete) {
replaceIdOfLabel(last_idx_label, id_to_delete, last_idx);

// Put data of last vector inplace of the deleted vector.
const char *last_vector_data = vectors->getElement(last_idx);
vectors->updateElement(id_to_delete, last_vector_data);
const char *last_vector_data = this->vectors->getElement(last_idx);
this->vectors->updateElement(id_to_delete, last_vector_data);
}
vectors->removeElement(last_idx);
this->vectors->removeElement(last_idx);

// If we reached to a multiply of a block size, we can reduce meta data structures size.
if (this->count % this->blockSize == 0) {
Expand All @@ -217,7 +216,7 @@ size_t BruteForceIndex<DataType, DistType>::indexCapacity() const {
template <typename DataType, typename DistType>
std::unique_ptr<RawDataContainer::Iterator>
BruteForceIndex<DataType, DistType>::getVectorsIterator() const {
return vectors->getIterator();
return this->vectors->getIterator();
}

template <typename DataType, typename DistType>
Expand All @@ -240,7 +239,7 @@ BruteForceIndex<DataType, DistType>::topKQuery(const void *queryBlob, size_t k,
getNewMaxPriorityQueue();

// For vector, compute its scores and update the Top candidates max heap
auto vectors_it = vectors->getIterator();
auto vectors_it = this->vectors->getIterator();
idType curr_id = 0;
while (auto *vector = vectors_it->next()) {
if (VECSIM_TIMEOUT(timeoutCtx)) {
Expand Down Expand Up @@ -285,7 +284,7 @@ BruteForceIndex<DataType, DistType>::rangeQuery(const void *queryBlob, double ra
getNewResultsContainer(10); // Use 10 as the initial capacity for the dynamic array.

DistType radius_ = DistType(radius);
auto vectors_it = vectors->getIterator();
auto vectors_it = this->vectors->getIterator();
idType curr_id = 0;
const void *processed_query = processed_query_ptr.get();
while (vectors_it->hasNext()) {
Expand Down
38 changes: 11 additions & 27 deletions src/VecSim/algorithms/hnsw/hnsw.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
#include "VecSim/utils/vecsim_stl.h"
#include "VecSim/utils/vec_utils.h"
#include "VecSim/containers/data_block.h"
#include "VecSim/containers/raw_data_container_interface.h"
#include "VecSim/containers/data_blocks_container.h"
#include "VecSim/containers/vecsim_results_container.h"
#include "VecSim/query_result_definitions.h"
#include "VecSim/vec_sim_common.h"
Expand Down Expand Up @@ -110,7 +112,6 @@ class HNSWIndex : public VecSimIndexAbstract<DataType, DistType>,
size_t maxLevel; // this is the top level of the entry point's element

// Index data
vecsim_stl::vector<DataBlock> vectorBlocks;
vecsim_stl::vector<DataBlock> graphDataBlocks;
vecsim_stl::vector<ElementMetaData> idToMetaData;

Expand Down Expand Up @@ -182,7 +183,7 @@ class HNSWIndex : public VecSimIndexAbstract<DataType, DistType>,
void replaceEntryPoint();

void SwapLastIdWithDeletedId(idType element_internal_id, ElementGraphData *last_element,
void *last_element_data);
const void *last_element_data);

/** Add vector functions */
// Protected internal function that implements generic single vector insertion.
Expand Down Expand Up @@ -384,7 +385,7 @@ labelType HNSWIndex<DataType, DistType>::getEntryPointLabel() const {

template <typename DataType, typename DistType>
const char *HNSWIndex<DataType, DistType>::getDataByInternalId(idType internal_id) const {
return vectorBlocks[internal_id / this->blockSize].getElement(internal_id % this->blockSize);
return this->vectors->getElement(internal_id);
}

template <typename DataType, typename DistType>
Expand Down Expand Up @@ -1130,7 +1131,7 @@ void HNSWIndex<DataType, DistType>::replaceEntryPoint() {
template <typename DataType, typename DistType>
void HNSWIndex<DataType, DistType>::SwapLastIdWithDeletedId(idType element_internal_id,
ElementGraphData *last_element,
void *last_element_data) {
const void *last_element_data) {
// Swap label - this is relevant when the last element's label exists (it is not marked as
// deleted).
if (!isMarkedDeleted(curElementCount)) {
Expand Down Expand Up @@ -1305,12 +1306,6 @@ void HNSWIndex<DataType, DistType>::resizeIndexCommon(size_t new_max_elements) {
template <typename DataType, typename DistType>
void HNSWIndex<DataType, DistType>::growByBlock() {
size_t new_max_elements = maxElements + this->blockSize;

// Validations
assert(vectorBlocks.size() == graphDataBlocks.size());
assert(vectorBlocks.empty() || vectorBlocks.back().getLength() == this->blockSize);

vectorBlocks.emplace_back(this->blockSize, this->dataSize, this->allocator, this->alignment);
graphDataBlocks.emplace_back(this->blockSize, this->elementGraphDataSize, this->allocator);

resizeIndexCommon(new_max_elements);
Expand All @@ -1320,13 +1315,6 @@ template <typename DataType, typename DistType>
void HNSWIndex<DataType, DistType>::shrinkByBlock() {
assert(maxElements >= this->blockSize);
size_t new_max_elements = maxElements - this->blockSize;

// Validations
assert(vectorBlocks.size() == graphDataBlocks.size());
assert(!vectorBlocks.empty());
assert(vectorBlocks.back().getLength() == 0);

vectorBlocks.pop_back();
graphDataBlocks.pop_back();

resizeIndexCommon(new_max_elements);
Expand Down Expand Up @@ -1599,9 +1587,8 @@ HNSWIndex<DataType, DistType>::HNSWIndex(const HNSWParams *params,
const IndexComponents<DataType, DistType> &components,
size_t random_seed)
: VecSimIndexAbstract<DataType, DistType>(abstractInitParams, components),
VecSimIndexTombstone(), maxElements(0), vectorBlocks(this->allocator),
graphDataBlocks(this->allocator), idToMetaData(this->allocator),
visitedNodesHandlerPool(0, this->allocator) {
VecSimIndexTombstone(), maxElements(0), graphDataBlocks(this->allocator),
idToMetaData(this->allocator), visitedNodesHandlerPool(0, this->allocator) {

M = params->M ? params->M : HNSW_DEFAULT_M;
M0 = M * 2;
Expand Down Expand Up @@ -1673,8 +1660,7 @@ void HNSWIndex<DataType, DistType>::removeAndSwap(idType internalId) {

// Get the last element's metadata and data.
// If we are deleting the last element, we already destroyed it's metadata.
DataBlock &last_vector_block = vectorBlocks.back();
auto last_element_data = last_vector_block.removeAndFetchLastElement();
auto *last_element_data = getDataByInternalId(curElementCount);
DataBlock &last_gd_block = graphDataBlocks.back();
auto last_element = (ElementGraphData *)last_gd_block.removeAndFetchLastElement();

Expand All @@ -1685,6 +1671,7 @@ void HNSWIndex<DataType, DistType>::removeAndSwap(idType internalId) {

// If we need to free a complete block and there is at least one block between the
// capacity and the size.
this->vectors->removeElement(curElementCount);
if (curElementCount % this->blockSize == 0) {
shrinkByBlock();
}
Expand Down Expand Up @@ -1793,16 +1780,13 @@ HNSWAddVectorState HNSWIndex<DataType, DistType>::storeNewElement(labelType labe
if (indexSize() > indexCapacity()) {
growByBlock();
} else if (state.newElementId % this->blockSize == 0) {
// If we had an initial capacity, we might have to allocate new blocks for the data and
// meta-data.
this->vectorBlocks.emplace_back(this->blockSize, this->dataSize, this->allocator,
this->alignment);
// If we had an initial capacity, we might have to allocate new blocks for the graph data.
this->graphDataBlocks.emplace_back(this->blockSize, this->elementGraphDataSize,
this->allocator);
}

// Insert the new element to the data block
this->vectorBlocks.back().addElement(vector_data);
this->vectors->addElement(vector_data, state.newElementId);
this->graphDataBlocks.back().addElement(cur_egd);
// We mark id as in process *before* we set it in the label lookup, so that IN_PROCESS flag is
// set when checking if label .
Expand Down
44 changes: 9 additions & 35 deletions src/VecSim/algorithms/hnsw/hnsw_serializer.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@ HNSWIndex<DataType, DistType>::HNSWIndex(std::ifstream &input, const HNSWParams
const IndexComponents<DataType, DistType> &components,
Serializer::EncodingVersion version)
: VecSimIndexAbstract<DataType, DistType>(abstractInitParams, components), Serializer(version),
epsilon(params->epsilon), vectorBlocks(this->allocator), graphDataBlocks(this->allocator),
idToMetaData(this->allocator), visitedNodesHandlerPool(0, this->allocator) {
epsilon(params->epsilon), graphDataBlocks(this->allocator), idToMetaData(this->allocator),
visitedNodesHandlerPool(0, this->allocator) {

this->restoreIndexFields(input);
this->fieldsValidation();
Expand All @@ -23,7 +23,6 @@ HNSWIndex<DataType, DistType>::HNSWIndex(std::ifstream &input, const HNSWParams
this->visitedNodesHandlerPool.resize(maxElements);

size_t initial_vector_size = maxElements / this->blockSize;
vectorBlocks.reserve(initial_vector_size);
graphDataBlocks.reserve(initial_vector_size);
}

Expand Down Expand Up @@ -167,29 +166,16 @@ void HNSWIndex<DataType, DistType>::restoreGraph(std::ifstream &input, EncodingV
setVectorId(label, id);
}

// Get number of blocks
unsigned int num_blocks = 0;
readBinaryPOD(input, num_blocks);
this->vectorBlocks.reserve(num_blocks);
this->graphDataBlocks.reserve(num_blocks);

// Get data blocks
for (size_t i = 0; i < num_blocks; i++) {
this->vectorBlocks.emplace_back(this->blockSize, this->dataSize, this->allocator,
this->alignment);
unsigned int block_len = 0;
readBinaryPOD(input, block_len);
for (size_t j = 0; j < block_len; j++) {
auto cur_vec = this->getAllocator()->allocate_unique(this->dataSize);
input.read(static_cast<char *>(cur_vec.get()), this->dataSize);
this->vectorBlocks.back().addElement(cur_vec.get());
}
}
// Todo: create vector data container and load the stored data based on the index storage params
// when other storage types will be available.
dynamic_cast<DataBlocksContainer *>(this->vectors)
->restoreBlocks(input, this->curElementCount, m_version);

// Get graph data blocks
ElementGraphData *cur_egt;
auto tmpData = this->getAllocator()->allocate_unique(this->elementGraphDataSize);
size_t toplevel = 0;
size_t num_blocks = dynamic_cast<DataBlocksContainer *>(this->vectors)->numBlocks();
for (size_t i = 0; i < num_blocks; i++) {
this->graphDataBlocks.emplace_back(this->blockSize, this->elementGraphDataSize,
this->allocator);
Expand Down Expand Up @@ -283,22 +269,10 @@ void HNSWIndex<DataType, DistType>::saveGraph(std::ofstream &output) const {
writeBinaryPOD(output, flags);
}

// Save number of blocks
unsigned int num_blocks = this->vectorBlocks.size();
writeBinaryPOD(output, num_blocks);

// Save data blocks
for (size_t i = 0; i < num_blocks; i++) {
auto &block = this->vectorBlocks[i];
unsigned int block_len = block.getLength();
writeBinaryPOD(output, block_len);
for (size_t j = 0; j < block_len; j++) {
output.write(block.getElement(j), this->dataSize);
}
}
this->vectors->saveVectorsData(output);

// Save graph data blocks
for (size_t i = 0; i < num_blocks; i++) {
for (size_t i = 0; i < this->graphDataBlocks.size(); i++) {
auto &block = this->graphDataBlocks[i];
unsigned int block_len = block.getLength();
writeBinaryPOD(output, block_len);
Expand Down
57 changes: 57 additions & 0 deletions src/VecSim/containers/data_blocks_container.cpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
#include "data_blocks_container.h"
#include "VecSim/utils/serializer.h"
#include <cmath>

DataBlocksContainer::DataBlocksContainer(size_t blockSize, size_t elementBytesCount,
std::shared_ptr<VecSimAllocator> allocator,
Expand All @@ -10,6 +12,8 @@ DataBlocksContainer::~DataBlocksContainer() = default;

size_t DataBlocksContainer::size() const { return element_count; }

size_t DataBlocksContainer::capacity() const { return blocks.capacity(); }

size_t DataBlocksContainer::blockSize() const { return block_size; }

size_t DataBlocksContainer::elementByteCount() const { return element_bytes_count; }
Expand Down Expand Up @@ -51,6 +55,59 @@ std::unique_ptr<RawDataContainer::Iterator> DataBlocksContainer::getIterator() c
return std::make_unique<DataBlocksContainer::Iterator>(*this);
}

#ifdef BUILD_TESTS
void DataBlocksContainer::saveVectorsData(std::ostream &output) const {
// Save data blocks
for (size_t i = 0; i < this->numBlocks(); i++) {
auto &block = this->blocks[i];
unsigned int block_len = block.getLength();
for (size_t j = 0; j < block_len; j++) {
output.write(block.getElement(j), this->element_bytes_count);
}
}
}

void DataBlocksContainer::restoreBlocks(std::istream &input, size_t num_vectors,
Serializer::EncodingVersion version) {

// Get number of blocks
unsigned int num_blocks = 0;
if (version == Serializer::EncodingVersion_V3) {
// In V3, the number of blocks is serialized, so we need to read it from the file.
Serializer::readBinaryPOD(input, num_blocks);
} else {
// Otherwise, calculate the number of blocks based on the number of vectors.
num_blocks = std::ceil((float)num_vectors / this->block_size);
}
this->blocks.reserve(num_blocks);

// Get data blocks
for (size_t i = 0; i < num_blocks; i++) {
this->blocks.emplace_back(this->block_size, this->element_bytes_count, this->allocator,
this->alignment);
unsigned int block_len = 0;
if (version == Serializer::EncodingVersion_V3) {
// In V3, the length of each block is serialized, so we need to read it from the file.
Serializer::readBinaryPOD(input, block_len);
} else {
size_t vectors_left = num_vectors - this->element_count;
block_len = vectors_left > this->block_size ? this->block_size : vectors_left;
}
for (size_t j = 0; j < block_len; j++) {
auto cur_vec = this->getAllocator()->allocate_unique(this->element_bytes_count);
input.read(static_cast<char *>(cur_vec.get()),
(std::streamsize)this->element_bytes_count);
this->blocks.back().addElement(cur_vec.get());
this->element_count++;
}
}
}

void DataBlocksContainer::shrinkToFit() { this->blocks.shrink_to_fit(); }

size_t DataBlocksContainer::numBlocks() const { return this->blocks.size(); }

#endif
/********************************** Iterator API ************************************************/

DataBlocksContainer::Iterator::Iterator(const DataBlocksContainer &container_)
Expand Down
Loading