Skip to content

Commit

Permalink
CP tiered index 0.7 (#383)
Browse files Browse the repository at this point in the history
  • Loading branch information
alonre24 authored May 24, 2023
1 parent 75f2c93 commit e134c63
Show file tree
Hide file tree
Showing 83 changed files with 11,113 additions and 2,056 deletions.
32 changes: 27 additions & 5 deletions .github/wordlist.txt
Original file line number Diff line number Diff line change
@@ -1,29 +1,51 @@
AVX
BatchIterator
DQ
Datatypes
FP
HDF
HNSW
KNN
RediSearch
RedisAI
SIMD
TBD
TopK
VSCode
VecSimBasics
VecSimGeneral
VecSimUpdatedIndex
VectorSimilarity
ZSH
allocators
ann
benchmarked
benchmarking
byndings
bm
cmake
cpp
dataset
datasets
destructor
devcontainer
dir
enum
fp
frac
gcc
github
gnist
hnsw
hnswlib
mnist
neighbor
pre
py
repo
runtime
templated
tox
valgrind
vecsim
virtualenv
whl
datasets
runtime
RedisAI
dataset
2 changes: 1 addition & 1 deletion .github/workflows/arm.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ jobs:
github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
# Ubuntu 22.04 region AMI for ARM
ec2-image-id: ami-062b37d89f25c958f
ec2-instance-type: t4g.small
ec2-instance-type: t4g.medium
subnet-id: ${{ secrets.AWS_EC2_SUBNET_ID }}
security-group-id: ${{ secrets.AWS_EC2_SG_ID }}

Expand Down
6 changes: 6 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ make clean # remove binary files
make unit_test # run unit tests
CTEST_ARGS=args # extra CTest arguments
VG|VALGRIND=1 # run tests with valgrind
FP_64=1 # run tests with 64-bit floating point
make valgrind # build for Valgrind and run tests
make flow_test # run flow tests (with pytest)
TEST=file::name # run specific test
Expand Down Expand Up @@ -124,6 +125,11 @@ ifeq ($(VERBOSE),1)
CMAKE_FLAGS += -DCMAKE_VERBOSE_MAKEFILE=on
endif

# CMake flags for fp64 unit tests
ifeq ($(FP_64),1)
CMAKE_FLAGS += -DFP64_TESTS=on
endif

CMAKE_FLAGS += \
-Wno-deprecated \
-DCMAKE_WARN_DEPRECATED=OFF \
Expand Down
1 change: 0 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,5 @@ def build_extension(self, ext):
description="Python library around collection of vector similarity algorithm",
long_description="",
ext_modules=[CMakeExtension("VecSim", "src/python_bindings")],
py_modules=['src/python_bindings/Mybytearray'],
cmdclass={"build_ext": CMakeBuild}
)
6 changes: 4 additions & 2 deletions src/VecSim/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,10 @@ add_subdirectory(spaces)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror -Wall")

add_library(VectorSimilarity ${VECSIM_LIBTYPE}
algorithms/brute_force/brute_force_factory.cpp
algorithms/hnsw/hnsw_factory.cpp
index_factories/brute_force_factory.cpp
index_factories/hnsw_factory.cpp
index_factories/tiered_factory.cpp
index_factories/index_factory.cpp
algorithms/brute_force/vector_block.cpp
algorithms/hnsw/visited_nodes_handler.cpp
vec_sim.cpp
Expand Down
16 changes: 10 additions & 6 deletions src/VecSim/algorithms/brute_force/bf_batch_iterator.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@ template <typename DataType, typename DistType>
class BF_BatchIterator : public VecSimBatchIterator {
protected:
const BruteForceIndex<DataType, DistType> *index;
size_t index_label_count; // number of labels in the index when calculating the scores,
// which is the only time we access the index.
vecsim_stl::vector<pair<DistType, labelType>> scores; // vector of scores for every label.
size_t scores_valid_start_pos; // the first index in the scores vector that contains a vector
// that hasn't been returned already.
Expand Down Expand Up @@ -56,13 +58,15 @@ template <typename DataType, typename DistType>
VecSimQueryResult_List
BF_BatchIterator<DataType, DistType>::searchByHeuristics(size_t n_res,
VecSimQueryResult_Order order) {
if ((this->index->indexLabelCount() - this->getResultsCount()) / 1000 > n_res) {
if ((this->index_label_count - this->getResultsCount()) / 1000 > n_res) {
// Heap based search always returns the results ordered by score
return this->heapBasedSearch(n_res);
}
VecSimQueryResult_List rl = this->selectBasedSearch(n_res);
if (order == BY_SCORE) {
sort_results_by_score(rl);
} else if (order == BY_SCORE_THEN_ID) {
sort_results_by_score_then_id(rl);
}
return rl;
}
Expand Down Expand Up @@ -167,17 +171,17 @@ BF_BatchIterator<DataType, DistType>::BF_BatchIterator(
void *query_vector, const BruteForceIndex<DataType, DistType> *bf_index,
VecSimQueryParams *queryParams, std::shared_ptr<VecSimAllocator> allocator)
: VecSimBatchIterator(query_vector, queryParams ? queryParams->timeoutCtx : nullptr, allocator),
index(bf_index), scores(allocator), scores_valid_start_pos(0) {}
index(bf_index), index_label_count(index->indexLabelCount()), scores(allocator),
scores_valid_start_pos(0) {}

template <typename DataType, typename DistType>
VecSimQueryResult_List
BF_BatchIterator<DataType, DistType>::getNextResults(size_t n_res, VecSimQueryResult_Order order) {
assert((order == BY_ID || order == BY_SCORE) &&
"Possible order values are only 'BY_ID' or 'BY_SCORE'");
// Only in the first iteration we need to compute all the scores
if (this->scores.empty()) {
assert(getResultsCount() == 0);

// The only time we access the index. This function also updates the iterator's label count.
auto rc = calculateScores();

if (VecSim_OK != rc) {
Expand All @@ -198,8 +202,8 @@ BF_BatchIterator<DataType, DistType>::getNextResults(size_t n_res, VecSimQueryRe

template <typename DataType, typename DistType>
bool BF_BatchIterator<DataType, DistType>::isDepleted() {
assert(this->getResultsCount() <= this->index->indexLabelCount());
bool depleted = this->getResultsCount() == this->index->indexLabelCount();
assert(this->getResultsCount() <= this->index_label_count);
bool depleted = this->getResultsCount() == this->index_label_count;
return depleted;
}

Expand Down
6 changes: 3 additions & 3 deletions src/VecSim/algorithms/brute_force/bfm_batch_iterator.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,9 @@ class BFM_BatchIterator : public BF_BatchIterator<DataType, DistType> {

private:
inline VecSimQueryResult_Code calculateScores() override {

this->scores.reserve(this->index->indexLabelCount());
vecsim_stl::unordered_map<labelType, DistType> tmp_scores(this->index->indexLabelCount(),
this->index_label_count = this->index->indexLabelCount();
this->scores.reserve(this->index_label_count);
vecsim_stl::unordered_map<labelType, DistType> tmp_scores(this->index_label_count,
this->allocator);
vecsim_stl::vector<VectorBlock *> blocks = this->index->getVectorBlocks();
VecSimQueryResult_Code rc;
Expand Down
4 changes: 2 additions & 2 deletions src/VecSim/algorithms/brute_force/bfs_batch_iterator.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,8 @@ class BFS_BatchIterator : public BF_BatchIterator<DataType, DistType> {

private:
inline VecSimQueryResult_Code calculateScores() override {

this->scores.reserve(this->index->indexLabelCount());
this->index_label_count = this->index->indexLabelCount();
this->scores.reserve(this->index_label_count);
vecsim_stl::vector<VectorBlock *> blocks = this->index->getVectorBlocks();
VecSimQueryResult_Code rc;

Expand Down
Loading

0 comments on commit e134c63

Please sign in to comment.