From d275cc61e715a4b23a3daf7fa40da476e0ca406f Mon Sep 17 00:00:00 2001 From: Pasin Suriyentrakorn Date: Thu, 21 Mar 2024 17:08:51 -0700 Subject: [PATCH] CBL-5500 : Implement Vector Search API * Implemented Vector Search API. * Added a sanity test but disabled (Need to implement structure for automated testing). --- CBL_C.xcodeproj/project.pbxproj | 30 ++++++++- include/cbl/CBLCollection.h | 14 ++++ include/cbl/CBLDatabase.h | 19 ++++++ include/cbl/CBLPrediction.h | 52 +++++++++++++++ include/cbl/CBLQuery.h | 85 ++++++++++++++++++++++-- src/CBLCollection_CAPI.cc | 15 +++++ src/CBLCollection_Internal.hh | 79 ++++++++++++++++++++++ src/CBLDatabase_CAPI.cc | 10 +++ src/CBLDatabase_Internal.hh | 11 +++ src/CBLPrediction.cc | 66 ++++++++++++++++++ src/CBLPrediction_CAPI.cc | 33 +++++++++ src/CBLPrediction_Internal.hh | 45 +++++++++++++ src/CBLVectorIndexConfig.hh | 80 ++++++++++++++++++++++ src/CBLVectorIndexConfig_CAPI.cc | 47 +++++++++++++ src/exports/CBL_EE_Exports.txt | 7 ++ src/exports/generated/CBL_EE.def | 6 ++ src/exports/generated/CBL_EE.exp | 6 ++ src/exports/generated/CBL_EE.gnu | 6 ++ src/exports/generated/CBL_EE_Android.gnu | 6 ++ test/CBLTest.cc | 44 ++++++++++++ test/CBLTest.hh | 10 +++ test/VectorSearchTest.cc | 44 ++++++++++++ test/extensions/README.md | 1 + 23 files changed, 709 insertions(+), 7 deletions(-) create mode 100644 include/cbl/CBLPrediction.h create mode 100644 src/CBLPrediction.cc create mode 100644 src/CBLPrediction_CAPI.cc create mode 100644 src/CBLPrediction_Internal.hh create mode 100644 src/CBLVectorIndexConfig.hh create mode 100644 src/CBLVectorIndexConfig_CAPI.cc create mode 100644 test/VectorSearchTest.cc create mode 100644 test/extensions/README.md diff --git a/CBL_C.xcodeproj/project.pbxproj b/CBL_C.xcodeproj/project.pbxproj index c8810b5d..c4a79bfc 100644 --- a/CBL_C.xcodeproj/project.pbxproj +++ b/CBL_C.xcodeproj/project.pbxproj @@ -86,6 +86,12 @@ 4022546E29355577000FBAC8 /* assets in Resources */ = {isa = PBXBuildFile; fileRef 
= 4022546D29355576000FBAC8 /* assets */; }; 4022546F29355577000FBAC8 /* assets in Resources */ = {isa = PBXBuildFile; fileRef = 4022546D29355576000FBAC8 /* assets */; }; 4022547029355577000FBAC8 /* assets in Resources */ = {isa = PBXBuildFile; fileRef = 4022546D29355576000FBAC8 /* assets */; }; + 406E46E42BACC4BF0088198C /* VectorSearchTest.cc in Sources */ = {isa = PBXBuildFile; fileRef = 406E46D22BACAEFF0088198C /* VectorSearchTest.cc */; }; + 4083FCAF2BA3B8B00061509D /* CBLVectorIndexConfig_CAPI.cc in Sources */ = {isa = PBXBuildFile; fileRef = 4083FCAE2BA3B8B00061509D /* CBLVectorIndexConfig_CAPI.cc */; }; + 4083FCB02BA3B8C50061509D /* CBLVectorIndexConfig.hh in Headers */ = {isa = PBXBuildFile; fileRef = 4083FC9F2BA3A4390061509D /* CBLVectorIndexConfig.hh */; }; + 4083FCBC2BA8DE200061509D /* CBLPrediction_CAPI.cc in Sources */ = {isa = PBXBuildFile; fileRef = 4083FCBB2BA8DE200061509D /* CBLPrediction_CAPI.cc */; }; + 4083FCC02BAA235F0061509D /* CBLPrediction_Internal.hh in Headers */ = {isa = PBXBuildFile; fileRef = 4083FCBF2BAA203A0061509D /* CBLPrediction_Internal.hh */; }; + 4083FCC22BAA38F30061509D /* CBLPrediction.cc in Sources */ = {isa = PBXBuildFile; fileRef = 4083FCC12BAA38F30061509D /* CBLPrediction.cc */; }; 40D1862529B6D1A50061AA85 /* Collection.hh in Headers */ = {isa = PBXBuildFile; fileRef = FCC064BD287CBD95000C5BD7 /* Collection.hh */; }; 40F902D82B9F7B12002EA0A0 /* PrivacyInfo.xcprivacy in Resources */ = {isa = PBXBuildFile; fileRef = 40F902C82B9F7AF6002EA0A0 /* PrivacyInfo.xcprivacy */; }; 42D1B69D2978AD31003B9871 /* CBLUserAgent.mm in Sources */ = {isa = PBXBuildFile; fileRef = 42D1B68F2978AD31003B9871 /* CBLUserAgent.mm */; }; @@ -464,6 +470,13 @@ 27DBD096246C99AF002FD7A7 /* mergeIntoStaticLib.sh */ = {isa = PBXFileReference; lastKnownFileType = text.script.sh; path = mergeIntoStaticLib.sh; sourceTree = ""; }; 27DBD097246C9DE7002FD7A7 /* CBLDatabase+Apple.mm */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.objcpp; path = 
"CBLDatabase+Apple.mm"; sourceTree = ""; }; 4022546D29355576000FBAC8 /* assets */ = {isa = PBXFileReference; lastKnownFileType = folder; path = assets; sourceTree = ""; }; + 406E46D22BACAEFF0088198C /* VectorSearchTest.cc */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = VectorSearchTest.cc; sourceTree = ""; }; + 4083FC9F2BA3A4390061509D /* CBLVectorIndexConfig.hh */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.h; path = CBLVectorIndexConfig.hh; sourceTree = ""; }; + 4083FCAE2BA3B8B00061509D /* CBLVectorIndexConfig_CAPI.cc */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = CBLVectorIndexConfig_CAPI.cc; sourceTree = ""; }; + 4083FCB62BA3F6930061509D /* CBLPrediction.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = CBLPrediction.h; sourceTree = ""; }; + 4083FCBB2BA8DE200061509D /* CBLPrediction_CAPI.cc */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = CBLPrediction_CAPI.cc; sourceTree = ""; }; + 4083FCBF2BAA203A0061509D /* CBLPrediction_Internal.hh */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.h; path = CBLPrediction_Internal.hh; sourceTree = ""; }; + 4083FCC12BAA38F30061509D /* CBLPrediction.cc */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = CBLPrediction.cc; sourceTree = ""; }; 40F902C82B9F7AF6002EA0A0 /* PrivacyInfo.xcprivacy */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.xml; path = PrivacyInfo.xcprivacy; sourceTree = ""; }; 42D1B68F2978AD31003B9871 /* CBLUserAgent.mm */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.objcpp; path = CBLUserAgent.mm; sourceTree = ""; }; 932062DA26BC6B43006917A5 /* CBLQuery.cc */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = CBLQuery.cc; sourceTree = ""; }; @@ -633,6 +646,7 @@ 93C70CD226C4D3F20093E927 /* CBLEncryptable.h */, 277B77C6245B44BE00B222D3 /* CBLLog.h */, 
9320630126BDB340006917A5 /* CBLPlatform.h */, + 4083FCB62BA3F6930061509D /* CBLPrediction.h */, 271C2A3021CAC98F0045856E /* CBLQuery.h */, 271C2A2C21CAC98F0045856E /* CBLReplicator.h */, FCD829B12835ECE0004AA814 /* CBLScope.h */, @@ -654,12 +668,15 @@ 271C2A7721CC750E0045856E /* CBLDocument.cc */, 277FEE7A21ED6C0000B60E3C /* CBLDocument_Internal.hh */, 93EC365D26C498AB00182B02 /* CBLEncryptable_Internal.hh */, + 277FEE7621ED62AA00B60E3C /* CBLReplicatorConfig.hh */, 934AD381270E797D0038D62E /* CBLLog_Internal.hh */, 277B77D4245B44E900B222D3 /* CBLLog.cc */, + 4083FCBF2BAA203A0061509D /* CBLPrediction_Internal.hh */, + 4083FCC12BAA38F30061509D /* CBLPrediction.cc */, 932062DA26BC6B43006917A5 /* CBLQuery.cc */, 27DBCF2E246B4352002FD7A7 /* CBLQuery_Internal.hh */, + 4083FC9F2BA3A4390061509D /* CBLVectorIndexConfig.hh */, 27D11BFF235140E300C58A70 /* CBLReplicator_Internal.hh */, - 277FEE7621ED62AA00B60E3C /* CBLReplicatorConfig.hh */, FCC063C828588DA6000C5BD7 /* CBLScope.cc */, FCD8299C2835AC20004AA814 /* CBLScope_Internal.hh */, AE591AB729473A4400E4BDE8 /* CBLUserAgent.hh */, @@ -729,9 +746,11 @@ 276634182605338300B9BD36 /* CBLDocument_CAPI.cc */, 93C70CE226C5B4BC0093E927 /* CBLEncryptable_CAPI.cc */, 9320631126BDB5CA006917A5 /* CBLPlatform_CAPI+Android.cc */, + 4083FCBB2BA8DE200061509D /* CBLPrediction_CAPI.cc */, 27B61D5521D5ABA60027CCDB /* CBLQuery_CAPI.cc */, 277FEE7421ED3C4900B60E3C /* CBLReplicator_CAPI.cc */, FCD829B22835EE39004AA814 /* CBLScope_CAPI.cc */, + 4083FCAE2BA3B8B00061509D /* CBLVectorIndexConfig_CAPI.cc */, ); name = "C Glue"; sourceTree = ""; @@ -758,6 +777,7 @@ FC645E1F29085CE5007D5536 /* supports */, 27B61D6821D6B60D0027CCDB /* CBLTest.hh */, 27B61D6921D6B60D0027CCDB /* CBLTest.cc */, + 27C9B5F221F7EE670040BC45 /* CBLTest.c */, 27A63F252633408000634F7B /* CBLTest_Cpp.hh */, 275B3593234BDB2700FE9CF0 /* CBLTestsMain.cpp */, 93C70D3326D01B5A0093E927 /* BlobTest.cc */, @@ -778,8 +798,8 @@ FC82CE0428C6E2BD001FA083 /* ReplicatorCollectionTest_Cpp.cc 
*/, 2736A633242E5A74002B9D65 /* ReplicatorEETest.cc */, 93C70D1626CB334D0093E927 /* ReplicatorPropEncTest.cc */, + 406E46D22BACAEFF0088198C /* VectorSearchTest.cc */, 27D30F9123A2D30500392107 /* PerfTest.cc */, - 27C9B5F221F7EE670040BC45 /* CBLTest.c */, 275B3597234C158400FE9CF0 /* CouchbaseLiteTests.mm */, 27DBCF41246B81EE002FD7A7 /* LibC++Debug.cc */, 27B61DD421DEE5DC0027CCDB /* CMakeLists.txt */, @@ -884,12 +904,14 @@ files = ( 277FEE7821ED62AA00B60E3C /* CBLReplicatorConfig.hh in Headers */, 9320630F26BDB408006917A5 /* CBLPlatform.h in Headers */, + 4083FCB02BA3B8C50061509D /* CBLVectorIndexConfig.hh in Headers */, 27DBCF2F246B4352002FD7A7 /* CBLQuery_Internal.hh in Headers */, 27D11BEF2351043B00C58A70 /* ConflictResolver.hh in Headers */, FC5FBBA62821B3450066157F /* CBLCollection_Internal.hh in Headers */, 93C70CE026C4D3F80093E927 /* CBLEncryptable.h in Headers */, 271C2A3321CAC98F0045856E /* CBLDocument.h in Headers */, FCD988BB2821B10300512BBD /* CBLCollection.h in Headers */, + 4083FCC02BAA235F0061509D /* CBLPrediction_Internal.hh in Headers */, FCD8299D2835AC3F004AA814 /* CBLScope_Internal.hh in Headers */, 277B77D3245B44BE00B222D3 /* CBLLog.h in Headers */, FC5FBBB72821E4970066157F /* CBLDatabase_Internal.hh in Headers */, @@ -1506,7 +1528,9 @@ 27DBD098246C9DE7002FD7A7 /* CBLDatabase+Apple.mm in Sources */, FC5FBBB52821CC2E0066157F /* CBLCollection_CAPI.cc in Sources */, 275BC4DE2201323700DBE7D2 /* CBLBlob_CAPI.cc in Sources */, + 4083FCAF2BA3B8B00061509D /* CBLVectorIndexConfig_CAPI.cc in Sources */, 271C2A7821CC750E0045856E /* CBLDocument.cc in Sources */, + 4083FCBC2BA8DE200061509D /* CBLPrediction_CAPI.cc in Sources */, 276634192605338300B9BD36 /* CBLDocument_CAPI.cc in Sources */, 27B61D5621D5ABA60027CCDB /* CBLQuery_CAPI.cc in Sources */, 277FEE7521ED3C4900B60E3C /* CBLReplicator_CAPI.cc in Sources */, @@ -1514,6 +1538,7 @@ 27D11BF02351043B00C58A70 /* ConflictResolver.cc in Sources */, 27886C8E21F64C1400069BEA /* Listener.cc in Sources */, 
FCB96E8329007D37001C4DED /* CBLDefaults_CAPI.cc in Sources */, + 4083FCC22BAA38F30061509D /* CBLPrediction.cc in Sources */, 271C2A7621CC4BD60045856E /* Internal.cc in Sources */, FCC063C928588DA6000C5BD7 /* CBLScope.cc in Sources */, FC5FBBA52821B3450066157F /* CBLCollection.cc in Sources */, @@ -1591,6 +1616,7 @@ FC09078A28A5F40200201B07 /* CollectionTest_Cpp.cc in Sources */, 27D30F9F23A2D30500392107 /* PerfTest.cc in Sources */, FC82CE1328C6E2FD001FA083 /* ReplicatorCollectionTest_Cpp.cc in Sources */, + 406E46E42BACC4BF0088198C /* VectorSearchTest.cc in Sources */, 27DBCF42246B81EE002FD7A7 /* LibC++Debug.cc in Sources */, ); runOnlyForDeploymentPostprocessing = 0; diff --git a/include/cbl/CBLCollection.h b/include/cbl/CBLCollection.h index 38412e87..4f5294ee 100644 --- a/include/cbl/CBLCollection.h +++ b/include/cbl/CBLCollection.h @@ -381,6 +381,20 @@ bool CBLCollection_CreateFullTextIndex(CBLCollection *collection, CBLFullTextIndexConfiguration config, CBLError* _cbl_nullable outError) CBLAPI; +#ifdef COUCHBASE_ENTERPRISE +/** ENTERPRISE EDITION ONLY + + Creates a vector index in the collection. + If an identical index with that name already exists, nothing happens (and no error is returned.) + If a non-identical index with that name already exists, it is deleted and re-created. + */ +bool CBLCollection_CreateVectorIndex(CBLCollection *collection, + FLString name, + CBLVectorIndexConfiguration config, + CBLError* _cbl_nullable outError) CBLAPI; + +#endif + /** Deletes an index in the collection by name. @param collection The collection. @param name The name of the index. diff --git a/include/cbl/CBLDatabase.h b/include/cbl/CBLDatabase.h index d153c150..d116725c 100644 --- a/include/cbl/CBLDatabase.h +++ b/include/cbl/CBLDatabase.h @@ -26,6 +26,25 @@ CBL_CAPI_BEGIN A \ref CBLDatabase is both a filesystem object and a container for documents. 
*/ +#ifdef COUCHBASE_ENTERPRISE + +#ifdef __APPLE__ +#pragma mark - Database Extension +#endif + +/** \name Database Extension + @{ */ + +/** ENTERPRISE EDITION ONLY + + Registers a directory path to load extension libraries from, such as Vector Search. + Must be called before opening a database that will use an extension. */ +void CBL_SetExtensionPath(FLString path) CBLAPI; + +/** @} */ + +#endif + #ifdef __APPLE__ #pragma mark - CONFIGURATION #endif diff --git a/include/cbl/CBLPrediction.h b/include/cbl/CBLPrediction.h new file mode 100644 index 00000000..d2aa1640 --- /dev/null +++ b/include/cbl/CBLPrediction.h @@ -0,0 +1,52 @@ +// +// CBLPrediction.h +// +// Copyright (c) 2024 Couchbase, Inc All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#ifdef COUCHBASE_ENTERPRISE + +#pragma once +#include "CBLBase.h" + +CBL_CAPI_BEGIN + +/** Predictive Model */ +typedef struct { + /** A pointer to any external data needed by the `prediction` callback, which will receive this as its first parameter. */ + void* _cbl_nullable context; + + /** Called from within a query (or document indexing) to run the prediction. + @param context The value of the CBLPredictiveModel's `context` field. + @param input The input dictionary from the query. + @return The output dictionary of the prediction function or NULL if there is no output. + @note The output dictionary will be automatically released after it's being consumed. 
+ @warning This function must be "pure": given the same input parameters it must always + produce the same output (otherwise indexes or queries may be messed up). + It MUST NOT alter the database or any documents, nor run a query: either of + those are very likely to cause a crash. */ + FLDict _cbl_nullable (* _cbl_nonnull prediction)(void* _cbl_nullable context, FLDict input); + + /** Called if the model is unregistered, so it can release resources. */ + void (*_cbl_nullable unregistered)(void* context); +} CBLPredictiveModel; + +void CBL_RegisterPredictiveModel(FLString name, CBLPredictiveModel model) CBLAPI; + +void CBL_UnregisterPredictiveModel(FLString name) CBLAPI; + +CBL_CAPI_END + +#endif diff --git a/include/cbl/CBLQuery.h b/include/cbl/CBLQuery.h index ee26c791..f6311ade 100644 --- a/include/cbl/CBLQuery.h +++ b/include/cbl/CBLQuery.h @@ -250,16 +250,19 @@ CBLResultSet* _cbl_nullable CBLQuery_CopyCurrentResults(const CBLQuery* query, You may find SQLite's documentation particularly helpful since Couchbase Lite's querying is based on SQLite. - Two types of indexes are currently supported: + Three types of indexes are currently supported: * Value indexes speed up queries by making it possible to look up property (or expression) values without scanning every document. They're just like regular indexes in SQL or N1QL. Multiple expressions are supported; the first is the primary key, second is secondary. Expressions must evaluate to scalar types (boolean, number, string). * Full-Text Search (FTS) indexes enable fast search of natural-language words or phrases - by using the `MATCH` operator in a query. A FTS index is **required** for full-text - search: a query with a `MATCH` operator will fail to compile unless there is already a - FTS index for the property/expression being matched. Only a single expression is - currently allowed, and it must evaluate to a string. */ + by using the `MATCH()` function in a query. 
A FTS index is **required** for full-text + search: a query with a `MATCH()` function will fail to compile unless there is already a + FTS index for the property/expression being matched. + * (Enterprise Edition Only) Vector indexes allow efficient search of ML vectors by using + the `VECTOR_MATCH()` function in a query. The `CouchbaseLiteVectorSearch` + extension library is **required** to use the functionality. Use the \ref CBL_SetExtensionPath + function to set the directory path containing the extension library. */ /** Value Index Configuration. */ typedef struct { @@ -310,6 +313,78 @@ typedef struct { FLString language; } CBLFullTextIndexConfiguration; +#ifdef COUCHBASE_ENTERPRISE + +/** An opaque object representing vector encoding config to use in CBLVectorIndexConfiguration. */ +typedef struct CBLVectorEncoding CBLVectorEncoding; + +/** Creates a no-encoding config to use in CBLVectorIndexConfiguration; 4 bytes per dimension, no data loss. */ +_cbl_warn_unused +CBLVectorEncoding* CBLVectorEncoding_CreateNone(void) CBLAPI; + +/** Scalar Quantizer encoding type */ +typedef CBL_ENUM(uint32_t, CBLScalarQuantizerType) { + kCBLSQ4 = 0, ///< 4 bits per dimension + kCBLSQ6, ///< 6 bits per dimension + kCBLSQ8 ///< 8 bits per dimension +}; + +/** Creates a Scalar Quantizer encoding config to use in CBLVectorIndexConfiguration. */ +_cbl_warn_unused +CBLVectorEncoding* CBLVectorEncoding_CreateScalarQuantizer(CBLScalarQuantizerType type) CBLAPI; + +/** Creates a Product Quantizer encoding config to use in CBLVectorIndexConfiguration. */ +_cbl_warn_unused +CBLVectorEncoding* CBLVectorEncoding_CreateProductQuantizer(unsigned subquantizers, unsigned bits) CBLAPI; + +/** Frees a CBLVectorEncoding object. The encoding object can be freed after the index is created. */ +void CBLVectorEncoding_Free(CBLVectorEncoding* _cbl_nullable) CBLAPI; + +/** Distance metric to use in CBLVectorIndexConfiguration. 
*/ +typedef CBL_ENUM(uint32_t, CBLDistanceMetric) { + kCBLDistanceMetricEuclidean = 0, ///< Euclidean distance + kCBLDistanceMetricCosine, ///< Cosine distance (1.0 - Cosine Similarity) +}; + +/** ENTERPRISE EDITION ONLY + + Vector Index Configuration. */ +typedef struct { + /** The language used in the expressions (Required). */ + CBLQueryLanguage expressionLanguage; + + /** An expression returning a vector which is an array of numbers. The expression could be specified + in a JSON Array or in N1QL syntax depending on the expressionLanguage. (Required) */ + FLString expression; + + /** The number of vector dimensions. (Required) */ + unsigned dimensions; + + /** The number of centroids which is the number of buckets to partition the vectors in the index. (Required) */ + unsigned centroids; + + /** Vector encoding type. The default value is 8-bits Scalar Quantizer. */ + CBLVectorEncoding* encoding; + + /** Distance Metric type. The default value is euclidean distance. */ + CBLDistanceMetric metric; + + /** The minimum number of vectors for training the index, an initial process for preparing an index based on the characteristics of the vectors to be indexed. + Prior to training, a full table scan will be performed when the vector_match() function is used in the query. + + The default value is 25 times number of centroids. The number must be more than zero and not greater than maxTrainingSize. + An invalid argument error will be thrown when creating the index if an invalid value is used. */ + unsigned minTrainingSize; + + /** The max number of vectors used when training the index. The default + value is 256 times number of centroids. The number must be more than zero + and not less than minTrainingSize. An invalid argument error will be thrown + when creating the index if an invalid value is used. */ + unsigned maxTrainingSize; +} CBLVectorIndexConfiguration; + +#endif + /** Creates a full-text index. Indexes are persistent. 
If an identical index with that name already exists, nothing happens (and no error is returned.) diff --git a/src/CBLCollection_CAPI.cc b/src/CBLCollection_CAPI.cc index 4fb822be..ee07e4f3 100644 --- a/src/CBLCollection_CAPI.cc +++ b/src/CBLCollection_CAPI.cc @@ -288,6 +288,21 @@ bool CBLCollection_CreateFullTextIndex(CBLCollection *collection, } catchAndBridge(outError) } +#ifdef COUCHBASE_ENTERPRISE + +bool CBLCollection_CreateVectorIndex(CBLCollection *collection, + FLString name, + CBLVectorIndexConfiguration config, + CBLError *outError) noexcept +{ + try { + collection->createVectorIndex(name, config); + return true; + } catchAndBridge(outError) +} + +#endif + bool CBLCollection_DeleteIndex(CBLCollection *collection, FLString name, CBLError *outError) noexcept diff --git a/src/CBLCollection_Internal.hh b/src/CBLCollection_Internal.hh index 14f7dac0..1badecf9 100644 --- a/src/CBLCollection_Internal.hh +++ b/src/CBLCollection_Internal.hh @@ -22,11 +22,15 @@ #include "CBLDocument_Internal.hh" #include "CBLScope_Internal.hh" #include "CBLPrivate.h" +#include "CBLVectorIndexConfig.hh" +#include "Defer.hh" CBL_ASSUME_NONNULL_BEGIN using CollectionSpec = C4Database::CollectionSpec; +using namespace litecore; + struct CBLCollection final : public CBLRefCounted { public: @@ -127,7 +131,82 @@ public: (C4QueryLanguage)config.expressionLanguage, kC4FullTextIndex, &options); } + +#ifdef COUCHBASE_ENTERPRISE + + void createVectorIndex(slice name, CBLVectorIndexConfiguration config) { + if (!config.expression.buf) { + C4Error::raise(LiteCoreDomain, kC4ErrorInvalidParameter, "expression is required."); + } + + if (config.dimensions < 2 || config.dimensions > 2048) { + C4Error::raise(LiteCoreDomain, kC4ErrorInvalidParameter, "dimensions must be >= 2 and <= 2048."); + } + + if (config.centroids < 1 || config.centroids > 64000) { + C4Error::raise(LiteCoreDomain, kC4ErrorInvalidParameter, "centroids must be >= 1 and <= 64000."); + } + + // Use default minTrainingSize if 
minTrainingSize is not specified: + if (config.minTrainingSize == 0) { + config.minTrainingSize = 25 * config.centroids; + } + + // Use default maxTrainingSize if maxTrainingSize is not specified: + if (config.maxTrainingSize == 0) { + config.maxTrainingSize = 256 * config.centroids; + } + + if (config.minTrainingSize < 1) { + C4Error::raise(LiteCoreDomain, kC4ErrorInvalidParameter, "minTrainingSize must be > 0."); + } else if (config.maxTrainingSize < 1) { + C4Error::raise(LiteCoreDomain, kC4ErrorInvalidParameter, "maxTrainingSize must be > 0."); + } else if (config.minTrainingSize > config.maxTrainingSize) { + C4Error::raise(LiteCoreDomain, kC4ErrorInvalidParameter, "minTrainingSize must be <= maxTrainingSize."); + } + + // Use default encoding if encoding is not specified: + CBLVectorEncoding* enc = nullptr; + if (!config.encoding) { + enc = CBLVectorEncoding_CreateScalarQuantizer(kCBLSQ8); + config.encoding = enc; + } + DEFER { + CBLVectorEncoding_Free(enc); + }; + + C4VectorEncoding c4enc = config.encoding->c4encoding(); + if (c4enc.type == kC4VectorEncodingPQ) { + if (c4enc.pq_subquantizers < 2) { + C4Error::raise(LiteCoreDomain, kC4ErrorInvalidParameter, "Product Quantizer's subquantizers must be > 1."); + } else if (config.dimensions % c4enc.pq_subquantizers != 0) { + C4Error::raise(LiteCoreDomain, kC4ErrorInvalidParameter, "Product Quantizer's subquantizers must be a factor of dimensions."); + } else if (c4enc.bits < 4 || c4enc.bits > 12){ + C4Error::raise(LiteCoreDomain, kC4ErrorInvalidParameter, "Product Quantizer's bits must be >= 4 and <= 12."); + } + } + + C4VectorIndexOptions vector {}; + vector.clustering = {}; + vector.clustering.type = kC4VectorClusteringFlat; + vector.clustering.flat_centroids = config.centroids; + vector.dimensions = config.dimensions; + vector.metric = config.metric == kCBLDistanceMetricCosine ? 
kC4VectorMetricCosine : kC4VectorMetricEuclidean; + vector.encoding = c4enc; + vector.minTrainingSize = config.minTrainingSize; + vector.maxTrainingSize = config.maxTrainingSize; + vector.numProbes = 0; + + C4IndexOptions options {}; + options.vector = vector; + + _c4col.useLocked()->createIndex(name, config.expression, + (C4QueryLanguage)config.expressionLanguage, + kC4VectorIndex, &options); + } +#endif + void deleteIndex(slice name) { _c4col.useLocked()->deleteIndex(name); } diff --git a/src/CBLDatabase_CAPI.cc b/src/CBLDatabase_CAPI.cc index d61ad4bb..b8dc21d2 100644 --- a/src/CBLDatabase_CAPI.cc +++ b/src/CBLDatabase_CAPI.cc @@ -458,3 +458,13 @@ bool CBLDatabase_SaveBlob(CBLDatabase* db, CBLBlob* blob, return true; } catchAndBridge(outError) } + +#pragma mark - EXTENSION: + +#ifdef COUCHBASE_ENTERPRISE + +void CBL_SetExtensionPath(FLString path) noexcept { + CBLDatabase::setExtensionPath(path); +} + +#endif diff --git a/src/CBLDatabase_Internal.hh b/src/CBLDatabase_Internal.hh index 01eeb54b..b809b9ed 100644 --- a/src/CBLDatabase_Internal.hh +++ b/src/CBLDatabase_Internal.hh @@ -45,6 +45,17 @@ namespace cbl_internal { struct CBLDatabase final : public CBLRefCounted { public: +#ifdef COUCHBASE_ENTERPRISE + +#pragma mark - Database Extension: + + static void setExtensionPath(slice path) { + CBLLog_Init(); + C4Database::setExtensionPath(path); + } + +#endif + #pragma mark - Lifecycle: static CBLDatabaseConfiguration defaultConfiguration() { diff --git a/src/CBLPrediction.cc b/src/CBLPrediction.cc new file mode 100644 index 00000000..975c434a --- /dev/null +++ b/src/CBLPrediction.cc @@ -0,0 +1,66 @@ +// +// CBLPrediction.cc +// +// Copyright (C) 2024 Couchbase, Inc All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "CBLPrediction_Internal.hh" +#include "c4PredictiveQuery.h" + +namespace cbl_internal { + using namespace std; + using namespace fleece; + + void PredictiveModel::registerModel(const slice name, const CBLPredictiveModel& model) { + auto prediction = [](void* context, FLDict input, C4Database *db, C4Error *outError) { + auto m = (PredictiveModel*)context; + FLDict output = m->_model.prediction(m->_model.context, input); + return C4SliceResult(m->encodeOutput(Dict(output))); + }; + + auto unregistered = [](void* context) { + auto m = (PredictiveModel*)context; + if (m->_model.unregistered) { + m->_model.unregistered(m->_model.context); + } + delete m; + }; + + unregisterModel(name); + + C4PredictiveModel c4model = { + .context = new PredictiveModel(model), + .prediction = prediction, + .unregistered = unregistered + }; + auto nameStr = name.asString(); + c4pred_registerModel(nameStr.c_str(), c4model); + } + + void PredictiveModel::unregisterModel(slice name) { + auto nameStr = name.asString(); + c4pred_unregisterModel(nameStr.c_str()); + } + + alloc_slice PredictiveModel::encodeOutput(Dict dict) { + if (!dict) { + return nullslice; + } + + Encoder enc; + enc.writeValue(dict); + return enc.finish(); + } +} diff --git a/src/CBLPrediction_CAPI.cc b/src/CBLPrediction_CAPI.cc new file mode 100644 index 00000000..c402acbd --- /dev/null +++ b/src/CBLPrediction_CAPI.cc @@ -0,0 +1,33 @@ +// +// CBLPrediction_CAPI.cc +// +// Copyright (C) 2024 Couchbase, Inc All rights reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#ifdef COUCHBASE_ENTERPRISE + +#include "CBLPrediction_Internal.hh" + +using namespace cbl_internal; + +void CBL_RegisterPredictiveModel(FLString name, CBLPredictiveModel model) noexcept { + PredictiveModel::registerModel(name, model); +} + +void CBL_UnregisterPredictiveModel(FLString name) noexcept { + PredictiveModel::unregisterModel(name); +} + +#endif diff --git a/src/CBLPrediction_Internal.hh b/src/CBLPrediction_Internal.hh new file mode 100644 index 00000000..d85cd3d4 --- /dev/null +++ b/src/CBLPrediction_Internal.hh @@ -0,0 +1,45 @@ +// +// CBLPrediction_Internal.hh +// +// Copyright (C) 2024 Couchbase, Inc All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +#ifdef COUCHBASE_ENTERPRISE + +#pragma once +#include "CBLPrediction.h" +#include "fleece/Fleece.hh" +#include "fleece/slice.hh" + +namespace cbl_internal { + using namespace fleece; + + struct PredictiveModel { + static void registerModel(const slice name, const CBLPredictiveModel& model); + + static void unregisterModel(const slice name); + + private: + PredictiveModel(const CBLPredictiveModel& model) : _model(model) { } + + ~PredictiveModel() = default; + + alloc_slice encodeOutput(Dict dict); + + CBLPredictiveModel const _model; + }; +} + +#endif diff --git a/src/CBLVectorIndexConfig.hh b/src/CBLVectorIndexConfig.hh new file mode 100644 index 00000000..7d2d4b39 --- /dev/null +++ b/src/CBLVectorIndexConfig.hh @@ -0,0 +1,80 @@ +// +// CBLVectorIndexConfig.hh +// +// Copyright (C) 2024 Couchbase, Inc All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +#ifdef COUCHBASE_ENTERPRISE + +#include "CBLQuery.h" +#include "c4IndexTypes.h" + +CBL_ASSUME_NONNULL_BEGIN + +struct CBLVectorEncoding { + virtual ~CBLVectorEncoding() =default; + virtual const C4VectorEncoding& c4encoding() const =0; +}; + +namespace cbl_internal { + struct CBLVectorEncodingNone : public CBLVectorEncoding { + CBLVectorEncodingNone() { } + + virtual const C4VectorEncoding& c4encoding() const override { + return _encoding; + } + + private: + C4VectorEncoding _encoding { kC4VectorEncodingNone }; + }; + + struct CBLVectorEncodingSQ : public CBLVectorEncoding { + CBLVectorEncodingSQ(CBLScalarQuantizerType type) + :_type(type) { + switch (type) { + case kCBLSQ4: _encoding.bits = 4; break; + case kCBLSQ6: _encoding.bits = 6; break; + case kCBLSQ8: _encoding.bits = 8; break; + default: C4Error::raise(LiteCoreDomain, kC4ErrorInvalidParameter, "Invalid Scalar Quantizer Type"); + } + } + + virtual const C4VectorEncoding& c4encoding() const override { + return _encoding; + } + + private: + C4VectorEncoding _encoding { kC4VectorEncodingSQ }; + CBLScalarQuantizerType _type; + }; + + struct CBLVectorEncodingPQ : public CBLVectorEncoding { + CBLVectorEncodingPQ(unsigned subquantizer, unsigned bits) { + _encoding.pq_subquantizers = subquantizer; + _encoding.bits = bits; + } + + virtual const C4VectorEncoding& c4encoding() const override { + return _encoding; + } + + private: + C4VectorEncoding _encoding { kC4VectorEncodingPQ }; + }; +} + +CBL_ASSUME_NONNULL_END + +#endif diff --git a/src/CBLVectorIndexConfig_CAPI.cc b/src/CBLVectorIndexConfig_CAPI.cc new file mode 100644 index 00000000..e3030e68 --- /dev/null +++ b/src/CBLVectorIndexConfig_CAPI.cc @@ -0,0 +1,47 @@ +// +// CBLVectorIndexConfig_CAPI.cc +// +// Copyright (C) 2024 Couchbase, Inc All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#ifdef COUCHBASE_ENTERPRISE
+
+#include "CBLQuery.h"
+#include "CBLVectorIndexConfig.hh"
+#include "Internal.hh"
+// Each factory below heap-allocates an encoding with `new`; CBLVectorEncoding_Free() is the matching `delete`.
+CBLVectorEncoding* CBLVectorEncoding_CreateNone() noexcept {
+    try {
+        return new CBLVectorEncodingNone();
+    } catchAndWarn()   // project macro (not defined in this file) supplying the catch handler; presumably warns and returns null - TODO confirm
+}
+// An unsupported `type` makes the CBLVectorEncodingSQ constructor raise inside the try block.
+CBLVectorEncoding* CBLVectorEncoding_CreateScalarQuantizer(CBLScalarQuantizerType type) noexcept {
+    try {
+        return new CBLVectorEncodingSQ(type);
+    } catchAndWarn()
+}
+// Arguments are forwarded unchanged to the CBLVectorEncodingPQ constructor.
+CBLVectorEncoding* CBLVectorEncoding_CreateProductQuantizer(unsigned subquantizer, unsigned bits) noexcept {
+    try {
+        return new CBLVectorEncodingPQ(subquantizer, bits);
+    } catchAndWarn()
+}
+// Destroys an encoding created by one of the factories above.
+void CBLVectorEncoding_Free(CBLVectorEncoding *enc) noexcept {
+    delete enc;   // `delete` on a null pointer is a no-op in C++
+}
+
+#endif
diff --git a/src/exports/CBL_EE_Exports.txt b/src/exports/CBL_EE_Exports.txt
index 42b4a254..a22ec7b6 100644
--- a/src/exports/CBL_EE_Exports.txt
+++ b/src/exports/CBL_EE_Exports.txt
@@ -30,3 +30,10 @@ CBLEncryptable_Value
 FLDict_IsEncryptableValue
 FLDict_GetEncryptableValue
 FLSlot_SetEncryptableValue
+
+CBL_SetExtensionPath
+CBLCollection_CreateVectorIndex
+CBLVectorEncoding_CreateNone
+CBLVectorEncoding_CreateProductQuantizer
+CBLVectorEncoding_CreateScalarQuantizer
+CBLVectorEncoding_Free
diff --git a/src/exports/generated/CBL_EE.def b/src/exports/generated/CBL_EE.def
index 35fd19f1..f0092e1e 100644
--- a/src/exports/generated/CBL_EE.def
+++ b/src/exports/generated/CBL_EE.def
@@ -23,6 +23,12 @@ CBLEncryptable_Value
 FLDict_IsEncryptableValue
 FLDict_GetEncryptableValue
 FLSlot_SetEncryptableValue
+CBL_SetExtensionPath
+CBLCollection_CreateVectorIndex +CBLVectorEncoding_CreateNone +CBLVectorEncoding_CreateProductQuantizer +CBLVectorEncoding_CreateScalarQuantizer +CBLVectorEncoding_Free CBL_Retain CBL_Release CBL_InstanceCount diff --git a/src/exports/generated/CBL_EE.exp b/src/exports/generated/CBL_EE.exp index 1c51f750..de7a2424 100644 --- a/src/exports/generated/CBL_EE.exp +++ b/src/exports/generated/CBL_EE.exp @@ -31,6 +31,12 @@ _CBLEncryptable_Value _FLDict_IsEncryptableValue _FLDict_GetEncryptableValue _FLSlot_SetEncryptableValue +_CBL_SetExtensionPath +_CBLCollection_CreateVectorIndex +_CBLVectorEncoding_CreateNone +_CBLVectorEncoding_CreateProductQuantizer +_CBLVectorEncoding_CreateScalarQuantizer +_CBLVectorEncoding_Free _CBL_Retain _CBL_Release _CBL_InstanceCount diff --git a/src/exports/generated/CBL_EE.gnu b/src/exports/generated/CBL_EE.gnu index d5b77d5b..f5a0ab77 100644 --- a/src/exports/generated/CBL_EE.gnu +++ b/src/exports/generated/CBL_EE.gnu @@ -21,6 +21,12 @@ CBL_C { FLDict_IsEncryptableValue; FLDict_GetEncryptableValue; FLSlot_SetEncryptableValue; + CBL_SetExtensionPath; + CBLCollection_CreateVectorIndex; + CBLVectorEncoding_CreateNone; + CBLVectorEncoding_CreateProductQuantizer; + CBLVectorEncoding_CreateScalarQuantizer; + CBLVectorEncoding_Free; CBL_Retain; CBL_Release; CBL_InstanceCount; diff --git a/src/exports/generated/CBL_EE_Android.gnu b/src/exports/generated/CBL_EE_Android.gnu index 504526bc..4f64f65a 100644 --- a/src/exports/generated/CBL_EE_Android.gnu +++ b/src/exports/generated/CBL_EE_Android.gnu @@ -22,6 +22,12 @@ CBL_C { FLDict_IsEncryptableValue; FLDict_GetEncryptableValue; FLSlot_SetEncryptableValue; + CBL_SetExtensionPath; + CBLCollection_CreateVectorIndex; + CBLVectorEncoding_CreateNone; + CBLVectorEncoding_CreateProductQuantizer; + CBLVectorEncoding_CreateScalarQuantizer; + CBLVectorEncoding_Free; CBL_Retain; CBL_Release; CBL_InstanceCount; diff --git a/test/CBLTest.cc b/test/CBLTest.cc index 5dcbda1b..559b6f43 100644 --- a/test/CBLTest.cc 
+++ b/test/CBLTest.cc
@@ -51,6 +51,27 @@ using namespace fleece;
     }
 #endif
 
+#ifdef COUCHBASE_ENTERPRISE
+
+static string sExtensionPath;   // stays empty until an extension directory has been found and registered
+// One-time setup: looks up the vector search extension directory and, if found, registers it with CBL.
+void CBLTest::initVectorSearchExtension() {
+    static std::once_flag sOnce;   // must be static: a fresh per-call flag would let call_once run the lambda every time
+    std::call_once(sOnce, [] {
+        auto path = GetExtensionPath();
+        if (!path.empty()) {
+            CBL_SetExtensionPath(slice(path));
+            sExtensionPath = path;
+        }
+    });
+}
+// True once initVectorSearchExtension() has recorded a non-empty extension directory.
+bool CBLTest::hasVectorSearchExtension() {
+    return !sExtensionPath.empty();
+}
+
+#endif
+
 static alloc_slice sDatabaseDir;
 
 alloc_slice CBLTest::databaseDir() {
@@ -105,6 +126,10 @@ CBLTest::CBLTest() {
     CHECK(FLValue_GetType(kFLUndefinedValue) == kFLUndefined);
     CHECK(FLValue_GetType((FLValue)kFLEmptyArray) == kFLArray);
     CHECK(FLValue_GetType((FLValue)kFLEmptyDict) == kFLDict);
+
+#ifdef COUCHBASE_ENTERPRISE
+    initVectorSearchExtension();
+#endif
 
     CBLError error;
     auto config = databaseConfig();
@@ -211,6 +236,25 @@ string GetTestFilePath(const std::string &filename) {
     return sTestFilesPath + filename;
 }
 
+#ifdef COUCHBASE_ENTERPRISE
+// Returns the directory containing the CouchbaseLiteVectorSearch library, or "" when it cannot be located.
+string GetExtensionPath() {
+#ifdef __APPLE__
+    auto bundle = CFBundleGetBundleWithIdentifier(CFSTR("com.couchbase.CouchbaseLiteTests"));
+    if (!bundle) {   // NOTE(review): the lookup only runs when the test bundle is absent; with a bundle this returns "" - confirm intended
+        string dir = "test/extensions/";
+        string libPath = dir + "CouchbaseLiteVectorSearch";
+        ifstream fin(libPath);
+        if (fin.good()) {   // the library file exists and can be opened
+            return dir;
+        }
+    }
+#endif // __APPLE__
+    return "";   // every non-Apple build ends up here
+}
+
+#endif
+
 bool ReadFileByLines(const string &path, const function &callback) {
     INFO("Reading lines from " << path);
     fstream fd(path.c_str(), ios_base::in);
diff --git a/test/CBLTest.hh b/test/CBLTest.hh
index 027cd76c..67803b0c 100644
--- a/test/CBLTest.hh
+++ b/test/CBLTest.hh
@@ -80,6 +80,12 @@ public:
 
     static CBLDatabaseConfiguration databaseConfig();
 
+#ifdef COUCHBASE_ENTERPRISE
+    static void initVectorSearchExtension();
+
+    bool hasVectorSearchExtension();
+#endif
+
     CBLTest();
     ~CBLTest();
 
@@ -89,6 +95,10 @@ public:
 
 std::string GetTestFilePath(const std::string &filename);
 
+#ifdef COUCHBASE_ENTERPRISE
+std::string
GetExtensionPath();
+#endif
+
 bool ReadFileByLines(const std::string &path, const std::function &callback);
 
 unsigned ImportJSONLines(std::string filename, CBLDatabase* database);
diff --git a/test/VectorSearchTest.cc b/test/VectorSearchTest.cc
new file mode 100644
index 00000000..93cb26ac
--- /dev/null
+++ b/test/VectorSearchTest.cc
@@ -0,0 +1,44 @@
+//
+// VectorSearchTest.cc
+//
+// Copyright © 2024 Couchbase. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#include "CBLTest.hh"
+#include "CBLPrivate.h"
+
+using namespace fleece;
+using namespace std;
+
+#ifdef COUCHBASE_ENTERPRISE
+// Fixture for vector search tests; adds nothing of its own on top of CBLTest.
+class VectorSearchTest : public CBLTest { };
+
+#ifdef VECTOR_SEARCH_TEST_ENABLED
+// Only compiled when VECTOR_SEARCH_TEST_ENABLED is defined; checks that creating a vector index reports success.
+TEST_CASE_METHOD(VectorSearchTest, "Vector Index Sanity Test", "[VectorSearch]") {
+    CBLVectorIndexConfiguration config = {};   // fields not assigned below stay value-initialized
+    config.expressionLanguage = kCBLN1QLLanguage;
+    config.expression = "vector"_sl;
+    config.dimensions = 300;
+    config.centroids = 8;
+
+    CBLError error {};
+    CHECK(CBLCollection_CreateVectorIndex(defaultCollection, "vector_index"_sl, config, &error));   // defaultCollection is not declared in this file; presumably a CBLTest member - confirm
+}
+
+#endif
+
+#endif
diff --git a/test/extensions/README.md b/test/extensions/README.md
new file mode 100644
index 00000000..ae45e9c3
--- /dev/null
+++ b/test/extensions/README.md
@@ -0,0 +1 @@
+Download and put CouchbaseLiteVectorSearch library here.
\ No newline at end of file