Optimize pgvector test for semi-recent enhancements

This commit adds several changes to the pgvector test to create a more representative test environment based on recent and older changes to pgvector. Notable changes include allowing for testing of parallel index buiding parameters, using loading with the recommended binary loading method, and other changes to better emulate what a typical user of pgvector would do. This commit also has some general cleanups as well. Co-authored-by: Mark Greenhalgh <[email protected]> Co-authored-by: Tyler House <[email protected]>
zilliztech · May 15, 2024 · c4fc7c1 · c4fc7c1
1 parent 2a64abe
commit c4fc7c1
Show file tree

Hide file tree

Showing 7 changed files with 507 additions and 145 deletions.
diff --git a/pyproject.toml b/pyproject.toml
@@ -59,13 +59,14 @@ all = [
     "redis",
     "chromadb",
     "psycopg2",
+    "psycopg",
 ]
 
 qdrant = [ "qdrant-client" ]
 pinecone = [ "pinecone-client" ]
 weaviate = [ "weaviate-client" ]
 elastic = [ "elasticsearch" ]
-pgvector = [ "pgvector", "psycopg2" ]
+pgvector = [ "pgvector", "psycopg" ]
 pgvecto_rs = [ "psycopg2" ]
 redis = [ "redis" ]
 chromadb = [ "chromadb" ]

diff --git a/vectordb_bench/backend/clients/api.py b/vectordb_bench/backend/clients/api.py
@@ -20,6 +20,7 @@ class IndexType(str, Enum):
     Flat = "FLAT"
     AUTOINDEX = "AUTOINDEX"
     ES_HNSW = "hnsw"
+    ES_IVFFlat = "ivfflat"
     GPU_IVF_FLAT = "GPU_IVF_FLAT"
     GPU_IVF_PQ = "GPU_IVF_PQ"
     GPU_CAGRA = "GPU_CAGRA"

diff --git a/vectordb_bench/backend/clients/pgvector/config.py b/vectordb_bench/backend/clients/pgvector/config.py
@@ -1,29 +1,62 @@
+from abc import abstractmethod
+from typing import Any, Mapping, Optional, Sequence, TypedDict
 from pydantic import BaseModel, SecretStr
-from ..api import DBConfig, DBCaseConfig, IndexType, MetricType
+from typing_extensions import LiteralString
+from ..api import DBCaseConfig, DBConfig, IndexType, MetricType
 
 POSTGRE_URL_PLACEHOLDER = "postgresql://%s:%s@%s/%s"
 
+
+class PgVectorConfigDict(TypedDict):
+    """These keys will be directly used as kwargs in psycopg connection string,
+        so the names must match exactly psycopg API"""
+
+    user: str
+    password: str
+    host: str
+    port: int
+    dbname: str
+
+
 class PgVectorConfig(DBConfig):
-    user_name: SecretStr = "postgres"
+    user_name: SecretStr = SecretStr("postgres")
     password: SecretStr
     host: str = "localhost"
     port: int = 5432
     db_name: str
 
-    def to_dict(self) -> dict:
+    def to_dict(self) -> PgVectorConfigDict:
         user_str = self.user_name.get_secret_value()
         pwd_str = self.password.get_secret_value()
         return {
-            "host" : self.host,
-            "port" : self.port,
-            "dbname" : self.db_name,
-            "user" : user_str,
-            "password" : pwd_str
+            "host": self.host,
+            "port": self.port,
+            "dbname": self.db_name,
+            "user": user_str,
+            "password": pwd_str,
         }
 
+
+class PgVectorIndexParam(TypedDict):
+    metric: str
+    index_type: str
+    index_creation_with_options: Sequence[dict[str, Any]]
+    maintenance_work_mem: Optional[str]
+    max_parallel_workers: Optional[int]
+
+
+class PgVectorSearchParam(TypedDict):
+    metric_fun_op: LiteralString
+
+
+class PgVectorSessionCommands(TypedDict):
+    session_options: Sequence[dict[str, Any]]
+
+
 class PgVectorIndexConfig(BaseModel, DBCaseConfig):
     metric_type: MetricType | None = None
-    index: IndexType
+    create_index_before_load: bool = False
+    create_index_after_load: bool = True
 
     def parse_metric(self) -> str:
         if self.metric_type == MetricType.L2:
@@ -32,7 +65,7 @@ def parse_metric(self) -> str:
             return "vector_ip_ops"
         return "vector_cosine_ops"
 
-    def parse_metric_fun_op(self) -> str:
+    def parse_metric_fun_op(self) -> LiteralString:
         if self.metric_type == MetricType.L2:
             return "<->"
         elif self.metric_type == MetricType.IP:
@@ -46,48 +79,137 @@ def parse_metric_fun_str(self) -> str:
             return "max_inner_product"
         return "cosine_distance"
 
+    @abstractmethod
+    def index_param(self) -> PgVectorIndexParam:
+        ...
+
+    @abstractmethod
+    def search_param(self) -> PgVectorSearchParam:
+        ...
+
+    @abstractmethod
+    def session_param(self) -> PgVectorSessionCommands:
+        ...
+
+    @staticmethod
+    def _optionally_build_with_options(with_options: Mapping[str, Any]) -> Sequence[dict[str, Any]]:
+        """Walk through mappings, creating a List of {key1 = value} pairs. That will be used to build a where clause"""
+        options = []
+        for option_name, value in with_options.items():
+            if value is not None:
+                options.append(
+                    {
+                        "option_name": option_name,
+                        "val": str(value),
+                    }
+                )
+        return options
+
+    @staticmethod
+    def _optionally_build_set_options(
+        set_mapping: Mapping[str, Any]
+    ) -> Sequence[dict[str, Any]]:
+        """Walk through options, creating 'SET 'key1 = "value1";' commands"""
+        session_options = []
+        for setting_name, value in set_mapping.items():
+            if value:
+                session_options.append(
+                    {"parameter": {
+                            "setting_name": setting_name,
+                            "val": str(value),
+                        },
+                    }
+                )
+        return session_options
+
+
+class PgVectorIVFFlatConfig(PgVectorIndexConfig):
+    """
+    An IVFFlat index divides vectors into lists, and then searches a subset of those lists that are
+    closest to the query vector. It has faster build times and uses less memory than HNSW,
+    but has lower query performance (in terms of speed-recall tradeoff).
+
+    Three keys to achieving good recall are:
+
+    Create the index after the table has some data
+    Choose an appropriate number of lists - a good place to start is rows / 1000 for up to 1M rows and sqrt(rows) for
+    over 1M rows.
+    When querying, specify an appropriate number of probes (higher is better for recall, lower is better for speed) -
+    a good place to start is sqrt(lists)
+    """
+
+    lists: int | None
+    probes: int | None
+    index: IndexType = IndexType.ES_IVFFlat
+    maintenance_work_mem: Optional[str] = None
+    max_parallel_workers: Optional[int] = None
+
+    def index_param(self) -> PgVectorIndexParam:
+        index_parameters = {"lists": self.lists}
+        return {
+            "metric": self.parse_metric(),
+            "index_type": self.index.value,
+            "index_creation_with_options": self._optionally_build_with_options(
+                index_parameters
+            ),
+            "maintenance_work_mem": self.maintenance_work_mem,
+            "max_parallel_workers": self.max_parallel_workers,
+        }
 
-
-class HNSWConfig(PgVectorIndexConfig):
-    M: int
-    efConstruction: int
-    ef: int | None = None
-    index: IndexType = IndexType.HNSW
-
-    def index_param(self) -> dict:
+    def search_param(self) -> PgVectorSearchParam:
         return {
-            "m" : self.M,
-            "ef_construction" : self.efConstruction,
-            "metric" : self.parse_metric()
+            "metric_fun_op": self.parse_metric_fun_op(),
         }
 
-    def search_param(self) -> dict:
+    def session_param(self) -> PgVectorSessionCommands:
+        session_parameters = {"ivfflat.probes": self.probes}
         return {
-            "ef" : self.ef,
-            "metric_fun" : self.parse_metric_fun_str(),
-            "metric_fun_op" : self.parse_metric_fun_op(),
+            "session_options": self._optionally_build_set_options(session_parameters)
         }
 
 
-class IVFFlatConfig(PgVectorIndexConfig):
-    lists: int | None = 1000
-    probes: int | None = 10
-    index: IndexType = IndexType.IVFFlat
+class PgVectorHNSWConfig(PgVectorIndexConfig):
+    """
+    An HNSW index creates a multilayer graph. It has better query performance than IVFFlat (in terms of
+    speed-recall tradeoff), but has slower build times and uses more memory. Also, an index can be
+    created without any data in the table since there isn't a training step like IVFFlat.
+    """
+
+    m: int | None  # DETAIL:  Valid values are between "2" and "100".
+    ef_construction: (
+        int | None
+    )  # ef_construction must be greater than or equal to 2 * m
+    ef_search: int | None
+    index: IndexType = IndexType.ES_HNSW
+    maintenance_work_mem: Optional[str] = None
+    max_parallel_workers: Optional[int] = None
+
+    def index_param(self) -> PgVectorIndexParam:
+        index_parameters = {"m": self.m, "ef_construction": self.ef_construction}
+        return {
+            "metric": self.parse_metric(),
+            "index_type": self.index.value,
+            "index_creation_with_options": self._optionally_build_with_options(
+                index_parameters
+            ),
+            "maintenance_work_mem": self.maintenance_work_mem,
+            "max_parallel_workers": self.max_parallel_workers,
+        }
 
-    def index_param(self) -> dict:
+    def search_param(self) -> PgVectorSearchParam:
         return {
-            "lists" : self.lists,
-            "metric" : self.parse_metric()
+            "metric_fun_op": self.parse_metric_fun_op(),
         }
 
-    def search_param(self) -> dict:
+    def session_param(self) -> PgVectorSessionCommands:
+        session_parameters = {"hnsw.ef_search": self.ef_search}
         return {
-            "probes" : self.probes,
-            "metric_fun" : self.parse_metric_fun_str(),
-            "metric_fun_op" : self.parse_metric_fun_op(),
+            "session_options": self._optionally_build_set_options(session_parameters)
         }
 
+
 _pgvector_case_config = {
-    IndexType.HNSW: HNSWConfig,
-    IndexType.IVFFlat: IVFFlatConfig,
+        IndexType.HNSW: PgVectorHNSWConfig,
+        IndexType.ES_HNSW: PgVectorHNSWConfig,
+        IndexType.IVFFlat: PgVectorIVFFlatConfig,
 }