diff --git a/docs/docs/examples/vector_stores/ObjectBoxIndexDemo.ipynb b/docs/docs/examples/vector_stores/ObjectBoxIndexDemo.ipynb
new file mode 100644
index 0000000000000..2c7604e374cab
--- /dev/null
+++ b/docs/docs/examples/vector_stores/ObjectBoxIndexDemo.ipynb
@@ -0,0 +1,294 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# ObjectBox VectorStore Demo\n",
+    "\n",
+    "This notebook demonstrates the use of [ObjectBox](https://objectbox.io/) as an efficient, on-device vector store with LlamaIndex. We consider a simple RAG use case where, given a document, the user can ask questions and get relevant answers from an LLM in natural language. The RAG pipeline is built from the following components:\n",
+    "\n",
+    "* The built-in [`SimpleDirectoryReader` reader](https://docs.llamaindex.ai/en/stable/examples/data_connectors/simple_directory_reader/) from LlamaIndex\n",
+    "* The built-in [`SentenceSplitter` node-parser](https://docs.llamaindex.ai/en/stable/api_reference/node_parsers/sentence_splitter/) from LlamaIndex\n",
+    "* Models from [HuggingFace as embedding providers](https://docs.llamaindex.ai/en/stable/examples/embeddings/huggingface/)\n",
+    "* [ObjectBox](https://objectbox.io/) as the NoSQL and vector datastore\n",
+    "* Google's [Gemini](https://docs.llamaindex.ai/en/stable/examples/llm/gemini/) as a remote LLM service\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 1) Installing dependencies\n",
+    "\n",
+    "We install the HuggingFace and Gemini integrations to use alongside LlamaIndex."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!pip install llama-index-vector-stores-objectbox --quiet\n",
+    "!pip install llama-index --quiet\n",
+    "!pip install llama-index-embeddings-huggingface --quiet\n",
+    "!pip install llama-index-llms-gemini --quiet"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 2) Downloading the documents"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!mkdir -p 'data/paul_graham/'\n",
+    "!wget 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/paul_graham/paul_graham_essay.txt' -O 'data/paul_graham/paul_graham_essay.txt'"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 3) Set up an LLM for RAG (Gemini)\n",
+    "\n",
+    "We use Google Gemini's cloud-based API as the LLM. You can get an API key from the [console](https://aistudio.google.com/app/apikey)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from llama_index.llms.gemini import Gemini\n",
+    "import getpass\n",
+    "\n",
+    "gemini_key_api = getpass.getpass(\"Gemini API Key: \")\n",
+    "gemini_llm = Gemini(api_key=gemini_key_api)"
+   ]
+  },
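+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "As an optional sanity check, we can send a short prompt to Gemini to confirm that the API key works before building the rest of the pipeline."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Optional: a quick smoke test of the Gemini LLM\n",
+    "print(gemini_llm.complete(\"Say hello in one short sentence.\"))"
+   ]
+  },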
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 4) Set up an embedding model for RAG (HuggingFace `bge-small-en-v1.5`)\n",
+    "\n",
+    "HuggingFace hosts a variety of embedding models, which can be compared on the [MTEB Leaderboard](https://huggingface.co/spaces/mteb/leaderboard)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from llama_index.embeddings.huggingface import HuggingFaceEmbedding\n",
+    "\n",
+    "hf_embedding = HuggingFaceEmbedding(model_name=\"BAAI/bge-small-en-v1.5\")\n",
+    "embedding_dim = 384  # bge-small-en-v1.5 produces 384-dimensional embeddings"
+   ]
+  },
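+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Optionally, we can verify that the model produces vectors with exactly `embedding_dim` dimensions, since this value must match the vector store configuration below."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Optional: confirm the embedding dimensionality matches `embedding_dim`\n",
+    "sample_embedding = hf_embedding.get_text_embedding(\"Hello, world!\")\n",
+    "print(f\"Embedding dimensions: {len(sample_embedding)}\")\n",
+    "assert len(sample_embedding) == embedding_dim"
+   ]
+  },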
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 5) Prepare documents and nodes\n",
+    "\n",
+    "In a RAG pipeline, the first step is to read the given documents. We use the `SimpleDirectoryReader`, which selects the best file reader for each file in the directory based on its extension.\n",
+    "\n",
+    "Next, we produce chunks (text subsequences) from the contents read by the `SimpleDirectoryReader`. A `SentenceSplitter` is a text splitter that preserves sentence boundaries while splitting the text into chunks of size `chunk_size`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from llama_index.core import SimpleDirectoryReader\n",
+    "from llama_index.core.node_parser import SentenceSplitter\n",
+    "\n",
+    "reader = SimpleDirectoryReader(\"./data/paul_graham\")\n",
+    "documents = reader.load_data()\n",
+    "\n",
+    "node_parser = SentenceSplitter(chunk_size=512, chunk_overlap=20)\n",
+    "nodes = node_parser.get_nodes_from_documents(documents)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 6) Configure `ObjectBoxVectorStore`\n",
+    "\n",
+    "The `ObjectBoxVectorStore` can be initialized with several options:\n",
+    "\n",
+    "- `embedding_dim` (required): The dimensions of the embeddings that the vector DB will hold\n",
+    "- `distance_type`: Choose from `COSINE`, `DOT_PRODUCT`, `DOT_PRODUCT_NON_NORMALIZED` and `EUCLIDEAN`\n",
+    "- `db_directory`: The path of the directory where the `.mdb` ObjectBox database file should be created\n",
+    "- `clear_db`: Deletes the existing database file if it exists in `db_directory`\n",
+    "- `do_log`: Enables logging from the ObjectBox integration"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from llama_index.vector_stores.objectbox import ObjectBoxVectorStore\n",
+    "from llama_index.core import StorageContext, VectorStoreIndex, Settings\n",
+    "from objectbox import VectorDistanceType\n",
+    "\n",
+    "vector_store = ObjectBoxVectorStore(\n",
+    "    embedding_dim,\n",
+    "    distance_type=VectorDistanceType.COSINE,\n",
+    "    db_directory=\"obx_data\",\n",
+    "    clear_db=False,\n",
+    "    do_log=True,\n",
+    ")\n",
+    "\n",
+    "storage_context = StorageContext.from_defaults(vector_store=vector_store)\n",
+    "\n",
+    "Settings.llm = gemini_llm\n",
+    "Settings.embed_model = hf_embedding\n",
+    "\n",
+    "index = VectorStoreIndex(nodes=nodes, storage_context=storage_context)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 7) Chat with the document"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "query_engine = index.as_query_engine()\n",
+    "response = query_engine.query(\"Who is Paul Graham?\")\n",
+    "print(response)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Optional: Configuring `ObjectBoxVectorStore` as a retriever\n",
+    "\n",
+    "A LlamaIndex [retriever](https://docs.llamaindex.ai/en/stable/module_guides/querying/retriever/) is responsible for fetching similar chunks from a vector DB given a query.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "retriever = index.as_retriever()\n",
+    "response = retriever.retrieve(\"What did the author do growing up?\")\n",
+    "\n",
+    "for node in response:\n",
+    "    print(\"Retrieved chunk text:\\n\", node.node.get_text())\n",
+    "    print(\"Retrieved chunk metadata:\\n\", node.node.get_metadata_str())\n",
+    "    print(\"\\n\\n\\n\")"
+   ]
+  },
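+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "By default, the retriever returns the two most similar chunks. The number of results can be tuned with the retriever's `similarity_top_k` argument, as sketched below."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Optional: retrieve more chunks by increasing `similarity_top_k`\n",
+    "top_5_retriever = index.as_retriever(similarity_top_k=5)\n",
+    "response = top_5_retriever.retrieve(\"What did the author do growing up?\")\n",
+    "print(f\"Number of retrieved chunks: {len(response)}\")"
+   ]
+  },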
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Optional: Removing chunks associated with a single query using `delete_nodes`\n",
+    "\n",
+    "We can use the `ObjectBoxVectorStore.delete_nodes` method to remove chunks (nodes) from the vector DB by providing a list of node IDs as an argument."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "response = retriever.retrieve(\"What did the author do growing up?\")\n",
+    "\n",
+    "node_ids = []\n",
+    "for node in response:\n",
+    "    node_ids.append(node.node_id)\n",
+    "print(f\"Nodes to be removed: {node_ids}\")\n",
+    "\n",
+    "print(f\"No. of vectors before deletion: {vector_store.count()}\")\n",
+    "vector_store.delete_nodes(node_ids)\n",
+    "print(f\"No. of vectors after deletion: {vector_store.count()}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Optional: Removing a single document from the vector DB\n",
+    "\n",
+    "The `ObjectBoxVectorStore.delete` method can be used to remove chunks (nodes) associated with a single document whose `id_` is provided as an argument.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "document = documents[0]\n",
+    "print(f\"Document to be deleted: {document.id_}\")\n",
+    "\n",
+    "print(f\"No. of vectors before deletion: {vector_store.count()}\")\n",
+    "vector_store.delete(document.id_)\n",
+    "print(f\"No. of vectors after document deletion: {vector_store.count()}\")"
+   ]
+  },
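+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Optional: Clearing and closing the store\n",
+    "\n",
+    "As a final cleanup step, `ObjectBoxVectorStore.clear` removes all vectors from the store and `close` closes the underlying database."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Remove all remaining vectors, then close the underlying database\n",
+    "vector_store.clear()\n",
+    "print(f\"No. of vectors after clearing: {vector_store.count()}\")\n",
+    "vector_store.close()"
+   ]
+  }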
+ ],
+ "metadata": {
+  "colab": {
+   "provenance": []
+  },
+  "kernelspec": {
+   "display_name": "Python 3",
+   "name": "python3"
+  },
+  "language_info": {
+   "name": "python"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-objectbox/.gitignore b/llama-index-integrations/vector_stores/llama-index-vector-stores-objectbox/.gitignore
new file mode 100644
index 0000000000000..c23ad90c75550
--- /dev/null
+++ b/llama-index-integrations/vector_stores/llama-index-vector-stores-objectbox/.gitignore
@@ -0,0 +1,157 @@
+llama_index/_static
+.DS_Store
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+bin/
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+etc/
+include/
+lib/
+lib64/
+parts/
+sdist/
+share/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+.ruff_cache
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+notebooks/
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+pyvenv.cfg
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# Jetbrains
+.idea
+modules/
+*.swp
+
+# VsCode
+.vscode
+
+# pipenv
+Pipfile
+Pipfile.lock
+
+# pyright
+pyrightconfig.json
+
+# ObjectBox specific files
+llama_index/vector_stores/objectbox/objectbox-model.json
+objectbox
diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-objectbox/BUILD b/llama-index-integrations/vector_stores/llama-index-vector-stores-objectbox/BUILD
new file mode 100644
index 0000000000000..0896ca890d8bf
--- /dev/null
+++ b/llama-index-integrations/vector_stores/llama-index-vector-stores-objectbox/BUILD
@@ -0,0 +1,3 @@
+poetry_requirements(
+    name="poetry",
+)
diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-objectbox/Makefile b/llama-index-integrations/vector_stores/llama-index-vector-stores-objectbox/Makefile
new file mode 100644
index 0000000000000..b9eab05aa3706
--- /dev/null
+++ b/llama-index-integrations/vector_stores/llama-index-vector-stores-objectbox/Makefile
@@ -0,0 +1,17 @@
+GIT_ROOT ?= $(shell git rev-parse --show-toplevel)
+
+help:	## Show all Makefile targets.
+	@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[33m%-30s\033[0m %s\n", $$1, $$2}'
+
+format:	## Run code autoformatters (black).
+	pre-commit install
+	git ls-files | xargs pre-commit run black --files
+
+lint:	## Run linters: pre-commit (black, ruff, codespell) and mypy
+	pre-commit install && git ls-files | xargs pre-commit run --show-diff-on-failure --files
+
+test:	## Run tests via pytest.
+	pytest tests
+
+watch-docs:	## Build and watch documentation.
+	sphinx-autobuild docs/ docs/_build/html --open-browser --watch $(GIT_ROOT)/llama_index/
diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-objectbox/README.md b/llama-index-integrations/vector_stores/llama-index-vector-stores-objectbox/README.md
new file mode 100644
index 0000000000000..6b27928a27185
--- /dev/null
+++ b/llama-index-integrations/vector_stores/llama-index-vector-stores-objectbox/README.md
@@ -0,0 +1,126 @@
+# ObjectBox VectorStore For LlamaIndex
+
+### About
+
+This package contains the [ObjectBox](https://objectbox.io/) integration for [LlamaIndex](https://www.llamaindex.ai/).
+
+### Getting Started
+
+Install the `llama-index-vector-stores-objectbox` package from PyPI via pip.
+
+```commandline
+pip install llama-index-vector-stores-objectbox
+```
+
+You can import the ObjectBox vector store with `from llama_index.vector_stores.objectbox import ObjectBoxVectorStore` and start using it:
+
+```python
+from llama_index.vector_stores.objectbox import ObjectBoxVectorStore
+from objectbox import VectorDistanceType
+
+embedding_dim = 384  # size of the embeddings to be stored
+
+vector_store = ObjectBoxVectorStore(
+    embedding_dim,
+    distance_type=VectorDistanceType.COSINE,
+    db_directory="obx_data",
+    clear_db=False,
+    do_log=True,
+)
+```
+
+- `embedding_dim` (required): The dimensions of the embeddings that the vector DB will hold
+- `distance_type`: Choose from `COSINE`, `DOT_PRODUCT`, `DOT_PRODUCT_NON_NORMALIZED` and `EUCLIDEAN`
+- `db_directory`: The path of the directory where the `.mdb` ObjectBox database file should be created
+- `clear_db`: Deletes the existing database file if it exists in `db_directory`
+- `do_log`: Enables logging from the ObjectBox integration
+
+### A complete RAG example
+
+Alongside `llama-index-vector-stores-objectbox`, install the following packages:
+
+```commandline
+pip install llama-index --quiet
+pip install llama-index-embeddings-huggingface --quiet
+pip install llama-index-llms-gemini --quiet
+```
+
+Download a sample text file:
+
+```commandline
+mkdir -p 'data/paul_graham/'
+wget 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/paul_graham/paul_graham_essay.txt' -O 'data/paul_graham/paul_graham_essay.txt'
+```
+
+This example requires a Gemini API key. You can get one from the [Gemini developer console](https://aistudio.google.com/app/apikey). Execute the following Python script to generate an answer for `Who is Paul Graham?` from the text file:
+
+```python
+from llama_index.llms.gemini import Gemini
+from llama_index.embeddings.huggingface import HuggingFaceEmbedding
+from llama_index.vector_stores.objectbox import ObjectBoxVectorStore
+from llama_index.core import StorageContext, VectorStoreIndex, Settings
+from llama_index.core import SimpleDirectoryReader
+from llama_index.core.node_parser import SentenceSplitter
+from objectbox import VectorDistanceType
+import getpass
+
+gemini_key_api = getpass.getpass("Gemini API Key: ")
+gemini_llm = Gemini(api_key=gemini_key_api)
+
+# Configure embedding model from HuggingFace
+hf_embedding = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
+embedding_dim = 384  # bge-small-en-v1.5 produces 384-dimensional embeddings
+
+# Setup file reader and text splitter
+reader = SimpleDirectoryReader("./data/paul_graham")
+documents = reader.load_data()
+
+node_parser = SentenceSplitter(chunk_size=512, chunk_overlap=20)
+nodes = node_parser.get_nodes_from_documents(documents)
+
+# Configure ObjectBox as a vector-store
+vector_store = ObjectBoxVectorStore(
+    embedding_dim,
+    distance_type=VectorDistanceType.COSINE,
+    db_directory="obx_data",
+    clear_db=False,
+    do_log=True,
+)
+
+storage_context = StorageContext.from_defaults(vector_store=vector_store)
+
+Settings.llm = gemini_llm
+Settings.embed_model = hf_embedding
+
+index = VectorStoreIndex(nodes=nodes, storage_context=storage_context)
+
+query_engine = index.as_query_engine()
+response = query_engine.query("Who is Paul Graham?")
+print(response)
+```
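+
+### Removing data
+
+The store also exposes `delete`, `delete_nodes`, `count` and `clear` for managing stored vectors. Below is a minimal sketch that reuses `vector_store`, `index` and `documents` from the example above:
+
+```python
+print(f"Vectors stored: {vector_store.count()}")
+
+# Remove specific chunks by their node IDs
+retriever = index.as_retriever()
+results = retriever.retrieve("Who is Paul Graham?")
+vector_store.delete_nodes([result.node_id for result in results])
+
+# Remove all remaining chunks belonging to a single document
+vector_store.delete(documents[0].id_)
+
+print(f"Vectors stored after deletion: {vector_store.count()}")
+```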
+
+### License
+
+```text
+MIT License
+
+Copyright (c) 2024 ObjectBox, Ltd.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+```
diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-objectbox/llama_index/vector_stores/objectbox/BUILD b/llama-index-integrations/vector_stores/llama-index-vector-stores-objectbox/llama_index/vector_stores/objectbox/BUILD
new file mode 100644
index 0000000000000..db46e8d6c978c
--- /dev/null
+++ b/llama-index-integrations/vector_stores/llama-index-vector-stores-objectbox/llama_index/vector_stores/objectbox/BUILD
@@ -0,0 +1 @@
+python_sources()
diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-objectbox/llama_index/vector_stores/objectbox/__init__.py b/llama-index-integrations/vector_stores/llama-index-vector-stores-objectbox/llama_index/vector_stores/objectbox/__init__.py
new file mode 100644
index 0000000000000..81958b99865c1
--- /dev/null
+++ b/llama-index-integrations/vector_stores/llama-index-vector-stores-objectbox/llama_index/vector_stores/objectbox/__init__.py
@@ -0,0 +1,3 @@
+from llama_index.vector_stores.objectbox.base import ObjectBoxVectorStore
+
+__all__ = ["ObjectBoxVectorStore"]
diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-objectbox/llama_index/vector_stores/objectbox/base.py b/llama-index-integrations/vector_stores/llama-index-vector-stores-objectbox/llama_index/vector_stores/objectbox/base.py
new file mode 100644
index 0000000000000..b94598f1d1b3b
--- /dev/null
+++ b/llama-index-integrations/vector_stores/llama-index-vector-stores-objectbox/llama_index/vector_stores/objectbox/base.py
@@ -0,0 +1,258 @@
+import logging
+import os
+import shutil
+import sys
+import time
+from typing import Any, List, Optional, Tuple
+
+from llama_index.core.schema import BaseNode, MetadataMode, TextNode
+from llama_index.core.vector_stores import MetadataFilters
+from llama_index.core.vector_stores.types import (
+    BasePydanticVectorStore,
+    VectorStoreQuery,
+    VectorStoreQueryResult,
+)
+from objectbox import Store, Model, Box
+from objectbox.model.entity import Entity
+from objectbox.model.properties import (
+    VectorDistanceType,
+    HnswIndex,
+    Id,
+    Property,
+    PropertyType,
+    String,
+    Float32Vector,
+)
+from objectbox.query import Query
+from pydantic import PrivateAttr
+
+DIRECTORY = "objectbox"
+_logger = logging.getLogger(__name__)
+handler = logging.StreamHandler(stream=sys.stdout)
+_logger.addHandler(handler)
+_logger.setLevel(logging.INFO)
+
+
+class ObjectBoxVectorStore(BasePydanticVectorStore):
+    """ObjectBox vector store.
+
+    In this vector store, embeddings are stored within an ObjectBox `Box` (collection).
+
+    During query time, the index uses ObjectBox to query for the top-K most similar nodes.
+
+    Args:
+        embedding_dimensions (int): Number of elements in the embedding to be stored
+        distance_type (objectbox.model.properties.VectorDistanceType):
+            Distance metric to be used for vector search
+        db_directory (str): File path where ObjectBox database files will be stored
+        clear_db (bool): Whether to delete any existing database in `db_directory`
+        do_log (bool): Enables/disables logging from the integration
+
+    Examples:
+        `pip install llama-index-vector-stores-objectbox`
+
+        ```python
+        from llama_index.vector_stores.objectbox import ObjectBoxVectorStore
+        from objectbox import VectorDistanceType
+
+        vector_store = ObjectBoxVectorStore(
+            embedding_dim,
+            distance_type=VectorDistanceType.COSINE,
+            db_directory="obx_data",
+            clear_db=False,
+            do_log=True
+        )
+        ```
+    """
+
+    stores_text: bool = True
+    embedding_dimensions: int
+    distance_type: VectorDistanceType = VectorDistanceType.EUCLIDEAN
+    db_directory: Optional[str] = None
+    clear_db: Optional[bool] = False
+    do_log: Optional[bool] = False
+
+    _store: Store = PrivateAttr()
+    _entity_class: Entity = PrivateAttr()
+    _box: Box = PrivateAttr()
+
+    def __init__(
+        self,
+        embedding_dimensions: int,
+        distance_type: VectorDistanceType = VectorDistanceType.EUCLIDEAN,
+        db_directory: Optional[str] = None,
+        clear_db: Optional[bool] = False,
+        do_log: Optional[bool] = False,
+        **data: Any,
+    ):
+        super().__init__(
+            embedding_dimensions=embedding_dimensions,
+            distance_type=distance_type,
+            db_directory=db_directory,
+            clear_db=clear_db,
+            do_log=do_log,
+            **data,
+        )
+        self._entity_class = self._create_entity_class()
+        self._store = self._create_box_store()
+
+        self._box = self._store.box(self._entity_class)
+
+    @property
+    def client(self) -> Any:
+        return self._box
+
+    def add(self, nodes: List[BaseNode], **kwargs: Any) -> List[str]:
+        ids: List[str] = []
+        start = time.perf_counter()
+        with self._store.write_tx():
+            for node in nodes:
+                if node.embedding is None:
+                    _logger.info("Skipping a node with no embedding")
+                    continue
+                self._box.put(
+                    self._entity_class(
+                        node_id=node.node_id,
+                        doc_id=node.ref_doc_id if node.ref_doc_id is not None else "",
+                        text=node.get_content(metadata_mode=MetadataMode.NONE),
+                        metadata=node.metadata,
+                        embeddings=node.embedding,
+                    )
+                )
+                ids.append(node.node_id)
+        if self.do_log:
+            end = time.perf_counter()
+            _logger.info(
+                f"ObjectBox stored {len(ids)} nodes in {end - start} seconds"
+            )
+        return ids
+
+    def delete(self, ref_doc_id: str, **delete_kwargs: Any) -> None:
+        self._box.query(self._entity_class.doc_id.equals(ref_doc_id)).build().remove()
+
+    def delete_nodes(
+        self,
+        node_ids: Optional[List[str]] = None,
+        filters: Optional[MetadataFilters] = None,
+        **delete_kwargs: Any,
+    ) -> None:
+        if filters is not None:
+            raise NotImplementedError(
+                "ObjectBox does not yet support delete_nodes() with metadata filters - contact us if you need this feature"
+            )
+        if node_ids is not None:
+            # Build one parameterized query and rebind `node_id` for each deletion
+            query_obj = self._box.query(
+                self._entity_class.node_id.equals("node_id").alias("node_id")
+            ).build()
+            for node_id in node_ids:
+                query_obj.set_parameter_alias_string("node_id", node_id)
+                query_obj.remove()
+
+    def get_nodes(
+        self,
+        node_ids: Optional[List[str]] = None,
+        filters: Optional[MetadataFilters] = None,
+    ) -> List[BaseNode]:
+        if filters is not None:
+            raise NotImplementedError(
+                "ObjectBox does not yet support get_nodes() with metadata filters - contact us if you need this feature"
+            )
+        if node_ids is not None:
+            retrieved_nodes: List[BaseNode] = []
+            with self._store.read_tx():
+                query_obj = self._box.query(
+                    self._entity_class.node_id.equals("node_id").alias("node_id")
+                ).build()
+                for node_id in node_ids:
+                    try:
+                        query_obj.set_parameter_alias_string("node_id", node_id)
+                        entities = query_obj.find()
+                        if len(entities) == 0:
+                            _logger.info(f"No entity with id = {node_id} was found")
+                            continue
+                        retrieved_nodes.append(
+                            TextNode(
+                                text=entities[0].text,
+                                id_=entities[0].node_id,
+                                metadata=entities[0].metadata,
+                            )
+                        )
+                    except ValueError:
+                        raise ValueError(f"Invalid node id: {node_id}")
+            return retrieved_nodes
+        else:
+            raise ValueError("node_ids cannot be None")
+
+    def query(self, query: VectorStoreQuery, **kwargs: Any) -> VectorStoreQueryResult:
+        if query.filters is not None:
+            raise NotImplementedError(
+                "ObjectBox does not yet support query() with metadata filters - contact us if you need this feature"
+            )
+
+        query_embedding = query.query_embedding
+        n_results = query.similarity_top_k
+
+        nodes: List[TextNode] = []
+        similarities: List[float] = []
+        ids: List[str] = []
+
+        start = time.perf_counter()
+        # Use a distinct name to avoid shadowing the `query` argument
+        obx_query: Query = self._box.query(
+            self._entity_class.embeddings.nearest_neighbor(query_embedding, n_results)
+        ).build()
+        results: List[Tuple[Entity, float]] = obx_query.find_with_scores()
+        end = time.perf_counter()
+
+        if self.do_log:
+            _logger.info(
+                f"ObjectBox retrieved {len(results)} vectors in {end - start} seconds"
+            )
+
+        for entity, score in results:
+            node = TextNode(
+                text=entity.text, id_=entity.node_id, metadata=entity.metadata
+            )
+            ids.append(entity.node_id)
+            nodes.append(node)
+            similarities.append(score)
+
+        return VectorStoreQueryResult(nodes=nodes, similarities=similarities, ids=ids)
+
+    def count(self) -> int:
+        return self._box.count()
+
+    def clear(self) -> None:
+        self._box.remove_all()
+
+    def close(self) -> None:
+        self._store.close()
+
+    def _create_entity_class(self) -> Entity:
+        """Dynamically define an Entity class according to the parameters."""
+
+        @Entity()
+        class VectorEntity:
+            id = Id()
+            node_id = String()
+            doc_id = String()
+            text = String()
+            metadata = Property(dict, type=PropertyType.flex)
+            embeddings = Float32Vector(
+                index=HnswIndex(
+                    dimensions=self.embedding_dimensions,
+                    distance_type=self.distance_type,
+                )
+            )
+
+        return VectorEntity
+
+    def _create_box_store(self) -> Store:
+        """Register the VectorEntity model and set up the ObjectBox database."""
+        db_path = DIRECTORY if self.db_directory is None else self.db_directory
+        if self.clear_db and os.path.exists(db_path):
+            shutil.rmtree(db_path)
+        model = Model()
+        model.entity(self._entity_class)
+        return Store(model=model, directory=db_path)
diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-objectbox/pyproject.toml b/llama-index-integrations/vector_stores/llama-index-vector-stores-objectbox/pyproject.toml
new file mode 100644
index 0000000000000..8db318fa324f4
--- /dev/null
+++ b/llama-index-integrations/vector_stores/llama-index-vector-stores-objectbox/pyproject.toml
@@ -0,0 +1,67 @@
+[build-system]
+build-backend = "poetry.core.masonry.api"
+requires = ["poetry-core"]
+
+[tool.codespell]
+check-filenames = true
+check-hidden = true
+skip = "*.csv,*.html,*.json,*.jsonl,*.pdf,*.txt,*.ipynb"
+
+[tool.llamahub]
+contains_example = false
+import_path = "llama_index.vector_stores.objectbox"
+
+[tool.llamahub.class_authors]
+ObjectBoxVectorStore = "ObjectBox"
+
+[tool.mypy]
+disallow_untyped_defs = true
+exclude = ["_static", "build", "examples", "notebooks", "venv"]
+ignore_missing_imports = true
+python_version = "3.8"
+
+[tool.poetry]
+authors = ["ObjectBox"]
+classifiers = [
+    "Development Status :: 4 - Beta",
+    "License :: OSI Approved :: MIT License",
+]
+description = "Integration package connecting ObjectBox and LlamaIndex"
+exclude = ["**/BUILD"]
+license = "MIT"
+name = "llama-index-vector-stores-objectbox"
+readme = "README.md"
+version = "0.1.0a"
+
+[tool.poetry.dependencies]
+python = ">=3.8.1,<4.0"
+llama-index-core = "^0.11.0"
+objectbox = "^4.0.0"
+
+[tool.poetry.group.dev.dependencies]
+ipython = "8.10.0"
+jupyter = "^1.0.0"
+mypy = "0.991"
+pre-commit = "3.2.0"
+pylint = "2.15.10"
+pytest = "7.2.1"
+pytest-mock = "3.11.1"
+ruff = "0.0.292"
+tree-sitter-languages = "^1.8.0"
+types-Deprecated = ">=0.1.0"
+types-PyYAML = "^6.0.12.12"
+types-protobuf = "^4.24.0.4"
+types-redis = "4.5.5.0"
+types-requests = "2.28.11.8"
+types-setuptools = "67.1.0.0"
+
+[tool.poetry.group.dev.dependencies.black]
+extras = ["jupyter"]
+version = "<=23.9.1,>=23.7.0"
+
+[tool.poetry.group.dev.dependencies.codespell]
+extras = ["toml"]
+version = ">=v2.2.6"
+
+[[tool.poetry.packages]]
+include = "llama_index/"
diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-objectbox/tests/BUILD b/llama-index-integrations/vector_stores/llama-index-vector-stores-objectbox/tests/BUILD
new file mode 100644
index 0000000000000..dabf212d7e716
--- /dev/null
+++ b/llama-index-integrations/vector_stores/llama-index-vector-stores-objectbox/tests/BUILD
@@ -0,0 +1 @@
+python_tests()
diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-objectbox/tests/__init__.py b/llama-index-integrations/vector_stores/llama-index-vector-stores-objectbox/tests/__init__.py
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-objectbox/tests/test_objectbox.py b/llama-index-integrations/vector_stores/llama-index-vector-stores-objectbox/tests/test_objectbox.py
new file mode 100644
index 0000000000000..a5f9eb8e7282a
--- /dev/null
+++ b/llama-index-integrations/vector_stores/llama-index-vector-stores-objectbox/tests/test_objectbox.py
@@ -0,0 +1,109 @@
+import os
+import shutil
+from typing import Sequence
+
+import pytest
+from llama_index.core.schema import TextNode, BaseNode
+from llama_index.core.vector_stores import VectorStoreQuery
+
+from llama_index.vector_stores.objectbox import ObjectBoxVectorStore
+
+
+EMBEDDING_DIM = 3
+
+
+@pytest.fixture()
+def vectorstore():
+    obx = ObjectBoxVectorStore(embedding_dimensions=EMBEDDING_DIM)
+    db_default_path = "objectbox"
+    assert os.path.exists(
+        db_default_path
+    ), f"Directory '{db_default_path}' does not exist."
+    filepath = os.path.join(db_default_path, "data.mdb")
+    assert os.path.isfile(
+        filepath
+    ), f"File '{filepath}' not found in '{db_default_path}'"
+    return obx
+
+
+@pytest.fixture()
+def node_embeddings() -> Sequence[BaseNode]:
+    return [
+        TextNode(
+            id_="e8671c2d-8ee3-4f95-9730-7832f0115560",
+            text="test1",
+            embedding=[1.2, 0.3, -0.9],
+        ),
+        TextNode(
+            id_="d0db4ed6-da16-4769-bf19-d1c06267a5f6",
+            text="test2",
+            embedding=[0.1, 0.0, 0.0],
+        ),
+        TextNode(
+            id_="8601b27c-376e-48dd-a252-e61e01f29069",
+            text="test3",
+            embedding=[-2.3, 1.2, -6.7],
+        ),
+    ]
+
+
+def test_add(vectorstore: ObjectBoxVectorStore, node_embeddings: Sequence[BaseNode]):
+    node_ids = vectorstore.add(node_embeddings)
+    retrieved_nodes = vectorstore.get_nodes(node_ids)
+    assert len(retrieved_nodes) == len(node_embeddings)
+
+
+def test_query(vectorstore: ObjectBoxVectorStore, node_embeddings: Sequence[BaseNode]):
+    vectorstore.add(node_embeddings)
+    search_result = vectorstore.query(
+        VectorStoreQuery(query_embedding=[0.15, 0.001, -0.01], similarity_top_k=1)
+    )
+    assert len(search_result.ids) == 1
+    assert search_result.nodes[0].id_ == "d0db4ed6-da16-4769-bf19-d1c06267a5f6"
+
+
+def test_get_nodes(
+    vectorstore: ObjectBoxVectorStore, node_embeddings: Sequence[BaseNode]
+):
+    vectorstore.add(node_embeddings)
+    retrieved_nodes = vectorstore.get_nodes(
+        node_ids=["8601b27c-376e-48dd-a252-e61e01f29069"]
+    )
+    assert len(retrieved_nodes) == 1
+    assert retrieved_nodes[0].id_ == "8601b27c-376e-48dd-a252-e61e01f29069"
+
+
+def test_count(vectorstore: ObjectBoxVectorStore, node_embeddings: Sequence[BaseNode]):
+    vectorstore.add(node_embeddings)
+    assert vectorstore.count() == len(node_embeddings)
+
+
+def test_delete_nodes(
+    vectorstore: ObjectBoxVectorStore, node_embeddings: Sequence[BaseNode]
+):
+    node_ids = vectorstore.add(node_embeddings)
+    node_ids_to_be_deleted = node_ids[0:2]
+    vectorstore.delete_nodes(node_ids_to_be_deleted)
+    assert vectorstore.count() == 1
+
+
+def test_clear(vectorstore: ObjectBoxVectorStore, node_embeddings: Sequence[BaseNode]):
+    node_ids = vectorstore.add(node_embeddings)
+    vectorstore.clear()
+    retrieved_nodes = vectorstore.get_nodes(node_ids)
+    assert len(retrieved_nodes) == 0
+
+
+def remove_test_dir(test_dir: str) -> None:
+    if os.path.exists(test_dir):
+        shutil.rmtree(test_dir)
+
+
+@pytest.fixture(autouse=True)
+def auto_cleanup(vectorstore: ObjectBoxVectorStore):
+    yield  # run the test function
+    vectorstore.close()
+    # The generated model file only exists when tests run from the repository root
+    model_json_path = "llama-index-integrations/vector_stores/llama-index-vector-stores-objectbox/llama_index/vector_stores/objectbox/objectbox-model.json"
+    if os.path.exists(model_json_path):
+        os.remove(model_json_path)
+    remove_test_dir("objectbox")
diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-objectbox/tests/test_vector_stores_objectbox.py b/llama-index-integrations/vector_stores/llama-index-vector-stores-objectbox/tests/test_vector_stores_objectbox.py
new file mode 100644
index 0000000000000..1d60479ccb0aa
--- /dev/null
+++ b/llama-index-integrations/vector_stores/llama-index-vector-stores-objectbox/tests/test_vector_stores_objectbox.py
@@ -0,0 +1,8 @@
+from llama_index.core.vector_stores.types import BasePydanticVectorStore
+from llama_index.vector_stores.objectbox import ObjectBoxVectorStore
+
+
+def test_class():
+    """Ensures that BasePydanticVectorStore is one of the parent classes of ObjectBoxVectorStore."""
+    names_of_base_classes = [b.__name__ for b in ObjectBoxVectorStore.__mro__]
+    assert BasePydanticVectorStore.__name__ in names_of_base_classes