diff --git a/authors.yaml b/authors.yaml index e8f9188b6f..c34bd65b2b 100644 --- a/authors.yaml +++ b/authors.yaml @@ -47,3 +47,8 @@ colin-openai: name: "Colin Jarvis" website: "https://twitter.com/colintjarvis" avatar: "https://pbs.twimg.com/profile_images/1647875178947387393/aUc7D9m-_400x400.jpg" + +prakul: + name: "Prakul Agarwal" + website: "https://www.linkedin.com/in/prakulagarwal" + avator: "https://media.licdn.com/dms/image/D5603AQEUug83qKgRBg/profile-displayphoto-shrink_800_800/0/1675384960197?e=1706140800&v=beta&t=qxkDbBr-Bk2ASpcwbR5JVPD6yS-vzmIwNHAa8ApyDq4" diff --git a/examples/vector_databases/mongodb_atlas/README.md b/examples/vector_databases/mongodb_atlas/README.md new file mode 100644 index 0000000000..2ef6e05e50 --- /dev/null +++ b/examples/vector_databases/mongodb_atlas/README.md @@ -0,0 +1,6 @@ +# MongoDB Atlas Vector Search + + +[Atlas Vector Search](https://www.mongodb.com/products/platform/atlas-vector-search) is a fully managed service that simplifies the process of effectively indexing high-dimensional vector data within MongoDB and being able to perform fast vector similarity searches. With Atlas Vector Search, you can use MongoDB as a standalone vector database for a new project or augment your existing MongoDB collections with vector search functionality. With Atlas Vector Search, you can use the powerful capabilities of vector search in any major public cloud (AWS, Azure, GCP) and achieve massive scalability and data security out of the box while being enterprise-ready with provisions like FedRamp, SoC2 compliance. + +Documentation - [link](https://www.mongodb.com/docs/atlas/atlas-vector-search/vector-search-overview/) \ No newline at end of file diff --git a/examples/vector_databases/mongodb_atlas/semantic_search_using_mongodb_atlas_vector_search.ipynb b/examples/vector_databases/mongodb_atlas/semantic_search_using_mongodb_atlas_vector_search.ipynb new file mode 100644 index 0000000000..26bc223790 --- /dev/null +++ b/examples/vector_databases/mongodb_atlas/semantic_search_using_mongodb_atlas_vector_search.ipynb @@ -0,0 +1,404 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "source": [ + "\n", + "This notebook demonstrates how to build a semantic search application using OpenAI and [MongoDB Atlas vector search](https://www.mongodb.com/products/platform/atlas-vector-search)" + ], + "metadata": { + "id": "flQYAhphJC5m" + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "iYMn0dXXdFbY", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "98dab421-f11b-40b8-8f82-6de42b25725a" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Collecting pymongo\n", + " Downloading pymongo-4.6.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (677 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m677.1/677.1 kB\u001b[0m \u001b[31m10.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting openai\n", + " Downloading openai-1.3.3-py3-none-any.whl (220 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m220.3/220.3 kB\u001b[0m \u001b[31m24.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting dnspython<3.0.0,>=1.16.0 (from pymongo)\n", + " Downloading dnspython-2.4.2-py3-none-any.whl (300 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m300.4/300.4 kB\u001b[0m \u001b[31m29.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: anyio<4,>=3.5.0 in /usr/local/lib/python3.10/dist-packages (from openai) (3.7.1)\n", + "Requirement already satisfied: distro<2,>=1.7.0 in /usr/lib/python3/dist-packages (from openai) (1.7.0)\n", + "Collecting httpx<1,>=0.23.0 (from openai)\n", + " Downloading httpx-0.25.1-py3-none-any.whl (75 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m75.0/75.0 kB\u001b[0m \u001b[31m9.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: pydantic<3,>=1.9.0 in /usr/local/lib/python3.10/dist-packages (from openai) (1.10.13)\n", + "Requirement already satisfied: tqdm>4 in /usr/local/lib/python3.10/dist-packages (from openai) (4.66.1)\n", + "Requirement already satisfied: typing-extensions<5,>=4.5 in /usr/local/lib/python3.10/dist-packages (from openai) (4.5.0)\n", + "Requirement already satisfied: idna>=2.8 in /usr/local/lib/python3.10/dist-packages (from anyio<4,>=3.5.0->openai) (3.4)\n", + "Requirement already satisfied: sniffio>=1.1 in /usr/local/lib/python3.10/dist-packages (from anyio<4,>=3.5.0->openai) (1.3.0)\n", + "Requirement already satisfied: exceptiongroup in /usr/local/lib/python3.10/dist-packages (from anyio<4,>=3.5.0->openai) (1.1.3)\n", + "Requirement already satisfied: certifi in /usr/local/lib/python3.10/dist-packages (from httpx<1,>=0.23.0->openai) (2023.7.22)\n", + "Collecting httpcore (from httpx<1,>=0.23.0->openai)\n", + " Downloading httpcore-1.0.2-py3-none-any.whl (76 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m76.9/76.9 kB\u001b[0m \u001b[31m7.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting h11<0.15,>=0.13 (from httpcore->httpx<1,>=0.23.0->openai)\n", + " Downloading h11-0.14.0-py3-none-any.whl (58 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m58.3/58.3 kB\u001b[0m \u001b[31m6.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hInstalling collected packages: h11, dnspython, pymongo, httpcore, httpx, openai\n", + "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n", + "llmx 0.0.15a0 requires cohere, which is not installed.\n", + "llmx 0.0.15a0 requires tiktoken, which is not installed.\u001b[0m\u001b[31m\n", + "\u001b[0mSuccessfully installed dnspython-2.4.2 h11-0.14.0 httpcore-1.0.2 httpx-0.25.1 openai-1.3.3 pymongo-4.6.0\n" + ] + } + ], + "source": [ + "!pip install pymongo openai" + ] + }, + { + "cell_type": "markdown", + "source": [ + "# Step 1: Setup the environment\n", + "\n", + "There are 2 pre-requisites for this:\n", + "\n", + "1. **MongoDB Atlas cluster**: To create a forever free MongoDB Atlas cluster, first, you need to create a MongoDB Atlas account if you don't already have one. Visit the [MongoDB Atlas website](https://www.mongodb.com/atlas/database) and click on “Register.” Visit the [MongoDB Atlas](https://account.mongodb.com/account/login) dashboard and set up your cluster. In order to take advantage of the `$vectorSearch` operator in an aggregation pipeline, you need to run MongoDB Atlas 6.0.11 or higher. This tutorial can be built using a free cluster. When you’re setting up your deployment, you’ll be prompted to set up a database user and rules for your network connection. Please ensure you save your username and password somewhere safe and have the correct IP address rules in place so your cluster can connect properly. If you need more help getting started, check out our [tutorial on MongoDB Atlas](https://www.mongodb.com/basics/mongodb-atlas-tutorial).\n", + "\n", + "2. **OpenAI API key** To create your OpenAI key, you'll need to create an account. Once you have that, visit the [OpenAI platform](https://platform.openai.com/). Click on your profile icon in the top right of the screen to get the dropdown menu and select “View API keys”.\n" + ], + "metadata": { + "id": "vLuKFvTwJXpu" + } + }, + { + "cell_type": "code", + "source": [ + "import getpass\n", + "\n", + "MONGODB_ATLAS_CLUSTER_URI = getpass.getpass(\"MongoDB Atlas Cluster URI:\")\n", + "OPENAI_API_KEY = getpass.getpass(\"OpenAI API Key:\")\n" + ], + "metadata": { + "id": "qJHHIIKjIFUZ", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "57ad72d4-8afb-4e34-aad1-1fea6eb3645b" + }, + "execution_count": null, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "MongoDB Atlas Cluster URI:··········\n", + "OpenAI API Key:··········\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "Note: After executing the step above you will be prompted to enter the credentials." + ], + "metadata": { + "id": "Sarx9wdxb4Rr" + } + }, + { + "cell_type": "markdown", + "source": [ + "For this tutorial, we will be using the\n", + "[MongoDB sample dataset](https://www.mongodb.com/docs/atlas/sample-data/). Load the sample dataset using the Atlas UI. We'll be using the “sample_mflix” database, which contains a “movies” collection where each document contains fields like title, plot, genres, cast, directors, etc.\n" + ], + "metadata": { + "id": "sk1xXoyxMfil" + } + }, + { + "cell_type": "code", + "source": [ + "import openai\n", + "import pymongo\n", + "\n", + "client = pymongo.MongoClient(MONGODB_ATLAS_CLUSTER_URI)\n", + "db = client.sample_mflix\n", + "collection = db.movies\n", + "\n", + "openai.api_key = OPENAI_API_KEY" + ], + "metadata": { + "id": "k-G6WhNFdIvW" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "ATLAS_VECTOR_SEARCH_INDEX_NAME = \"default\"\n", + "EMBEDDING_FIELD_NAME = \"embedding_openai_nov19_23\"" + ], + "metadata": { + "id": "On9e13ASwReq" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "# Step 2: Setup embeddings generation function" + ], + "metadata": { + "id": "X-9gl2s-uGtw" + } + }, + { + "cell_type": "code", + "source": [ + "model = \"text-embedding-ada-002\"\n", + "def generate_embedding(text: str) -> list[float]:\n", + " return openai.embeddings.create(input = [text], model=model).data[0].embedding\n" + ], + "metadata": { + "id": "BMnE4BxSOCtH" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "# Step 3: Create and store embeddings\n", + "\n", + "Each document in the sample dataset sample_mflix.movies corresponds to a movie; we will execute an operation to create a vector embedding for the data in the \"plot\" field and store it in the database. Creating vector embeddings using OpenAI embeddings endpoint is necessary for performing a similarity search based on intent." + ], + "metadata": { + "id": "snSjiSNKwX6Z" + } + }, + { + "cell_type": "code", + "source": [ + "from pymongo import ReplaceOne\n", + "\n", + "# Update the collection with the embeddings\n", + "requests = []\n", + "\n", + "for doc in collection.find({'plot':{\"$exists\": True}}).limit(500):\n", + " doc[EMBEDDING_FIELD_NAME] = generate_embedding(doc['plot'])\n", + " requests.append(ReplaceOne({'_id': doc['_id']}, doc))\n", + "\n", + "collection.bulk_write(requests)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "t4i9gQM2xUFF", + "outputId": "ae558b67-9b06-4c83-c52a-a8047ecd40d5" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "BulkWriteResult({'writeErrors': [], 'writeConcernErrors': [], 'nInserted': 0, 'nUpserted': 0, 'nMatched': 50, 'nModified': 50, 'nRemoved': 0, 'upserted': []}, acknowledged=True)" + ] + }, + "metadata": {}, + "execution_count": 15 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "After executing the above, the documents in \"movies\" collection will contain an additional field of \"embedding\", as defined by the `EMBEDDDING_FIELD_NAME` variable, apart from already existing fields like title, plot, genres, cast, directors, etc." + ], + "metadata": { + "id": "ShPbxQPaPvHD" + } + }, + { + "cell_type": "markdown", + "source": [ + "Note: We are restricting this to just 500 documents in the interest of time. If you want to do this over the entire dataset of 23,000+ documents in our sample_mflix database, it will take a little while. Alternatively, you can use the [sample_mflix.embedded_movies collection](https://www.mongodb.com/docs/atlas/sample-data/sample-mflix/#sample_mflix.embedded_movies) which includes a pre-populated `plot_embedding` field that contains embeddings created using `OpenAI's text-embedding-ada-002` embedding model that you can use with the Atlas Search vector search feature.\n", + "\n", + "\n" + ], + "metadata": { + "id": "Coq0tyjXyNIu" + } + }, + { + "cell_type": "markdown", + "source": [ + "# Step 4: Create a vector search index\n", + "\n", + "We will create Atlas Vector Search Index on this collection which will allow us to perform the Approximate KNN search, which powers the semantic search.\n", + "We will cover 2 ways to create this index - Atlas UI and using MongoDB python driver.\n", + "\n", + "(Optional) [Documentation: Create a Vector Search Index ](https://www.mongodb.com/docs/atlas/atlas-search/field-types/knn-vector/)" + ], + "metadata": { + "id": "rCRCK6QOskqo" + } + }, + { + "cell_type": "markdown", + "source": [ + "Now head over to [Atlas UI](cloud.mongodb.com) and create an Atlas Vector Search index using the steps descibed [here](https://www.mongodb.com/docs/atlas/atlas-vector-search/vector-search-tutorial/#create-the-atlas-vector-search-index). The 'dimensions' field with value 1536, corresponds to openAI text-embedding-ada002.\n", + "\n", + "Use the definition given below in the JSON editor on the Atlas UI.\n", + "\n", + "```\n", + "{\n", + " \"mappings\": {\n", + " \"dynamic\": true,\n", + " \"fields\": {\n", + " \"embedding\": {\n", + " \"dimensions\": 1536,\n", + " \"similarity\": \"dotProduct\",\n", + " \"type\": \"knnVector\"\n", + " }\n", + " }\n", + " }\n", + "}\n", + "```" + ], + "metadata": { + "id": "ymRTaFb1X5Tq" + } + }, + { + "cell_type": "markdown", + "source": [ + "(Optional) Alternatively, we can use [pymongo driver to create these vector search indexes programatically](https://pymongo.readthedocs.io/en/stable/api/pymongo/collection.html#pymongo.collection.Collection.create_search_index)\n", + "The python command given in the cell below will create the index (this only works for the most recent version of the Python Driver for MongoDB and MongoDB server version 7.0+ Atlas cluster)." + ], + "metadata": { + "id": "2l5BzUgncjiq" + } + }, + { + "cell_type": "code", + "source": [ + "collection.create_search_index(\n", + " {\"definition\":\n", + " {\"mappings\": {\"dynamic\": True, \"fields\": {\n", + " EMBEDDING_FIELD_NAME : {\n", + " \"dimensions\": 1536,\n", + " \"similarity\": \"dotProduct\",\n", + " \"type\": \"knnVector\"\n", + " }}}},\n", + " \"name\": ATLAS_VECTOR_SEARCH_INDEX_NAME\n", + " }\n", + ")" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 35 + }, + "id": "54OWgiaPcmD0", + "outputId": "2cb9d1d8-4515-49ad-9fe7-5b4fa3c6c86b" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "'default'" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "string" + } + }, + "metadata": {}, + "execution_count": 16 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "# Step 5: Query your data\n", + "\n", + "The results for the query here finds movies which have semantically similar plots to the text captured in the query string, rather than being based on the keyword search.\n", + "\n", + "(Optional) [Documentation: Run Vector Search Queries](https://www.mongodb.com/docs/atlas/atlas-vector-search/vector-search-stage/)" + ], + "metadata": { + "id": "6V9QKgm8caNb" + } + }, + { + "cell_type": "code", + "source": [ + "\n", + "def query_results(query, k):\n", + " results = collection.aggregate([\n", + " {\n", + " '$vectorSearch': {\n", + " \"index\": ATLAS_VECTOR_SEARCH_INDEX_NAME,\n", + " \"path\": EMBEDDING_FIELD_NAME,\n", + " \"queryVector\": generate_embedding(query),\n", + " \"numCandidates\": 50,\n", + " \"limit\": 5,\n", + " }\n", + " }\n", + " ])\n", + " return results" + ], + "metadata": { + "id": "34tib9TrMPg4" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "query=\"imaginary characters from outerspace at war with earthlings\"\n", + "movies = query_results(query, 5)\n", + "\n", + "for movie in movies:\n", + " print(f'Movie Name: {movie[\"title\"]},\\nMovie Plot: {movie[\"plot\"]}\\n')" + ], + "metadata": { + "collapsed": true, + "id": "kTqrip-hWULK" + }, + "execution_count": null, + "outputs": [] + } + ] +} \ No newline at end of file diff --git a/registry.yaml b/registry.yaml index 42c8a7e664..14708272c8 100644 --- a/registry.yaml +++ b/registry.yaml @@ -1116,6 +1116,24 @@ - assistants - functions +- title: MongoDB Atlas Vector Search + path: examples/vector_databases/mongodb_atlas/README.md + date: 2023-11-21 + authors: + - prakul + tags: + - embeddings + - completions + +- title: Semantic search using MongoDB Atlas Vector Search and OpenAI + path: examples/vector_databases/mongodb_atlas/semantic_search_using_mongodb_atlas_vector_search.ipynb + date: 2023-11-21 + authors: + - prakul + tags: + - embeddings + - completions + - title: Evaluate RAG with LlamaIndex path: examples/evaluation/Evaluate_RAG_with_LlamaIndex.ipynb date: 2023-11-06 @@ -1123,4 +1141,5 @@ - Ravi Theja tags: - embeddings - - completions \ No newline at end of file + - completions +