From 4db1ed17fc31af32118617540c6103ecdf796154 Mon Sep 17 00:00:00 2001 From: Rohan Aggarwal Date: Mon, 30 Sep 2024 14:53:54 -0700 Subject: [PATCH] Oraclevs integration (#16161) --- .../docs/api_reference/embeddings/oracleai.md | 4 + docs/docs/api_reference/readers/oracleai.md | 5 + .../storage/vector_store/oracledb.md | 4 + docs/docs/api_reference/tools/oracleai.md | 4 + .../examples/cookbooks/oracleai_demo.ipynb | 891 ++++++++++++++++++ .../examples/data_connectors/oracleai.ipynb | 262 +++++ docs/docs/examples/embeddings/oracleai.ipynb | 287 ++++++ .../examples/vector_stores/orallamavs.ipynb | 612 ++++++++++++ docs/mkdocs.yml | 11 + .../.gitignore | 153 +++ .../llama-index-embeddings-oracleai/BUILD | 3 + .../llama-index-embeddings-oracleai/Makefile | 17 + .../llama-index-embeddings-oracleai/README.md | 40 + .../llama_index/embeddings/oracleai/BUILD | 1 + .../embeddings/oracleai/__init__.py | 4 + .../llama_index/embeddings/oracleai/base.py | 204 ++++ .../pyproject.toml | 57 ++ .../tests/BUILD | 1 + .../tests/__init__.py | 0 .../tests/test_embeddings_oracleai.py | 45 + .../llama-index-readers-oracleai/.gitignore | 153 +++ .../llama-index-readers-oracleai/BUILD | 3 + .../llama-index-readers-oracleai/Makefile | 17 + .../llama-index-readers-oracleai/README.md | 38 + .../llama_index/readers/oracleai/BUILD | 1 + .../llama_index/readers/oracleai/__init__.py | 4 + .../llama_index/readers/oracleai/base.py | 425 +++++++++ .../pyproject.toml | 58 ++ .../llama-index-readers-oracleai/tests/BUILD | 1 + .../tests/__init__.py | 0 .../tests/test_readers_oracleai.py | 142 +++ .../.gitignore | 153 +++ .../llama-index-vector-stores-oracledb/BUILD | 3 + .../Makefile | 17 + .../README.md | 5 + .../llama_index/vector_stores/oracledb/BUILD | 1 + .../vector_stores/oracledb/__init__.py | 3 + .../vector_stores/oracledb/base.py | 680 +++++++++++++ .../pyproject.toml | 63 ++ .../tests/BUILD | 1 + .../tests/__init__.py | 0 .../tests/test_vector_stores_orallamavs.py | 7 + 
.../llama-index-utils-oracleai/.gitignore | 153 +++ .../llama-index-utils-oracleai/BUILD | 3 + .../llama-index-utils-oracleai/Makefile | 17 + .../llama-index-utils-oracleai/README.md | 1 + .../llama_index/utils/oracleai/BUILD | 1 + .../llama_index/utils/oracleai/__init__.py | 4 + .../llama_index/utils/oracleai/base.py | 169 ++++ .../llama-index-utils-oracleai/pyproject.toml | 53 ++ .../llama-index-utils-oracleai/tests/BUILD | 1 + .../tests/__init__.py | 0 .../tests/test_utils_oracleai.py | 49 + 53 files changed, 4831 insertions(+) create mode 100644 docs/docs/api_reference/embeddings/oracleai.md create mode 100644 docs/docs/api_reference/readers/oracleai.md create mode 100644 docs/docs/api_reference/storage/vector_store/oracledb.md create mode 100644 docs/docs/api_reference/tools/oracleai.md create mode 100644 docs/docs/examples/cookbooks/oracleai_demo.ipynb create mode 100644 docs/docs/examples/data_connectors/oracleai.ipynb create mode 100644 docs/docs/examples/embeddings/oracleai.ipynb create mode 100644 docs/docs/examples/vector_stores/orallamavs.ipynb create mode 100644 llama-index-integrations/embeddings/llama-index-embeddings-oracleai/.gitignore create mode 100644 llama-index-integrations/embeddings/llama-index-embeddings-oracleai/BUILD create mode 100644 llama-index-integrations/embeddings/llama-index-embeddings-oracleai/Makefile create mode 100644 llama-index-integrations/embeddings/llama-index-embeddings-oracleai/README.md create mode 100644 llama-index-integrations/embeddings/llama-index-embeddings-oracleai/llama_index/embeddings/oracleai/BUILD create mode 100644 llama-index-integrations/embeddings/llama-index-embeddings-oracleai/llama_index/embeddings/oracleai/__init__.py create mode 100644 llama-index-integrations/embeddings/llama-index-embeddings-oracleai/llama_index/embeddings/oracleai/base.py create mode 100644 llama-index-integrations/embeddings/llama-index-embeddings-oracleai/pyproject.toml create mode 100644 
llama-index-integrations/embeddings/llama-index-embeddings-oracleai/tests/BUILD create mode 100644 llama-index-integrations/embeddings/llama-index-embeddings-oracleai/tests/__init__.py create mode 100644 llama-index-integrations/embeddings/llama-index-embeddings-oracleai/tests/test_embeddings_oracleai.py create mode 100644 llama-index-integrations/readers/llama-index-readers-oracleai/.gitignore create mode 100644 llama-index-integrations/readers/llama-index-readers-oracleai/BUILD create mode 100644 llama-index-integrations/readers/llama-index-readers-oracleai/Makefile create mode 100644 llama-index-integrations/readers/llama-index-readers-oracleai/README.md create mode 100644 llama-index-integrations/readers/llama-index-readers-oracleai/llama_index/readers/oracleai/BUILD create mode 100644 llama-index-integrations/readers/llama-index-readers-oracleai/llama_index/readers/oracleai/__init__.py create mode 100644 llama-index-integrations/readers/llama-index-readers-oracleai/llama_index/readers/oracleai/base.py create mode 100644 llama-index-integrations/readers/llama-index-readers-oracleai/pyproject.toml create mode 100644 llama-index-integrations/readers/llama-index-readers-oracleai/tests/BUILD create mode 100644 llama-index-integrations/readers/llama-index-readers-oracleai/tests/__init__.py create mode 100644 llama-index-integrations/readers/llama-index-readers-oracleai/tests/test_readers_oracleai.py create mode 100644 llama-index-integrations/vector_stores/llama-index-vector-stores-oracledb/.gitignore create mode 100644 llama-index-integrations/vector_stores/llama-index-vector-stores-oracledb/BUILD create mode 100644 llama-index-integrations/vector_stores/llama-index-vector-stores-oracledb/Makefile create mode 100644 llama-index-integrations/vector_stores/llama-index-vector-stores-oracledb/README.md create mode 100644 llama-index-integrations/vector_stores/llama-index-vector-stores-oracledb/llama_index/vector_stores/oracledb/BUILD create mode 100644 
llama-index-integrations/vector_stores/llama-index-vector-stores-oracledb/llama_index/vector_stores/oracledb/__init__.py create mode 100644 llama-index-integrations/vector_stores/llama-index-vector-stores-oracledb/llama_index/vector_stores/oracledb/base.py create mode 100644 llama-index-integrations/vector_stores/llama-index-vector-stores-oracledb/pyproject.toml create mode 100644 llama-index-integrations/vector_stores/llama-index-vector-stores-oracledb/tests/BUILD create mode 100644 llama-index-integrations/vector_stores/llama-index-vector-stores-oracledb/tests/__init__.py create mode 100644 llama-index-integrations/vector_stores/llama-index-vector-stores-oracledb/tests/test_vector_stores_orallamavs.py create mode 100644 llama-index-utils/llama-index-utils-oracleai/.gitignore create mode 100644 llama-index-utils/llama-index-utils-oracleai/BUILD create mode 100644 llama-index-utils/llama-index-utils-oracleai/Makefile create mode 100644 llama-index-utils/llama-index-utils-oracleai/README.md create mode 100644 llama-index-utils/llama-index-utils-oracleai/llama_index/utils/oracleai/BUILD create mode 100644 llama-index-utils/llama-index-utils-oracleai/llama_index/utils/oracleai/__init__.py create mode 100644 llama-index-utils/llama-index-utils-oracleai/llama_index/utils/oracleai/base.py create mode 100644 llama-index-utils/llama-index-utils-oracleai/pyproject.toml create mode 100644 llama-index-utils/llama-index-utils-oracleai/tests/BUILD create mode 100644 llama-index-utils/llama-index-utils-oracleai/tests/__init__.py create mode 100644 llama-index-utils/llama-index-utils-oracleai/tests/test_utils_oracleai.py diff --git a/docs/docs/api_reference/embeddings/oracleai.md b/docs/docs/api_reference/embeddings/oracleai.md new file mode 100644 index 0000000000000..581c3290fd409 --- /dev/null +++ b/docs/docs/api_reference/embeddings/oracleai.md @@ -0,0 +1,4 @@ +::: llama_index.embeddings.oracleai + options: + members: + - OracleEmbeddings diff --git 
a/docs/docs/api_reference/readers/oracleai.md b/docs/docs/api_reference/readers/oracleai.md new file mode 100644 index 0000000000000..881e6ed43c1ae --- /dev/null +++ b/docs/docs/api_reference/readers/oracleai.md @@ -0,0 +1,5 @@ +::: llama_index.readers.oracleai + options: + members: + - OracleReader + - OracleTextSplitter diff --git a/docs/docs/api_reference/storage/vector_store/oracledb.md b/docs/docs/api_reference/storage/vector_store/oracledb.md new file mode 100644 index 0000000000000..9051ce51f3bfa --- /dev/null +++ b/docs/docs/api_reference/storage/vector_store/oracledb.md @@ -0,0 +1,4 @@ +::: llama_index.vector_stores.oracledb + options: + members: + - OraLlamaVS diff --git a/docs/docs/api_reference/tools/oracleai.md b/docs/docs/api_reference/tools/oracleai.md new file mode 100644 index 0000000000000..5457eddb9fcf7 --- /dev/null +++ b/docs/docs/api_reference/tools/oracleai.md @@ -0,0 +1,4 @@ +::: llama_index.utils.oracleai + options: + members: + - OracleSummary diff --git a/docs/docs/examples/cookbooks/oracleai_demo.ipynb b/docs/docs/examples/cookbooks/oracleai_demo.ipynb new file mode 100644 index 0000000000000..e99796369ab20 --- /dev/null +++ b/docs/docs/examples/cookbooks/oracleai_demo.ipynb @@ -0,0 +1,891 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Oracle AI Vector Search with Document Processing\n", + "Oracle AI Vector Search is designed for Artificial Intelligence (AI) workloads and allows you to query data based on semantics rather than keywords.\n", + "One of the biggest benefits of Oracle AI Vector Search is that semantic search on unstructured data can be combined with relational search on business data in one single system.\n", + "This is not only powerful but also significantly more effective because you don't need to add a specialized vector database, eliminating the pain of data fragmentation between multiple systems.\n", + "\n", + "In addition, your vectors can benefit from all of Oracle Database’s 
most powerful features, like the following:\n", + "\n", + " * [Partitioning Support](https://www.oracle.com/database/technologies/partitioning.html)\n", + " * [Real Application Clusters scalability](https://www.oracle.com/database/real-application-clusters/)\n", + " * [Exadata smart scans](https://www.oracle.com/database/technologies/exadata/software/smartscan/)\n", + " * [Shard processing across geographically distributed databases](https://www.oracle.com/database/distributed-database/)\n", + " * [Transactions](https://docs.oracle.com/en/database/oracle/oracle-database/23/cncpt/transactions.html)\n", + " * [Parallel SQL](https://docs.oracle.com/en/database/oracle/oracle-database/21/vldbg/parallel-exec-intro.html#GUID-D28717E4-0F77-44F5-BB4E-234C31D4E4BA)\n", + " * [Disaster recovery](https://www.oracle.com/database/data-guard/)\n", + " * [Security](https://www.oracle.com/security/database-security/)\n", + " * [Oracle Machine Learning](https://www.oracle.com/artificial-intelligence/database-machine-learning/)\n", + " * [Oracle Graph Database](https://www.oracle.com/database/integrated-graph-database/)\n", + " * [Oracle Spatial and Graph](https://www.oracle.com/database/spatial/)\n", + " * [Oracle Blockchain](https://docs.oracle.com/en/database/oracle/oracle-database/23/arpls/dbms_blockchain_table.html#GUID-B469E277-978E-4378-A8C1-26D3FF96C9A6)\n", + " * [JSON](https://docs.oracle.com/en/database/oracle/oracle-database/23/adjsn/json-in-oracle-database.html)\n", + "\n", + "This guide demonstrates how Oracle AI Vector Search can be used with llama_index to serve an end-to-end RAG pipeline. 
This guide goes through examples of:\n", + "\n", + " * Loading the documents from various sources using OracleReader\n", + " * Summarizing them within/outside the database using OracleSummary\n", + " * Generating embeddings for them within/outside the database using OracleEmbeddings\n", + " * Chunking them according to different requirements using the advanced Oracle capabilities of OracleTextSplitter\n", + " * Storing and indexing them in a vector store and querying them using OraLlamaVS" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If you are just starting with Oracle Database, consider exploring the [free Oracle 23 AI](https://www.oracle.com/database/free/#resources), which provides a great introduction to setting up your database environment. While working with the database, it is often advisable to avoid using the system user by default; instead, you can create your own user for enhanced security and customization. For detailed steps on user creation, refer to the *Create Demo User* section of our [end-to-end guide](https://github.com/run-llama/llama_index/blob/main/docs/docs/examples/cookbooks/oracleai_demo.ipynb), which shows how to set up a user in Oracle. Additionally, understanding user privileges is crucial for managing database security effectively. You can learn more about this topic in the official [Oracle guide](https://docs.oracle.com/en/database/oracle/oracle-database/19/admqs/administering-user-accounts-and-security.html#GUID-36B21D72-1BBB-46C9-A0C9-F0D2A8591B8D) on administering user accounts and security." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Prerequisites\n", + "\n", + "Please install the Oracle `llama-index` integration packages:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install llama-index\n", + "%pip install llama_index-embeddings-oracleai\n", + "%pip install llama_index-readers-oracleai\n", + "%pip install llama_index-utils-oracleai\n", + "%pip install llama-index-vector-stores-oracledb" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create Demo User\n", + "First, create a demo user with all the required privileges. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Connection successful!\n", + "User setup done!\n" + ] + } + ], + "source": [ + "import sys\n", + "\n", + "import oracledb\n", + "\n", + "# Update with your username, password, hostname, and service_name\n", + "username = \"\"\n", + "password = \"\"\n", + "dsn = \"\"\n", + "\n", + "try:\n", + " conn = oracledb.connect(user=username, password=password, dsn=dsn)\n", + " print(\"Connection successful!\")\n", + "\n", + " cursor = conn.cursor()\n", + " try:\n", + " cursor.execute(\n", + " \"\"\"\n", + " begin\n", + " -- Drop user\n", + " begin\n", + " execute immediate 'drop user testuser cascade';\n", + " exception\n", + " when others then\n", + " dbms_output.put_line('Error dropping user: ' || SQLERRM);\n", + " end;\n", + " \n", + " -- Create user and grant privileges\n", + " execute immediate 'create user testuser identified by testuser';\n", + " execute immediate 'grant connect, unlimited tablespace, create credential, create procedure, create any index to testuser';\n", + " execute immediate 'create or replace directory DEMO_PY_DIR as ''/scratch/hroy/view_storage/hroy_devstorage/demo/orachain''';\n", + " execute immediate 'grant read, write 
on directory DEMO_PY_DIR to public';\n", + " execute immediate 'grant create mining model to testuser';\n", + " \n", + " -- Network access\n", + " begin\n", + " DBMS_NETWORK_ACL_ADMIN.APPEND_HOST_ACE(\n", + " host => '*',\n", + " ace => xs$ace_type(privilege_list => xs$name_list('connect'),\n", + " principal_name => 'testuser',\n", + " principal_type => xs_acl.ptype_db)\n", + " );\n", + " end;\n", + " end;\n", + " \"\"\"\n", + " )\n", + " print(\"User setup done!\")\n", + " except Exception as e:\n", + " print(f\"User setup failed with error: {e}\")\n", + " finally:\n", + " cursor.close()\n", + " conn.close()\n", + "except Exception as e:\n", + " print(f\"Connection failed with error: {e}\")\n", + " sys.exit(1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Process Documents using Oracle AI\n", + "Consider the following scenario: users possess documents stored either in an Oracle Database or a file system and intend to utilize this data with Oracle AI Vector Search powered by llama_index.\n", + "\n", + "To prepare the documents for analysis, a comprehensive preprocessing workflow is necessary. Initially, the documents must be retrieved, summarized (if required), and chunked as needed. Subsequent steps involve generating embeddings for these chunks and integrating them into the Oracle AI Vector Store. Users can then conduct semantic searches on this data.\n", + "\n", + "The Oracle AI Vector Search llama_index library encompasses a suite of document processing tools that facilitate document loading, chunking, summary generation, and embedding creation.\n", + "\n", + "In the sections that follow, we will detail the utilization of Oracle AI llama_index APIs to effectively implement each of these processes." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Connect to Demo User\n", + "The following sample code will show how to connect to Oracle Database. 
By default, python-oracledb runs in a ‘Thin’ mode which connects directly to Oracle Database. This mode does not need Oracle Client libraries. However, some additional functionality is available when python-oracledb uses them. Python-oracledb is said to be in ‘Thick’ mode when Oracle Client libraries are used. Both modes have comprehensive functionality supporting the Python Database API v2.0 Specification. See the following [guide](https://python-oracledb.readthedocs.io/en/latest/user_guide/appendix_a.html#featuresummary) that talks about features supported in each mode. You might want to switch to thick-mode if you are unable to use thin-mode." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Connection successful!\n" + ] + } + ], + "source": [ + "import sys\n", + "\n", + "import oracledb\n", + "\n", + "# please update with your username, password, hostname and service_name\n", + "username = \"\"\n", + "password = \"\"\n", + "dsn = \"\"\n", + "\n", + "try:\n", + " conn = oracledb.connect(user=username, password=password, dsn=dsn)\n", + " print(\"Connection successful!\")\n", + "except Exception as e:\n", + " print(\"Connection failed!\")\n", + " sys.exit(1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Populate a Demo Table\n", + "Create a demo table and insert some sample documents." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Table created and populated.\n" + ] + } + ], + "source": [ + "try:\n", + " cursor = conn.cursor()\n", + "\n", + " drop_table_sql = \"\"\"drop table demo_tab\"\"\"\n", + " cursor.execute(drop_table_sql)\n", + "\n", + " create_table_sql = \"\"\"create table demo_tab (id number, data clob)\"\"\"\n", + " cursor.execute(create_table_sql)\n", + "\n", + " insert_row_sql = \"\"\"insert into demo_tab values (:1, :2)\"\"\"\n", + " rows_to_insert = [\n", + " (\n", + " 1,\n", + " \"If the answer to any preceding questions is yes, then the database stops the search and allocates space from the specified tablespace; otherwise, space is allocated from the database default shared temporary tablespace.\",\n", + " ),\n", + " (\n", + " 2,\n", + " \"A tablespace can be online (accessible) or offline (not accessible) whenever the database is open.\\nA tablespace is usually online so that its data is available to users. The SYSTEM tablespace and temporary tablespaces cannot be taken offline.\",\n", + " ),\n", + " (\n", + " 3,\n", + " \"The database stores LOBs differently from other data types. Creating a LOB column implicitly creates a LOB segment and a LOB index. 
The tablespace containing the LOB segment and LOB index, which are always stored together, may be different from the tablespace containing the table.\\nSometimes the database can store small amounts of LOB data in the table itself rather than in a separate LOB segment.\",\n", + " ),\n", + " ]\n", + " cursor.executemany(insert_row_sql, rows_to_insert)\n", + "\n", + " conn.commit()\n", + "\n", + " print(\"Table created and populated.\")\n", + " cursor.close()\n", + "except Exception as e:\n", + " print(\"Table creation failed.\")\n", + " cursor.close()\n", + " conn.close()\n", + " sys.exit(1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "With the inclusion of a demo user and a populated sample table, the remaining configuration involves setting up embedding and summary functionalities. Users are presented with multiple provider options, including local database solutions and third-party services such as OCIGENAI, Hugging Face, and OpenAI. Should users opt for a third-party provider, they are required to establish credentials containing the necessary authentication details. Conversely, if selecting a database as the provider for embeddings, it is necessary to upload an ONNX model to the Oracle Database. No additional setup is required for summary functionalities when using the database option." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Load ONNX Model\n", + "\n", + "Oracle accommodates a variety of embedding providers, enabling users to choose between proprietary database solutions and third-party services such as OCIGENAI and HuggingFace. This selection dictates the methodology for generating and managing embeddings.\n", + "\n", + "***Important:*** Should users opt for the database option, they must upload an ONNX model into the Oracle Database. 
Conversely, if a third-party provider is selected for embedding generation, uploading an ONNX model to Oracle Database is not required.\n", + "\n", + "A significant advantage of utilizing an ONNX model directly within Oracle is the enhanced security and performance it offers by eliminating the need to transmit data to external parties. Additionally, this method avoids the latency typically associated with network or REST API calls.\n", + "\n", + "Below is the example code to upload an ONNX model into Oracle Database:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ONNX model loaded.\n" + ] + } + ], + "source": [ + "from llama_index.embeddings.oracleai import OracleEmbeddings\n", + "\n", + "# please update with your related information\n", + "# make sure that you have onnx file in the system\n", + "onnx_dir = \"DEMO_PY_DIR\"\n", + "onnx_file = \"tinybert.onnx\"\n", + "model_name = \"demo_model\"\n", + "\n", + "try:\n", + " OracleEmbeddings.load_onnx_model(conn, onnx_dir, onnx_file, model_name)\n", + " print(\"ONNX model loaded.\")\n", + "except Exception as e:\n", + " print(\"ONNX model loading failed!\")\n", + " sys.exit(1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create Credential\n", + "\n", + "When selecting third-party providers for generating embeddings, users are required to establish credentials to securely access the provider's endpoints.\n", + "\n", + "***Important:*** No credentials are necessary when opting for the 'database' provider to generate embeddings. 
However, should users decide to utilize a third-party provider, they must create credentials specific to the chosen provider.\n", + "\n", + "Below is an illustrative example:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "try:\n", + " cursor = conn.cursor()\n", + " cursor.execute(\n", + " \"\"\"\n", + " declare\n", + " jo json_object_t;\n", + " begin\n", + " -- HuggingFace\n", + " dbms_vector_chain.drop_credential(credential_name => 'HF_CRED');\n", + " jo := json_object_t();\n", + " jo.put('access_token', '');\n", + " dbms_vector_chain.create_credential(\n", + " credential_name => 'HF_CRED',\n", + " params => json(jo.to_string));\n", + "\n", + " -- OCIGENAI\n", + " dbms_vector_chain.drop_credential(credential_name => 'OCI_CRED');\n", + " jo := json_object_t();\n", + " jo.put('user_ocid','');\n", + " jo.put('tenancy_ocid','');\n", + " jo.put('compartment_ocid','');\n", + " jo.put('private_key','');\n", + " jo.put('fingerprint','');\n", + " dbms_vector_chain.create_credential(\n", + " credential_name => 'OCI_CRED',\n", + " params => json(jo.to_string));\n", + " end;\n", + " \"\"\"\n", + " )\n", + " cursor.close()\n", + " print(\"Credentials created.\")\n", + "except Exception as ex:\n", + " cursor.close()\n", + " raise" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Load Documents\n", + "Users have the flexibility to load documents from either the Oracle Database, a file system, or both, by appropriately configuring the loader parameters. For comprehensive details on these parameters, please consult the [Oracle AI Vector Search Guide](https://docs.oracle.com/en/database/oracle/oracle-database/23/arpls/dbms_vector_chain1.html#GUID-73397E89-92FB-48ED-94BB-1AD960C4EA1F).\n", + "\n", + "A significant advantage of utilizing OracleReader is its capability to process over 150 distinct file formats, eliminating the need for multiple loaders for different document types. 
For a complete list of the supported formats, please refer to the [Oracle Text Supported Document Formats](https://docs.oracle.com/en/database/oracle/oracle-database/23/ccref/oracle-text-supported-document-formats.html).\n", + "\n", + "Below is a sample code snippet that demonstrates how to use OracleReader:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of docs loaded: 3\n" + ] + } + ], + "source": [ + "from llama_index.core.schema import Document\n", + "from llama_index.readers.oracleai import OracleReader\n", + "\n", + "# loading from Oracle Database table\n", + "# make sure you have the table with this specification\n", + "loader_params = {}\n", + "loader_params = {\n", + " \"owner\": \"testuser\",\n", + " \"tablename\": \"demo_tab\",\n", + " \"colname\": \"data\",\n", + "}\n", + "\n", + "\"\"\" load the docs \"\"\"\n", + "loader = OracleReader(conn=conn, params=loader_params)\n", + "docs = loader.load()\n", + "\n", + "\"\"\" verify \"\"\"\n", + "print(f\"Number of docs loaded: {len(docs)}\")\n", + "# print(f\"Document-0: {docs[0].text}\") # content" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Generate Summary\n", + "Now that the documents are loaded, users may want to generate a summary for each document. The Oracle AI Vector Search llama_index library offers a suite of APIs designed for document summarization. It supports multiple summarization providers such as Database, OCIGENAI, and HuggingFace, among others, allowing users to select the provider that best meets their needs. To utilize these capabilities, users must configure the summary parameters as specified. For detailed information on these parameters, please consult the [Oracle AI Vector Search Guide](https://docs.oracle.com/en/database/oracle/oracle-database/23/arpls/dbms_vector_chain1.html#GUID-EC9DDB58-6A15-4B36-BA66-ECBA20D2CE57)." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "***Note:*** Users may need to set a proxy if they want to use third-party summary generation providers other than Oracle's in-house default provider, 'database'. If you don't have a proxy, please remove the proxy parameter when you instantiate OracleSummary." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# proxy to be used when we instantiate summary and embedder object\n", + "proxy = \"\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The following sample code shows how to generate a summary:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of Summaries: 3\n" + ] + } + ], + "source": [ + "from llama_index.core.schema import Document\n", + "from llama_index.utils.oracleai import OracleSummary\n", + "\n", + "# using 'database' provider\n", + "summary_params = {\n", + " \"provider\": \"database\",\n", + " \"glevel\": \"S\",\n", + " \"numParagraphs\": 1,\n", + " \"language\": \"english\",\n", + "}\n", + "\n", + "# get the summary instance\n", + "# Remove proxy if not required\n", + "summ = OracleSummary(conn=conn, params=summary_params, proxy=proxy)\n", + "\n", + "list_summary = []\n", + "for doc in docs:\n", + " summary = summ.get_summary(doc.text)\n", + " list_summary.append(summary)\n", + "\n", + "\"\"\" verify \"\"\"\n", + "print(f\"Number of Summaries: {len(list_summary)}\")\n", + "# print(f\"Summary-0: {list_summary[0]}\") #content" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Split Documents\n", + "The documents may vary in size, ranging from small to very large. Users often prefer to chunk their documents into smaller sections to facilitate the generation of embeddings. 
A wide array of customization options is available for this splitting process. For comprehensive details regarding these parameters, please consult the [Oracle AI Vector Search Guide](https://docs.oracle.com/en/database/oracle/oracle-database/23/arpls/dbms_vector_chain1.html#GUID-4E145629-7098-4C7C-804F-FC85D1F24240).\n", + "\n", + "Below is a sample code illustrating how to implement this:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of Chunks: 3\n" + ] + } + ], + "source": [ + "from llama_index.core.schema import Document\n", + "from llama_index.readers.oracleai import OracleTextSplitter\n", + "\n", + "# split by default parameters\n", + "splitter_params = {\"normalize\": \"all\"}\n", + "\n", + "\"\"\" get the splitter instance \"\"\"\n", + "splitter = OracleTextSplitter(conn=conn, params=splitter_params)\n", + "\n", + "list_chunks = []\n", + "for doc in docs:\n", + " chunks = splitter.split_text(doc.text)\n", + " list_chunks.extend(chunks)\n", + "\n", + "\"\"\" verify \"\"\"\n", + "print(f\"Number of Chunks: {len(list_chunks)}\")\n", + "# print(f\"Chunk-0: {list_chunks[0]}\") # content" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Generate Embeddings\n", + "Now that the documents are chunked as per requirements, the users may want to generate embeddings for these chunks. Oracle AI Vector Search provides multiple methods for generating embeddings, utilizing either locally hosted ONNX models or third-party APIs. For comprehensive instructions on configuring these alternatives, please refer to the [Oracle AI Vector Search Guide](https://docs.oracle.com/en/database/oracle/oracle-database/23/arpls/dbms_vector_chain1.html#GUID-C6439E94-4E86-4ECD-954E-4B73D53579DE)." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "***Note:*** Users may need to configure a proxy to utilize third-party embedding generation providers, excluding the 'database' provider that utilizes an ONNX model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# proxy to be used when we instantiate summary and embedder object\n", + "proxy = \"\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The following sample code will show how to generate embeddings:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of embeddings: 3\n" + ] + } + ], + "source": [ + "from llama_index.core.schema import Document\n", + "from llama_index.embeddings.oracleai import OracleEmbeddings\n", + "\n", + "# using ONNX model loaded to Oracle Database\n", + "embedder_params = {\"provider\": \"database\", \"model\": \"demo_model\"}\n", + "\n", + "# get the embedding instance\n", + "# Remove proxy if not required\n", + "embedder = OracleEmbeddings(conn=conn, params=embedder_params, proxy=proxy)\n", + "\n", + "embeddings = []\n", + "for doc in docs:\n", + " chunks = splitter.split_text(doc.text)\n", + " for chunk in chunks:\n", + " embed = embedder._get_text_embedding(chunk)\n", + " embeddings.append(embed)\n", + "\n", + "\"\"\" verify \"\"\"\n", + "print(f\"Number of embeddings: {len(embeddings)}\")\n", + "# print(f\"Embedding-0: {embeddings[0]}\") # content" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create Oracle AI Vector Store\n", + "Now that you know how to use Oracle AI llama_index library APIs individually to process the documents, let us show how to integrate with Oracle AI Vector Store to facilitate the semantic searches." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "First, let's import all the dependencies." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "\n", + "import oracledb\n", + "from llama_index.core.schema import Document, TextNode\n", + "from llama_index.readers.oracleai import OracleReader, OracleTextSplitter\n", + "from llama_index.embeddings.oracleai import OracleEmbeddings\n", + "from llama_index.utils.oracleai import OracleSummary\n", + "from llama_index.vector_stores.oracledb import OraLlamaVS, DistanceStrategy\n", + "from llama_index.vector_stores.oracledb import base as orallamavs" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next, let's combine all document processing stages together. Here is the sample code below:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Connection successful!\n", + "ONNX model loaded.\n", + "Number of total chunks with metadata: 3\n" + ] + } + ], + "source": [ + "\"\"\"\n", + "In this sample example, we will use 'database' provider for both summary and embeddings.\n", + "So, we don't need to do the followings:\n", + " - set proxy for 3rd party providers\n", + " - create credential for 3rd party providers\n", + "\n", + "If you choose to use 3rd party provider, \n", + "please follow the necessary steps for proxy and credential.\n", + "\"\"\"\n", + "\n", + "# oracle connection\n", + "# please update with your username, password, hostname, and service_name\n", + "username = \"testuser\"\n", + "password = \"testuser\"\n", + "dsn = \"\"\n", + "\n", + "try:\n", + " conn = oracledb.connect(user=username, password=password, dsn=dsn)\n", + " print(\"Connection successful!\")\n", + "except Exception as e:\n", + " print(\"Connection failed!\")\n", + " sys.exit(1)\n", + "\n", + "\n", + "# load 
onnx model\n", + "# please update with your related information\n", + "onnx_dir = \"DEMO_PY_DIR\"\n", + "onnx_file = \"tinybert.onnx\"\n", + "model_name = \"demo_model\"\n", + "try:\n", + " OracleEmbeddings.load_onnx_model(conn, onnx_dir, onnx_file, model_name)\n", + " print(\"ONNX model loaded.\")\n", + "except Exception as e:\n", + " print(\"ONNX model loading failed!\")\n", + " sys.exit(1)\n", + "\n", + "\n", + "# params\n", + "# please update necessary fields with related information\n", + "loader_params = {\n", + " \"owner\": \"testuser\",\n", + " \"tablename\": \"demo_tab\",\n", + " \"colname\": \"data\",\n", + "}\n", + "summary_params = {\n", + " \"provider\": \"database\",\n", + " \"glevel\": \"S\",\n", + " \"numParagraphs\": 1,\n", + " \"language\": \"english\",\n", + "}\n", + "splitter_params = {\"normalize\": \"all\"}\n", + "embedder_params = {\"provider\": \"database\", \"model\": \"demo_model\"}\n", + "\n", + "# instantiate loader, summary, splitter, and embedder\n", + "loader = OracleReader(conn=conn, params=loader_params)\n", + "summary = OracleSummary(conn=conn, params=summary_params)\n", + "splitter = OracleTextSplitter(conn=conn, params=splitter_params)\n", + "embedder = OracleEmbeddings(conn=conn, params=embedder_params)\n", + "\n", + "# process the documents\n", + "loader = OracleReader(conn=conn, params=loader_params)\n", + "docs = loader.load()\n", + "\n", + "chunks_with_mdata = []\n", + "for id, doc in enumerate(docs, start=1):\n", + " summ = summary.get_summary(doc.text)\n", + " chunks = splitter.split_text(doc.text)\n", + " for ic, chunk in enumerate(chunks, start=1):\n", + " chunk_metadata = doc.metadata.copy()\n", + " chunk_metadata[\"id\"] = (\n", + " chunk_metadata[\"_oid\"] + \"$\" + str(id) + \"$\" + str(ic)\n", + " )\n", + " chunk_metadata[\"document_id\"] = str(id)\n", + " chunk_metadata[\"document_summary\"] = str(summ[0])\n", + " textnode = TextNode(\n", + " text=chunk,\n", + " id_=chunk_metadata[\"id\"],\n", + " 
embedding=embedder._get_text_embedding(chunk),\n", + " metadata=chunk_metadata,\n", + " )\n", + " chunks_with_mdata.append(textnode)\n", + "\n", + "\"\"\" verify \"\"\"\n", + "print(f\"Number of total chunks with metadata: {len(chunks_with_mdata)}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "At this point, we have processed the documents and generated chunks with metadata. Next, we will create Oracle AI Vector Store with those chunks.\n", + "\n", + "Here is sample code showing how to do that:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Vector Store Table: oravs\n" + ] + } + ], + "source": [ + "# create Oracle AI Vector Store\n", + "vectorstore = OraLlamaVS.from_documents(\n", + " client=conn,\n", + " docs=chunks_with_mdata,\n", + " table_name=\"oravs\",\n", + " distance_strategy=DistanceStrategy.DOT_PRODUCT,\n", + ")\n", + "\n", + "\"\"\" verify \"\"\"\n", + "print(f\"Vector Store Table: {vectorstore.table_name}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The example provided illustrates the creation of a vector store using the DOT_PRODUCT distance strategy. Users have the flexibility to employ various distance strategies with the Oracle AI Vector Store, as detailed in our [comprehensive guide](https://github.com/run-llama/llama_index/blob/main/docs/docs/examples/vector_stores/orallamavs.ipynb)."
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "With embeddings now stored in the vector store, it is advisable to establish an index to enhance semantic search performance during query execution.\n", + "\n", + "***Note:*** Should you encounter an \"insufficient memory\" error, it is recommended to increase the ***vector_memory_size*** in your database configuration.\n", + "\n", + "Below is a sample code snippet for creating an index:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "orallamavs.create_index(\n", + " conn, vectorstore, params={\"idx_name\": \"hnsw_oravs\", \"idx_type\": \"HNSW\"}\n", + ")\n", + "\n", + "print(\"Index created.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This example demonstrates the creation of a default HNSW index on embeddings within the 'oravs' table. Users may adjust various parameters according to their specific needs. For detailed information on these parameters, please consult the [Oracle AI Vector Search Guide book](https://docs.oracle.com/en/database/oracle/oracle-database/23/vecse/manage-different-categories-vector-indexes.html).\n", + "\n", + "Additionally, various types of vector indices can be created to meet diverse requirements. More details can be found in our [comprehensive guide](https://github.com/run-llama/llama_index/blob/main/docs/docs/examples/vector_stores/orallamavs.ipynb).\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Perform Semantic Search\n", + "All set!\n", + "\n", + "We have successfully processed the documents and stored them in the vector store, followed by the creation of an index to enhance query performance.
We are now prepared to proceed with semantic searches.\n", + "\n", + "Below is the sample code for this process:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[Document(page_content='The database stores LOBs differently from other data types. Creating a LOB column implicitly creates a LOB segment and a LOB index. The tablespace containing the LOB segment and LOB index, which are always stored together, may be different from the tablespace containing the table. Sometimes the database can store small amounts of LOB data in the table itself rather than in a separate LOB segment.', metadata={'_oid': '662f2f257677f3c2311a8ff999fd34e5', '_rowid': 'AAAR/xAAEAAAAAnAAC', 'id': '662f2f257677f3c2311a8ff999fd34e5$3$1', 'document_id': '3', 'document_summary': 'Sometimes the database can store small amounts of LOB data in the table itself rather than in a separate LOB segment.\\n\\n'})]\n", + "[]\n", + "[(Document(page_content='The database stores LOBs differently from other data types. Creating a LOB column implicitly creates a LOB segment and a LOB index. The tablespace containing the LOB segment and LOB index, which are always stored together, may be different from the tablespace containing the table. 
Sometimes the database can store small amounts of LOB data in the table itself rather than in a separate LOB segment.', metadata={'_oid': '662f2f257677f3c2311a8ff999fd34e5', '_rowid': 'AAAR/xAAEAAAAAnAAC', 'id': '662f2f257677f3c2311a8ff999fd34e5$3$1', 'document_id': '3', 'document_summary': 'Sometimes the database can store small amounts of LOB data in the table itself rather than in a separate LOB segment.\\n\\n'}), 0.055675752460956573)]\n", + "[]\n", + "[Document(page_content='If the answer to any preceding questions is yes, then the database stops the search and allocates space from the specified tablespace; otherwise, space is allocated from the database default shared temporary tablespace.', metadata={'_oid': '662f2f253acf96b33b430b88699490a2', '_rowid': 'AAAR/xAAEAAAAAnAAA', 'id': '662f2f253acf96b33b430b88699490a2$1$1', 'document_id': '1', 'document_summary': 'If the answer to any preceding questions is yes, then the database stops the search and allocates space from the specified tablespace; otherwise, space is allocated from the database default shared temporary tablespace.\\n\\n'})]\n", + "[Document(page_content='If the answer to any preceding questions is yes, then the database stops the search and allocates space from the specified tablespace; otherwise, space is allocated from the database default shared temporary tablespace.', metadata={'_oid': '662f2f253acf96b33b430b88699490a2', '_rowid': 'AAAR/xAAEAAAAAnAAA', 'id': '662f2f253acf96b33b430b88699490a2$1$1', 'document_id': '1', 'document_summary': 'If the answer to any preceding questions is yes, then the database stops the search and allocates space from the specified tablespace; otherwise, space is allocated from the database default shared temporary tablespace.\\n\\n'})]\n" + ] + } + ], + "source": [ + "query = \"What is Oracle AI Vector Store?\"\n", + "filter = {\"document_id\": [\"1\"]}\n", + "\n", + "# Similarity search without a filter\n", + "print(vectorstore.similarity_search(query, 1))\n", + 
"\n", + "# Similarity search with a filter\n", + "print(vectorstore.similarity_search(query, 1, filter=filter))\n", + "\n", + "# Similarity search with relevance score\n", + "print(vectorstore.similarity_search_with_score(query, 1))\n", + "\n", + "# Similarity search with relevance score with filter\n", + "print(vectorstore.similarity_search_with_score(query, 1, filter=filter))\n", + "\n", + "# Max marginal relevance search\n", + "print(\n", + " vectorstore.max_marginal_relevance_search(\n", + " query, 1, fetch_k=20, lambda_mult=0.5\n", + " )\n", + ")\n", + "\n", + "# Max marginal relevance search with filter\n", + "print(\n", + " vectorstore.max_marginal_relevance_search(\n", + " query, 1, fetch_k=20, lambda_mult=0.5, filter=filter\n", + " )\n", + ")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/docs/docs/examples/data_connectors/oracleai.ipynb b/docs/docs/examples/data_connectors/oracleai.ipynb new file mode 100644 index 0000000000000..ea02c32d4a844 --- /dev/null +++ b/docs/docs/examples/data_connectors/oracleai.ipynb @@ -0,0 +1,262 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Oracle AI Vector Search: Document Processing\n", + "Oracle AI Vector Search is designed for Artificial Intelligence (AI) workloads that allows you to query data based on semantics, rather than keywords.\n", + "One of the biggest benefits of Oracle AI Vector Search is that semantic search on unstructured data can be combined with relational search on business data in one single system.\n", + "This is not only powerful but also significantly more effective because you don't 
need to add a specialized vector database, eliminating the pain of data fragmentation between multiple systems.\n", + "\n", + "In addition, your vectors can benefit from all of Oracle Database’s most powerful features, like the following:\n", + "\n", + " * [Partitioning Support](https://www.oracle.com/database/technologies/partitioning.html)\n", + " * [Real Application Clusters scalability](https://www.oracle.com/database/real-application-clusters/)\n", + " * [Exadata smart scans](https://www.oracle.com/database/technologies/exadata/software/smartscan/)\n", + " * [Shard processing across geographically distributed databases](https://www.oracle.com/database/distributed-database/)\n", + " * [Transactions](https://docs.oracle.com/en/database/oracle/oracle-database/23/cncpt/transactions.html)\n", + " * [Parallel SQL](https://docs.oracle.com/en/database/oracle/oracle-database/21/vldbg/parallel-exec-intro.html#GUID-D28717E4-0F77-44F5-BB4E-234C31D4E4BA)\n", + " * [Disaster recovery](https://www.oracle.com/database/data-guard/)\n", + " * [Security](https://www.oracle.com/security/database-security/)\n", + " * [Oracle Machine Learning](https://www.oracle.com/artificial-intelligence/database-machine-learning/)\n", + " * [Oracle Graph Database](https://www.oracle.com/database/integrated-graph-database/)\n", + " * [Oracle Spatial and Graph](https://www.oracle.com/database/spatial/)\n", + " * [Oracle Blockchain](https://docs.oracle.com/en/database/oracle/oracle-database/23/arpls/dbms_blockchain_table.html#GUID-B469E277-978E-4378-A8C1-26D3FF96C9A6)\n", + " * [JSON](https://docs.oracle.com/en/database/oracle/oracle-database/23/adjsn/json-in-oracle-database.html)\n", + "\n", + "\n", + "This guide demonstrates how to use Document Processing Capabilities within Oracle AI Vector Search to load and chunk documents using OracleReader and OracleTextSplitter respectively."
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If you are just starting with Oracle Database, consider exploring the [free Oracle 23 AI](https://www.oracle.com/database/free/#resources) which provides a great introduction to setting up your database environment. While working with the database, it is often advisable to avoid using the system user by default; instead, you can create your own user for enhanced security and customization. For detailed steps on user creation, refer to our [end-to-end guide](https://github.com/run-llama/llama_index/blob/main/docs/docs/examples/cookbooks/oracleai_demo.ipynb) which also shows how to set up a user in Oracle. Additionally, understanding user privileges is crucial for managing database security effectively. You can learn more about this topic in the official [Oracle guide](https://docs.oracle.com/en/database/oracle/oracle-database/19/admqs/administering-user-accounts-and-security.html#GUID-36B21D72-1BBB-46C9-A0C9-F0D2A8591B8D) on administering user accounts and security." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Prerequisites\n", + "\n", + "Please install Oracle Python Client driver to use llama_index with Oracle AI Vector Search. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install llama-index-readers-oracleai" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Connect to Oracle Database\n", + "The following sample code will show how to connect to Oracle Database. By default, python-oracledb runs in a ‘Thin’ mode which connects directly to Oracle Database. This mode does not need Oracle Client libraries. However, some additional functionality is available when python-oracledb uses them. Python-oracledb is said to be in ‘Thick’ mode when Oracle Client libraries are used. 
Both modes have comprehensive functionality supporting the Python Database API v2.0 Specification. See the following [guide](https://python-oracledb.readthedocs.io/en/latest/user_guide/appendix_a.html#featuresummary) that talks about features supported in each mode. You might want to switch to thick-mode if you are unable to use thin-mode." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "\n", + "import oracledb\n", + "\n", + "# please update with your username, password, hostname and service_name\n", + "username = \"\"\n", + "password = \"\"\n", + "dsn = \"/\"\n", + "\n", + "try:\n", + " conn = oracledb.connect(user=username, password=password, dsn=dsn)\n", + " print(\"Connection successful!\")\n", + "except Exception as e:\n", + " print(\"Connection failed!\")\n", + " sys.exit(1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now let's create a table and insert some sample docs to test." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "try:\n", + " cursor = conn.cursor()\n", + "\n", + " drop_table_sql = \"\"\"drop table if exists demo_tab\"\"\"\n", + " cursor.execute(drop_table_sql)\n", + "\n", + " create_table_sql = \"\"\"create table demo_tab (id number, data clob)\"\"\"\n", + " cursor.execute(create_table_sql)\n", + "\n", + " insert_row_sql = \"\"\"insert into demo_tab values (:1, :2)\"\"\"\n", + " rows_to_insert = [\n", + " (\n", + " 1,\n", + " \"If the answer to any preceding questions is yes, then the database stops the search and allocates space from the specified tablespace; otherwise, space is allocated from the database default shared temporary tablespace.\",\n", + " ),\n", + " (\n", + " 2,\n", + " \"A tablespace can be online (accessible) or offline (not accessible) whenever the database is open.\\nA tablespace is usually online so that its data is available to users. 
The SYSTEM tablespace and temporary tablespaces cannot be taken offline.\",\n", + " ),\n", + " (\n", + " 3,\n", + " \"The database stores LOBs differently from other data types. Creating a LOB column implicitly creates a LOB segment and a LOB index. The tablespace containing the LOB segment and LOB index, which are always stored together, may be different from the tablespace containing the table.\\nSometimes the database can store small amounts of LOB data in the table itself rather than in a separate LOB segment.\",\n", + " ),\n", + " ]\n", + " cursor.executemany(insert_row_sql, rows_to_insert)\n", + "\n", + " conn.commit()\n", + "\n", + " print(\"Table created and populated.\")\n", + " cursor.close()\n", + "except Exception as e:\n", + " print(\"Table creation failed.\")\n", + " cursor.close()\n", + " conn.close()\n", + " sys.exit(1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Load Documents\n", + "\n", + "Users have the flexibility to load documents from either the Oracle Database, a file system, or both, by appropriately configuring the loader parameters. For comprehensive details on these parameters, please consult the [Oracle AI Vector Search Guide](https://docs.oracle.com/en/database/oracle/oracle-database/23/arpls/dbms_vector_chain1.html#GUID-73397E89-92FB-48ED-94BB-1AD960C4EA1F).\n", + "\n", + "A significant advantage of utilizing OracleReader is its capability to process over 150 distinct file formats, eliminating the need for multiple loaders for different document types.
For a complete list of the supported formats, please refer to the [Oracle Text Supported Document Formats](https://docs.oracle.com/en/database/oracle/oracle-database/23/ccref/oracle-text-supported-document-formats.html).\n", + "\n", + "Below is a sample code snippet that demonstrates how to use OracleReader:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from llama_index.core.schema import Document\n", + "from llama_index.readers.oracleai import OracleReader\n", + "\n", + "\"\"\"\n", + "# loading a local file\n", + "loader_params = {}\n", + "loader_params[\"file\"] = \"\"\n", + "\n", + "# loading from a local directory\n", + "loader_params = {}\n", + "loader_params[\"dir\"] = \"\"\n", + "\"\"\"\n", + "\n", + "# loading from Oracle Database table\n", + "loader_params = {\n", + " \"owner\": \"\",\n", + " \"tablename\": \"demo_tab\",\n", + " \"colname\": \"data\",\n", + "}\n", + "\n", + "\"\"\" load the docs \"\"\"\n", + "loader = OracleReader(conn=conn, params=loader_params)\n", + "docs = loader.load()\n", + "\n", + "\"\"\" verify \"\"\"\n", + "print(f\"Number of docs loaded: {len(docs)}\")\n", + "# print(f\"Document-0: {docs[0].text}\") # content" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Split Documents\n", + "The documents may vary in size, ranging from small to very large. Users often prefer to chunk their documents into smaller sections to facilitate the generation of embeddings. A wide array of customization options is available for this splitting process.
For comprehensive details regarding these parameters, please consult the [Oracle AI Vector Search Guide](https://docs.oracle.com/en/database/oracle/oracle-database/23/arpls/dbms_vector_chain1.html#GUID-4E145629-7098-4C7C-804F-FC85D1F24240).\n", + "\n", + "Below is a sample code illustrating how to implement this:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from llama_index.core.schema import Document\n", + "from llama_index.readers.oracleai import OracleTextSplitter\n", + "\n", + "\"\"\"\n", + "# Some examples\n", + "# split by chars, max 500 chars\n", + "splitter_params = {\"split\": \"chars\", \"max\": 500, \"normalize\": \"all\"}\n", + "\n", + "# split by words, max 100 words\n", + "splitter_params = {\"split\": \"words\", \"max\": 100, \"normalize\": \"all\"}\n", + "\n", + "# split by sentence, max 20 sentences\n", + "splitter_params = {\"split\": \"sentence\", \"max\": 20, \"normalize\": \"all\"}\n", + "\"\"\"\n", + "\n", + "# split by default parameters\n", + "splitter_params = {\"normalize\": \"all\"}\n", + "\n", + "# get the splitter instance\n", + "splitter = OracleTextSplitter(conn=conn, params=splitter_params)\n", + "\n", + "list_chunks = []\n", + "for doc in docs:\n", + " chunks = splitter.split_text(doc.text)\n", + " list_chunks.extend(chunks)\n", + "\n", + "\"\"\" verify \"\"\"\n", + "print(f\"Number of Chunks: {len(list_chunks)}\")\n", + "# print(f\"Chunk-0: {list_chunks[0]}\") # content" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### End to End Demo\n", + "Please refer to our complete demo guide [Oracle AI Vector Search End-to-End Demo Guide](https://github.com/run-llama/llama_index/blob/main/docs/docs/examples/cookbooks/oracleai_demo.ipynb) to build an end to end RAG pipeline with the help of Oracle AI Vector Search.\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": 
"python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/docs/docs/examples/embeddings/oracleai.ipynb b/docs/docs/examples/embeddings/oracleai.ipynb new file mode 100644 index 0000000000000..3fed66a14321f --- /dev/null +++ b/docs/docs/examples/embeddings/oracleai.ipynb @@ -0,0 +1,287 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Oracle AI Vector Search: Generate Embeddings\n", + "Oracle AI Vector Search is designed for Artificial Intelligence (AI) workloads that allows you to query data based on semantics, rather than keywords.\n", + "One of the biggest benefits of Oracle AI Vector Search is that semantic search on unstructured data can be combined with relational search on business data in one single system.\n", + "This is not only powerful but also significantly more effective because you don't need to add a specialized vector database, eliminating the pain of data fragmentation between multiple systems.\n", + "\n", + "In addition, your vectors can benefit from all of Oracle Database’s most powerful features, like the following:\n", + "\n", + " * [Partitioning Support](https://www.oracle.com/database/technologies/partitioning.html)\n", + " * [Real Application Clusters scalability](https://www.oracle.com/database/real-application-clusters/)\n", + " * [Exadata smart scans](https://www.oracle.com/database/technologies/exadata/software/smartscan/)\n", + " * [Shard processing across geographically distributed databases](https://www.oracle.com/database/distributed-database/)\n", + " * [Transactions](https://docs.oracle.com/en/database/oracle/oracle-database/23/cncpt/transactions.html)\n", + " * [Parallel 
SQL](https://docs.oracle.com/en/database/oracle/oracle-database/21/vldbg/parallel-exec-intro.html#GUID-D28717E4-0F77-44F5-BB4E-234C31D4E4BA)\n", + " * [Disaster recovery](https://www.oracle.com/database/data-guard/)\n", + " * [Security](https://www.oracle.com/security/database-security/)\n", + " * [Oracle Machine Learning](https://www.oracle.com/artificial-intelligence/database-machine-learning/)\n", + " * [Oracle Graph Database](https://www.oracle.com/database/integrated-graph-database/)\n", + " * [Oracle Spatial and Graph](https://www.oracle.com/database/spatial/)\n", + " * [Oracle Blockchain](https://docs.oracle.com/en/database/oracle/oracle-database/23/arpls/dbms_blockchain_table.html#GUID-B469E277-978E-4378-A8C1-26D3FF96C9A6)\n", + " * [JSON](https://docs.oracle.com/en/database/oracle/oracle-database/23/adjsn/json-in-oracle-database.html)\n", + "\n", + "\n", + "The guide demonstrates how to use Embedding Capabilities within Oracle AI Vector Search to generate embeddings for your documents using OracleEmbeddings." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If you are just starting with Oracle Database, consider exploring the [free Oracle 23 AI](https://www.oracle.com/database/free/#resources) which provides a great introduction to setting up your database environment. While working with the database, it is often advisable to avoid using the system user by default; instead, you can create your own user for enhanced security and customization. For detailed steps on user creation, refer to our [end-to-end guide](https://github.com/run-llama/llama_index/blob/main/docs/docs/examples/cookbooks/oracleai_demo.ipynb) which also shows how to set up a user in Oracle. Additionally, understanding user privileges is crucial for managing database security effectively. 
You can learn more about this topic in the official [Oracle guide](https://docs.oracle.com/en/database/oracle/oracle-database/19/admqs/administering-user-accounts-and-security.html#GUID-36B21D72-1BBB-46C9-A0C9-F0D2A8591B8D) on administering user accounts and security." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Prerequisites\n", + "\n", + "Ensure you have the Oracle Python Client driver installed to facilitate the integration of llama_index with Oracle AI Vector Search." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install llama-index-embeddings-oracleai" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Connect to Oracle Database\n", + "The following sample code will show how to connect to Oracle Database. By default, python-oracledb runs in a ‘Thin’ mode which connects directly to Oracle Database. This mode does not need Oracle Client libraries. However, some additional functionality is available when python-oracledb uses them. Python-oracledb is said to be in ‘Thick’ mode when Oracle Client libraries are used. Both modes have comprehensive functionality supporting the Python Database API v2.0 Specification. See the following [guide](https://python-oracledb.readthedocs.io/en/latest/user_guide/appendix_a.html#featuresummary) that talks about features supported in each mode. You might want to switch to thick-mode if you are unable to use thin-mode." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "\n", + "import oracledb\n", + "\n", + "# Update the following variables with your Oracle database credentials and connection details\n", + "username = \"\"\n", + "password = \"\"\n", + "dsn = \"/\"\n", + "\n", + "try:\n", + " conn = oracledb.connect(user=username, password=password, dsn=dsn)\n", + " print(\"Connection successful!\")\n", + "except Exception as e:\n", + " print(\"Connection failed!\")\n", + " sys.exit(1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For embedding generation, several provider options are available to users, including embedding generation within the database and third-party services such as OcigenAI, Hugging Face, and OpenAI. Users opting for third-party providers must establish credentials that include the requisite authentication information. Alternatively, if users select 'database' as their provider, they are required to load an ONNX model into the Oracle Database to facilitate embeddings." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Load ONNX Model\n", + "\n", + "Oracle accommodates a variety of embedding providers, enabling users to choose between proprietary database solutions and third-party services such as OCIGENAI and HuggingFace. This selection dictates the methodology for generating and managing embeddings.\n", + "\n", + "***Important*** : Should users opt for the database option, they must upload an ONNX model into the Oracle Database. Conversely, if a third-party provider is selected for embedding generation, uploading an ONNX model to Oracle Database is not required.\n", + "\n", + "A significant advantage of utilizing an ONNX model directly within Oracle is the enhanced security and performance it offers by eliminating the need to transmit data to external parties. 
Additionally, this method avoids the latency typically associated with network or REST API calls.\n", + "\n", + "Below is the example code to upload an ONNX model into Oracle Database:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from llama_index.embeddings.oracleai import OracleEmbeddings\n", + "\n", + "# please update with your related information\n", + "# make sure that you have onnx file in the system\n", + "onnx_dir = \"DEMO_DIR\"\n", + "onnx_file = \"tinybert.onnx\"\n", + "model_name = \"demo_model\"\n", + "\n", + "try:\n", + " OracleEmbeddings.load_onnx_model(conn, onnx_dir, onnx_file, model_name)\n", + " print(\"ONNX model loaded.\")\n", + "except Exception as e:\n", + " print(\"ONNX model loading failed!\")\n", + " sys.exit(1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create Credential\n", + "\n", + "When selecting third-party providers for generating embeddings, users are required to establish credentials to securely access the provider's endpoints.\n", + "\n", + "***Important:*** No credentials are necessary when opting for the 'database' provider to generate embeddings. 
However, should users decide to utilize a third-party provider, they must create credentials specific to the chosen provider.\n", + "\n", + "Below is an illustrative example:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "try:\n", + " cursor = conn.cursor()\n", + " cursor.execute(\n", + " \"\"\"\n", + " declare\n", + " jo json_object_t;\n", + " begin\n", + " -- HuggingFace\n", + " dbms_vector_chain.drop_credential(credential_name => 'HF_CRED');\n", + " jo := json_object_t();\n", + " jo.put('access_token', '');\n", + " dbms_vector_chain.create_credential(\n", + " credential_name => 'HF_CRED',\n", + " params => json(jo.to_string));\n", + "\n", + " -- OCIGENAI\n", + " dbms_vector_chain.drop_credential(credential_name => 'OCI_CRED');\n", + " jo := json_object_t();\n", + " jo.put('user_ocid','');\n", + " jo.put('tenancy_ocid','');\n", + " jo.put('compartment_ocid','');\n", + " jo.put('private_key','');\n", + " jo.put('fingerprint','');\n", + " dbms_vector_chain.create_credential(\n", + " credential_name => 'OCI_CRED',\n", + " params => json(jo.to_string));\n", + " end;\n", + " \"\"\"\n", + " )\n", + " cursor.close()\n", + " print(\"Credentials created.\")\n", + "except Exception as ex:\n", + " cursor.close()\n", + " raise" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Generate Embeddings\n", + "\n", + "Oracle AI Vector Search provides multiple methods for generating embeddings, utilizing either locally hosted ONNX models or third-party APIs. For comprehensive instructions on configuring these alternatives, please refer to the [Oracle AI Vector Search Guide](https://docs.oracle.com/en/database/oracle/oracle-database/23/arpls/dbms_vector_chain1.html#GUID-C6439E94-4E86-4ECD-954E-4B73D53579DE)." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "***Note:*** Users may need to configure a proxy to utilize third-party embedding generation providers, excluding the 'database' provider that utilizes an ONNX model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# proxy to be used when we instantiate summary and embedder object\n", + "proxy = \"\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The following sample code will show how to generate embeddings:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from llama_index.embeddings.oracleai import OracleEmbeddings\n", + "\n", + "\"\"\"\n", + "# using ocigenai\n", + "embedder_params = {\n", + " \"provider\": \"ocigenai\",\n", + " \"credential_name\": \"OCI_CRED\",\n", + " \"url\": \"https://inference.generativeai.us-chicago-1.oci.oraclecloud.com/20231130/actions/embedText\",\n", + " \"model\": \"cohere.embed-english-light-v3.0\",\n", + "}\n", + "\n", + "# using huggingface\n", + "embedder_params = {\n", + " \"provider\": \"huggingface\", \n", + " \"credential_name\": \"HF_CRED\", \n", + " \"url\": \"https://api-inference.huggingface.co/pipeline/feature-extraction/\", \n", + " \"model\": \"sentence-transformers/all-MiniLM-L6-v2\", \n", + " \"wait_for_model\": \"true\"\n", + "}\n", + "\"\"\"\n", + "\n", + "# using ONNX model loaded to Oracle Database\n", + "embedder_params = {\"provider\": \"database\", \"model\": \"demo_model\"}\n", + "\n", + "# Remove proxy if not required\n", + "embedder = OracleEmbeddings(conn=conn, params=embedder_params, proxy=proxy)\n", + "embed = embedder._get_text_embedding(\"Hello World!\")\n", + "\n", + "\"\"\" verify \"\"\"\n", + "print(f\"Embedding generated by OracleEmbeddings: {embed}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### End to End Demo\n", + "Please 
refer to our complete demo guide [Oracle AI Vector Search End-to-End Demo Guide](https://github.com/run-llama/llama_index/blob/main/docs/docs/examples/cookbooks/oracleai_demo.ipynb) to build an end to end RAG pipeline with the help of Oracle AI Vector Search.\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/docs/docs/examples/vector_stores/orallamavs.ipynb b/docs/docs/examples/vector_stores/orallamavs.ipynb new file mode 100644 index 0000000000000..d1816bdef5c84 --- /dev/null +++ b/docs/docs/examples/vector_stores/orallamavs.ipynb @@ -0,0 +1,612 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "dd33e9d5-9dba-4aac-9f7f-4cf9e6686593", + "metadata": {}, + "source": [ + "# Oracle AI Vector Search: Vector Store\n", + "\n", + "Oracle AI Vector Search is designed for Artificial Intelligence (AI) workloads that allows you to query data based on semantics, rather than keywords.\n", + "One of the biggest benefits of Oracle AI Vector Search is that semantic search on unstructured data can be combined with relational search on business data in one single system.\n", + "This is not only powerful but also significantly more effective because you don't need to add a specialized vector database, eliminating the pain of data fragmentation between multiple systems.\n", + "\n", + "In addition, your vectors can benefit from all of Oracle Database’s most powerful features, like the following:\n", + "\n", + " * [Partitioning Support](https://www.oracle.com/database/technologies/partitioning.html)\n", + " * [Real Application Clusters 
scalability](https://www.oracle.com/database/real-application-clusters/)\n", + " * [Exadata smart scans](https://www.oracle.com/database/technologies/exadata/software/smartscan/)\n", + " * [Shard processing across geographically distributed databases](https://www.oracle.com/database/distributed-database/)\n", + " * [Transactions](https://docs.oracle.com/en/database/oracle/oracle-database/23/cncpt/transactions.html)\n", + " * [Parallel SQL](https://docs.oracle.com/en/database/oracle/oracle-database/21/vldbg/parallel-exec-intro.html#GUID-D28717E4-0F77-44F5-BB4E-234C31D4E4BA)\n", + " * [Disaster recovery](https://www.oracle.com/database/data-guard/)\n", + " * [Security](https://www.oracle.com/security/database-security/)\n", + " * [Oracle Machine Learning](https://www.oracle.com/artificial-intelligence/database-machine-learning/)\n", + " * [Oracle Graph Database](https://www.oracle.com/database/integrated-graph-database/)\n", + " * [Oracle Spatial and Graph](https://www.oracle.com/database/spatial/)\n", + " * [Oracle Blockchain](https://docs.oracle.com/en/database/oracle/oracle-database/23/arpls/dbms_blockchain_table.html#GUID-B469E277-978E-4378-A8C1-26D3FF96C9A6)\n", + " * [JSON](https://docs.oracle.com/en/database/oracle/oracle-database/23/adjsn/json-in-oracle-database.html)\n", + "\n", + "The guide demonstrates how to use Vector Capabilities within Oracle AI Vector Search.\n", + "\n", + "If you are just starting with Oracle Database, consider exploring the [free Oracle 23 AI](https://www.oracle.com/database/free/#resources) which provides a great introduction to setting up your database environment. While working with the database, it is often advisable to avoid using the system user by default; instead, you can create your own user for enhanced security and customization. 
For detailed steps on user creation, refer to our [end-to-end guide](https://github.com/run-llama/llama_index/blob/main/docs/docs/examples/cookbooks/oracleai_demo.ipynb) which also shows how to set up a user in Oracle. Additionally, understanding user privileges is crucial for managing database security effectively. You can learn more about this topic in the official [Oracle guide](https://docs.oracle.com/en/database/oracle/oracle-database/19/admqs/administering-user-accounts-and-security.html#GUID-36B21D72-1BBB-46C9-A0C9-F0D2A8591B8D) on administering user accounts and security." + ] + }, + { + "cell_type": "markdown", + "id": "7bd80054-c803-47e1-a259-c40ed073c37d", + "metadata": {}, + "source": [ + "### Prerequisites\n", + "\n", + "Please install Oracle Python Client driver to use Llama Index with Oracle AI Vector Search. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2bbb989d-c6fb-4ab9-bafd-a95fd48538d0", + "metadata": {}, + "outputs": [], + "source": [ + "%pip install llama-index-vector-stores-oracledb" + ] + }, + { + "cell_type": "markdown", + "id": "0fceaa5a-95da-4ebd-8b8d-5e73bb653172", + "metadata": {}, + "source": [ + "### Connect to Oracle AI Vector Search\n", + "\n", + "The following sample code will show how to connect to Oracle Database. By default, python-oracledb runs in a ‘Thin’ mode which connects directly to Oracle Database. This mode does not need Oracle Client libraries. However, some additional functionality is available when python-oracledb uses them. Python-oracledb is said to be in ‘Thick’ mode when Oracle Client libraries are used. Both modes have comprehensive functionality supporting the Python Database API v2.0 Specification. See the following [guide](https://python-oracledb.readthedocs.io/en/latest/user_guide/appendix_a.html#featuresummary) that talks about features supported in each mode. You might want to switch to thick-mode if you are unable to use thin-mode." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4421e4b7-2c7e-4bcd-82b3-9576595edd0f", + "metadata": {}, + "outputs": [], + "source": [ + "import oracledb\n", + "\n", + "# please update with your username, password, hostname and service_name\n", + "username = \"<username>\"\n", + "password = \"<password>\"\n", + "dsn = \"<hostname>/<service_name>\"\n", + "\n", + "try:\n", + " connection = oracledb.connect(user=username, password=password, dsn=dsn)\n", + " print(\"Connection successful!\")\n", + "except Exception as ex:\n", + " print(\"Connection failed!\", ex)" + ] + }, + { + "cell_type": "markdown", + "id": "b11cf362-01b0-485d-8527-31b0fbb5028e", + "metadata": {}, + "source": [ + "### Import the required dependencies to play with Oracle AI Vector Search" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "43ea59e3-2910-45a6-b195-5f06094bb7c9", + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "import os\n", + "\n", + "\n", + "from llama_index.core.schema import NodeRelationship, RelatedNodeInfo, TextNode\n", + "from llama_index.core.vector_stores.types import (\n", + " ExactMatchFilter,\n", + " MetadataFilters,\n", + " VectorStoreQuery,\n", + ")\n", + "\n", + "from llama_index.vector_stores.oracledb import base as orallamavs\n", + "from llama_index.vector_stores.oracledb import OraLlamaVS, DistanceStrategy" + ] + }, + { + "cell_type": "markdown", + "id": "0aac10dc-a9cc-4fdb-901c-1b7a4bbbe5a7", + "metadata": {}, + "source": [ + "### Load Documents" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "70ac6982-b13a-4e8c-9c47-57c6d136ac60", + "metadata": {}, + "outputs": [], + "source": [ + "# Define a list of documents (four dummy example documents)\n", + "\n", + "text_json_list = [\n", + " {\n", + " \"text\": \"If the answer to any preceding questions is yes, then the database stops the search and allocates space from the specified tablespace; otherwise, space is allocated from the database
default shared temporary tablespace.\",\n", + " \"id_\": \"cncpt_15.5.3.2.2_P4\",\n", + " \"embedding\": [1.0, 0.0],\n", + " \"relationships\": \"test-0\",\n", + " \"metadata\": {\n", + " \"weight\": 1.0,\n", + " \"rank\": \"a\",\n", + " \"url\": \"https://docs.oracle.com/en/database/oracle/oracle-database/23/cncpt/logical-storage-structures.html#GUID-5387D7B2-C0CA-4C1E-811B-C7EB9B636442\",\n", + " },\n", + " },\n", + " {\n", + " \"text\": \"A tablespace can be online (accessible) or offline (not accessible) whenever the database is open.\\nA tablespace is usually online so that its data is available to users. The SYSTEM tablespace and temporary tablespaces cannot be taken offline.\",\n", + " \"id_\": \"cncpt_15.5.5_P1\",\n", + " \"embedding\": [0.0, 1.0],\n", + " \"relationships\": \"test-1\",\n", + " \"metadata\": {\n", + " \"weight\": 2.0,\n", + " \"rank\": \"c\",\n", + " \"url\": \"https://docs.oracle.com/en/database/oracle/oracle-database/23/cncpt/logical-storage-structures.html#GUID-D02B2220-E6F5-40D9-AFB5-BC69BCEF6CD4\",\n", + " },\n", + " },\n", + " {\n", + " \"text\": \"The database stores LOBs differently from other data types. Creating a LOB column implicitly creates a LOB segment and a LOB index. The tablespace containing the LOB segment and LOB index, which are always stored together, may be different from the tablespace containing the table.\\nSometimes the database can store small amounts of LOB data in the table itself rather than in a separate LOB segment.\",\n", + " \"id_\": \"cncpt_22.3.4.3.1_P2\",\n", + " \"embedding\": [1.0, 1.0],\n", + " \"relationships\": \"test-2\",\n", + " \"metadata\": {\n", + " \"weight\": 3.0,\n", + " \"rank\": \"d\",\n", + " \"url\": \"https://docs.oracle.com/en/database/oracle/oracle-database/23/cncpt/concepts-for-database-developers.html#GUID-3C50EAB8-FC39-4BB3-B680-4EACCE49E866\",\n", + " },\n", + " },\n", + " {\n", + " \"text\": \"The LOB segment stores data in pieces called chunks. 
A chunk is a logically contiguous set of data blocks and is the smallest unit of allocation for a LOB. A row in the table stores a pointer called a LOB locator, which points to the LOB index. When the table is queried, the database uses the LOB index to quickly locate the LOB chunks.\",\n", + " \"id_\": \"cncpt_22.3.4.3.1_P3\",\n", + " \"embedding\": [2.0, 1.0],\n", + " \"relationships\": \"test-3\",\n", + " \"metadata\": {\n", + " \"weight\": 4.0,\n", + " \"rank\": \"e\",\n", + " \"url\": \"https://docs.oracle.com/en/database/oracle/oracle-database/23/cncpt/concepts-for-database-developers.html#GUID-3C50EAB8-FC39-4BB3-B680-4EACCE49E866\",\n", + " },\n", + " },\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "eaa942d6-5954-4898-8c32-3627b923a3a5", + "metadata": {}, + "outputs": [], + "source": [ + "# Create Llama Text Nodes\n", + "text_nodes = []\n", + "for text_json in text_json_list:\n", + " # Construct the relationships using RelatedNodeInfo\n", + " relationships = {\n", + " NodeRelationship.SOURCE: RelatedNodeInfo(\n", + " node_id=text_json[\"relationships\"]\n", + " )\n", + " }\n", + "\n", + " # Prepare the metadata dictionary; you might want to exclude certain metadata fields if necessary\n", + " metadata = {\n", + " \"weight\": text_json[\"metadata\"][\"weight\"],\n", + " \"rank\": text_json[\"metadata\"][\"rank\"],\n", + " }\n", + "\n", + " # Create a TextNode instance\n", + " text_node = TextNode(\n", + " text=text_json[\"text\"],\n", + " id_=text_json[\"id_\"],\n", + " embedding=text_json[\"embedding\"],\n", + " relationships=relationships,\n", + " metadata=metadata,\n", + " )\n", + "\n", + " text_nodes.append(text_node)\n", + "print(text_nodes)" + ] + }, + { + "cell_type": "markdown", + "id": "6823f5e6-997c-4f15-927b-bd44c61f105f", + "metadata": {}, + "source": [ + "### Using AI Vector Search Create a bunch of Vector Stores with different distance strategies\n", + "\n", + "First we will create three vector stores each 
with different distance functions. Since we have not created indices in them yet, they will just create tables for now. Later we will use these vector stores to create HNSW indices.\n", + "\n", + "If you connect to the Oracle Database manually, you will see three tables \n", + "Documents_DOT, Documents_COSINE and Documents_EUCLIDEAN. \n", + "\n", + "We will then create three additional tables Documents_DOT_IVF, Documents_COSINE_IVF and Documents_EUCLIDEAN_IVF which will be used\n", + "to create IVF indices on the tables instead of HNSW indices. \n", + "\n", + "To understand more about the different types of indices Oracle AI Vector Search supports, refer to the following [guide](https://docs.oracle.com/en/database/oracle/oracle-database/23/vecse/manage-different-categories-vector-indexes.html)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ed1b253e-5f5c-4a81-983c-74645213a170", + "metadata": {}, + "outputs": [], + "source": [ + "# Ingest documents into Oracle Vector Store using different distance strategies\n", + "\n", + "vector_store_dot = OraLlamaVS.from_documents(\n", + " text_nodes,\n", + " table_name=\"Documents_DOT\",\n", + " client=connection,\n", + " distance_strategy=DistanceStrategy.DOT_PRODUCT,\n", + ")\n", + "vector_store_max = OraLlamaVS.from_documents(\n", + " text_nodes,\n", + " table_name=\"Documents_COSINE\",\n", + " client=connection,\n", + " distance_strategy=DistanceStrategy.COSINE,\n", + ")\n", + "vector_store_euclidean = OraLlamaVS.from_documents(\n", + " text_nodes,\n", + " table_name=\"Documents_EUCLIDEAN\",\n", + " client=connection,\n", + " distance_strategy=DistanceStrategy.EUCLIDEAN_DISTANCE,\n", + ")\n", + "\n", + "# Ingest the same documents into a second set of tables that will later get IVF indices\n", + "vector_store_dot_ivf = OraLlamaVS.from_documents(\n", + " text_nodes,\n", + " table_name=\"Documents_DOT_IVF\",\n", + " client=connection,\n", + " distance_strategy=DistanceStrategy.DOT_PRODUCT,\n", + ")\n", +
"vector_store_max_ivf = OraLlamaVS.from_documents(\n", + " text_nodes,\n", + " table_name=\"Documents_COSINE_IVF\",\n", + " client=connection,\n", + " distance_strategy=DistanceStrategy.COSINE,\n", + ")\n", + "vector_store_euclidean_ivf = OraLlamaVS.from_documents(\n", + " text_nodes,\n", + " table_name=\"Documents_EUCLIDEAN_IVF\",\n", + " client=connection,\n", + " distance_strategy=DistanceStrategy.EUCLIDEAN_DISTANCE,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "77c29505-8688-4b87-9a99-e648fbb2d425", + "metadata": {}, + "source": [ + "### Demonstrating add, delete operations for texts, and basic similarity search\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "306563ae-577b-4bc7-8a92-3dd6a59310f5", + "metadata": {}, + "outputs": [], + "source": [ + "def manage_texts(vector_stores):\n", + " \"\"\"\n", + " Adds texts to each vector store, demonstrates error handling for duplicate additions,\n", + " and performs deletion of texts. Also runs a basic similarity search against each vector store.\n", + "\n", + " Args:\n", + " - vector_stores (list): A list of OraLlamaVS instances.\n", + " \"\"\"\n", + " for i, vs in enumerate(vector_stores, start=1):\n", + " # Adding texts\n", + " try:\n", + " vs.add_texts(text_nodes, metadata)\n", + " print(f\"\\n\\n\\nAdd texts complete for vector store {i}\\n\\n\\n\")\n", + " except Exception as ex:\n", + " print(\n", + " f\"\\n\\n\\nExpected error on duplicate add for vector store {i}\\n\\n\\n\"\n", + " )\n", + "\n", + " # Deleting texts using the value of 'id'\n", + " vs.delete(\"test-1\")\n", + " print(f\"\\n\\n\\nDelete texts complete for vector store {i}\\n\\n\\n\")\n", + "\n", + " # Similarity search\n", + " query = VectorStoreQuery(\n", + " query_embedding=[1.0, 1.0], similarity_top_k=3\n", + " )\n", + " results = vs.query(query=query)\n", + " print(\n", + " f\"\\n\\n\\nSimilarity search results for vector store {i}: {results}\\n\\n\\n\"\n", + " )\n", + "\n", + "\n", +
"vector_store_list = [\n", + " vector_store_dot,\n", + " vector_store_max,\n", + " vector_store_euclidean,\n", + " vector_store_dot_ivf,\n", + " vector_store_max_ivf,\n", + " vector_store_euclidean_ivf,\n", + "]\n", + "manage_texts(vector_store_list)" + ] + }, + { + "cell_type": "markdown", + "id": "0980cb33-69cf-4547-842a-afdc4d6fa7d3", + "metadata": {}, + "source": [ + "### Demonstrating index creation with specific parameters for each distance strategy\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "46298a27-e309-456e-b2b8-771d9cb3be29", + "metadata": {}, + "outputs": [], + "source": [ + "def create_search_indices(connection):\n", + " \"\"\"\n", + " Creates search indices for the vector stores, each with specific parameters tailored to their distance strategy.\n", + " \"\"\"\n", + " # Index for DOT_PRODUCT strategy\n", + " # Notice we are creating a HNSW index with default parameters\n", + " # This will default to creating a HNSW index with 8 Parallel Workers and use the Default Accuracy used by Oracle AI Vector Search\n", + " orallamavs.create_index(\n", + " connection,\n", + " vector_store_dot,\n", + " params={\"idx_name\": \"hnsw_idx1\", \"idx_type\": \"HNSW\"},\n", + " )\n", + "\n", + " # Index for COSINE strategy with specific parameters\n", + " # Notice we are creating a HNSW index with parallel 16 and Target Accuracy Specification as 97 percent\n", + " orallamavs.create_index(\n", + " connection,\n", + " vector_store_max,\n", + " params={\n", + " \"idx_name\": \"hnsw_idx2\",\n", + " \"idx_type\": \"HNSW\",\n", + " \"accuracy\": 97,\n", + " \"parallel\": 16,\n", + " },\n", + " )\n", + "\n", + " # Index for EUCLIDEAN_DISTANCE strategy with specific parameters\n", + " # Notice we are creating a HNSW index by specifying Power User Parameters which are neighbors = 64 and efConstruction = 100\n", + " orallamavs.create_index(\n", + " connection,\n", + " vector_store_euclidean,\n", + " params={\n", + " \"idx_name\": \"hnsw_idx3\",\n", + 
" \"idx_type\": \"HNSW\",\n", + " \"neighbors\": 64,\n", + " \"efConstruction\": 100,\n", + " },\n", + " )\n", + "\n", + " # Index for DOT_PRODUCT strategy with specific parameters\n", + " # Notice we are creating an IVF index with default parameters\n", + " # This will default to creating an IVF index with 8 Parallel Workers and use the Default Accuracy used by Oracle AI Vector Search\n", + " orallamavs.create_index(\n", + " connection,\n", + " vector_store_dot_ivf,\n", + " params={\n", + " \"idx_name\": \"ivf_idx1\",\n", + " \"idx_type\": \"IVF\",\n", + " },\n", + " )\n", + "\n", + " # Index for COSINE strategy with specific parameters\n", + " # Notice we are creating an IVF index with parallel 32 and Target Accuracy Specification as 90 percent\n", + " orallamavs.create_index(\n", + " connection,\n", + " vector_store_max_ivf,\n", + " params={\n", + " \"idx_name\": \"ivf_idx2\",\n", + " \"idx_type\": \"IVF\",\n", + " \"accuracy\": 90,\n", + " \"parallel\": 32,\n", + " },\n", + " )\n", + "\n", + " # Index for EUCLIDEAN_DISTANCE strategy with specific parameters\n", + " # Notice we are creating an IVF index by specifying Power User Parameters which is neighbor_part = 64\n", + " orallamavs.create_index(\n", + " connection,\n", + " vector_store_euclidean_ivf,\n", + " params={\n", + " \"idx_name\": \"ivf_idx3\",\n", + " \"idx_type\": \"IVF\",\n", + " \"neighbor_part\": 64,\n", + " },\n", + " )\n", + "\n", + " print(\"Index creation complete.\")\n", + "\n", + "\n", + "create_search_indices(connection)" + ] + }, + { + "cell_type": "markdown", + "id": "7223d048-5c0b-4e91-a91b-a7daa9f86758", + "metadata": {}, + "source": [ + "### Now we will conduct a bunch of advanced searches on all six vector stores. Each of these three searches have a with and without filter version. 
The filters match on document metadata (for example, `rank` equal to `c`), so only matching documents are returned and everything else is filtered out" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "37ca2e7d-9803-4260-95e7-62776d4fb820", + "metadata": {}, + "outputs": [], + "source": [ + "# Conduct advanced searches after creating the indices\n", + "def conduct_advanced_searches(vector_stores):\n", + " # The filters below are exact-match comparisons against document metadata\n", + " # (for example, keeping only documents whose metadata 'rank' is exactly 'c')\n", + "\n", + " for i, vs in enumerate(vector_stores, start=1):\n", + "\n", + " def query_without_filters_returns_all_rows_sorted_by_similarity():\n", + " print(f\"\\n--- Vector Store {i} Advanced Searches ---\")\n", + " # Similarity search without a filter\n", + " print(\"\\nSimilarity search results without filter:\")\n", + " query = VectorStoreQuery(\n", + " query_embedding=[1.0, 1.0], similarity_top_k=3\n", + " )\n", + " print(vs.query(query=query))\n", + "\n", + " query_without_filters_returns_all_rows_sorted_by_similarity()\n", + "\n", + " def query_with_filters_returns_multiple_matches():\n", + " print(f\"\\n--- Vector Store {i} Advanced Searches ---\")\n", + " # Similarity search with filter\n", + " print(\"\\nSimilarity search results with filter:\")\n", + " filters = MetadataFilters(\n", + " filters=[ExactMatchFilter(key=\"rank\", value=\"c\")]\n", + " )\n", + " query = VectorStoreQuery(\n", + " query_embedding=[1.0, 1.0], filters=filters, similarity_top_k=1\n", + " )\n", + " result = vs.query(query)\n", + " print(result.ids)\n", + "\n", + " query_with_filters_returns_multiple_matches()\n", + "\n", + " def query_with_filter_applies_top_k():\n", + " print(f\"\\n--- Vector Store {i} Advanced Searches ---\")\n", + " # Similarity search with a filter\n", + " print(\"\\nSimilarity search results with filter:\")\n", + " filters = MetadataFilters(\n", + " filters=[ExactMatchFilter(key=\"rank\", value=\"c\")]\n", + " )\n", + " query =
VectorStoreQuery(\n", + " query_embedding=[1.0, 1.0], filters=filters, similarity_top_k=1\n", + " )\n", + " result = vs.query(query)\n", + " print(result.ids)\n", + "\n", + " query_with_filter_applies_top_k()\n", + "\n", + " def query_with_filter_applies_node_id_filter():\n", + " print(f\"\\n--- Vector Store {i} Advanced Searches ---\")\n", + " # Similarity search with a filter\n", + " print(\"\\nSimilarity search results with filter:\")\n", + " filters = MetadataFilters(\n", + " filters=[ExactMatchFilter(key=\"rank\", value=\"c\")]\n", + " )\n", + " query = VectorStoreQuery(\n", + " query_embedding=[1.0, 1.0],\n", + " filters=filters,\n", + " similarity_top_k=3,\n", + " node_ids=[\"452D24AB-F185-414C-A352-590B4B9EE51B\"],\n", + " )\n", + " result = vs.query(query)\n", + " print(result.ids)\n", + "\n", + " query_with_filter_applies_node_id_filter()\n", + "\n", + " def query_with_exact_filters_returns_single_match():\n", + " print(f\"\\n--- Vector Store {i} Advanced Searches ---\")\n", + " # Similarity search with a filter\n", + " print(\"\\nSimilarity search results with filter:\")\n", + " filters = MetadataFilters(\n", + " filters=[\n", + " ExactMatchFilter(key=\"rank\", value=\"c\"),\n", + " ExactMatchFilter(key=\"weight\", value=2),\n", + " ]\n", + " )\n", + " query = VectorStoreQuery(\n", + " query_embedding=[1.0, 1.0], filters=filters\n", + " )\n", + " result = vs.query(query)\n", + " print(result.ids)\n", + "\n", + " query_with_exact_filters_returns_single_match()\n", + "\n", + " def query_with_contradictive_filter_returns_no_matches():\n", + " filters = MetadataFilters(\n", + " filters=[\n", + " ExactMatchFilter(key=\"weight\", value=2),\n", + " ExactMatchFilter(key=\"weight\", value=3),\n", + " ]\n", + " )\n", + " query = VectorStoreQuery(\n", + " query_embedding=[1.0, 1.0], filters=filters\n", + " )\n", + " result = vs.query(query)\n", + " print(result.ids)\n", + "\n", + " query_with_contradictive_filter_returns_no_matches()\n", + "\n", + " def 
query_with_filter_on_unknown_field_returns_no_matches():\n", + " print(f\"\\n--- Vector Store {i} Advanced Searches ---\")\n", + " # Similarity search with a filter\n", + " print(\"\\nSimilarity search results with filter:\")\n", + " filters = MetadataFilters(\n", + " filters=[ExactMatchFilter(key=\"unknown_field\", value=\"c\")]\n", + " )\n", + " query = VectorStoreQuery(\n", + " query_embedding=[1.0, 1.0], filters=filters\n", + " )\n", + " result = vs.query(query)\n", + " print(result.ids)\n", + "\n", + " query_with_filter_on_unknown_field_returns_no_matches()\n", + "\n", + " def delete_removes_document_from_query_results():\n", + " vs.delete(\"test-1\")\n", + " query = VectorStoreQuery(\n", + " query_embedding=[1.0, 1.0], similarity_top_k=2\n", + " )\n", + " result = vs.query(query)\n", + " print(result.ids)\n", + "\n", + " delete_removes_document_from_query_results()\n", + "\n", + "\n", + "conduct_advanced_searches(vector_store_list)" + ] + }, + { + "cell_type": "markdown", + "id": "0da8c7e2-0db0-4363-b31b-a7a5e3f83717", + "metadata": {}, + "source": [ + "### End to End Demo\n", + "Please refer to our complete demo guide [Oracle AI Vector Search End-to-End Demo Guide](https://github.com/run-llama/llama_index/blob/main/docs/docs/examples/cookbooks/oracleai_demo.ipynb) to build an end to end RAG pipeline with the help of Oracle AI Vector Search.\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml index f6b6b823e8968..3846121b55ea4 100644 --- a/docs/mkdocs.yml +++ b/docs/mkdocs.yml @@ -144,6 +144,7 @@ nav: - 
./examples/cookbooks/llama3_cookbook_ollama_replicate.ipynb - ./examples/cookbooks/mistralai.ipynb - ./examples/cookbooks/mixedbread_reranker.ipynb + - ./examples/cookbooks/oracleai_demo.ipynb - ./examples/cookbooks/oreilly_course_cookbooks/Module-2/Components_Of_LlamaIndex.ipynb - ./examples/cookbooks/oreilly_course_cookbooks/Module-3/Evaluating_RAG_Systems.ipynb - ./examples/cookbooks/oreilly_course_cookbooks/Module-4/Ingestion_Pipeline.ipynb @@ -194,6 +195,7 @@ nav: - ./examples/data_connectors/WebPageDemo.ipynb - ./examples/data_connectors/deplot/DeplotReader.ipynb - ./examples/data_connectors/html_tag_reader.ipynb + - ./examples/data_connectors/oracleai.ipynb - ./examples/data_connectors/simple_directory_reader.ipynb - ./examples/data_connectors/simple_directory_reader_parallel.ipynb - ./examples/data_connectors/simple_directory_reader_remote_fs.ipynb @@ -243,6 +245,7 @@ nav: - ./examples/embeddings/ollama_embedding.ipynb - ./examples/embeddings/openvino.ipynb - ./examples/embeddings/optimum_intel.ipynb + - ./examples/embeddings/oracleai.ipynb - ./examples/embeddings/premai.ipynb - ./examples/embeddings/sagemaker_embedding_endpoint.ipynb - ./examples/embeddings/text_embedding_inference.ipynb @@ -659,6 +662,7 @@ nav: - ./examples/vector_stores/existing_data/pinecone_existing_data.ipynb - ./examples/vector_stores/existing_data/weaviate_existing_data.ipynb - ./examples/vector_stores/neo4j_metadata_filter.ipynb + - ./examples/vector_stores/orallamavs.ipynb - ./examples/vector_stores/pinecone_auto_retriever.ipynb - ./examples/vector_stores/pinecone_metadata_filter.ipynb - ./examples/vector_stores/postgres.ipynb @@ -882,6 +886,7 @@ nav: - ./api_reference/embeddings/octoai.md - ./api_reference/embeddings/ollama.md - ./api_reference/embeddings/openai.md + - ./api_reference/embeddings/oracleai.md - ./api_reference/embeddings/premai.md - ./api_reference/embeddings/sagemaker_endpoint.md - ./api_reference/embeddings/text_embeddings_inference.md @@ -1326,6 +1331,7 @@ nav: 
- ./api_reference/readers/openapi.md - ./api_reference/readers/opendal.md - ./api_reference/readers/opensearch.md + - ./api_reference/readers/oracleai.md - ./api_reference/readers/pandas_ai.md - ./api_reference/readers/papers.md - ./api_reference/readers/patentsview.md @@ -1506,6 +1512,7 @@ nav: - ./api_reference/storage/vector_store/neo4jvector.md - ./api_reference/storage/vector_store/neptune.md - ./api_reference/storage/vector_store/opensearch.md + - ./api_reference/storage/vector_store/oracledb.md - ./api_reference/storage/vector_store/pgvecto_rs.md - ./api_reference/storage/vector_store/pinecone.md - ./api_reference/storage/vector_store/postgres.md @@ -1561,6 +1568,7 @@ nav: - ./api_reference/tools/ondemand_loader.md - ./api_reference/tools/openai.md - ./api_reference/tools/openapi.md + - ./api_reference/tools/oracleai.md - ./api_reference/tools/passio_nutrition_ai.md - ./api_reference/tools/playgrounds.md - ./api_reference/tools/python_file.md @@ -2196,6 +2204,9 @@ plugins: - ../llama-index-integrations/readers/llama-index-readers-quip - ../llama-index-integrations/sparse_embeddings/llama-index-sparse-embeddings-fastembed - ../llama-index-integrations/node_parser/llama-index-node-parser-topic + - ../llama-index-integrations/vector_stores/llama-index-vector-stores-oracledb + - ../llama-index-integrations/embeddings/llama-index-embeddings-oracleai + - ../llama-index-integrations/readers/llama-index-readers-oracleai - ../llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-mistralai - redirects: redirect_maps: diff --git a/llama-index-integrations/embeddings/llama-index-embeddings-oracleai/.gitignore b/llama-index-integrations/embeddings/llama-index-embeddings-oracleai/.gitignore new file mode 100644 index 0000000000000..990c18de22908 --- /dev/null +++ b/llama-index-integrations/embeddings/llama-index-embeddings-oracleai/.gitignore @@ -0,0 +1,153 @@ +llama_index/_static +.DS_Store +# Byte-compiled / optimized / DLL files +__pycache__/ 
+*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +bin/ +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +etc/ +include/ +lib/ +lib64/ +parts/ +sdist/ +share/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +.ruff_cache + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints +notebooks/ + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ +pyvenv.cfg + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# Jetbrains +.idea +modules/ +*.swp + +# VsCode +.vscode + +# pipenv +Pipfile +Pipfile.lock + +# pyright +pyrightconfig.json diff --git a/llama-index-integrations/embeddings/llama-index-embeddings-oracleai/BUILD b/llama-index-integrations/embeddings/llama-index-embeddings-oracleai/BUILD new file mode 100644 index 0000000000000..0896ca890d8bf --- /dev/null +++ b/llama-index-integrations/embeddings/llama-index-embeddings-oracleai/BUILD @@ -0,0 +1,3 @@ +poetry_requirements( + name="poetry", +) diff --git a/llama-index-integrations/embeddings/llama-index-embeddings-oracleai/Makefile b/llama-index-integrations/embeddings/llama-index-embeddings-oracleai/Makefile new file mode 100644 index 0000000000000..b9eab05aa3706 --- /dev/null +++ b/llama-index-integrations/embeddings/llama-index-embeddings-oracleai/Makefile @@ -0,0 +1,17 @@ +GIT_ROOT ?= $(shell git rev-parse --show-toplevel) + +help: ## Show all Makefile targets. + @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[33m%-30s\033[0m %s\n", $$1, $$2}' + +format: ## Run code autoformatters (black). + pre-commit install + git ls-files | xargs pre-commit run black --files + +lint: ## Run linters: pre-commit (black, ruff, codespell) and mypy + pre-commit install && git ls-files | xargs pre-commit run --show-diff-on-failure --files + +test: ## Run tests via pytest. + pytest tests + +watch-docs: ## Build and watch documentation. 
+ sphinx-autobuild docs/ docs/_build/html --open-browser --watch $(GIT_ROOT)/llama_index/ diff --git a/llama-index-integrations/embeddings/llama-index-embeddings-oracleai/README.md b/llama-index-integrations/embeddings/llama-index-embeddings-oracleai/README.md new file mode 100644 index 0000000000000..b4550d9e681ea --- /dev/null +++ b/llama-index-integrations/embeddings/llama-index-embeddings-oracleai/README.md @@ -0,0 +1,40 @@ +# LlamaIndex Embeddings Integration: Oracleai + +This API is to generate an embedding from a text. + +`pip install llama-index-embeddings-oracleai` + +# A sample example + +```python +from typing import TYPE_CHECKING +from llama_index.core.embeddings import BaseEmbedding +from llama_index.embeddings.oracleai import OracleEmbeddings + +if TYPE_CHECKING: + import oracledb + +""" get the Oracle connection """ +conn = oracledb.connect( + user="", + password="", + dsn="", +) +print("Oracle connection is established...") + +""" params """ +embedder_params = {"provider": "", "model": ""} +proxy = "" + +""" instance """ +embedder = OracleEmbeddings(conn=conn, params=embedder_params, proxy=proxy) + +embed = embedder._get_text_embedding("Hello World, Text!") +print(f"Embedding generated by OracleEmbeddings: {embed}") + +embed = embedder._get_query_embedding("Hello World, Query!") +print(f"Embedding generated by OracleEmbeddings: {embed}") + +conn.close() +print("Connection is closed.") +``` diff --git a/llama-index-integrations/embeddings/llama-index-embeddings-oracleai/llama_index/embeddings/oracleai/BUILD b/llama-index-integrations/embeddings/llama-index-embeddings-oracleai/llama_index/embeddings/oracleai/BUILD new file mode 100644 index 0000000000000..db46e8d6c978c --- /dev/null +++ b/llama-index-integrations/embeddings/llama-index-embeddings-oracleai/llama_index/embeddings/oracleai/BUILD @@ -0,0 +1 @@ +python_sources() diff --git a/llama-index-integrations/embeddings/llama-index-embeddings-oracleai/llama_index/embeddings/oracleai/__init__.py 
class OracleEmbeddings(BaseEmbedding):
    """Compute embeddings inside Oracle Database.

    Each request is sent over the supplied connection to
    ``dbms_vector_chain.utl_to_embeddings``; ``params`` is JSON-encoded and
    passed as the embedding preferences.

    Args:
        conn: Open Oracle connection used for every call.
        params: Embedding parameters, serialized with ``json.dumps``.
        proxy: Optional proxy; when set, ``utl_http.set_proxy`` is called with
            it before each embedding request.
        **kwargs: Forwarded unchanged to ``BaseEmbedding``.
    """

    _conn: Any = PrivateAttr()
    _params: Dict[str, Any] = PrivateAttr()
    _proxy: Optional[str] = PrivateAttr()

    def __init__(
        self,
        conn: Connection,
        params: Dict[str, Any],
        proxy: Optional[str] = None,
        **kwargs: Any,
    ):
        super().__init__(**kwargs)
        self._conn = conn
        self._proxy = proxy
        self._params = params

    @classmethod
    def class_name(cls) -> str:
        """Return the identifier string of this embedding class."""
        return "OracleEmbeddings"

    @staticmethod
    def load_onnx_model(
        conn: Connection, dir: str, onnx_file: str, model_name: str
    ) -> None:
        """Load an ONNX model into Oracle Database.

        Any existing model called ``model_name`` is dropped first.

        Args:
            conn: Oracle connection.
            dir: Oracle directory that contains the ONNX file.
            onnx_file: ONNX file name.
            model_name: Name under which the model is registered.

        Raises:
            ValueError: If any argument is ``None``.
            Exception: Whatever the database call raises (re-raised as is).

        Note:
            The user needs the create procedure, create mining model and
            create any directory privileges.
        """
        # Validate before touching the database so the error is not masked by
        # cleanup of a cursor that was never opened.
        if conn is None or dir is None or onnx_file is None or model_name is None:
            raise ValueError("Invalid input")

        cursor = None
        try:
            cursor = conn.cursor()
            cursor.execute(
                """
                begin
                    dbms_data_mining.drop_model(model_name => :model, force => true);
                    SYS.DBMS_VECTOR.load_onnx_model(:path, :filename, :model, json('{"function" : "embedding", "embeddingOutput" : "embedding" , "input": {"input": ["DATA"]}}'));
                end;""",
                path=dir,
                filename=onnx_file,
                model=model_name,
            )
        except Exception as ex:
            print(f"An exception occurred :: {ex}")
            raise
        finally:
            if cursor is not None:
                cursor.close()

    def _get_embedding(self, text: str) -> List[float]:
        """Embed a single text.

        Returns:
            The embedding vector, ``[]`` when the database returns no row, or
            ``None`` when ``text`` is ``None``.
        """
        try:
            import oracledb
        except ImportError as e:
            raise ImportError(
                "Unable to import oracledb, please install with "
                "`pip install -U oracledb`."
            ) from e

        if text is None:
            return None

        cursor = None
        try:
            # return strings or bytes instead of a LOB locator
            oracledb.defaults.fetch_lobs = False
            cursor = self._conn.cursor()

            if self._proxy:
                cursor.execute(
                    "begin utl_http.set_proxy(:proxy); end;", proxy=self._proxy
                )

            cursor.execute(
                "select t.* from dbms_vector_chain.utl_to_embeddings(:content, json(:params)) t",
                content=text,
                params=json.dumps(self._params),
            )

            row = cursor.fetchone()
            if row is None:
                return []

            rdata = json.loads(row[0])
            # the vector itself is a JSON string nested inside the JSON row
            return json.loads(rdata["embed_vector"])
        except Exception as ex:
            print(f"An exception occurred :: {ex}")
            raise
        finally:
            if cursor is not None:
                cursor.close()

    def _get_embeddings(self, texts: List[str]) -> List[List[float]]:
        """Embed several texts with one database call.

        Args:
            texts: The list of texts to embed.

        Returns:
            One embedding per row returned by the database, or ``None`` when
            ``texts`` is ``None``.
        """
        try:
            import oracledb
        except ImportError as e:
            raise ImportError(
                "Unable to import oracledb, please install with "
                "`pip install -U oracledb`."
            ) from e

        if texts is None:
            return None
        if not texts:
            # nothing to embed; skip the database round trip
            return []

        embeddings: List[List[float]] = []
        cursor = None
        try:
            # return strings or bytes instead of a LOB locator
            oracledb.defaults.fetch_lobs = False
            # The connection, proxy and params live in the private attributes
            # assigned in __init__.
            cursor = self._conn.cursor()

            if self._proxy:
                cursor.execute(
                    "begin utl_http.set_proxy(:proxy); end;", proxy=self._proxy
                )

            chunks = [
                json.dumps({"chunk_id": i, "chunk_data": text})
                for i, text in enumerate(texts, start=1)
            ]

            vector_array_type = self._conn.gettype("SYS.VECTOR_ARRAY_T")
            inputs = vector_array_type.newobject(chunks)
            cursor.execute(
                "select t.* "
                + "from dbms_vector_chain.utl_to_embeddings(:content, "
                + "json(:params)) t",
                content=inputs,
                params=json.dumps(self._params),
            )

            for row in cursor:
                if row is None:
                    embeddings.append([])
                else:
                    rdata = json.loads(row[0])
                    # the vector itself is a JSON string nested inside the row
                    embeddings.append(json.loads(rdata["embed_vector"]))

            return embeddings
        except Exception as ex:
            print(f"An exception occurred :: {ex}")
            raise
        finally:
            if cursor is not None:
                cursor.close()

    def _get_query_embedding(self, query: str) -> List[float]:
        return self._get_embedding(query)

    async def _aget_query_embedding(self, query: str) -> List[float]:
        # No async database path here: delegates to the synchronous call.
        return self._get_query_embedding(query)

    def _get_text_embedding(self, text: str) -> List[float]:
        return self._get_embedding(text)

    async def _aget_text_embedding(self, text: str) -> List[float]:
        # No async database path here: delegates to the synchronous call.
        return self._get_text_embedding(text)

    def _get_text_embeddings(self, texts: List[str]) -> List[List[float]]:
        return self._get_embeddings(texts)

    async def _aget_text_embeddings(self, texts: List[str]) -> List[List[float]]:
        # No async database path here: delegates to the synchronous call.
        return self._get_text_embeddings(texts)
b/llama-index-integrations/embeddings/llama-index-embeddings-oracleai/pyproject.toml @@ -0,0 +1,57 @@ +[build-system] +build-backend = "poetry.core.masonry.api" +requires = ["poetry-core"] + +[tool.codespell] +check-filenames = true +check-hidden = true +# Feel free to un-skip examples, and experimental, you will just need to +# work through many typos (--write-changes and --interactive will help) +skip = "*.csv,*.html,*.json,*.jsonl,*.pdf,*.txt,*.ipynb" + +[tool.llamahub] +contains_example = false +import_path = "llama_index.embeddings.oracleai" + +[tool.llamahub.class_authors] +OracleEmbeddings = "hroyofc" + +[tool.mypy] +disallow_untyped_defs = true +# Remove venv skip when integrated with pre-commit +exclude = ["_static", "build", "examples", "notebooks", "venv"] +ignore_missing_imports = true +python_version = "3.8" + +[tool.poetry] +authors = ["Your Name "] +description = "llama-index embeddings oracleai integration" +license = "MIT" +name = "llama-index-embeddings-oracleai" +packages = [{include = "llama_index/"}] +readme = "README.md" +version = "0.1.0" + +[tool.poetry.dependencies] +python = ">=3.8.1,<4.0" +llama-index-core = ">=0.11.1" +oracledb = ">=2.2" + +[tool.poetry.group.dev.dependencies] +black = {extras = ["jupyter"], version = "<=23.9.1,>=23.7.0"} +codespell = {extras = ["toml"], version = ">=v2.2.6"} +ipython = "8.10.0" +jupyter = "^1.0.0" +mypy = "0.991" +pre-commit = "3.2.0" +pylint = "2.15.10" +pytest = "7.2.1" +pytest-mock = "3.11.1" +ruff = "0.0.292" +tree-sitter-languages = "^1.8.0" +types-Deprecated = ">=0.1.0" +types-PyYAML = "^6.0.12.12" +types-protobuf = "^4.24.0.4" +types-redis = "4.5.5.0" +types-requests = "2.28.11.8" # TODO: unpin when mypy>0.991 +types-setuptools = "67.1.0.0" diff --git a/llama-index-integrations/embeddings/llama-index-embeddings-oracleai/tests/BUILD b/llama-index-integrations/embeddings/llama-index-embeddings-oracleai/tests/BUILD new file mode 100644 index 0000000000000..dabf212d7e716 --- /dev/null +++ 
def test_class():
    """OracleEmbeddings must derive from BaseEmbedding."""
    names_of_base_classes = [b.__name__ for b in OracleEmbeddings.__mro__]
    assert BaseEmbedding.__name__ in names_of_base_classes


# Credentials for the live-database test below. While any of them is empty the
# test returns immediately, so the suite stays green without a database.
uname = ""
passwd = ""
v_dsn = ""


### Test OracleEmbeddings #####
# @pytest.mark.requires("oracledb")
def test_embeddings_test() -> None:
    """Embed one string against a live Oracle database.

    Does nothing unless ``uname``, ``passwd`` and ``v_dsn`` are all set. Once
    they are, connection errors and assertion failures propagate so the test
    can actually fail.
    """
    if not (uname and passwd and v_dsn):
        return

    # Imported here: the module-level import sits under TYPE_CHECKING and is
    # therefore not available at runtime.
    import oracledb

    connection = oracledb.connect(user=uname, password=passwd, dsn=v_dsn)
    try:
        doc = """Hello World!!!"""

        # get oracle embeddings
        embedder_params = {"provider": "database", "model": "demo_model"}
        embedder = OracleEmbeddings(conn=connection, params=embedder_params)
        embedding = embedder._get_text_embedding(doc)

        # verify
        assert len(embedding) != 0
    finally:
        connection.close()
b/llama-index-integrations/readers/llama-index-readers-oracleai/.gitignore new file mode 100644 index 0000000000000..990c18de22908 --- /dev/null +++ b/llama-index-integrations/readers/llama-index-readers-oracleai/.gitignore @@ -0,0 +1,153 @@ +llama_index/_static +.DS_Store +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +bin/ +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +etc/ +include/ +lib/ +lib64/ +parts/ +sdist/ +share/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +.ruff_cache + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints +notebooks/ + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ +pyvenv.cfg + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# Jetbrains +.idea +modules/ +*.swp + +# VsCode +.vscode + +# pipenv +Pipfile +Pipfile.lock + +# pyright +pyrightconfig.json diff --git a/llama-index-integrations/readers/llama-index-readers-oracleai/BUILD b/llama-index-integrations/readers/llama-index-readers-oracleai/BUILD new file mode 100644 index 0000000000000..0896ca890d8bf --- /dev/null +++ b/llama-index-integrations/readers/llama-index-readers-oracleai/BUILD @@ -0,0 +1,3 @@ +poetry_requirements( + name="poetry", +) diff --git a/llama-index-integrations/readers/llama-index-readers-oracleai/Makefile b/llama-index-integrations/readers/llama-index-readers-oracleai/Makefile new file mode 100644 index 0000000000000..b9eab05aa3706 --- /dev/null +++ b/llama-index-integrations/readers/llama-index-readers-oracleai/Makefile @@ -0,0 +1,17 @@ +GIT_ROOT ?= $(shell git rev-parse --show-toplevel) + +help: ## Show all Makefile targets. + @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[33m%-30s\033[0m %s\n", $$1, $$2}' + +format: ## Run code autoformatters (black). + pre-commit install + git ls-files | xargs pre-commit run black --files + +lint: ## Run linters: pre-commit (black, ruff, codespell) and mypy + pre-commit install && git ls-files | xargs pre-commit run --show-diff-on-failure --files + +test: ## Run tests via pytest. + pytest tests + +watch-docs: ## Build and watch documentation. 
+ sphinx-autobuild docs/ docs/_build/html --open-browser --watch $(GIT_ROOT)/llama_index/ diff --git a/llama-index-integrations/readers/llama-index-readers-oracleai/README.md b/llama-index-integrations/readers/llama-index-readers-oracleai/README.md new file mode 100644 index 0000000000000..31ef1850adfdf --- /dev/null +++ b/llama-index-integrations/readers/llama-index-readers-oracleai/README.md @@ -0,0 +1,38 @@ +# LlamaIndex Readers Integration: Oracleai + +There are two classes here: + +- OracleReader: This API loads document(s) from a file, a directory, or an Oracle Database table. +- OracleTextSplitter: This API splits a document into chunks, with many customization options. + +`pip install llama-index-readers-oracleai` + +# A sample example + +```python +# get the Oracle connection +conn = oracledb.connect( + user="", + password="", + dsn="", +) +print("Oracle connection is established...") + +# params +loader_params = {"owner": "ut", "tablename": "demo_tab", "colname": "data"} +splitter_params = {"by": "words", "max": "100"} + +# instances +loader = OracleReader(conn=conn, params=loader_params) +splitter = OracleTextSplitter(conn=conn, params=splitter_params) + +print("Processing the documents...") +docs = loader.load() +for id, doc in enumerate(docs, start=1): + print(f"Document#{id}, Metadata: {doc.metadata}") + chunks = splitter.split_text(doc.text) + print(f"Document#{id}, Num of Chunk: {len(chunks)}\n") + +conn.close() +print("Connection is closed.") +``` diff --git a/llama-index-integrations/readers/llama-index-readers-oracleai/llama_index/readers/oracleai/BUILD b/llama-index-integrations/readers/llama-index-readers-oracleai/llama_index/readers/oracleai/BUILD new file mode 100644 index 0000000000000..db46e8d6c978c --- /dev/null +++ b/llama-index-integrations/readers/llama-index-readers-oracleai/llama_index/readers/oracleai/BUILD @@ -0,0 +1 @@ +python_sources() diff --git
class ParseOracleDocMetadata(HTMLParser):
    """Extract document metadata from HTML.

    Collects every ``<meta name="..." content="...">`` pair and the text of the
    ``<title>`` element into a dict. Feed the HTML with ``feed()`` and read the
    result with ``get_metadata()``.
    """

    def __init__(self) -> None:
        # HTMLParser.__init__ already calls reset(), so no explicit reset here.
        super().__init__()
        # True while the parser is between <title> and </title>.
        self.match = False
        self.metadata: Dict[str, Any] = {}

    def handle_starttag(self, tag, attrs):
        """Record ``<meta>`` name/content pairs and note the start of a title."""
        if tag == "title":
            self.match = True
            return
        if tag != "meta":
            return

        # Look the attributes up by name so that their order inside the tag
        # does not matter (content="..." may come before name="...").
        attr_map = dict(attrs)
        entry = attr_map.get("name")
        if entry and "content" in attr_map:
            self.metadata[entry] = attr_map["content"]

    def handle_endtag(self, tag):
        """Stop title capture at ``</title>`` even when the title was empty."""
        if tag == "title":
            self.match = False

    def handle_data(self, data):
        """Store the first text chunk that follows ``<title>`` as the title."""
        if self.match:
            self.metadata["title"] = data
            self.match = False

    def get_metadata(self):
        """Return the metadata collected so far."""
        return self.metadata
class OracleDocReader:
    """Read a single file and convert it to text through Oracle Database."""

    @staticmethod
    def generate_object_id(input_string=None):
        """Build a 32-character hexadecimal object id.

        The id is 16 bytes rendered as hex: a 4-byte big-endian timestamp, the
        first 8 bytes of the SHA-256 of ``input_string``, and 4 random bytes.
        It therefore differs between calls even for the same input.

        Args:
            input_string: Text to hash; a random 16-character alphanumeric
                string is used when ``None``.

        Returns:
            The id as a 32-character lowercase hex string.
        """
        out_length = 32  # output length
        hash_len = 8  # hash value length

        if input_string is None:
            input_string = "".join(
                random.choices(
                    "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789",
                    k=16,
                )
            )

        # timestamp
        timestamp = int(time.time())
        timestamp_bin = struct.pack(">I", timestamp)  # 4 bytes

        # hash_value
        hashval_bin = hashlib.sha256(input_string.encode()).digest()
        hashval_bin = hashval_bin[:hash_len]  # 8 bytes

        # counter
        counter_bin = struct.pack(">I", random.getrandbits(32))  # 4 bytes

        # binary object id
        object_id = timestamp_bin + hashval_bin + counter_bin  # 16 bytes
        object_id_hex = object_id.hex().zfill(out_length)

        return object_id_hex[:out_length]

    @staticmethod
    def read_file(conn: Connection, file_path: str, params: dict) -> Document:
        """Convert one file to a ``Document`` with ``dbms_vector_chain.utl_to_text``.

        The file is read as bytes and sent to the database twice in one
        PL/SQL block: once with ``params`` as the JSON preference (its output
        is parsed for metadata) and once without (its output is the text).

        Args:
            conn: Oracle connection.
            file_path: Path of the local file to read.
            params: Preferences passed as JSON to the metadata conversion.

        Returns:
            The text and metadata as a ``Document``; ``None`` when reading or
            converting the file fails (the error is printed and the file is
            skipped).

        Raises:
            ImportError: If ``oracledb`` is not installed.
        """
        metadata = {}

        try:
            import oracledb
        except ImportError as e:
            raise ImportError(
                "Unable to import oracledb, please install with "
                "`pip install -U oracledb`."
            ) from e

        cursor = None
        try:
            # return strings or bytes instead of a LOB locator
            oracledb.defaults.fetch_lobs = False
            cursor = conn.cursor()

            with open(file_path, "rb") as f:
                data = f.read()

            mdata = cursor.var(oracledb.DB_TYPE_CLOB)
            text = cursor.var(oracledb.DB_TYPE_CLOB)
            cursor.execute(
                """
                declare
                    input blob;
                begin
                    input := :blob;
                    :mdata := dbms_vector_chain.utl_to_text(input, json(:pref));
                    :text := dbms_vector_chain.utl_to_text(input);
                end;""",
                blob=data,
                pref=json.dumps(params),
                mdata=mdata,
                text=text,
            )

            # Check the values held by the bind variables, not the variable
            # objects themselves, so a NULL result is not turned into "None".
            mdata_value = mdata.getvalue()
            text_value = text.getvalue()

            if mdata_value is not None:
                doc_data = str(mdata_value)
                # NOTE(review): the prefix here is an empty string, so this
                # check always passes -- confirm the intended HTML prefixes.
                if doc_data.startswith(("")):
                    p = ParseOracleDocMetadata()
                    p.feed(doc_data)
                    metadata = p.get_metadata()

            doc_id = OracleDocReader.generate_object_id(conn.username + "$" + file_path)
            metadata["_oid"] = doc_id
            metadata["_file"] = file_path

            if text_value is None:
                return Document(text="", metadata=metadata)
            return Document(text=str(text_value), metadata=metadata)

        except Exception as ex:
            print(f"An exception occurred :: {ex}")
            print(f"Skip processing {file_path}")
            return None
        finally:
            # Close exactly once, and only if the cursor was opened.
            if cursor is not None:
                cursor.close()
class OracleReader(BaseReader):
    """Load documents from a file, a directory, or an Oracle table column.

    Args:
        conn: Oracle connection used for every database call.
        params: Loader parameters. ``load`` reads the keys ``file``, ``dir``,
            ``owner``, ``tablename``, ``colname`` and ``mdata_cols``.
    """

    def __init__(self, conn: Connection, params: Dict[str, Any]):
        self.conn = conn
        # JSON round trip: keeps a detached, JSON-compatible copy of params.
        self.params = json.loads(json.dumps(params))

    def load(self) -> List[Document]:
        """Load data into ``Document`` objects.

        Sources are processed in this order, each only if its parameter is
        set: the single ``file``, every regular file in ``dir``, then the
        rows of ``owner.tablename``.

        Returns:
            The loaded documents. If the single ``file`` cannot be read, the
            (empty) list is returned immediately; unreadable files inside
            ``dir`` are skipped and counted.

        Raises:
            ImportError: If ``oracledb`` is not installed.
            Exception: On missing parameters or any database error; the error
                is printed with a traceback and re-raised.
        """
        try:
            import oracledb
        except ImportError as e:
            raise ImportError(
                "Unable to import oracledb, please install with "
                "`pip install -U oracledb`."
            ) from e

        results = []
        m_params = {"plaintext": "false"}

        try:
            # extract the parameters
            if self.params is None:
                raise Exception("Missing loader parameters")

            self.file = self.params.get("file")
            self.dir = self.params.get("dir")
            self.owner = self.params.get("owner")
            self.tablename = self.params.get("tablename")
            self.colname = self.params.get("colname")

            # return strings or bytes instead of a LOB locator
            oracledb.defaults.fetch_lobs = False

            if self.file:
                doc = OracleDocReader.read_file(self.conn, self.file, m_params)

                if doc is None:
                    return results

                results.append(doc)

            if self.dir:
                skip_count = 0
                for file_name in os.listdir(self.dir):
                    file_path = os.path.join(self.dir, file_name)
                    if not os.path.isfile(file_path):
                        continue

                    doc = OracleDocReader.read_file(self.conn, file_path, m_params)
                    if doc is None:
                        skip_count = skip_count + 1
                        print(f"Total skipped: {skip_count}\n")
                    else:
                        results.append(doc)

            if self.tablename:
                results.extend(self._load_table(m_params))

            return results
        except Exception as ex:
            print(f"An exception occurred :: {ex}")
            traceback.print_exc()
            raise

    def _load_table(self, m_params: Dict[str, Any]) -> List[Document]:
        """Convert ``colname`` of every row in ``owner.tablename`` to a Document."""
        if self.owner is None or self.colname is None:
            raise Exception("Missing owner or column name")

        self.mdata_cols = self.params.get("mdata_cols")
        mdata_cols = self.mdata_cols if self.mdata_cols is not None else []
        if len(mdata_cols) > 3:
            raise Exception(
                "Exceeds the max number of columns you can request for metadata."
            )

        supported_types = (
            "NUMBER",
            "BINARY_DOUBLE",
            "BINARY_FLOAT",
            "LONG",
            "DATE",
            "TIMESTAMP",
            "VARCHAR2",
        )

        documents = []
        cursor = self.conn.cursor()
        try:
            if mdata_cols:
                # Owner and table name are values in this query, so they are
                # passed as bind variables instead of being concatenated.
                cursor.execute(
                    "select column_name, data_type from all_tab_columns "
                    "where owner = :owner and table_name = :tablename",
                    owner=self.owner.upper(),
                    tablename=self.tablename.upper(),
                )
                for column_name, data_type in cursor.fetchall():
                    if column_name in mdata_cols and data_type not in supported_types:
                        raise Exception(
                            "The datatype for the column requested for metadata is not supported."
                        )

            self.mdata_cols_sql = ", rowid" + "".join(", " + col for col in mdata_cols)

            # Identifiers (column, owner, table) cannot be bind variables and
            # are still interpolated -- callers must pass trusted names only.
            sql = (
                "select dbms_vector_chain.utl_to_text(t."
                + self.colname
                + ", json(:params)) mdata, dbms_vector_chain.utl_to_text(t."
                + self.colname
                + ") text"
                + self.mdata_cols_sql
                + " from "
                + self.owner
                + "."
                + self.tablename
                + " t"
            )

            cursor.execute(sql, params=json.dumps(m_params))
            for row in cursor:
                metadata = {}

                if row[0] is not None:
                    data = str(row[0])
                    # NOTE(review): the prefix here is an empty string, so this
                    # check always passes -- confirm the intended HTML prefixes.
                    if data.startswith(("")):
                        p = ParseOracleDocMetadata()
                        p.feed(data)
                        metadata = p.get_metadata()

                doc_id = OracleDocReader.generate_object_id(
                    self.conn.username
                    + "$"
                    + self.owner
                    + "$"
                    + self.tablename
                    + "$"
                    + self.colname
                    + "$"
                    + str(row[2])
                )
                metadata["_oid"] = doc_id
                metadata["_rowid"] = row[2]

                # Select list layout: mdata (0), text (1), rowid (2), then the
                # projected metadata columns in request order from index 3.
                for position, col in enumerate(mdata_cols, start=3):
                    metadata[col] = row[position]

                if row[1] is None:
                    documents.append(Document(text="", metadata=metadata))
                else:
                    documents.append(Document(text=str(row[1]), metadata=metadata))

            return documents
        finally:
            # Release the cursor on success and on failure alike.
            cursor.close()

    def load_data(self) -> List[Document]:
        """Alias of ``load``."""
        return self.load()


logger = logging.getLogger(__name__)
class OracleTextSplitter:
    """Split text into chunks with ``dbms_vector_chain.utl_to_chunks``.

    Args:
        conn: Oracle connection used for every split.
        params: Chunking parameters, JSON-encoded and passed to the database
            on each call.

    Raises:
        ImportError: If ``oracledb`` is not installed.
    """

    def __init__(self, conn: Connection, params: Dict[str, Any]):
        self.conn = conn
        self.params = params

        try:
            import oracledb
        except ImportError as e:
            raise ImportError(
                "Unable to import oracledb, please install with "
                "`pip install -U oracledb`."
            ) from e

        self._oracledb = oracledb
        self._json = json

    def split_text(self, text: str) -> List[str]:
        """Split incoming text and return the chunks in database order.

        Raises:
            Exception: Any database or decoding error; it is printed with a
                traceback and re-raised.
        """
        cursor = None
        try:
            cursor = self.conn.cursor()
            # returns strings or bytes instead of a locator
            self._oracledb.defaults.fetch_lobs = False

            cursor.setinputsizes(content=self._oracledb.CLOB)
            cursor.execute(
                "select t.* from dbms_vector_chain.utl_to_chunks(:content, json(:params)) t",
                content=text,
                params=self._json.dumps(self.params),
            )

            # fetchone() returns None once the result set is exhausted.
            return [
                self._json.loads(row[0])["chunk_data"]
                for row in iter(cursor.fetchone, None)
            ]

        except Exception as ex:
            print(f"An exception occurred :: {ex}")
            traceback.print_exc()
            raise
        finally:
            # The cursor was previously left open after every call.
            if cursor is not None:
                cursor.close()
pre-commit +exclude = ["_static", "build", "examples", "notebooks", "venv"] +ignore_missing_imports = true +python_version = "3.8" + +[tool.poetry] +authors = ["Your Name "] +description = "llama-index readers oracleai integration" +license = "MIT" +name = "llama-index-readers-oracleai" +packages = [{include = "llama_index/"}] +readme = "README.md" +version = "0.1.0" + +[tool.poetry.dependencies] +python = ">=3.8.1,<4.0" +llama-index-core = ">=0.11.1" +oracledb = ">=2.2" + +[tool.poetry.group.dev.dependencies] +black = {extras = ["jupyter"], version = "<=23.9.1,>=23.7.0"} +codespell = {extras = ["toml"], version = ">=v2.2.6"} +ipython = "8.10.0" +jupyter = "^1.0.0" +mypy = "0.991" +pre-commit = "3.2.0" +pylint = "2.15.10" +pytest = "7.2.1" +pytest-mock = "3.11.1" +ruff = "0.0.292" +tree-sitter-languages = "^1.8.0" +types-Deprecated = ">=0.1.0" +types-PyYAML = "^6.0.12.12" +types-protobuf = "^4.24.0.4" +types-redis = "4.5.5.0" +types-requests = "2.28.11.8" # TODO: unpin when mypy>0.991 +types-setuptools = "67.1.0.0" diff --git a/llama-index-integrations/readers/llama-index-readers-oracleai/tests/BUILD b/llama-index-integrations/readers/llama-index-readers-oracleai/tests/BUILD new file mode 100644 index 0000000000000..dabf212d7e716 --- /dev/null +++ b/llama-index-integrations/readers/llama-index-readers-oracleai/tests/BUILD @@ -0,0 +1 @@ +python_tests() diff --git a/llama-index-integrations/readers/llama-index-readers-oracleai/tests/__init__.py b/llama-index-integrations/readers/llama-index-readers-oracleai/tests/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/llama-index-integrations/readers/llama-index-readers-oracleai/tests/test_readers_oracleai.py b/llama-index-integrations/readers/llama-index-readers-oracleai/tests/test_readers_oracleai.py new file mode 100644 index 0000000000000..a45662fc85fce --- /dev/null +++ b/llama-index-integrations/readers/llama-index-readers-oracleai/tests/test_readers_oracleai.py @@ -0,0 +1,142 @@ +from 
def test_loader_test() -> None:
    """Load rows from a scratch table through ``OracleReader``.

    This is an integration test. It needs the ``oracledb`` driver and a
    database described by the module-level ``uname``, ``passwd`` and ``v_dsn``
    settings; when either is missing the test returns early. Once a database
    is configured, connection errors and failed assertions propagate to the
    test runner instead of being silently discarded.
    """
    try:
        # The module-level import sits under TYPE_CHECKING, so the driver has
        # to be imported here for the name to exist at runtime.
        import oracledb
    except ImportError:
        return

    if not (uname and passwd and v_dsn):
        # No database configured for this run.
        return

    connection = oracledb.connect(user=uname, password=passwd, dsn=v_dsn)
    try:
        with connection.cursor() as cursor:
            cursor.execute("drop table if exists llama_demo")
            cursor.execute("create table llama_demo(id number, text varchar2(25))")

            rows = [
                (1, "First"),
                (2, "Second"),
                (3, "Third"),
                (4, "Fourth"),
                (5, "Fifth"),
                (6, "Sixth"),
                (7, "Seventh"),
            ]

            cursor.executemany(
                "insert into llama_demo(id, text) values (:1, :2)", rows
            )
            connection.commit()

        # load from database column
        loader_params = {
            "owner": uname,
            "tablename": "llama_demo",
            "colname": "text",
        }
        loader = OracleReader(conn=connection, params=loader_params)
        docs = loader.load()

        # verify
        assert len(docs) != 0
    finally:
        # Always release the connection, even when an assertion fails.
        connection.close()
splitter.split_text(doc) + + # verify + assert len(chunks) != 0 + # print(f"1. Number of chunks: {len(chunks)}") + + # by chars , max = 4000 + splitter_params = { + "by": "chars", + "max": "4000", + "overlap": "800", + "split": "NEWLINE", + "normalize": "all", + } + splitter = OracleTextSplitter(conn=connection, params=splitter_params) + chunks = splitter.split_text(doc) + + # verify + assert len(chunks) != 0 + # print(f"2. Number of chunks: {len(chunks)}") + + # by words , max = 10 + splitter_params = { + "by": "words", + "max": "10", + "overlap": "2", + "split": "SENTENCE", + } + splitter = OracleTextSplitter(conn=connection, params=splitter_params) + chunks = splitter.split_text(doc) + + # verify + assert len(chunks) != 0 + # print(f"3. Number of chunks: {len(chunks)}") + + # by chars , max = 50 + splitter_params = { + "by": "chars", + "max": "50", + "overlap": "10", + "split": "SPACE", + "normalize": "all", + } + splitter = OracleTextSplitter(conn=connection, params=splitter_params) + chunks = splitter.split_text(doc) + + # verify + assert len(chunks) != 0 + # print(f"4. 
Number of chunks: {len(chunks)}") + + connection.close() + except Exception: + pass + + +# test loader and splitter +# test_loader_test() +# test_splitter_test() diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-oracledb/.gitignore b/llama-index-integrations/vector_stores/llama-index-vector-stores-oracledb/.gitignore new file mode 100644 index 0000000000000..990c18de22908 --- /dev/null +++ b/llama-index-integrations/vector_stores/llama-index-vector-stores-oracledb/.gitignore @@ -0,0 +1,153 @@ +llama_index/_static +.DS_Store +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +bin/ +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +etc/ +include/ +lib/ +lib64/ +parts/ +sdist/ +share/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +.ruff_cache + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints +notebooks/ + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
+# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ +pyvenv.cfg + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# Jetbrains +.idea +modules/ +*.swp + +# VsCode +.vscode + +# pipenv +Pipfile +Pipfile.lock + +# pyright +pyrightconfig.json diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-oracledb/BUILD b/llama-index-integrations/vector_stores/llama-index-vector-stores-oracledb/BUILD new file mode 100644 index 0000000000000..0896ca890d8bf --- /dev/null +++ b/llama-index-integrations/vector_stores/llama-index-vector-stores-oracledb/BUILD @@ -0,0 +1,3 @@ +poetry_requirements( + name="poetry", +) diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-oracledb/Makefile b/llama-index-integrations/vector_stores/llama-index-vector-stores-oracledb/Makefile new file mode 100644 index 0000000000000..b9eab05aa3706 --- /dev/null +++ b/llama-index-integrations/vector_stores/llama-index-vector-stores-oracledb/Makefile @@ -0,0 +1,17 @@ +GIT_ROOT ?= $(shell git rev-parse --show-toplevel) + +help: ## Show all Makefile targets. + @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[33m%-30s\033[0m %s\n", $$1, $$2}' + +format: ## Run code autoformatters (black). 
# Import every name listed in ``__all__`` so that
# ``from llama_index.vector_stores.oracledb import DistanceStrategy`` and
# star-imports resolve instead of failing on a missing attribute.
from llama_index.vector_stores.oracledb.base import DistanceStrategy, OraLlamaVS

__all__ = ["OraLlamaVS", "DistanceStrategy"]
class DistanceStrategy(Enum):
    """Distance metrics that a vector store can be configured with.

    ``_get_distance_function`` translates each member into the metric keyword
    that is placed in the generated SQL; a value missing from that mapping is
    rejected there with ``ValueError``.
    """

    COSINE = 1
    DOT_PRODUCT = 2
    EUCLIDEAN_DISTANCE = 3
    MANHATTAN_DISTANCE = 4
    HAMMING_DISTANCE = 5
    EUCLIDEAN_SQUARED = 6
+ raise RuntimeError(f"Failed due to a DB issue: {db_err}") from db_err + except ValueError as val_err: + # Handle another known type of error specifically + logger.exception("Validation error.") + raise ValueError(f"Validation failed: {val_err}") from val_err + except Exception as e: + # Generic handler for all other exceptions + logger.exception(f"An unexpected error occurred: {e}") + raise RuntimeError(f"Unexpected error: {e}") from e + + return cast(T, wrapper) + + +def _escape_str(value: str) -> str: + BS = "\\" + must_escape = (BS, "'") + return ( + "".join(f"{BS}{c}" if c in must_escape else c for c in value) if value else "" + ) + + +column_config: Dict = { + "id": {"type": "VARCHAR2(64) PRIMARY KEY", "extract_func": lambda x: x.node_id}, + "doc_id": {"type": "VARCHAR2(64)", "extract_func": lambda x: x.ref_doc_id}, + "text": { + "type": "CLOB", + "extract_func": lambda x: _escape_str( + x.get_content(metadata_mode=MetadataMode.NONE) or "" + ), + }, + "node_info": { + # Now specifying the column as CLOB intended for JSON, with a check constraint + "type": "JSON", + "extract_func": lambda x: json.dumps(x.node_info), + }, + "metadata": { + # Also specified as CLOB intended for JSON, with a check constraint + "type": "JSON", + "extract_func": lambda x: json.dumps(x.metadata), + }, + "embedding": { + "type": "VECTOR", + "extract_func": lambda x: _stringify_list(x.get_embedding()), + }, +} + + +def _stringify_list(lst: List) -> str: + return "[" + ",".join(str(item) for item in lst) + "]" + + +def _table_exists(connection: Connection, table_name: str) -> bool: + try: + import oracledb + except ImportError as e: + raise ImportError( + "Unable to import oracledb, please install with " + "`pip install -U oracledb`." 
+ ) from e + try: + with connection.cursor() as cursor: + cursor.execute(f"SELECT COUNT(*) FROM {table_name}") + return True + except oracledb.DatabaseError as ex: + err_obj = ex.args + if err_obj[0].code == 942: + return False + raise + + +@_handle_exceptions +def _index_exists(connection: Connection, index_name: str) -> bool: + # Check if the index exists + query = ( + "SELECT index_name FROM all_indexes WHERE upper(index_name) = upper(:idx_name)" + ) + + with connection.cursor() as cursor: + # Execute the query + cursor.execute(query, idx_name=index_name.upper()) + result = cursor.fetchone() + + # Check if the index exists + return result is not None + + +def _get_distance_function(distance_strategy: DistanceStrategy) -> str: + # Dictionary to map distance strategies to their corresponding function names + distance_strategy2function = { + DistanceStrategy.EUCLIDEAN_DISTANCE: "EUCLIDEAN", + DistanceStrategy.DOT_PRODUCT: "DOT", + DistanceStrategy.COSINE: "COSINE", + DistanceStrategy.MANHATTAN_DISTANCE: "MANHATTAN", + DistanceStrategy.HAMMING_DISTANCE: "HAMMING", + DistanceStrategy.EUCLIDEAN_SQUARED: "EUCLIDEAN_SQUARED", + } + + # Attempt to return the corresponding distance function + if distance_strategy in distance_strategy2function: + return distance_strategy2function[distance_strategy] + + # If it's an unsupported distance strategy, raise an error + raise ValueError(f"Unsupported distance strategy: {distance_strategy}") + + +def _get_index_name(base_name: str) -> str: + unique_id = str(uuid.uuid4()).replace("-", "") + return f"{base_name}_{unique_id}" + + +@_handle_exceptions +def _create_table(connection: Connection, table_name: str) -> None: + if not _table_exists(connection, table_name): + with connection.cursor() as cursor: + column_definitions = ", ".join( + [f'{k} {v["type"]}' for k, v in column_config.items()] + ) + + # Generate the final DDL statement + ddl = f"CREATE TABLE {table_name} (\n {column_definitions}\n)" + + cursor.execute(ddl) + 
@_handle_exceptions
def create_index(
    connection: Connection,
    vector_store: OraLlamaVS,
    params: Optional[dict[str, Any]] = None,
) -> None:
    """Create a vector index on the table behind ``vector_store``.

    Args:
        connection: Open database connection used to run the DDL.
        vector_store: Store whose ``table_name`` and ``distance_strategy``
            describe the index target.
        params: Optional index settings. ``idx_type`` selects the builder
            (``"HNSW"`` or ``"IVF"``, case-insensitive) and defaults to
            ``"HNSW"`` when omitted; the whole dict is handed to the builder.
            With no ``params`` an HNSW index is created from the builder's
            defaults.

    Raises:
        ValueError: If ``idx_type`` names an unsupported index type.
    """
    builders = {"HNSW": _create_hnsw_index, "IVF": _create_ivf_index}

    if params:
        # Copy so the caller's dict is left untouched, and make ``idx_type``
        # explicit because the builders interpolate it into the DDL.
        params = dict(params)
        idx_type = str(params.get("idx_type", "HNSW")).upper()
        params["idx_type"] = idx_type
    else:
        idx_type = "HNSW"

    builder = builders.get(idx_type)
    if builder is None:
        # Previously an unknown type was ignored and no index was created.
        raise ValueError(
            f"Unsupported idx_type: {idx_type}. Expected one of {sorted(builders)}"
        )

    builder(
        connection,
        vector_store.table_name,
        vector_store.distance_strategy,
        params,
    )
{_get_distance_function(distance_strategy)}" + + parameters_part = "" + if "neighbors" in config and "efConstruction" in config: + parameters_part = " parameters (type {idx_type}, neighbors {neighbors}, efConstruction {efConstruction})" + elif "neighbors" in config and "efConstruction" not in config: + config["efConstruction"] = defaults["efConstruction"] + parameters_part = " parameters (type {idx_type}, neighbors {neighbors}, efConstruction {efConstruction})" + elif "neighbors" not in config and "efConstruction" in config: + config["neighbors"] = defaults["neighbors"] + parameters_part = " parameters (type {idx_type}, neighbors {neighbors}, efConstruction {efConstruction})" + + # Always included part for parallel + parallel_part = " parallel {parallel}" + + # Combine all parts + ddl_assembly = ( + base_sql + accuracy_part + distance_part + parameters_part + parallel_part + ) + # Format the SQL with values from the params dictionary + ddl = ddl_assembly.format(**config) + + # Check if the index exists + if not _index_exists(connection, config["idx_name"]): + with connection.cursor() as cursor: + cursor.execute(ddl) + logger.info("Index created successfully...") + else: + logger.info("Index already exists...") + + +@_handle_exceptions +def _create_ivf_index( + connection: Connection, + table_name: str, + distance_strategy: DistanceStrategy, + params: Optional[dict[str, Any]] = None, +) -> None: + # Default configuration + defaults = { + "idx_name": "IVF", + "idx_type": "IVF", + "neighbor_part": 32, + "accuracy": 90, + "parallel": 8, + } + + config = _create_config(defaults, params) + + # Base SQL statement + idx_name = config["idx_name"] + base_sql = f"CREATE VECTOR INDEX {idx_name} ON {table_name}(embedding) ORGANIZATION NEIGHBOR PARTITIONS" + + # Optional parts depending on parameters + accuracy_part = " WITH TARGET ACCURACY {accuracy}" if "accuracy" in config else "" + distance_part = f" DISTANCE {_get_distance_function(distance_strategy)}" + + parameters_part = 
"" + if "idx_type" in config and "neighbor_part" in config: + parameters_part = f" PARAMETERS (type {config['idx_type']}, neighbor partitions {config['neighbor_part']})" + + # Always included part for parallel + parallel_part = f" PARALLEL {config['parallel']}" + + # Combine all parts + ddl_assembly = ( + base_sql + accuracy_part + distance_part + parameters_part + parallel_part + ) + # Format the SQL with values from the params dictionary + ddl = ddl_assembly.format(**config) + + # Check if the index exists + if not _index_exists(connection, config["idx_name"]): + with connection.cursor() as cursor: + cursor.execute(ddl) + logger.info("Index created successfully...") + else: + logger.info("Index already exists...") + + +@_handle_exceptions +def drop_table_purge(connection: Connection, table_name: str) -> None: + if _table_exists(connection, table_name): + cursor = connection.cursor() + with cursor: + ddl = f"DROP TABLE {table_name} PURGE" + cursor.execute(ddl) + logger.info("Table dropped successfully...") + else: + logger.info("Table not found...") + + +@_handle_exceptions +def drop_index_if_exists(connection: Connection, index_name: str): + if _index_exists(connection, index_name): + drop_query = f"DROP INDEX {index_name}" + with connection.cursor() as cursor: + cursor.execute(drop_query) + logger.info(f"Index {index_name} has been dropped.") + else: + logger.exception(f"Index {index_name} does not exist.") + + +class OraLlamaVS(BasePydanticVectorStore): + """`OraLlamaVS` vector store. + + To use, you should have both: + - the ``oracledb`` python package installed + - a connection string associated with a OracleVS having deployed an + Search index + + Example: + .. 
code-block:: python + + from llama-index.core.vectorstores import OracleVS + from oracledb import oracledb + + with oracledb.connect(user = user, passwd = pwd, dsn = dsn) as connection: + print ("Database version:", connection.version) + """ + + AMPLIFY_RATIO_LE5: ClassVar[int] = 100 + AMPLIFY_RATIO_GT5: ClassVar[int] = 20 + AMPLIFY_RATIO_GT50: ClassVar[int] = 10 + metadata_column: str = "metadata" + stores_text: bool = True + _client: Connection = PrivateAttr() + table_name: str + distance_strategy: DistanceStrategy + batch_size: Optional[int] + params: Optional[dict[str, Any]] + + def __init__( + self, + _client: Connection, + table_name: str, + distance_strategy: DistanceStrategy = DistanceStrategy.EUCLIDEAN_DISTANCE, + batch_size: Optional[int] = 32, + params: Optional[dict[str, Any]] = None, + ): + try: + import oracledb + except ImportError as e: + raise ImportError( + "Unable to import oracledb, please install with " + "`pip install -U oracledb`." + ) from e + + try: + """Initialize with necessary components.""" + super().__init__( + table_name=table_name, + distance_strategy=distance_strategy, + batch_size=batch_size, + params=params, + ) + # Assign _client to PrivateAttr after the Pydantic initialization + object.__setattr__(self, "_client", _client) + _create_table(_client, table_name) + + except oracledb.DatabaseError as db_err: + logger.exception(f"Database error occurred while create table: {db_err}") + raise RuntimeError( + "Failed to create table due to a database error." + ) from db_err + except ValueError as val_err: + logger.exception(f"Validation error: {val_err}") + raise RuntimeError( + "Failed to create table due to a validation error." + ) from val_err + except Exception as ex: + logger.exception("An unexpected error occurred while creating the index.") + raise RuntimeError( + "Failed to create table due to an unexpected error." 
+ ) from ex + + @property + def client(self) -> Any: + """Get client.""" + return self._client + + @classmethod + def class_name(cls) -> str: + return "OraLlamaVS" + + def _append_meta_filter_condition( + self, where_str: Optional[str], exact_match_filter: list + ) -> str: + filter_str = " AND ".join( + f"JSON_VALUE({self.metadata_column}, '$.{filter_item.key}') = '{filter_item.value}'" + for filter_item in exact_match_filter + ) + if where_str is None: + where_str = filter_str + else: + where_str += " AND " + filter_str + return where_str + + def _build_insert(self, values: List[BaseNode]) -> (str, List[tuple]): + _data = [] + for item in values: + item_values = tuple( + column["extract_func"](item) for column in column_config.values() + ) + _data.append(item_values) + + dml = f""" + INSERT INTO {self.table_name} ({", ".join(column_config.keys())}) + VALUES ({", ".join([':' + str(i + 1) for i in range(len(column_config))])}) + """ + return dml, _data + + def _build_query( + self, distance_function: str, k: int, where_str: Optional[str] = None + ) -> str: + where_clause = f"WHERE {where_str}" if where_str else "" + + return f""" + SELECT id, + doc_id, + text, + node_info, + metadata, + vector_distance(embedding, :embedding, {distance_function}) AS distance + FROM {self.table_name} + {where_clause} + ORDER BY distance + FETCH APPROX FIRST {k} ROWS ONLY + """ + + def _build_hybrid_query( + self, sub_query: str, query_str: str, similarity_top_k: int + ) -> str: + terms_pattern = [f"(?i){x}" for x in query_str.split(" ")] + column_keys = column_config.keys() + return ( + f"SELECT {','.join(filter(lambda k: k != 'embedding', column_keys))}, " + f"distance FROM ({sub_query}) temp_table " + f"ORDER BY length(multiMatchAllIndices(text, {terms_pattern})) " + f"AS distance1 DESC, " + f"log(1 + countMatches(text, '(?i)({query_str.replace(' ', '|')})')) " + f"AS distance2 DESC limit {similarity_top_k}" + ) + + @_handle_exceptions + def add(self, nodes: list[BaseNode], 
**kwargs: Any) -> list[str]: + if not nodes: + return [] + + for result_batch in iter_batch(nodes, self.batch_size): + dml, bind_values = self._build_insert(values=result_batch) + + with self._client.cursor() as cursor: + # Use executemany to insert the batch + cursor.executemany(dml, bind_values) + self._client.commit() + + return [node.node_id for node in nodes] + + @_handle_exceptions + def delete(self, doc_id: str, **kwargs: Any) -> None: + with self._client.cursor() as cursor: + ddl = f"DELETE FROM {self.table_name} WHERE id = :doc_id" + cursor.execute(ddl, [doc_id]) + self._client.commit() + + @_handle_exceptions + def _get_clob_value(self, result: Any) -> str: + try: + import oracledb + except ImportError as e: + raise ImportError( + "Unable to import oracledb, please install with " + "`pip install -U oracledb`." + ) from e + + clob_value = "" + if result: + if isinstance(result, oracledb.LOB): + raw_data = result.read() + if isinstance(raw_data, bytes): + clob_value = raw_data.decode( + "utf-8" + ) # Specify the correct encoding + else: + clob_value = raw_data + elif isinstance(result, str): + clob_value = result + else: + raise Exception("Unexpected type:", type(result)) + return clob_value + + @_handle_exceptions + def drop(self) -> None: + drop_table_purge(self._client, self.table_name) + + @_handle_exceptions + def query(self, query: VectorStoreQuery, **kwargs: Any) -> VectorStoreQueryResult: + distance_function = _get_distance_function(self.distance_strategy) + where_str = ( + f"doc_id in {_stringify_list(query.doc_ids)}" if query.doc_ids else None + ) + + if query.filters is not None: + where_str = self._append_meta_filter_condition( + where_str, query.filters.filters + ) + + # build query sql + query_sql = self._build_query( + distance_function, query.similarity_top_k, where_str + ) + """ + if query.mode == VectorStoreQueryMode.HYBRID and query.query_str is not None: + amplify_ratio = self.AMPLIFY_RATIO_LE5 + if 5 < query.similarity_top_k < 50: + 
amplify_ratio = self.AMPLIFY_RATIO_GT5 + if query.similarity_top_k > 50: + amplify_ratio = self.AMPLIFY_RATIO_GT50 + query_sql = self._build_hybrid_query( + self._build_query( + query_embed=query.query_embedding, + k=query.similarity_top_k, + where_str=where_str, + limit=query.similarity_top_k * amplify_ratio, + ), + query.query_str, + query.similarity_top_k, + ) + logger.debug(f"hybrid query_statement={query_statement}") + """ + embedding = array.array("f", query.query_embedding) + with self._client.cursor() as cursor: + cursor.execute(query_sql, embedding=embedding) + results = cursor.fetchall() + + similarities = [] + ids = [] + nodes = [] + for result in results: + doc_id = result[1] + text = self._get_clob_value(result[2]) + node_info = ( + json.loads(result[3]) if isinstance(result[3], str) else result[3] + ) + metadata = ( + json.loads(result[4]) if isinstance(result[4], str) else result[4] + ) + + if query.node_ids: + if result[0] not in query.node_ids: + continue + + if isinstance(node_info, dict): + start_char_idx = node_info.get("start", None) + end_char_idx = node_info.get("end", None) + try: + node = metadata_dict_to_node(metadata) + node.set_content(text) + except Exception: + # Note: deprecated legacy logic for backward compatibility + + node = TextNode( + id_=result[0], + text=text, + metadata=metadata, + start_char_idx=start_char_idx, + end_char_idx=end_char_idx, + relationships={ + NodeRelationship.SOURCE: RelatedNodeInfo(node_id=doc_id) + }, + ) + + nodes.append(node) + similarities.append(1.0 - math.exp(-result[5])) + ids.append(result[0]) + return VectorStoreQueryResult( + nodes=nodes, similarities=similarities, ids=ids + ) + + @classmethod + @_handle_exceptions + def from_documents( + cls: Type[OraLlamaVS], + docs: List[TextNode], + table_name: str = "llama_index", + **kwargs: Any, + ) -> OraLlamaVS: + """Return VectorStore initialized from texts and embeddings.""" + _client = kwargs.get("client") + if _client is None: + raise 
ValueError("client parameter is required...") + params = kwargs.get("params") + distance_strategy = kwargs.get("distance_strategy") + drop_table_purge(_client, table_name) + + vss = cls( + _client=_client, + table_name=table_name, + params=params, + distance_strategy=distance_strategy, + ) + vss.add(nodes=docs) + return vss diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-oracledb/pyproject.toml b/llama-index-integrations/vector_stores/llama-index-vector-stores-oracledb/pyproject.toml new file mode 100644 index 0000000000000..123056405bd6b --- /dev/null +++ b/llama-index-integrations/vector_stores/llama-index-vector-stores-oracledb/pyproject.toml @@ -0,0 +1,63 @@ +[build-system] +build-backend = "poetry.core.masonry.api" +requires = ["poetry-core"] + +[tool.codespell] +check-filenames = true +check-hidden = true +skip = "*.csv,*.html,*.json,*.jsonl,*.pdf,*.txt,*.ipynb" + +[tool.llamahub] +contains_example = false +import_path = "llama_index.vector_stores.oracledb" + +[tool.llamahub.class_authors] +OraLlamaVS = "llama-index" + +[tool.mypy] +disallow_untyped_defs = true +exclude = ["_static", "build", "examples", "notebooks", "venv"] +ignore_missing_imports = true +python_version = "3.11" + +[tool.poetry] +authors = ["Your Name "] +description = "llama-index vector_stores oracle database integration" +exclude = ["**/BUILD"] +license = "MIT" +name = "llama-index-vector-stores-oracledb" +readme = "README.md" +version = "0.1.0" + +[tool.poetry.dependencies] +python = ">=3.11.1,<4.0" +llama-index-core = ">=^0.11.1" +oracledb = ">=2.2" + +[tool.poetry.group.dev.dependencies] +ipython = "8.10.0" +jupyter = "^1.0.0" +mypy = "0.991" +pre-commit = "3.2.0" +pylint = "2.15.10" +pytest = "7.2.1" +pytest-mock = "3.11.1" +ruff = "0.0.292" +tree-sitter-languages = "^1.8.0" +types-Deprecated = ">=0.1.0" +types-PyYAML = "^6.0.12.12" +types-protobuf = "^4.24.0.4" +types-redis = "4.5.5.0" +types-requests = "2.28.11.8" +types-setuptools = "67.1.0.0" + 
+[tool.poetry.group.dev.dependencies.black] +extras = ["jupyter"] +version = "<=23.9.1,>=23.7.0" + +[tool.poetry.group.dev.dependencies.codespell] +extras = ["toml"] +version = ">=v2.2.6" + +[[tool.poetry.packages]] +include = "llama_index/" diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-oracledb/tests/BUILD b/llama-index-integrations/vector_stores/llama-index-vector-stores-oracledb/tests/BUILD new file mode 100644 index 0000000000000..dabf212d7e716 --- /dev/null +++ b/llama-index-integrations/vector_stores/llama-index-vector-stores-oracledb/tests/BUILD @@ -0,0 +1 @@ +python_tests() diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-oracledb/tests/__init__.py b/llama-index-integrations/vector_stores/llama-index-vector-stores-oracledb/tests/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-oracledb/tests/test_vector_stores_orallamavs.py b/llama-index-integrations/vector_stores/llama-index-vector-stores-oracledb/tests/test_vector_stores_orallamavs.py new file mode 100644 index 0000000000000..e2035cab28dad --- /dev/null +++ b/llama-index-integrations/vector_stores/llama-index-vector-stores-oracledb/tests/test_vector_stores_orallamavs.py @@ -0,0 +1,7 @@ +from llama_index.core.vector_stores.types import BasePydanticVectorStore +from llama_index.vector_stores.oracledb import OraLlamaVS + + +def test_class(): + names_of_base_classes = [b.__name__ for b in OraLlamaVS.__mro__] + assert BasePydanticVectorStore.__name__ in names_of_base_classes diff --git a/llama-index-utils/llama-index-utils-oracleai/.gitignore b/llama-index-utils/llama-index-utils-oracleai/.gitignore new file mode 100644 index 0000000000000..990c18de22908 --- /dev/null +++ b/llama-index-utils/llama-index-utils-oracleai/.gitignore @@ -0,0 +1,153 @@ +llama_index/_static +.DS_Store +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class 
+ +# C extensions +*.so + +# Distribution / packaging +.Python +bin/ +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +etc/ +include/ +lib/ +lib64/ +parts/ +sdist/ +share/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +.ruff_cache + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints +notebooks/ + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ +pyvenv.cfg + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# Jetbrains +.idea +modules/ +*.swp + +# VsCode +.vscode + +# pipenv +Pipfile +Pipfile.lock + +# pyright +pyrightconfig.json diff --git a/llama-index-utils/llama-index-utils-oracleai/BUILD b/llama-index-utils/llama-index-utils-oracleai/BUILD new file mode 100644 index 0000000000000..0896ca890d8bf --- /dev/null +++ b/llama-index-utils/llama-index-utils-oracleai/BUILD @@ -0,0 +1,3 @@ +poetry_requirements( + name="poetry", +) diff --git a/llama-index-utils/llama-index-utils-oracleai/Makefile b/llama-index-utils/llama-index-utils-oracleai/Makefile new file mode 100644 index 0000000000000..b9eab05aa3706 --- /dev/null +++ b/llama-index-utils/llama-index-utils-oracleai/Makefile @@ -0,0 +1,17 @@ +GIT_ROOT ?= $(shell git rev-parse --show-toplevel) + +help: ## Show all Makefile targets. + @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[33m%-30s\033[0m %s\n", $$1, $$2}' + +format: ## Run code autoformatters (black). + pre-commit install + git ls-files | xargs pre-commit run black --files + +lint: ## Run linters: pre-commit (black, ruff, codespell) and mypy + pre-commit install && git ls-files | xargs pre-commit run --show-diff-on-failure --files + +test: ## Run tests via pytest. + pytest tests + +watch-docs: ## Build and watch documentation. 
+ sphinx-autobuild docs/ docs/_build/html --open-browser --watch $(GIT_ROOT)/llama_index/ diff --git a/llama-index-utils/llama-index-utils-oracleai/README.md b/llama-index-utils/llama-index-utils-oracleai/README.md new file mode 100644 index 0000000000000..ec0c2bff78901 --- /dev/null +++ b/llama-index-utils/llama-index-utils-oracleai/README.md @@ -0,0 +1 @@ +# LlamaIndex Utils Integration: Oracleai diff --git a/llama-index-utils/llama-index-utils-oracleai/llama_index/utils/oracleai/BUILD b/llama-index-utils/llama-index-utils-oracleai/llama_index/utils/oracleai/BUILD new file mode 100644 index 0000000000000..db46e8d6c978c --- /dev/null +++ b/llama-index-utils/llama-index-utils-oracleai/llama_index/utils/oracleai/BUILD @@ -0,0 +1 @@ +python_sources() diff --git a/llama-index-utils/llama-index-utils-oracleai/llama_index/utils/oracleai/__init__.py b/llama-index-utils/llama-index-utils-oracleai/llama_index/utils/oracleai/__init__.py new file mode 100644 index 0000000000000..5932763d5d844 --- /dev/null +++ b/llama-index-utils/llama-index-utils-oracleai/llama_index/utils/oracleai/__init__.py @@ -0,0 +1,4 @@ +from llama_index.utils.oracleai.base import OracleSummary + + +__all__ = ["OracleSummary"] diff --git a/llama-index-utils/llama-index-utils-oracleai/llama_index/utils/oracleai/base.py b/llama-index-utils/llama-index-utils-oracleai/llama_index/utils/oracleai/base.py new file mode 100644 index 0000000000000..22de65fc0f1ee --- /dev/null +++ b/llama-index-utils/llama-index-utils-oracleai/llama_index/utils/oracleai/base.py @@ -0,0 +1,169 @@ +# ----------------------------------------------------------------------------- +# Authors: +# Harichandan Roy (hroy) +# David Jiang (ddjiang) +# +# ----------------------------------------------------------------------------- +# ...utils/oracleai.py +# ----------------------------------------------------------------------------- + +from __future__ import annotations + +import json +import logging +import traceback +from typing import 
class OracleSummary:
    """Generate text summaries with ``dbms_vector_chain.utl_to_summary``.

    Args:
        conn: Oracle connection used to run the summary calls.
        params: Summary parameters; serialized with ``json.dumps`` and passed
            to ``utl_to_summary`` as its JSON argument.
        proxy: Optional proxy; when set it is applied with
            ``utl_http.set_proxy`` before any summary is requested.
    """

    # Anonymous PL/SQL block shared by every input type: binds the text in
    # :data, the JSON parameters in :params and returns the summary in :summ.
    _SUMMARY_PLSQL = """
        declare
            input clob;
        begin
            input := :data;
            :summ := dbms_vector_chain.utl_to_summary(input, json(:params));
        end;"""

    def __init__(
        self, conn: "Connection", params: Dict[str, Any], proxy: Optional[str] = None
    ):
        self.conn = conn
        self.proxy = proxy
        self.summary_params = params

    @staticmethod
    def _doc_text(doc: Any) -> str:
        """Return the text to summarize for a ``str`` or ``Document`` input."""
        if isinstance(doc, str):
            return doc
        if isinstance(doc, Document):
            return doc.text
        # TypeError is a subclass of Exception, so callers that caught the
        # previous generic Exception keep working.
        raise TypeError("Invalid input type")

    def get_summary(self, docs: Any) -> Optional[List[str]]:
        """Get the summary of the input docs.

        Args:
            docs: The documents to generate summary for.
                  Allowed input types: str, Document, List[str], List[Document]

        Returns:
            List of summary text, one for each input doc (an empty string
            when the database returns NULL), or ``None`` when ``docs`` is
            ``None``.

        Raises:
            ImportError: If the ``oracledb`` package is not installed.
            TypeError: If ``docs`` (or an item of the list) is neither a
                ``str`` nor a ``Document``.
        """
        try:
            import oracledb
        except ImportError as e:
            raise ImportError(
                "Unable to import oracledb, please install with "
                "`pip install -U oracledb`."
            ) from e

        if docs is None:
            return None

        # Normalize every accepted input shape to a list of plain strings so
        # a single code path talks to the database.
        items = docs if isinstance(docs, list) else [docs]
        texts = [self._doc_text(item) for item in items]

        # Make CLOB values come back as str instead of LOB locators.
        oracledb.defaults.fetch_lobs = False
        params_json = json.dumps(self.summary_params)

        # Acquire the cursor outside the try block: if this call fails there
        # is nothing to close, and the original error must not be masked.
        cursor = self.conn.cursor()
        try:
            if self.proxy:
                cursor.execute(
                    "begin utl_http.set_proxy(:proxy); end;", proxy=self.proxy
                )

            results: List[str] = []
            for text in texts:
                summary = cursor.var(oracledb.DB_TYPE_CLOB)
                cursor.execute(
                    self._SUMMARY_PLSQL,
                    data=text,
                    params=params_json,
                    summ=summary,
                )
                # The bind variable object itself is never None; the NULL
                # check has to be made on the value it holds.
                value = summary.getvalue()
                results.append("" if value is None else str(value))
            return results
        except Exception:
            logger.exception("An exception occurred while generating summaries")
            raise
        finally:
            cursor.close()
def test_summary_test() -> None:
    """Smoke-test ``OracleSummary.get_summary`` against a live Oracle database.

    The module-level ``uname``, ``passwd`` and ``v_dsn`` settings must hold
    real connection details. While any of them is empty the test returns
    without contacting a database, so it stays green in environments that
    have no Oracle instance. Once they are filled in, connection errors and
    failed assertions are reported instead of being swallowed.
    """
    if not (uname and passwd and v_dsn):
        # No database configured: nothing to exercise.
        return

    # The module-level import of oracledb is TYPE_CHECKING-only, so the name
    # does not exist at runtime; import it here where it is actually used.
    import oracledb

    connection = oracledb.connect(user=uname, password=passwd, dsn=v_dsn)
    try:
        doc = """LlamaIndex is a data framework designed specifically
        for Large Language Models (LLMs). It acts as a bridge between
        your enterprise data and LLM applications, allowing you to leverage
        the power of LLMs for various tasks. Here's a breakdown of its key
        features and functionalities: Data Integration, Knowledge Base Creation,
        Retrieval and Augmentation, Integration with LLMs and so on. """

        # get oracle summary
        summary_params = {
            "provider": "database",
            "glevel": "S",
            "numParagraphs": 1,
            "language": "english",
        }
        summary = OracleSummary(conn=connection, params=summary_params)
        summ = summary.get_summary(doc)

        # verify
        assert len(summ) != 0
    finally:
        # Release the connection even when the summary call or the
        # assertion fails.
        connection.close()