From 4179de4e6cb507b9a235a7fd1e91f4d4980d5a0d Mon Sep 17 00:00:00 2001 From: Kevin D Smith Date: Thu, 14 Nov 2024 15:39:04 -0600 Subject: [PATCH] Add notebook validation (#124) --- .pre-commit-config.yaml | 3 +- notebooks/atlas-and-kai/notebook.ipynb | 142 +++--- notebooks/backup-database-s3/notebook.ipynb | 54 ++- notebooks/basic-query-examples/notebook.ipynb | 186 ++++---- .../notebook.ipynb | 66 +-- .../cdc-mongodb-sql-commands/notebook.ipynb | 152 +++--- .../cloud-functions-template/notebook.ipynb | 54 +-- .../notebook.ipynb | 76 +-- notebooks/create-dash-app/notebook.ipynb | 70 +-- .../notebook.ipynb | 117 +++-- .../notebook.ipynb | 266 +++++------ .../notebook.ipynb | 180 +++---- .../notebook.ipynb | 100 ++-- .../notebook.ipynb | 146 +++--- .../notebook.ipynb | 378 +++++++-------- .../notebook.ipynb | 184 +++++--- .../notebook.ipynb | 102 ++-- notebooks/hybrid-search/notebook.ipynb | 146 +++--- .../image-matching-with-sql/notebook.ipynb | 90 ++-- .../notebook.ipynb | 75 +-- .../notebook.ipynb | 96 ++-- notebooks/insure-gpt-demo/notebook.ipynb | 30 +- .../integrating-with-pandas/notebook.ipynb | 338 +++++++------- notebooks/kebab-case/notebook.ipynb | 44 +- .../notebook.ipynb | 64 +-- .../notebook.ipynb | 78 ++-- notebooks/load-csv-data-s3/notebook.ipynb | 99 ++-- notebooks/load-data-json/notebook.ipynb | 108 +++-- notebooks/load-data-kakfa/notebook.ipynb | 87 ++-- notebooks/load-json-files-s3/notebook.ipynb | 140 +++--- .../notebook.ipynb | 84 ++-- .../notebook.ipynb | 196 ++++---- notebooks/movie-recommendation/notebook.ipynb | 224 ++++----- .../notebook.ipynb | 152 +++--- .../notebook.ipynb | 160 +++---- .../notebook.ipynb | 144 +++--- notebooks/notebook-basics/notebook.ipynb | 158 +++---- .../notebook.ipynb | 192 ++++---- .../notebook.ipynb | 96 ++-- .../pipelines-query-tuning/notebook.ipynb | 212 ++++----- notebooks/rag-with-bedrock/notebook.ipynb | 146 ++++-- .../notebook.ipynb | 136 +++--- .../notebook.ipynb | 137 ++++-- .../notebook.ipynb | 84 ++-- 
.../restore-database-from-s3/notebook.ipynb | 57 ++- .../notebook.ipynb | 146 +++--- notebooks/resume-evaluator/notebook.ipynb | 114 ++--- .../searching-all-of-wikipedia/notebook.ipynb | 148 +++--- .../notebook.ipynb | 94 ++-- .../notebook.ipynb | 94 ++-- .../notebook.ipynb | 158 +++---- .../notebook.ipynb | 210 ++++----- .../notebook.ipynb | 102 ++-- .../notebook.ipynb | 48 +- notebooks/singlestore-now-2024/notebook.ipynb | 106 ++--- .../notebook.ipynb | 188 ++++---- .../vector-database-basics/notebook.ipynb | 45 +- .../vector-search-with-kai/notebook.ipynb | 100 ++-- .../working-with-vector-data/notebook.ipynb | 442 +++++++++--------- resources/nb-check.py | 64 ++- 60 files changed, 4183 insertions(+), 3725 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 8233daea..23ccf27a 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -44,9 +44,10 @@ repos: - id: nb-check name: nb-check entry: resources/nb-check.py - language: system + language: python files: \.ipynb$ exclude: notebooks/notebook-style-guide/notebook.ipynb + additional_dependencies: [nbformat==5.10.4] - id: nb-meta-check name: nb-meta-check entry: resources/nb-meta-check.py diff --git a/notebooks/atlas-and-kai/notebook.ipynb b/notebooks/atlas-and-kai/notebook.ipynb index fed7e6f5..0a6aea23 100644 --- a/notebooks/atlas-and-kai/notebook.ipynb +++ b/notebooks/atlas-and-kai/notebook.ipynb @@ -1,8 +1,8 @@ { "cells": [ { + "id": "50bd6467", "cell_type": "markdown", - "id": "48a6458f-75ed-4a6c-aaa8-184bb9edfb75", "metadata": {}, "source": [ "
\n", @@ -17,8 +17,8 @@ ] }, { + "id": "62d047d5", "cell_type": "markdown", - "id": "df80b45e", "metadata": {}, "source": [ "
\n", @@ -33,16 +33,15 @@ { "attachments": {}, "cell_type": "markdown", - "id": "ca93a410-d513-42ec-a823-99ad8f3a25c1", "metadata": {}, "source": [ "" - ] + ], + "id": "ed84f878" }, { "attachments": {}, "cell_type": "markdown", - "id": "5353b6a2-006f-4a71-834f-045d3e054640", "metadata": {}, "source": [ "# No code change required! 100% MongoDB notebook!\n", @@ -63,51 +62,51 @@ "7. Average satisfaction per product\n", "8. Number of transactions by Location and membership\n", "9. Top 10 product sales" - ] + ], + "id": "8bf8f031" }, { "attachments": {}, "cell_type": "markdown", - "id": "a5f3d92f-5721-4f28-a91a-b04def563dfb", "metadata": {}, "source": [ "## 1. Install libraries and import modules" - ] + ], + "id": "38e1f148" }, { "attachments": {}, "cell_type": "markdown", - "id": "856860b6-c6ac-4f72-8d64-5d405dbb7acc", "metadata": {}, "source": [ "**Make sure that you have a created MongoDB enabled workspace.**\n", "\n", "This must be done when creating a workspace (for Standard/Premium Workspaces) For Starter Workspaces, the KAI API will be on by default." - ] + ], + "id": "c845fbec" }, { "attachments": {}, "cell_type": "markdown", - "id": "33506e25-c044-4f6f-9d62-df61783076e1", "metadata": {}, "source": [ "" - ] + ], + "id": "31dd9660" }, { "cell_type": "code", "execution_count": 1, - "id": "26ec8d2d-25b1-4b8f-a62f-098192b8d45f", "metadata": {}, "outputs": [], "source": [ "!pip install pymongo pandas matplotlib plotly ipywidgets --quiet" - ] + ], + "id": "812aabaf" }, { "cell_type": "code", "execution_count": 2, - "id": "6087e187-ab0b-4df6-8c9e-ee9fc7153a6b", "metadata": {}, "outputs": [], "source": [ @@ -119,12 +118,12 @@ "else:\n", " database_to_use = \"new_transactions\"\n", " %sql CREATE DATABASE {{database_to_use}}" - ] + ], + "id": "09fe48fc" }, { "attachments": {}, "cell_type": "markdown", - "id": "27a6f491", "metadata": {}, "source": [ "
\n", @@ -134,12 +133,12 @@ "

Make sure to select a database from the drop-down menu at the top of this notebook. It updates the connection_url to connect to that database.

\n", "
\n", "
" - ] + ], + "id": "8606b198" }, { "cell_type": "code", "execution_count": 3, - "id": "3722ef02-42b0-41af-869a-b4b1f7f62e02", "metadata": {}, "outputs": [], "source": [ @@ -153,22 +152,22 @@ "import pymongo\n", "from pymongo import MongoClient\n", "from plotly.offline import plot, iplot, init_notebook_mode" - ] + ], + "id": "4e294ccd" }, { "attachments": {}, "cell_type": "markdown", - "id": "b96597f7-68e1-45d3-bbb0-dbfb5f440881", "metadata": {}, "source": [ "## 2. Connect to Atlas and SingleStore Kai endpoints\n", "We are using a shared tier on the backend for Atlas" - ] + ], + "id": "32ff8a15" }, { "cell_type": "code", "execution_count": 4, - "id": "d038bacc-ae3d-450e-a955-a304f9a07c74", "metadata": {}, "outputs": [], "source": [ @@ -178,23 +177,23 @@ "mongoitems = mydbmongodb[\"items\"]\n", "mongocusts = mydbmongodb[\"custs\"]\n", "mongotxs = mydbmongodb[\"txs\"]" - ] + ], + "id": "31bacc5f" }, { "attachments": {}, "cell_type": "markdown", - "id": "4e8f3c25-3399-4095-a034-438617daa5da", "metadata": {}, "source": [ "**Select the workspace that you want to use.**\n", "\n", "" - ] + ], + "id": "9ce6f065" }, { "cell_type": "code", "execution_count": 5, - "id": "e53b6983-8c62-4b45-85d5-fb29fb655936", "metadata": {}, "outputs": [], "source": [ @@ -203,21 +202,21 @@ "s2mongoitems = s2dbmongodb[\"items\"]\n", "s2mongocusts = s2dbmongodb[\"custs\"]\n", "s2mongotxs = s2dbmongodb[\"txs\"]" - ] + ], + "id": "f11f98cf" }, { "attachments": {}, "cell_type": "markdown", - "id": "a6f36725-4b74-4460-b1c9-a0144159a7b4", "metadata": {}, "source": [ "## 3. 
Copy Atlas collections into SingleStore Kai" - ] + ], + "id": "6921e8c9" }, { "cell_type": "code", "execution_count": 6, - "id": "5cb978bc-03cc-4477-853d-577fc856ca94", "metadata": {}, "outputs": [], "source": [ @@ -228,61 +227,61 @@ " data_dict = df.to_dict(orient='records')\n", " s2mongo_collection = s2dbmongodb[mongo_collection.name]\n", " s2mongo_collection.insert_many(data_dict)" - ] + ], + "id": "71cf4b5c" }, { "attachments": {}, "cell_type": "markdown", - "id": "5b3928f8-2487-4553-962f-eb5bc2d83096", "metadata": {}, "source": [ "Count documents in SingleStore" - ] + ], + "id": "b0ea4a7f" }, { "cell_type": "code", "execution_count": 7, - "id": "91ce2d6e-3d02-4c57-88f7-365d7449d84c", "metadata": {}, "outputs": [], "source": [ "mg_count = s2mongoitems.count_documents({})\n", "mg_count" - ] + ], + "id": "62cf7161" }, { "attachments": {}, "cell_type": "markdown", - "id": "48841366-41fb-45f6-81d7-323cda1b1df7", "metadata": {}, "source": [ "# Compare Queries and Performance" - ] + ], + "id": "b9aa35b7" }, { "attachments": {}, "cell_type": "markdown", - "id": "2069ac4e-13a0-425a-b063-7434b339dd8e", "metadata": {}, "source": [ "**In-app analytics is everywhere.**\n", "\n", "" - ] + ], + "id": "7458c39a" }, { "attachments": {}, "cell_type": "markdown", - "id": "01f555e2-b809-4261-a8df-0669be80377c", "metadata": {}, "source": [ "## 4. Document counts" - ] + ], + "id": "6e758302" }, { "cell_type": "code", "execution_count": 8, - "id": "f1c54716-a4e9-49ae-9035-75a4c3761c90", "metadata": {}, "outputs": [], "source": [ @@ -329,21 +328,21 @@ "fig.update_xaxes(tickmode='array', tickvals=[1, 2, 3, 4, 5,6,7,8,9,10], row=1, col=1)\n", "\n", "fig" - ] + ], + "id": "6e89408d" }, { "attachments": {}, "cell_type": "markdown", - "id": "94d4c502-fb66-45cb-a520-6ee39ae35476", "metadata": {}, "source": [ "## 5. 
Product Quantity Sold" - ] + ], + "id": "f530c080" }, { "cell_type": "code", "execution_count": 9, - "id": "6de02fc3-fe7b-4dd4-a495-d0e785f4c58f", "metadata": {}, "outputs": [], "source": [ @@ -399,21 +398,21 @@ "fig.update_xaxes(tickmode='array', tickvals=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10], row=1, col=1)\n", "\n", "fig" - ] + ], + "id": "4c5569fc" }, { "attachments": {}, "cell_type": "markdown", - "id": "9f11b18f-d414-4107-83a9-5d9d10172d6a", "metadata": {}, "source": [ "## 6. Average Customer Satisfaction" - ] + ], + "id": "fc9813d0" }, { "cell_type": "code", "execution_count": 10, - "id": "c4bfc8e2-3f72-47be-b789-8f44a547ef60", "metadata": {}, "outputs": [], "source": [ @@ -473,21 +472,21 @@ "fig.update_xaxes(tickmode='array', tickvals=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10], row=1, col=1)\n", "\n", "fig" - ] + ], + "id": "82b870c7" }, { "attachments": {}, "cell_type": "markdown", - "id": "e6657ab9-551b-4d09-a1be-50a1b9091558", "metadata": {}, "source": [ "## 7. Average Satisfaction per Product" - ] + ], + "id": "31535113" }, { "cell_type": "code", "execution_count": 11, - "id": "8015fdd4-c6eb-437a-9d60-ee937817caf3", "metadata": {}, "outputs": [], "source": [ @@ -556,21 +555,21 @@ "fig.update_xaxes(tickmode='array', tickvals=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10], row=1, col=1)\n", "\n", "fig" - ] + ], + "id": "16041e02" }, { "attachments": {}, "cell_type": "markdown", - "id": "b14eb709-b58b-461e-b415-a4ca3461b1a6", "metadata": {}, "source": [ "## 8. Number of transactions by location and membership" - ] + ], + "id": "98090bfe" }, { "cell_type": "code", "execution_count": 12, - "id": "78abd324-cace-4ad6-abe7-d1b5d166a7e7", "metadata": {}, "outputs": [], "source": [ @@ -642,21 +641,21 @@ "fig.update_xaxes(tickmode='array', tickvals=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10], row=1, col=1)\n", "\n", "fig" - ] + ], + "id": "48b8c84b" }, { "attachments": {}, "cell_type": "markdown", - "id": "83fa3e5c-975e-410c-a25e-4db2b8389952", "metadata": {}, "source": [ "## 9. 
Top 10 Product Sales" - ] + ], + "id": "2c7f8d4c" }, { "cell_type": "code", "execution_count": 13, - "id": "57a5a473-e840-4310-8f31-c53d9420a4cc", "metadata": {}, "outputs": [], "source": [ @@ -724,21 +723,21 @@ "fig.update_xaxes(tickmode='array', tickvals=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10], row=1, col=1)\n", "\n", "fig" - ] + ], + "id": "2022a7d1" }, { "attachments": {}, "cell_type": "markdown", - "id": "f162caba-a871-4b95-a5f5-0014901f12ff", "metadata": {}, "source": [ "## Clean up" - ] + ], + "id": "599cc583" }, { "attachments": {}, "cell_type": "markdown", - "id": "15754544", "metadata": {}, "source": [ "
\n", @@ -748,23 +747,24 @@ "

If you created a new database in your Standard or Premium Workspace, you can drop the database by running the cell below. Note: this will not drop your database for Free Starter Workspaces. To drop a Free Starter Workspace, terminate the Workspace using the UI.

\n", "
\n", "
" - ] + ], + "id": "e6d3891a" }, { "cell_type": "code", "execution_count": 14, - "id": "40e2ed0b-9b43-446a-9150-823b8e87dd0d", "metadata": {}, "outputs": [], "source": [ "shared_tier_check = %sql show variables like 'is_shared_tier'\n", "if not shared_tier_check or shared_tier_check[0][1] == 'OFF':\n", " %sql DROP DATABASE IF EXISTS new_transactions;" - ] + ], + "id": "17c3f52b" }, { + "id": "bd74b914", "cell_type": "markdown", - "id": "7434e7b2-8e62-4605-9666-622efaefd3d9", "metadata": {}, "source": [ "
\n", diff --git a/notebooks/backup-database-s3/notebook.ipynb b/notebooks/backup-database-s3/notebook.ipynb index f8c49454..b4bbd09f 100644 --- a/notebooks/backup-database-s3/notebook.ipynb +++ b/notebooks/backup-database-s3/notebook.ipynb @@ -1,8 +1,8 @@ { "cells": [ { + "id": "94965d32", "cell_type": "markdown", - "id": "f60b7a01-7fee-4f5a-a3a7-a45590c036ad", "metadata": {}, "source": [ "
\n", @@ -40,7 +40,8 @@ "## Questions?\n", "\n", "Reach out to us through our [forum](https://www.singlestore.com/forum)." - ] + ], + "id": "6992694c" }, { "attachments": {}, @@ -65,7 +66,8 @@ "
  • General format is 'database_name.backup'.
  • \n", "
  • AWS IAM user should have S3 read,   write access
  • \n", "" - ] + ], + "id": "209dd4f5" }, { "attachments": {}, @@ -73,7 +75,8 @@ "metadata": {}, "source": [ "### Imports" - ] + ], + "id": "ac58f380" }, { "cell_type": "code", @@ -88,7 +91,8 @@ "\n", "import singlestoredb as s2\n", "from IPython.display import display, HTML" - ] + ], + "id": "61bab745" }, { "attachments": {}, @@ -96,7 +100,8 @@ "metadata": {}, "source": [ "### Variables" - ] + ], + "id": "471c7717" }, { "cell_type": "code", @@ -110,7 +115,8 @@ "s3_target_path = None\n", "aws_session_token = None\n", "is_incremental_backup = 'N'" - ] + ], + "id": "8652cd49" }, { "attachments": {}, @@ -118,7 +124,8 @@ "metadata": {}, "source": [ "### Functions to display various alerts" - ] + ], + "id": "9e661147" }, { "cell_type": "code", @@ -177,7 +184,8 @@ "

    {success_msg}

    \n", "
    \n", "'''))" - ] + ], + "id": "d100faff" }, { "attachments": {}, @@ -185,7 +193,8 @@ "metadata": {}, "source": [ "### LogControl" - ] + ], + "id": "44d05387" }, { "attachments": {}, @@ -197,7 +206,8 @@ "To enable logs\n", "\n", " - Modify 'enable_debug_log(False)' to 'enable_debug_log(True)' in code below" - ] + ], + "id": "bdfe6e78" }, { "cell_type": "code", @@ -210,7 +220,8 @@ " logging.getLogger().setLevel(logging.DEBUG)\n", " else:\n", " logging.getLogger().setLevel(logging.ERROR)" - ] + ], + "id": "ebe1585e" }, { "attachments": {}, @@ -218,7 +229,8 @@ "metadata": {}, "source": [ "### Utility functions for handling S3 PATHs, SQL Statement, backup" - ] + ], + "id": "021a7ca2" }, { "cell_type": "code", @@ -321,7 +333,8 @@ " logging.error('Backup execution failed')\n", " else:\n", " logging.info(\"Backup completed\")" - ] + ], + "id": "60f2787b" }, { "cell_type": "code", @@ -381,7 +394,8 @@ " show_error(f'Failed to backup. {str(e)}')\n", "\n", "print('\\n\\nScript execution completed')" - ] + ], + "id": "bf821c9a" }, { "attachments": {}, @@ -397,7 +411,8 @@ "You may use below query to check backups created ( apply filter to limit data as per your needs )\n", "\n", " select * from information_schema.MV_BACKUP_HISTORY" - ] + ], + "id": "8fea5794" }, { "attachments": {}, @@ -407,11 +422,12 @@ "**Important Note**\n", "\n", "- To use this as scheduled notebook, we have to modify to read configuration data from table instead of user input" - ] + ], + "id": "6e27e49b" }, { + "id": "8a2988ca", "cell_type": "markdown", - "id": "12c1b550-2950-4c11-8258-7af1ea551263", "metadata": {}, "source": [ "
    \n", @@ -439,5 +455,5 @@ } }, "nbformat": 4, - "nbformat_minor": 4 + "nbformat_minor": 5 } diff --git a/notebooks/basic-query-examples/notebook.ipynb b/notebooks/basic-query-examples/notebook.ipynb index 977e2c99..155a52a8 100644 --- a/notebooks/basic-query-examples/notebook.ipynb +++ b/notebooks/basic-query-examples/notebook.ipynb @@ -1,8 +1,8 @@ { "cells": [ { + "id": "e66c936c", "cell_type": "markdown", - "id": "44ae4cb7-8551-4d4a-9e5a-82342dcee51f", "metadata": {}, "source": [ "
    \n", @@ -17,8 +17,8 @@ ] }, { + "id": "0b6723be", "cell_type": "markdown", - "id": "82a448ca-05ee-411a-a64e-b7f030d2a85c", "metadata": {}, "source": [ "
    \n", @@ -33,70 +33,69 @@ { "attachments": {}, "cell_type": "markdown", - "id": "1a14e95a-9808-4ebf-9f4a-c7390c880f29", "metadata": {}, "source": [ "\n", " \n", " \n", "
    " - ] + ], + "id": "4cfe69c9" }, { "attachments": {}, "cell_type": "markdown", - "id": "92913d25-fb67-47b4-a883-15d2cc7ecef6", "metadata": {}, "source": [ "This notebook demonstrates how to run queries on SingleStore through a series of examples. For simplicity, the sample data in these examples is limited to 10 rows or less per table, which is much smaller than typical workloads. The examples cover various database operations, including index scans, full-table scans, joins, and aggregations." - ] + ], + "id": "9bdcbdbd" }, { "attachments": {}, "cell_type": "markdown", - "id": "80d7700a-f687-4c60-826f-526ae55eea09", "metadata": {}, "source": [ "## 1. Create a Workspace\n", "To create a workspace, refer to the [Creating and Using Workspaces](https://docs.singlestore.com/cloud/getting-started-with-singlestore-helios/about-workspaces/creating-and-using-workspaces/)." - ] + ], + "id": "40050891" }, { "attachments": {}, "cell_type": "markdown", - "id": "876a84da-16d5-472f-8d92-7f00d535af9a", "metadata": {}, "source": [ "## 2. Create the Database\n", "Select the workspace in your notebook, and create a database." - ] + ], + "id": "8551a58b" }, { "cell_type": "code", "execution_count": 1, - "id": "384f780a-b4ff-4ee8-8be3-40185247a793", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "CREATE DATABASE memsql_example;\n", "USE memsql_example;" - ] + ], + "id": "bcc93142" }, { "attachments": {}, "cell_type": "markdown", - "id": "dd20143a-8275-4f15-a72e-b842e1d467cd", "metadata": {}, "source": [ "## 3. Create the tables\n", "Create the tables named departments, employees, and salaries." 
- ] + ], + "id": "c63daae9" }, { "cell_type": "code", "execution_count": 2, - "id": "0844b00f-0cd1-4e05-b616-d565589cacfd", "metadata": {}, "outputs": [], "source": [ @@ -122,22 +121,22 @@ " salary int,\n", " PRIMARY KEY (employeeId)\n", ");" - ] + ], + "id": "9afac7f0" }, { "attachments": {}, "cell_type": "markdown", - "id": "44393e0c-d101-4dcd-ae68-1874fab727f3", "metadata": {}, "source": [ "## 4. Populate the tables\n", "Insert the data into the tables." - ] + ], + "id": "e3310d6d" }, { "cell_type": "code", "execution_count": 3, - "id": "ad1da4d8-abde-4e06-ad3e-8f1620577cab", "metadata": {}, "outputs": [], "source": [ @@ -160,113 +159,113 @@ "INSERT INTO salaries (employeeId, salary) VALUES\n", " (1, 885219), (2, 451519), (3, 288905), (4, 904312), (5, 919124),\n", " (6, 101538), (7, 355077), (8, 900436), (9, 41557), (10, 556263);" - ] + ], + "id": "c29bfd9b" }, { "attachments": {}, "cell_type": "markdown", - "id": "a5ecd8a4-b78f-4e87-a1ee-38d6e8629874", "metadata": {}, "source": [ "## 5. Let's Query!" - ] + ], + "id": "ecce36d9" }, { "attachments": {}, "cell_type": "markdown", - "id": "52dd2459-3fe8-4e8b-8565-8d7abf89411f", "metadata": {}, "source": [ "### The SELECT statement\n", "Ask how many rows are in the employees table." - ] + ], + "id": "05434143" }, { "cell_type": "code", "execution_count": 4, - "id": "be4a19b6-0549-47e0-b573-db2b2420448c", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "SELECT COUNT(*) from employees;" - ] + ], + "id": "110813ed" }, { "attachments": {}, "cell_type": "markdown", - "id": "358d7c70-2888-410a-bea6-2afd7a03f4fb", "metadata": {}, "source": [ "### The ORDER BY clause\n", "List the ID and the name of each employee." 
- ] + ], + "id": "7094c248" }, { "cell_type": "code", "execution_count": 5, - "id": "f36b3d0c-8fe2-41f3-ab21-97ea48218d9c", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "SELECT id, name FROM employees ORDER BY id;" - ] + ], + "id": "47233824" }, { "attachments": {}, "cell_type": "markdown", - "id": "619232f2-2706-4860-9654-65440398d35e", "metadata": {}, "source": [ "### The WHERE clause\n", "List the employees that work in Texas." - ] + ], + "id": "7df73edf" }, { "cell_type": "code", "execution_count": 6, - "id": "4e4f50d0-5791-458f-80ed-fc82e8c0c4f9", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "SELECT id, name FROM employees WHERE state = 'TX' ORDER BY id;" - ] + ], + "id": "60034c4d" }, { "attachments": {}, "cell_type": "markdown", - "id": "930ebd85-0ae1-4467-885a-08fc1b6b367e", "metadata": {}, "source": [ "You can use the same above query by replacing **state = 'NY'** to list the employees that work in New York." - ] + ], + "id": "811da905" }, { "cell_type": "code", "execution_count": 7, - "id": "43b80580-709e-45b8-9d24-c42949a361a5", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "SELECT id, name FROM employees WHERE state = 'NY' ORDER BY id;" - ] + ], + "id": "bcb3d03d" }, { "attachments": {}, "cell_type": "markdown", - "id": "7f626258-5274-4c9c-b9e6-2b46a1e0969d", "metadata": {}, "source": [ "List the employees hired before 2002." - ] + ], + "id": "158a598c" }, { "cell_type": "code", "execution_count": 8, - "id": "b9b0119d-fd9f-49c1-bec2-4baf2d6c30c0", "metadata": {}, "outputs": [], "source": [ @@ -275,21 +274,21 @@ " FROM employees\n", " WHERE hireDate < '2002-01-01'\n", " ORDER BY id;" - ] + ], + "id": "7aae35da" }, { "attachments": {}, "cell_type": "markdown", - "id": "1fa3c609-3fc3-4d23-b491-a9442e0c2f9e", "metadata": {}, "source": [ "List employees and their departments." 
- ] + ], + "id": "73ebb059" }, { "cell_type": "code", "execution_count": 9, - "id": "cf5fb4c7-5ac1-4c5e-92b1-b9563b17b0f7", "metadata": {}, "outputs": [], "source": [ @@ -298,22 +297,22 @@ " employees e, departments d\n", " WHERE e.deptId = d.id\n", " ORDER BY name;" - ] + ], + "id": "58fe3b95" }, { "attachments": {}, "cell_type": "markdown", - "id": "9d235c26-1855-4a91-957d-f85cfeccf426", "metadata": {}, "source": [ "### The COUNT function\n", "List the number of employees in each state." - ] + ], + "id": "c4aeb361" }, { "cell_type": "code", "execution_count": 10, - "id": "0929107e-3ea0-4a19-acf4-7c828f1745db", "metadata": {}, "outputs": [], "source": [ @@ -322,43 +321,43 @@ " from employees\n", " group by state\n", " ORDER BY state;" - ] + ], + "id": "629c27c4" }, { "attachments": {}, "cell_type": "markdown", - "id": "725bcb53-ad7e-46b9-86ab-f092b85df810", "metadata": {}, "source": [ "### The MAX function\n", "Highest salary amongst all employees." - ] + ], + "id": "ded11200" }, { "cell_type": "code", "execution_count": 11, - "id": "cc289e4b-ebb2-43eb-9a26-dca6c4ce9da7", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "SELECT MAX(salary) FROM salaries;" - ] + ], + "id": "d047479f" }, { "attachments": {}, "cell_type": "markdown", - "id": "99276935-07ec-484d-9aee-4cd050e406c5", "metadata": {}, "source": [ "### The Subqueries\n", "Employee with the highest salary." - ] + ], + "id": "a7012ca0" }, { "cell_type": "code", "execution_count": 12, - "id": "9b4d4235-ebd1-412a-8eeb-4c58c7671dff", "metadata": {}, "outputs": [], "source": [ @@ -367,22 +366,22 @@ " FROM employees e, salaries s\n", " WHERE e.id = s.employeeId and\n", " s.salary = (SELECT MAX(salary) FROM salaries);" - ] + ], + "id": "8c2a5493" }, { "attachments": {}, "cell_type": "markdown", - "id": "c4731afe-f9bd-46f3-b389-e537cf9f6088", "metadata": {}, "source": [ "### The AVG function\n", "Average salary of employees in each state." 
- ] + ], + "id": "9e91ec52" }, { "cell_type": "code", "execution_count": 13, - "id": "dcc6c8d8-210b-4538-9767-ccd4b127fd8c", "metadata": {}, "outputs": [], "source": [ @@ -392,22 +391,22 @@ " JOIN salaries s on e.id = s.employeeId\n", " GROUP BY e.state\n", " ORDER BY e.state;" - ] + ], + "id": "db4faa72" }, { "attachments": {}, "cell_type": "markdown", - "id": "41b09697-f371-4e8f-88c7-cb3f96329294", "metadata": {}, "source": [ "### The IN operator\n", "List of managers." - ] + ], + "id": "12d85cec" }, { "cell_type": "code", "execution_count": 14, - "id": "90303d05-c06a-4f40-89bf-20760b3fd810", "metadata": {}, "outputs": [], "source": [ @@ -416,22 +415,22 @@ " FROM employees\n", " WHERE id IN (SELECT managerId FROM employees)\n", " ORDER BY name;" - ] + ], + "id": "0a414332" }, { "attachments": {}, "cell_type": "markdown", - "id": "ec44f51b-e6f7-43e3-ac46-5b594ae7e602", "metadata": {}, "source": [ "### The NOT IN operator\n", "List of non-managers." - ] + ], + "id": "c9b0f7f6" }, { "cell_type": "code", "execution_count": 15, - "id": "244b7eb1-73f0-4a0b-83ae-63d7fa77220d", "metadata": {}, "outputs": [], "source": [ @@ -440,22 +439,22 @@ " FROM employees\n", " WHERE id NOT IN (SELECT managerId FROM employees)\n", " ORDER BY name;" - ] + ], + "id": "d02b4c74" }, { "attachments": {}, "cell_type": "markdown", - "id": "2789bd8c-9db0-4e35-aa6b-da233d03ba79", "metadata": {}, "source": [ "### The Joins\n", "Number of employees reporting to each manager." - ] + ], + "id": "f1b4df30" }, { "cell_type": "code", "execution_count": 16, - "id": "5f726523-0031-44d3-b2f2-9fca93df7cf7", "metadata": {}, "outputs": [], "source": [ @@ -465,21 +464,21 @@ " JOIN employees e ON m.id = e.managerId\n", " GROUP BY m.id\n", " ORDER BY count DESC;" - ] + ], + "id": "360fb1fd" }, { "attachments": {}, "cell_type": "markdown", - "id": "55f5acf0-5b52-4440-8691-44d5fa60aba3", "metadata": {}, "source": [ "Number of employees reporting to each employee." 
- ] + ], + "id": "bed134b6" }, { "cell_type": "code", "execution_count": 17, - "id": "66475057-43dc-4c4b-aefb-7254b64d1bf6", "metadata": {}, "outputs": [], "source": [ @@ -489,21 +488,21 @@ " LEFT JOIN employees e ON m.id = e.managerId\n", " GROUP BY m.id\n", " ORDER BY count desc;" - ] + ], + "id": "a8fb1c38" }, { "attachments": {}, "cell_type": "markdown", - "id": "fe971b3a-8447-45e2-88c3-c546062c8982", "metadata": {}, "source": [ "Manager of each employee." - ] + ], + "id": "b1e9e462" }, { "cell_type": "code", "execution_count": 18, - "id": "084b7c27-78f1-4132-b05c-63d505c42f91", "metadata": {}, "outputs": [], "source": [ @@ -512,21 +511,21 @@ " FROM employees e\n", " LEFT JOIN employees m ON e.managerId = m.id\n", " ORDER BY manager_name;" - ] + ], + "id": "3844b6f3" }, { "attachments": {}, "cell_type": "markdown", - "id": "1b061775-518e-400e-8d64-a6b2f52066e7", "metadata": {}, "source": [ "Total salary of the employees reporting to each manager." - ] + ], + "id": "f04ab919" }, { "cell_type": "code", "execution_count": 19, - "id": "fefb6de0-9796-480a-b743-13682a5774e1", "metadata": {}, "outputs": [], "source": [ @@ -537,21 +536,21 @@ " JOIN salaries s ON s.employeeId = e.id\n", " GROUP BY m.id\n", " ORDER BY SUM(salary) DESC;" - ] + ], + "id": "6893f63d" }, { "attachments": {}, "cell_type": "markdown", - "id": "4ad0f65c-2b70-47ac-bebe-131e5549e67c", "metadata": {}, "source": [ "Employees in the finance department earning more than their manager." 
- ] + ], + "id": "c060083c" }, { "cell_type": "code", "execution_count": 20, - "id": "474eae13-b450-45cc-b262-ddbc49a3baaf", "metadata": {}, "outputs": [], "source": [ @@ -566,20 +565,21 @@ "WHERE d.name = 'Finance'\n", " AND sm.salary < se.salary\n", "ORDER BY employee_salary, manager_salary;" - ] + ], + "id": "64a8fe23" }, { "attachments": {}, "cell_type": "markdown", - "id": "dc39e7b5-2f03-40ce-9800-c2f3728fcf9e", "metadata": {}, "source": [ "**Learn more about SingleStore notebooks [here](https://docs.singlestore.com/managed-service/en/developer-resources/notebooks.html).**" - ] + ], + "id": "c4a4cb1b" }, { + "id": "fa4013d3", "cell_type": "markdown", - "id": "10791647-93d6-4670-9995-a0974423d1f4", "metadata": {}, "source": [ "
    \n", diff --git a/notebooks/building-a-multi-agent-ai-app-with-autogen/notebook.ipynb b/notebooks/building-a-multi-agent-ai-app-with-autogen/notebook.ipynb index 053d6123..cfbed760 100644 --- a/notebooks/building-a-multi-agent-ai-app-with-autogen/notebook.ipynb +++ b/notebooks/building-a-multi-agent-ai-app-with-autogen/notebook.ipynb @@ -1,8 +1,8 @@ { "cells": [ { + "id": "41c70e1c", "cell_type": "markdown", - "id": "1ad5c93a-aa63-4eb1-8694-35a01a64fd61", "metadata": {}, "source": [ "
    \n", @@ -17,8 +17,8 @@ ] }, { + "id": "1c6aa139", "cell_type": "markdown", - "id": "a6a73b8a-fd3d-4042-84da-25bdcb56ad32", "metadata": {}, "source": [ "
    \n", @@ -32,7 +32,6 @@ }, { "cell_type": "markdown", - "id": "7b8c5501", "metadata": {}, "source": [ "# Python Notebook Introduction\n", @@ -54,12 +53,12 @@ "6. **Chat Simulation**: This section runs the chat simulation without and with the Retrieve and Generate (RAG) model.\n", "\n", "Please ensure that you have the necessary API keys and environment variables set up before running this notebook." - ] + ], + "id": "4ab9a0e9" }, { "cell_type": "code", "execution_count": 1, - "id": "20d54e4b", "metadata": {}, "outputs": [], "source": [ @@ -70,32 +69,32 @@ "if not shared_tier_check or shared_tier_check[0][1] == 'OFF':\n", " %sql DROP DATABASE IF EXISTS autogen\n", " %sql CREATE DATABASE autogen" - ] + ], + "id": "cac05073" }, { "cell_type": "code", "execution_count": 2, - "id": "b714bd8c-11f8-4411-8d85-674897272b04", "metadata": {}, "outputs": [], "source": [ "!pip install --quiet langchain_community pyautogen langchain_openai langchain_text_splitters unstructured" - ] + ], + "id": "8d379d39" }, { "cell_type": "code", "execution_count": 3, - "id": "44bef047-37ba-4d2f-a36b-124e4be49460", "metadata": {}, "outputs": [], "source": [ "!pip install --quiet markdown" - ] + ], + "id": "9971496f" }, { "cell_type": "code", "execution_count": 4, - "id": "1df928ba-29e0-4cf6-8b7d-7b1323481ae0", "metadata": {}, "outputs": [], "source": [ @@ -103,12 +102,12 @@ "\n", "r = requests.get(\"https://raw.githubusercontent.com/microsoft/FLAML/main/website/docs/Examples/Integrate%20-%20Spark.md\")\n", "open('example.md', 'wb').write(r.content)" - ] + ], + "id": "ea22b9ad" }, { "cell_type": "code", "execution_count": 5, - "id": "f7370802-88cb-4bd6-939b-a466a7670ac4", "metadata": {}, "outputs": [], "source": [ @@ -132,12 +131,12 @@ "embeddings = OpenAIEmbeddings()\n", "\n", "os.environ[\"SINGLESTOREDB_URL\"] = \"admin:pass@host:3306/db\"" - ] + ], + "id": "f72ca06c" }, { "cell_type": "code", "execution_count": 6, - "id": "4dcdde91-4936-475c-a7a4-9b7bf1803fd2", "metadata": {}, "outputs": 
[], "source": [ @@ -146,22 +145,22 @@ " embeddings,\n", " table_name=\"notebook2\", # use table with a custom name\n", ")" - ] + ], + "id": "3c0ab2a6" }, { "cell_type": "code", "execution_count": 7, - "id": "482b327e-3fb8-4ff2-8769-18324c4d1c7f", "metadata": {}, "outputs": [], "source": [ "!pip install --quiet pyautogen[retrievechat]" - ] + ], + "id": "9042b951" }, { "cell_type": "code", "execution_count": 8, - "id": "e4291e67", "metadata": {}, "outputs": [], "source": [ @@ -170,12 +169,12 @@ "from autogen.agentchat.contrib.retrieve_user_proxy_agent import RetrieveUserProxyAgent\n", "from autogen import config_list_from_json\n", "from autogen import AssistantAgent" - ] + ], + "id": "50418505" }, { "cell_type": "code", "execution_count": 9, - "id": "96420b79-707a-491e-a027-dae10c47a7cd", "metadata": {}, "outputs": [], "source": [ @@ -220,23 +219,23 @@ " )\n", "\n", " self._results = results" - ] + ], + "id": "e2e53553" }, { "cell_type": "code", "execution_count": 10, - "id": "fdeff3db-e4bf-4f01-b6e0-6aad1986badc", "metadata": {}, "outputs": [], "source": [ "import os\n", "os.environ[\"AUTOGEN_USE_DOCKER\"] = \"False\"" - ] + ], + "id": "8f7e260a" }, { "cell_type": "code", "execution_count": 11, - "id": "84831c05-b392-45a6-91c8-1b7888bfb166", "metadata": {}, "outputs": [], "source": [ @@ -387,43 +386,44 @@ " manager,\n", " message=PROBLEM,\n", " )" - ] + ], + "id": "e34f343a" }, { "cell_type": "code", "execution_count": 12, - "id": "0ded1446-bc77-404b-b205-f07c5de16b3e", "metadata": {}, "outputs": [], "source": [ "norag_chat()" - ] + ], + "id": "98387dfd" }, { "cell_type": "code", "execution_count": 13, - "id": "33114c12-b097-4466-8fcc-072d460fc1ed", "metadata": {}, "outputs": [], "source": [ "rag_chat()" - ] + ], + "id": "bec4ada5" }, { "cell_type": "code", "execution_count": 14, - "id": "e0f59b73", "metadata": {}, "outputs": [], "source": [ "shared_tier_check = %sql show variables like 'is_shared_tier'\n", "if not shared_tier_check or shared_tier_check[0][1] == 
'OFF':\n", " %sql DROP DATABASE IF EXISTS autogen" - ] + ], + "id": "1b379204" }, { + "id": "083eef58", "cell_type": "markdown", - "id": "ce70fba3-d739-4993-909d-83b38bba881f", "metadata": {}, "source": [ "
    \n", diff --git a/notebooks/cdc-mongodb-sql-commands/notebook.ipynb b/notebooks/cdc-mongodb-sql-commands/notebook.ipynb index 60f0ec68..28e367ac 100644 --- a/notebooks/cdc-mongodb-sql-commands/notebook.ipynb +++ b/notebooks/cdc-mongodb-sql-commands/notebook.ipynb @@ -1,8 +1,8 @@ { "cells": [ { + "id": "c6f5dc7b", "cell_type": "markdown", - "id": "a0efb393-2a46-4833-b5cf-8f048d9695b0", "metadata": {}, "source": [ "
    \n", @@ -19,30 +19,29 @@ { "attachments": {}, "cell_type": "markdown", - "id": "6d2bb122-3ae2-4eab-bbbd-3e3ba6907c4b", "metadata": {}, "source": [ "\n", " \n", " \n", "
    " - ] + ], + "id": "df1b5335" }, { "attachments": {}, "cell_type": "markdown", - "id": "1b895f1f-fd94-4cdf-ac4d-abfad66279a3", "metadata": {}, "source": [ "## When do you use SingleStore's native replication capability from MongoDB ?\n", "\n", "SingleStore's native data replication gives you the ability to do one-time snapshot, or continuous change data capture CDC from MongoDB\u00ae to SingleStoreDB. This provides a quick and easy way to replicate data and power up analytics on MongoDB\u00ae data." - ] + ], + "id": "6229e0c9" }, { "attachments": {}, "cell_type": "markdown", - "id": "9e8e1c02-f723-4e0c-88f3-0adb4dc8b0de", "metadata": {}, "source": [ "## What you will learn in this notebook:\n", @@ -51,35 +50,35 @@ "1. Directly without transformations\n", "2. Flattening required fields into columns of a table\n", "3. Normalizing collection into multiple tables" - ] + ], + "id": "54bed1b2" }, { "attachments": {}, "cell_type": "markdown", - "id": "4ceef3c0-c804-48c3-9ca0-dcb7f5abfe27", "metadata": {}, "source": [ "## 1. Replicate directly without transformations\n", "\n", "To replicate the required collections, provide the list of collections using `\"collection.include.list\": \"\"` at the time of link creation, the parameter takes a comma-separated list of regular expressions that match collection names (in databaseName.collectionName format)" - ] + ], + "id": "635f3fe9" }, { "cell_type": "code", "execution_count": 1, - "id": "49145680-6e52-4af7-b3f9-f39c5aebf2e7", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "DROP DATABASE IF EXISTS sample_analytics;\n", "CREATE DATABASE sample_analytics;" - ] + ], + "id": "71592e8c" }, { "attachments": {}, "cell_type": "markdown", - "id": "8636d519-fbfb-46fd-a650-d7c157ffb119", "metadata": {}, "source": [ "
    \n", @@ -89,12 +88,12 @@ "

    Make sure to select a database from the drop-down menu at the top of this notebook. It updates the connection_url to connect to that database.

    \n", "
    \n", "
    " - ] + ], + "id": "ba6cedf5" }, { "cell_type": "code", "execution_count": 2, - "id": "a88a0436-9b86-4c34-a4e5-196475049152", "metadata": {}, "outputs": [], "source": [ @@ -110,123 +109,123 @@ " \"mongodb.user\":\"mongo_sample_reader\",\n", " \"mongodb.password\":\"SingleStoreRocks27017\"\n", " }'" - ] + ], + "id": "be88b07b" }, { "attachments": {}, "cell_type": "markdown", - "id": "a0301e3b-c447-440b-9e01-dd771ed13408", "metadata": {}, "source": [ "Check if the link got created" - ] + ], + "id": "e1e1fbb1" }, { "cell_type": "code", "execution_count": 3, - "id": "9cecea80-2b40-49e9-8882-d97ad01e3b0a", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "SHOW LINKS;" - ] + ], + "id": "1b78164d" }, { "attachments": {}, "cell_type": "markdown", - "id": "d757f030-0c5d-4864-aa3d-c8cba84216ce", "metadata": {}, "source": [ "The following step automatically creates the required tables and pipelines on SingleStoreDB for every collection configured for replication" - ] + ], + "id": "0b393f99" }, { "cell_type": "code", "execution_count": 4, - "id": "356e9f3c-8473-45fa-8918-39c3dfe98403", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "CREATE TABLES AS INFER PIPELINE AS LOAD DATA LINK cdclink '*' FORMAT AVRO;" - ] + ], + "id": "0b815f2e" }, { "attachments": {}, "cell_type": "markdown", - "id": "6395285d-c68e-49e6-a794-b54f18b53bac", "metadata": {}, "source": [ "Start pipelines to begin replicating the data" - ] + ], + "id": "94b05d5b" }, { "cell_type": "code", "execution_count": 5, - "id": "76795c17-0235-4d21-92be-49c98bfbc5a9", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "START ALL PIPELINES;" - ] + ], + "id": "65192525" }, { "cell_type": "code", "execution_count": 6, - "id": "3647b206-7ee6-45d6-9a5c-f0dceabf51e3", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "SHOW TABLES;" - ] + ], + "id": "eaea4e3a" }, { "attachments": {}, "cell_type": "markdown", - "id": "9f4a5d14-d964-4345-bbea-78a7c8ffa20b", "metadata": {}, "source": [ "The customer 
collection from MongoDB are replicated into SingleStore in the default format of _id and _more BSON columns that are compatible with Kai API" - ] + ], + "id": "52635772" }, { "cell_type": "code", "execution_count": 7, - "id": "dd29a515-32c3-4079-bcf1-ba6227790082", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "SELECT (_id :> JSON),(_more :> JSON) FROM customers LIMIT 2;" - ] + ], + "id": "c0014b63" }, { "attachments": {}, "cell_type": "markdown", - "id": "65811b88-919b-4bef-8601-988e324f383c", "metadata": {}, "source": [ "## 2. Flattening required fields from document into columns\n", "CDC replication also gives additional flexibility to define your own table structure at SingleStore as you bring in data from MongoDB collections. In the following examples data from MongoDB collections are transformed when brought to SingleStoreDB" - ] + ], + "id": "e7184990" }, { "attachments": {}, "cell_type": "markdown", - "id": "7fb1effe-85cf-4900-8600-364367c531a8", "metadata": {}, "source": [ "Fields like `username`, `name`, `email` are flattened into columns of the table and rest of the document is stored in _more column.\n", "The following commands create a table, a stored procedure and a pipeline required for the data replication" - ] + ], + "id": "ddd6afb0" }, { "cell_type": "code", "execution_count": 8, - "id": "a82364bd-a35a-4b1f-b99d-98837f31a16f", "metadata": {}, "outputs": [], "source": [ @@ -242,12 +241,12 @@ " UNIQUE KEY `__PRIMARY` (`$_id`) USING HASH,\n", " SORT KEY `__UNORDERED` ()\n", ")" - ] + ], + "id": "7233596f" }, { "cell_type": "code", "execution_count": 9, - "id": "1d4b829a-d2ec-46ee-bdc7-b6bc7bb24176", "metadata": {}, "outputs": [], "source": [ @@ -261,12 +260,12 @@ "IF rowsDeleted > 0 THEN\n", "DELETE dest FROM `sample_analytics`.`customers_flattened` AS dest INNER JOIN changes ON dest.`$_id` = BSON_NORMALIZE_NO_ARRAY(changes.`_id`) WHERE changes.__operation = 1; END IF;\n", "END;" - ] + ], + "id": "2a60c28c" }, { "cell_type": "code", 
"execution_count": 10, - "id": "f8ea1ec3-e690-4417-af75-bcde15ada610", "metadata": {}, "outputs": [], "source": [ @@ -285,67 +284,67 @@ " _id <- `payload`::`_id`,\n", " _more <- `payload`::`_more`\n", ")" - ] + ], + "id": "f9c867ef" }, { "cell_type": "code", "execution_count": 11, - "id": "ff3a5285-624a-4d36-86e1-85c2ee92c125", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "START ALL PIPELINES;" - ] + ], + "id": "f78191dc" }, { "cell_type": "code", "execution_count": 12, - "id": "327673e2-dffb-46e0-92f7-b2c73ab137ae", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "SHOW TABLES;" - ] + ], + "id": "dc7c6aac" }, { "cell_type": "code", "execution_count": 13, - "id": "a5cf5747-55e3-4c13-8c28-34be44b34a4a", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "SELECT _id :> JSON,username, name, email, _more :> JSON FROM customers_flattened LIMIT 10;" - ] + ], + "id": "adf1b8d4" }, { "attachments": {}, "cell_type": "markdown", - "id": "53a66a38-ddbe-44a7-b768-848793353639", "metadata": {}, "source": [ "## 3. Normalize a collection into multiple tables\n", "In the following example a collection of MongoDB is normalized into two different tables on SingleStore." - ] + ], + "id": "652750b1" }, { "cell_type": "code", "execution_count": 14, - "id": "57f1594c-aec8-48a5-b0eb-c4574e711276", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "DROP DATABASE IF EXISTS sample_airbnb;\n", "CREATE DATABASE sample_airbnb;" - ] + ], + "id": "c5ee0fe8" }, { "attachments": {}, "cell_type": "markdown", - "id": "6e55bb14-3304-4ec0-8693-ba9168df43d4", "metadata": {}, "source": [ "
    \n", @@ -355,12 +354,12 @@ "

    Make sure to select a database from the drop-down menu at the top of this notebook. It updates the connection_url to connect to that database.

    \n", "
    \n", "
    " - ] + ], + "id": "dddaf078" }, { "cell_type": "code", "execution_count": 15, - "id": "48fa8228-4a39-4553-a535-981b4a42a0a1", "metadata": {}, "outputs": [], "source": [ @@ -376,23 +375,23 @@ " \"mongodb.user\":\"mongo_sample_reader\",\n", " \"mongodb.password\":\"SingleStoreRocks27017\"\n", " }'" - ] + ], + "id": "9be311ab" }, { "cell_type": "code", "execution_count": 16, - "id": "87580303-6afc-4855-966f-d6a9b8289c2b", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "SHOW LINKS;" - ] + ], + "id": "4b89bdce" }, { "cell_type": "code", "execution_count": 17, - "id": "16a917d3-0515-499c-a019-5ac889ca7a37", "metadata": {}, "outputs": [], "source": [ @@ -408,12 +407,12 @@ " UNIQUE KEY `__PRIMARY` (`$_id`) USING HASH,\n", " SORT KEY `__UNORDERED` ()\n", ")" - ] + ], + "id": "c4d7b98d" }, { "cell_type": "code", "execution_count": 18, - "id": "609e6fe7-194a-48eb-a55e-a9b49b6180f6", "metadata": {}, "outputs": [], "source": [ @@ -428,12 +427,12 @@ " UNIQUE KEY `__PRIMARY` (`$listingid`) USING HASH,\n", " SORT KEY `__UNORDERED` ()\n", ")" - ] + ], + "id": "1e091d27" }, { "cell_type": "code", "execution_count": 19, - "id": "1029b248-a926-4e04-8071-ca7353ed7f4d", "metadata": {}, "outputs": [], "source": [ @@ -458,12 +457,12 @@ "END IF;\n", "\n", "END;" - ] + ], + "id": "d857451a" }, { "cell_type": "code", "execution_count": 20, - "id": "e6bfa806-0f4f-4953-9580-6a6f345811e3", "metadata": {}, "outputs": [], "source": [ @@ -482,55 +481,56 @@ " _id <- `payload`::`_id`,\n", " _more <- `payload`::`_more`\n", ")" - ] + ], + "id": "c0a21511" }, { "cell_type": "code", "execution_count": 21, - "id": "255a27a9-96d6-4cbb-bed8-7c48612c82da", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "START ALL PIPELINES;" - ] + ], + "id": "4d74028d" }, { "cell_type": "code", "execution_count": 22, - "id": "6e23b185-1997-4749-be4c-66612672cb42", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "SHOW TABLES;" - ] + ], + "id": "60efa033" }, { "cell_type": "code", "execution_count": 
23, - "id": "64cb77e2-f82b-4147-b632-13f3b944ce8f", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "SELECT _id:>JSON ,name, access, accommodates FROM listings LIMIT 10;" - ] + ], + "id": "a0c36179" }, { "cell_type": "code", "execution_count": 24, - "id": "19c61f92-2294-4190-9b98-a4c96b03bdaf", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "SELECT listingid:>JSON, review_scores_accuracy,review_scores_cleanliness, review_scores_rating FROM reviews LIMIT 10;" - ] + ], + "id": "7bc334b1" }, { + "id": "30f1da03", "cell_type": "markdown", - "id": "86aa2824-88c8-4d4a-a459-f2defc32d937", "metadata": {}, "source": [ "
    \n", diff --git a/notebooks/cloud-functions-template/notebook.ipynb b/notebooks/cloud-functions-template/notebook.ipynb index 4cbc7123..8263099c 100644 --- a/notebooks/cloud-functions-template/notebook.ipynb +++ b/notebooks/cloud-functions-template/notebook.ipynb @@ -1,8 +1,8 @@ { "cells": [ { + "id": "33b073ec", "cell_type": "markdown", - "id": "8ba141c2-e0c2-4723-b782-c924bc7b294c", "metadata": {}, "source": [ "
    \n", @@ -17,8 +17,8 @@ ] }, { + "id": "bd0ae268", "cell_type": "markdown", - "id": "e0f88a6f-2658-40b4-9356-935a09f5053e", "metadata": {}, "source": [ "
    \n", @@ -33,27 +33,26 @@ { "attachments": {}, "cell_type": "markdown", - "id": "a5564913-7ff8-41bf-b64b-b67971c63fae", "metadata": {}, "source": [ "This Jupyter notebook will help you build your first Cloud Function, showcasing how to leverage the ultra-fast queries of SingleStore to build a responsive API server using FastAPI" - ] + ], + "id": "bcb6e6a7" }, { "attachments": {}, "cell_type": "markdown", - "id": "1e394195-29b4-403c-9abf-5d7731349eb6", "metadata": {}, "source": [ "## Create some simple tables\n", "\n", "This setup establishes a basic relational structure to store some items information." - ] + ], + "id": "5776ded1" }, { "cell_type": "code", "execution_count": 1, - "id": "a17bdd3a-16b3-4e19-8a56-6566a169eccb", "metadata": {}, "outputs": [], "source": [ @@ -66,23 +65,23 @@ " name VARCHAR(255),\n", " price FLOAT\n", ");" - ] + ], + "id": "2bbf6a44" }, { "attachments": {}, "cell_type": "markdown", - "id": "af6e2618-de97-4397-b0d2-23e4a4df1d83", "metadata": {}, "source": [ "## Create a Connection Pool\n", "\n", "To run multiple simultaneous queries, we use sqlalchemy to create a pool of sql connections to the workspace you have selected. We also define a method to execute queries and transactions using a connection from this pool." 
- ] + ], + "id": "8a099c3f" }, { "cell_type": "code", "execution_count": 2, - "id": "f485e71b-2b05-4696-b22a-cf046fd83090", "metadata": {}, "outputs": [], "source": [ @@ -118,23 +117,23 @@ " except Exception as e:\n", " transaction.rollback()\n", " raise e" - ] + ], + "id": "225f1ba8" }, { "attachments": {}, "cell_type": "markdown", - "id": "ee9058a9-34a5-46fc-8b12-d30cbb8c3340", "metadata": {}, "source": [ "## Setup Environment\n", "\n", "Lets setup the environment ro run a FastAPI app defining the Data Model and an executor to run the different requests in different threads simultaneously" - ] + ], + "id": "dd5f2dea" }, { "cell_type": "code", "execution_count": 3, - "id": "66df8f0c-70c6-4f06-9e64-ef06961cca3a", "metadata": {}, "outputs": [], "source": [ @@ -155,23 +154,23 @@ "def run_in_thread(fn, *args):\n", " loop = asyncio.get_event_loop()\n", " return loop.run_in_executor(executor, fn, *args)" - ] + ], + "id": "f7bd1ec2" }, { "attachments": {}, "cell_type": "markdown", - "id": "96760949-5ab2-474d-80ca-d23b5dcc52f7", "metadata": {}, "source": [ "## Define FastAPI App\n", "\n", "Next, we will be defining a FastAPI app that can insert, query and delete data from your table" - ] + ], + "id": "d58c8382" }, { "cell_type": "code", "execution_count": 4, - "id": "3087dbe6-57ce-4410-a42f-5b0fe90add90", "metadata": {}, "outputs": [], "source": [ @@ -230,44 +229,45 @@ " return await run_in_thread(delete_item_query)\n", " except Exception as e:\n", " raise HTTPException(status_code=500, detail=f\"Error deleting item with id {item_id}: {str(e)}\")" - ] + ], + "id": "f3f3b047" }, { "attachments": {}, "cell_type": "markdown", - "id": "c3d9ed07-4b55-4d17-aabb-e11b399109d1", "metadata": {}, "source": [ "## Start the FastAPI server\n", "\n", "The link at which the cloud function will be available interactively will be displayed." 
- ] + ], + "id": "40e2ad59" }, { "cell_type": "code", "execution_count": 5, - "id": "ff002c7d-9f1c-40e5-b82a-c9176251dc99", "metadata": {}, "outputs": [], "source": [ "import singlestoredb.apps as apps\n", "connection_info = await apps.run_function_app(app)" - ] + ], + "id": "ed4b22cd" }, { "attachments": {}, "cell_type": "markdown", - "id": "fabe76b7-e6a0-43a0-8d9e-aa79bd7d3021", "metadata": {}, "source": [ "## Publish Cloud Function\n", "\n", "After validating the Cloud Function interactively, you can publish it and use it as an API server for your data!" - ] + ], + "id": "4a825f0d" }, { + "id": "b6c75678", "cell_type": "markdown", - "id": "55513fb9-f288-4cf1-b371-a71439bb1a31", "metadata": {}, "source": [ "
    \n", diff --git a/notebooks/confluent-cloud-integration/notebook.ipynb b/notebooks/confluent-cloud-integration/notebook.ipynb index 2e99947f..cf8fea30 100644 --- a/notebooks/confluent-cloud-integration/notebook.ipynb +++ b/notebooks/confluent-cloud-integration/notebook.ipynb @@ -1,8 +1,8 @@ { "cells": [ { + "id": "d3ffe3a2", "cell_type": "markdown", - "id": "3e5ed872-859a-4584-a702-2dd3c44eb814", "metadata": {}, "source": [ "
    \n", @@ -21,7 +21,8 @@ "metadata": {}, "source": [ "" - ] + ], + "id": "0a6f9edd" }, { "cell_type": "markdown", @@ -34,28 +35,32 @@ "Once the cluster is created, perform the following tasks:\n", "\n", "- Create a topic, for example 's2-topic'. On the topic overview page, select Schema > Set a schema > Avro, and add a new Avro schema. In this guide, the default schema is used." - ] + ], + "id": "5fd6e4ba" }, { "cell_type": "markdown", "metadata": {}, "source": [ "" - ] + ], + "id": "d56193e9" }, { "cell_type": "markdown", "metadata": {}, "source": [ "- Create API keys. The API key is displayed only once. Be sure to copy and securely store the API key." - ] + ], + "id": "ede3b8d7" }, { "cell_type": "markdown", "metadata": {}, "source": [ "" - ] + ], + "id": "9ed4b165" }, { "cell_type": "markdown", @@ -63,7 +68,8 @@ "source": [ "- On the left navigation pane, select Connectors and create a sample producer named 'datagen' using the Datagen Source connector. In the Topic selection pane, select the 's2-topic' created earlier. In the Kafka credentials pane, select the Use an existing API key option. Configure the producer to use the same schema as the one in the created topic. Refer to Step 3: Create a sample producer for more information.\n", "- Launch the 'datagen' producer and verify that the 's2-topic' has new messages." 
- ] + ], + "id": "84749414" }, { "cell_type": "markdown", @@ -84,7 +90,8 @@ "Select Create Schema Registry API key to create a schema API key and configure the following variables:\n", "- CONFLUENT_SCHEMA_REGISTRY_KEY\n", "- CONFLUENT_SCHEMA_REGISTRY_SECRET" - ] + ], + "id": "79e9060a" }, { "cell_type": "code", @@ -104,14 +111,16 @@ "CONFLUENT_SCHEMA_REGISTRY_URL='https://psrc-9zg5y.europe-west3.gcp.confluent.cloud'\n", "CONFLUENT_SCHEMA_REGISTRY_KEY = '7ALNJUEMWMBIMAQL'\n", "CONFLUENT_SCHEMA_REGISTRY_SECRET = '***************************************'" - ] + ], + "id": "4fea4101" }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Create a Database" - ] + ], + "id": "75b59d19" }, { "cell_type": "code", @@ -123,21 +132,24 @@ "\n", "DROP DATABASE IF EXISTS {{S2_DATABASE_NAME}};\n", "CREATE DATABASE {{S2_DATABASE_NAME}};" - ] + ], + "id": "ea3e011e" }, { "cell_type": "markdown", "metadata": {}, "source": [ "

    Action Required

    Be sure to select the {{S2_DATABASE_NAME}} database from the drop-down list at the top of this notebook. It updates the connection_url which is used by the %%sql magic command and SQLAlchemy to connect to the selected database.

    " - ] + ], + "id": "cf9309a6" }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Create a Table Based on the Kafka Avro Schema" - ] + ], + "id": "623b63ee" }, { "cell_type": "code", @@ -154,21 +166,24 @@ "`field2` double,\n", "`field3` text\n", ");" - ] + ], + "id": "5b8cccce" }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Create a Kafka Pipeline" - ] + ], + "id": "f1dabcd9" }, { "cell_type": "markdown", "metadata": {}, "source": [ "

    Notes

    • All Kafka configurations in the pipeline, such as 'client.id', are supported since version 8.1.35.

    • The schema registry mapping section should be updated according to your schema registry in the 'table column name' <- 'schema registry field name' format.

    " - ] + ], + "id": "0aed93cd" }, { "cell_type": "code", @@ -193,14 +208,16 @@ "field2 <- my_field2,\n", "field3 <- my_field3\n", ");" - ] + ], + "id": "58c471f0" }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Test the Created Pipeline" - ] + ], + "id": "1ea816f8" }, { "cell_type": "code", @@ -210,14 +227,16 @@ "source": [ "%%sql\n", "TEST PIPELINE {{S2_DATABASE_NAME}}.{{S2_PIPELINE_NAME}} LIMIT 1;" - ] + ], + "id": "95565993" }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Start the Pipeline" - ] + ], + "id": "ca3bb824" }, { "cell_type": "code", @@ -228,14 +247,16 @@ "%%sql\n", "\n", "START PIPELINE {{S2_DATABASE_NAME}}.{{S2_PIPELINE_NAME}};" - ] + ], + "id": "31f8bc3f" }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Stop the Pipeline" - ] + ], + "id": "d17352a4" }, { "cell_type": "code", @@ -246,14 +267,16 @@ "%%sql\n", "\n", "STOP PIPELINE {{S2_DATABASE_NAME}}.{{S2_PIPELINE_NAME}};" - ] + ], + "id": "93d4b5c0" }, { "cell_type": "markdown", "metadata": {}, "source": [ "### View Consumed Events" - ] + ], + "id": "d6e9fda0" }, { "cell_type": "code", @@ -264,11 +287,12 @@ "%%sql\n", "\n", "SELECT * FROM {{S2_DATABASE_NAME}}.{{S2_TABLE_NAME}};" - ] + ], + "id": "d4d40067" }, { + "id": "d6a5e9d7", "cell_type": "markdown", - "id": "ccd26740-0002-48c2-b410-733d06171621", "metadata": {}, "source": [ "
    \n", diff --git a/notebooks/create-dash-app/notebook.ipynb b/notebooks/create-dash-app/notebook.ipynb index 8aed0b8a..93ef6fcf 100644 --- a/notebooks/create-dash-app/notebook.ipynb +++ b/notebooks/create-dash-app/notebook.ipynb @@ -1,8 +1,8 @@ { "cells": [ { + "id": "53ac2791", "cell_type": "markdown", - "id": "24350541-570b-491c-be33-b32b46764cf0", "metadata": {}, "source": [ "
    \n", @@ -17,8 +17,8 @@ ] }, { + "id": "6e39e1ba", "cell_type": "markdown", - "id": "c5893325-17c9-495f-862f-049072c30806", "metadata": {}, "source": [ "
    \n", @@ -33,27 +33,26 @@ { "attachments": {}, "cell_type": "markdown", - "id": "df860ca4-6db8-4ded-a061-30be438c4add", "metadata": {}, "source": [ "This Jupyter notebook will help you build your first real time Dashboard, showcasing how to leverage the ultra-fast queries of SingleStore to build a great visual experience using Plotly's DashApps." - ] + ], + "id": "6f712b45" }, { "attachments": {}, "cell_type": "markdown", - "id": "e0fd0d9c-fd75-453a-aac3-bf797949dcce", "metadata": {}, "source": [ "## Create some simple tables\n", "\n", "This setup establishes a basic relational structure to store some orders information." - ] + ], + "id": "14e40e4b" }, { "cell_type": "code", "execution_count": 1, - "id": "d218d020-b9dc-4419-961d-2232ca0893f8", "metadata": {}, "outputs": [], "source": [ @@ -66,23 +65,23 @@ " amount DECIMAL(10, 2),\n", " name VARCHAR(50)\n", ");" - ] + ], + "id": "0e09ebfd" }, { "attachments": {}, "cell_type": "markdown", - "id": "c6e492c6-74c8-488f-a456-fae59af0c69d", "metadata": {}, "source": [ "## Insert some data\n", "\n", "Lets now insert some time series data into the table." - ] + ], + "id": "56787869" }, { "cell_type": "code", "execution_count": 2, - "id": "98e60d97-42ce-4600-8e35-556c70f9d4c2", "metadata": {}, "outputs": [], "source": [ @@ -109,23 +108,23 @@ "(19, '2024-01-04', 25.00, \"Speaker\"),\n", "(20, '2024-01-04', 50.00, \"Monitor\"),\n", "(21, '2024-01-04', 70.00, \"Monitor\");" - ] + ], + "id": "0d301918" }, { "attachments": {}, "cell_type": "markdown", - "id": "beb57814-ad38-4065-a730-59576f6a72e3", "metadata": {}, "source": [ "## Create a Connection Pool\n", "\n", "Next, we use sqlalchemy to create a pool of sql connections to the workspace you have selected. We also define a method to execute queries using a connection from this pool." 
- ] + ], + "id": "a1bf5d71" }, { "cell_type": "code", "execution_count": 3, - "id": "f030ce86-4940-4014-8227-6b8c9cb56246", "metadata": {}, "outputs": [], "source": [ @@ -150,12 +149,12 @@ "def execute_query(query: str):\n", " with engine.connect() as connection:\n", " return pd.read_sql_query(query, connection)" - ] + ], + "id": "b2124893" }, { "attachments": {}, "cell_type": "markdown", - "id": "dd87d196-3d52-4f3a-8dd4-d5f3540b051f", "metadata": {}, "source": [ "## Create a line chart\n", @@ -163,12 +162,12 @@ "You can create a line chart using plotly, to depict either of the following\n", "- Number of items sold\n", "- Total sales volume" - ] + ], + "id": "39d62939" }, { "cell_type": "code", "execution_count": 4, - "id": "712cd20d-6f2d-4c5a-9094-11b611ce622d", "metadata": {}, "outputs": [], "source": [ @@ -195,23 +194,23 @@ "\n", "line_chart = generate_line_chart(\"Count\")\n", "line_chart.show()" - ] + ], + "id": "1093acff" }, { "attachments": {}, "cell_type": "markdown", - "id": "cc363aa0-a8d5-4f7e-bdae-5a22d56e0bcf", "metadata": {}, "source": [ "## Create a pie chart\n", "\n", "You can create a pie chart to see the contribution of each type of item to the daily sales volume" - ] + ], + "id": "a84e01f8" }, { "cell_type": "code", "execution_count": 5, - "id": "79aa80ef-4a49-4238-87fb-f90a16ba4e42", "metadata": {}, "outputs": [], "source": [ @@ -227,24 +226,24 @@ "\n", "pie_chart = generate_pie_chart(\"2024-01-01\")\n", "pie_chart.show()" - ] + ], + "id": "61a5401b" }, { "attachments": {}, "cell_type": "markdown", - "id": "94586a2e-76b2-48f8-8dbd-ff7038443ae1", "metadata": {}, "source": [ "## Define the Dash App Layout and Callbacks\n", "\n", "We can now define the [layout](https://dash.plotly.com/layout) and [callbacks](https://dash.plotly.com/basic-callbacks) of the Dash app.\n", "The Layout defines the UI elements of your Dashboard and the callbacks define the interactions between the UI elements and the sqlalchemy query engine we defined earlier" - ] + ], + 
"id": "d358911f" }, { "cell_type": "code", "execution_count": 6, - "id": "de733262-834b-48b6-b885-78dfc5ebb452", "metadata": {}, "outputs": [], "source": [ @@ -315,43 +314,44 @@ " Input('interval-component', 'n_intervals'))\n", "def update_date_dropdown(n_intervals):\n", " return get_order_dates()" - ] + ], + "id": "030cc3fb" }, { "attachments": {}, "cell_type": "markdown", - "id": "f287e202-704b-4eb5-8290-fb08ba9a493c", "metadata": {}, "source": [ "## Start the Dash App server\n", "\n", "The link at which the Dash App will be available interactively will be displayed. You can also insert more data into the table and view the changes to the dashboard in real time." - ] + ], + "id": "df15a3bc" }, { "cell_type": "code", "execution_count": 7, - "id": "69632c1b-f981-4338-9f91-ca8ae746cd73", "metadata": {}, "outputs": [], "source": [ "connectionInfo = await apps.run_dashboard_app(app)" - ] + ], + "id": "b6dee5bd" }, { "attachments": {}, "cell_type": "markdown", - "id": "4fe4abd0-d52f-475a-89a4-d518f2b37d0d", "metadata": {}, "source": [ "## Publish Dashboard\n", "\n", "After validating the Dashboard interactively, you can publish it and view the changes to your data in real time!" - ] + ], + "id": "4a609059" }, { + "id": "87439e74", "cell_type": "markdown", - "id": "8eb7fab3-c714-4b3b-93a7-ce8a9836ded2", "metadata": {}, "source": [ "
    \n", diff --git a/notebooks/evaluating-llms-with-uptrain/notebook.ipynb b/notebooks/evaluating-llms-with-uptrain/notebook.ipynb index 0671e96b..39bee240 100644 --- a/notebooks/evaluating-llms-with-uptrain/notebook.ipynb +++ b/notebooks/evaluating-llms-with-uptrain/notebook.ipynb @@ -1,8 +1,8 @@ { "cells": [ { + "id": "55261e39", "cell_type": "markdown", - "id": "391591bd-e6b0-4b53-84d1-5e579bf77e2a", "metadata": {}, "source": [ "
    \n", @@ -41,7 +41,8 @@ "- A SingleStoreDB workspace.\n", "\n", "Let's dive in and start building!" - ] + ], + "id": "95e02520" }, { "cell_type": "markdown", @@ -50,14 +51,16 @@ "### Create a workspace in your workspace group\n", "\n", "S-00 is sufficient." - ] + ], + "id": "546ff4ab" }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Create a Database named evaluate_llm" - ] + ], + "id": "050108f9" }, { "cell_type": "code", @@ -69,14 +72,16 @@ "\n", "DROP DATABASE IF EXISTS evaluate_llm;\n", "CREATE DATABASE evaluate_llm;" - ] + ], + "id": "9941998f" }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Setting up the environment**: Before we begin, it's essential to ensure all the necessary packages are installed. Run the cell below to install the required libraries for our project. This will install uptrain, openai, langchain, and singlestoredb." - ] + ], + "id": "389618c5" }, { "cell_type": "code", @@ -85,14 +90,16 @@ "outputs": [], "source": [ "%pip install uptrain==0.7.1 openai==1.6.1 langchain==0.1.4 tiktoken==0.5.2 --quiet" - ] + ], + "id": "b9ca8614" }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Authentication**: The next step involves setting the required environment variables - mainly the openai key (for generating responses), singlestoredb (for context retrieval), and uptrain api key (for evaluating responses). You can create an account with UpTrain and generate the api key for free. 
Please visit https://uptrain.ai/" - ] + ], + "id": "5bc39175" }, { "cell_type": "code", @@ -108,7 +115,8 @@ "import openai\n", "\n", "client = openai.OpenAI()" - ] + ], + "id": "6798b800" }, { "cell_type": "code", @@ -117,14 +125,16 @@ "outputs": [], "source": [ "UPTRAIN_API_KEY = getpass.getpass('Uptrain API Key: ')" - ] + ], + "id": "922a5ea7" }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Importing Necessary Modules**: With the initial setup complete, let's import the essential classes and modules we'll use throughout this project. The following cell imports the required classes from langchain and SingleStoreDB." - ] + ], + "id": "b2278df6" }, { "cell_type": "code", @@ -136,14 +146,16 @@ "from uptrain import APIClient, Evals\n", "from langchain.vectorstores import SingleStoreDB\n", "from langchain.embeddings import OpenAIEmbeddings" - ] + ], + "id": "f734e5a5" }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Loading Data from the Web**: Our application requires data to process and generate insights. In this step, we'll fetch content from a URL using the WebBaseLoader class. The loaded data will be stored in the data variable. You can replace the URL with any other source if needed." - ] + ], + "id": "8f3a8051" }, { "cell_type": "code", @@ -155,14 +167,16 @@ "\n", "loader = WebBaseLoader('https://cloud.google.com/vertex-ai/docs/generative-ai/learn/generative-ai-studio')\n", "data = loader.load()" - ] + ], + "id": "09a11611" }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Splitting the Data**: To process the data more efficiently, we'll split the loaded content into smaller chunks. The RecursiveCharacterTextSplitter class helps in achieving this by dividing the data based on specified character limits." 
- ] + ], + "id": "0832a7d5" }, { "cell_type": "code", @@ -174,14 +188,16 @@ "\n", "text_splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=0)\n", "all_splits = text_splitter.split_documents(data)" - ] + ], + "id": "9924d694" }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Setting Up SingleStoreDB with OpenAI Embeddings**: For efficient storage and retrieval of our data, we use SingleStoreDB in conjunction with OpenAI embeddings. The following cell sets up the necessary environment variables and initializes the SingleStoreDB instance with OpenAI embeddings. Ensure you have the correct SingleStoreDB URL and credentials set." - ] + ], + "id": "e8797c67" }, { "cell_type": "code", @@ -199,14 +215,16 @@ "vectorstore = SingleStoreDB.from_documents(documents=all_splits,\n", " embedding=OpenAIEmbeddings(),\n", " table_name='vertex_ai_docs_chunk_size_200')" - ] + ], + "id": "f9306487" }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Setting Up the QA Prompt**: Once our data is processed and stored, we can use it to answer queries. The following cell defines a `generate_llm_response` which finds the document closest to the given question via vector similarity search and uses OpenAI's GPT-3.5-Turbo to generate the response." - ] + ], + "id": "0987b0f1" }, { "cell_type": "code", @@ -232,21 +250,24 @@ " ).choices[0].message.content\n", "\n", " return [{'question': question, 'context': context, 'response': response}]" - ] + ], + "id": "362d7801" }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], - "source": [] + "source": [], + "id": "2a2d6179" }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Let's try it out**: Let's try asking our QnA bot about Vertex AI." 
- ] + ], + "id": "0f089e50" }, { "cell_type": "code", @@ -255,14 +276,16 @@ "outputs": [], "source": [ "generate_llm_response('What is Vertex AI?', vectorstore)" - ] + ], + "id": "b54b0a61" }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Let's define more questions**: We now define a set of questions to test our bot upon and evaluate the quality of responses." - ] + ], + "id": "71e2d42e" }, { "cell_type": "code", @@ -289,14 +312,16 @@ "results = []\n", "for question in questions:\n", " results.extend(generate_llm_response(question, vectorstore))" - ] + ], + "id": "95b3b340" }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Running Evaluations using UpTrain**: We now define a set of questions to test our bot upon and evaluate the quality of responses. UpTrain provides an APIClient that can be initialized with `UPTRAIN_API_KEY`. It provides a `log_and_evaluate` method which takes the input data to be evaluated along with the list of checks to be run. It returns the scores along with explanations." - ] + ], + "id": "b90cd092" }, { "cell_type": "code", @@ -313,7 +338,8 @@ " data=results,\n", " checks=[Evals.CONTEXT_RELEVANCE, Evals.FACTUAL_ACCURACY]\n", ");" - ] + ], + "id": "d14c78d0" }, { "attachments": { @@ -327,21 +353,24 @@ "**Access UpTrain Dashboards**: We can access the evaluation results at https://demo.uptrain.ai/dashboard/ - the same API key can be used to access the dashboards.\n", "\n", "" - ] + ], + "id": "3d29edea" }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Running Experiments using UpTrain**: Let's also see how UpTrain can be used to conduct data-driven experimentation. We will increase the chunk_size from 200 to 1000 and see how that impacts the context retrieval quality." 
- ] + ], + "id": "117dcc41" }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Generate new embeddings**: We will again use SingleStoreDB to store new document embeddings" - ] + ], + "id": "2f0a47a6" }, { "cell_type": "code", @@ -354,14 +383,16 @@ "vectorstore_new = SingleStoreDB.from_documents(documents=all_splits,\n", " embedding=OpenAIEmbeddings(),\n", " table_name='vertex_ai_docs_chunk_size_1000')" - ] + ], + "id": "4ba8654d" }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Generate responses with new vectorstore**: Let's generate new responses for the same set of questions." - ] + ], + "id": "9d307a78" }, { "cell_type": "code", @@ -372,14 +403,16 @@ "results_larger_chunk = []\n", "for question in questions:\n", " results_larger_chunk.extend(generate_llm_response(question, vectorstore_new))" - ] + ], + "id": "8a6a4085" }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Append chunk size information**: Let's add the corresponding chunk size information for both sets of results. We will pass this column name to UpTrain to compare the two experiments" - ] + ], + "id": "70a6b0bb" }, { "cell_type": "code", @@ -392,14 +425,16 @@ "\n", "for x in results_larger_chunk:\n", " x.update({'chunk_size': 1000})" - ] + ], + "id": "9dc6313c" }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Evaluating Experiments using UpTrain**: UpTrain's APIClient also provides a \"evaluate_experiments\" method which takes the input data to be evaluated along with the list of checks to be run and the name of the columns associated with the experiment." 
- ] + ], + "id": "0e08fb98" }, { "cell_type": "code", @@ -413,7 +448,8 @@ " checks=[Evals.CONTEXT_RELEVANCE],\n", " exp_columns=['chunk_size']\n", ");" - ] + ], + "id": "15241a0d" }, { "attachments": { @@ -425,11 +461,12 @@ "metadata": {}, "source": [ "" - ] + ], + "id": "9bb1175a" }, { + "id": "ee02b325", "cell_type": "markdown", - "id": "62f5d31e-637c-4e97-8eee-34c529d21e8a", "metadata": {}, "source": [ "
    \n", @@ -457,5 +494,5 @@ } }, "nbformat": 4, - "nbformat_minor": 4 + "nbformat_minor": 5 } diff --git a/notebooks/getting-started-with-dataframes/notebook.ipynb b/notebooks/getting-started-with-dataframes/notebook.ipynb index e6e7de49..953fcd4c 100644 --- a/notebooks/getting-started-with-dataframes/notebook.ipynb +++ b/notebooks/getting-started-with-dataframes/notebook.ipynb @@ -1,8 +1,8 @@ { "cells": [ { + "id": "9b385acd", "cell_type": "markdown", - "id": "caa4ce39-2f84-48b7-92b5-dccf6bede32b", "metadata": {}, "source": [ "
    \n", @@ -17,8 +17,8 @@ ] }, { + "id": "6fafac22", "cell_type": "markdown", - "id": "dc4afbf8", "metadata": {}, "source": [ "
    \n", @@ -32,7 +32,6 @@ }, { "cell_type": "markdown", - "id": "8d01d69e-f7d4-4efc-815f-148e0c099b90", "metadata": {}, "source": [ "SingleStoreDB supports a backend for the [Ibis](https://ibis-project.org) to\n", @@ -53,44 +52,44 @@ "\n", "The above will install the SingleStoreDB Python client, the SingleStoreDB SQLAlchemy dialect,\n", "and the SingleStoreDB Ibis backend." - ] + ], + "id": "3ea0fdf6" }, { "cell_type": "code", "execution_count": 1, - "id": "e388a2f8-1c79-4163-9a87-aaea862e5199", "metadata": {}, "outputs": [], "source": [ "import ibis\n", "import pandas as pd" - ] + ], + "id": "5fc0397a" }, { "cell_type": "markdown", - "id": "49612202-48ef-4d8a-8e7f-f5728c5142aa", "metadata": {}, "source": [ "We'll set up a few options to make the notebook nicer to use, including the display width of DataFrames and enabling the interactive features\n", "of the Ibis package. The primary purpose of the `interactive` option is so that you don't have to manually run the `execute` method on expressions\n", "to see the results. In our notebook, the `execute` method will be called implicitly each time an Ibis expression is rendered by the notebook." - ] + ], + "id": "b2c3a3b1" }, { "cell_type": "code", "execution_count": 2, - "id": "39cfa389-721a-4044-8f12-38bf06701bb1", "metadata": {}, "outputs": [], "source": [ "pd.options.display.max_colwidth = 120\n", "ibis.options.verbose = False\n", "ibis.options.interactive = True" - ] + ], + "id": "3dc5336b" }, { "cell_type": "markdown", - "id": "1614bef1", "metadata": {}, "source": [ "
    \n", @@ -100,21 +99,21 @@ "

    If you have a Free Starter Workspace deployed already, select the database from the drop-down menu at the top of this notebook. It updates the connection_url to connect to that database.

    \n", "
    \n", "
    " - ] + ], + "id": "58cad317" }, { "cell_type": "markdown", - "id": "a3cc67a9-310f-42e2-9d9a-bbe160a588f3", "metadata": {}, "source": [ "## Create a database\n", "If you have a Standard or Premium workspace deployment, you can create a new database to run this notebook. Running the cell below to create the new database in your existing workspace. Note: this will NOT create a new database in your Free Starter Workspace." - ] + ], + "id": "d829370b" }, { "cell_type": "code", "execution_count": 3, - "id": "944f4396-0fb6-4cc9-8a5c-fc7b9d450481", "metadata": {}, "outputs": [], "source": [ @@ -122,11 +121,11 @@ "if not shared_tier_check or shared_tier_check[0][1] == 'OFF':\n", " %sql DROP DATABASE IF EXISTS ibis_getting_started;\n", " %sql CREATE DATABASE ibis_getting_started;" - ] + ], + "id": "87d224f6" }, { "cell_type": "markdown", - "id": "e6742659-7213-4e00-8da6-2f10cf9c7d22", "metadata": {}, "source": [ "
    \n", @@ -136,11 +135,11 @@ "

    Make sure to select a database from the drop-down menu at the top of this notebook. It updates the connection_url to connect to that database.

    \n", "
    \n", "
    " - ] + ], + "id": "e4421df5" }, { "cell_type": "markdown", - "id": "d185e3b6-b8b1-4fcd-be2f-c5b638f4ec9b", "metadata": {}, "source": [ "## Make a connection using Ibis\n", @@ -148,21 +147,21 @@ "Connections to Ibis are made through the `ibis.singlestoredb` method. The connection parameters can be given using either the suggested Python DB-API\n", "parameters (i.e., host=, user=, port=, etc.) or as a SQLAlchemy URL (e.g., user:password@host:port/database). In this case, we are getting the username and password from the environment variable `SINGLESTOREDB_URL` which is supplied by the\n", "SingleStore notebook environment." - ] + ], + "id": "af9134d0" }, { "cell_type": "code", "execution_count": 4, - "id": "18fef268-343d-47d9-bbf8-9693cea73a16", "metadata": {}, "outputs": [], "source": [ "conn = ibis.singlestoredb.connect()" - ] + ], + "id": "987c0ae5" }, { "cell_type": "markdown", - "id": "baa19853-0d9b-45b2-908a-a1b92de08f97", "metadata": {}, "source": [ "## Phone review data example\n", @@ -170,150 +169,150 @@ "Our first example will use customer review data for phones. It includes information such as a reviewer name, the text of the review, a short summary\n", "of the review, a numeric rating from 1 to 5, and the date of the review. We'll first read the data into a local pandas DataFrame to do some\n", "cleanup, then upload it to the database to demonstrate the SingleStoreDB package capabilities" - ] + ], + "id": "fad43ca2" }, { "cell_type": "markdown", - "id": "d10f036f-6519-4b52-b671-aa4e806927cf", "metadata": {}, "source": [ "### Read sample data into pandas DataFrame\n", "\n", "We will read the data for this example directly from a Github repository using a URL in the `pd.read_csv` function.\n", "The result will be a pandas `DataFrame`. We will upload the data to the server in a later step." 
- ] + ], + "id": "0c7611ec" }, { "cell_type": "code", "execution_count": 5, - "id": "bce0e67d-f9e3-4c29-a21a-0ad6e84e05c8", "metadata": {}, "outputs": [], "source": [ "phones_df = pd.read_csv('https://raw.githubusercontent.com/singlestore-labs/ibis-singlestoredb/main/examples/phones.csv')\n", "phones_df.head(3)" - ] + ], + "id": "dff813f8" }, { "cell_type": "markdown", - "id": "4c6a0e46-0710-44dc-a826-27d8442273b0", "metadata": {}, "source": [ "We can see the column types using the `info` method. As you can see, there are two date columns, neither of which is in a convenient\n", "format for use as a date. We'll first create an actual datetime column from those and drop the original columns." - ] + ], + "id": "649d0833" }, { "cell_type": "code", "execution_count": 6, - "id": "e8195553-9c4b-4322-a55a-7b2f7eb331a9", "metadata": {}, "outputs": [], "source": [ "phones_df.info()" - ] + ], + "id": "b3648d5d" }, { "cell_type": "markdown", - "id": "59ae51ff-7fef-4ccd-a84e-a4c5ba8952c6", "metadata": {}, "source": [ "### Upload the data to the server\n", "\n", "Now that we have some data to work with, we can upload it to the server using the `create_table` method. This method allows you to upload\n", "data from a local `DataFrame` or materialize a table from an Ibis table expression." - ] + ], + "id": "e6c78524" }, { "cell_type": "code", "execution_count": 7, - "id": "e549d41d-6b42-4f3a-a0a1-2cdbbc60178e", "metadata": {}, "outputs": [], "source": [ "conn.create_table?" - ] + ], + "id": "7c5acaff" }, { "cell_type": "markdown", - "id": "e3e9d2ec-a30c-4280-a05b-88aa5b34a7bb", "metadata": {}, "source": [ "The code below will create the table in the server from the given `DataFrame`. The data types for the database table will be inferred from\n", "the `DataFrame` data types. The `overwrite=True` is merely used here in case you run the notebook code multiple times against the same\n", "database. It will cause an existing `phones` table to be overwritten if it already exists." 
- ] + ], + "id": "6aeed052" }, { "cell_type": "code", "execution_count": 8, - "id": "987d9e59-09e3-46f5-9487-da21b9975eba", "metadata": {}, "outputs": [], "source": [ "phones_tbl = conn.create_table('phones', phones_df, overwrite=True)\n", "phones_tbl.head(3)" - ] + ], + "id": "e8b8be08" }, { "cell_type": "markdown", - "id": "2e95fef6-a6f3-4f9d-9338-efc0f8807c48", "metadata": {}, "source": [ "We can get information about the table in the server by using the `info` method, just as you would with a local pandas `DataFrame`." - ] + ], + "id": "525c58af" }, { "cell_type": "code", "execution_count": 9, - "id": "c3770704-41b0-4a99-b639-5e8242e952ec", "metadata": {}, "outputs": [], "source": [ "phones_tbl.info()" - ] + ], + "id": "b00b87b2" }, { "cell_type": "markdown", - "id": "76309280-afb7-46e6-91c9-f84fdc4ed7a7", "metadata": {}, "source": [ "Here is the `CREATE TABLE` statement resulting from the data upload step. The `show` attribute of the connection gives access to all of\n", "the `SHOW` commands in the database such as `functions`, `create_function`, `status`, `databases`, and so on." - ] + ], + "id": "cd7b0e0c" }, { "cell_type": "code", "execution_count": 10, - "id": "d63e3978-33ab-4154-a31a-367bf52aa3a9", "metadata": {}, "outputs": [], "source": [ "ct = conn.show.create_table('phones')\n", "ct" - ] + ], + "id": "d512fd95" }, { "cell_type": "markdown", - "id": "d36dfb66-d7d5-4343-850d-8060da9a0b7c", "metadata": {}, "source": [ "To print this a nicer way, we'll just display the `CreateTable` column of the first result row." 
- ] + ], + "id": "2bb21d51" }, { "cell_type": "code", "execution_count": 11, - "id": "8e5bd287-9ab3-4236-827b-16951b5b2039", "metadata": {}, "outputs": [], "source": [ "print(ct[0]['CreateTable'])" - ] + ], + "id": "b38bc988" }, { "cell_type": "markdown", - "id": "887e730f-e7da-4078-a9d9-1fbbe4435a47", "metadata": {}, "source": [ "### Convert review date / time columns\n", @@ -321,129 +320,129 @@ "This data set has a less than optimal way of storing the dates of the reviews. It has two columns that contain the same information and the review time, and neither\n", "of them is convenient to work with. Let's create a new column that contains the date in a timestamp column. To do that, we simply cast the\n", "existing `unixReviewTime` column to a timestamp type. We'll also use the `name` method to assign a name to the resulting column expression." - ] + ], + "id": "2d930373" }, { "cell_type": "code", "execution_count": 12, - "id": "3a37d8f0-815a-4b5e-9cfe-11a31de6e0a3", "metadata": {}, "outputs": [], "source": [ "date = phones_tbl.unixReviewTime.cast('timestamp').name('date')\n", "date" - ] + ], + "id": "a36d71e1" }, { "cell_type": "markdown", - "id": "7718eb94-0e45-4545-a5e3-f90ee4816775", "metadata": {}, "source": [ "The `overall` rating column was uploaded as floating point data, but it only contains integers. Let's cast that to an integer." - ] + ], + "id": "9fb2a0ce" }, { "cell_type": "code", "execution_count": 13, - "id": "1443148f-dc1b-4c04-b5b2-c182edbd601d", "metadata": {}, "outputs": [], "source": [ "overall = phones_tbl.overall.cast('int').name('overall')\n", "overall" - ] + ], + "id": "be53735f" }, { "cell_type": "markdown", - "id": "51a81952-0045-4a7e-ad5d-e8d3fa3e2b4c", "metadata": {}, "source": [ "Now that we have our two processed columns, we can remove the old columns we don't need anymore and insert our\n", "new columns expressions with the cleaned data." 
- ] + ], + "id": "0f0b1996" }, { "cell_type": "code", "execution_count": 14, - "id": "53386ec8-05d7-40c9-83c6-037d815b9367", "metadata": {}, "outputs": [], "source": [ "columns = [x for x in phones_tbl.columns if 'Time' not in x and x != 'overall'] + [overall, date]" - ] + ], + "id": "52fd0c29" }, { "cell_type": "code", "execution_count": 15, - "id": "3a27de1c-5f1b-4af1-af9a-2423fe9fc7d1", "metadata": {}, "outputs": [], "source": [ "phones_tbl = phones_tbl[columns]\n", "phones_tbl" - ] + ], + "id": "b4d4118b" }, { "cell_type": "markdown", - "id": "4847b100-1322-47dc-a216-a3a82371fde3", "metadata": {}, "source": [ "The `phones_tbl` object now contains both references to actual columns in the `phones` table as well as computed expressions.\n", "It can be thought of as a client-side view object, but still works the same way as our original table. We can use the `info`\n", "method to show the schema information." - ] + ], + "id": "f35fc27e" }, { "cell_type": "code", "execution_count": 16, - "id": "f6da4b8a-9bf4-46a7-b4f9-4348829b6ab2", "metadata": {}, "outputs": [], "source": [ "phones_tbl.info()" - ] + ], + "id": "4b7afa41" }, { "cell_type": "markdown", - "id": "160dfb54-511c-47af-b127-d7fd48f23975", "metadata": {}, "source": [ "It's also possible to access the schema information for traversing programmatically using the `schema` method of the table." - ] + ], + "id": "a2e72875" }, { "cell_type": "code", "execution_count": 17, - "id": "d8a40ddb-6f7c-4ec0-bba9-7d91414e3a88", "metadata": {}, "outputs": [], "source": [ "phones_tbl.schema()" - ] + ], + "id": "c2e4095b" }, { "cell_type": "markdown", - "id": "9719ecb7-9d8d-4639-a797-fc0297533791", "metadata": {}, "source": [ "Here are the public methods / attributes of the `Schema` object." 
- ] + ], + "id": "5f6f19b5" }, { "cell_type": "code", "execution_count": 18, - "id": "0b724536-5282-4a00-8a77-c07a753cf240", "metadata": {}, "outputs": [], "source": [ "for name in dir(phones_tbl.schema()):\n", " if not name.startswith('_'):\n", " print(name)" - ] + ], + "id": "389c9b13" }, { "cell_type": "markdown", - "id": "1595b069-95b6-4c4c-b8fb-6590ae7b45a8", "metadata": {}, "source": [ "### Create a table from a table expression\n", @@ -451,78 +450,78 @@ "As mentioned earlier, the `create_table` method can be used to create tables in the database from Ibis table\n", "expressions. That means that we can materialize our new table into the database if we so desire. This simply\n", "selects data from an expression into a new table, no work is being done on the client side." - ] + ], + "id": "6fa3f354" }, { "cell_type": "code", "execution_count": 19, - "id": "f427c60c-2fdd-4874-a219-38b27759f86d", "metadata": {}, "outputs": [], "source": [ "tbl_2 = conn.create_table('phones2', phones_tbl, overwrite=True)\n", "tbl_2" - ] + ], + "id": "db2e7d54" }, { "cell_type": "code", "execution_count": 20, - "id": "60a0dfb0-a8c0-4047-8658-858f79e90a02", "metadata": {}, "outputs": [], "source": [ "conn.show.tables()" - ] + ], + "id": "531b6443" }, { "cell_type": "code", "execution_count": 21, - "id": "03fe4942-f3e7-4d2c-96ad-721bed730c8f", "metadata": {}, "outputs": [], "source": [ "tbl_2.info()" - ] + ], + "id": "ebe7534f" }, { "cell_type": "code", "execution_count": 22, - "id": "e91d0901-6031-4ffe-b485-d167bdfa15c1", "metadata": {}, "outputs": [], "source": [ "conn.show.create_table('phones2')" - ] + ], + "id": "6de6f891" }, { "cell_type": "markdown", - "id": "b4425759-a20e-41dd-84c8-8ae89ab65f15", "metadata": {}, "source": [ "Tables can also be dropped using the `drop_table` method." 
- ] + ], + "id": "2f5fbeb8" }, { "cell_type": "code", "execution_count": 23, - "id": "cbeac008-f266-44ab-aef4-78142cf45ef6", "metadata": {}, "outputs": [], "source": [ "conn.drop_table('phones2')" - ] + ], + "id": "cf322021" }, { "cell_type": "markdown", - "id": "c7befbbf-c24c-41ed-a685-ad7c5e8da16b", "metadata": {}, "source": [ "### DataFrame operations" - ] + ], + "id": "0548af81" }, { "cell_type": "markdown", - "id": "b79ba74b-b713-4981-b657-6a128b61c8e1", "metadata": {}, "source": [ "Now that we have a table in the database and a `DataFrame`-like object connected to it, let's do some operations with the data. For this example,\n", @@ -530,12 +529,12 @@ "example, we are building a sentence from various columns in the data table. We are using the reviewer name (which is capitalized), the summary,\n", "and the overall rating (which is cast to a string). All of these are concatenated together using the `+` as you normally would when\n", "concatenating strings in Python." - ] + ], + "id": "884017da" }, { "cell_type": "code", "execution_count": 24, - "id": "7d5d4e1d-ac82-42ee-926c-e7afb73db3f8", "metadata": {}, "outputs": [], "source": [ @@ -549,96 +548,96 @@ " phones_tbl.overall.cast('string') + \\\n", " ' stars.').name('Review Summary')\n", "texpr" - ] + ], + "id": "a3ecabb9" }, { "cell_type": "markdown", - "id": "f546bca3-5d61-48df-b178-c9fca4356e5f", "metadata": {}, "source": [ "As you can see from the SQL output above, the sentence is constructed using `concat` in the query. Capitalization is done using `ucase` and `substring`.\n", "The `:param_#` portions are used by Ibis to inject literal Python values from the expression. And, you'll see that the result table contains a column\n", "of string values that are the result of the expression." - ] + ], + "id": "83eba3be" }, { "cell_type": "markdown", - "id": "6475b645-7620-401d-8da4-99594a67895f", "metadata": {}, "source": [ "#### String operations\n", "\n", "There are many string operations available in Ibis. 
Many of them correspond to operations on pandas DataFrames and / or Python string methods.\n", "The code below prints a list of all methods on string columns." - ] + ], + "id": "2bddebd6" }, { "cell_type": "code", "execution_count": 25, - "id": "649d054f-ad0b-43e3-91c3-25ff4915f25a", "metadata": {}, "outputs": [], "source": [ "for x in dir(texpr):\n", " if not x.startswith('_'):\n", " print(x)" - ] + ], + "id": "debf5ef0" }, { "cell_type": "markdown", - "id": "87837e24-c8ad-4b88-ad69-9527b56677f9", "metadata": {}, "source": [ "#### Operations on other column types\n", "\n", "Other data types have different methods that can be called. For example, this is the list of methods on date/time columns." - ] + ], + "id": "ef514099" }, { "cell_type": "code", "execution_count": 26, - "id": "a5d65bdd-bcd8-43a8-b351-0d2492320ec7", "metadata": {}, "outputs": [], "source": [ "for x in dir(phones_tbl.date):\n", " if not x.startswith('_'):\n", " print(x)" - ] + ], + "id": "9907aaf4" }, { "cell_type": "markdown", - "id": "6d292dc6-72cc-4abf-8cb0-b0434f15c5e4", "metadata": {}, "source": [ "Here are the methods for numeric columns." 
- ] + ], + "id": "a9953c32" }, { "cell_type": "code", "execution_count": 27, - "id": "9d540c05-f7e0-4bfa-9508-2f25176f5ec7", "metadata": {}, "outputs": [], "source": [ "for x in dir(phones_tbl.overall):\n", " if not x.startswith('_'):\n", " print(x)" - ] + ], + "id": "1b5ff980" }, { "cell_type": "code", "execution_count": 28, - "id": "f5624059-dc62-4405-bb22-34269fb2c44e", "metadata": {}, "outputs": [], "source": [ "ibis.options.verbose = False" - ] + ], + "id": "ec9d3104" }, { "cell_type": "markdown", - "id": "8a24fd59-159b-4ef6-bd55-35ec71ac4089", "metadata": {}, "source": [ "### Filter reviews containing \"iphone 4\"\n", @@ -646,60 +645,60 @@ "Now that we've seen how to construct expressions that can be used in the database, let's use that expression for use in further queries.\n", "The expression below takes the result of our previous expression, lower-cases it, then checks for the occurrence of the substring\n", "\"iphone 4\". This will return a column of booleans which can be used to filter our original table, just like with a pandas DataFrame." - ] + ], + "id": "443328f1" }, { "cell_type": "code", "execution_count": 29, - "id": "6251e20b-f0e6-4368-aff2-3191b3cf5611", "metadata": {}, "outputs": [], "source": [ "has_iphone_4 = texpr.lower().contains('iphone 4').name('Has iPhone 4')\n", "has_iphone_4" - ] + ], + "id": "749719b2" }, { "cell_type": "markdown", - "id": "6f2fe329-fbcd-401d-9acb-472fd1b49ce6", "metadata": {}, "source": [ "First, let's use the filter expression to see how many of our generated sentences contain \"iphone 4\". We do this by using our previous\n", "expression as an indexing expression to our original table object (e.g., `tbl[filter-expr]`). This will return only the rows where\n", "the filter expression was `True`. We can then count the number of rows using the `count` method." 
- ] + ], + "id": "2eef6d94" }, { "cell_type": "code", "execution_count": 30, - "id": "ba0b72a1-bcd1-4021-8312-894493d1d8e6", "metadata": {}, "outputs": [], "source": [ "phones_tbl[has_iphone_4].count()" - ] + ], + "id": "cedfd906" }, { "cell_type": "markdown", - "id": "80e7cf08-a3fa-421d-8c2b-aac700b1ca86", "metadata": {}, "source": [ "Now we can print selected columns from the filtered table. We can also sort them according to the `overall` column as shown below." - ] + ], + "id": "cb30666b" }, { "cell_type": "code", "execution_count": 31, - "id": "2a3d6d92-4093-4837-b4da-2fb59ea313ed", "metadata": {}, "outputs": [], "source": [ "phones_tbl[has_iphone_4]['reviewerName', 'helpful', 'overall'].order_by(ibis.desc('overall'))" - ] + ], + "id": "50abc49c" }, { "cell_type": "markdown", - "id": "cabaa7b9-3374-43ab-8515-dd21213a68a3", "metadata": {}, "source": [ "## Conclusion\n", @@ -708,11 +707,11 @@ "to SQL and executed on the server, used those expressions in filtering operations, and selected columns from and sorted the results of\n", "the filtering operation. This covers a small number of the abilities of the table and column objects created by Ibis. For additional methods\n", "on the various types, see the [Ibis documentation](https://ibis-project.org)." - ] + ], + "id": "d0944d81" }, { "cell_type": "markdown", - "id": "e811c596", "metadata": {}, "source": [ "
    \n", @@ -722,23 +721,24 @@ "

    If you created a new database in your Standard or Premium Workspace, you can drop the database by running the cell below. Note: this will not drop your database for Free Starter Workspaces. To drop a Free Starter Workspace, terminate the Workspace using the UI.

    \n", "
    \n", "
    " - ] + ], + "id": "a775fe56" }, { "cell_type": "code", "execution_count": 32, - "id": "d5bc91c6-f65b-4479-9475-df60e0ab2088", "metadata": {}, "outputs": [], "source": [ "shared_tier_check = %sql show variables like 'is_shared_tier'\n", "if not shared_tier_check or shared_tier_check[0][1] == 'OFF':\n", " %sql DROP DATABASE IF EXISTS ibis_getting_started;" - ] + ], + "id": "c9fa396a" }, { + "id": "b0a782b4", "cell_type": "markdown", - "id": "b40a7c86-2b36-4dad-a92f-d88f44410ec6", "metadata": {}, "source": [ "
    \n", diff --git a/notebooks/getting-started-with-fusion-sql/notebook.ipynb b/notebooks/getting-started-with-fusion-sql/notebook.ipynb index 7fec5f4d..da2b7ea7 100644 --- a/notebooks/getting-started-with-fusion-sql/notebook.ipynb +++ b/notebooks/getting-started-with-fusion-sql/notebook.ipynb @@ -1,8 +1,8 @@ { "cells": [ { + "id": "c9e5c3b2", "cell_type": "markdown", - "id": "6fe27fdb-6d1d-49ad-a868-6d434f0840ff", "metadata": {}, "source": [ "
    \n", @@ -19,96 +19,95 @@ { "attachments": {}, "cell_type": "markdown", - "id": "fd164b0c-acb1-413c-8fc9-9e611fb8c5e1", "metadata": {}, "source": [ "In this notebook, we introduce Fusion SQL. Fusion SQL are SQL statements that\n", "can be used to manage workspace groups, workspaces, files in workspace stages,\n", "and other resources that could previously only be managed in the portal user\n", "interface or the Management REST API." - ] + ], + "id": "038ad772" }, { "attachments": {}, "cell_type": "markdown", - "id": "749d2c42-9296-44ff-9060-dd1fd415ce10", "metadata": {}, "source": [ "## Displaying available Fusion SQL commands\n", "\n", "We can use the `SHOW FUSION COMMANDS` statement to get all of the available commands." - ] + ], + "id": "8cfa493d" }, { "cell_type": "code", "execution_count": 1, - "id": "50a68230-7a0d-42d8-8805-e0e19cc462ba", "metadata": {}, "outputs": [], "source": [ "!pip uninstall -y singlestoredb\n", "!pip install singlestoredb" - ] + ], + "id": "b0c1465d" }, { "cell_type": "code", "execution_count": 2, - "id": "0979fcb9-5d5f-4bc4-964d-66a4b121d707", "metadata": {}, "outputs": [], "source": [ "commands = %sql SHOW FUSION COMMANDS\n", "for cmd in commands:\n", " print(*cmd, '\\n')" - ] + ], + "id": "0b7d54b8" }, { "attachments": {}, "cell_type": "markdown", - "id": "bbfe4266-fc6d-4f5a-aacd-94cd761ed401", "metadata": {}, "source": [ "The `SHOW FUSION COMMANDS` also has a `LIKE` option that can be used to filter the displayed commands." 
- ] + ], + "id": "4adb346b" }, { "cell_type": "code", "execution_count": 3, - "id": "88244a88-9722-49c1-9f19-5f22e0b8e300", "metadata": {}, "outputs": [], "source": [ "commands = %sql SHOW FUSION COMMANDS LIKE '%stage%'\n", "for cmd in commands:\n", " print(*cmd, '\\n')" - ] + ], + "id": "31a35316" }, { "attachments": {}, "cell_type": "markdown", - "id": "f8c94b3e-1521-4957-8f37-f0d004f55609", "metadata": {}, "source": [ "Let's try a workflow that goes through the entire process of creating a workspace group, workspace,\n", "and Stage files." - ] + ], + "id": "c9f87f44" }, { "attachments": {}, "cell_type": "markdown", - "id": "e6b0762f-5626-453c-9b7b-a6e56ace7e86", "metadata": {}, "source": [ "## Working with workspace groups\n", "\n", "In this example, we will create a new workspace group, add workspaces, and demonstrate how to suspend and resume a workspace.\n", "We will then terminate the workspaces and workspace groups all from SQL!" - ] + ], + "id": "6605910c" }, { "attachments": {}, "cell_type": "markdown", - "id": "dd48fec9-8a83-466c-b784-c1e63beea818", "metadata": {}, "source": [ "Looking above at our list of printed commands, we see that the `CREATE WORKSPACE GROUP`\n", @@ -127,32 +126,32 @@ "```\n", "SHOW REGIONS [ LIKE '' ] [ ORDER BY '' [ ASC | DESC ],... ] [ LIMIT ];\n", "```" - ] + ], + "id": "1a5d9d0f" }, { "cell_type": "code", "execution_count": 4, - "id": "375069f6-4553-4b55-824a-201efbb34ea2", "metadata": {}, "outputs": [], "source": [ "us_regions = %sql SHOW REGIONS LIKE '%US%'\n", "us_regions" - ] + ], + "id": "4605bd6f" }, { "attachments": {}, "cell_type": "markdown", - "id": "38cff483-5260-4601-b0c8-76521bfa1446", "metadata": {}, "source": [ "Let's use the random package to choose a US region for us." 
- ] + ], + "id": "402b62c5" }, { "cell_type": "code", "execution_count": 5, - "id": "490ab9d1-8539-40c3-ac21-733d18e01758", "metadata": {}, "outputs": [], "source": [ @@ -160,22 +159,22 @@ "\n", "region_id = random.choice(us_regions).ID\n", "region_id" - ] + ], + "id": "989d31f4" }, { "attachments": {}, "cell_type": "markdown", - "id": "9aa3dbb5-0e0f-4971-867c-618862def699", "metadata": {}, "source": [ "Now that we have a region ID, we can create our workspace. We'll open the firewall so it\n", "can be accessed from anywhere and set a password." - ] + ], + "id": "2cdab621" }, { "cell_type": "code", "execution_count": 6, - "id": "10d84a7c-cd26-4c36-8275-ebb66a30dcdc", "metadata": {}, "outputs": [], "source": [ @@ -183,45 +182,45 @@ "\n", "wsg_name = 'Fusion Notebook'\n", "password = secrets.token_urlsafe(20) + '-x&'" - ] + ], + "id": "0c579720" }, { "cell_type": "code", "execution_count": 7, - "id": "08d40823-6700-4c45-be8f-e5ecf5502284", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "CREATE WORKSPACE GROUP '{{ wsg_name }}' IN REGION ID '{{ region_id }}'\n", " WITH FIREWALL RANGES '0.0.0.0/0' WITH PASSWORD '{{ password }}'" - ] + ], + "id": "41d5c2d3" }, { "attachments": {}, "cell_type": "markdown", - "id": "e6a5ee99-5935-45a4-b602-9b4b7263d6b7", "metadata": {}, "source": [ "If you are in the SingleStore Cloud portal, you should see the workspace group displayed in a few seconds.\n", "You can also use the `SHOW WORKSPACE GROUPS` command to list them." 
- ] + ], + "id": "83d226c5" }, { "cell_type": "code", "execution_count": 8, - "id": "b2a79c30-e9cc-4262-8db8-562abe32e021", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "SHOW WORKSPACE GROUPS LIKE 'Fusion%'" - ] + ], + "id": "7624bcdc" }, { "attachments": {}, "cell_type": "markdown", - "id": "a84445b2-d49a-4a48-8951-1a40e6f98465", "metadata": {}, "source": [ "### Creating workspaces\n", @@ -232,36 +231,36 @@ "CREATE WORKSPACE [ IF NOT EXISTS ] '' [ IN GROUP { ID '' | '' } ]\n", " WITH SIZE '' [ WAIT ON ACTIVE ];\n", "```" - ] + ], + "id": "35a7a9e4" }, { "cell_type": "code", "execution_count": 9, - "id": "9534e73f-d9ca-43c8-bc83-022c9d9d6d4b", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "CREATE WORKSPACE 'workspace-1' IN GROUP '{{ wsg_name }}' WITH SIZE 'S-00';\n", "CREATE WORKSPACE 'workspace-2' IN GROUP '{{ wsg_name }}' WITH SIZE 'S-1';" - ] + ], + "id": "409c2ac6" }, { "attachments": {}, "cell_type": "markdown", - "id": "326307f8-1e76-43b7-aeb2-0d354f14e6cf", "metadata": {}, "source": [ "The workspaces will take some time to become available. We can write a small wait loop to\n", "block until they are both ready. You could use the `WAIT ON ACTIVE` option for `CREATE WORKSPACE`,\n", "but that would cause them to run sequentially. We are using an external loop so that the\n", "two commands above can run in parallel." - ] + ], + "id": "2379ad3f" }, { "cell_type": "code", "execution_count": 10, - "id": "ac2cff84-17f4-4f30-a10c-b1cb2921e1f2", "metadata": {}, "outputs": [], "source": [ @@ -288,32 +287,32 @@ "\n", "# Wait for all workspaces to be active\n", "wait_on_attr(f'SHOW WORKSPACES IN GROUP \"{ wsg_name }\"', State='ACTIVE')" - ] + ], + "id": "5bf4bd4d" }, { "attachments": {}, "cell_type": "markdown", - "id": "cacf2fbe-8230-4bb6-a7a8-136dfe2f3d6c", "metadata": {}, "source": [ "We can now display the information about the workspaces using the `SHOW WORKSPACES` command." 
- ] + ], + "id": "f61203be" }, { "cell_type": "code", "execution_count": 11, - "id": "f07f5d66-769f-41ed-9740-ee64d784d608", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "SHOW WORKSPACES IN GROUP '{{ wsg_name }}' ORDER BY Name EXTENDED" - ] + ], + "id": "c6da588b" }, { "attachments": {}, "cell_type": "markdown", - "id": "eaf14e95-0e07-421f-b051-14c35f01efef", "metadata": {}, "source": [ "### Suspending and resuming workspaces\n", @@ -325,94 +324,94 @@ "\n", "SUSPEND WORKSPACE { ID '' | '' } [ IN GROUP { ID '' | '' } ] [ WAIT ON SUSPENDED ];\n", "```" - ] + ], + "id": "7eabac7b" }, { "cell_type": "code", "execution_count": 12, - "id": "28f0c0dc-3e4e-45d5-8bc8-a9a5a090b91d", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "SUSPEND WORKSPACE 'workspace-1' IN GROUP '{{ wsg_name }}'" - ] + ], + "id": "e5e01e3f" }, { "attachments": {}, "cell_type": "markdown", - "id": "37a04a12-1460-4390-824b-5beffbe0e407", "metadata": {}, "source": [ "The workspace should have a state of 'SUSPENDED' shortly after running the above command." - ] + ], + "id": "9c55a846" }, { "cell_type": "code", "execution_count": 13, - "id": "76898a1d-8e04-411a-816d-702662e940c8", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "SHOW WORKSPACES IN GROUP '{{ wsg_name}}'" - ] + ], + "id": "92da04c6" }, { "attachments": {}, "cell_type": "markdown", - "id": "c0f6b076-ae63-41fd-9c98-e4d9a76146c8", "metadata": {}, "source": [ "To resume the workspace, you use the `RESUME WORKSPACE` command." - ] + ], + "id": "887fc459" }, { "cell_type": "code", "execution_count": 14, - "id": "0d018344-c4cc-4591-a764-f25678390a94", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "RESUME WORKSPACE 'workspace-1' IN GROUP '{{ wsg_name }}' WAIT ON RESUMED" - ] + ], + "id": "bf33363a" }, { "cell_type": "markdown", - "id": "79d8b786-fa0a-4608-af18-2404479d97cd", "metadata": {}, "source": [ "Display the information about the workspaces again." 
- ] + ], + "id": "c4a1e2ac" }, { "cell_type": "code", "execution_count": 15, - "id": "71738296-e6a1-4a55-82ef-4a8a053451a2", "metadata": {}, "outputs": [], "source": [ "workspaces = %sql SHOW WORKSPACES IN GROUP '{{ wsg_name}}' EXTENDED\n", "workspaces" - ] + ], + "id": "67d2c61f" }, { "attachments": {}, "cell_type": "markdown", - "id": "f286faac-d964-4ece-aad1-e0692a071f4b", "metadata": {}, "source": [ "### Accessing the database endpoint of a workspace\n", "\n", "As you saw above, we have access to the database endpoint in the workspace information.\n", "We can use that to create a connection to that workspace for database operations." - ] + ], + "id": "290165aa" }, { "cell_type": "code", "execution_count": 16, - "id": "33ccf8e5-7599-440d-adb9-d1e29dfecf94", "metadata": {}, "outputs": [], "source": [ @@ -423,12 +422,12 @@ " cur.execute('show databases')\n", " for row in cur:\n", " print(*row)" - ] + ], + "id": "eb52e506" }, { "attachments": {}, "cell_type": "markdown", - "id": "69bf9daf-7043-4712-8b14-1f2f383582ad", "metadata": {}, "source": [ "### Terminating workspaces and workspace groups\n", @@ -439,96 +438,96 @@ "\n", "DROP WORKSPACE GROUP [ IF EXISTS ] { ID '' | '' } [ WAIT ON TERMINATED ] [ FORCE ];\n", "```" - ] + ], + "id": "66622efe" }, { "attachments": {}, "cell_type": "markdown", - "id": "d7f5532c-dd17-48fb-ae9e-1f2d3a8a896d", "metadata": {}, "source": [ "Let's drop `workspace-2` and leave `workspace-1` in place." - ] + ], + "id": "637694a7" }, { "cell_type": "code", "execution_count": 17, - "id": "160e8673-d9f4-4ce0-b9b7-ecd4f3d0d75b", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "DROP WORKSPACE 'workspace-2' IN GROUP '{{ wsg_name }}'" - ] + ], + "id": "fcd1e469" }, { "attachments": {}, "cell_type": "markdown", - "id": "5eddf490-9894-4e45-98cd-a7bf3e299b87", "metadata": {}, "source": [ "The above operation may take a few seconds. Once it has completed, the following output will\n", "show just one workspace remaining." 
- ] + ], + "id": "19a1db5f" }, { "cell_type": "code", "execution_count": 18, - "id": "e1ac3334-ecd6-45a2-a218-5ea2a82ebc2d", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "SHOW WORKSPACES IN GROUP '{{ wsg_name }}'" - ] + ], + "id": "f60b8887" }, { "attachments": {}, "cell_type": "markdown", - "id": "8872aa21-1422-4583-9387-dc1002b59ab0", "metadata": {}, "source": [ "It is possible to terminate a workspace group even if it has workspaces in it\n", "by using the `FORCE` option. Let's remove our workspace group with `workspace-1`\n", "still in it." - ] + ], + "id": "b4f79288" }, { "cell_type": "code", "execution_count": 19, - "id": "8a98a983-0cc4-4f74-96c9-ce27442b4e57", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "DROP WORKSPACE GROUP '{{ wsg_name }}' FORCE" - ] + ], + "id": "2c710871" }, { "cell_type": "code", "execution_count": 20, - "id": "a1f2f996-87f5-45e3-9907-f5980a654d59", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "SHOW WORKSPACE GROUPS LIKE 'Fusion%'" - ] + ], + "id": "41c2a6df" }, { "attachments": {}, "cell_type": "markdown", - "id": "1668f1e9-ac1a-4d78-b940-33109926e4f5", "metadata": {}, "source": [ "We can attempt to list the workspaces in the group again, but this time\n", "you will get a KeyError saying that the workspace group is not found." - ] + ], + "id": "05aa3068" }, { "cell_type": "code", "execution_count": 21, - "id": "f0d142de-a89f-4a28-95c0-eea72a4556a9", "metadata": {}, "outputs": [], "source": [ @@ -536,12 +535,12 @@ " %sql SHOW WORKSPACES IN GROUP '{{ wsg_name }}'\n", "except KeyError:\n", " print('no workspace group was found')" - ] + ], + "id": "13b54a15" }, { "attachments": {}, "cell_type": "markdown", - "id": "90d600a2-23e9-42c0-91ba-7bee6e7097be", "metadata": {}, "source": [ "## Conclusion\n", @@ -550,11 +549,12 @@ "and workspaces. We also demonstrated how to suspend and resume workspaces. Fusion SQL\n", "can also manage your Stage files. 
That topic is covered in another example notebook,\n", "and more Fusion SQL commands will be added as features are added to SingleStoreDB Cloud." - ] + ], + "id": "a45b4c7a" }, { + "id": "b2d49509", "cell_type": "markdown", - "id": "3877747f-c091-4bf8-9c79-fcc26e570ef6", "metadata": {}, "source": [ "
    \n", diff --git a/notebooks/getting-started-with-mongocdc/notebook.ipynb b/notebooks/getting-started-with-mongocdc/notebook.ipynb index 192f823a..b0b133b8 100644 --- a/notebooks/getting-started-with-mongocdc/notebook.ipynb +++ b/notebooks/getting-started-with-mongocdc/notebook.ipynb @@ -1,8 +1,8 @@ { "cells": [ { + "id": "4551d3af", "cell_type": "markdown", - "id": "587c8a4e-fa6f-49ff-b3c6-a91d9f3aabf2", "metadata": {}, "source": [ "
    \n", @@ -19,18 +19,17 @@ { "attachments": {}, "cell_type": "markdown", - "id": "6d2bb122-3ae2-4eab-bbbd-3e3ba6907c4b", "metadata": {}, "source": [ "\n", " \n", "
    " - ] + ], + "id": "85a9ff0b" }, { "attachments": {}, "cell_type": "markdown", - "id": "9e8e1c02-f723-4e0c-88f3-0adb4dc8b0de", "metadata": {}, "source": [ "SingleStore's native data replication gives you the ability to do one-time snapshot, and continuous change data capture CDC from MongoDB\u00ae to SingleStoreDB. This provides a quick and easy way to replicate data and power up analytics on MongoDB\u00ae data.\n", @@ -38,21 +37,21 @@ "## What you will learn in this notebook:\n", "\n", "Setup replication of a collection to SingleStore and see the live updates on MongoDB\u00ae collection replicate to SingleStore." - ] + ], + "id": "550b8c4c" }, { "attachments": {}, "cell_type": "markdown", - "id": "10234772-6625-4a4a-99af-f39e4e566c6d", "metadata": {}, "source": [ "## Install libraries and import modules" - ] + ], + "id": "43a0f7a3" }, { "cell_type": "code", "execution_count": 1, - "id": "fd26645f-c6b8-4853-baee-6bd77c6c1083", "metadata": {}, "outputs": [], "source": [ @@ -60,52 +59,52 @@ "\n", "import pymongo\n", "import random" - ] + ], + "id": "8f479cc8" }, { "attachments": {}, "cell_type": "markdown", - "id": "4ceef3c0-c804-48c3-9ca0-dcb7f5abfe27", "metadata": {}, "source": [ "## Replicate a collection to Singlestore" - ] + ], + "id": "302514a9" }, { "cell_type": "code", "execution_count": 2, - "id": "d2eec3c8-33fb-4d09-b1c3-8c5ce7af1bfe", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "DROP DATABASE IF EXISTS cdcdemo;\n", "CREATE DATABASE cdcdemo;" - ] + ], + "id": "31776266" }, { "cell_type": "code", "execution_count": 3, - "id": "f0647bbb-dc4d-4852-89a9-f6b482808e88", "metadata": {}, "outputs": [], "source": [ "source_mongo_url = \"mongodb+srv://mongo_sample_reader:SingleStoreRocks27017@cluster1.tfutgo0.mongodb.net/?retryWrites=true&w=majority\"" - ] + ], + "id": "d656a903" }, { "attachments": {}, "cell_type": "markdown", - "id": "909341b4-bc34-4d8c-ba29-379fc9f905f3", "metadata": {}, "source": [ "Create a link to Source MongoDB" - ] + ], + "id": 
"e7e6ff8c" }, { "cell_type": "code", "execution_count": 4, - "id": "0790ee0d-f9e9-4952-b1ae-54f872e42aed", "metadata": {}, "outputs": [], "source": [ @@ -116,62 +115,62 @@ "\n", "if res[\"ok\"] != 1:\n", " raise Exception(\"Failed to create link: %s\" % \"local\")" - ] + ], + "id": "3465305d" }, { "attachments": {}, "cell_type": "markdown", - "id": "267e4a33-9400-4cf6-aa5d-c065d6100ffa", "metadata": {}, "source": [ "Specify the source database and collection and start replication" - ] + ], + "id": "6fff9a89" }, { "cell_type": "code", "execution_count": 5, - "id": "356e9f3c-8473-45fa-8918-39c3dfe98403", "metadata": {}, "outputs": [], "source": [ "create_col_args = {\"from\": {\"link\": \"mongolink\", \"database\": \"cdcdemo\", \"collection\": \"scores\"}}\n", "res = s2db.create_collection(\"scores\", **create_col_args)" - ] + ], + "id": "289bc8f5" }, { "attachments": {}, "cell_type": "markdown", - "id": "dfff07f9-a853-49ae-9a3d-90de78aebdf6", "metadata": {}, "source": [ "The following command waits till the entire collection from MongoDB is synced to SingleStore" - ] + ], + "id": "70f6a5ec" }, { "cell_type": "code", "execution_count": 6, - "id": "21e59e54-ae89-430c-a46b-6a8673648bcd", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "USE cdcdemo;\n", "SYNC PIPELINE scores;" - ] + ], + "id": "60450cdc" }, { "attachments": {}, "cell_type": "markdown", - "id": "dac2ce13-bfb2-443c-a28a-885a6370210a", "metadata": {}, "source": [ "Printing some documents that are replicated" - ] + ], + "id": "478f2e96" }, { "cell_type": "code", "execution_count": 7, - "id": "3647b206-7ee6-45d6-9a5c-f0dceabf51e3", "metadata": {}, "outputs": [], "source": [ @@ -179,40 +178,40 @@ "scores_cursor = s2collection.find().limit(5)\n", "for scores in scores_cursor:\n", " print(scores)" - ] + ], + "id": "5b41c182" }, { "attachments": {}, "cell_type": "markdown", - "id": "bb2830f7-806d-4765-ac6d-559e80f9960f", "metadata": {}, "source": [ "Total documents count" - ] + ], + "id": "03e436ca" }, { 
"cell_type": "code", "execution_count": 8, - "id": "bad4d91c-493d-4f13-a1c6-8cc31ad2e7ed", "metadata": {}, "outputs": [], "source": [ "s2collection.count_documents({})" - ] + ], + "id": "37de6f26" }, { "attachments": {}, "cell_type": "markdown", - "id": "f4bc98f7-d2aa-44c5-b77e-612b61504a1a", "metadata": {}, "source": [ "Insert a document in the source MongoDB collection" - ] + ], + "id": "2e376a79" }, { "cell_type": "code", "execution_count": 9, - "id": "04f65aec-6b7f-4850-8841-aa07272d842e", "metadata": {}, "outputs": [], "source": [ @@ -221,60 +220,61 @@ " \"class_id\": random.randint(0, 500),\n", " \"exam_score\": random.uniform(0, 100) # Generate random score between 0 and 100 as a double\n", " }" - ] + ], + "id": "877c741b" }, { "cell_type": "code", "execution_count": 10, - "id": "7f9798a6-b95b-409c-898c-6015d1a7b683", "metadata": {}, "outputs": [], "source": [ "sourceclient = pymongo.MongoClient(source_mongo_url)\n", "sourcecol = sourceclient[\"cdcdemo\"][\"scores\"]\n", "res = sourcecol.insert_one(data)" - ] + ], + "id": "900abfce" }, { "cell_type": "code", "execution_count": 11, - "id": "e61b8047-a1a3-468a-9e5b-f5c1220d0d5b", "metadata": {}, "outputs": [], "source": [ "sourcecol.count_documents({})" - ] + ], + "id": "be24d89d" }, { "attachments": {}, "cell_type": "markdown", - "id": "7ab1af0f-4a5d-4419-ad9e-e29b5b358a5d", "metadata": {}, "source": [ "The newly added document is now replicated to singlestore, increasing the documents count by 1 demonstrating real time sync" - ] + ], + "id": "5242a930" }, { "cell_type": "code", "execution_count": 12, - "id": "54f952ae-021e-4a04-ab8b-9502a1b63c85", "metadata": {}, "outputs": [], "source": [ "s2collection.count_documents({})" - ] + ], + "id": "afff1b08" }, { "cell_type": "markdown", - "id": "ea0612bc", "metadata": {}, "source": [ "This native replication capability from Singlestore makes it easy to setup and run continuous data replication from your MongoDB at no additional cost or infrastructure requirements" 
- ] + ], + "id": "9ce57732" }, { + "id": "e083027c", "cell_type": "markdown", - "id": "6202b36d-a1fd-4cd3-973f-f4ae489cbead", "metadata": {}, "source": [ "
    \n", diff --git a/notebooks/getting-started-with-notebooks/notebook.ipynb b/notebooks/getting-started-with-notebooks/notebook.ipynb index 03cfa25d..49b9cfb9 100644 --- a/notebooks/getting-started-with-notebooks/notebook.ipynb +++ b/notebooks/getting-started-with-notebooks/notebook.ipynb @@ -1,8 +1,8 @@ { "cells": [ { + "id": "359d556e", "cell_type": "markdown", - "id": "a0efb393-2a46-4833-b5cf-8f048d9695b0", "metadata": {}, "source": [ "
    \n", @@ -17,8 +17,8 @@ ] }, { + "id": "7421ad91", "cell_type": "markdown", - "id": "4e83c63b", "metadata": {}, "source": [ "
    \n", @@ -32,18 +32,17 @@ }, { "cell_type": "markdown", - "id": "6d2bb122-3ae2-4eab-bbbd-3e3ba6907c4b", "metadata": {}, "source": [ "\n", " \n", " \n", "
    " - ] + ], + "id": "9922b829" }, { "cell_type": "markdown", - "id": "9e8e1c02-f723-4e0c-88f3-0adb4dc8b0de", "metadata": {}, "source": [ "## What you will learn in this notebook:\n", @@ -56,31 +55,31 @@ "## Questions?\n", "\n", "Reach out to us through our [forum](https://www.singlestore.com/forum)." - ] + ], + "id": "150affef" }, { "cell_type": "markdown", - "id": "1b1d60b9-f1d9-4c8a-8d9d-ed8fc9968d40", "metadata": {}, "source": [ "## Enhance your notebooks with visualizations" - ] + ], + "id": "cc215d0f" }, { "cell_type": "markdown", - "id": "10234772-6625-4a4a-99af-f39e4e566c6d", "metadata": {}, "source": [ "## 1. Import libraries for reading data into a DataFrame\n", "\n", "Our data set contains geographic data, so we also install [Shapely](https://shapely.readthedocs.io/en/stable/)\n", "to store that data in Shapely geometry objects." - ] + ], + "id": "7ded7bfb" }, { "cell_type": "code", "execution_count": 1, - "id": "fd26645f-c6b8-4853-baee-6bd77c6c1083", "metadata": {}, "outputs": [], "source": [ @@ -88,11 +87,11 @@ "\n", "import pandas as pd\n", "import shapely.wkt" - ] + ], + "id": "8019c71f" }, { "cell_type": "markdown", - "id": "ccb319b8-73b6-47ec-9dd0-ec33752fbb94", "metadata": {}, "source": [ "## 2. Load a csv file hosted in Github using Python\n", @@ -101,23 +100,23 @@ "convert specific columns into various data types, including geographic data in the `business_location` column.\n", "See the [`read_csv`](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html) documentation\n", "for more information." 
- ] + ], + "id": "03190080" }, { "cell_type": "code", "execution_count": 2, - "id": "cc892204-df0a-4c42-a772-7bf10d9f0e2d", "metadata": {}, "outputs": [], "source": [ "url = 'https://raw.githubusercontent.com/singlestore-labs/singlestoredb-samples/main/' + \\\n", " 'Sample%20datasets/csv/Restaurant_Scores_LIVES_Standard.csv'" - ] + ], + "id": "e498faa7" }, { "cell_type": "code", "execution_count": 3, - "id": "5aec33ed-1b93-46f1-b9df-b6b726de1b82", "metadata": {}, "outputs": [], "source": [ @@ -134,29 +133,29 @@ " dtype=dict(business_id=int, business_phone_number=str, business_postal_code=str, inspection_score=float),\n", " converters=dict(business_location=str_to_shapely))\n", "df" - ] + ], + "id": "4494b3cd" }, { "cell_type": "markdown", - "id": "07787fa4-445e-4452-a5de-8cf3b2dd2b34", "metadata": {}, "source": [ "Display the data types in the resulting DataFrame. Note that any objects that pandas does not support natively (e.g., strings, blobs, shapely geometries, etc.) show up as `object`." - ] + ], + "id": "efad9e64" }, { "cell_type": "code", "execution_count": 4, - "id": "e3bf5b74-b0de-4cfb-b2ff-00b5934bc0a6", "metadata": {}, "outputs": [], "source": [ "df.dtypes" - ] + ], + "id": "989ffbe3" }, { "cell_type": "markdown", - "id": "3a9c4139", "metadata": {}, "source": [ "
    \n", @@ -166,48 +165,48 @@ "

    If you have a Free Starter Workspace deployed already, select the database from drop-down menu at the top of this notebook. It updates the connection_url to connect to that database.

    \n", "
    \n", "
    " - ] + ], + "id": "8ee45846" }, { "cell_type": "markdown", - "id": "4ceef3c0-c804-48c3-9ca0-dcb7f5abfe27", "metadata": {}, "source": [ "## 3. Ingest a DataFrame in a SingleStoreDB table" - ] + ], + "id": "210bbda1" }, { "cell_type": "markdown", - "id": "66b9f526-3c1f-449f-93bb-c167b15fc598", "metadata": {}, "source": [ "1. Create the database\n", "2. Import the library to connect to the database\n", "3. Create the connection to the library\n", "4. Ingest the dataframe to the newly created database" - ] + ], + "id": "96fa2c5a" }, { "cell_type": "markdown", - "id": "ced28574-ac65-45bd-bed4-1df5273f9b57", "metadata": {}, "source": [ "Set the database name in a variable. It will be used in subsequent queries." - ] + ], + "id": "fba58fb6" }, { "cell_type": "markdown", - "id": "47e0bfb0-2006-4055-bdaf-081fd7473cf6", "metadata": {}, "source": [ "Here we are using the `database_name` variable in a `%%sql` cell. The syntax for including Python variables\n", "is to surround the variable name with `{{ ... }}`." - ] + ], + "id": "11f05328" }, { "cell_type": "code", "execution_count": 5, - "id": "0790ee0d-f9e9-4952-b1ae-54f872e42aed", "metadata": {}, "outputs": [], "source": [ @@ -219,11 +218,11 @@ "else:\n", " current_database = %sql SELECT DATABASE() as CurrentDatabase\n", " database_name = current_database[0][0]" - ] + ], + "id": "69e18dfb" }, { "cell_type": "markdown", - "id": "7defddbb", "metadata": {}, "source": [ "
    \n", @@ -233,11 +232,11 @@ "

    Make sure to select a database from the drop-down menu at the top of this notebook. It updates the connection_url to connect to that database.

    \n", "
    \n", "
    " - ] + ], + "id": "961afa0b" }, { "cell_type": "markdown", - "id": "3a254b3c-f5c0-4ce5-9046-df97dbffd420", "metadata": {}, "source": [ "We can use SQLAlchemy and pandas to upload a DataFrame. Note that if the table does not exist, the data types will\n", @@ -248,12 +247,12 @@ "contain a specific database to connect to. You can use the drop-down menu at the top of this notebook (immediately\n", "under the title) to select a database to work with. Changing the selection in the drop-down menu also updates\n", "the `connection_url` variable." - ] + ], + "id": "1f131f96" }, { "cell_type": "code", "execution_count": 6, - "id": "017b889a-5efe-4605-93a9-a80073b4b068", "metadata": {}, "outputs": [], "source": [ @@ -261,22 +260,22 @@ "\n", "# Create a SQLAlchemy engine and connect\n", "db_connection = sa.create_engine(connection_url).connect()" - ] + ], + "id": "eb4e912b" }, { "cell_type": "markdown", - "id": "515fb529-745f-4005-a704-241dfe65e985", "metadata": {}, "source": [ "The SingleStoreDB Python package also adds a convenience function for SQLAlchemy connections\n", "without using the `connection_url`. It automatically gets the connection information from\n", "the `SINGLESTOREDB_URL` environment variable." - ] + ], + "id": "90482136" }, { "cell_type": "code", "execution_count": 7, - "id": "16f8f5df-6019-4fbb-9d4e-e1dd389ea7af", "metadata": {}, "outputs": [], "source": [ @@ -287,52 +286,52 @@ "\n", "# Upload the DataFrame\n", "df.to_sql('sf_restaurant_scores', con=db_connection, if_exists='append', chunksize=1000)" - ] + ], + "id": "4168ceb8" }, { "cell_type": "markdown", - "id": "3e239229-b797-4b13-8cc6-1687e86328f5", "metadata": {}, "source": [ "## 4. Interact natively with the database using SQL" - ] + ], + "id": "05dd0fa8" }, { "cell_type": "markdown", - "id": "6bac215b-8d15-4e43-8492-f9483aa1970b", "metadata": {}, "source": [ "1. Read the top 10 rows from the table\n", "2. Alter the table to get the date in a date format, not string\n", "3. 
Read the number of restaurant inspections over the time in San Francisco" - ] + ], + "id": "63c38ba1" }, { "cell_type": "code", "execution_count": 8, - "id": "c130a5a0-4b3d-42b5-849e-30aebd82f02a", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "SELECT * FROM {{database_name}}.sf_restaurant_scores LIMIT 10;" - ] + ], + "id": "f79b3f45" }, { "cell_type": "markdown", - "id": "e30f3e37-6669-45d0-8859-72ddb466f65b", "metadata": {}, "source": [ "In the code block below, we use the `result1 <<` syntax on the `%%sql` line to store the result of the SQL\n", "operation into a variable which can be used later. As with other Jupyter notebooks, you can always get the value\n", "of the last executed cell in the `_` (underscore) variable, but setting a specifc variable name to use is generally\n", "a safer way to retrieve results." - ] + ], + "id": "acc21f0d" }, { "cell_type": "code", "execution_count": 9, - "id": "239813ff-7528-417c-b00f-7ccf274850e9", "metadata": {}, "outputs": [], "source": [ @@ -346,78 +345,78 @@ " MONTH\n", "ORDER BY\n", " MONTH DESC;" - ] + ], + "id": "6d45915f" }, { "cell_type": "markdown", - "id": "4dca22ec-028d-48fa-9cc1-e0723e05cfae", "metadata": {}, "source": [ "The output of a `%%sql` cell is a `ResultSet` which contains methods for converting to various other data types (e.g., `csv`, `dicts`, `DataFrame`, `PolarsDataFrame`). It is also possible to convert to a DataFrame by passing a `ResultSet` object to the DataFrame\n", "constructor as we'll see below." - ] + ], + "id": "ee90bcf4" }, { "cell_type": "code", "execution_count": 10, - "id": "e34a2bde-82b7-4085-8e7c-64e72746bfc0", "metadata": {}, "outputs": [], "source": [ "type(result1)" - ] + ], + "id": "5ecfaa34" }, { "cell_type": "markdown", - "id": "0d26b14b-d6c9-4ab4-ba7d-5f029fa610e6", "metadata": {}, "source": [ "## 5. 
Visualize with Plotly" - ] + ], + "id": "facf96ae" }, { "cell_type": "markdown", - "id": "8665a18d-f7d5-4da2-acc6-ea5a2a76d349", "metadata": {}, "source": [ "We are using [Plotly](https://plotly.com) to visualize the data in `result1`. The first parameter of the\n", "`bar` function requires a DataFrame, so we'll convert `result1` to a DataFrame before calling `bar`." - ] + ], + "id": "3a9fec49" }, { "cell_type": "code", "execution_count": 11, - "id": "9f08d6ab-6895-4ec6-a115-f470d20354d1", "metadata": {}, "outputs": [], "source": [ "result1_df = pd.DataFrame(result1)\n", "result1_df[:5]" - ] + ], + "id": "16372f39" }, { "cell_type": "code", "execution_count": 12, - "id": "d849d710-bf6e-46db-ae25-f39b7854547e", "metadata": {}, "outputs": [], "source": [ "import plotly.express as px\n", "\n", "px.bar(result1_df, x='month', y='count_inspection', title='Inspections by Month')" - ] + ], + "id": "cdb77c14" }, { "cell_type": "markdown", - "id": "93df4ad5-5fec-45d4-a051-24a651dbbff9", "metadata": {}, "source": [ "## 6. Cleanup database" - ] + ], + "id": "15ed2dde" }, { "cell_type": "markdown", - "id": "fb94bc57", "metadata": {}, "source": [ "
    \n", @@ -427,23 +426,24 @@ "

    If you created a new database in your Standard or Premium Workspace, you can drop the database by running the cell below. Note: this will not drop your database for Free Starter Workspaces. To drop a Free Starter Workspace, terminate the Workspace using the UI.

    \n", "
    \n", "
    " - ] + ], + "id": "d74f75d2" }, { "cell_type": "code", "execution_count": 13, - "id": "0792f4a6-82f4-413b-9895-cd4764f7a118", "metadata": {}, "outputs": [], "source": [ "shared_tier_check = %sql show variables like 'is_shared_tier'\n", "if not shared_tier_check or shared_tier_check[0][1] == 'OFF':\n", " %sql DROP DATABASE IF EXISTS {{database_name}};" - ] + ], + "id": "f6557a14" }, { + "id": "f0820d14", "cell_type": "markdown", - "id": "ecc76eb3-7bd6-4efe-abcd-8989277daa2d", "metadata": {}, "source": [ "
    \n", diff --git a/notebooks/getting-started-with-singlestore/notebook.ipynb b/notebooks/getting-started-with-singlestore/notebook.ipynb index 351dd061..4fdd19f9 100644 --- a/notebooks/getting-started-with-singlestore/notebook.ipynb +++ b/notebooks/getting-started-with-singlestore/notebook.ipynb @@ -1,8 +1,8 @@ { "cells": [ { + "id": "8bb1a5e8", "cell_type": "markdown", - "id": "a760c918-56e4-41f8-9546-491198983367", "metadata": {}, "source": [ "
    \n", @@ -17,8 +17,8 @@ ] }, { + "id": "84c5fe68", "cell_type": "markdown", - "id": "8c1326b8-f551-4f57-85ff-19ddabae4e67", "metadata": {}, "source": [ "
    \n", @@ -33,29 +33,28 @@ { "attachments": {}, "cell_type": "markdown", - "id": "d3a83848-363f-47fb-a85e-1c43a58a6a34", "metadata": {}, "source": [ "This Jupyter notebook provides a comprehensive overview and test drive of SingleStore's multi-model capabilities, showcasing how to efficiently manage and query diverse data types within a single database platform.\n", "\n", "The notebook starts with a simple \"Getting Started\" example, guiding users through various standard SQL queries to interact with the database. It then progressively demonstrates how to add and query different data models, including vectors for machine learning, full-text search for unstructured data, JSON for hierarchical data, geospatial data for location-based queries, and time series data for temporal analysis. This hands-on approach offers an accessible way for users to explore SingleStore's versatility and powerful multi-model functionality." - ] + ], + "id": "cc9e68a6" }, { "attachments": {}, "cell_type": "markdown", - "id": "24e55337-f0db-40c3-9360-f0d804be9021", "metadata": {}, "source": [ "# Simple \"Getting Started\" example\n", "\n", "This code checks whether the current database environment is using a \"shared tier\" and then conditionally drops and creates a database based on the result." - ] + ], + "id": "4de61cfe" }, { "cell_type": "code", "execution_count": 1, - "id": "4b57e908-3374-4fc4-89d3-fee289875383", "metadata": {}, "outputs": [], "source": [ @@ -63,12 +62,12 @@ "if not shared_tier_check or shared_tier_check[0][1] == \"OFF\":\n", " %sql DROP DATABASE IF EXISTS multi_model;\n", " %sql CREATE DATABASE IF NOT EXISTS multi_model;" - ] + ], + "id": "2540a7db" }, { "attachments": {}, "cell_type": "markdown", - "id": "3d3f690b-0ccc-4f3a-b933-7288b2f2455d", "metadata": {}, "source": [ "
    \n", @@ -78,12 +77,12 @@ "

    Select the database from the drop-down menu at the top of this notebook.

    \n", "
    \n", "
    " - ] + ], + "id": "538cfe6c" }, { "attachments": {}, "cell_type": "markdown", - "id": "ba7ef01d-1543-43be-93a6-e0d0b61ea052", "metadata": {}, "source": [ "# Various standard SQL queries\n", @@ -91,12 +90,12 @@ "## Create some simple tables\n", "\n", "This setup establishes a basic relational structure to store customer information and their corresponding orders." - ] + ], + "id": "cdd86f23" }, { "cell_type": "code", "execution_count": 2, - "id": "12f4361f-b3d3-4849-94ac-5ac8ca55fea8", "metadata": {}, "outputs": [], "source": [ @@ -116,21 +115,21 @@ " amount DECIMAL(10, 2),\n", " product VARCHAR(50)\n", ");" - ] + ], + "id": "6e20a7c4" }, { "attachments": {}, "cell_type": "markdown", - "id": "d0d2485e-a84d-4461-91fe-9c332c491538", "metadata": {}, "source": [ "## Insert some data" - ] + ], + "id": "a71561fb" }, { "cell_type": "code", "execution_count": 3, - "id": "010a1770-061b-4f84-b3f8-56bfc3cc5c64", "metadata": {}, "outputs": [], "source": [ @@ -148,21 +147,21 @@ "(103, 3, 50.00, \"Notebook\"),\n", "(104, 1, 300.00, \"Laptop\"),\n", "(105, 4, 250.00, \"Tablet\");" - ] + ], + "id": "e7c7b621" }, { "attachments": {}, "cell_type": "markdown", - "id": "a6b1d80d-e7a3-4b40-b8f2-389e95a65cab", "metadata": {}, "source": [ "## Sum of amounts" - ] + ], + "id": "2dfcbabc" }, { "cell_type": "code", "execution_count": 4, - "id": "5150008b-17d9-4f2e-9d29-49441af88109", "metadata": {}, "outputs": [], "source": [ @@ -171,21 +170,21 @@ " SUM(amount) AS total_sales\n", "FROM\n", " orders;" - ] + ], + "id": "dc7eee97" }, { "attachments": {}, "cell_type": "markdown", - "id": "a9bd6429-2daf-4617-b74a-9d35f688204a", "metadata": {}, "source": [ "## Minimum amount" - ] + ], + "id": "4fe579a5" }, { "cell_type": "code", "execution_count": 5, - "id": "f70d6137-a91b-41d0-801c-67c64392c6c0", "metadata": {}, "outputs": [], "source": [ @@ -194,21 +193,21 @@ " MIN(amount) AS min_order_amount\n", "FROM\n", " orders;" - ] + ], + "id": "3cc46257" }, { "attachments": {}, "cell_type": 
"markdown", - "id": "5e1525d5-6ed2-4880-824e-ca7eb3b49c08", "metadata": {}, "source": [ "## Maximum amount" - ] + ], + "id": "7d34e9f8" }, { "cell_type": "code", "execution_count": 6, - "id": "a33b4514-746d-4e38-8fe5-57abfb5931f7", "metadata": {}, "outputs": [], "source": [ @@ -217,21 +216,21 @@ " MAX(amount) AS max_order_amount\n", "FROM\n", " orders;" - ] + ], + "id": "64867b2c" }, { "attachments": {}, "cell_type": "markdown", - "id": "3c8f6b48-a568-49cf-952d-b7c9774e23a8", "metadata": {}, "source": [ "## Average amount" - ] + ], + "id": "0c90d396" }, { "cell_type": "code", "execution_count": 7, - "id": "c5786941-429c-47b7-855c-64e0f2f8aba8", "metadata": {}, "outputs": [], "source": [ @@ -240,21 +239,21 @@ " ROUND(AVG(amount), 2) AS avg_order_amount\n", "FROM\n", " orders;" - ] + ], + "id": "f69edd06" }, { "attachments": {}, "cell_type": "markdown", - "id": "38a8776b-83c9-4754-9815-d9378822000c", "metadata": {}, "source": [ "## Count the number of orders" - ] + ], + "id": "f3790e0b" }, { "cell_type": "code", "execution_count": 8, - "id": "b2025671-63e3-445b-a354-b674274741ab", "metadata": {}, "outputs": [], "source": [ @@ -263,21 +262,21 @@ " COUNT(*) AS number_of_orders\n", "FROM\n", " orders;" - ] + ], + "id": "e3b617d5" }, { "attachments": {}, "cell_type": "markdown", - "id": "c7411ce3-8d08-42f4-b452-370ca0445be3", "metadata": {}, "source": [ "## Join customers and orders tables" - ] + ], + "id": "41b93287" }, { "cell_type": "code", "execution_count": 9, - "id": "b370df9f-2f67-4812-8bab-28863c35bf53", "metadata": {}, "outputs": [], "source": [ @@ -292,21 +291,21 @@ " customers.customer_id = orders.customer_id\n", "ORDER BY\n", " amount ASC;" - ] + ], + "id": "f7b794ca" }, { "attachments": {}, "cell_type": "markdown", - "id": "0f042a55-0060-44ea-880b-c4cba70ed9fe", "metadata": {}, "source": [ "## Group by customer and calculate total amount spent" - ] + ], + "id": "f1702c95" }, { "cell_type": "code", "execution_count": 10, - "id": 
"401acac0-c5f0-43bf-a562-7c8cc1f42133", "metadata": {}, "outputs": [], "source": [ @@ -322,45 +321,45 @@ " customers.customer_name\n", "ORDER BY\n", " total_spent DESC;" - ] + ], + "id": "459f9450" }, { "attachments": {}, "cell_type": "markdown", - "id": "0aee1d34-305e-47da-9307-a7f1adac5d4e", "metadata": {}, "source": [ "# Add Vectors\n", "\n", "## Add a 3-dimensional vector to the orders table" - ] + ], + "id": "a4176781" }, { "cell_type": "code", "execution_count": 11, - "id": "d1ed8498-7514-4ffe-b6e9-b51fee6674a0", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "ALTER TABLE orders ADD COLUMN dimensions VECTOR(3);" - ] + ], + "id": "f6b958e0" }, { "attachments": {}, "cell_type": "markdown", - "id": "79189f18-63f0-4ffa-a46a-74f3b967f92e", "metadata": {}, "source": [ "## Add some vector data\n", "\n", "### 3 dimensions represent Length (L), Width (W) and Height (H) in cm" - ] + ], + "id": "a0ad0c75" }, { "cell_type": "code", "execution_count": 12, - "id": "434186df-437a-4c63-87e3-4a6a8b680dc5", "metadata": {}, "outputs": [], "source": [ @@ -370,21 +369,21 @@ "UPDATE orders SET dimensions = '[21.0, 29.7, 0.5]' WHERE order_id = 103;\n", "UPDATE orders SET dimensions = '[32.0, 22.0, 2.0]' WHERE order_id = 104;\n", "UPDATE orders SET dimensions = '[24.0, 16.0, 0.7]' WHERE order_id = 105;" - ] + ], + "id": "ddb5c5ca" }, { "attachments": {}, "cell_type": "markdown", - "id": "cdca346c-e028-49d8-866e-139910b1403e", "metadata": {}, "source": [ "## Show the vectors" - ] + ], + "id": "d082d79a" }, { "cell_type": "code", "execution_count": 13, - "id": "bc5188c5-bb0c-40d0-b1d9-ffbbba60c3fd", "metadata": {}, "outputs": [], "source": [ @@ -395,12 +394,12 @@ " *\n", "FROM\n", " orders;" - ] + ], + "id": "79a7b7d2" }, { "attachments": {}, "cell_type": "markdown", - "id": "3b6ed738-4820-4185-8642-f7a733724728", "metadata": {}, "source": [ "## Select orders using <*> which is Dot Product\n", @@ -408,12 +407,12 @@ "The dot product is a way of multiplying two vectors to get a 
single number (a scalar).\n", "\n", "In simple terms, the dot product provides a way to combine two sets of numbers into a single value that reflects how much the vectors \"point\" in the same direction." - ] + ], + "id": "7230ca67" }, { "cell_type": "code", "execution_count": 14, - "id": "25f3f54f-d83a-44cb-b9ae-daf7f6f62eea", "metadata": {}, "outputs": [], "source": [ @@ -428,12 +427,12 @@ " orders\n", "ORDER BY\n", " score DESC;" - ] + ], + "id": "0e0e71e3" }, { "attachments": {}, "cell_type": "markdown", - "id": "c309b047-f031-40e5-b738-c24f160e21b3", "metadata": {}, "source": [ "## Select orders using <-> which is Euclidean Distance\n", @@ -441,12 +440,12 @@ "Euclidean distance is a way to measure how far apart two points are in space.\n", "\n", "In simple terms, Euclidean distance provides a straight-line measurement of how far one point is from another, like using a ruler to measure the distance between two points on a map." - ] + ], + "id": "89c6d75b" }, { "cell_type": "code", "execution_count": 15, - "id": "61768423-70ce-4d66-9685-d1a18ce59a69", "metadata": {}, "outputs": [], "source": [ @@ -461,43 +460,43 @@ " orders\n", "ORDER BY\n", " score ASC;" - ] + ], + "id": "bf3d573b" }, { "attachments": {}, "cell_type": "markdown", - "id": "97549944-084b-4a88-b0d3-3ee49aa171fb", "metadata": {}, "source": [ "# Add Full-Text\n", "\n", "## Add a description column to the orders table" - ] + ], + "id": "42ee6179" }, { "cell_type": "code", "execution_count": 16, - "id": "81ae4dd7-d236-4bec-b7ae-6b854d46ff23", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "ALTER TABLE orders ADD COLUMN description VARCHAR(255);" - ] + ], + "id": "213a78e8" }, { "attachments": {}, "cell_type": "markdown", - "id": "1205678f-e3f7-4834-9297-58a2d52f0418", "metadata": {}, "source": [ "## Update orders table with descriptions" - ] + ], + "id": "48410a03" }, { "cell_type": "code", "execution_count": 17, - "id": "426cc2cc-1571-4a01-aa22-f4d4d746c220", "metadata": {}, "outputs": [], 
"source": [ @@ -511,21 +510,21 @@ " WHEN product = \"Tablet\" THEN \"A compact tablet with a vibrant display and versatile functionality.\"\n", " ELSE \"A product with excellent features and quality.\"\n", "END;" - ] + ], + "id": "48b313aa" }, { "attachments": {}, "cell_type": "markdown", - "id": "cb8f0a06-6a31-469a-8267-6cfa2e7ef03a", "metadata": {}, "source": [ "## Show the descriptions" - ] + ], + "id": "b81cf059" }, { "cell_type": "code", "execution_count": 18, - "id": "c5242e2a-8dcc-414e-acbb-d501f413ac99", "metadata": {}, "outputs": [], "source": [ @@ -534,42 +533,42 @@ " *\n", "FROM\n", " orders;" - ] + ], + "id": "5f8ff611" }, { "attachments": {}, "cell_type": "markdown", - "id": "ff4970e1-ecec-4da0-aa2c-766c8c977912", "metadata": {}, "source": [ "## Add a full-text index to the orders table" - ] + ], + "id": "1396f5e4" }, { "cell_type": "code", "execution_count": 19, - "id": "24f87e77-71da-4bd7-bc9d-87ec065a190a", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "ALTER TABLE orders ADD FULLTEXT USING VERSION 2 orders_ft_index (product, description);\n", "OPTIMIZE TABLE orders FLUSH;" - ] + ], + "id": "3d806b06" }, { "attachments": {}, "cell_type": "markdown", - "id": "fd87ac74-f645-41ed-856e-d995b79611ab", "metadata": {}, "source": [ "## Search for a match on \"vibrant\" in the description part" - ] + ], + "id": "0c46797b" }, { "cell_type": "code", "execution_count": 20, - "id": "eedeb35d-f6c5-44b8-9b6f-19ed5fa81600", "metadata": {}, "outputs": [], "source": [ @@ -580,23 +579,23 @@ " orders\n", "WHERE\n", " MATCH (TABLE orders) AGAINST (\"description:vibrant\");" - ] + ], + "id": "e5d55bde" }, { "attachments": {}, "cell_type": "markdown", - "id": "532d372f-0d79-4e5a-9c59-215fbf9aac5b", "metadata": {}, "source": [ "## Use various operators to show flexibility\n", "\n", "### + (must appear), * (multiple wildcard), ? 
(single wildcard)" - ] + ], + "id": "1a865df4" }, { "cell_type": "code", "execution_count": 21, - "id": "6a44dede-14a3-4b7f-aeab-23cd6cf3c577", "metadata": {}, "outputs": [], "source": [ @@ -607,43 +606,43 @@ " orders\n", "WHERE\n", " MATCH (TABLE orders) AGAINST (\"product:(+oo?) OR description:versa*\");" - ] + ], + "id": "c1ac7e50" }, { "attachments": {}, "cell_type": "markdown", - "id": "343c4d35-21c6-4aeb-95ab-5f9ab29114fa", "metadata": {}, "source": [ "# Add JSON\n", "\n", "## Add a JSON column to the orders table" - ] + ], + "id": "890c2ee0" }, { "cell_type": "code", "execution_count": 22, - "id": "6caba766-733b-4c89-b1e9-ff6dd92fd943", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "ALTER TABLE orders ADD COLUMN additional_details JSON NOT NULL;" - ] + ], + "id": "a0ee822e" }, { "attachments": {}, "cell_type": "markdown", - "id": "2b2b958b-fb9a-4a07-a58d-a18cf3b2d70c", "metadata": {}, "source": [ "## Update orders table with additional details in JSON format" - ] + ], + "id": "f758983e" }, { "cell_type": "code", "execution_count": 23, - "id": "643eebc5-1f8d-49ea-8500-4279b0604759", "metadata": {}, "outputs": [], "source": [ @@ -752,21 +751,21 @@ " }'\n", " ELSE '{}'\n", "END;" - ] + ], + "id": "dcde1541" }, { "attachments": {}, "cell_type": "markdown", - "id": "ff311771-7c34-49e3-9a4c-32b03760c2e2", "metadata": {}, "source": [ "## Extract specific JSON fields" - ] + ], + "id": "ceaced03" }, { "cell_type": "code", "execution_count": 24, - "id": "44a947f0-d319-466e-9593-692714cf5118", "metadata": {}, "outputs": [], "source": [ @@ -779,21 +778,21 @@ " orders\n", "ORDER BY\n", " order_id;" - ] + ], + "id": "cd0ad93b" }, { "attachments": {}, "cell_type": "markdown", - "id": "b20e765a-0721-4802-b280-201f5e31a966", "metadata": {}, "source": [ "## Find orders that have been \"Delivered\"" - ] + ], + "id": "b0acad1a" }, { "cell_type": "code", "execution_count": 25, - "id": "e691ecc1-71b9-4cff-b1f0-cff2df144700", "metadata": {}, "outputs": [], "source": [ @@ 
-807,21 +806,21 @@ " additional_details::order_status = '\"Delivered\"'\n", "ORDER BY\n", " order_id;" - ] + ], + "id": "2b82f711" }, { "attachments": {}, "cell_type": "markdown", - "id": "889005cf-7da3-4097-8f0f-40815375abec", "metadata": {}, "source": [ "## Aggregate data based on JSON fields" - ] + ], + "id": "c60f469a" }, { "cell_type": "code", "execution_count": 26, - "id": "b50609f0-9cca-41cf-bc78-982741297a0f", "metadata": {}, "outputs": [], "source": [ @@ -833,23 +832,23 @@ " orders\n", "GROUP BY\n", " order_status;" - ] + ], + "id": "dfdaf99e" }, { "attachments": {}, "cell_type": "markdown", - "id": "12ee1b9e-d18a-4647-a9e0-1f5b9a3d26e5", "metadata": {}, "source": [ "# Add Geospatial\n", "\n", "## Insert 2 more customers into customers table" - ] + ], + "id": "ed5e7f6f" }, { "cell_type": "code", "execution_count": 27, - "id": "a750e0e5-6bbe-4912-aedc-5919f009aeb5", "metadata": {}, "outputs": [], "source": [ @@ -857,21 +856,21 @@ "INSERT INTO customers (customer_id, customer_name, country) VALUES\n", "(6, \"Emily Davis\", \"Canada\"),\n", "(7, \"Michael Johnson\", \"Canada\");" - ] + ], + "id": "6b2cc109" }, { "attachments": {}, "cell_type": "markdown", - "id": "c9c04d67-a04c-4917-b946-5a776dc042ba", "metadata": {}, "source": [ "## Create neighborhoods table for geospatial data" - ] + ], + "id": "023e589e" }, { "cell_type": "code", "execution_count": 28, - "id": "424118b4-6909-452c-a8c7-48ae27dbfd90", "metadata": {}, "outputs": [], "source": [ @@ -887,21 +886,21 @@ " sort key (name),\n", " shard key (id)\n", ");" - ] + ], + "id": "c86427ab" }, { "attachments": {}, "cell_type": "markdown", - "id": "793a47b1-f931-4b9b-9bc0-ca4eab1669e8", "metadata": {}, "source": [ "## Add some city data to the neighborhoods table" - ] + ], + "id": "d3e71aac" }, { "cell_type": "code", "execution_count": 29, - "id": "1907a717-2da9-4454-aa9c-92f7dcdcef64", "metadata": {}, "outputs": [], "source": [ @@ -927,41 +926,41 @@ " \"POLYGON((-75.9274 45.2502, -75.3537 45.2502, -75.3537 
45.5489, -75.9274 45.5489, -75.9274 45.2502))\",\n", " \"POINT(-75.6972 45.4215)\"\n", ");" - ] + ], + "id": "9b43cc6c" }, { "attachments": {}, "cell_type": "markdown", - "id": "40c1ca72-b5fe-4e42-ad1b-152ce9fa722b", "metadata": {}, "source": [ "## Add a geospatial column to the customers table" - ] + ], + "id": "1f45c65f" }, { "cell_type": "code", "execution_count": 30, - "id": "56624949-7854-494f-8710-4fe756dd1654", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "ALTER TABLE customers ADD COLUMN location GEOGRAPHYPOINT;" - ] + ], + "id": "e7591e5d" }, { "attachments": {}, "cell_type": "markdown", - "id": "34590e14-5ef9-41a9-b9c5-17332f3609cf", "metadata": {}, "source": [ "## Update customers table with location data" - ] + ], + "id": "10a85572" }, { "cell_type": "code", "execution_count": 31, - "id": "32427f37-77ef-427a-b7c3-e42b68d4925f", "metadata": {}, "outputs": [], "source": [ @@ -973,21 +972,21 @@ "UPDATE customers SET location = \"POINT(-75.6972 45.4215)\" WHERE customer_id = 5;\n", "UPDATE customers SET location = \"POINT(-79.3832 43.6532)\" WHERE customer_id = 6;\n", "UPDATE customers SET location = \"POINT(-123.1216 49.2827)\" WHERE customer_id = 7;" - ] + ], + "id": "b699b639" }, { "attachments": {}, "cell_type": "markdown", - "id": "b6b39000-a6fa-4ac1-a3c7-72b2c04a832a", "metadata": {}, "source": [ "## Join the neighborhoods table to itself and measure distances between neighborhoods" - ] + ], + "id": "5ebb2fb2" }, { "cell_type": "code", "execution_count": 32, - "id": "854652a1-5ee2-496f-92c4-6372a64e1593", "metadata": {}, "outputs": [], "source": [ @@ -1002,21 +1001,21 @@ " a.name = \"Vancouver\"\n", "ORDER BY\n", " 2;" - ] + ], + "id": "68244e5c" }, { "attachments": {}, "cell_type": "markdown", - "id": "d8c19609-f7c3-43fe-a4f4-b89965bac7cd", "metadata": {}, "source": [ "## Find out where you are" - ] + ], + "id": "308aa8ba" }, { "cell_type": "code", "execution_count": 33, - "id": "8b6ec25c-12b8-4836-980d-dd0fa65ce6b1", "metadata": {}, 
"outputs": [], "source": [ @@ -1027,21 +1026,21 @@ " neighborhoods\n", "WHERE\n", " GEOGRAPHY_INTERSECTS(\"POINT(-79.3770 43.7500)\", shape);" - ] + ], + "id": "26fb44d6" }, { "attachments": {}, "cell_type": "markdown", - "id": "ff11e2b0-34d7-499f-9a43-b0859b9819e1", "metadata": {}, "source": [ "## Find customers within \"Vancouver\"" - ] + ], + "id": "f635b459" }, { "cell_type": "code", "execution_count": 34, - "id": "8f50e2fe-b506-48a9-a7e7-1b70cdbf2b80", "metadata": {}, "outputs": [], "source": [ @@ -1052,23 +1051,23 @@ " customers c, neighborhoods n\n", "WHERE\n", " n.name = \"Vancouver\" AND GEOGRAPHY_CONTAINS(n.shape, c.location);" - ] + ], + "id": "a269a7e8" }, { "attachments": {}, "cell_type": "markdown", - "id": "9d891830-a7e0-4f21-a037-0e86ac2bf220", "metadata": {}, "source": [ "# Add Time Series\n", "\n", "## Count orders by day" - ] + ], + "id": "4e0fc856" }, { "cell_type": "code", "execution_count": 35, - "id": "32dcbffc-0fac-4e75-9457-70b1eaa8a223", "metadata": {}, "outputs": [], "source": [ @@ -1082,21 +1081,21 @@ " order_date\n", "ORDER BY\n", " order_date;" - ] + ], + "id": "58ca7bc8" }, { "attachments": {}, "cell_type": "markdown", - "id": "c1354288-4c3a-4a4a-a8c4-eda1135843e9", "metadata": {}, "source": [ "## Sum of order amounts by month" - ] + ], + "id": "476290eb" }, { "cell_type": "code", "execution_count": 36, - "id": "01bf8b43-8793-4fca-89fc-31939c884ed7", "metadata": {}, "outputs": [], "source": [ @@ -1110,21 +1109,21 @@ " order_month\n", "ORDER BY\n", " order_month;" - ] + ], + "id": "0e8ae35a" }, { "attachments": {}, "cell_type": "markdown", - "id": "0658e46d-1c4e-40fc-a59f-e00d97632b37", "metadata": {}, "source": [ "## Orders count by customer over time" - ] + ], + "id": "912c37d3" }, { "cell_type": "code", "execution_count": 37, - "id": "90c9ec3b-c38e-463a-9db5-e6282dd7041e", "metadata": {}, "outputs": [], "source": [ @@ -1139,33 +1138,33 @@ " customer_id, order_date\n", "ORDER BY\n", " customer_id, order_date;" - ] + ], + "id": 
"7e1a3e06" }, { "attachments": {}, "cell_type": "markdown", - "id": "42b97b4d-da05-48a3-9edc-02ec084e4764", "metadata": {}, "source": [ "# Bonus\n", "\n", "## Create a map from geospatial city data" - ] + ], + "id": "7b9b0c36" }, { "cell_type": "code", "execution_count": 38, - "id": "10145be7-61b7-4754-b4f6-79635499529d", "metadata": {}, "outputs": [], "source": [ "!pip install folium shapely --quiet" - ] + ], + "id": "8302f8a7" }, { "attachments": {}, "cell_type": "markdown", - "id": "8579e5cc-1f05-4d0d-a54d-59cb2cfb28cd", "metadata": {}, "source": [ "
    \n", @@ -1175,33 +1174,33 @@ "

    Select the database from the drop-down menu at the top of this notebook. It updates the connection_url which is used by SQLAlchemy to make connections to the selected database.

    \n", "
    \n", "
    " - ] + ], + "id": "b91965f0" }, { "cell_type": "code", "execution_count": 39, - "id": "cf398656-b381-4907-b5f9-52b2e00cc498", "metadata": {}, "outputs": [], "source": [ "from sqlalchemy import *\n", "\n", "db_connection = create_engine(connection_url)" - ] + ], + "id": "ab06690b" }, { "attachments": {}, "cell_type": "markdown", - "id": "ee96a9eb-233b-4958-96e1-12c6a5b4cc5f", "metadata": {}, "source": [ "## Get city data from neighborhoods table" - ] + ], + "id": "53defa1f" }, { "cell_type": "code", "execution_count": 40, - "id": "3931285c-b147-47b2-a26c-bc81072200bd", "metadata": {}, "outputs": [], "source": [ @@ -1222,21 +1221,21 @@ " query,\n", " db_connection\n", ")" - ] + ], + "id": "09d1e58a" }, { "attachments": {}, "cell_type": "markdown", - "id": "413e6c7d-8eb3-4811-b94a-e31c36330b58", "metadata": {}, "source": [ "## Convert the data to geospatial format for Python" - ] + ], + "id": "e62eaf70" }, { "cell_type": "code", "execution_count": 41, - "id": "d5b40f38-9a7e-4815-aed4-42294e2e6cad", "metadata": {}, "outputs": [], "source": [ @@ -1244,21 +1243,21 @@ "\n", "df[\"polygon\"] = df[\"polygon\"].apply(wkt.loads)\n", "df[\"point\"] = df[\"point\"].apply(wkt.loads)" - ] + ], + "id": "d740de3b" }, { "attachments": {}, "cell_type": "markdown", - "id": "34f05d2b-f60f-48f3-acbe-addb169663fc", "metadata": {}, "source": [ "## Plot the cities on a map" - ] + ], + "id": "82ab7026" }, { "cell_type": "code", "execution_count": 42, - "id": "c64c7e06-dbc6-4545-a6b1-adb953313b42", "metadata": {}, "outputs": [], "source": [ @@ -1285,21 +1284,21 @@ " ).add_to(m)\n", "\n", "html_content = m._repr_html_()" - ] + ], + "id": "525a69fe" }, { "attachments": {}, "cell_type": "markdown", - "id": "c11f862e-4958-4953-914c-67c54597b801", "metadata": {}, "source": [ "## Save the map to stage" - ] + ], + "id": "206af23f" }, { "attachments": {}, "cell_type": "markdown", - "id": "cbfab172-ac45-452b-86af-41e44e6bcd67", "metadata": {}, "source": [ "
    \n", @@ -1309,12 +1308,12 @@ "

    The following code will only work on the Standard Tier at this time.

    \n", "
    \n", "
    " - ] + ], + "id": "516a6fc3" }, { "cell_type": "code", "execution_count": 43, - "id": "68e134a2-4f57-4beb-8232-89f77599c88d", "metadata": {}, "outputs": [], "source": [ @@ -1323,21 +1322,21 @@ "if not shared_tier_check or shared_tier_check[0][1] == \"OFF\":\n", " with nb.stage.open(\"map.html\", \"w\") as st:\n", " st.write(html_content)" - ] + ], + "id": "c2c231f7" }, { "attachments": {}, "cell_type": "markdown", - "id": "ae745f3f-6f31-4998-a89d-d3e2a9012e06", "metadata": {}, "source": [ "# Cleanup" - ] + ], + "id": "ef48d5e3" }, { "cell_type": "code", "execution_count": 44, - "id": "2fb216bd-6120-40b5-8e99-732db96d1b6f", "metadata": {}, "outputs": [], "source": [ @@ -1345,34 +1344,35 @@ "DROP TABLE IF EXISTS customers;\n", "DROP TABLE IF EXISTS orders;\n", "DROP TABLE IF EXISTS neighborhoods;" - ] + ], + "id": "6fe6d5be" }, { "cell_type": "code", "execution_count": 45, - "id": "5dca2b68-29ab-4f12-aab0-7bb03932ca56", "metadata": {}, "outputs": [], "source": [ "shared_tier_check = %sql SHOW VARIABLES LIKE \"is_shared_tier\"\n", "if not shared_tier_check or shared_tier_check[0][1] == \"OFF\":\n", " %sql DROP DATABASE IF EXISTS multi_model;" - ] + ], + "id": "8631a02d" }, { "attachments": {}, "cell_type": "markdown", - "id": "d0ba2e40-19a1-4016-829d-e310417b9c37", "metadata": {}, "source": [ "# Conclusions\n", "\n", "In this Jupyter notebook, we explored the robust multi-model capabilities of SingleStore, demonstrating how to efficiently manage and query a wide range of data types within a unified database platform. Beginning with a simple \"Getting Started\" guide, we progressively delved into various standard SQL queries and extended our exploration to include more advanced data models such as vectors for machine learning, full-text search for unstructured data, JSON for hierarchical data, geospatial data for location-based queries, and time series data for temporal analysis. 
Through these practical examples, users can appreciate SingleStore's versatility and powerful functionality, gaining the skills to effectively harness its multi-model capabilities for diverse applications." - ] + ], + "id": "57215608" }, { + "id": "abaf291a", "cell_type": "markdown", - "id": "afae2f5f-89ec-411d-9478-e5d6f6480808", "metadata": {}, "source": [ "
    \n", diff --git a/notebooks/how-to-build-llm-apps-that-can-see-hear-speak/notebook.ipynb b/notebooks/how-to-build-llm-apps-that-can-see-hear-speak/notebook.ipynb index a1aef623..7195f7d2 100644 --- a/notebooks/how-to-build-llm-apps-that-can-see-hear-speak/notebook.ipynb +++ b/notebooks/how-to-build-llm-apps-that-can-see-hear-speak/notebook.ipynb @@ -1,8 +1,8 @@ { "cells": [ { + "id": "535f54e3", "cell_type": "markdown", - "id": "7a9ad04b-150f-450f-828c-80caaf8a538f", "metadata": {}, "source": [ "
    \n", @@ -28,7 +28,8 @@ "

    This tutorial is meant for Standard & Premium Workspaces. You can't run this with a Free Starter Workspace due to restrictions on Storage. Create a Workspace using +group in the left nav & select Standard for this notebook. Gallery notebooks tagged with \"Starter\" are suitable to run on a Free Starter Workspace

    \n", "
    \n", "
    " - ] + ], + "id": "d99e79ad" }, { "attachments": {}, @@ -39,7 +40,8 @@ "\n", "# Demo Architecture\n", "![SingleStore LLM App](https://images.contentstack.io/v3/assets/bltac01ee6daa3a1e14/bltb25c874a947f70f9/65aa3b99573161fd44cc7b7d/singlestore-llm-app.png)" - ] + ], + "id": "121d0992" }, { "attachments": {}, @@ -56,7 +58,8 @@ "- [Step 5: Add Voice Recognition and Speech](#speech)\n", "- [Step 6: Tying it together with Image data](#image)\n", "- [Conclusion](#conclusion)" - ] + ], + "id": "c07c6e65" }, { "attachments": {}, @@ -66,7 +69,8 @@ "\n", "- [Back to Contents](#contents)\n", "## Setup SingleStore DDLs" - ] + ], + "id": "c356065b" }, { "attachments": {}, @@ -74,7 +78,8 @@ "metadata": {}, "source": [ "Create and use the database llm_webinar" - ] + ], + "id": "08dc100a" }, { "cell_type": "code", @@ -95,7 +100,8 @@ "%%sql\n", "DROP DATABASE IF EXISTS llm_webinar;\n", "CREATE DATABASE llm_webinar;" - ] + ], + "id": "519d7215" }, { "attachments": {}, @@ -109,7 +115,8 @@ "

    Make sure to select a database from the drop-down menu at the top of this notebook. It updates the connection_url to connect to that database.

    \n", "
    \n", "
    " - ] + ], + "id": "d6fe21e2" }, { "attachments": {}, @@ -117,7 +124,8 @@ "metadata": {}, "source": [ "Create tables" - ] + ], + "id": "31a125ed" }, { "cell_type": "code", @@ -237,7 +245,8 @@ " KEY `category` (`category`) USING HASH,\n", " SORT KEY `__UNORDERED` (`created_at` DESC)\n", ");" - ] + ], + "id": "38861fba" }, { "cell_type": "code", @@ -257,7 +266,8 @@ "source": [ "%%sql\n", "SHOW TABLES;" - ] + ], + "id": "e6c60aa1" }, { "attachments": {}, @@ -267,7 +277,8 @@ "\n", "- [Back to Contents](#contents)\n", "## Install packages and imports" - ] + ], + "id": "80e9f658" }, { "cell_type": "code", @@ -276,7 +287,8 @@ "outputs": [], "source": [ "%pip install --quiet elevenlabs==0.2.27 openai==1.32.0 matplotlib scipy scikit-learn langchain==0.2.12 langchain-openai==0.1.20 langchain-community==0.2.11" - ] + ], + "id": "2d09faaf" }, { "cell_type": "code", @@ -298,7 +310,8 @@ "from langchain_openai import OpenAI as LangchainOpenAI\n", "from langchain.agents.agent_toolkits import SQLDatabaseToolkit\n", "from langchain.agents import create_sql_agent" - ] + ], + "id": "616b3688" }, { "attachments": {}, @@ -306,7 +319,8 @@ "metadata": {}, "source": [ "### Set API keys" - ] + ], + "id": "ab290a88" }, { "cell_type": "code", @@ -323,7 +337,8 @@ "alpha_vantage_apikey = getpass.getpass(\"enter alphavantage apikey here\")\n", "openai_apikey = getpass.getpass(\"enter openai apikey here\")\n", "elevenlabs_apikey = getpass.getpass(\"enter elevenlabs apikey here\")" - ] + ], + "id": "54c6d82b" }, { "cell_type": "code", @@ -338,7 +353,8 @@ "def get_embeddings(inputs: list[str], model: str = 'text-embedding-ada-002') -> list[str]:\n", " \"\"\"Return list of embeddings.\"\"\"\n", " return [x.embedding for x in client.embeddings.create(input=inputs, model=model).data]" - ] + ], + "id": "e7a9f751" }, { "attachments": {}, @@ -348,7 +364,8 @@ "\n", "- [Back to Contents](#contents)\n", "## Ingest from data sources" - ] + ], + "id": "8ea524ad" }, { "attachments": {}, @@ -356,7 +373,8 @@ 
"metadata": {}, "source": [ "### Bring past two months of stock data" - ] + ], + "id": "dcbf4cd8" }, { "cell_type": "code", @@ -367,7 +385,8 @@ "# set up connection to SingleStore and the ticker list\n", "s2_conn = s2.connect(connection_url)\n", "ticker_list = ['TSLA', 'AMZN', 'PLTR']" - ] + ], + "id": "aeb4043c" }, { "cell_type": "code", @@ -457,7 +476,8 @@ " with s2_conn.cursor() as cur:\n", " cur.execute(stmt, params)\n", " # time.sleep(1) # required to not hit API limits" - ] + ], + "id": "0de2b3fc" }, { "cell_type": "code", @@ -477,7 +497,8 @@ "source": [ "%%sql\n", "select count(*) from stockTable" - ] + ], + "id": "af7a439b" }, { "attachments": {}, @@ -485,7 +506,8 @@ "metadata": {}, "source": [ "## Bring in Company data" - ] + ], + "id": "9d8439de" }, { "cell_type": "code", @@ -682,7 +704,8 @@ "# Replace table_name with the actual table name you're using.\n", " with s2_conn.cursor() as cur:\n", " cur.execute(stmt, params)" - ] + ], + "id": "f0ffb567" }, { "cell_type": "code", @@ -702,7 +725,8 @@ "source": [ "%%sql\n", "select * from companyInfo limit 1" - ] + ], + "id": "f50c68bb" }, { "attachments": {}, @@ -710,7 +734,8 @@ "metadata": {}, "source": [ "## Bring in news sentiment" - ] + ], + "id": "49f2e590" }, { "cell_type": "code", @@ -811,7 +836,8 @@ "\n", " with s2_conn.cursor() as cur:\n", " cur.execute(stmt, params)" - ] + ], + "id": "c9ac2953" }, { "cell_type": "code", @@ -831,7 +857,8 @@ "source": [ "%%sql\n", "SELECT count(*) Rows_in_newsSentiment FROM newsSentiment" - ] + ], + "id": "2c4c7797" }, { "attachments": {}, @@ -841,7 +868,8 @@ "\n", "- [Back to Demo Architecture](#architecture)\n", "## Connect SingleStore to Open AI's LLM with Langchain" - ] + ], + "id": "e2245f1d" }, { "cell_type": "code", @@ -915,7 +943,8 @@ " top_k=3,\n", " max_iterations=5\n", ")" - ] + ], + "id": "7568a580" }, { "attachments": {}, @@ -923,7 +952,8 @@ "metadata": {}, "source": [ "### Create function that processes user question with a check in Semantic Cache Layer" - 
] + ], + "id": "2af745a9" }, { "cell_type": "code", @@ -1041,7 +1071,8 @@ " print(f\"Insert to SingleStore execution time: {elapsed_time:.2f} milliseconds\")\n", "\n", " return answer2" - ] + ], + "id": "4b816df3" }, { "attachments": {}, @@ -1049,7 +1080,8 @@ "metadata": {}, "source": [ "### Test on two similar questions" - ] + ], + "id": "5768d430" }, { "cell_type": "code", @@ -1061,7 +1093,8 @@ "# Two similar questions\n", "question_1 = \"describe the database\"\n", "question_2 = \"describe database\"" - ] + ], + "id": "e5cbf7ea" }, { "cell_type": "code", @@ -1088,7 +1121,8 @@ "# Question: describe the database\n", "answer = process_user_question(question_1)\n", "print(f'The answer is: {answer}')" - ] + ], + "id": "ab4ea454" }, { "cell_type": "code", @@ -1108,7 +1142,8 @@ "source": [ "%%sql\n", "select id, category, question, answer from embeddings limit 1" - ] + ], + "id": "afae74c5" }, { "cell_type": "code", @@ -1125,7 +1160,8 @@ "# Question: describe database\n", "answer = process_user_question(question_2)\n", "print(f'The answer is: {answer}')" - ] + ], + "id": "c700fa03" }, { "attachments": {}, @@ -1135,7 +1171,8 @@ "\n", "- [Back to Contents](#contents)\n", "## Add Voice Recognition and Speech" - ] + ], + "id": "052aa690" }, { "attachments": {}, @@ -1143,7 +1180,8 @@ "metadata": {}, "source": [ "### Select a voice" - ] + ], + "id": "380c63f5" }, { "cell_type": "code", @@ -1156,7 +1194,8 @@ "from IPython.display import Audio\n", "from IPython.display import display\n", "import requests" - ] + ], + "id": "fb9f3821" }, { "cell_type": "code", @@ -1175,7 +1214,8 @@ "source": [ "voices = voices()\n", "voices[0]" - ] + ], + "id": "285606e4" }, { "cell_type": "code", @@ -1208,7 +1248,8 @@ " for chunk in response.iter_content(chunk_size=CHUNK_SIZE):\n", " if chunk:\n", " f.write(chunk)" - ] + ], + "id": "25f10364" }, { "cell_type": "code", @@ -1223,7 +1264,8 @@ ], "source": [ "!ls" - ] + ], + "id": "47c6370f" }, { "cell_type": "code", @@ -1244,7 +1286,8 @@ "\n", 
"audio = Audio(filename=audio_file, autoplay =True)\n", "display(audio)" - ] + ], + "id": "a9da9a96" }, { "attachments": {}, @@ -1252,7 +1295,8 @@ "metadata": {}, "source": [ "### Transcribe the audio file" - ] + ], + "id": "d8ff9f6a" }, { "cell_type": "code", @@ -1270,7 +1314,8 @@ "audio_file= open(\"output.mp3\", \"rb\")\n", "transcript = client.audio.transcriptions.create(model=\"whisper-1\", file=audio_file)\n", "print(transcript.text)" - ] + ], + "id": "1017c4a0" }, { "attachments": {}, @@ -1280,7 +1325,8 @@ "\n", "- [Back to Demo Architecture](#architecture)\n", "## Tying it together with Image data" - ] + ], + "id": "281f78c1" }, { "cell_type": "code", @@ -1299,7 +1345,8 @@ "Include the url, time published and banner image.\"\"\"\n", "answer = process_user_question(question_3)\n", "print(f'The answer is: {answer}')" - ] + ], + "id": "b19c6cc8" }, { "cell_type": "code", @@ -1319,7 +1366,8 @@ "source": [ "%%sql\n", "SELECT title, url, time_published, banner_image FROM newsSentiment WHERE ticker = 'AMZN' AND topic_relevance_score > 0.9 ORDER BY time_published DESC LIMIT 3" - ] + ], + "id": "d265d569" }, { "attachments": {}, @@ -1327,7 +1375,8 @@ "metadata": {}, "source": [ "### Load the image" - ] + ], + "id": "c62ced16" }, { "cell_type": "code", @@ -1356,7 +1405,8 @@ " plt.show()\n", "else:\n", " print(f\"Failed to retrieve the image. 
Status code: {response.status_code}\")" - ] + ], + "id": "91f210d3" }, { "attachments": {}, @@ -1364,7 +1414,8 @@ "metadata": {}, "source": [ "### Set up the huggingface transformer" - ] + ], + "id": "e6c37c4f" }, { "cell_type": "code", @@ -1383,7 +1434,8 @@ "print(f\"Setting up everything with transformers version {transformers_version}\")\n", "\n", "%pip install --quiet huggingface_hub>=0.14.1 git+https://github.com/huggingface/transformers@$transformers_version pyarrow==12.0.1 diffusers==0.30.0 accelerate==0.33.0 datasets==2.15.0 torch==2.1.0 soundfile==0.12.1 sentencepiece==0.2.0 opencv-contrib-python-headless==4.8.1.78" - ] + ], + "id": "e8fd40fe" }, { "cell_type": "code", @@ -1413,7 +1465,8 @@ "\n", "from huggingface_hub import notebook_login\n", "notebook_login()" - ] + ], + "id": "b9d51663" }, { "cell_type": "code", @@ -1492,7 +1545,8 @@ " pswd = openai_apikey\n", " agent = OpenAiAgent(model=\"gpt-3.5-turbo\", api_key=pswd)\n", " print(\"OpenAI is initialized \ud83d\udcaa\")" - ] + ], + "id": "039c453a" }, { "cell_type": "code", @@ -1606,7 +1660,8 @@ ], "source": [ "caption = agent.run(\"Can you caption the `image`?\", image=img)" - ] + ], + "id": "2b88ea99" }, { "cell_type": "code", @@ -1642,7 +1697,8 @@ "\n", "audio = Audio(filename=audio_file, autoplay =True)\n", "display(audio)" - ] + ], + "id": "0701fc7d" }, { "attachments": {}, @@ -1652,7 +1708,8 @@ "\n", "- [Back to Contents](#contents)\n", "## Conclusion" - ] + ], + "id": "4f3cca60" }, { "attachments": {}, @@ -1667,7 +1724,8 @@ "- Easily scale the workspace for your workload\n", "- handle reads and writes in parallel\n", "- Use of external functions." 
- ] + ], + "id": "a80a3042" }, { "attachments": {}, @@ -1675,7 +1733,8 @@ "metadata": {}, "source": [ "## Reset Demo" - ] + ], + "id": "8dd3f100" }, { "cell_type": "code", @@ -1695,11 +1754,12 @@ "source": [ "%%sql\n", "DROP DATABASE llm_webinar;" - ] + ], + "id": "1a791ca7" }, { + "id": "ac12af29", "cell_type": "markdown", - "id": "44f1b246-7829-4564-ad79-38f4344f6b71", "metadata": {}, "source": [ "
    \n", diff --git a/notebooks/hybrid-full-text-vector-search/notebook.ipynb b/notebooks/hybrid-full-text-vector-search/notebook.ipynb index 38c68873..63d00490 100644 --- a/notebooks/hybrid-full-text-vector-search/notebook.ipynb +++ b/notebooks/hybrid-full-text-vector-search/notebook.ipynb @@ -1,8 +1,8 @@ { "cells": [ { + "id": "0a4a9f89", "cell_type": "markdown", - "id": "352a16c8-a1f1-4790-89d0-1c3db2ecb08f", "metadata": {}, "source": [ "
    \n", @@ -17,8 +17,8 @@ ] }, { + "id": "da7a85db", "cell_type": "markdown", - "id": "6fdcca97-c2dc-47ac-9618-26c368b13121", "metadata": {}, "source": [ "
    \n", @@ -33,7 +33,6 @@ { "attachments": {}, "cell_type": "markdown", - "id": "a3efcd15-5da0-4f79-85de-2ae1bf061b98", "metadata": {}, "source": [ "## What's in this notebook:\n", @@ -48,57 +47,57 @@ "## Questions?\n", "\n", "Reach out to us through our [forum](https://www.singlestore.com/forum)." - ] + ], + "id": "2230b2eb" }, { "attachments": {}, "cell_type": "markdown", - "id": "c70aec87-b44a-4afb-95b6-04395e06a19f", "metadata": {}, "source": [ "## 1. Create and use a database." - ] + ], + "id": "e92213aa" }, { "attachments": {}, "cell_type": "markdown", - "id": "ecebc295-b9b6-42b2-9f6c-e6fe43e3e36e", "metadata": {}, "source": [ "To use this notebook, you need to have an active workspace and have selected a database to use. Please select a database using the dropdown above." - ] + ], + "id": "89109515" }, { "attachments": {}, "cell_type": "markdown", - "id": "1c678c7a-f64f-48f4-812a-669301a9430d", "metadata": {}, "source": [ "## 2. Create a table and load data." - ] + ], + "id": "9c2f4767" }, { "attachments": {}, "cell_type": "markdown", - "id": "cdf63f55-447c-4d40-a784-e531aabe115f", "metadata": {}, "source": [ "This example uses a dataset of Wikipedia articles about video games. The dataset contains approximately 41,000 vectors based on 1,800 articles from Wikipedia. The data set is available under the Creative Commons Attribution-ShareAlike License 4.0. Refer to [Hybrid Search and Re-ranking](https://docs.singlestore.com/cloud/vectors/hybrid-search/) for more details on this example and information about hybrid search over vectors." - ] + ], + "id": "5e21580b" }, { "attachments": {}, "cell_type": "markdown", - "id": "4dbf9356-e652-45e5-b558-2b4f676ef4ee", "metadata": {}, "source": [ "Create a table to hold the video games data using the SQL below. This table stores the text of the paragraphs and stores the vectors created for those paragraphs using the [Vector Type](https://docs.singlestore.com/cloud/vectors/vector-type/)." 
- ] + ], + "id": "0b2486d3" }, { "cell_type": "code", "execution_count": 1, - "id": "630de736-b068-4658-a7d2-d3a00e8d25e3", "metadata": {}, "outputs": [], "source": [ @@ -110,21 +109,21 @@ "v VECTOR(1536) NOT NULL,\n", "SHARD KEY(id), KEY(id) USING HASH\n", ");" - ] + ], + "id": "81933449" }, { "attachments": {}, "cell_type": "markdown", - "id": "fb602402-1c17-46d7-8c17-86829978cb89", "metadata": {}, "source": [ "Create and run the following pipeline using the [CREATE PIPELINE](https://docs.singlestore.com/cloud/reference/create-pipeline/) command to load data into the video_games table. The CREATE PIPELINE command may take around 30 seconds to run." - ] + ], + "id": "e00df72a" }, { "cell_type": "code", "execution_count": 2, - "id": "fbd119f0-7700-4494-9efa-b999606ba4dd", "metadata": {}, "outputs": [], "source": [ @@ -143,60 +142,60 @@ "LINES TERMINATED BY '\\r\\n';\n", "\n", "START PIPELINE wiki_pipeline FOREGROUND;" - ] + ], + "id": "3fbf7fdd" }, { "attachments": {}, "cell_type": "markdown", - "id": "37554805-ce14-49c5-ba2e-5b010023c1b6", "metadata": {}, "source": [ "Verify the data was loaded using the query below.

    Wait for the pipeline to finish before running the COUNT query." - ] + ], + "id": "59459056" }, { "cell_type": "code", "execution_count": 3, - "id": "692fd8c1-a867-46bf-b64b-68f6389b0992", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "SELECT COUNT(*)\n", "FROM video_games;" - ] + ], + "id": "83bda5b0" }, { "attachments": {}, "cell_type": "markdown", - "id": "7ef5f8bd-10d9-4013-b4ce-a1f97132bc1a", "metadata": {}, "source": [ "There should be 40,027 rows in the video_games table when the PIPELINE is finished." - ] + ], + "id": "894ab109" }, { "attachments": {}, "cell_type": "markdown", - "id": "b2fc1603-26f8-4e6e-ad35-bcf338f89612", "metadata": {}, "source": [ "## 3. Create a full-text and a vector index." - ] + ], + "id": "5bdeb5e5" }, { "attachments": {}, "cell_type": "markdown", - "id": "51a47f76-e158-4bd1-ad71-d031dfa45f63", "metadata": {}, "source": [ "Use the following SQL to create full-text and vector indexes on the video_games table. Indexes can improve query performance on large vector data sets. Refer to [Vector Indexing](https://docs.singlestore.com/cloud/vectors/vector-indexing) for more information on vector indexes and [CREATE TABLE](https://docs.singlestore.com/studio-redir/create-table/) Wait for the ALTER TABLE commands to finish before running the OPTIMIZE command." - ] + ], + "id": "cf077539" }, { "cell_type": "code", "execution_count": 5, - "id": "9254cf36-c1ee-4c3f-8304-4e9d166b2f09", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "OPTIMIZE TABLE video_games FULL;" - ] + ], + "id": "ad07912c" }, { "attachments": {}, "cell_type": "markdown", - "id": "981b4ba7-109c-418b-ab39-1df64426a1f2", "metadata": {}, "source": [ "## 4. Similarity search.\n", @@ -240,12 +239,12 @@ "To find the most similar vectors in a query vector, use an ORDER BY\u2026 LIMIT\u2026 query. 
The ORDER BY command will arrange the vectors by their similarity score produced by a vector similarity function, with the closest matches at the top.\n", "\n", "The SQL below finds three paragraphs that are the most similar to the first paragraph about Mario Kart, a semantic similarity search for information about Mario Kart." - ] + ], + "id": "164c2cf0" }, { "cell_type": "code", "execution_count": 6, - "id": "97142354-ead5-4165-b8b8-03d55972ccbf", "metadata": {}, "outputs": [], "source": [ @@ -258,23 +257,23 @@ "FROM video_games\n", "ORDER BY score DESC\n", "LIMIT 3;" - ] + ], + "id": "54acdc4b" }, { "attachments": {}, "cell_type": "markdown", - "id": "7220f9af-7a0c-4142-ace1-32102bedf869", "metadata": {}, "source": [ "## 5. Hybrid search.\n", "\n", "Hybrid Search combines multiple search methods in one query and blends full-text search (which finds keyword matches) and vector search (which finds semantic matches) allowing search results to be (re-)ranked by a score that combines full-text and vector rankings." - ] + ], + "id": "637a88b9" }, { "cell_type": "code", "execution_count": 7, - "id": "c846a3b0-5477-4f73-9a7e-bd935717dcf0", "metadata": {}, "outputs": [], "source": [ @@ -305,30 +304,30 @@ "FROM fts FULL OUTER JOIN vs ON fts.id = vs.id\n", "ORDER BY score DESC\n", "LIMIT 5;" - ] + ], + "id": "33a0455f" }, { "attachments": {}, "cell_type": "markdown", - "id": "4ef8d384-a2dd-442b-ba9c-3b0d36429c2c", "metadata": {}, "source": [ "## 6. Clean up." - ] + ], + "id": "f9539b11" }, { "attachments": {}, "cell_type": "markdown", - "id": "86d4270b-0471-42b9-896c-d9cc70957633", "metadata": {}, "source": [ "The command below will drop the table created as part of this notebook. Dropping this table will allow you to rerun the notebook from the beginning." 
- ] + ], + "id": "cbd6de3f" }, { "cell_type": "code", "execution_count": 8, - "id": "87482b82-ab10-4471-854a-71734b9c2d4a", "metadata": {}, "outputs": [], "source": [ @@ -336,11 +335,12 @@ "DROP PIPELINE wiki_pipeline;\n", "\n", "DROP TABLE video_games;" - ] + ], + "id": "4c842df6" }, { + "id": "1093f40b", "cell_type": "markdown", - "id": "cbf78a0b-cd8d-47d5-a369-f69653f69092", "metadata": {}, "source": [ "

    \n", diff --git a/notebooks/hybrid-search/notebook.ipynb b/notebooks/hybrid-search/notebook.ipynb index 11682439..876301ea 100644 --- a/notebooks/hybrid-search/notebook.ipynb +++ b/notebooks/hybrid-search/notebook.ipynb @@ -1,8 +1,8 @@ { "cells": [ { + "id": "a2a96f4c", "cell_type": "markdown", - "id": "505a207d-82ee-406d-bb92-e6a6900d6d18", "metadata": {}, "source": [ "
    \n", @@ -17,8 +17,8 @@ ] }, { + "id": "f35c3f82", "cell_type": "markdown", - "id": "4c528069", "metadata": {}, "source": [ "
    \n", @@ -33,38 +33,37 @@ { "attachments": {}, "cell_type": "markdown", - "id": "d9f9e629-6eb9-4ca5-bcf2-1b8672b86725", "metadata": {}, "source": [ "*Source*: [OpenAI Cookbook](https://github.com/openai/openai-cookbook/blob/main/examples/data/AG_news_samples.csv)\n", "\n", "Hybrid search integrates both keyword-based search and semantic search in order to combine the strengths of both and provide users with a more comprehensive and efficient search experience. This notebook is an example on how to perform hybrid search with SingleStore's database and notebooks." - ] + ], + "id": "9e8ce0c7" }, { "attachments": {}, "cell_type": "markdown", - "id": "532e8d3f-007d-48a4-8d36-44b561dd1109", "metadata": {}, "source": [ "## Setup\n", "Let's first download the libraries necessary." - ] + ], + "id": "9f22dab7" }, { "cell_type": "code", "execution_count": 1, - "id": "07990b64-9447-46a8-abbc-51be1972dfda", "metadata": {}, "outputs": [], "source": [ "%pip install wget openai==1.3.3 --quiet" - ] + ], + "id": "dc770da1" }, { "cell_type": "code", "execution_count": 2, - "id": "a592dd5e-4114-4abf-923d-74038f5244eb", "metadata": {}, "outputs": [], "source": [ @@ -72,12 +71,12 @@ "import os\n", "import pandas as pd\n", "import wget" - ] + ], + "id": "55e30fcc" }, { "cell_type": "code", "execution_count": 3, - "id": "c2bffc74-4b6a-4c0f-acef-f72bb255ec79", "metadata": {}, "outputs": [], "source": [ @@ -87,22 +86,22 @@ "from sentence_transformers import SentenceTransformer\n", "\n", "model = SentenceTransformer('flax-sentence-embeddings/all_datasets_v3_mpnet-base')" - ] + ], + "id": "86b0043d" }, { "attachments": {}, "cell_type": "markdown", - "id": "0aa95a80-5683-4dc3-9e52-c3e890ab87af", "metadata": {}, "source": [ "## Import data from CSV file\n", "This csv file holds the title, summary, and category of approximately 2000 news articles." 
- ] + ], + "id": "c7555935" }, { "cell_type": "code", "execution_count": 4, - "id": "b1b2971e-d0f6-4cfa-a9a7-954602bda460", "metadata": {}, "outputs": [], "source": [ @@ -115,34 +114,34 @@ " print('File downloaded successfully.')\n", "else:\n", " print('File already exists in the local file system.')" - ] + ], + "id": "92958292" }, { "cell_type": "code", "execution_count": 5, - "id": "4d08a1ea-59fb-4334-ba54-aa86119cbaea", "metadata": {}, "outputs": [], "source": [ "df = pd.read_csv('AG_news_samples.csv')\n", "df" - ] + ], + "id": "511a4442" }, { "cell_type": "code", "execution_count": 6, - "id": "e30c69d3-a807-4437-84e9-6972e3bc3d85", "metadata": {}, "outputs": [], "source": [ "data = df.to_dict(orient='records')\n", "data[0]" - ] + ], + "id": "dcd21c17" }, { "attachments": {}, "cell_type": "markdown", - "id": "f68e9407", "metadata": {}, "source": [ "
    \n", @@ -152,30 +151,30 @@ "

    If you have a Free Starter Workspace deployed already, select the database from drop-down menu at the top of this notebook. It updates the connection_url to connect to that database.

    \n", "
    \n", "
    " - ] + ], + "id": "56fb72ee" }, { "attachments": {}, "cell_type": "markdown", - "id": "0b6c6560-bc60-43ba-93a4-1b4aee933d5b", "metadata": {}, "source": [ "## Set up the database" - ] + ], + "id": "a654b253" }, { "attachments": {}, "cell_type": "markdown", - "id": "e1dd6296-54b0-4f8d-886a-13cacfc28163", "metadata": {}, "source": [ "Set up the SingleStoreDB database which will hold your data." - ] + ], + "id": "44d411f6" }, { "cell_type": "code", "execution_count": 7, - "id": "e1874b6f-706a-4638-ad2a-ca387953acaa", "metadata": {}, "outputs": [], "source": [ @@ -183,12 +182,12 @@ "if not shared_tier_check or shared_tier_check[0][1] == 'OFF':\n", " %sql DROP DATABASE IF EXISTS news;\n", " %sql CREATE DATABASE news;" - ] + ], + "id": "e9855387" }, { "attachments": {}, "cell_type": "markdown", - "id": "fa49cc11", "metadata": {}, "source": [ "
    \n", @@ -198,12 +197,12 @@ "

    Make sure to select a database from the drop-down menu at the top of this notebook. It updates the connection_url to connect to that database.

    \n", "
    \n", "
    " - ] + ], + "id": "bef0be6c" }, { "cell_type": "code", "execution_count": 8, - "id": "3f1e2c3d-6fbd-46bb-9bd3-235eb51941cf", "metadata": {}, "outputs": [], "source": [ @@ -216,21 +215,21 @@ " embedding BLOB,\n", " FULLTEXT (title, description)\n", ");" - ] + ], + "id": "1b68c84c" }, { "attachments": {}, "cell_type": "markdown", - "id": "8bd97023-3d02-44d4-8bd3-59875cb22b6c", "metadata": {}, "source": [ "### Get embeddings for every row based on the description column" - ] + ], + "id": "4b2d785a" }, { "cell_type": "code", "execution_count": 9, - "id": "496f84d0-51b6-4b66-bf5b-b1b260e4c2de", "metadata": {}, "outputs": [], "source": [ @@ -239,60 +238,60 @@ "descriptions = [row['description'] for row in data]\n", "all_embeddings = model.encode(descriptions)\n", "all_embeddings.shape" - ] + ], + "id": "18b369bf" }, { "attachments": {}, "cell_type": "markdown", - "id": "d74c544f-feea-4d48-84be-c62bf5cb4ea3", "metadata": {}, "source": [ "Merge embedding values into `data` rows." - ] + ], + "id": "e06caac5" }, { "cell_type": "code", "execution_count": 10, - "id": "05b2f3fe-c35c-4252-b416-9f7b7aec60a6", "metadata": {}, "outputs": [], "source": [ "for row, embedding in zip(data, all_embeddings):\n", " row['embedding'] = embedding" - ] + ], + "id": "7a86156d" }, { "attachments": {}, "cell_type": "markdown", - "id": "b37f2eac-c504-452b-90c2-0bb07e918b16", "metadata": {}, "source": [ "Here's an example of one row of the combined data." 
- ] + ], + "id": "1699c28f" }, { "cell_type": "code", "execution_count": 11, - "id": "30474fe0-257e-4937-b8ec-5780a04581e0", "metadata": {}, "outputs": [], "source": [ "data[0]" - ] + ], + "id": "0797c747" }, { "attachments": {}, "cell_type": "markdown", - "id": "46b1628c-0ffc-4a84-ba8b-43e8df081b01", "metadata": {}, "source": [ "### Populate the database" - ] + ], + "id": "00650aeb" }, { "cell_type": "code", "execution_count": 12, - "id": "cd3e5f9b-d9e5-45fe-ba20-4fb021d7a425", "metadata": {}, "outputs": [], "source": [ @@ -320,30 +319,30 @@ " ''')\n", "\n", "conn.execute(statement, data)" - ] + ], + "id": "e08bcee3" }, { "attachments": {}, "cell_type": "markdown", - "id": "a2f3d567-eaf4-487a-a1f9-2eb7e1071991", "metadata": {}, "source": [ "## Semantic search" - ] + ], + "id": "6ba8ae2e" }, { "attachments": {}, "cell_type": "markdown", - "id": "7ad3b8f6-d3a8-4954-a737-f11c785ce9ce", "metadata": {}, "source": [ "### Connect to OpenAI" - ] + ], + "id": "ffd38ab8" }, { "cell_type": "code", "execution_count": 13, - "id": "598d7077-d04c-46b3-b7c4-7b4362dd4507", "metadata": {}, "outputs": [], "source": [ @@ -351,33 +350,33 @@ "\n", "EMBEDDING_MODEL = 'text-embedding-ada-002'\n", "GPT_MODEL = 'gpt-3.5-turbo'" - ] + ], + "id": "bfb585b7" }, { "cell_type": "code", "execution_count": 14, - "id": "9eea2f67-3c2e-4d1a-87c2-052c2acf4026", "metadata": {}, "outputs": [], "source": [ "import getpass\n", "\n", "openai.api_key = getpass.getpass('OpenAI API Key: ')" - ] + ], + "id": "f40ceb13" }, { "attachments": {}, "cell_type": "markdown", - "id": "6504f561-1ab1-4dbf-a523-0aef23b66e4b", "metadata": {}, "source": [ "### Run semantic search and get scores" - ] + ], + "id": "a14ec4a9" }, { "cell_type": "code", "execution_count": 15, - "id": "a62a4c06-d77a-49b1-beaf-4c54b04d001f", "metadata": {}, "outputs": [], "source": [ @@ -399,30 +398,30 @@ "# Execute the SQL statement.\n", "results = pd.DataFrame(conn.execute(query_statement, dict(embedding=search_embedding)))\n", "results" - ] + 
], + "id": "18283d3b" }, { "attachments": {}, "cell_type": "markdown", - "id": "2c8ff862-ea5b-4960-be5b-bcd530d6e918", "metadata": {}, "source": [ "## Hybrid search" - ] + ], + "id": "3e56cfa5" }, { "attachments": {}, "cell_type": "markdown", - "id": "d0b2cff3-76f8-4a35-a596-4f001a9b4c8c", "metadata": {}, "source": [ "This search finds the average of the score gotten from the semantic search and the score gotten from the key-word search and sorts the news articles by this combined score to perform an effective hybrid search." - ] + ], + "id": "56e6d068" }, { "cell_type": "code", "execution_count": 16, - "id": "9df7073f-6a89-4528-968c-7d5c21876a83", "metadata": {}, "outputs": [], "source": [ @@ -446,21 +445,21 @@ "# Execute the SQL statement.\n", "hyb_results = pd.DataFrame(conn.execute(hyb_statement, dict(embedding=hyb_embedding, query=hyb_query)))\n", "hyb_results" - ] + ], + "id": "1f1534f7" }, { "attachments": {}, "cell_type": "markdown", - "id": "b49c3ec8-ec70-48b8-bc2b-e387d7de0efc", "metadata": {}, "source": [ "## Clean up" - ] + ], + "id": "6a7ac35e" }, { "attachments": {}, "cell_type": "markdown", - "id": "0745143b", "metadata": {}, "source": [ "
    \n", @@ -470,23 +469,24 @@ "

    If you created a new database in your Standard or Premium Workspace, you can drop the database by running the cell below. Note: this will not drop your database for Free Starter Workspaces. To drop a Free Starter Workspace, terminate the Workspace using the UI.

    \n", "
    \n", "
    " - ] + ], + "id": "c9b12932" }, { "cell_type": "code", "execution_count": 17, - "id": "3d4ce5d5-b550-4ee0-9fef-0a1c45e63bb3", "metadata": {}, "outputs": [], "source": [ "shared_tier_check = %sql show variables like 'is_shared_tier'\n", "if not shared_tier_check or shared_tier_check[0][1] == 'OFF':\n", " %sql DROP DATABASE IF EXISTS news;" - ] + ], + "id": "80682ad8" }, { + "id": "410115cf", "cell_type": "markdown", - "id": "f9f6e53b-fb02-4d1a-908f-b96d1c2cdfd0", "metadata": {}, "source": [ "
    \n", diff --git a/notebooks/image-matching-with-sql/notebook.ipynb b/notebooks/image-matching-with-sql/notebook.ipynb index f4819955..b0b2ba7e 100644 --- a/notebooks/image-matching-with-sql/notebook.ipynb +++ b/notebooks/image-matching-with-sql/notebook.ipynb @@ -1,8 +1,8 @@ { "cells": [ { + "id": "1a7a3f4e", "cell_type": "markdown", - "id": "dff08daa-d4e7-4816-b8de-be40c86ec2b9", "metadata": {}, "source": [ "
    \n", @@ -17,8 +17,8 @@ ] }, { + "id": "fedc5026", "cell_type": "markdown", - "id": "24002982", "metadata": {}, "source": [ "
    \n", @@ -33,7 +33,6 @@ { "attachments": {}, "cell_type": "markdown", - "id": "7b2bd61f-bdc4-4aef-9ab3-feeb8a0138d3", "metadata": {}, "source": [ "**SingleStoreDB can supercharge your apps with AI!**\n", @@ -41,23 +40,23 @@ "In this notebook, we\u2019ll demonstrate how we use the [`dot_product`](https://docs.singlestore.com/db/v8.1/en/reference/sql-reference/vector-functions/dot_product.html) function (for cosine similarity) to find a matching image of a celebrity from among 7 thousand records in just 3 milliseconds!\n", "\n", "Efficient retrieval of high-dimensional vectors and handling of large-scale vector similarity matching workloads are made possible by SingleStore\u2019s distributed architecture and efficient low-level execution. SingleStoreDB powers many AI applications including face matching, product photo matching, object recognition, text similarity matching, and sentiment analysis." - ] + ], + "id": "2d62f4f9" }, { "attachments": {}, "cell_type": "markdown", - "id": "b21735c2-31ad-4e38-9a5f-afae2c46de38", "metadata": {}, "source": [ "
    \n", "\n", "
    " - ] + ], + "id": "385d0d5c" }, { "attachments": {}, "cell_type": "markdown", - "id": "facd7889-eaed-44f8-ba7c-ac852c26e9f3", "metadata": {}, "source": [ "## 1. Create a workspace in your workspace group\n", @@ -76,12 +75,12 @@ "## 2. Create a Database named image_recognition\n", "\n", "The code below will drop the current `image_recognition` database and create a fresh one." - ] + ], + "id": "165d0dd1" }, { "cell_type": "code", "execution_count": 1, - "id": "0ee1fb52-14ee-48cc-880c-d525a7d84988", "metadata": {}, "outputs": [], "source": [ @@ -89,12 +88,12 @@ "if not shared_tier_check or shared_tier_check[0][1] == 'OFF':\n", " %sql DROP DATABASE IF EXISTS image_recognition;\n", " %sql CREATE DATABASE image_recognition;" - ] + ], + "id": "43a1a893" }, { "attachments": {}, "cell_type": "markdown", - "id": "899100f0-bac6-4e56-a1e3-eaf5ba32d345", "metadata": {}, "source": [ "
    \n", @@ -105,12 +104,12 @@ " It updates the connection_url which is used by the %%sql magic command and SQLAlchemy to make connections to the selected database.

    \n", "
    \n", "
    " - ] + ], + "id": "ebb4446e" }, { "attachments": {}, "cell_type": "markdown", - "id": "278053e8-7457-4655-a6fe-5c95ecb361de", "metadata": {}, "source": [ "## 3. Install and import the following libraries\n", @@ -121,12 +120,12 @@ "\n", "You may see messages printed about not being able to find cuda drivers or TensorRT. These can\n", "be ignored." - ] + ], + "id": "156299f3" }, { "cell_type": "code", "execution_count": 2, - "id": "4a58d896-5ea9-4335-8cfe-18de418ba2de", "metadata": {}, "outputs": [], "source": [ @@ -153,24 +152,24 @@ "\n", "tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)\n", "tf.disable_v2_behavior()" - ] + ], + "id": "b08f895b" }, { "attachments": {}, "cell_type": "markdown", - "id": "3bb47d4f-d54d-4fcc-835e-6a5066fa84bc", "metadata": {}, "source": [ "## 4. Create a table of images of people\n", "\n", "The table will contain two columns: 1) the filename containing the image and 2) the vector embedding\n", "of the image as a blob containing an array of 32-bit floats." - ] + ], + "id": "37bf3957" }, { "cell_type": "code", "execution_count": 3, - "id": "b115b516-df6d-4100-aacc-c49c50a91819", "metadata": {}, "outputs": [], "source": [ @@ -180,12 +179,12 @@ " vector BLOB,\n", " SHARD(filename)\n", ");" - ] + ], + "id": "6e3ef0e7" }, { "attachments": {}, "cell_type": "markdown", - "id": "41a990db-9e11-48e3-8011-8bd9770a27a2", "metadata": {}, "source": [ "## 5. Import our sample dataset into the table\n", @@ -194,23 +193,23 @@ "\n", "Note that we are using the `converters=` parameter of `pd.read_csv` to parse the text as a JSON array and convert it\n", "to a numpy array for the resulting DataFrame column." 
- ] + ], + "id": "7deff865" }, { "cell_type": "code", "execution_count": 4, - "id": "d1fb9284-f64e-49c3-acf8-d01f7a0d7e81", "metadata": {}, "outputs": [], "source": [ "url = 'https://raw.githubusercontent.com/singlestore-labs/singlestoredb-samples/main/' + \\\n", " 'Tutorials/Face%20matching/celebrity_data.sql'" - ] + ], + "id": "792d241b" }, { "cell_type": "code", "execution_count": 5, - "id": "5fe33107-3e46-49f8-9450-4cac14501a34", "metadata": {}, "outputs": [], "source": [ @@ -228,35 +227,35 @@ "\n", "# Upload DataFrame\n", "df.to_sql('people', con=conn, index=False, if_exists='append')" - ] + ], + "id": "a9b97aa8" }, { "attachments": {}, "cell_type": "markdown", - "id": "168be056-17da-4f94-8252-3e5d79459a8b", "metadata": {}, "source": [ "## 6. Run our image matching algorithm using just 2 lines of SQL\n", "\n", "In this example, we use an image of Adam Sandler and find the 5 closest images in our database to it. We use the `dot_product` function to measure cosine_similarity of each vector in the database to the input image." - ] + ], + "id": "ee1141f5" }, { "cell_type": "code", "execution_count": 6, - "id": "b26fd8fe-a90d-47c5-9cd1-63def15e2eac", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "SET @v = (SELECT vector FROM people WHERE filename = \"Adam_Sandler/Adam_Sandler_0003.jpg\");\n", "SELECT filename, DOT_PRODUCT(vector, @v) AS score FROM people ORDER BY score DESC LIMIT 5;" - ] + ], + "id": "a920a537" }, { "attachments": {}, "cell_type": "markdown", - "id": "1d0606a8-6503-4522-8a85-6366263e4b5e", "metadata": {}, "source": [ "## 7. Pick an image of a celebrity and see which images matched closest to it!\n", @@ -264,12 +263,12 @@ "1. Run the code cell\n", "2. Pick a celebrity picture\n", "3. Wait for the match!" 
- ] + ], + "id": "05ee9b14" }, { "cell_type": "code", "execution_count": 7, - "id": "6b4d75e4-38b5-491d-a77c-35f949ef4ca4", "metadata": {}, "outputs": [], "source": [ @@ -344,12 +343,12 @@ "display(dropdown)\n", "dropdown.observe(on_value_change, names='value')\n", "display(out)" - ] + ], + "id": "473b031e" }, { "attachments": {}, "cell_type": "markdown", - "id": "cea04465-6a69-42f1-8249-4c49488506f6", "metadata": {}, "source": [ "## 8. See which celebrity you look most like!\n", @@ -362,12 +361,12 @@ "3. Wait for the match!\n", "\n", "**A low score for matching is less than 0.6.**" - ] + ], + "id": "726410a3" }, { "cell_type": "code", "execution_count": 8, - "id": "985fdb94-38fd-42c9-aaff-16620db0e954", "metadata": {}, "outputs": [], "source": [ @@ -522,21 +521,21 @@ "display(upload_button)\n", "upload_button.observe(handle_upload, names='value')\n", "display(new_out)" - ] + ], + "id": "1bf0362a" }, { "attachments": {}, "cell_type": "markdown", - "id": "f3f3c685-0335-46e2-9a8d-e46ec296f074", "metadata": {}, "source": [ "## 9. Clean up" - ] + ], + "id": "04919626" }, { "attachments": {}, "cell_type": "markdown", - "id": "9c3c7c54", "metadata": {}, "source": [ "
    \n", @@ -546,23 +545,24 @@ "

    If you created a new database in your Standard or Premium Workspace, you can drop the database by running the cell below. Note: this will not drop your database for Free Starter Workspaces. To drop a Free Starter Workspace, terminate the Workspace using the UI.

    \n", "
    \n", "
    " - ] + ], + "id": "769ad070" }, { "cell_type": "code", "execution_count": 9, - "id": "51614d50-1e1f-4fe5-ab8d-09c804619edb", "metadata": {}, "outputs": [], "source": [ "shared_tier_check = %sql show variables like 'is_shared_tier'\n", "if not shared_tier_check or shared_tier_check[0][1] == 'OFF':\n", " %sql DROP DATABASE IF EXISTS image_recognition;" - ] + ], + "id": "776ff6cf" }, { + "id": "c3cccf38", "cell_type": "markdown", - "id": "83729541-f1c7-4d12-8162-e82a3ea991a1", "metadata": {}, "source": [ "
    \n", diff --git a/notebooks/ingest-pdfs-with-unstructured/notebook.ipynb b/notebooks/ingest-pdfs-with-unstructured/notebook.ipynb index 981dadda..aa571bb4 100644 --- a/notebooks/ingest-pdfs-with-unstructured/notebook.ipynb +++ b/notebooks/ingest-pdfs-with-unstructured/notebook.ipynb @@ -1,8 +1,8 @@ { "cells": [ { + "id": "3ba63f11", "cell_type": "markdown", - "id": "47f782f8-1c81-45b0-8169-a1cd24c9bd9f", "metadata": {}, "source": [ "
    \n", @@ -25,7 +25,8 @@ "We'll start by installing the Unstructured library, which is essential for ingesting and processing PDF files. The library will allow us to convert PDF documents into a JSON format that includes both metadata and text extraction. For this part of the project, we'll focus on installing the PDF support components.\n", "\n", "Reference for full installation details: [Unstructured Installation Guide](https://unstructured-io.github.io/unstructured/installation/full_installation.html)" - ] + ], + "id": "0680197e" }, { "cell_type": "code", @@ -34,7 +35,8 @@ "outputs": [], "source": [ "!pip install \"unstructured[pdf]\"" - ] + ], + "id": "3a3fee0a" }, { "cell_type": "markdown", @@ -43,7 +45,8 @@ "## Import Libraries\n", "\n", "In this section, we import the necessary libraries for our project. We'll use `pandas` to handle data manipulation, converting our semi-structured JSON data into a structured DataFrame format. This is crucial for storing the data in the SingleStore database later on. Additionally, we'll utilize the OpenAI API for vectorizing text and generating responses, integral components of our RAG system." 
- ] + ], + "id": "6a27e7f1" }, { "cell_type": "code", @@ -59,7 +62,8 @@ "\n", "import openai\n", "from openai.embeddings_utils import get_embedding" - ] + ], + "id": "6a076d8b" }, { "cell_type": "markdown", @@ -72,7 +76,8 @@ "- Obtain your OpenAI API key here: [OpenAI API Key](https://help.openai.com/en/articles/4936850-where-do-i-find-my-secret-api-key)\n", "- Set up your SingleStore account and workspace: [SingleStore Setup Guide](https://www.singlestore.com/blog/how-to-get-started-with-singlestore/)\n", "- Connect to your SingleStore workspace: [SingleStore Connection Documentation](https://docs.singlestore.com/cloud/connect-to-your-workspace/)" - ] + ], + "id": "c40456f7" }, { "cell_type": "code", @@ -89,7 +94,8 @@ "username=os.environ[\"SS_USERNAME\"]\n", "password=os.environ[\"SS_PASSWORD\"]\n", "database=os.environ[\"SS_DATABASE\"]" - ] + ], + "id": "e8826a8c" }, { "cell_type": "markdown", @@ -105,7 +111,8 @@ "- [Partition PDF Documentation](https://unstructured-io.github.io/unstructured/bricks/partition.html#partition-pdf)\n", "- [Chunk by Title Documentation](https://unstructured-io.github.io/unstructured/bricks/chunking.html)\n", "- [Strategy Documentation](https://unstructured-io.github.io/unstructured/best_practices/strategies.html)" - ] + ], + "id": "92ae5a1e" }, { "cell_type": "code", @@ -114,7 +121,8 @@ "outputs": [], "source": [ "pdf_filename = \"Employee-Handbook.pdf\"" - ] + ], + "id": "5f4be9dc" }, { "cell_type": "code", @@ -130,7 +138,8 @@ " )\n", "\n", "chunks = chunk_by_title(elements)" - ] + ], + "id": "24879122" }, { "cell_type": "markdown", @@ -141,7 +150,8 @@ "After processing the PDF, we receive output in an unstructured JSON format, which includes valuable metadata about the extracted elements. This metadata enables us to filter and manipulate the document elements based on our requirements. 
Our next step is to convert this JSON output into a structured DataFrame, which is a more suitable format for storing in the SingleStore DB and for further processing in our RAG system.\n", "\n", "Reference for understanding metadata: [Unstructured Metadata Documentation](https://unstructured-io.github.io/unstructured/metadata.html)" - ] + ], + "id": "a8fefdba" }, { "cell_type": "code", @@ -166,7 +176,8 @@ "\n", "# Show the DataFrame\n", "df.head()" - ] + ], + "id": "b4f19b22" }, { "cell_type": "markdown", @@ -179,7 +190,8 @@ "References:\n", "- [Creating a Database in SingleStoreDB Cloud](https://docs.singlestore.com/cloud/create-a-database/)\n", "- [Loading Data into SingleStoreDB Cloud](https://docs.singlestore.com/cloud/load-data/)" - ] + ], + "id": "e1cfcd38" }, { "cell_type": "code", @@ -193,7 +205,8 @@ " host=host,\n", " database=database)\n", "cnx" - ] + ], + "id": "7a9d094a" }, { "cell_type": "code", @@ -221,7 +234,8 @@ "cnx.commit()\n", "drop_cursor.close()\n", "create_cursor.close()" - ] + ], + "id": "ba220cc1" }, { "cell_type": "code", @@ -239,7 +253,8 @@ "\n", "cnx.commit()\n", "cursor.close()" - ] + ], + "id": "3f7cbbdb" }, { "cell_type": "markdown", @@ -250,7 +265,8 @@ "Next, we enhance our database table by adding a new column for text embeddings. Using OpenAI's `get_embedding` function, we generate embeddings that measure the relatedness of text strings. 
These embeddings are particularly useful for search functionality, allowing us to rank results by relevance.\n", "\n", "Reference: [Understanding Text Embeddings](https://platform.openai.com/docs/guides/embeddings/what-are-embeddings)" - ] + ], + "id": "95f9443b" }, { "cell_type": "code", @@ -263,7 +279,8 @@ "# Add a new column for text embedding\n", "alter_query = \"ALTER TABLE unstructured_data ADD text_embedding TEXT;\"\n", "cursor.execute(alter_query)" - ] + ], + "id": "c95bc511" }, { "cell_type": "code", @@ -284,7 +301,8 @@ "\n", "cnx.commit()\n", "cursor.close()" - ] + ], + "id": "00b7c77b" }, { "cell_type": "markdown", @@ -295,7 +313,8 @@ "The retrieval process begins by selecting the table and text embeddings from our database. We then calculate similarity scores using numpy's dot product function, comparing the user query embeddings with the document embeddings. This allows us to identify and select the top-5 most similar entries, which are most relevant to the user's query.\n", "\n", "Reference: [How the Dot Product Measures Similarity](https://tivadardanka.com/blog/how-the-dot-product-measures-similarity)" - ] + ], + "id": "fa56983d" }, { "cell_type": "code", @@ -307,7 +326,8 @@ "search_string = \"What are the emergency management provisions include?\"\n", "search_embedding = get_embedding(search_string, engine=\"text-embedding-ada-002\")\n", "search_embedding_array = np.array(search_embedding)" - ] + ], + "id": "35e10fa7" }, { "cell_type": "code", @@ -348,7 +368,8 @@ "\n", "# Display top-k records\n", "top_5" - ] + ], + "id": "876a636b" }, { "cell_type": "markdown", @@ -359,7 +380,8 @@ "In the final step, we take the top-5 most similar entries retrieved from the database and use them as input for OpenAI's ChatCompletion. The ChatCompletion model is designed for both multi-turn conversations and single-turn tasks. 
It takes a list of messages as input and returns a model-generated message as output, providing us with a coherent and contextually relevant response based on the retrieved documents.\n", "\n", "Reference: [OpenAI Chat Completions API Guide](https://platform.openai.com/docs/guides/gpt/chat-completions-api)" - ] + ], + "id": "bd06c2d8" }, { "cell_type": "code", @@ -388,11 +410,12 @@ " print(f\"OpenAI API call failed: {e}\")\n", "else:\n", " print(\"No relevant documents found.\")" - ] + ], + "id": "8a57d965" }, { + "id": "f034fab2", "cell_type": "markdown", - "id": "7728cc63-f722-456a-bdd3-fba7ec452282", "metadata": {}, "source": [ "
    \n", @@ -429,5 +452,5 @@ } }, "nbformat": 4, - "nbformat_minor": 4 + "nbformat_minor": 5 } diff --git a/notebooks/inserting-embeddings-from-multiple-models-into-singlestore-using-external-functions/notebook.ipynb b/notebooks/inserting-embeddings-from-multiple-models-into-singlestore-using-external-functions/notebook.ipynb index b83f7502..5c1b04a8 100644 --- a/notebooks/inserting-embeddings-from-multiple-models-into-singlestore-using-external-functions/notebook.ipynb +++ b/notebooks/inserting-embeddings-from-multiple-models-into-singlestore-using-external-functions/notebook.ipynb @@ -1,8 +1,8 @@ { "cells": [ { + "id": "c64a85c0", "cell_type": "markdown", - "id": "48bd8614-f726-44e2-94bd-050cb58a67a6", "metadata": {}, "source": [ "
    \n", @@ -19,7 +19,6 @@ { "attachments": {}, "cell_type": "markdown", - "id": "9591e358-2168-4e37-a839-4f02db584090", "metadata": {}, "source": [ "## Overview\n", @@ -44,22 +43,22 @@ "\n", "- **Storage Flexibility**: Using the `blob` type for embedding storage offers flexibility in handling multiple models within a single column but precludes the use of vector indices and ANN search capabilities.\n", "- **Efficiency and Specificity**: The `vector` type, along with vector indices, enables efficient ANN searches, necessitating separate columns for embeddings from different models due to dimension specification requirements." - ] + ], + "id": "8587f99f" }, { "cell_type": "markdown", - "id": "b68fe714-dc36-445b-85ff-69a74099a557", "metadata": {}, "source": [ "### Architecture diagram :\n", "
    \n", " \"Architecture\n", "
    " - ] + ], + "id": "1ab969e9" }, { "cell_type": "markdown", - "id": "26641499-1240-4c77-95f4-67d7ae79d66b", "metadata": {}, "source": [ "## Database Setup for Embeddings Demo\n", @@ -79,12 +78,12 @@ "- The `random_sentences` table serves as the input for our embedding generation process.\n", "- The `random_sentences_embeddings` table showcases how to store embeddings in a structured vector format, allowing for direct operations on embeddings within the database. This table leverages SingleStore's vector data type, enabling ordered collections of numeric values with fixed dimensions for embeddings.\n", "- The duplicated creation command for `random_sentences_embeddings_2` appears to be an oversight and should be considered for removal to avoid confusion." - ] + ], + "id": "9dc6f7a7" }, { "cell_type": "code", "execution_count": 1, - "id": "e4a7515c-62cb-41a8-8405-c48ed4f56f78", "metadata": {}, "outputs": [], "source": [ @@ -92,12 +91,12 @@ "DROP DATABASE IF EXISTS embeddings_demo;\n", "CREATE DATABASE embeddings_demo;\n", "USE embeddings_demo;" - ] + ], + "id": "3dca2da1" }, { "cell_type": "code", "execution_count": 2, - "id": "15b2e3b4-b16f-4afd-9ca7-273de3f86d9f", "metadata": {}, "outputs": [], "source": [ @@ -127,23 +126,23 @@ "INSERT INTO random_sentences VALUES (uuid(), 'Birds of a feather flock together', DEFAULT);\n", "\n", "INSERT INTO random_sentences VALUES (uuid(), 'Actions speak louder than words', DEFAULT);" - ] + ], + "id": "b00dc8a2" }, { "cell_type": "code", "execution_count": 3, - "id": "0e778407-fea2-4409-951f-cfec1419bda4", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "SELECT * FROM random_sentences;" - ] + ], + "id": "15fa51ed" }, { "cell_type": "code", "execution_count": 4, - "id": "c35e8c94-0e54-402c-a112-c8d2ae4ad219", "metadata": {}, "outputs": [], "source": [ @@ -154,12 +153,12 @@ " openai_ada002_embeddings VECTOR(1536),\n", " hf_miniLM_L12_v2_embeddings VECTOR(384)\n", ");" - ] + ], + "id": "b1bbc1d0" }, { "cell_type": "code", 
"execution_count": 5, - "id": "ed1f55dd-c450-49bc-97ca-2ba462efc778", "metadata": {}, "outputs": [], "source": [ @@ -170,11 +169,11 @@ " model_id VARCHAR(512),\n", " embedding BLOB\n", ");" - ] + ], + "id": "8499f4ae" }, { "cell_type": "markdown", - "id": "c0113780-b9f6-4a2b-978a-0f1ff32c2a03", "metadata": {}, "source": [ "## Inserting Embeddings into SingleStoreDB\n", @@ -191,12 +190,12 @@ "- **About External Functions**: External functions in SingleStoreDB allow for operations to be executed outside the database process, supporting both scalar and table-valued returns. This feature is instrumental in integrating machine learning insights into SQL workflows, enhancing data with vector embeddings from models like OpenAI or Hugging Face. Check more about external functions [here](https://docs.singlestore.com/cloud/reference/sql-reference/procedural-sql-reference/create-or-replace-external-function/)\n", "\n", "- **Code for external function**: To explore and test the demo, please navigate to the 'singlestore spaces' repository. Within the designated notebook's repository, you will discover the relevant code file `external_function_api.py`. This code is ready for use and has been set up to facilitate an interactive demonstration. For quicker access check **appendix at the end of the notebook**" - ] + ], + "id": "d3284473" }, { "cell_type": "code", "execution_count": 6, - "id": "b2a76d32-660a-451d-8cb8-5ba0862dbada", "metadata": {}, "outputs": [], "source": [ @@ -208,22 +207,22 @@ "\n", "-- Test external function\n", "-- SELECT get_embedding(\"blueberry\", 'openai_embedding')AS res ;" - ] + ], + "id": "89b12b02" }, { "cell_type": "markdown", - "id": "e85466fe-28cf-4beb-a54e-795f52e771be", "metadata": {}, "source": [ "### Dynamic Data Insertion with `InsertDynamicData_1`\n", "\n", "A stored procedure, `InsertDynamicData_1`, dynamically constructs and executes an SQL query to insert generated embeddings into the `random_sentences_embeddings_2` table. 
This process exemplifies the seamless integration of machine learning embeddings into database records, leveraging the `get_embedding` external function." - ] + ], + "id": "e3ac374e" }, { "cell_type": "code", "execution_count": 7, - "id": "826d018a-3b33-4319-93f5-efc8d8b68dc6", "metadata": {}, "outputs": [], "source": [ @@ -236,55 +235,55 @@ " sql_query = CONCAT('INSERT INTO ', target_table_id, ' (uuid, sentence, model_id, embedding) SELECT uuid, sentence, ''', embedding_model_id, ''' ,JSON_ARRAY_PACK(get_embedding(', source_column_id, ', ''', embedding_model_id, ''')) FROM ', source_table_id);\n", " EXECUTE IMMEDIATE sql_query;\n", "END;" - ] + ], + "id": "45e0fe48" }, { "cell_type": "code", "execution_count": 8, - "id": "9c1f4b0f-30bf-4c4f-aae9-052587340997", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "CALL InsertDynamicData_1('random_sentences', 'random_sentences_embeddings_2', 'sentence', 'openai_embedding');" - ] + ], + "id": "5877099a" }, { "cell_type": "code", "execution_count": 9, - "id": "e0f26928-c7b0-4202-9bb3-87ea1388e8cf", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "CALL InsertDynamicData_1('random_sentences', 'random_sentences_embeddings_2', 'sentence', 'hf_embedding');" - ] + ], + "id": "f4e49360" }, { "cell_type": "code", "execution_count": 10, - "id": "9a3070cd-130d-430c-bf66-4262ecdcce5f", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "select COUNT(*) from random_sentences_embeddings_2;" - ] + ], + "id": "c7c1299c" }, { "cell_type": "markdown", - "id": "c00f84e3-7cba-43ab-b990-c9416a9531a2", "metadata": {}, "source": [ "### Advanced Embedding Insertion with `InsertDynamicData_2`\n", "\n", "Another stored procedure, `InsertDynamicData_2`, is designed to handle multiple embedding models, inserting their outputs into designated vector columns within the `random_sentences_embeddings` table. 
This procedure illustrates the flexibility and power of SingleStoreDB in accommodating complex data types like vectors, directly derived from machine learning embeddings." - ] + ], + "id": "18ab6559" }, { "cell_type": "code", "execution_count": 11, - "id": "a7f8cfd0-7503-488e-b7c7-466378bc1109", "metadata": {}, "outputs": [], "source": [ @@ -304,42 +303,42 @@ " );\n", " EXECUTE IMMEDIATE sql_query;\n", "END;" - ] + ], + "id": "06099b7d" }, { "cell_type": "code", "execution_count": 12, - "id": "0cdce870-4233-4843-b875-c7e54c01c960", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "CALL InsertDynamicData_2('random_sentences', 'random_sentences_embeddings', 'sentence');" - ] + ], + "id": "8905b60c" }, { "cell_type": "code", "execution_count": 13, - "id": "208631e9-6273-4f47-83e1-4566f16fbc40", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "SELECT COUNT(*) FROM random_sentences_embeddings;" - ] + ], + "id": "753d43e2" }, { "cell_type": "markdown", - "id": "e9e1a82b-e95e-4be5-a9d0-52c9237f5336", "metadata": {}, "source": [ "### Lets try to query these embeddings to get matching score using dot_product" - ] + ], + "id": "a13b93df" }, { "cell_type": "code", "execution_count": 14, - "id": "7650cacb-a274-4f7b-9943-647e706162d9", "metadata": {}, "outputs": [], "source": [ @@ -354,21 +353,21 @@ " random_sentences_embeddings r2\n", "ORDER BY\n", " openai_score DESC, hf_score DESC;" - ] + ], + "id": "3f10b8b1" }, { "cell_type": "markdown", - "id": "38f2c7af", "metadata": {}, "source": [ "## Appendix\n", "Code for external function API." - ] + ], + "id": "981f1261" }, { "cell_type": "code", "execution_count": 15, - "id": "00bc005f", "metadata": {}, "outputs": [], "source": [ @@ -470,11 +469,12 @@ "\n", "if __name__ == '__main__':\n", " app.run(debug=True, host='0.0.0.0', port=5000)" - ] + ], + "id": "5b78282a" }, { + "id": "767b37d6", "cell_type": "markdown", - "id": "5362bb5a-cb27-4be3-bde7-5092f22f2b2b", "metadata": {}, "source": [ "
    \n", diff --git a/notebooks/insure-gpt-demo/notebook.ipynb b/notebooks/insure-gpt-demo/notebook.ipynb index f23656fb..11aa5aaa 100644 --- a/notebooks/insure-gpt-demo/notebook.ipynb +++ b/notebooks/insure-gpt-demo/notebook.ipynb @@ -1,6 +1,6 @@ { "nbformat": 4, - "nbformat_minor": 0, + "nbformat_minor": 5, "metadata": { "colab": { "provenance": [] @@ -15,8 +15,8 @@ }, "cells": [ { + "id": "2690a0d9", "cell_type": "markdown", - "id": "26270686-85fc-4a75-ad1f-2ca7f2311081", "metadata": {}, "source": [ "
    \n", @@ -57,7 +57,8 @@ "\n", "This notebook represents a practical application of AI in the insurance industry, offering a glimpse into the future of automated and data-driven claim processing. By leveraging the power of SingleStore, OpenAI, and LangChain, InsureGPT streamlines the analysis of complex data, making the claim handling process faster, more accurate, and more efficient." ], - "metadata": {} + "metadata": {}, + "id": "c36cf156" }, { "cell_type": "code", @@ -66,14 +67,16 @@ ], "metadata": {}, "execution_count": 1, - "outputs": [] + "outputs": [], + "id": "c3efeecc" }, { "cell_type": "markdown", "source": [ "In this section of the notebook, we prepare our environment by importing a set of Python libraries essential for processing and analyzing documents in the context of insurance claims. These libraries enable us to generate PDF documents, split text for easier processing, load and analyze text documents, and interact with OpenAI's powerful AI models for advanced text and image analysis. Additionally, we configure our environment to connect to a SingleStore database, a scalable SQL database that excels in real-time analytics, by setting the `SINGLESTOREDB_URL` environment variable. This setup is crucial for storing and retrieving vectorized document data efficiently. Finally, we initialize the OpenAI client with an API key, allowing us to leverage OpenAI's models for tasks such as analyzing images, processing text, and generating embeddings that can be stored in the SingleStore database for quick retrieval and analysis. This configuration forms the backbone of our document processing and analysis workflow, combining the capabilities of FPDF, LangChain, and OpenAI with the storage and retrieval efficiency of SingleStore." ], - "metadata": {} + "metadata": {}, + "id": "ac7ad6b6" }, { "cell_type": "code", @@ -92,7 +95,8 @@ "\n", "os.environ[\"SINGLESTOREDB_URL\"] = \"
    \n", diff --git a/notebooks/integrating-with-pandas/notebook.ipynb b/notebooks/integrating-with-pandas/notebook.ipynb index 50b40131..4bbed743 100644 --- a/notebooks/integrating-with-pandas/notebook.ipynb +++ b/notebooks/integrating-with-pandas/notebook.ipynb @@ -1,8 +1,8 @@ { "cells": [ { + "id": "077e8941", "cell_type": "markdown", - "id": "5c33f6a3-69f0-400d-813f-3889b1c08d2b", "metadata": {}, "source": [ "
    \n", @@ -17,8 +17,8 @@ ] }, { + "id": "b66a3e48", "cell_type": "markdown", - "id": "cedc7490", "metadata": {}, "source": [ "
    \n", @@ -32,7 +32,6 @@ }, { "cell_type": "markdown", - "id": "084012cb-df73-4887-a105-9d82e2755508", "metadata": {}, "source": [ "This notebook will show how to move data from a pandas `DataFrame` into SingleStoreDB as well\n", @@ -40,12 +39,12 @@ "is only intended for relatively small data sets and to do processing that can't otherwise\n", "be done in SingleStoreDB itself. Moving data to the client for processing should only be done\n", "when there is no other alternative in the database." - ] + ], + "id": "91ae5c41" }, { "cell_type": "code", "execution_count": 1, - "id": "9d6f398e-e63c-4477-ab37-fbdcdd2d92f0", "metadata": {}, "outputs": [], "source": [ @@ -53,11 +52,11 @@ "import pandas as pd\n", "import singlestoredb as s2\n", "import sqlalchemy as sa" - ] + ], + "id": "c93d6c92" }, { "cell_type": "markdown", - "id": "670c7343", "metadata": {}, "source": [ "
    \n", @@ -67,22 +66,22 @@ "

    If you have a Free Starter Workspace deployed already, select the database from drop-down menu at the top of this notebook. It updates the connection_url to connect to that database.

    \n", "
    \n", "
    " - ] + ], + "id": "a5969592" }, { "cell_type": "markdown", - "id": "3b83e421-0ae5-46b3-a64d-25d77de7da0c", "metadata": {}, "source": [ "## Create a database\n", "\n", "We need to create a database to work with in the following examples." - ] + ], + "id": "996e4e71" }, { "cell_type": "code", "execution_count": 2, - "id": "7b0ca2e7-52ec-4d97-b3e6-31ee4a2bd466", "metadata": {}, "outputs": [], "source": [ @@ -90,11 +89,11 @@ "if not shared_tier_check or shared_tier_check[0][1] == 'OFF':\n", " %sql DROP DATABASE IF EXISTS pandas_integration;\n", " %sql CREATE DATABASE pandas_integration;" - ] + ], + "id": "e4ad8232" }, { "cell_type": "markdown", - "id": "641ab7fd-a344-42f0-9cd9-755056374581", "metadata": {}, "source": [ "
    \n", @@ -105,11 +104,11 @@ " It updates the connection_url to connect to that database.

    \n", "
    \n", "
    " - ] + ], + "id": "6ce6ab53" }, { "cell_type": "markdown", - "id": "f094c4b2-dade-4432-9173-db590e7cb1dd", "metadata": {}, "source": [ "## Database connections\n", @@ -121,52 +120,52 @@ "\n", "In the following sections, we will connect to SingleStoreDB using each of the packages and demonstrate\n", "techniques for moving data between pandas and SingleStoreDB." - ] + ], + "id": "3be5a003" }, { "cell_type": "markdown", - "id": "a6ffcb91-b70b-4175-bad7-7d087483e66b", "metadata": {}, "source": [ "## The Iris data set\n", "\n", "We'll be using the Iris data set for the following examples. This data set includes five columns: `sepal_length`,\n", "`sepal_width`, `petal_length`, `petal_width` and `class`." - ] + ], + "id": "7207b236" }, { "cell_type": "code", "execution_count": 3, - "id": "68129a0f-7bb7-4c1b-bdab-dbb5c8cb43a3", "metadata": {}, "outputs": [], "source": [ "iris = pd.read_csv('https://raw.githubusercontent.com/singlestore-labs/spaces-notebooks/master/notebooks/integrating-with-pandas/data/iris.csv')\n", "iris" - ] + ], + "id": "f752e994" }, { "cell_type": "markdown", - "id": "ff5ef64c-1046-4db0-bb96-dd976c42db39", "metadata": {}, "source": [ "As you can see below, the first four columns are floats and the last column is a string\n", "(represented as an `object` in `DataFrame`s)." - ] + ], + "id": "eb65b562" }, { "cell_type": "code", "execution_count": 4, - "id": "22392542-a71a-484b-9a3a-7e767d3bad07", "metadata": {}, "outputs": [], "source": [ "iris.info()" - ] + ], + "id": "a88d4199" }, { "cell_type": "markdown", - "id": "8ea1fa0c-247e-4aaa-98a1-0980921fcaf7", "metadata": {}, "source": [ "## Moving data between SingleStoreDB and pandas `DataFrame`s\n", @@ -179,53 +178,53 @@ "absolutely needed since this can be a major bottleneck when working with and analyzing data. The hope is that\n", "the features of SingleStoreDB are sufficient enough to alleviate the need to do much processing (if any) on\n", "the client machine." 
- ] + ], + "id": "ad1429cf" }, { "cell_type": "markdown", - "id": "6d88c47a-7416-4566-b026-3e232afa7bc7", "metadata": {}, "source": [ "### SingleStoreDB Python\n", "\n", "The core library is the SingleStoreDB Python package. This is the package that all other SingleStoreDB\n", "packages are built on. To connect, simply call the `connect` function." - ] + ], + "id": "93bdb639" }, { "cell_type": "code", "execution_count": 5, - "id": "a36c4a90-0aeb-402a-8454-764730ac4a08", "metadata": {}, "outputs": [], "source": [ "s2_conn = s2.connect()" - ] + ], + "id": "81f49c90" }, { "cell_type": "markdown", - "id": "9e54271f-e185-4965-bfaf-2639c304d503", "metadata": {}, "source": [ "Since the core library is a fairly low-level interface to SingleStoreDB, most operations are done simply by sending\n", "SQL code." - ] + ], + "id": "4ba44e4c" }, { "cell_type": "markdown", - "id": "46f57610-87bd-45c8-84bd-e77c9b313527", "metadata": {}, "source": [ "#### Creating a table\n", "\n", "Because we are using a low-level driver, creating a table is just done using SQL code. We'll use the information\n", "about the data set above to construct a `CREATE TABLE` statement." - ] + ], + "id": "7d1e729f" }, { "cell_type": "code", "execution_count": 6, - "id": "4dc666be-0ce7-46bb-9afe-19f3435f02ff", "metadata": {}, "outputs": [], "source": [ @@ -239,11 +238,11 @@ " class TEXT\n", " )\n", "''')" - ] + ], + "id": "23e9fccd" }, { "cell_type": "markdown", - "id": "29c23826-1862-48ec-a837-a3e25ea52362", "metadata": {}, "source": [ "#### Upload the data from a `DataFrame`\n", @@ -252,12 +251,12 @@ "SQL statements to do this. The Python client can execute single SQL statements using the\n", "`execute` method as used above, but since we are uploading multiple rows of data it is better\n", "to use the `executemany` method since it is optimized for this purpose." 
- ] + ], + "id": "87533afe" }, { "cell_type": "code", "execution_count": 7, - "id": "c7347b35-f82d-4bf4-bc7d-75a1ecd478e2", "metadata": {}, "outputs": [], "source": [ @@ -272,43 +271,43 @@ "\n", "# Execute the INSERT statement\n", "s2_cur.executemany(f'INSERT INTO iris({cols}) VALUES ({values})', data)" - ] + ], + "id": "0e10f6c9" }, { "cell_type": "markdown", - "id": "f4846819-8043-4917-8b37-20ed4b4ab7ab", "metadata": {}, "source": [ "We can select a sample of the rows to see that the data is now in SingleStoreDB." - ] + ], + "id": "07f5462c" }, { "cell_type": "code", "execution_count": 8, - "id": "801a8262-375a-40fc-8b2c-c15a8cee61a7", "metadata": {}, "outputs": [], "source": [ "s2_cur.execute('SELECT * FROM iris LIMIT 10')\n", "for row in s2_cur:\n", " print(row)" - ] + ], + "id": "84cb68a6" }, { "cell_type": "markdown", - "id": "ff3de53a-76a8-44f5-a999-364692df4601", "metadata": {}, "source": [ "#### Downloading the data to a `DataFrame`\n", "\n", "We can download the data to a pandas `DataFrame` simply by selecting all columns of data,\n", "fetching all of the rows, and passing them to the `DataFrame` constructor." - ] + ], + "id": "1c1169c4" }, { "cell_type": "code", "execution_count": 9, - "id": "de9bf4d0-72be-4f70-9efa-129b041acba6", "metadata": {}, "outputs": [], "source": [ @@ -319,40 +318,40 @@ "\n", "s2_iris_df = pd.DataFrame(list(s2_cur), columns=names)\n", "s2_iris_df" - ] + ], + "id": "95418be9" }, { "cell_type": "code", "execution_count": 10, - "id": "ff2f9043-a940-43dc-974a-162e8bd2d576", "metadata": {}, "outputs": [], "source": [ "s2_iris_df.info()" - ] + ], + "id": "639e4ac8" }, { "cell_type": "markdown", - "id": "809ccd96-f00b-49ff-b19c-48d9b85c7e3f", "metadata": {}, "source": [ "Now that we have demonstrated uploading and downloading data from a pandas `DataFrame` using the\n", "SingleStoreDB Python client, we can drop the table and move on to SQLAlchemy." 
- ] + ], + "id": "6d6c7516" }, { "cell_type": "code", "execution_count": 11, - "id": "8ef3cbb3-0f01-47c4-b27f-0dcf5729e7cd", "metadata": {}, "outputs": [], "source": [ "s2_cur.execute('DROP TABLE IF EXISTS iris')" - ] + ], + "id": "e21589b8" }, { "cell_type": "markdown", - "id": "75788e0a-df76-4be7-8ced-d01fdc37d7b6", "metadata": {}, "source": [ "### SQLAlchemy\n", @@ -363,22 +362,22 @@ "function that does the same thing, however, it extends the default ability by allow you to use the\n", "`SINGLESTOREDB_URL` environment variable as the connection string so that no parameters are needed for\n", "`create_engine` when used in the notebooks environment." - ] + ], + "id": "29f17ae8" }, { "cell_type": "code", "execution_count": 12, - "id": "ea6ac741-7c01-4b34-95c8-59d97db70cab", "metadata": {}, "outputs": [], "source": [ "sa_eng = s2.create_engine()\n", "sa_conn = sa_eng.connect()" - ] + ], + "id": "01474892" }, { "cell_type": "markdown", - "id": "a0f0463a-9796-4f23-8081-6589fce6463d", "metadata": {}, "source": [ "#### Uploading the data from a `DataFrame`\n", @@ -387,49 +386,49 @@ "The pandas library itself has the ability to communicate with SingleStoreDB using a SQLAlchemy connection.\n", "In this case, the `DataFrame` can create the table and populate it in one step using the `to_sql` method.\n", "The `to_sql` method has various options to modify its behavior [documented on the pandas web site](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_sql.html)." - ] + ], + "id": "29da1422" }, { "cell_type": "code", "execution_count": 13, - "id": "de2c953c-09b9-4aa9-b491-751929322c19", "metadata": {}, "outputs": [], "source": [ "iris.to_sql('iris', con=sa_conn, index=False, if_exists='replace')" - ] + ], + "id": "0cde99aa" }, { "cell_type": "markdown", - "id": "e4574813-c134-4c0a-8915-94c0c466863f", "metadata": {}, "source": [ "We can verify the data is in SingleStoreDB with a simple `SELECT` statement." 
- ] + ], + "id": "152cdb79" }, { "cell_type": "code", "execution_count": 14, - "id": "6068dabe-5347-4449-926d-f52c0fb58b13", "metadata": {}, "outputs": [], "source": [ "for row in sa_conn.execute(sa.text('SELECT * FROM iris LIMIT 10')):\n", " print(row)" - ] + ], + "id": "8b6d0dcd" }, { "cell_type": "markdown", - "id": "75b0a1ee-8c21-4b8a-856e-643485661bdf", "metadata": {}, "source": [ "It is also possible to use SQLAlchemy expressions to query the table rather than raw SQL strings." - ] + ], + "id": "f052d204" }, { "cell_type": "code", "execution_count": 15, - "id": "60e13489-3d70-48c1-b17b-baebb0d0543c", "metadata": {}, "outputs": [], "source": [ @@ -450,23 +449,23 @@ "# Print results\n", "for row in sa_conn.execute(query):\n", " print(row)" - ] + ], + "id": "d26c240a" }, { "cell_type": "markdown", - "id": "0db80753-98a7-4748-83fe-1190cd04415f", "metadata": {}, "source": [ "#### Downloading the data to a `DataFrame`\n", "\n", "Downloading data to a pandas `DataFrame` is very simple. The result of the `execute` method can\n", "be passed directly to the pandas `DataFrame` constructor." - ] + ], + "id": "ec8c09ec" }, { "cell_type": "code", "execution_count": 16, - "id": "6053dcf8-8774-4e73-a5d7-2a86bb597569", "metadata": {}, "outputs": [], "source": [ @@ -475,59 +474,59 @@ "\n", "sa_iris_df = pd.DataFrame(sa_conn.execute(query))\n", "sa_iris_df" - ] + ], + "id": "6c3a786f" }, { "cell_type": "code", "execution_count": 17, - "id": "5c2a5a7b-a00d-48b5-8e8b-b19e08d5d792", "metadata": {}, "outputs": [], "source": [ "sa_iris_df.info()" - ] + ], + "id": "dd9018f4" }, { "cell_type": "markdown", - "id": "a8977c4c-a0a8-4299-b7bb-d1c03e386169", "metadata": {}, "source": [ "It is also possible to use `pd.read_sql` to bookend the use of `df.to_sql`." 
- ] + ], + "id": "40831a8d" }, { "cell_type": "code", "execution_count": 18, - "id": "a200df59-efb2-42e6-bca5-1b115666fb2d", "metadata": {}, "outputs": [], "source": [ "sa_iris_df = pd.read_sql(query, con=sa_conn)\n", "sa_iris_df" - ] + ], + "id": "9a647b1a" }, { "cell_type": "markdown", - "id": "c1dd2d93-f330-4e3a-b39a-32accc3d1a60", "metadata": {}, "source": [ "Now that we have demonstrated using SQLAlchemy to upload and download pandas `DataFrames` we can drop\n", "the table and move on to Ibis." - ] + ], + "id": "d7f11bcf" }, { "cell_type": "code", "execution_count": 19, - "id": "65eb886b-cdb6-4dae-bfaf-57a03d23cd51", "metadata": {}, "outputs": [], "source": [ "sa_iris.drop(bind=sa_eng)" - ] + ], + "id": "d70972af" }, { "cell_type": "markdown", - "id": "5693eca8-fa3c-46ad-94fb-73d02ca50478", "metadata": {}, "source": [ "### Ibis (SingleStoreDB DataFrame)\n", @@ -535,21 +534,21 @@ "The Ibis package allows you to treat tables in SingleStoreDB as `DataFrames`. The `DataFrame` expressions\n", "are used to build lazy expressions which generate SQL statements that get submitted to SingleStoreDB\n", "only when you want to see the results of a query. Ibis using SQLAlchemy connections behind-the-scenes." - ] + ], + "id": "f32118fe" }, { "cell_type": "code", "execution_count": 20, - "id": "686b8296-d50e-4a0b-b464-8b7807417f79", "metadata": {}, "outputs": [], "source": [ "ibis_conn = ibis.singlestoredb.connect()" - ] + ], + "id": "4adac2b3" }, { "cell_type": "markdown", - "id": "254fd629-e321-434f-a257-ccc3874ac93b", "metadata": {}, "source": [ "#### Uploading the data from a `DataFrame`\n", @@ -565,146 +564,146 @@ "In this case, the `ibis_iris` object is a `DataFrame`-like object that is lazily constructing\n", "the requested expression until `execute` is called on it. 
In the case of this example, uploading\n", "and downloading" - ] + ], + "id": "6e6ca747" }, { "cell_type": "code", "execution_count": 21, - "id": "0ca99ab2-bd93-41da-98f7-82c0d9934f43", "metadata": {}, "outputs": [], "source": [ "ibis_iris = ibis_conn.create_table('iris', iris, overwrite=True)\n", "ibis_iris.limit(10).execute()" - ] + ], + "id": "c863ae1f" }, { "cell_type": "markdown", - "id": "49bb8050-a6f4-4a6d-b083-78b324ebc755", "metadata": {}, "source": [ "It is also possible to insert the data from a `DataFrame` into an existing table using the `insert` method\n", "of the connection." - ] + ], + "id": "47893242" }, { "cell_type": "code", "execution_count": 22, - "id": "d2841e5b-a8c9-41e3-a404-25d532e831d2", "metadata": {}, "outputs": [], "source": [ "ibis_conn.insert('iris', iris)" - ] + ], + "id": "9ca1c807" }, { "cell_type": "markdown", - "id": "60cc0488-04ae-4c8b-9641-f6ef07133ac8", "metadata": {}, "source": [ "You'll see that we now have 300 rows since we've inserted the data twice." - ] + ], + "id": "8f262819" }, { "cell_type": "code", "execution_count": 23, - "id": "bc49e1ac-25b3-49e6-a257-c2bf2ace5be7", "metadata": {}, "outputs": [], "source": [ "ibis_iris.count().execute()" - ] + ], + "id": "1eb12dfb" }, { "cell_type": "markdown", - "id": "125cc52e-37fe-4c9f-a573-c81f6363ff36", "metadata": {}, "source": [ "One way to see the SQL that gets submitted during `execute` is to compile the expression\n", "and print it. Ibis also has a options to display SQL queries as they are submitted." - ] + ], + "id": "b4dc87a4" }, { "cell_type": "code", "execution_count": 24, - "id": "a1e4ccba-bec5-451c-9fd1-4b0ba87e5a74", "metadata": {}, "outputs": [], "source": [ "print(ibis_iris.compile())" - ] + ], + "id": "8bb66f21" }, { "cell_type": "markdown", - "id": "11e9fa6e-bb62-488c-a530-835ebd41d323", "metadata": {}, "source": [ "The information about the table can be retrieved much like in a local pandas `DataFrame`." 
- ] + ], + "id": "8df50578" }, { "cell_type": "code", "execution_count": 25, - "id": "2bfbb75b-ef83-4400-9909-348a12ac83cd", "metadata": {}, "outputs": [], "source": [ "ibis_iris.info().execute()" - ] + ], + "id": "5f07644b" }, { "cell_type": "markdown", - "id": "1bb2e364-b2c2-4c07-81ea-3e20dec1c723", "metadata": {}, "source": [ "#### Downloading the data from a `DataFrame`\n", "\n", "The output from evaluating Ibis expressions returns a `DataFrame`, so we have already demonstrated\n", "downloading data, but here is the code again." - ] + ], + "id": "af4a40bb" }, { "cell_type": "code", "execution_count": 26, - "id": "b9e14b01-9350-4560-81f9-cf68f4e105f5", "metadata": {}, "outputs": [], "source": [ "ibis_iris_df = ibis_iris.execute()\n", "ibis_iris_df" - ] + ], + "id": "50b54dd8" }, { "cell_type": "markdown", - "id": "e399be8a-6c5b-46cd-9c6b-7b4d02a36e57", "metadata": {}, "source": [ "Ibis `Table`s also have a `to_pandas` method." - ] + ], + "id": "9e3fc598" }, { "cell_type": "code", "execution_count": 27, - "id": "c6623479-e200-4575-ae12-e415feb11237", "metadata": {}, "outputs": [], "source": [ "ibis_iris.to_pandas()" - ] + ], + "id": "c09411e6" }, { "cell_type": "markdown", - "id": "15feee4b-746b-4333-aabf-c8c061db9bb0", "metadata": {}, "source": [ "If you do not have an Ibis object reference to a table already, you can get one using the `table` method\n", "or `tables` attribute of the Ibis connection." 
- ] + ], + "id": "097435fc" }, { "cell_type": "code", "execution_count": 28, - "id": "b1479ee9-10c6-47ac-b54e-3b3032c04949", "metadata": {}, "outputs": [], "source": [ @@ -713,40 +712,40 @@ "\n", "# This form can be used if the table name is a valid Python variable name\n", "ibis_iris = ibis_conn.tables.iris" - ] + ], + "id": "3c48ca90" }, { "cell_type": "code", "execution_count": 29, - "id": "eec644ad-61e1-4ab7-9cb0-edf0ed2a1f81", "metadata": {}, "outputs": [], "source": [ "ibis_iris.limit(10).execute()" - ] + ], + "id": "e1c8a0c1" }, { "cell_type": "markdown", - "id": "ef002757-d84c-4da2-a61c-48c424569261", "metadata": {}, "source": [ "We have demonstrated both uploading and downloading pandas `DataFrames` using Ibis, so\n", "we can drop the table now." - ] + ], + "id": "16c307d7" }, { "cell_type": "code", "execution_count": 30, - "id": "e96373d5-651c-4047-bda7-3e49bf6edb7a", "metadata": {}, "outputs": [], "source": [ "ibis_conn.drop_table('iris')" - ] + ], + "id": "ba397ac3" }, { "cell_type": "markdown", - "id": "d885cce5-1ae9-43e1-ae13-6b98c43ca23b", "metadata": {}, "source": [ "### `%%sql` and `%sql` magic commands\n", @@ -756,22 +755,22 @@ "and `%sqlplot` commands. These work in conjunction with SQLAlchemy to allow you to type SQL code in\n", "the notebook cells. They also have ways of integrating with pandas. The notebook environment automatically\n", "sets up the connection string for use in these commands." - ] + ], + "id": "7c3a31cc" }, { "cell_type": "markdown", - "id": "a474e845-0039-4625-a250-2ec5cf0adaaa", "metadata": {}, "source": [ "#### Creating a table\n", "\n", "Creating a table with the `%%sql` command is done simply by submitting the `CREATE TABLE` statement." 
- ] + ], + "id": "8667929d" }, { "cell_type": "code", "execution_count": 31, - "id": "db10dca0-b2c5-43c9-8dcc-1f520fac7efb", "metadata": {}, "outputs": [], "source": [ @@ -784,11 +783,11 @@ " petal_width DOUBLE,\n", " class TEXT\n", ");" - ] + ], + "id": "138f111b" }, { "cell_type": "markdown", - "id": "f0f53739-6d19-4be8-ae08-c5dca35acfef", "metadata": {}, "source": [ "#### Uploading the data from a `DataFrame`\n", @@ -797,105 +796,105 @@ "will create a table in the database and upload the data. The `--append` option will append data to an\n", "existing table. In this case, the name used for the `DataFrame` variable is used for the table name\n", "in SingleStoreDB." - ] + ], + "id": "baa7f8a7" }, { "cell_type": "code", "execution_count": 32, - "id": "8839c965-880d-4909-88a0-b7fa8cb4723d", "metadata": {}, "outputs": [], "source": [ "%sql --append --no-index iris" - ] + ], + "id": "d03da00a" }, { "cell_type": "code", "execution_count": 33, - "id": "f11d4a80-9555-4067-9ada-2e4cf3ead35f", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "SELECT * FROM iris LIMIT 10;" - ] + ], + "id": "db454011" }, { "cell_type": "markdown", - "id": "da339a4c-84a6-4f7d-a802-4fdd9f40d2b9", "metadata": {}, "source": [ "#### Downloading the data from a `DataFrame`\n", "\n", "There are a few ways of getting data from SingleStoreDB into a `DataFrame` using the SQL magic commands.\n", "The first is to use the `%sql` command and convert the result manually." - ] + ], + "id": "db91e707" }, { "cell_type": "code", "execution_count": 34, - "id": "4198f8b6-175d-4a16-963e-af7508e06487", "metadata": {}, "outputs": [], "source": [ "out = %sql SELECT * FROM iris\n", "sql_iris_df = out.DataFrame()\n", "sql_iris_df" - ] + ], + "id": "0bba558a" }, { "cell_type": "markdown", - "id": "c5978394-d1fe-4c24-a3bd-9b8ce9583dcc", "metadata": {}, "source": [ "You can also pass the result of the query to the `DataFrame` constructor." 
- ] + ], + "id": "19adfd90" }, { "cell_type": "code", "execution_count": 35, - "id": "4f008d44-efde-415c-b847-a7d351ac8157", "metadata": {}, "outputs": [], "source": [ "sql_iris_df = pd.DataFrame(out)\n", "sql_iris_df" - ] + ], + "id": "36eabe0c" }, { "cell_type": "markdown", - "id": "01076d35-e406-453a-88c5-86f70b98e55e", "metadata": {}, "source": [ "Finally, the output of the `%%sql` command can be stored to a variable which can then be\n", "converted to a `DataFrame` in the same manner as above." - ] + ], + "id": "6656b59a" }, { "cell_type": "code", "execution_count": 36, - "id": "2bb36229-b336-462f-977f-36679d40f50f", "metadata": {}, "outputs": [], "source": [ "%%sql result <<\n", "SELECT * FROM iris;" - ] + ], + "id": "5da05c62" }, { "cell_type": "code", "execution_count": 37, - "id": "2094aa36-86bf-469b-9322-a9f94877f954", "metadata": {}, "outputs": [], "source": [ "sql_iris_df = pd.DataFrame(result)\n", "sql_iris_df" - ] + ], + "id": "35f51470" }, { "cell_type": "markdown", - "id": "285fcfbb-5487-4195-9240-8af8462115fe", "metadata": {}, "source": [ "##### Automatically return pandas `DataFrame`s\n", @@ -903,62 +902,62 @@ "The other option for getting `DataFrame`s as the result of the SQL magic commands is to enable\n", "the `SqlMagic.autopandas` option. This will cause all results from SQL magic commands to be\n", "converted to `DataFrame`s without any intervention." 
- ] + ], + "id": "782ec035" }, { "cell_type": "code", "execution_count": 38, - "id": "f19aee9c-fe28-4c0b-a902-c094ea76cfd1", "metadata": {}, "outputs": [], "source": [ "%config SqlMagic.autopandas = True" - ] + ], + "id": "57227c30" }, { "cell_type": "code", "execution_count": 39, - "id": "6caba2b5-86b4-4abf-9063-e59bbac2aab3", "metadata": {}, "outputs": [], "source": [ "out = %sql SELECT * FROM iris\n", "out" - ] + ], + "id": "e12fdd61" }, { "cell_type": "code", "execution_count": 40, - "id": "129603ca-b982-4b77-9967-8c0fe0520f45", "metadata": {}, "outputs": [], "source": [ "type(out)" - ] + ], + "id": "c7d03adf" }, { "cell_type": "markdown", - "id": "929f83b9-bb1c-4d6f-9a5e-fdfc8db0db7e", "metadata": {}, "source": [ "Now that we have demonstrated uploading and downloading of `DataFrame`s using the SQL magic commands,\n", "we can reset the configuration options and drop the table." - ] + ], + "id": "061b6939" }, { "cell_type": "code", "execution_count": 41, - "id": "32a14f6f-7afa-442c-8e58-c317bccdbcb9", "metadata": {}, "outputs": [], "source": [ "%config SqlMagic.autopandas = False\n", "%sql DROP TABLE IF EXISTS iris" - ] + ], + "id": "0f8f59c5" }, { "cell_type": "markdown", - "id": "6a832062-3934-4541-81b5-5c3cf0c117ac", "metadata": {}, "source": [ "## Conclusion\n", @@ -974,23 +973,24 @@ "We have shown how to upload and download data from a pandas `DataFrame` to and from SingleStoreDB\n", "using the SingleStoreDB Python client, SQLAlchemy, and Ibis. These techniques should enable you to\n", "integrate your pandas workflows with SingleStoreDB." 
- ] + ], + "id": "b081f241" }, { "cell_type": "code", "execution_count": 42, - "id": "6d6f10e5-7605-485b-82f3-7a26a17ae912", "metadata": {}, "outputs": [], "source": [ "shared_tier_check = %sql show variables like 'is_shared_tier'\n", "if not shared_tier_check or shared_tier_check[0][1] == 'OFF':\n", " %sql DROP DATABASE IF EXISTS pandas_integration;" - ] + ], + "id": "24af14ce" }, { + "id": "1931035f", "cell_type": "markdown", - "id": "dca02e68-11a8-46b9-b2eb-35f466d0c96e", "metadata": {}, "source": [ "
    \n", diff --git a/notebooks/kebab-case/notebook.ipynb b/notebooks/kebab-case/notebook.ipynb index f7ac628e..36c1c5da 100644 --- a/notebooks/kebab-case/notebook.ipynb +++ b/notebooks/kebab-case/notebook.ipynb @@ -1,8 +1,8 @@ { "cells": [ { + "id": "9c00b945", "cell_type": "markdown", - "id": "40e5ffee-7dbd-4148-a58c-fa3b014590b7", "metadata": {}, "source": [ "
    \n", @@ -18,16 +18,15 @@ }, { "cell_type": "markdown", - "id": "68a6ae45-1ff5-475b-ad42-d724adf998c6", "metadata": {}, "source": [ "## Step 0. Import libraries & create connection to database" - ] + ], + "id": "bd2b3254" }, { "cell_type": "code", "execution_count": 1, - "id": "d5b77e41-852a-4610-a3c9-052e684dd3f1", "metadata": {}, "outputs": [], "source": [ @@ -35,38 +34,38 @@ "import singlestoredb as s2\n", "s2_conn = s2.connect()\n", "s2_cur = s2_conn.cursor()" - ] + ], + "id": "27973c95" }, { "cell_type": "markdown", - "id": "882e7736-aa5a-4f89-aa11-9ad907376274", "metadata": {}, "source": [ "## Step 1. Specify which database you want to cache" - ] + ], + "id": "46be6948" }, { "cell_type": "code", "execution_count": 2, - "id": "2db88756-4303-4019-ba69-e065d9c1aa72", "metadata": {}, "outputs": [], "source": [ "database_name = input('Enter database name:')" - ] + ], + "id": "9590c72f" }, { "cell_type": "markdown", - "id": "9f31256d-495c-4743-95b8-1a7a3fb8b29a", "metadata": {}, "source": [ "## Step 2. Get a list of the columnstore table names in your database" - ] + ], + "id": "9d80057f" }, { "cell_type": "code", "execution_count": 3, - "id": "e08f3465-d454-4dd1-b93f-e3337e030f4f", "metadata": {}, "outputs": [], "source": [ @@ -74,20 +73,20 @@ "result = s2_cur.execute(query)\n", "result_df = pd.DataFrame(list(s2_cur))\n", "list_of_tables = result_df[[0]].values" - ] + ], + "id": "bd63ec02" }, { "cell_type": "markdown", - "id": "f6e3091f-4e77-4e2c-b397-95695adf3e2b", "metadata": {}, "source": [ "## Step 3. 
Cache columnar files" - ] + ], + "id": "c6f080b3" }, { "cell_type": "code", "execution_count": 4, - "id": "fc670353-5d5a-4d47-8d4f-a6637b741a37", "metadata": {}, "outputs": [], "source": [ @@ -108,20 +107,20 @@ "# run column file warm up queries\n", "for query in final_column_df[['query_text']].values:\n", " s2_cur.execute(\"\"\" {} \"\"\".format(query[0]))" - ] + ], + "id": "af25d90b" }, { "cell_type": "markdown", - "id": "2eea9509-b8ef-4004-84b2-a92cb948332d", "metadata": {}, "source": [ "## Step 4. Cache index files" - ] + ], + "id": "3295c1ab" }, { "cell_type": "code", "execution_count": 5, - "id": "0e087126-c17d-43b4-bb50-59a1eee61357", "metadata": {}, "outputs": [], "source": [ @@ -137,11 +136,12 @@ "# run index file warm up queries\n", "for query in index_queries_df.values:\n", " s2_cur.execute(\"\"\" {} \"\"\".format(query[0]))" - ] + ], + "id": "1bf9836c" }, { + "id": "3921cac6", "cell_type": "markdown", - "id": "90aea1ad-d9e8-49a8-bbb9-263803bab91f", "metadata": {}, "source": [ "
    \n", diff --git a/notebooks/launch-open-source-apps-with-langchain/notebook.ipynb b/notebooks/launch-open-source-apps-with-langchain/notebook.ipynb index a43452c8..a858e9cc 100644 --- a/notebooks/launch-open-source-apps-with-langchain/notebook.ipynb +++ b/notebooks/launch-open-source-apps-with-langchain/notebook.ipynb @@ -1,8 +1,8 @@ { "cells": [ { + "id": "37ed22b4", "cell_type": "markdown", - "id": "7d9894c5-8938-4790-8acf-44882f2d3391", "metadata": {}, "source": [ "
    \n", @@ -19,7 +19,6 @@ { "cell_type": "code", "execution_count": 1, - "id": "be57e1dd-8030-4551-81bb-d25b1017188b", "metadata": {}, "outputs": [], "source": [ @@ -36,22 +35,22 @@ "opencv-contrib-python-headless==4.8.1.78\n", "unstructured.pytesseract==0.3.12\n", "unstructured.inference==0.7.15" - ] + ], + "id": "b2e2f099" }, { "cell_type": "code", "execution_count": 2, - "id": "68262edc-b67d-4b16-9486-7daef687af80", "metadata": {}, "outputs": [], "source": [ "%pip install -r requirements.txt --quiet" - ] + ], + "id": "5ddcf19c" }, { "cell_type": "code", "execution_count": 3, - "id": "b21ec610-517c-4fcc-bd19-217b2675c8a9", "metadata": {}, "outputs": [], "source": [ @@ -60,12 +59,12 @@ "loader = OnlinePDFLoader(\"http://leavcom.com/pdf/DBpdf.pdf\")\n", "\n", "data = loader.load()" - ] + ], + "id": "04c9106d" }, { "cell_type": "code", "execution_count": 4, - "id": "a40099c8-f859-4707-a3c9-635bbc693806", "metadata": {}, "outputs": [], "source": [ @@ -73,12 +72,12 @@ "\n", "print (f\"You have {len(data)} document(s) in your data\")\n", "print (f\"There are {len(data[0].page_content)} characters in your document\")" - ] + ], + "id": "d47825e1" }, { "cell_type": "code", "execution_count": 5, - "id": "dd303fc1-d02b-460d-a81e-3a313cab1a59", "metadata": {}, "outputs": [], "source": [ @@ -86,24 +85,24 @@ "texts = text_splitter.split_documents(data)\n", "\n", "print (f\"You have {len(texts)} pages\")" - ] + ], + "id": "d9ea31be" }, { "cell_type": "code", "execution_count": 6, - "id": "5c05e57c-3300-4c1a-867c-383c621f2b94", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "DROP DATABASE IF EXISTS pdf_db;\n", "CREATE DATABASE IF NOT EXISTS pdf_db;" - ] + ], + "id": "248f5664" }, { "attachments": {}, "cell_type": "markdown", - "id": "c779893e-fb64-4fa5-b5de-69778354fbf4", "metadata": {}, "source": [ "
    \n", @@ -113,12 +112,12 @@ "

    Make sure to select the pdf_db database from the drop-down menu at the top of this notebook. It updates the connection_url to connect to that database.

    \n", "
    \n", "
    " - ] + ], + "id": "8f867e9b" }, { "cell_type": "code", "execution_count": 7, - "id": "0d2c027b-883b-4e2b-9594-b5c4cc41af24", "metadata": {}, "outputs": [], "source": [ @@ -129,12 +128,12 @@ " content TEXT CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci,\n", " vector BLOB\n", ");" - ] + ], + "id": "ec83a39e" }, { "cell_type": "code", "execution_count": 8, - "id": "df0283f6-66fa-4b2e-bf5c-024864896c28", "metadata": {}, "outputs": [], "source": [ @@ -142,12 +141,12 @@ "import getpass\n", "\n", "os.environ[\"OPENAI_API_KEY\"] = getpass.getpass(\"OpenAI API Key:\")" - ] + ], + "id": "bc9922c9" }, { "cell_type": "code", "execution_count": 9, - "id": "5b5d8f1c-e9a1-4010-bd77-fc209b52bdb6", "metadata": {}, "outputs": [], "source": [ @@ -182,12 +181,12 @@ "\"\"\")\n", "\n", "conn.execute(stmt, params)" - ] + ], + "id": "067cb1db" }, { "cell_type": "code", "execution_count": 10, - "id": "0226773e-b1eb-4e06-a92e-918c375b15a1", "metadata": {}, "outputs": [], "source": [ @@ -195,12 +194,12 @@ "SELECT JSON_ARRAY_UNPACK_F32(vector) as vector\n", "FROM pdf_docs1\n", "LIMIT 1;" - ] + ], + "id": "954fdab4" }, { "cell_type": "code", "execution_count": 11, - "id": "5597f268-7dc2-4453-a5b1-e40fb211d0e8", "metadata": {}, "outputs": [], "source": [ @@ -221,12 +220,12 @@ "\n", "for row in results:\n", " print(row[0])" - ] + ], + "id": "41b9e67a" }, { "cell_type": "code", "execution_count": 12, - "id": "a469ffbd-5084-4c5a-b961-a8c908c391ce", "metadata": {}, "outputs": [], "source": [ @@ -245,31 +244,32 @@ ")\n", "\n", "print(response.choices[0].message.content)" - ] + ], + "id": "2859d902" }, { "attachments": {}, "cell_type": "markdown", - "id": "332b068b-97a2-400c-a88d-961280adcac0", "metadata": {}, "source": [ "## Clean up" - ] + ], + "id": "2b032231" }, { "cell_type": "code", "execution_count": 13, - "id": "7ade2f7b-af30-4e58-a493-735bf1cec213", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "DROP DATABASE IF EXISTS pdf_db" - ] + ], + "id": "e7942064" }, { + "id": 
"fc7092b3", "cell_type": "markdown", - "id": "f1ce8da7-0868-47fd-8585-4777d26f3adc", "metadata": {}, "source": [ "
    \n", diff --git a/notebooks/load-csv-data-s3-placeholder/notebook.ipynb b/notebooks/load-csv-data-s3-placeholder/notebook.ipynb index 251a19ab..8b227e7f 100644 --- a/notebooks/load-csv-data-s3-placeholder/notebook.ipynb +++ b/notebooks/load-csv-data-s3-placeholder/notebook.ipynb @@ -1,8 +1,8 @@ { "cells": [ { + "id": "e82b93c8", "cell_type": "markdown", - "id": "b5730dcd-9402-4a04-baef-e6406f49c3dd", "metadata": {}, "source": [ "
    \n", @@ -17,8 +17,8 @@ ] }, { + "id": "d657c3f4", "cell_type": "markdown", - "id": "e55e8155-7164-4ff9-b0c9-dcfeed326816", "metadata": {}, "source": [ "
    \n", @@ -36,7 +36,8 @@ "metadata": {}, "source": [ "This notebook demonstrates how to create a sample table in SingleStore, set up a pipeline to import data from an Amazon S3 bucket, and run queries on the imported data. It is designed for users who want to integrate S3 data with SingleStore and explore the capabilities of pipelines for efficient data ingestion." - ] + ], + "id": "e9e6acea" }, { "attachments": {}, @@ -44,7 +45,8 @@ "metadata": {}, "source": [ "

    Demo Flow

    " - ] + ], + "id": "4933b61c" }, { "attachments": {}, @@ -52,7 +54,8 @@ "metadata": {}, "source": [ "" - ] + ], + "id": "ef90c6c9" }, { "attachments": {}, @@ -62,7 +65,8 @@ "## Sample Table in SingleStore\n", "\n", "Start by creating a table that will hold the data imported from S3." - ] + ], + "id": "8a85c544" }, { "cell_type": "code", @@ -78,7 +82,8 @@ " address TEXT,\n", " created_at TIMESTAMP\n", ");" - ] + ], + "id": "2ca9281c" }, { "attachments": {}, @@ -93,23 +98,23 @@ "You have access to the S3 bucket.\n", "Proper IAM roles or access keys are configured in SingleStore.\n", "The CSV file has a structure that matches the table schema." - ] + ], + "id": "a88192c9" }, { "attachments": {}, "cell_type": "markdown", - "id": "a9c60e86-a548-4257-9130-5120e850aad0", "metadata": {}, "source": [ "## Set Up Variables\n", "\n", "Define the URL, REGION, ACCESS_KEY, and SECRET_ACCESS_KEY variables for integration, replacing the placeholder values with your own." - ] + ], + "id": "87d2c776" }, { "cell_type": "code", "execution_count": 2, - "id": "69c573a1-316c-49c2-9ac7-16327a302199", "metadata": {}, "outputs": [], "source": [ @@ -117,16 +122,17 @@ "REGION = 'your-region'\n", "ACCESS_KEY = 'access_key_id'\n", "SECRET_ACCESS_KEY = 'access_secret_key'" - ] + ], + "id": "78c44e19" }, { "attachments": {}, "cell_type": "markdown", - "id": "d8927379-38ca-427f-9de3-dcc76b5ba05e", "metadata": {}, "source": [ "Using these identifiers and keys, execute the following statement." 
- ] + ], + "id": "eb0c3643" }, { "cell_type": "code", @@ -145,7 +151,8 @@ "FIELDS TERMINATED BY ',' OPTIONALLY ENCLOSED BY '\\\"'\n", "LINES TERMINATED BY '\\n'\n", "IGNORE 1 lines;" - ] + ], + "id": "6efe3112" }, { "attachments": {}, @@ -155,40 +162,41 @@ "## Start the Pipeline\n", "\n", "To start the pipeline and begin importing the data from the S3 bucket:" - ] + ], + "id": "5902a86a" }, { "cell_type": "code", "execution_count": 4, - "id": "24aba272-a594-4971-8d7c-640b31dcf216", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "START PIPELINE s3_import_pipeline;" - ] + ], + "id": "bb436fc2" }, { "attachments": {}, "cell_type": "markdown", - "id": "bea9d53a-45cc-4dd9-87d6-bdabd2ae6370", "metadata": {}, "source": [ "## Select Data from the Table\n", "\n", "Once the data has been imported, you can run a query to select it:" - ] + ], + "id": "c82c8439" }, { "cell_type": "code", "execution_count": 5, - "id": "3924e9fb-094a-467c-bdac-8f7826e63501", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "SELECT * FROM sample_table LIMIT 10;" - ] + ], + "id": "bb740975" }, { "attachments": {}, @@ -196,7 +204,8 @@ "metadata": {}, "source": [ "### Check if all data of the data is loaded" - ] + ], + "id": "df1cdb14" }, { "cell_type": "code", @@ -206,7 +215,8 @@ "source": [ "%%sql\n", "SELECT count(*) FROM sample_table" - ] + ], + "id": "dd98c9a3" }, { "attachments": {}, @@ -217,7 +227,8 @@ "\n", "We have shown how to insert data from a Amazon S3 using `Pipelines` to SingleStoreDB. These techniques should enable you to\n", "integrate your Amazon S3 with SingleStoreDB." - ] + ], + "id": "892e7f8d" }, { "attachments": {}, @@ -227,7 +238,8 @@ "## Clean up\n", "\n", "Remove the '#' to uncomment and execute the queries below to clean up the pipeline and table created." 
- ] + ], + "id": "3c053a57" }, { "attachments": {}, @@ -235,7 +247,8 @@ "metadata": {}, "source": [ "#### Drop Pipeline" - ] + ], + "id": "8874a110" }, { "cell_type": "code", @@ -247,7 +260,8 @@ "#STOP PIPELINE s3_import_pipeline;\n", "\n", "#DROP PIPELINE s3_import_pipeline;" - ] + ], + "id": "043861f7" }, { "attachments": {}, @@ -255,7 +269,8 @@ "metadata": {}, "source": [ "#### Drop Data" - ] + ], + "id": "445c6369" }, { "cell_type": "code", @@ -265,11 +280,12 @@ "source": [ "%%sql\n", "#DROP TABLE sample_table;" - ] + ], + "id": "f8b697e5" }, { + "id": "39231766", "cell_type": "markdown", - "id": "b47799b9-36b3-42c5-8434-4091a38f966a", "metadata": {}, "source": [ "
    \n", diff --git a/notebooks/load-csv-data-s3/notebook.ipynb b/notebooks/load-csv-data-s3/notebook.ipynb index 08c53921..42728409 100644 --- a/notebooks/load-csv-data-s3/notebook.ipynb +++ b/notebooks/load-csv-data-s3/notebook.ipynb @@ -1,8 +1,8 @@ { "cells": [ { + "id": "24eeda62", "cell_type": "markdown", - "id": "b447e717-ea59-49fd-8092-36ece56072ae", "metadata": {}, "source": [ "
    \n", @@ -17,8 +17,8 @@ ] }, { + "id": "74f73679", "cell_type": "markdown", - "id": "0b47d1d0-0308-4b98-8b9a-d9575938995a", "metadata": {}, "source": [ "
    \n", @@ -42,7 +42,8 @@ "

    This notebook creates a pipeline, data may take up to 1 minute to populate

    \n", "
    \n", "
    " - ] + ], + "id": "1a9a5b5d" }, { "attachments": {}, @@ -60,7 +61,8 @@ "- learn how to load CSV data into Singlestore from S3\n", "- execute aggregate functions\n", "- perform time-series analysis" - ] + ], + "id": "58a02686" }, { "attachments": {}, @@ -68,7 +70,8 @@ "metadata": {}, "source": [ "

    Demo Flow

    " - ] + ], + "id": "fe488dd0" }, { "attachments": {}, @@ -76,7 +79,8 @@ "metadata": {}, "source": [ "" - ] + ], + "id": "5e2f723e" }, { "attachments": {}, @@ -84,7 +88,8 @@ "metadata": {}, "source": [ "## How to use this notebook" - ] + ], + "id": "9bfbcffa" }, { "attachments": {}, @@ -92,7 +97,8 @@ "metadata": {}, "source": [ "" - ] + ], + "id": "08afb8bd" }, { "attachments": {}, @@ -102,7 +108,8 @@ "## Create a database (You can skip this Step if you are using Free Starter Tier)\n", "\n", "We need to create a database to work with in the following examples." - ] + ], + "id": "4a00fe35" }, { "cell_type": "code", @@ -114,7 +121,8 @@ "if not shared_tier_check or shared_tier_check[0][1] == 'OFF':\n", " %sql DROP DATABASE IF EXISTS SalesAnalysis;\n", " %sql CREATE DATABASE SalesAnalysis;" - ] + ], + "id": "0b8a66cf" }, { "attachments": {}, @@ -122,7 +130,8 @@ "metadata": {}, "source": [ "### Create Table" - ] + ], + "id": "3afd127b" }, { "cell_type": "code", @@ -142,7 +151,8 @@ " `Price` float DEFAULT NULL,\n", " `Total_Sales` float DEFAULT NULL\n", ")" - ] + ], + "id": "a139fe30" }, { "attachments": {}, @@ -150,7 +160,8 @@ "metadata": {}, "source": [ "### Load Data Using Pipelines" - ] + ], + "id": "85e8fdc0" }, { "cell_type": "code", @@ -173,7 +184,8 @@ "\n", "\n", "START PIPELINE SalesData_Pipeline;" - ] + ], + "id": "d25aa2e2" }, { "attachments": {}, @@ -181,7 +193,8 @@ "metadata": {}, "source": [ "### Data may take couple of seconds to load after pipeline is started, rerun cell to verify" - ] + ], + "id": "23ba5df8" }, { "cell_type": "code", @@ -191,7 +204,8 @@ "source": [ "%%sql\n", "SELECT count(*) FROM SalesData" - ] + ], + "id": "8208c7dc" }, { "attachments": {}, @@ -201,7 +215,8 @@ "

    Sample Queries

    \n", "\n", "We will try to execute some Analytical Queries" - ] + ], + "id": "38f3f93c" }, { "attachments": {}, @@ -209,7 +224,8 @@ "metadata": {}, "source": [ "Top-Selling Products" - ] + ], + "id": "d6a7c132" }, { "cell_type": "code", @@ -220,7 +236,8 @@ "%%sql\n", "SELECT product_name, SUM(quantity_sold) AS total_quantity_sold FROM SalesData\n", " GROUP BY product_name ORDER BY total_quantity_sold DESC LIMIT 5;" - ] + ], + "id": "0de6fd0a" }, { "attachments": {}, @@ -228,7 +245,8 @@ "metadata": {}, "source": [ "Sales Trends Over Time" - ] + ], + "id": "94faf5eb" }, { "cell_type": "code", @@ -239,7 +257,8 @@ "%%sql\n", "SELECT date, SUM(total_sales) AS total_sales FROM SalesData\n", "GROUP BY date ORDER BY total_sales desc limit 5;" - ] + ], + "id": "3b048113" }, { "attachments": {}, @@ -247,7 +266,8 @@ "metadata": {}, "source": [ "Total Sales by Store" - ] + ], + "id": "ced08467" }, { "cell_type": "code", @@ -258,7 +278,8 @@ "%%sql\n", "SELECT Store_ID, SUM(total_sales) AS total_sales FROM SalesData\n", "GROUP BY Store_ID ORDER BY total_sales DESC limit 5;" - ] + ], + "id": "4db95a7e" }, { "attachments": {}, @@ -266,7 +287,8 @@ "metadata": {}, "source": [ "Sales Contribution by Product (Percentage)" - ] + ], + "id": "27d6d9d5" }, { "cell_type": "code", @@ -277,7 +299,8 @@ "%%sql\n", "SELECT product_name, SUM(total_sales) * 100.0 / (SELECT SUM(total_sales) FROM SalesData) AS sales_percentage FROM SalesData\n", " GROUP BY product_name ORDER BY sales_percentage DESC limit 5;" - ] + ], + "id": "c932045d" }, { "attachments": {}, @@ -285,7 +308,8 @@ "metadata": {}, "source": [ "Top Days with Highest Sale" - ] + ], + "id": "58a3a81d" }, { "cell_type": "code", @@ -296,7 +320,8 @@ "%%sql\n", "SELECT date, SUM(total_sales) AS total_sales FROM SalesData\n", " GROUP BY date ORDER BY total_sales DESC LIMIT 5;" - ] + ], + "id": "316cf98a" }, { "attachments": {}, @@ -307,7 +332,8 @@ "\n", "We have shown how to insert data from a Amazon S3 using `Pipelines` to 
SingleStoreDB. These techniques should enable you to\n", "integrate your Amazon S3 with SingleStoreDB." - ] + ], + "id": "ff35e62e" }, { "attachments": {}, @@ -317,7 +343,8 @@ "## Clean up\n", "\n", "Remove the '#' to uncomment and execute the queries below to clean up the pipeline and table created." - ] + ], + "id": "758c7f7e" }, { "attachments": {}, @@ -325,7 +352,8 @@ "metadata": {}, "source": [ "#### Drop Pipeline" - ] + ], + "id": "5e75054e" }, { "cell_type": "code", @@ -338,7 +366,8 @@ "#STOP PIPELINE SalesData_Pipeline;\n", "\n", "#DROP PIPELINE SalesData_Pipeline;" - ] + ], + "id": "f6e37d08" }, { "attachments": {}, @@ -346,7 +375,8 @@ "metadata": {}, "source": [ "#### Drop Data" - ] + ], + "id": "2a438ea4" }, { "cell_type": "code", @@ -359,11 +389,12 @@ "# %sql DROP DATABASE IF EXISTS SalesAnalysis;\n", "#else:\n", "# %sql DROP TABLE SalesData;" - ] + ], + "id": "b0d21e5d" }, { + "id": "687e2782", "cell_type": "markdown", - "id": "c5abfdb8-0c85-428f-be0c-82f62516a55a", "metadata": {}, "source": [ "
    \n", diff --git a/notebooks/load-data-json/notebook.ipynb b/notebooks/load-data-json/notebook.ipynb index 9e49393b..c89926ad 100644 --- a/notebooks/load-data-json/notebook.ipynb +++ b/notebooks/load-data-json/notebook.ipynb @@ -1,8 +1,8 @@ { "cells": [ { + "id": "24c5a12b", "cell_type": "markdown", - "id": "a5296a57-2be8-41fb-b7e4-3458d7f99324", "metadata": {}, "source": [ "
    \n", @@ -17,8 +17,8 @@ ] }, { + "id": "1e9aad0a", "cell_type": "markdown", - "id": "777e083a-bd01-454b-a302-2155f7e9b723", "metadata": {}, "source": [ "
    \n", @@ -36,7 +36,8 @@ "metadata": {}, "source": [ "In this example, we want to create a pipeline from multiple JSON files stored in an AWS S3 bucket called singlestoredb and a folder called **employeedata**. This bucket is located in **ap-south-1**." - ] + ], + "id": "fe4c3ce6" }, { "attachments": {}, @@ -67,7 +68,8 @@ " \"salary\": 203000\n", "}\n", "```" - ] + ], + "id": "121fe81b" }, { "attachments": {}, @@ -75,7 +77,8 @@ "metadata": {}, "source": [ "

    Demo Flow

    " - ] + ], + "id": "b6f87690" }, { "attachments": {}, @@ -83,7 +86,8 @@ "metadata": {}, "source": [ "" - ] + ], + "id": "5799ccba" }, { "attachments": {}, @@ -91,7 +95,8 @@ "metadata": {}, "source": [ "## How to use this notebook" - ] + ], + "id": "2d280222" }, { "attachments": {}, @@ -99,7 +104,8 @@ "metadata": {}, "source": [ "" - ] + ], + "id": "e463111e" }, { "attachments": {}, @@ -109,7 +115,8 @@ "## Create a database (You can skip this Step if you are using Free Starter Tier)\n", "\n", "We need to create a database to work with in the following examples." - ] + ], + "id": "f493ee28" }, { "cell_type": "code", @@ -121,7 +128,8 @@ "if not shared_tier_check or shared_tier_check[0][1] == 'OFF':\n", " %sql DROP DATABASE IF EXISTS HRData;\n", " %sql CREATE DATABASE HRData;" - ] + ], + "id": "56dca5b3" }, { "attachments": {}, @@ -135,7 +143,8 @@ "

    If you have a Free Starter Workspace deployed already, select the database from drop-down menu at the top of this notebook. It updates the connection_url to connect to that database.

    \n", "
    \n", "
    " - ] + ], + "id": "dee620b8" }, { "cell_type": "code", @@ -155,7 +164,8 @@ " salary int NOT NULL,\n", " Children JSON COLLATE utf8_bin NOT NULL\n", " );" - ] + ], + "id": "8a090c70" }, { "attachments": {}, @@ -163,7 +173,8 @@ "metadata": {}, "source": [ "### Create Pipeline To Insert JSON Data into Table" - ] + ], + "id": "1b00dd35" }, { "cell_type": "code", @@ -194,7 +205,8 @@ ");\n", "\n", "START PIPELINE employeeData;" - ] + ], + "id": "ed2afcfd" }, { "attachments": {}, @@ -202,7 +214,8 @@ "metadata": {}, "source": [ "### Check if Data is Loaded" - ] + ], + "id": "2071760e" }, { "cell_type": "code", @@ -212,7 +225,8 @@ "source": [ "%%sql\n", "SELECT * from employeeData limit 5;" - ] + ], + "id": "8d959476" }, { "attachments": {}, @@ -220,7 +234,8 @@ "metadata": {}, "source": [ "### Sample Queries" - ] + ], + "id": "a1217682" }, { "attachments": {}, @@ -228,7 +243,8 @@ "metadata": {}, "source": [ "#### Select Top 2 Employees with highest salary risiding in State 'MS'" - ] + ], + "id": "df99efdf" }, { "cell_type": "code", @@ -238,7 +254,8 @@ "source": [ "%%sql\n", "select * from employeeData where contactinfo::$state = 'MS' order by salary desc limit 2" - ] + ], + "id": "cb35cb27" }, { "attachments": {}, @@ -246,7 +263,8 @@ "metadata": {}, "source": [ "#### Select Top 5 Cities with highest Average salary" - ] + ], + "id": "f4e9e06f" }, { "cell_type": "code", @@ -257,7 +275,8 @@ "%%sql\n", "select contactinfo::$city as City,AVG(salary) as 'Avg Salary' from employeeData\n", " group by contactinfo::$city order by AVG(salary) desc limit 5" - ] + ], + "id": "86d657d1" }, { "attachments": {}, @@ -265,7 +284,8 @@ "metadata": {}, "source": [ "#### Number of employees with Children grouped by No of children" - ] + ], + "id": "975024c4" }, { "cell_type": "code", @@ -279,7 +299,8 @@ " COUNT(*) AS employees_with_children\n", "FROM employeeData\n", " group by JSON_LENGTH(Children);" - ] + ], + "id": "93c493d6" }, { "attachments": {}, @@ -287,7 +308,8 @@ "metadata": {}, 
"source": [ "#### Average salary of employees who have children" - ] + ], + "id": "67923638" }, { "cell_type": "code", @@ -300,7 +322,8 @@ " AVG(salary) AS average_salary_with_children\n", "FROM employeeData\n", "WHERE JSON_LENGTH(Children) > 0;" - ] + ], + "id": "9bc72500" }, { "attachments": {}, @@ -308,7 +331,8 @@ "metadata": {}, "source": [ "#### Select the total and average salary by State" - ] + ], + "id": "014b2a17" }, { "cell_type": "code", @@ -324,7 +348,8 @@ " AVG(salary) AS 'Average Salary'\n", "FROM employeeData\n", "GROUP BY contactinfo::$state limit 5;" - ] + ], + "id": "9b47355a" }, { "attachments": {}, @@ -332,7 +357,8 @@ "metadata": {}, "source": [ "#### Top 5 job title with highest number of employees" - ] + ], + "id": "81a8cc45" }, { "cell_type": "code", @@ -346,7 +372,8 @@ " COUNT(*) AS num_employees\n", "FROM employeeData\n", "GROUP BY jobTitleName order by num_employees desc limit 5;" - ] + ], + "id": "32485fd9" }, { "attachments": {}, @@ -354,7 +381,8 @@ "metadata": {}, "source": [ "#### Select the highest and lowest salary" - ] + ], + "id": "325bb36b" }, { "cell_type": "code", @@ -367,7 +395,8 @@ " MAX(salary) AS highest_salary,\n", " MIN(salary) AS lowest_salary\n", "FROM employeeData;" - ] + ], + "id": "096c39ea" }, { "attachments": {}, @@ -379,7 +408,8 @@ "\n", "We have shown how to connect to S3 using `Pipelines` and insert JSON data into SinglestoreDB. These techniques should enable you to\n", "integrate and query your JSON data with SingleStoreDB." - ] + ], + "id": "3374552a" }, { "attachments": {}, @@ -389,7 +419,8 @@ "## Clean up\n", "\n", "Remove the '#' to uncomment and execute the queries below to clean up the pipeline and table created." 
- ] + ], + "id": "f2b94471" }, { "cell_type": "code", @@ -401,7 +432,8 @@ "#STOP PIPELINE employeeData;\n", "\n", "#DROP PIPELINE employeeData;" - ] + ], + "id": "abb687a8" }, { "attachments": {}, @@ -409,7 +441,8 @@ "metadata": {}, "source": [ "Drop data" - ] + ], + "id": "55b13fbd" }, { "cell_type": "code", @@ -422,11 +455,12 @@ "# %sql DROP DATABASE IF EXISTS HRData;\n", "#else:\n", "# %sql DROP TABLE employeeData;" - ] + ], + "id": "b819cbdc" }, { + "id": "b11dbcfc", "cell_type": "markdown", - "id": "9bd328a2-666d-449e-b327-61d0f114ce88", "metadata": {}, "source": [ "
    \n", diff --git a/notebooks/load-data-kakfa/notebook.ipynb b/notebooks/load-data-kakfa/notebook.ipynb index a0ac145d..a9b642c7 100644 --- a/notebooks/load-data-kakfa/notebook.ipynb +++ b/notebooks/load-data-kakfa/notebook.ipynb @@ -1,8 +1,8 @@ { "cells": [ { + "id": "ff22f714", "cell_type": "markdown", - "id": "4fa50251-2837-4228-acb3-3a11d95b8928", "metadata": {}, "source": [ "
    \n", @@ -17,8 +17,8 @@ ] }, { + "id": "2f0bf08d", "cell_type": "markdown", - "id": "2dc3099d-640a-4325-971c-2b20a5ca458d", "metadata": {}, "source": [ "
    \n", @@ -36,7 +36,8 @@ "metadata": {}, "source": [ "### Introduction" - ] + ], + "id": "ab96a8ca" }, { "attachments": {}, @@ -44,7 +45,8 @@ "metadata": {}, "source": [ "The Real-Time Event Monitoring use case illustrates how to leverage Singlestore's capabilities to process and analyze streaming data from a Kafka data source. This demo showcases the ability to ingest real-time events, such as application logs or user activities, and perform immediate analysis to gain actionable insights. By working through this example, new users will learn how to set up a Kafka data pipeline, ingest streaming data into Singlestore, and execute real-time queries to monitor event types, user activity patterns, and detect anomalies. This use case highlights the power of Singlestore in providing timely and relevant information for decision-making in dynamic environments." - ] + ], + "id": "f3b8d443" }, { "attachments": {}, @@ -52,7 +54,8 @@ "metadata": {}, "source": [ "" - ] + ], + "id": "1a480b1e" }, { "attachments": {}, @@ -60,7 +63,8 @@ "metadata": {}, "source": [ "## How to use this notebook" - ] + ], + "id": "06f4bfee" }, { "attachments": {}, @@ -68,7 +72,8 @@ "metadata": {}, "source": [ "" - ] + ], + "id": "e2cbadb4" }, { "attachments": {}, @@ -78,7 +83,8 @@ "## Create a database (You can skip this Step if you are using Free Starter Tier)\n", "\n", "We need to create a database to work with in the following examples." - ] + ], + "id": "4d1294c0" }, { "cell_type": "code", @@ -90,7 +96,8 @@ "if not shared_tier_check or shared_tier_check[0][1] == 'OFF':\n", " %sql DROP DATABASE IF EXISTS EventAnalysis;\n", " %sql CREATE DATABASE EventAnalysis;" - ] + ], + "id": "fa6b62ca" }, { "attachments": {}, @@ -104,7 +111,8 @@ "

    If you have a Free Starter Workspace deployed already, select the database from drop-down menu at the top of this notebook. It updates the connection_url to connect to that database.

    \n", "
    \n", "
    " - ] + ], + "id": "10135797" }, { "attachments": {}, @@ -112,7 +120,8 @@ "metadata": {}, "source": [ "### Create Table" - ] + ], + "id": "31257823" }, { "cell_type": "code", @@ -132,7 +141,8 @@ " `region` varchar(128) CHARACTER SET utf8 COLLATE utf8_general_ci DEFAULT NULL,\n", " `country` varchar(128) CHARACTER SET utf8 COLLATE utf8_general_ci DEFAULT NULL\n", ")" - ] + ], + "id": "312d1254" }, { "attachments": {}, @@ -140,7 +150,8 @@ "metadata": {}, "source": [ "### Load Data using Pipeline" - ] + ], + "id": "441a552a" }, { "cell_type": "code", @@ -170,7 +181,8 @@ ");\n", "\n", "START PIPELINE `eventsdata`;" - ] + ], + "id": "afe53130" }, { "cell_type": "code", @@ -180,7 +192,8 @@ "source": [ "%%sql\n", "SELECT COUNT(*) FROM `eventsdata`" - ] + ], + "id": "a4a75baf" }, { "attachments": {}, @@ -188,7 +201,8 @@ "metadata": {}, "source": [ "### Sample Queries" - ] + ], + "id": "1b2d7ce0" }, { "attachments": {}, @@ -196,7 +210,8 @@ "metadata": {}, "source": [ "Events by Region" - ] + ], + "id": "ca0c5167" }, { "cell_type": "code", @@ -210,7 +225,8 @@ "COUNT(events.country) AS 'events.countofevents'\n", "FROM eventsdata AS events\n", "GROUP BY 1 ORDER BY 2 DESC LIMIT 5;" - ] + ], + "id": "d70933bf" }, { "attachments": {}, @@ -218,7 +234,8 @@ "metadata": {}, "source": [ "Events by Top 5 Advertisers" - ] + ], + "id": "c5eba926" }, { "cell_type": "code", @@ -235,7 +252,8 @@ " (events.advertiser LIKE '%Subway%' OR events.advertiser LIKE '%McDonalds%' OR events.advertiser LIKE '%Starbucks%' OR events.advertiser LIKE '%Dollar General%' OR events.advertiser LIKE '%YUM! 
Brands%')\n", "GROUP BY 1\n", "ORDER BY 2 DESC;" - ] + ], + "id": "656e5d4d" }, { "attachments": {}, @@ -243,7 +261,8 @@ "metadata": {}, "source": [ "Ad visitors by gender and income" - ] + ], + "id": "80935d1d" }, { "cell_type": "code", @@ -270,7 +289,8 @@ ") xx\n", ") zz\n", "WHERE (z__pivot_col_rank <= 50 OR z__is_highest_ranked_cell = 1) AND (z___pivot_row_rank <= 500 OR z__pivot_col_ordering = 1) ORDER BY z___pivot_row_rank;" - ] + ], + "id": "63f0dffa" }, { "attachments": {}, @@ -278,7 +298,8 @@ "metadata": {}, "source": [ "Pipeline will keep pushing data from the kafka topic. Once your data is loaded you can stop the pipeline using below command" - ] + ], + "id": "a5cd4fd8" }, { "cell_type": "code", @@ -288,7 +309,8 @@ "source": [ "%%sql\n", "STOP PIPELINE eventsdata" - ] + ], + "id": "fc3b6e08" }, { "attachments": {}, @@ -300,7 +322,8 @@ "\n", "We have shown how to connect to Kafka using `Pipelines` and insert data into SinglestoreDB. These techniques should enable you to\n", "integrate your Kafka topics with SingleStoreDB." - ] + ], + "id": "35bb1878" }, { "attachments": {}, @@ -310,7 +333,8 @@ "## Clean up\n", "\n", "Remove the '#' to uncomment and execute the queries below to clean up the pipeline and table created." - ] + ], + "id": "d22016b4" }, { "attachments": {}, @@ -318,7 +342,8 @@ "metadata": {}, "source": [ "Drop the pipeline using below command" - ] + ], + "id": "675359fc" }, { "cell_type": "code", @@ -328,7 +353,8 @@ "source": [ "%%sql\n", "#DROP PIPELINE eventsdata" - ] + ], + "id": "d8bca715" }, { "cell_type": "code", @@ -341,11 +367,12 @@ "# %sql DROP DATABASE IF EXISTS EventAnalysis;\n", "#else:\n", "# %sql DROP TABLE eventsdata;" - ] + ], + "id": "620ae66e" }, { + "id": "6ad494a5", "cell_type": "markdown", - "id": "dea172b7-a926-4293-bdbd-ff38eeba581f", "metadata": {}, "source": [ "
    \n", diff --git a/notebooks/load-json-files-s3/notebook.ipynb b/notebooks/load-json-files-s3/notebook.ipynb index adee4398..76622fb2 100644 --- a/notebooks/load-json-files-s3/notebook.ipynb +++ b/notebooks/load-json-files-s3/notebook.ipynb @@ -1,8 +1,8 @@ { "cells": [ { + "id": "0a372889", "cell_type": "markdown", - "id": "deb8dbf4-2368-41b4-9f09-b14c96ccb344", "metadata": {}, "source": [ "
    \n", @@ -18,7 +18,6 @@ }, { "cell_type": "markdown", - "id": "b4b337ff", "metadata": {}, "source": [ "
    \n", @@ -28,37 +27,37 @@ "

    This tutorial is meant for Standard & Premium Workspaces. You can't run this with a Free Starter Workspace due to restrictions on Storage. Create a Workspace using +group in the left nav & select Standard for this notebook. Gallery notebooks tagged with \"Starter\" are suitable to run on a Free Starter Workspace

    \n", "
    \n", "
    " - ] + ], + "id": "48097b9a" }, { "cell_type": "markdown", - "id": "50093846-9ea3-441d-89f0-fbe0576f78bf", "metadata": {}, "source": [ "This notebook helps you navigate through different scenarios data ingestion of JSON files from an AWS S3 location:\n", "* Ingest JSON files in AWS S3 using wildcards with pre-defined schema\n", "* Ingest JSON files in AWS S3 using wildcards into a JSON column" - ] + ], + "id": "ddbcac99" }, { "cell_type": "markdown", - "id": "b2ed410a-87b8-452a-b906-431fb0e949b3", "metadata": {}, "source": [ "## Create a Pipeline from JSON files in AWS S3 using wildcards" - ] + ], + "id": "3592227a" }, { "cell_type": "markdown", - "id": "9996b479-586d-4af3-b0ee-b61eead39ebc", "metadata": {}, "source": [ "In this example, we want to create a pipeline from two JSON files called **actors1.json** and **actors2.json** stored in an AWS S3 bucket called singlestoredb and a folder called **actors**. This bucket is located in **us-east-1**." - ] + ], + "id": "23ad18c2" }, { "cell_type": "markdown", - "id": "9a4caf68-0610-41a6-bfd1-59612b8e959a", "metadata": {}, "source": [ "Each file has the following shape with nested objects and arrays:\n", @@ -100,38 +99,38 @@ " ]\n", "}\n", "```" - ] + ], + "id": "551b8612" }, { "cell_type": "markdown", - "id": "98a8e14f-808e-43ff-b670-b6656091b81a", "metadata": {}, "source": [ "### Create a Table" - ] + ], + "id": "ac9ed17a" }, { "cell_type": "markdown", - "id": "a70e168d-de32-4988-90c4-651089ac25a0", "metadata": {}, "source": [ "We first create a table called **actors** in the database **demo_database**" - ] + ], + "id": "5f0e4e06" }, { "cell_type": "code", "execution_count": 1, - "id": "b703aab8-7449-43db-af04-9d65520239a5", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "CREATE DATABASE IF NOT EXISTS demo_database;" - ] + ], + "id": "2857e33b" }, { "cell_type": "markdown", - "id": "6dfc5b0b-9308-46c9-8cc8-be08fb07c1b6", "metadata": {}, "source": [ "
    \n", @@ -141,12 +140,12 @@ "

    Make sure to select the demo_database database from the drop-down menu at the top of this notebook. It updates the connection_url to connect to that database.

    \n", "
    \n", "
    " - ] + ], + "id": "5a236dfe" }, { "cell_type": "code", "execution_count": 2, - "id": "b09528cf-0beb-4fe0-9e60-6edefb72f8b1", "metadata": {}, "outputs": [], "source": [ @@ -164,29 +163,29 @@ " children JSON COLLATE utf8_bin NOT NULL,\n", " SHARD KEY ()\n", ");" - ] + ], + "id": "4a17a738" }, { "cell_type": "markdown", - "id": "e4c15a63-eb17-432d-b0b5-d7485bcf028d", "metadata": {}, "source": [ "### Create a pipeline" - ] + ], + "id": "8c07ad3e" }, { "cell_type": "markdown", - "id": "5e09146a-74cb-4e0d-bd0a-3502c2d15a00", "metadata": {}, "source": [ "We then create a pipeline called **actors** in the database **demo_database**. Since those files are small, batch_interval is not as important and the maximum partitions per batch is only 1. For faster performance, we recommend increasing the maximum partitions per batch.\n", "Note, that since the bucket is publcly accessible, you do not need to provide access key and secret." - ] + ], + "id": "f6d82cb6" }, { "cell_type": "code", "execution_count": 3, - "id": "92df7943-e68d-4509-b7f5-4a93697f6578", "metadata": {}, "outputs": [], "source": [ @@ -217,114 +216,114 @@ " actors.hasGreyHair <- hasGreyHair,\n", " actors.children <- children\n", " );" - ] + ], + "id": "d49f0edb" }, { "cell_type": "markdown", - "id": "5410c1b9-573f-4326-ba4c-b7af71e069ad", "metadata": {}, "source": [ "### Start and monitor the pipeline" - ] + ], + "id": "1fbca46b" }, { "cell_type": "code", "execution_count": 4, - "id": "eeddd12e-e28c-4000-859b-6d1291c4a137", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "START PIPELINE demo_database.actors;" - ] + ], + "id": "cbd877bb" }, { "cell_type": "markdown", - "id": "a555997d-38dc-4b69-821b-390e52bb4d00", "metadata": {}, "source": [ "If there is no error or warning, you should see no error message." 
- ] + ], + "id": "908565d2" }, { "cell_type": "code", "execution_count": 5, - "id": "f48de155-af85-4c40-ad56-955573a434f8", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "SELECT * FROM information_schema.pipelines_errors\n", " WHERE pipeline_name = 'actors' ;" - ] + ], + "id": "a747d67c" }, { "cell_type": "markdown", - "id": "c18ac453-63de-424a-b9bf-ae6846817ea6", "metadata": {}, "source": [ "### Query the table" - ] + ], + "id": "3f92a9a1" }, { "cell_type": "code", "execution_count": 6, - "id": "09a739cb-4925-4699-ab61-71016a04bfb6", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "SELECT * FROM demo_database.actors;" - ] + ], + "id": "7147748d" }, { "cell_type": "markdown", - "id": "c4815572-10d8-4c31-a246-05ad6e7e6e99", "metadata": {}, "source": [ "### Cleanup ressources" - ] + ], + "id": "9aafc5e0" }, { "cell_type": "code", "execution_count": 7, - "id": "6a6dfc1d-c758-4287-a797-6cc3e4fff934", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "DROP PIPELINE IF EXISTS demo_database.actors;\n", "DROP TABLE IF EXISTS demo_database.actors;" - ] + ], + "id": "a0ebee1a" }, { "cell_type": "markdown", - "id": "09fbffac-9a0a-45fd-ad07-ede4e11b3691", "metadata": {}, "source": [ "## Ingest JSON files in AWS S3 using wildcards into a JSON column" - ] + ], + "id": "328b6f8e" }, { "cell_type": "markdown", - "id": "d3e8ff65-1b2d-47c5-8754-28fa4c254edd", "metadata": {}, "source": [ "As the schema of your files might change, you might want to keep flexibility in ingesting the data into one JSON column that we name **json_data**. the table we create is named **actors_json**." 
- ] + ], + "id": "ff1def80" }, { "cell_type": "markdown", - "id": "d761f324-0d28-4713-a866-3f96673d8317", "metadata": {}, "source": [ "### Create Table" - ] + ], + "id": "11b48867" }, { "cell_type": "code", "execution_count": 8, - "id": "bcb14814-7b79-4df2-ab47-7def7ae03ce3", "metadata": {}, "outputs": [], "source": [ @@ -333,20 +332,20 @@ " json_data JSON NOT NULL,\n", " SHARD KEY ()\n", ");" - ] + ], + "id": "e8d86472" }, { "cell_type": "markdown", - "id": "429fce4b-c529-4acf-af7e-5d802f79eda6", "metadata": {}, "source": [ "### Create a pipeline" - ] + ], + "id": "7bcf7671" }, { "cell_type": "code", "execution_count": 9, - "id": "a1d60130-095e-45da-b55d-b427a0af3d26", "metadata": {}, "outputs": [], "source": [ @@ -366,31 +365,31 @@ " INTO TABLE `actors_json`\n", " FORMAT JSON\n", " (json_data <- %);" - ] + ], + "id": "d359933e" }, { "cell_type": "markdown", - "id": "bd296bf5-db20-4028-a1d7-b5c9da0a6cb2", "metadata": {}, "source": [ "### Start and monitor pipeline" - ] + ], + "id": "04353f3a" }, { "cell_type": "code", "execution_count": 10, - "id": "b374598a-f9cb-43c4-a2a4-ebcd298108c4", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "START PIPELINE demo_database.actors_json;" - ] + ], + "id": "37b78fc2" }, { "cell_type": "code", "execution_count": 11, - "id": "ca06781b-61fa-4fea-97de-cd0dbacd86e8", "metadata": {}, "outputs": [], "source": [ @@ -398,49 +397,50 @@ "# Monitor and see if there is any error or warning\n", "SELECT * FROM information_schema.pipelines_errors\n", " WHERE pipeline_name = 'actors_json' ;" - ] + ], + "id": "8ac661c1" }, { "cell_type": "markdown", - "id": "7419ccdd-0f85-414e-bd05-fbe8d9656305", "metadata": {}, "source": [ "### Query the table" - ] + ], + "id": "080d9cb6" }, { "cell_type": "code", "execution_count": 12, - "id": "e34c5b49-0e97-4b07-9026-38bb6c370f73", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "SELECT * FROM demo_database.actors_json" - ] + ], + "id": "390b84af" }, { "cell_type": "markdown", - "id": 
"c4c155e5-a4a5-4b01-a8a7-e7e626e5fac8", "metadata": {}, "source": [ "### Cleanup ressources" - ] + ], + "id": "ed556ee9" }, { "cell_type": "code", "execution_count": 13, - "id": "6f0bd356-8a11-4cd9-b774-569d8f5e2520", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "DROP DATABASE IF EXISTS demo_database;" - ] + ], + "id": "c2dd1608" }, { + "id": "b217d2f0", "cell_type": "markdown", - "id": "c572193e-7f5b-4637-af5d-2f33f5ba5d86", "metadata": {}, "source": [ "
    \n", diff --git a/notebooks/load-s3-files-into-shared-tier/notebook.ipynb b/notebooks/load-s3-files-into-shared-tier/notebook.ipynb index f9c5f8bb..103b21f4 100644 --- a/notebooks/load-s3-files-into-shared-tier/notebook.ipynb +++ b/notebooks/load-s3-files-into-shared-tier/notebook.ipynb @@ -1,8 +1,8 @@ { "cells": [ { + "id": "e665a836", "cell_type": "markdown", - "id": "deb8dbf4-2368-41b4-9f09-b14c96ccb344", "metadata": {}, "source": [ "
    \n", @@ -19,34 +19,33 @@ { "attachments": {}, "cell_type": "markdown", - "id": "50093846-9ea3-441d-89f0-fbe0576f78bf", "metadata": {}, "source": [ "This notebook guides you through data ingestion of CSV files from an AWS S3 location into your shared tier workspace." - ] + ], + "id": "8d216f60" }, { "attachments": {}, "cell_type": "markdown", - "id": "b2ed410a-87b8-452a-b906-431fb0e949b3", "metadata": {}, "source": [ "# Create a Pipeline from CSV files in AWS S3" - ] + ], + "id": "f2a0cd83" }, { "attachments": {}, "cell_type": "markdown", - "id": "9996b479-586d-4af3-b0ee-b61eead39ebc", "metadata": {}, "source": [ "In this example, we want to create a pipeline that ingests from a CSV file stored in an AWS S3 bucket. We will guide you through an example with stock market data." - ] + ], + "id": "ea5f52dc" }, { "attachments": {}, "cell_type": "markdown", - "id": "6dfc5b0b-9308-46c9-8cc8-be08fb07c1b6", "metadata": {}, "source": [ "
    \n", @@ -56,30 +55,30 @@ "

    Make sure to select your database from the drop-down menu at the top of this notebook. It updates the connection_url to connect to that database.

    \n", "
    \n", "
    " - ] + ], + "id": "03567fa8" }, { "attachments": {}, "cell_type": "markdown", - "id": "98a8e14f-808e-43ff-b670-b6656091b81a", "metadata": {}, "source": [ "## Create a Table" - ] + ], + "id": "4dc71912" }, { "attachments": {}, "cell_type": "markdown", - "id": "a70e168d-de32-4988-90c4-651089ac25a0", "metadata": {}, "source": [ "Start by creating a table to store the ingested data. In our example, we will create a `Stocks` table that will store trading data for a specific stock on a given date." - ] + ], + "id": "7758dcbd" }, { "cell_type": "code", "execution_count": 1, - "id": "860e1517-bd31-415f-8750-14f7bcbb85bf", "metadata": {}, "outputs": [], "source": [ @@ -95,21 +94,21 @@ "\t`Name` text CHARACTER SET utf8 COLLATE utf8_general_ci NULL,\n", "\t SHARD KEY ()\n", ");" - ] + ], + "id": "c70f4820" }, { "attachments": {}, "cell_type": "markdown", - "id": "e4c15a63-eb17-432d-b0b5-d7485bcf028d", "metadata": {}, "source": [ "## Create a pipeline" - ] + ], + "id": "f392979e" }, { "attachments": {}, "cell_type": "markdown", - "id": "09616d12-05b7-4701-8f7d-37926aa78e7e", "metadata": {}, "source": [ "We then create a pipeline by pointing the data source to the S3 bucket containing the dataset. In our case, we have a CSV file `all_stocks_5yr.csv` and will be ingesting it into our `Stocks` table via a pipeline `stocks_pipeline`.\n", @@ -121,12 +120,12 @@ "- Your AWS account\u2019s access credentials: `` and ``\n", "\n", "*For more on how to retrieve the above information, read our [Pipeline Documentation](https://docs.singlestore.com/cloud/load-data/load-data-with-pipelines/how-to-load-data-using-pipelines/load-data-from-amazon-web-services-aws-s-3/)." 
- ] + ], + "id": "ee51f3db" }, { "cell_type": "code", "execution_count": 2, - "id": "46229b29-7361-424e-86cf-31aa195df2d8", "metadata": {}, "outputs": [], "source": [ @@ -152,62 +151,62 @@ " \t`stocks`.`volume`,\n", " \t`stocks`.`Name`\n", " );" - ] + ], + "id": "3ab6005c" }, { "attachments": {}, "cell_type": "markdown", - "id": "5410c1b9-573f-4326-ba4c-b7af71e069ad", "metadata": {}, "source": [ "## Start and monitor the pipeline" - ] + ], + "id": "3c83f8dd" }, { "attachments": {}, "cell_type": "markdown", - "id": "acdb8cb7-3765-4503-a2fb-0e86b811431f", "metadata": {}, "source": [ "The CREATE PIPELINE statement creates a new pipeline, but the pipeline has not yet been started, and no data has been loaded. To start a pipeline in the background, run:" - ] + ], + "id": "76a8671c" }, { "cell_type": "code", "execution_count": 3, - "id": "eeddd12e-e28c-4000-859b-6d1291c4a137", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "\n", "START PIPELINE stocks_pipeline;" - ] + ], + "id": "53403dba" }, { "cell_type": "code", "execution_count": 4, - "id": "1f2894a3-31fe-4363-a75d-d72569d9918b", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "show pipelines;" - ] + ], + "id": "f5f76fd4" }, { "attachments": {}, "cell_type": "markdown", - "id": "a555997d-38dc-4b69-821b-390e52bb4d00", "metadata": {}, "source": [ "If there is no error or warning, you should see no error message." 
- ] + ], + "id": "a7401fca" }, { "cell_type": "code", "execution_count": 5, - "id": "f48de155-af85-4c40-ad56-955573a434f8", "metadata": {}, "outputs": [], "source": [ @@ -215,42 +214,42 @@ "\n", "SELECT * FROM information_schema.pipelines_errors\n", " WHERE pipeline_name = 'stocks_pipeline';" - ] + ], + "id": "3a2cbf18" }, { "attachments": {}, "cell_type": "markdown", - "id": "c18ac453-63de-424a-b9bf-ae6846817ea6", "metadata": {}, "source": [ "## Query the table" - ] + ], + "id": "47e41b20" }, { "cell_type": "code", "execution_count": 6, - "id": "09a739cb-4925-4699-ab61-71016a04bfb6", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "\n", "SELECT * FROM stocks LIMIT 5;" - ] + ], + "id": "69059384" }, { "attachments": {}, "cell_type": "markdown", - "id": "c4815572-10d8-4c31-a246-05ad6e7e6e99", "metadata": {}, "source": [ "## Cleanup resources" - ] + ], + "id": "06a1ce78" }, { "cell_type": "code", "execution_count": 7, - "id": "6a6dfc1d-c758-4287-a797-6cc3e4fff934", "metadata": {}, "outputs": [], "source": [ @@ -258,11 +257,12 @@ "\n", "DROP PIPELINE IF EXISTS test.stocks_pipeline;\n", "DROP TABLE IF EXISTS test.stocks;" - ] + ], + "id": "2864526b" }, { + "id": "498edece", "cell_type": "markdown", - "id": "c572193e-7f5b-4637-af5d-2f33f5ba5d86", "metadata": {}, "source": [ "
    \n", diff --git a/notebooks/managing-stage-files-with-fusion-sql/notebook.ipynb b/notebooks/managing-stage-files-with-fusion-sql/notebook.ipynb index 061212c0..613a7e16 100644 --- a/notebooks/managing-stage-files-with-fusion-sql/notebook.ipynb +++ b/notebooks/managing-stage-files-with-fusion-sql/notebook.ipynb @@ -1,8 +1,8 @@ { "cells": [ { + "id": "a00739c4", "cell_type": "markdown", - "id": "b3dbe6f5-da17-424a-bdea-9b024a63ecac", "metadata": {}, "source": [ "
    \n", @@ -19,54 +19,53 @@ { "attachments": {}, "cell_type": "markdown", - "id": "ceb473ad-57c6-44eb-bfdc-c4c2e8ebf71c", "metadata": {}, "source": [ "Fusion SQL can be used to manage your workspace groups and workspaces, but it\n", "can also be used to upload, download, and manage files in your workspace group\n", "Stage. We'll show you how to work with files in Stage in this notebook." - ] + ], + "id": "c33d5542" }, { "attachments": {}, "cell_type": "markdown", - "id": "be4cae77-ca16-4e3b-81a5-7b52f8a44f2d", "metadata": {}, "source": [ "## Displaying the Stage Fusion SQL commands\n", "\n", "The `SHOW FUSION COMMANDS` displays the commands that are handled by the Fusion\n", "engine. You can use the `LIKE` to filter the commands." - ] + ], + "id": "0afd983c" }, { "cell_type": "code", "execution_count": 1, - "id": "20ef40dc-0821-44fe-bca2-73dbe40c6b4e", "metadata": {}, "outputs": [], "source": [ "commands = %sql SHOW FUSION COMMANDS LIKE '%stage%'\n", "for cmd in commands:\n", " print(*cmd, '\\n')" - ] + ], + "id": "13eb0b1f" }, { "attachments": {}, "cell_type": "markdown", - "id": "9e682ac7-fdbc-4097-9501-2fda503b5878", "metadata": {}, "source": [ "## Creating a workspace group\n", "\n", "We'll start by creating a workspace group. We can get a region in the US by using the `SHOW REGIONS`\n", "command and the `random` package." 
- ] + ], + "id": "91485576" }, { "cell_type": "code", "execution_count": 2, - "id": "6d39a690-8743-4fd7-9da3-4f585ea7e263", "metadata": {}, "outputs": [], "source": [ @@ -77,23 +76,23 @@ "\n", "region_id = random.choice(us_regions).ID\n", "region_id" - ] + ], + "id": "4ef2337d" }, { "cell_type": "code", "execution_count": 3, - "id": "b9bc0cdd-a083-48e6-9e27-695162b23b4e", "metadata": {}, "outputs": [], "source": [ "wg_name = 'Fusion Notebook'\n", "password = secrets.token_urlsafe(20) + '-x&'" - ] + ], + "id": "b00cf79d" }, { "cell_type": "code", "execution_count": 4, - "id": "e52d76a8-5e9b-4687-8de2-9e7b8cbdbc22", "metadata": {}, "outputs": [], "source": [ @@ -101,12 +100,12 @@ "CREATE WORKSPACE GROUP '{{ wg_name }}'\n", " IN REGION ID '{{ region_id }}' WITH PASSWORD '{{ password }}'\n", " WITH FIREWALL RANGES '0.0.0.0/0'" - ] + ], + "id": "ea62190e" }, { "attachments": {}, "cell_type": "markdown", - "id": "52d91dcb-95af-4900-84a5-d93fe13faa64", "metadata": {}, "source": [ "## Uploading and downloading Stage files\n", @@ -120,12 +119,12 @@ "```\n", "\n", "First we'll create a data file locally that we can work with." - ] + ], + "id": "0567f05d" }, { "cell_type": "code", "execution_count": 5, - "id": "a4cca507-39f6-4c52-8a56-b99216af85d4", "metadata": {}, "outputs": [], "source": [ @@ -135,103 +134,103 @@ "Joe,32,70\n", "Max,44,69\n", "Ann,33,64" - ] + ], + "id": "44f5d066" }, { "attachments": {}, "cell_type": "markdown", - "id": "c8ce1d5a-6a9e-42fa-b0f0-7ea4e96a425a", "metadata": {}, "source": [ "We can now upload our data file to our workspace group Stage." 
- ] + ], + "id": "b333c9e8" }, { "cell_type": "code", "execution_count": 6, - "id": "1baa1cf4-4cdb-4d2e-827c-cc1bc2824ead", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "UPLOAD FILE TO STAGE 'stats.csv' IN GROUP '{{ wg_name }}' FROM 'mydata.csv'" - ] + ], + "id": "63ffcdad" }, { "attachments": {}, "cell_type": "markdown", - "id": "8348b01e-527e-461f-903b-e912385d08bd", "metadata": {}, "source": [ "We can list the files in a Stage with the `SHOW STAGE FILES` command." - ] + ], + "id": "a5fd7a60" }, { "cell_type": "code", "execution_count": 7, - "id": "68dc83a0-46b8-44d0-b8cc-a543832e3ecd", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "SHOW STAGE FILES IN GROUP '{{ wg_name }}'" - ] + ], + "id": "5bd84f4e" }, { "attachments": {}, "cell_type": "markdown", - "id": "fcf0450f-db6e-44e7-8bca-a3cf27096848", "metadata": {}, "source": [ "Downloading the file is just as easy as uploading." - ] + ], + "id": "ebc693df" }, { "cell_type": "code", "execution_count": 8, - "id": "7f28d391-84cd-485c-874b-bcd98f2b07bf", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "DOWNLOAD STAGE FILE 'stats.csv' IN GROUP '{{ wg_name }}' TO 'stats.csv' OVERWRITE" - ] + ], + "id": "43c7827d" }, { "cell_type": "code", "execution_count": 9, - "id": "176f9873-117d-4e8e-80fb-fcd020b41cfc", "metadata": {}, "outputs": [], "source": [ "!cat stats.csv" - ] + ], + "id": "ee3c4d33" }, { "attachments": {}, "cell_type": "markdown", - "id": "39ed2f5c-5139-4fff-9233-346dcdd84824", "metadata": {}, "source": [ "If you just want to display the contents of the Stage file without saving it to a local\n", "file, you simply leave the `TO` option off the `DOWNLOAD STAGE FILE`." 
- ] + ], + "id": "f5e5c776" }, { "cell_type": "code", "execution_count": 10, - "id": "c00ba225-5453-442a-bac1-a8b3c8a06e75", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "DOWNLOAD STAGE FILE 'stats.csv' IN GROUP '{{ wg_name }}' ENCODING 'utf-8'" - ] + ], + "id": "60984ea0" }, { "attachments": {}, "cell_type": "markdown", - "id": "3184fd60-ec75-48fe-bd90-b5bc90b4bdd7", "metadata": {}, "source": [ "## Creating folders\n", @@ -249,129 +248,129 @@ "project-2/\n", "project-2/data/\n", "```" - ] + ], + "id": "ac909bca" }, { "cell_type": "code", "execution_count": 11, - "id": "23f9a4d7-b871-4723-bc45-3dfe3cef67e8", "metadata": {}, "outputs": [], "source": [ "for name in ['project-1', 'project-1/data', 'project-2', 'project-2/data']:\n", " %sql CREATE STAGE FOLDER '{{ name }}' IN GROUP '{{ wg_name }}';" - ] + ], + "id": "f5fac755" }, { "cell_type": "code", "execution_count": 12, - "id": "c62129f1-5bd1-4da1-bb97-46273029c1ba", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "SHOW STAGE FILES IN GROUP '{{ wg_name }}' RECURSIVE" - ] + ], + "id": "cc98d1f2" }, { "attachments": {}, "cell_type": "markdown", - "id": "1cc31cc3-d6b1-40d1-a856-514fb5d32cea", "metadata": {}, "source": [ "Now that we have a folder structure we can put files into those folders." - ] + ], + "id": "79703772" }, { "cell_type": "code", "execution_count": 13, - "id": "8182cffe-01c8-4c5a-b898-20624df3858f", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "UPLOAD FILE TO STAGE 'project-1/data/stats.csv' IN GROUP '{{ wg_name }}' FROM 'mydata.csv';\n", "UPLOAD FILE TO STAGE 'project-2/data/stats.csv' IN GROUP '{{ wg_name }}' FROM 'mydata.csv';" - ] + ], + "id": "5fc4df07" }, { "attachments": {}, "cell_type": "markdown", - "id": "6baa541d-1ea4-4130-8655-bffa768728de", "metadata": {}, "source": [ "Now when we do a recursive listing of our Stage, we'll see the newly created files." 
- ] + ], + "id": "eaca1ab2" }, { "cell_type": "code", "execution_count": 14, - "id": "9c9a15fb-1e38-4bf5-a008-ca18e2731439", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "SHOW STAGE FILES IN GROUP '{{ wg_name }}' RECURSIVE" - ] + ], + "id": "9261cffa" }, { "attachments": {}, "cell_type": "markdown", - "id": "af69755e-10d8-4da2-a907-822684e960e4", "metadata": {}, "source": [ "We can list the files at a specific path as well." - ] + ], + "id": "0290c32a" }, { "cell_type": "code", "execution_count": 15, - "id": "4dcb95f4-5f05-4b75-b2c7-10cd779029bd", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "SHOW STAGE FILES IN GROUP '{{ wg_name }}' AT 'project-2/data'" - ] + ], + "id": "e439eee3" }, { "attachments": {}, "cell_type": "markdown", - "id": "3104850f-f459-43f2-b122-c44c3f97938c", "metadata": {}, "source": [ "## Loading data from Stage\n", "\n", "We are going to load data from a Stage into a database table. For this, we need to\n", "have a workspace and a database." - ] + ], + "id": "883a9656" }, { "cell_type": "code", "execution_count": 16, - "id": "3a742652-4330-41df-b7a3-b4b0de718005", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "CREATE WORKSPACE 'stage-loader' IN GROUP '{{ wg_name }}' WITH SIZE 'S-00' WAIT ON ACTIVE" - ] + ], + "id": "b3c4e207" }, { "cell_type": "code", "execution_count": 17, - "id": "41bf143e-db2e-45c6-a012-285145cc6733", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "SHOW WORKSPACES IN GROUP 'Fusion Notebook'" - ] + ], + "id": "36d0e56b" }, { "attachments": {}, "cell_type": "markdown", - "id": "3218a23c-f696-464d-a81c-bd054ebef79a", "metadata": {}, "source": [ "
    \n", @@ -381,23 +380,23 @@ "

    Make sure to select the stage-loader workspace from the drop-down menu at the top of this notebook.

    \n", "
    \n", "
    " - ] + ], + "id": "43d88cde" }, { "cell_type": "code", "execution_count": 18, - "id": "2d13f4f7-7c01-4dcf-abef-4b20d5e4d2c1", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "CREATE DATABASE IF NOT EXISTS stage_loader" - ] + ], + "id": "c97e381f" }, { "attachments": {}, "cell_type": "markdown", - "id": "8dda3f88-e64f-49c0-ab79-708cba12bf6d", "metadata": {}, "source": [ "
    \n", @@ -408,12 +407,12 @@ " It updates the connection_url to connect to that database.

    \n", "
    \n", "
    " - ] + ], + "id": "3deb2065" }, { "cell_type": "code", "execution_count": 19, - "id": "df308ee5-0d82-4b5c-a362-3f92e6784797", "metadata": {}, "outputs": [], "source": [ @@ -424,21 +423,21 @@ " age INT,\n", " height INT\n", ");" - ] + ], + "id": "28c4dab8" }, { "attachments": {}, "cell_type": "markdown", - "id": "09702737-9751-4bcd-b21a-97c24b5ff2ab", "metadata": {}, "source": [ "Load the data from the Stage using a pipeline." - ] + ], + "id": "62ac5718" }, { "cell_type": "code", "execution_count": 20, - "id": "3099b215-98cf-45c2-935f-c6f5300462be", "metadata": {}, "outputs": [], "source": [ @@ -454,24 +453,24 @@ " FORMAT CSV;\n", "START PIPELINE stage_test FOREGROUND;\n", "DROP PIPELINE stage_test;" - ] + ], + "id": "cbef048d" }, { "attachments": {}, "cell_type": "markdown", - "id": "dca63d30-6649-4174-b1f6-0227529f5452", "metadata": {}, "source": [ "We can now query the table and select the output into a Stage. Note that the\n", "`GROUP BY 1` is used here to combine the outputs from all of the database partitions\n", "into a single file. If you don't use that, you'll get multiple output files,\n", "each with a portion of the result set." 
- ] + ], + "id": "f87ce322" }, { "cell_type": "code", "execution_count": 21, - "id": "755d538d-e91b-4429-abc3-2a701a9a4eb4", "metadata": {}, "outputs": [], "source": [ @@ -479,34 +478,34 @@ "SELECT * FROM stats GROUP BY 1 INTO STAGE 'project-3/data/stats.csv'\n", " FIELDS TERMINATED BY ','\n", " LINES TERMINATED BY '\\n'" - ] + ], + "id": "e04ea9c9" }, { "cell_type": "code", "execution_count": 22, - "id": "ca342ca9-5215-4e5b-9c45-81d0d3d9ae17", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "SHOW STAGE FILES IN GROUP '{{ wg_name }}' AT 'project-3' RECURSIVE" - ] + ], + "id": "4cf83faf" }, { "cell_type": "code", "execution_count": 23, - "id": "6ecdc7f9-1a41-4ec8-90f7-4e5712090ae5", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "DOWNLOAD STAGE FILE 'project-3/data/stats.csv' ENCODING 'utf-8'" - ] + ], + "id": "b41add98" }, { "attachments": {}, "cell_type": "markdown", - "id": "281ce8cc-4b2c-4e92-95ff-7fec8b4d8ff5", "metadata": {}, "source": [ "## Deleting Stage files and folders\n", @@ -520,77 +519,77 @@ "```\n", "\n", "Let's delete the `stats.csv` file at the root of our Stage." - ] + ], + "id": "11ec3a9f" }, { "cell_type": "code", "execution_count": 24, - "id": "a470b978-ff52-4095-8745-f98aac3f9671", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "DROP STAGE FILE 'stats.csv' IN GROUP '{{ wg_name }}'" - ] + ], + "id": "058ab079" }, { "cell_type": "code", "execution_count": 25, - "id": "35c37895-f4bc-49ae-bf9c-a055aa435032", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "SHOW STAGE FILES IN GROUP '{{ wg_name }}'" - ] + ], + "id": "96516b35" }, { "attachments": {}, "cell_type": "markdown", - "id": "aea791e0-829a-4502-b1e5-19dd6295b851", "metadata": {}, "source": [ "Now let's delete the `project-2` folder including all of the files in it." 
- ] + ], + "id": "2e95a34e" }, { "cell_type": "code", "execution_count": 26, - "id": "a1731a9b-407c-4d91-9930-6d2e9311cbf2", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "DROP STAGE FOLDER 'project-2' IN GROUP '{{ wg_name }}' RECURSIVE" - ] + ], + "id": "112632ed" }, { "cell_type": "code", "execution_count": 27, - "id": "553e4756-f63a-40b3-b9dd-dea2c8a4f985", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "SHOW STAGE FILES IN GROUP '{{ wg_name }}' RECURSIVE" - ] + ], + "id": "58410c8c" }, { "cell_type": "code", "execution_count": 28, - "id": "e2dbde48-0a93-44cf-b0ed-fc5e3c83cdf8", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "DROP STAGE FOLDER 'project-1' IN GROUP '{{ wg_name }}' RECURSIVE;\n", "DROP STAGE FOLDER 'project-3' IN GROUP '{{ wg_name }}' RECURSIVE;" - ] + ], + "id": "0c1291e3" }, { "attachments": {}, "cell_type": "markdown", - "id": "97516316-c04b-4d88-9f40-b6e2723fe26b", "metadata": {}, "source": [ "## Conclusion\n", @@ -599,11 +598,12 @@ "using Fusion SQL. It is also possible to work with Stage files using the SingleStoreDB\n", "Python SDK, see the [API documentation](https://singlestoredb-python.labs.singlestore.com/api.html#stage)\n", "for more details." - ] + ], + "id": "9708218d" }, { + "id": "43ecb8fb", "cell_type": "markdown", - "id": "8844d60d-0a3c-418c-8375-d5f1c1c72cdc", "metadata": {}, "source": [ "
    \n", diff --git a/notebooks/movie-recommendation/notebook.ipynb b/notebooks/movie-recommendation/notebook.ipynb index a3cd0c02..412d6dca 100644 --- a/notebooks/movie-recommendation/notebook.ipynb +++ b/notebooks/movie-recommendation/notebook.ipynb @@ -1,8 +1,8 @@ { "cells": [ { + "id": "0826a622", "cell_type": "markdown", - "id": "ce06fbf2-db8e-4fb9-9036-40f9ec8c4592", "metadata": {}, "source": [ "
    \n", @@ -18,7 +18,6 @@ }, { "cell_type": "markdown", - "id": "e1c74826", "metadata": {}, "source": [ "
    \n", @@ -28,11 +27,11 @@ "

    This tutorial is meant for Standard & Premium Workspaces. You can't run this with a Free Starter Workspace due to restrictions on Storage. Create a Workspace using +group in the left nav & select Standard for this notebook. Gallery notebooks tagged with \"Starter\" are suitable to run on a Free Starter Workspace

    \n", "
    \n", "
    " - ] + ], + "id": "bd3c1a79" }, { "cell_type": "markdown", - "id": "7ed6450d-8003-47b8-9d31-4ffa221906ae", "metadata": {}, "source": [ "*Source*: [Full MovieLens 25M Dataset](https://grouplens.org/datasets/movielens/25m/) - [Appplication](https://movie-recommender-flask-t954.vercel.app/)\n", @@ -40,59 +39,59 @@ "This notebook demonstrates how SingleStoreDB helps you build a simple Movie Recommender System.\n", "\n", "" - ] + ], + "id": "9039eecb" }, { "cell_type": "markdown", - "id": "f940c981-b378-40cf-bb24-53b6015e486d", "metadata": {}, "source": [ "## 1. Install required libraries\n", "\n", "Install the library for vectorizing the data (up to 2 minutes)." - ] + ], + "id": "eee7cef6" }, { "cell_type": "code", "execution_count": 1, - "id": "4fc72c97-8ba9-462b-b241-ae2ff4e7531c", "metadata": {}, "outputs": [], "source": [ "!pip install sentence-transformers --quiet" - ] + ], + "id": "039f0b97" }, { "cell_type": "markdown", - "id": "5c9049cb-88b8-411a-b926-2517bd44859e", "metadata": {}, "source": [ "## 2. Create database and ingest data" - ] + ], + "id": "82ddb890" }, { "cell_type": "markdown", - "id": "2642c910-72a8-433b-b5fe-e5654f93f239", "metadata": {}, "source": [ "Create the `movie_recommender` database." - ] + ], + "id": "d3181151" }, { "cell_type": "code", "execution_count": 2, - "id": "fd83f672-5ef5-4a8e-9e7a-267dd19815f7", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "DROP DATABASE IF EXISTS movie_recommender;\n", "CREATE DATABASE IF NOT EXISTS movie_recommender;" - ] + ], + "id": "3e763118" }, { "cell_type": "markdown", - "id": "d6c75b9a-7a1f-44fe-9e25-f67f75c0d11f", "metadata": {}, "source": [ "
    \n", @@ -103,20 +102,20 @@ " It updates the connection_url which is used by the %%sql magic command and SQLAlchemy to make connections to the selected database.

    \n", "
    \n", "
    " - ] + ], + "id": "339bf265" }, { "cell_type": "markdown", - "id": "d215945f-3dcf-4e8d-b201-361af227260c", "metadata": {}, "source": [ "Create `tags` table and start pipeline." - ] + ], + "id": "3fda9e9d" }, { "cell_type": "code", "execution_count": 3, - "id": "f6e6f6d6-87c0-4cf8-9d21-72edf3416e8d", "metadata": {}, "outputs": [], "source": [ @@ -144,20 +143,20 @@ " (userId, movieId, tag, timestamp);\n", "\n", "START PIPELINE tags;" - ] + ], + "id": "3e21f911" }, { "cell_type": "markdown", - "id": "a85c13ff-e719-46b4-a781-891d60415ca8", "metadata": {}, "source": [ "Create `ratings` table and start pipeline." - ] + ], + "id": "9321b7e3" }, { "cell_type": "code", "execution_count": 4, - "id": "4c55da7c-fc1e-446c-aa9d-52ee3b085d97", "metadata": {}, "outputs": [], "source": [ @@ -185,20 +184,20 @@ " (userId, movieId, rating, timestamp);\n", "\n", "START PIPELINE ratings;" - ] + ], + "id": "fe23f570" }, { "cell_type": "markdown", - "id": "e79e99ac-190b-481c-bf8f-7d66265fba81", "metadata": {}, "source": [ "Create `movies` table and start pipeline." - ] + ], + "id": "b23fd64b" }, { "cell_type": "code", "execution_count": 5, - "id": "fba97ca5-3673-4ce8-b6f7-3214f6205bd9", "metadata": {}, "outputs": [], "source": [ @@ -226,23 +225,23 @@ " (movieId, title, genres);\n", "\n", "START PIPELINE movies;" - ] + ], + "id": "f7e76f0f" }, { "cell_type": "markdown", - "id": "6e41ed09-2cc3-4fb7-90da-077a04111417", "metadata": {}, "source": [ "### Check that all the data has been loaded\n", "\n", "There should be 25m rows for ratings, 62k for movies and 1m for tags. If the values are less than that, try the query\n", "again in a few seconds, the pipelines are still running." 
- ] + ], + "id": "25876eb6" }, { "cell_type": "code", "execution_count": 6, - "id": "cf4c1d0f-8de2-42ea-ab33-7dadbde74855", "metadata": {}, "outputs": [], "source": [ @@ -252,20 +251,20 @@ "SELECT COUNT(*) AS count_rows FROM movies\n", "UNION ALL\n", "SELECT COUNT(*) AS count_rows FROM tags" - ] + ], + "id": "9b7d1d50" }, { "cell_type": "markdown", - "id": "0fb4162d-a55a-4926-9ce0-500d5909e3a2", "metadata": {}, "source": [ "### Concatenate `tags` and `movies` tables using all tags" - ] + ], + "id": "3a2ee35a" }, { "cell_type": "code", "execution_count": 7, - "id": "17be3378-28ea-4487-9177-a266d0998a08", "metadata": {}, "outputs": [], "source": [ @@ -279,68 +278,68 @@ " FROM movies m\n", " LEFT JOIN tags t ON m.movieId = t.movieId\n", " GROUP BY m.movieId, m.title, m.genres;" - ] + ], + "id": "11de8ee9" }, { "cell_type": "markdown", - "id": "f4f2be44-5c5d-4113-9b9e-55bfa89d5d5b", "metadata": {}, "source": [ "## 3. Vectorize data" - ] + ], + "id": "8bd899ae" }, { "cell_type": "markdown", - "id": "f300136e-2a85-4095-867d-7723b3d61b2d", "metadata": {}, "source": [ "Initialize sentence transformer." - ] + ], + "id": "f4a0cd2d" }, { "cell_type": "code", "execution_count": 8, - "id": "204c4a32-bafd-45d7-928e-3ec1083b4b58", "metadata": {}, "outputs": [], "source": [ "from sentence_transformers import SentenceTransformer\n", "\n", "model = SentenceTransformer('flax-sentence-embeddings/all_datasets_v3_mpnet-base')" - ] + ], + "id": "fb22e686" }, { "cell_type": "markdown", - "id": "fce95741-8094-4a4b-8a4a-f92699471da5", "metadata": {}, "source": [ "Query the `movies_with_tags` table and store the output in a variable named `result`. The `result <<` syntax in the\n", "`%%sql` line indicates that the output from the query should get stored under that variable name." 
- ] + ], + "id": "80b88943" }, { "cell_type": "code", "execution_count": 9, - "id": "920798f0-55b6-4f2d-ba36-7f94e750f684", "metadata": {}, "outputs": [], "source": [ "%%sql result <<\n", "SELECT * FROM movies_with_tags" - ] + ], + "id": "0ca4f33c" }, { "cell_type": "markdown", - "id": "08651421-7313-4bb5-87f1-56ea165d45c5", "metadata": {}, "source": [ "Convert the result from the above SQL into a DataFrame and clean up quotes." - ] + ], + "id": "5347eef3" }, { "cell_type": "code", "execution_count": 10, - "id": "01d5cf58-65eb-4676-90b9-0d5a9253fd34", "metadata": {}, "outputs": [], "source": [ @@ -353,106 +352,106 @@ "df['allTags'] = df['allTags'].str.replace('\"', '').str.replace(\"'\", '')\n", "\n", "data = df.to_dict(orient='records')" - ] + ], + "id": "4a33d614" }, { "cell_type": "markdown", - "id": "134caeab-a430-4412-91d7-d74b3dd0002b", "metadata": {}, "source": [ "Check the first row of the list." - ] + ], + "id": "17c90ea9" }, { "cell_type": "code", "execution_count": 11, - "id": "950a5cd3-2b9d-41e7-9e9e-d841b2e92a02", "metadata": {}, "outputs": [], "source": [ "data[0]" - ] + ], + "id": "adeebd97" }, { "cell_type": "markdown", - "id": "6b88c557-44b5-4d48-b314-c50d65a4e5d5", "metadata": {}, "source": [ "Concatenate title and tags." - ] + ], + "id": "de736010" }, { "cell_type": "code", "execution_count": 12, - "id": "a1d9b7f5-409c-4053-a724-39747ce663b8", "metadata": {}, "outputs": [], "source": [ "all_title_type_column = [f'{row[\"title\"]}-{row[\"allTags\"]}' if row[\"title\"] is not None else row[\"title\"] for row in data]" - ] + ], + "id": "071d1141" }, { "cell_type": "markdown", - "id": "77af8001-a7d5-4a24-8290-b025d00ca1f3", "metadata": {}, "source": [ "Create the embeddings for Title & Tag (~3 minutes)." 
- ] + ], + "id": "e266fe5c" }, { "cell_type": "code", "execution_count": 13, - "id": "efd8aaa2-feaf-43ce-9f72-f5161d781f00", "metadata": {}, "outputs": [], "source": [ "# Remove [:3000] if you want to vectorize all rows (~60 minutes)\n", "all_embeddings = model.encode(all_title_type_column[:3000])\n", "all_embeddings.shape" - ] + ], + "id": "5125a794" }, { "cell_type": "markdown", - "id": "1e59050c-d53c-476f-a4cb-ce97f759efa0", "metadata": {}, "source": [ "Merge the original data with the vector data." - ] + ], + "id": "97f49fdc" }, { "cell_type": "code", "execution_count": 14, - "id": "54b0546d-0e22-4fde-a903-18c714bdd21d", "metadata": {}, "outputs": [], "source": [ "# Remember the list will be only 3,000 elements\n", "for row, embedding in zip(data, all_embeddings):\n", " row['embedding'] = embedding" - ] + ], + "id": "05a56d9b" }, { "cell_type": "code", "execution_count": 15, - "id": "1706fa59-d0f4-4378-9c76-2d74d53c428a", "metadata": {}, "outputs": [], "source": [ "data[0]" - ] + ], + "id": "53c5cac8" }, { "cell_type": "markdown", - "id": "f482feed-edd4-4489-8ab4-cdd5803515f3", "metadata": {}, "source": [ "## 4. Create table for movie information and vectors" - ] + ], + "id": "83ae8f78" }, { "cell_type": "code", "execution_count": 16, - "id": "32f4d535-8ea6-47a9-9fd3-5215a23f72e3", "metadata": {}, "outputs": [], "source": [ @@ -466,43 +465,43 @@ " allTags longtext CHARACTER SET utf8mb4,\n", " vector BLOB\n", ")" - ] + ], + "id": "e90dfaf0" }, { "cell_type": "markdown", - "id": "23570610-0ec1-4c3c-a63b-8221130042cf", "metadata": {}, "source": [ "Create a database connection using SQLAlchemy. We are going to use an SQLAlchemy connection here because one\n", "column of data is numpy arrays. The SingleStoreDB SQLAlchemy driver will automatically convert those to\n", "the correct binary format when uploading, so it's a bit more convenient than doing the conversions and\n", "formatting manually for the `%sql` magic command." 
- ] + ], + "id": "d86c3ab9" }, { "cell_type": "code", "execution_count": 17, - "id": "46c50550-5961-45df-ace8-65d7b91edc42", "metadata": {}, "outputs": [], "source": [ "from singlestoredb import create_engine\n", "\n", "conn = create_engine().connect()" - ] + ], + "id": "a0f60a5d" }, { "cell_type": "markdown", - "id": "db42c917-2d53-46c8-96b5-55d7e8b4d327", "metadata": {}, "source": [ "Insert the data. Some rows might encounter errors due to unsupported characters." - ] + ], + "id": "8d09c8e9" }, { "cell_type": "code", "execution_count": 18, - "id": "4cbf8232-7738-49fa-be00-1e7a5f1882b3", "metadata": {}, "outputs": [], "source": [ @@ -526,36 +525,36 @@ " ''')\n", "\n", "conn.execute(sql_query, data[:3000])" - ] + ], + "id": "857354ca" }, { "cell_type": "markdown", - "id": "192353be-db56-4cec-8d6e-a258532c6dd9", "metadata": {}, "source": [ "## 5. Marrying Search \u2764\ufe0f Semantic Search \u2764\ufe0f Analytics" - ] + ], + "id": "35d9e0ab" }, { "cell_type": "markdown", - "id": "edd2d5e9-b4ad-4d10-b638-eabeaaad4846", "metadata": {}, "source": [ "### Build autocomplete search" - ] + ], + "id": "f4d0f756" }, { "cell_type": "markdown", - "id": "26e07b98-b934-42b9-935c-1e9d750b7697", "metadata": {}, "source": [ "This is en experimentat we started with to render a full text search." 
- ] + ], + "id": "88fb5547" }, { "cell_type": "code", "execution_count": 19, - "id": "eca42fb8-2591-472b-a607-85c5fb3d5f63", "metadata": {}, "outputs": [], "source": [ @@ -567,20 +566,20 @@ " ORDER BY relevance DESC\n", " LIMIT 10)\n", " SELECT title, movieId FROM queryouter;" - ] + ], + "id": "d86ebf5b" }, { "cell_type": "markdown", - "id": "9f4d78ba-a0ad-4a4c-98e7-f06ce1281ed1", "metadata": {}, "source": [ "### Create user favorite movie tables" - ] + ], + "id": "482c813e" }, { "cell_type": "code", "execution_count": 20, - "id": "45a78f0a-c6bd-423b-bc5e-df641c78f48e", "metadata": {}, "outputs": [], "source": [ @@ -591,20 +590,20 @@ " ts datetime DEFAULT NULL,\n", " KEY userid (userid)\n", ")" - ] + ], + "id": "67bc3465" }, { "cell_type": "markdown", - "id": "b6f053f4-5654-48d0-b897-9dfb1aaff016", "metadata": {}, "source": [ "Enter dummy data for testing purposes." - ] + ], + "id": "ee7b2569" }, { "cell_type": "code", "execution_count": 21, - "id": "98ca4b36-3aa0-43d9-8e9f-ffb9f08e9b72", "metadata": {}, "outputs": [], "source": [ @@ -613,20 +612,20 @@ " VALUES ('user1', 'Zone 39 (1997)', '2022-01-01 00:00:00'),\n", " ('user1', 'Star Trek II: The Wrath of Khan (1982)', '2022-01-01 00:00:00'),\n", " ('user1', 'Giver, The (2014)', '2022-01-01 00:00:00');" - ] + ], + "id": "cc8db25d" }, { "cell_type": "markdown", - "id": "cbe64201-0260-43be-87c0-20616c34ce59", "metadata": {}, "source": [ "### Build semantic search for a movie recommendation" - ] + ], + "id": "549dd511" }, { "cell_type": "code", "execution_count": 22, - "id": "c2368b35-e2cd-4a7c-82a5-565cabc73f90", "metadata": {}, "outputs": [], "source": [ @@ -710,30 +709,30 @@ " dc.Rating_Match DESC\n", "LIMIT\n", " 5;" - ] + ], + "id": "1405d3b4" }, { "cell_type": "markdown", - "id": "3a9efa4a-1a38-49f3-ba6d-a00b6c90e2b0", "metadata": {}, "source": [ "## 6. What are you looking for?" 
- ] + ], + "id": "db8a6e82" }, { "cell_type": "code", "execution_count": 23, - "id": "87547d68-e1c9-4fd8-946f-6b378c9b36f2", "metadata": {}, "outputs": [], "source": [ "search_embedding = model.encode(\"I want see a French comedy movie\")" - ] + ], + "id": "9686be3c" }, { "cell_type": "code", "execution_count": 24, - "id": "3b70a678-4829-4456-8946-d1dcbf46c98b", "metadata": {}, "outputs": [], "source": [ @@ -747,30 +746,31 @@ "\n", "for i, res in enumerate(results):\n", " print(f\"{i + 1}: {res.title} {res.genres} Score: {res.score}\")" - ] + ], + "id": "46366909" }, { "cell_type": "markdown", - "id": "be066a74-95ff-4396-85df-bf8f1dd7b553", "metadata": {}, "source": [ "## Clean up" - ] + ], + "id": "f44af172" }, { "cell_type": "code", "execution_count": 25, - "id": "f5f8890c-b7c4-4503-9f5e-a93b5eb45fc6", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "DROP DATABASE IF EXISTS movie_recommender" - ] + ], + "id": "88ced78a" }, { + "id": "f678873e", "cell_type": "markdown", - "id": "0017d0c2-6647-431d-a937-85132af15b1a", "metadata": {}, "source": [ "
    \n", diff --git a/notebooks/network-intrusion-detection-part-1/notebook.ipynb b/notebooks/network-intrusion-detection-part-1/notebook.ipynb index 60adc011..16eb8472 100644 --- a/notebooks/network-intrusion-detection-part-1/notebook.ipynb +++ b/notebooks/network-intrusion-detection-part-1/notebook.ipynb @@ -1,8 +1,8 @@ { "cells": [ { + "id": "16f985a0", "cell_type": "markdown", - "id": "93ad2bda-e101-4aad-a83b-45f84560597c", "metadata": {}, "source": [ "
    \n", @@ -18,7 +18,6 @@ }, { "cell_type": "markdown", - "id": "b59d7dae", "metadata": {}, "source": [ "
    \n", @@ -28,11 +27,11 @@ "

    This tutorial is meant for Standard & Premium Workspaces. You can't run this with a Free Starter Workspace due to restrictions on Storage. Create a Workspace using +group in the left nav & select Standard for this notebook. Gallery notebooks tagged with \"Starter\" are suitable to run on a Free Starter Workspace

    \n", "
    \n", "
    " - ] + ], + "id": "612751b7" }, { "cell_type": "markdown", - "id": "4d343fe9-0c6f-4cf7-bf04-02524cdb5879", "metadata": {}, "source": [ "This notebook demonstrates the application of SingleStoreDB's similarity search to create a system for identifying infrequent occurrences, a common requirement in fields such as cybersecurity and fraud detection where only a small percentage of events are potentially malicious.\n", @@ -44,22 +43,22 @@ "Subsequently, we'll apply this dataset to search for the most similar matches when presented with new, unseen network events. We'll retrieve these matches along with their corresponding labels. This process enables us to classify the unseen events as either **benign** or **malicious** by propagating the labels of the matched events. It's essential to note that intrusion detection is a complex classification task, primarily because malicious events occur infrequently. The similarity search service plays a crucial role in identifying relevant historical labeled events, thus enabling the identification of these rare events while maintaining a low rate of false alarms.\n", "\n", "## Install Dependencies" - ] + ], + "id": "6914882f" }, { "cell_type": "code", "execution_count": 1, - "id": "c649045f-0a53-4c49-88cb-0351e872d68c", "metadata": {}, "outputs": [], "source": [ "!pip install tensorflow keras==2.15.0 --quiet" - ] + ], + "id": "452a7616" }, { "cell_type": "code", "execution_count": 2, - "id": "f3bb35f9-67ea-4a23-b713-888746494baf", "metadata": {}, "outputs": [], "source": [ @@ -70,20 +69,20 @@ "import tensorflow.keras.backend as K\n", "from tensorflow import keras\n", "from tensorflow.keras.models import Model" - ] + ], + "id": "eba059ba" }, { "cell_type": "markdown", - "id": "01fa2a0b-0213-4399-8481-adc1734cecf0", "metadata": {}, "source": [ "We'll define a Python context manager called `clear_memory()` using the **contextlib** module. 
This context manager will be used to clear memory by running Python's garbage collector (`gc.collect()`) after a block of code is executed." - ] + ], + "id": "82d925f1" }, { "cell_type": "code", "execution_count": 3, - "id": "b7b5a0c6-aef9-4fea-91d9-8777e75d5c1f", "metadata": {}, "outputs": [], "source": [ @@ -96,29 +95,29 @@ " yield\n", " finally:\n", " gc.collect()" - ] + ], + "id": "e6104311" }, { "cell_type": "markdown", - "id": "6eaabcc0-3bfa-4c67-86cd-8b3a42150a6f", "metadata": {}, "source": [ "We'll will incorporate portions of code from [research work](https://github.com/Colorado-Mesa-University-Cybersecurity/DeepLearning-IDS). To begin, we'll clone the repository required for data preparation." - ] + ], + "id": "ab2099e8" }, { "cell_type": "code", "execution_count": 4, - "id": "1d06bd7a-f3ad-4c3a-a085-d8583d20e2bc", "metadata": {}, "outputs": [], "source": [ "!git clone -q https://github.com/Colorado-Mesa-University-Cybersecurity/DeepLearning-IDS.git" - ] + ], + "id": "87b0d80a" }, { "cell_type": "markdown", - "id": "8c323aab-899d-4156-8cfe-98f0fdbb2147", "metadata": {}, "source": [ "## Data Preparation\n", @@ -144,31 +143,31 @@ "2. February 23, 2018\n", "\n", "These files will be retrieved and saved to the current directory. Our intention is to use one of these dates for training and generating vectors, while the other will be reserved for testing purposes." 
- ] + ], + "id": "9fa8527f" }, { "cell_type": "code", "execution_count": 5, - "id": "47a639f2-1f03-4972-8212-316887cc1c73", "metadata": {}, "outputs": [], "source": [ "!wget \"https://cse-cic-ids2018.s3.ca-central-1.amazonaws.com/Processed%20Traffic%20Data%20for%20ML%20Algorithms/Thursday-22-02-2018_TrafficForML_CICFlowMeter.csv\" -q --show-progress\n", "!wget \"https://cse-cic-ids2018.s3.ca-central-1.amazonaws.com/Processed%20Traffic%20Data%20for%20ML%20Algorithms/Friday-23-02-2018_TrafficForML_CICFlowMeter.csv\" -q --show-progress" - ] + ], + "id": "74172c31" }, { "cell_type": "markdown", - "id": "21103ba6-036d-4525-9e5a-073ec515aba9", "metadata": {}, "source": [ "### Review Data" - ] + ], + "id": "3fd974ac" }, { "cell_type": "code", "execution_count": 6, - "id": "c17e283b-8629-483a-9631-69392b3b872e", "metadata": {}, "outputs": [], "source": [ @@ -176,40 +175,40 @@ " data = pd.read_csv('Friday-23-02-2018_TrafficForML_CICFlowMeter.csv')\n", "\n", "data.Label.value_counts()" - ] + ], + "id": "af906dd8" }, { "cell_type": "markdown", - "id": "176bc1e1-822e-4e22-a5fa-0dd03d6ecf04", "metadata": {}, "source": [ "### Clean Data\n", "\n", "We'll run a cleanup script from the previously downloaded GitHub repo." - ] + ], + "id": "d7bdde5b" }, { "cell_type": "code", "execution_count": 7, - "id": "af4f1d28-cba5-4547-a2bd-a34b200fc261", "metadata": {}, "outputs": [], "source": [ "!python DeepLearning-IDS/data_cleanup.py \"Friday-23-02-2018_TrafficForML_CICFlowMeter.csv\" \"result23022018\"" - ] + ], + "id": "4a202100" }, { "cell_type": "markdown", - "id": "15fd1f07-d8a1-4ab5-a36c-5a08f1f043e9", "metadata": {}, "source": [ "We'll now review the cleaned data from the previous step." 
- ] + ], + "id": "26dfb70a" }, { "cell_type": "code", "execution_count": 8, - "id": "d893178a-6bcc-4a16-89ef-f3559adad14c", "metadata": {}, "outputs": [], "source": [ @@ -217,21 +216,21 @@ " data_23_cleaned = pd.read_csv('result23022018.csv')\n", "\n", "data_23_cleaned.head()" - ] + ], + "id": "8022583d" }, { "cell_type": "code", "execution_count": 9, - "id": "d98eee6b-6a68-4fb6-b418-1599daa9e4ff", "metadata": {}, "outputs": [], "source": [ "data_23_cleaned.Label.value_counts()" - ] + ], + "id": "3318ae80" }, { "cell_type": "markdown", - "id": "1b466f2f-ee4f-43ba-b1e6-0cb3380c9125", "metadata": {}, "source": [ "## Load Model\n", @@ -264,23 +263,23 @@ "

    \n", "
    \n", "
    " - ] + ], + "id": "28259b48" }, { "cell_type": "code", "execution_count": 10, - "id": "4074a769-b16b-4543-b0db-518c7f95f205", "metadata": {}, "outputs": [], "source": [ "!wget -q -O it_threat_model.zip \"https://drive.google.com/uc?export=download&id=1ahr5dYlhuxS56M6helUFI0yIxxIoFk9o\"\n", "!unzip -q it_threat_model.zip" - ] + ], + "id": "9b4a2044" }, { "cell_type": "code", "execution_count": 11, - "id": "7c77cf96-e3f7-4bed-9f7e-913f851367d8", "metadata": {}, "outputs": [], "source": [ @@ -288,12 +287,12 @@ " model = keras.models.load_model('it_threat_model')\n", "\n", "model.summary()" - ] + ], + "id": "c835db10" }, { "cell_type": "code", "execution_count": 12, - "id": "ed66c137-1023-4dce-b9ea-5e742099302e", "metadata": {}, "outputs": [], "source": [ @@ -304,23 +303,23 @@ " inputs = model.input,\n", " outputs = model.get_layer(layer_name).output\n", " )" - ] + ], + "id": "bcf6f626" }, { "cell_type": "markdown", - "id": "24d3bfbb-78ee-4a8b-a429-84f988915b6d", "metadata": {}, "source": [ "## Upload Data to SingleStoreDB\n", "\n", "### Prepare Data\n", "We'll use a method for defining item IDs that aligns with the event's label." - ] + ], + "id": "38c9b80e" }, { "cell_type": "code", "execution_count": 13, - "id": "ae1fc38e-f532-430a-99cd-f3e154343bd4", "metadata": {}, "outputs": [], "source": [ @@ -333,20 +332,20 @@ " for i, res in tqdm(zip(data_23_cleaned.iterrows(), model_res), total = len(model_res)):\n", " benign_or_attack = i[1]['Label'][:3]\n", " items_to_upload.append((benign_or_attack + '_' + str(i[0]), res.tolist()))" - ] + ], + "id": "061b15ef" }, { "cell_type": "markdown", - "id": "e4b3f74e-efea-499e-b25a-74316fcf2395", "metadata": {}, "source": [ "We'll store the data in a Pandas DataFrame." 
- ] + ], + "id": "f739e1d9" }, { "cell_type": "code", "execution_count": 14, - "id": "0177714c-62b7-4d3e-a93b-2fd467ff1352", "metadata": {}, "outputs": [], "source": [ @@ -354,20 +353,20 @@ " df = pd.DataFrame(items_to_upload, columns=['ID', 'Model_Results'])\n", "\n", "df.head()" - ] + ], + "id": "79591196" }, { "cell_type": "markdown", - "id": "f52639f6-82c8-427f-aa7b-9859d6580eae", "metadata": {}, "source": [ "Now we'll convert the vectors to a binary format, ready to store in SingleStoreDB." - ] + ], + "id": "1de5e52c" }, { "cell_type": "code", "execution_count": 15, - "id": "ec8a2d26-98cf-4f1a-9e13-e197cd65040c", "metadata": {}, "outputs": [], "source": [ @@ -379,38 +378,38 @@ "\n", "with clear_memory():\n", " df['Model_Results'] = df['Model_Results'].apply(data_to_binary)" - ] + ], + "id": "bc09c903" }, { "cell_type": "markdown", - "id": "e267e6a4-3e70-4411-96cd-94da3f1f6011", "metadata": {}, "source": [ "We'll check the DataFrame." - ] + ], + "id": "18e498c2" }, { "cell_type": "code", "execution_count": 16, - "id": "770cfe1e-4d55-43a8-a5e0-36da2525ca25", "metadata": {}, "outputs": [], "source": [ "df.head()" - ] + ], + "id": "33ffbd2c" }, { "cell_type": "markdown", - "id": "8131ad6c-6d50-49db-845c-457e9957ddb9", "metadata": {}, "source": [ "### Create Database and Table" - ] + ], + "id": "b0d4da4a" }, { "cell_type": "code", "execution_count": 17, - "id": "8940168f-072f-42cd-8855-4c6b6829421d", "metadata": {}, "outputs": [], "source": [ @@ -427,11 +426,11 @@ " id TEXT,\n", " Model_Results BLOB\n", ");" - ] + ], + "id": "9446c98b" }, { "cell_type": "markdown", - "id": "b6390a0c-d60a-4876-85bd-56e17deabeeb", "metadata": {}, "source": [ "### Get Connection Details\n", @@ -443,32 +442,32 @@ "

    Select the database from the drop-down menu at the top of this notebook. It updates the connection_url which is used by SQLAlchemy to make connections to the selected database.

    \n", "
    \n", "
    " - ] + ], + "id": "60cd440c" }, { "cell_type": "code", "execution_count": 18, - "id": "8208b373-1973-4925-8d8a-cd43ba13e7d7", "metadata": {}, "outputs": [], "source": [ "from sqlalchemy import *\n", "\n", "db_connection = create_engine(connection_url)" - ] + ], + "id": "e505621d" }, { "cell_type": "markdown", - "id": "d97c6886-0ea9-47b5-938b-03ada717b394", "metadata": {}, "source": [ "### Store DataFrame" - ] + ], + "id": "8c15b5f7" }, { "cell_type": "code", "execution_count": 19, - "id": "e70b91a0-fdb9-46da-8c0d-5acc947799be", "metadata": {}, "outputs": [], "source": [ @@ -480,20 +479,20 @@ " index = False,\n", " chunksize = 1000\n", " )" - ] + ], + "id": "ffda3c99" }, { "cell_type": "markdown", - "id": "559928bd-10a0-4880-805a-23444d7d72a3", "metadata": {}, "source": [ "### Check Stored Data" - ] + ], + "id": "78c6998c" }, { "cell_type": "code", "execution_count": 20, - "id": "f4cb8e7f-362f-4477-8600-e5f39ec197b8", "metadata": {}, "outputs": [], "source": [ @@ -503,11 +502,12 @@ "SELECT ID, JSON_ARRAY_UNPACK(Model_Results) AS Model_Results\n", "FROM model_results\n", "LIMIT 1;" - ] + ], + "id": "ae793e88" }, { + "id": "739a625c", "cell_type": "markdown", - "id": "e4eb9309-df32-42f3-b170-51c49b175855", "metadata": {}, "source": [ "
    \n", diff --git a/notebooks/network-intrusion-detection-part-2/notebook.ipynb b/notebooks/network-intrusion-detection-part-2/notebook.ipynb index 1eaef402..933083a9 100644 --- a/notebooks/network-intrusion-detection-part-2/notebook.ipynb +++ b/notebooks/network-intrusion-detection-part-2/notebook.ipynb @@ -1,8 +1,8 @@ { "cells": [ { + "id": "2992bf6e", "cell_type": "markdown", - "id": "d62916d4-c174-4708-92b1-b6940fa5a861", "metadata": {}, "source": [ "
    \n", @@ -18,7 +18,6 @@ }, { "cell_type": "markdown", - "id": "afc4742e", "metadata": {}, "source": [ "
    \n", @@ -28,30 +27,30 @@ "

    This tutorial is meant for Standard & Premium Workspaces. You can't run this with a Free Starter Workspace due to restrictions on Storage. Create a Workspace using +group in the left nav & select Standard for this notebook. Gallery notebooks tagged with \"Starter\" are suitable to run on a Free Starter Workspace

    \n", "
    \n", "
    " - ] + ], + "id": "f2242d49" }, { "cell_type": "markdown", - "id": "59521b37-8c16-4713-b341-dcf4d020ef88", "metadata": {}, "source": [ "## Install Dependencies" - ] + ], + "id": "2fc2391c" }, { "cell_type": "code", "execution_count": 1, - "id": "c649045f-0a53-4c49-88cb-0351e872d68c", "metadata": {}, "outputs": [], "source": [ "!pip3 install tensorflow keras==2.15.0 scikit-learn --quiet" - ] + ], + "id": "2ff93766" }, { "cell_type": "code", "execution_count": 2, - "id": "f3bb35f9-67ea-4a23-b713-888746494baf", "metadata": {}, "outputs": [], "source": [ @@ -65,20 +64,20 @@ "from sklearn.metrics import confusion_matrix\n", "from tensorflow import keras\n", "from tensorflow.keras.models import Model" - ] + ], + "id": "9a2a4099" }, { "cell_type": "markdown", - "id": "7f949377-d93b-43d3-b9e1-2fb910f843fd", "metadata": {}, "source": [ "We'll define a Python context manager called `clear_memory()` using the **contextlib** module. This context manager will be used to clear memory by running Python's garbage collector (`gc.collect()`) after a block of code is executed." 
- ] + ], + "id": "14ff0f59" }, { "cell_type": "code", "execution_count": 3, - "id": "00eccb20-5b95-4383-88c4-2dff1b494a1f", "metadata": {}, "outputs": [], "source": [ @@ -91,20 +90,20 @@ " yield\n", " finally:\n", " gc.collect()" - ] + ], + "id": "418c5bb8" }, { "cell_type": "markdown", - "id": "b06e5a33-bf7f-41ea-85fd-1b13871c40e9", "metadata": {}, "source": [ "## Load Model" - ] + ], + "id": "cffe1c26" }, { "cell_type": "code", "execution_count": 4, - "id": "7c77cf96-e3f7-4bed-9f7e-913f851367d8", "metadata": {}, "outputs": [], "source": [ @@ -112,12 +111,12 @@ " model = keras.models.load_model('it_threat_model')\n", "\n", "model.summary()" - ] + ], + "id": "9261dede" }, { "cell_type": "code", "execution_count": 5, - "id": "ed66c137-1023-4dce-b9ea-5e742099302e", "metadata": {}, "outputs": [], "source": [ @@ -128,11 +127,11 @@ " inputs = model.input,\n", " outputs = model.get_layer(layer_name).output\n", " )" - ] + ], + "id": "0b1ae83c" }, { "cell_type": "markdown", - "id": "f3756497-a5a8-453a-9257-13ac0122a6ff", "metadata": {}, "source": [ "## Data Preparation\n", @@ -140,12 +139,12 @@ "We'll use the second file we downloaded earlier for testing purposes.\n", "\n", "### Review Data" - ] + ], + "id": "cb5bed43" }, { "cell_type": "code", "execution_count": 6, - "id": "bc2325d6-a298-4916-bf33-7416b3b4ab25", "metadata": {}, "outputs": [], "source": [ @@ -153,40 +152,40 @@ " data = pd.read_csv('Thursday-22-02-2018_TrafficForML_CICFlowMeter.csv')\n", "\n", "data.Label.value_counts()" - ] + ], + "id": "dce8c0e5" }, { "cell_type": "markdown", - "id": "3c515eb7-1e06-4be3-993b-cc4483cb7bf6", "metadata": {}, "source": [ "### Clean Data\n", "\n", "We'll run a cleanup script from the previously downloaded GitHub repo." 
- ] + ], + "id": "c1bf9a08" }, { "cell_type": "code", "execution_count": 7, - "id": "73e53251-5fd9-4fbd-856e-2bbc560a2924", "metadata": {}, "outputs": [], "source": [ "!python DeepLearning-IDS/data_cleanup.py \"Thursday-22-02-2018_TrafficForML_CICFlowMeter.csv\" \"result22022018\"" - ] + ], + "id": "d08aeccd" }, { "cell_type": "markdown", - "id": "67c1125c-bb62-40b8-b885-d5d9187b5f81", "metadata": {}, "source": [ "We'll now review the cleaned data from the previous step." - ] + ], + "id": "3510cc41" }, { "cell_type": "code", "execution_count": 8, - "id": "326a320a-9388-4108-a0f0-10520d25c2a4", "metadata": {}, "outputs": [], "source": [ @@ -194,30 +193,30 @@ " data_22_cleaned = pd.read_csv('result22022018.csv')\n", "\n", "data_22_cleaned.head()" - ] + ], + "id": "fdb4acc0" }, { "cell_type": "code", "execution_count": 9, - "id": "66a13b5c-07b2-413b-9b77-ce490fa8507b", "metadata": {}, "outputs": [], "source": [ "data_22_cleaned.Label.value_counts()" - ] + ], + "id": "00215a84" }, { "cell_type": "markdown", - "id": "df41d065-924e-4de5-834c-967846688e13", "metadata": {}, "source": [ "We'll create a sample that encompasses all the distinct types of web attacks observed on this particular date." - ] + ], + "id": "a3caf02f" }, { "cell_type": "code", "execution_count": 10, - "id": "9f669747-2c58-4113-a099-070c5a08a492", "metadata": {}, "outputs": [], "source": [ @@ -225,11 +224,11 @@ " data_sample = data_22_cleaned[-2000:]\n", "\n", "data_sample.Label.value_counts()" - ] + ], + "id": "02dfb61b" }, { "cell_type": "markdown", - "id": "722010dd-2a2e-41a8-848f-bad78d0fa79f", "metadata": {}, "source": [ "## Get Connection Details\n", @@ -241,34 +240,34 @@ "

    Select the database from the drop-down menu at the top of this notebook. It updates the connection_url which is used by SQLAlchemy to make connections to the selected database.

    \n", "
    \n", "
    " - ] + ], + "id": "93de2597" }, { "cell_type": "code", "execution_count": 11, - "id": "806149fa-bba9-42fd-b804-3b5964bceca5", "metadata": {}, "outputs": [], "source": [ "from sqlalchemy import *\n", "\n", "db_connection = create_engine(connection_url)" - ] + ], + "id": "93738c2e" }, { "cell_type": "markdown", - "id": "0f5e79af-3991-45e1-845e-a24390437d56", "metadata": {}, "source": [ "## Queries\n", "\n", "Next, we'll perform queries on the test dataset and store the predicted and expected results, enabling us to construct a confusion matrix." - ] + ], + "id": "281bbdda" }, { "cell_type": "code", "execution_count": 12, - "id": "ddcd73cd-0d87-4aaf-82be-5d7bd0e73566", "metadata": {}, "outputs": [], "source": [ @@ -311,22 +310,22 @@ " y_pred.append(1)\n", " else:\n", " y_pred.append(0)" - ] + ], + "id": "df9e8836" }, { "cell_type": "markdown", - "id": "62a3faef-773d-48ed-82c7-63dca0e31e07", "metadata": {}, "source": [ "## Visualize Results\n", "\n", "### Confusion Matrix" - ] + ], + "id": "fca608c9" }, { "cell_type": "code", "execution_count": 13, - "id": "28dde466-1dcc-4b5f-98a4-4e3ce31cf12c", "metadata": {}, "outputs": [], "source": [ @@ -386,12 +385,12 @@ ")\n", "\n", "fig.show()" - ] + ], + "id": "b002ba31" }, { "cell_type": "code", "execution_count": 14, - "id": "abaddb79-c9ea-4331-968e-98cd540f034b", "metadata": {}, "outputs": [], "source": [ @@ -406,20 +405,20 @@ "for i in range(len(class_labels)):\n", " for j in range(len(class_labels)):\n", " print(f\"{class_labels[i]} (Actual) -> {class_labels[j]} (Predicted): {conf_matrix[i][j]}\")" - ] + ], + "id": "68823624" }, { "cell_type": "markdown", - "id": "a212b279-edce-4b3b-ad82-3b4933b1b10f", "metadata": {}, "source": [ "### Accuracy" - ] + ], + "id": "851a1321" }, { "cell_type": "code", "execution_count": 15, - "id": "bdb00bdc-4264-4ee5-b953-fed3c7317dd6", "metadata": {}, "outputs": [], "source": [ @@ -431,20 +430,20 @@ "print(f\"Accuracy: {acc:.3f}\")\n", "print(f\"Precision: {precision:.3f}\")\n", 
"print(f\"Recall: {recall:.3f}\")" - ] + ], + "id": "09277b85" }, { "cell_type": "markdown", - "id": "30a06f84-7d9c-48d2-9b59-5d0aafb9b328", "metadata": {}, "source": [ "### Per Class Accuracy" - ] + ], + "id": "6f0d60f8" }, { "cell_type": "code", "execution_count": 16, - "id": "46f58998-e5ce-4c31-9612-25669254e98c", "metadata": {}, "outputs": [], "source": [ @@ -453,22 +452,22 @@ "per_class_accuracy_df = pd.DataFrame([(index, round(value,4)) for index, value in zip(['Benign', 'Attack'], cmd)], columns = ['type', 'accuracy'])\n", "per_class_accuracy_df = per_class_accuracy_df.round(2)\n", "display(per_class_accuracy_df)" - ] + ], + "id": "ed745dbc" }, { "cell_type": "markdown", - "id": "18da66d1-fa5f-4c0a-913f-915fc49c9411", "metadata": {}, "source": [ "## Predict Values Directly from Model\n", "\n", "We achieved excellent results with SingleStoreDB. Now, let's explore what happens when we bypass the similarity search step and make predictions directly from the model. In other words, we'll utilize the model responsible for generating the embeddings as a classifier. We can then compare the accuracy of this approach with that of the similarity search method." 
- ] + ], + "id": "02793d36" }, { "cell_type": "code", "execution_count": 17, - "id": "1ae8e5cb-c0fe-46e1-950f-3274644d869d", "metadata": {}, "outputs": [], "source": [ @@ -478,22 +477,22 @@ "data_sample = normalize(data_22_cleaned.iloc[:, :-1])[-2000:]\n", "y_pred_model = model.predict(normalize(data_sample)).flatten()\n", "y_pred_model = np.round(y_pred_model)" - ] + ], + "id": "a76887e3" }, { "cell_type": "markdown", - "id": "23953237-8c81-4fe3-bead-aacbcc994c05", "metadata": {}, "source": [ "## Visualize Results\n", "\n", "### Confusion Matrix" - ] + ], + "id": "7ddfcc36" }, { "cell_type": "code", "execution_count": 18, - "id": "ce2b6188-5fab-4ee8-ad0b-8e10f11215fe", "metadata": {}, "outputs": [], "source": [ @@ -551,12 +550,12 @@ ")\n", "\n", "fig.show()" - ] + ], + "id": "31b69dd8" }, { "cell_type": "code", "execution_count": 19, - "id": "c9621933-9b22-4536-bf7a-6c1c78c4ac4d", "metadata": {}, "outputs": [], "source": [ @@ -571,20 +570,20 @@ "for i in range(len(class_labels)):\n", " for j in range(len(class_labels)):\n", " print(f\"{class_labels[i]} (Actual) -> {class_labels[j]} (Predicted): {conf_matrix[i][j]}\")" - ] + ], + "id": "d1b16e3b" }, { "cell_type": "markdown", - "id": "7168534e-77c1-440f-b55a-7fad84af37aa", "metadata": {}, "source": [ "### Accuracy" - ] + ], + "id": "2328cf64" }, { "cell_type": "code", "execution_count": 20, - "id": "9ad16de3-2eb6-4ae3-b269-76c1f131fe63", "metadata": {}, "outputs": [], "source": [ @@ -596,20 +595,20 @@ "print(f\"Accuracy: {acc:.3f}\")\n", "print(f\"Precision: {precision:.3f}\")\n", "print(f\"Recall: {recall:.3f}\")" - ] + ], + "id": "125dac6e" }, { "cell_type": "markdown", - "id": "b80e72e5-814c-4533-9685-b126a0b481e7", "metadata": {}, "source": [ "### Per Class Accuracy" - ] + ], + "id": "b3aa1fc8" }, { "cell_type": "code", "execution_count": 21, - "id": "bc88066d-df3b-48be-9ff6-246587e6630c", "metadata": {}, "outputs": [], "source": [ @@ -618,21 +617,22 @@ "per_class_accuracy_df = pd.DataFrame([(index, 
round(value,4)) for index, value in zip(['Benign', 'Attack'], cmd)], columns = ['type', 'accuracy'])\n", "per_class_accuracy_df = per_class_accuracy_df.round(2)\n", "display(per_class_accuracy_df)" - ] + ], + "id": "4e493b23" }, { "cell_type": "markdown", - "id": "5455ba06-f2ba-4e2e-9494-c65ab2d9d96c", "metadata": {}, "source": [ "# Conclusions\n", "\n", "Utilizing SingleStoreDB's vector embeddings, we achieved an extremely high detection rate for attacks while maintaining a very small false-positive rate. Furthermore, our example showed that our similarity search methodology surpassed the direct classification approach that relies on the classifier's embedding model." - ] + ], + "id": "74d0b2a2" }, { + "id": "af0d74fe", "cell_type": "markdown", - "id": "9f7e1cd9-615c-40cf-ac02-de9b51f0fa24", "metadata": {}, "source": [ "
    \n", diff --git a/notebooks/network-intrusion-detection-part-3/notebook.ipynb b/notebooks/network-intrusion-detection-part-3/notebook.ipynb index a149cff2..e6edb574 100644 --- a/notebooks/network-intrusion-detection-part-3/notebook.ipynb +++ b/notebooks/network-intrusion-detection-part-3/notebook.ipynb @@ -1,8 +1,8 @@ { "cells": [ { + "id": "713fcbd6", "cell_type": "markdown", - "id": "37793ab8-641c-4596-9e5b-445dd92ba229", "metadata": {}, "source": [ "
    \n", @@ -18,7 +18,6 @@ }, { "cell_type": "markdown", - "id": "27abe4ed", "metadata": {}, "source": [ "
    \n", @@ -28,15 +27,16 @@ "

    This tutorial is meant for Standard & Premium Workspaces. You can't run this with a Free Starter Workspace due to restrictions on Storage. Create a Workspace using +group in the left nav & select Standard for this notebook. Gallery notebooks tagged with \"Starter\" are suitable to run on a Free Starter Workspace

    \n", "
    \n", "
    " - ] + ], + "id": "16803778" }, { "cell_type": "markdown", - "id": "184309fb-7eef-4ae2-bb10-dd431eba6811", "metadata": {}, "source": [ "## Get pipeline data from Confluent (Kafka)" - ] + ], + "id": "e8f5f369" }, { "attachments": { @@ -45,52 +45,51 @@ } }, "cell_type": "markdown", - "id": "996738fc-01ab-4d72-8acd-a96f2862c476", "metadata": {}, "source": [ "![Kafka_SingleStore.png](attachment:e89ae78c-5f57-46a6-82d2-b38fb4b926f5.png)" - ] + ], + "id": "6009d19c" }, { "cell_type": "markdown", - "id": "b4cd5073", "metadata": {}, "source": [ "### We recommend for that step to use a S1+ size workspace" - ] + ], + "id": "4986a39b" }, { "cell_type": "markdown", - "id": "ee260ca9", "metadata": {}, "source": [ "

    Action Required

    Make sure to select the siem_log_kafka_demo database from the drop-down menu at the top of this notebook. It updates the connection_url which is used by the %%sql magic command and SQLAlchemy to make connections to the selected database.

    " - ] + ], + "id": "de7a12af" }, { "cell_type": "code", "execution_count": 1, - "id": "89c133a9-1765-4f61-a9bb-fce74f4a7c0a", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "DROP PIPELINE IF EXISTS `siem_log_real`;\n", "DROP TABLE IF EXISTS `siem_log_real`;" - ] + ], + "id": "8f9df203" }, { "cell_type": "markdown", - "id": "f6100d05-d796-452b-893c-8e9240d9a3f9", "metadata": {}, "source": [ "We start creating a simple table to load the logs into a JSON column" - ] + ], + "id": "c2ae2155" }, { "cell_type": "code", "execution_count": 2, - "id": "f09a053e-3082-422a-af8a-3cc81846b172", "metadata": {}, "outputs": [], "source": [ @@ -99,20 +98,20 @@ " `logs` JSON COLLATE utf8_bin\n", " , SHARD KEY ()\n", ") AUTOSTATS_CARDINALITY_MODE=PERIODIC AUTOSTATS_HISTOGRAM_MODE=CREATE SQL_MODE='STRICT_ALL_TABLES';" - ] + ], + "id": "9be03cca" }, { "cell_type": "markdown", - "id": "0f9e5c48-ed62-4354-a311-90672862fcae", "metadata": {}, "source": [ "We create a pipeline from the Confluent Cluster with an interval of 20ms" - ] + ], + "id": "afe01c6d" }, { "cell_type": "code", "execution_count": 3, - "id": "2022c2aa-4b29-41b3-894f-1a2e40311ed2", "metadata": {}, "outputs": [], "source": [ @@ -126,39 +125,39 @@ "INTO TABLE `siem_log_real`\n", "FIELDS TERMINATED BY '\\t' ENCLOSED BY '' ESCAPED BY '\\\\'\n", "LINES TERMINATED BY '\\n' STARTING BY '';" - ] + ], + "id": "f5e04248" }, { "cell_type": "markdown", - "id": "a8511976-97ba-4bc0-8dca-0b41cd6e0c84", "metadata": {}, "source": [ "Let's start the pipeline" - ] + ], + "id": "a4ca8d6f" }, { "cell_type": "code", "execution_count": 4, - "id": "042af566-8b5d-47e1-9626-81c2aad6b4a0", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "START PIPELINE siem_log_real;" - ] + ], + "id": "2431ca04" }, { "cell_type": "markdown", - "id": "5a414b17-23a0-465a-81b0-3a62e7f1f4c3", "metadata": {}, "source": [ "We extract a few elements from the JSON column such as timestamp, Log_ID, and the vector to be stored in a blob format. 
Data is extracted as soon as an update is made to the table" - ] + ], + "id": "ff7a0d75" }, { "cell_type": "code", "execution_count": 5, - "id": "5d687011-86ab-4acf-9f5e-863284c23793", "metadata": {}, "outputs": [], "source": [ @@ -167,30 +166,30 @@ "ADD COLUMN Timestamp as JSON_EXTRACT_STRING(`logs`,'Timestamp') PERSISTED datetime,\n", "ADD COLUMN model_res_blob AS JSON_ARRAY_PACK_F32(JSON_EXTRACT_STRING(`logs`, 'model_res')) PERSISTED BLOB,\n", "ADD COLUMN Log_ID AS JSON_EXTRACT_BIGINT(`logs`, 'Log_ID') PERSISTED bigint;" - ] + ], + "id": "bf8044d2" }, { "cell_type": "markdown", - "id": "fdfdaa2b-b3e3-46e0-8bc7-a697d03cb5c0", "metadata": {}, "source": [ "## Install libraries for real-time dashboarding with Perspective" - ] + ], + "id": "bd38fa36" }, { "cell_type": "code", "execution_count": 6, - "id": "7e6d493e-5fd8-47d5-a040-b3c5a629c6d2", "metadata": {}, "outputs": [], "source": [ "%pip install perspective-python --quiet" - ] + ], + "id": "08dab25d" }, { "cell_type": "code", "execution_count": 7, - "id": "b4f823ea-8a41-458e-b19c-f65a59a928f7", "metadata": {}, "outputs": [], "source": [ @@ -202,20 +201,20 @@ "from perspective import Table, PerspectiveWidget\n", "import warnings\n", "warnings.filterwarnings('ignore')" - ] + ], + "id": "2109fe0f" }, { "cell_type": "markdown", - "id": "0bc126ab-971e-4acd-8d93-3507636b6605", "metadata": {}, "source": [ "We will set dashboard with a refresh rate of 500ms. We use two modes: stop and run to stop a dashboard retrieving results from the database." 
- ] + ], + "id": "2fb49b93" }, { "cell_type": "code", "execution_count": 8, - "id": "f866ec95-0c9f-4771-aaee-5e86a05e1939", "metadata": {}, "outputs": [], "source": [ @@ -224,20 +223,20 @@ " while mode == 'run':\n", " table.update(data_source())\n", " time.sleep(0.5)" - ] + ], + "id": "3078059c" }, { "cell_type": "markdown", - "id": "c65ef6c4-3a1e-48d3-8cf7-b54619fd6e84", "metadata": {}, "source": [ "## Track Real-Time Connections" - ] + ], + "id": "509f9e66" }, { "cell_type": "code", "execution_count": 9, - "id": "4d3f67ff-3143-4f55-b371-52898af7aeda", "metadata": {}, "outputs": [], "source": [ @@ -250,47 +249,48 @@ " \"Timestamp\": datetime,\n", " \"count_connections\": int\n", "}" - ] + ], + "id": "b25fec09" }, { "cell_type": "code", "execution_count": 10, - "id": "f27cd500-8e90-475a-b991-ecb371212721", "metadata": {}, "outputs": [], "source": [ "mode = 'run'\n", "table = perspective.Table(SCHEMA, limit=100)\n", "threading.Thread(target=loop).start()" - ] + ], + "id": "0a47a677" }, { "cell_type": "code", "execution_count": 11, - "id": "77dc7166-9037-483b-8578-d4fd6626af71", "metadata": {}, "outputs": [], "source": [ "perspective.PerspectiveWidget(table,title = \"Track Real-Time Connections\", group_by=[\"Timestamp\"],plugin=\"Y Line\",columns=[\"count_connections\"])" - ] + ], + "id": "2aceda75" }, { "cell_type": "code", "execution_count": 12, - "id": "40296099-a5cf-40b5-8295-b097fd94d630", "metadata": {}, "outputs": [], "source": [ "mode = 'stop'" - ] + ], + "id": "f910d062" }, { "cell_type": "markdown", - "id": "2282842a-918c-4937-a602-d2edc41b3825", "metadata": {}, "source": [ "## Monitor and Infer IT Threats using Semantic Search over Real-Time Data" - ] + ], + "id": "d45fcdb5" }, { "attachments": { @@ -299,16 +299,15 @@ } }, "cell_type": "markdown", - "id": "9414c751-1c61-4c01-aa55-f5a6e064df7a", "metadata": {}, "source": [ "![Semantic_searches.png](attachment:09fb8fd5-67f5-47d9-9904-a782e51df5dc.png)" - ] + ], + "id": "92c9b97f" }, { "cell_type": "code", 
"execution_count": 13, - "id": "755d7238-4fe9-4119-ad4b-d4fc2f5bf4b6", "metadata": {}, "outputs": [], "source": [ @@ -321,52 +320,52 @@ " \"log_status\": str,\n", " \"count_connections\": int\n", "}" - ] + ], + "id": "1e9a7fd5" }, { "cell_type": "code", "execution_count": 14, - "id": "8ea8b1cf-50ec-48c5-94fe-f19977e81f93", "metadata": {}, "outputs": [], "source": [ "mode = 'run'\n", "table = perspective.Table(SCHEMA, limit=100)\n", "threading.Thread(target=loop).start()" - ] + ], + "id": "ea1a4736" }, { "cell_type": "code", "execution_count": 15, - "id": "70d45c26-a29e-4fdb-a42c-fb5bd35e2836", "metadata": {}, "outputs": [], "source": [ "perspective.PerspectiveWidget(table,title = \"Monitor Threat Inference\", split_by=[\"log_status\"],plugin=\"Y Line\",columns=[\"count_connections\"])" - ] + ], + "id": "551458cb" }, { "cell_type": "code", "execution_count": 16, - "id": "948fa4fb-a55e-4e52-a843-d0dadf47448e", "metadata": {}, "outputs": [], "source": [ "mode = 'stop'" - ] + ], + "id": "8b160e20" }, { "cell_type": "markdown", - "id": "96a3c621-f5b1-4236-832d-514979b45cef", "metadata": {}, "source": [ "## Track latest connections with Inferences Threat Inference by Log IDs" - ] + ], + "id": "3e00e91f" }, { "cell_type": "code", "execution_count": 17, - "id": "3d846312-3c0f-47c1-ba13-88b83a20ed7d", "metadata": {}, "outputs": [], "source": [ @@ -380,43 +379,44 @@ " \"TIMESTAMP\": datetime,\n", " \"log_status\": str\n", "}" - ] + ], + "id": "8a0d5b54" }, { "cell_type": "code", "execution_count": 18, - "id": "a79aa8f9-c4e1-4c4e-a5ec-e1ef5e6b0299", "metadata": {}, "outputs": [], "source": [ "mode = 'run'\n", "table = perspective.Table(SCHEMA, limit=20)\n", "threading.Thread(target=loop).start()" - ] + ], + "id": "89f6912c" }, { "cell_type": "code", "execution_count": 19, - "id": "3e6db8bb-f609-4e1f-8b5a-5ffd6567e210", "metadata": {}, "outputs": [], "source": [ "perspective.PerspectiveWidget(table,title = \"Latest Connections\", 
group_by=[\"TIMESTAMP\"],plugin=\"Datagrid\",columns=[\"count_attack\"])" - ] + ], + "id": "b1e75945" }, { "cell_type": "code", "execution_count": 20, - "id": "e86ad97f-83e2-432c-ac9e-80e0cd1d95c1", "metadata": {}, "outputs": [], "source": [ "mode = 'stop'" - ] + ], + "id": "22207148" }, { + "id": "b8763cdf", "cell_type": "markdown", - "id": "d51d030e-8929-4cdf-a7f2-c74dd0635bd7", "metadata": {}, "source": [ "
    \n", diff --git a/notebooks/notebook-basics/notebook.ipynb b/notebooks/notebook-basics/notebook.ipynb index fee96e98..306e2acd 100644 --- a/notebooks/notebook-basics/notebook.ipynb +++ b/notebooks/notebook-basics/notebook.ipynb @@ -1,8 +1,8 @@ { "cells": [ { + "id": "dbd8a673", "cell_type": "markdown", - "id": "7e127046-57ff-4259-88eb-94596d9b4c6c", "metadata": {}, "source": [ "
    \n", @@ -17,8 +17,8 @@ ] }, { + "id": "0fa91f4b", "cell_type": "markdown", - "id": "eabd10b9-f442-49ce-bc41-5f6c1c9c4357", "metadata": {}, "source": [ "
    \n", @@ -32,7 +32,6 @@ }, { "cell_type": "markdown", - "id": "680677eb-d3b7-460a-8ac2-94e9a68c8f43", "metadata": {}, "source": [ "Prototyping applications or analyzing datasets using notebooks in SingleStoreDB Cloud follows the same general principles as developing with a Jupyter Notebook. SingleStoreDB Cloud supports internal and external datasources. Internal datasources are databases that exist within your workspace. An external datasource could be an AWS S3 bucket for example. In this Notebook we cover:\n", @@ -46,62 +45,62 @@ "7. Using Magic Commands\n", "\n", "*To learn more about working with SingleStoreDB notebooks check out our [docs](https://docs.singlestore.com/managed-service/en/developer-resources/notebooks.html)!*" - ] + ], + "id": "db0cac5d" }, { "cell_type": "markdown", - "id": "cb310396-93ad-4c04-b64d-9601b8a202bc", "metadata": {}, "source": [ "## 1. Connecting to SingleStoreDB\n", "\n", "Once you select a workspace, you can access all of the databases attached to that workspace. You cannot connect to databases that are not attached to the workspace you are using." - ] + ], + "id": "838c4af9" }, { "cell_type": "markdown", - "id": "8c502a67-9e8e-43b8-8bc6-41df4d5835da", "metadata": {}, "source": [ "First select a workspace and the `information_schema` database from the drop-down menu at the top of this notebook.\n", "\n", "" - ] + ], + "id": "c6c8e0cc" }, { "cell_type": "markdown", - "id": "886169b5-d60f-4669-9d34-20c14e9aba40", "metadata": {}, "source": [ "With the database selected, the `connection_url` variable in the Python enviroment is now updated with that information\n", "and we can use the `%%sql` magic command to query the selected database." 
- ] + ], + "id": "58c80276" }, { "cell_type": "code", "execution_count": 1, - "id": "146c9641-23ec-4570-8466-14d2880c66f0", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "SELECT * FROM users\n", " LIMIT 3;" - ] + ], + "id": "8e3ec961" }, { "cell_type": "markdown", - "id": "8cb9cb0f-c301-4cef-9f19-e86db0e52f73", "metadata": {}, "source": [ "When running SQL commands against a different database explicitly, you can specify the database in your\n", "SQL code with the `USE` command:" - ] + ], + "id": "48a4bc93" }, { "cell_type": "code", "execution_count": 2, - "id": "22b88c07-c956-4a77-944d-4aac485c1514", "metadata": {}, "outputs": [], "source": [ @@ -110,70 +109,70 @@ "\n", "SELECT * FROM users\n", " LIMIT 3;" - ] + ], + "id": "8963691b" }, { "cell_type": "markdown", - "id": "e8b06918-25d2-40e6-9ad9-3e8c558e89e9", "metadata": {}, "source": [ "Alternatively, you can specify the database prefix on the table in the query itself." - ] + ], + "id": "7ef362b7" }, { "cell_type": "code", "execution_count": 3, - "id": "8ab697a9-3b41-4f92-8b88-65717d7a4202", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "SELECT * FROM information_schema.users\n", " LIMIT 3;" - ] + ], + "id": "9ed182ae" }, { "cell_type": "markdown", - "id": "3aff8361-669b-474d-a45a-6345de985757", "metadata": {}, "source": [ "## Connecting with SQLAlchemy" - ] + ], + "id": "55dace8c" }, { "cell_type": "markdown", - "id": "c21cdbb8-c77e-4e31-a584-ff922620fb58", "metadata": {}, "source": [ "You can also connect to your SingleStoreDB datasource using Python and SQLAlchemy. As mentioned above,\n", "the `connection_url` variable is automatically populated by the notebook environment when selecting a\n", "database in the drop-down menu at the top of the notebook." 
- ] + ], + "id": "aa8da56d" }, { "cell_type": "code", "execution_count": 4, - "id": "3e2781f6-626d-4f0d-a5bb-828537c9e6e1", "metadata": {}, "outputs": [], "source": [ "import sqlalchemy as sa\n", "\n", "sa_conn = sa.create_engine(connection_url).connect()" - ] + ], + "id": "fe56aaa8" }, { "cell_type": "markdown", - "id": "1cae1a31-08c6-44b4-99c8-0d0a1b8b5ff8", "metadata": {}, "source": [ "You can also explicitly define a URL using the individual connection components." - ] + ], + "id": "c19e3f2a" }, { "cell_type": "code", "execution_count": 5, - "id": "93f26bcd-d07d-48a9-9f7a-edc2f9431c09", "metadata": {}, "outputs": [], "source": [ @@ -182,41 +181,41 @@ "connection_url2 = f\"singlestoredb://{connection_user}:{connection_password}@{connection_host}:{connection_port}/{database_name}\"\n", "\n", "url_conn = sa.create_engine(connection_url2).connect()" - ] + ], + "id": "d47acc43" }, { "cell_type": "markdown", - "id": "082e240d-9480-46a2-a7da-33508423b8e9", "metadata": {}, "source": [ "In addition, the SingleStoreDB Python package includes a wrapper `create_engine` function that\n", "uses the `SINGLESTOREDB_URL` without having to specify `connection_url`." - ] + ], + "id": "7d6b11a3" }, { "cell_type": "code", "execution_count": 6, - "id": "4ec8e9a0-b45a-4f6f-b3a5-7b51a5a89ed0", "metadata": {}, "outputs": [], "source": [ "import singlestoredb as s2\n", "\n", "conn = s2.create_engine().connect()" - ] + ], + "id": "66cd56df" }, { "cell_type": "markdown", - "id": "2dbc2854-2396-49e0-ae9f-5e68cc1e316c", "metadata": {}, "source": [ "Using `conn`, we can run our queries much like the `%%sql` command." 
- ] + ], + "id": "ddb8077e" }, { "cell_type": "code", "execution_count": 7, - "id": "cb22f3b0-547a-471b-80d3-213b38f41121", "metadata": {}, "outputs": [], "source": [ @@ -224,11 +223,11 @@ "\n", "for row in conn.execute(query1):\n", " print(row)" - ] + ], + "id": "e73dcf45" }, { "cell_type": "markdown", - "id": "1a15dcbc-4a03-49c2-ae18-130d97fb03e9", "metadata": {}, "source": [ "# 2. Connecting to an external datasource\n", @@ -247,64 +246,64 @@ "4. Select Save.\n", "\n", "" - ] + ], + "id": "275448eb" }, { "cell_type": "markdown", - "id": "3eed3310-621f-4c37-9db4-a992980a4f46", "metadata": {}, "source": [ "# 3. Using SQL\n", "The default language for SingleStoreDB Cloud notebooks is Python. However, the `%%sql` magic command can be used to\n", "submit SQL code for an entire cell." - ] + ], + "id": "6bdc2d33" }, { "cell_type": "code", "execution_count": 8, - "id": "d82fc6bf-b786-4956-a056-851e746f97b8", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "SELECT * FROM users\n", " LIMIT 3;" - ] + ], + "id": "2da087fa" }, { "cell_type": "markdown", - "id": "eb91c21c-1ce6-4e31-95d1-a981dea630c7", "metadata": {}, "source": [ "By default, the results are displayed as a table. We can also store the result in a variable for use later in the\n", "notebook. The following code includes the `result1 <<` syntax which indicates that the output of the SQL code\n", "should be stored in the `result` variable in the Python environment." - ] + ], + "id": "7288ccc8" }, { "cell_type": "code", "execution_count": 9, - "id": "0a6341cd-0328-4d8a-8158-72aff97b77de", "metadata": {}, "outputs": [], "source": [ "%%sql result1 <<\n", "SELECT * FROM users\n", " LIMIT 3;" - ] + ], + "id": "4cc6a000" }, { "cell_type": "markdown", - "id": "91cf2054-d223-4013-8867-2f4a9494978a", "metadata": {}, "source": [ "We now have access to the `result` variable and can convert it to a DataFrame!" 
- ] + ], + "id": "903cf8ba" }, { "cell_type": "code", "execution_count": 10, - "id": "5e436bc0-4843-4d0c-b64c-3470d963f29a", "metadata": {}, "outputs": [], "source": [ @@ -312,11 +311,11 @@ "\n", "df = pd.DataFrame(result1)\n", "df" - ] + ], + "id": "1db6bdc7" }, { "cell_type": "markdown", - "id": "360b2dc0-038e-4311-a5c3-b497b8feaf57", "metadata": {}, "source": [ "## 4. Using Python in a code cell\n", @@ -324,12 +323,12 @@ "By default, Python is the language for code cells. In the cell below, we are using a SQLAlchemy connection to execute\n", "the same query as the previous example. The result of this query can be converted into a DataFrame in the same manner\n", "as above" - ] + ], + "id": "490f1e09" }, { "cell_type": "code", "execution_count": 11, - "id": "e0085cca-2278-4904-94aa-4e46da840b66", "metadata": {}, "outputs": [], "source": [ @@ -337,11 +336,11 @@ "\n", "df = pd.DataFrame(result)\n", "df" - ] + ], + "id": "81e5c76f" }, { "cell_type": "markdown", - "id": "afb80434-583d-4171-a95b-694ed14bbd98", "metadata": {}, "source": [ "## 5. Using both SQL & Python in a code cell\n", @@ -349,12 +348,12 @@ "We can use a single line of SQL within a Python cell using a single `%sql` call. Below we combine SQL and\n", "Python in the same cell to capture the output in the `result` variable. We then convert it to a DataFrame\n", "as in previous examples." - ] + ], + "id": "916e3660" }, { "cell_type": "code", "execution_count": 12, - "id": "d79f9268-7c76-47cf-bee7-577ce07ae85d", "metadata": {}, "outputs": [], "source": [ @@ -362,67 +361,67 @@ "\n", "df = pd.DataFrame(result)\n", "df" - ] + ], + "id": "b62d2cf7" }, { "cell_type": "markdown", - "id": "2b9a3995-32df-4931-8aff-44bcd2db5908", "metadata": {}, "source": [ "## 6. Preinstalled libraries\n", "\n", "By default, a SingleStoreDB notebook has a large number of preinstalled libraries. Run the cell below to see what libraries are already installed!" 
- ] + ], + "id": "40b48421" }, { "cell_type": "code", "execution_count": 13, - "id": "abee048d-f18a-4a35-8eae-c8f92939230a", "metadata": {}, "outputs": [], "source": [ "!pip list" - ] + ], + "id": "b67782b8" }, { "cell_type": "markdown", - "id": "bbc061e3-acb3-40cc-be84-ada979aaa1a5", "metadata": {}, "source": [ "Our notebooks support libraries available from https://pypi.org/. For example, run the cell below to install the [Kaggle open dataset library](https://pypi.org/project/opendatasets/) to install the `opendatasets` package." - ] + ], + "id": "c91a8510" }, { "cell_type": "code", "execution_count": 14, - "id": "e17e1322-33df-4e2f-97fe-9815df235b40", "metadata": {}, "outputs": [], "source": [ "!pip3 install opendatasets" - ] + ], + "id": "8e6f739e" }, { "cell_type": "markdown", - "id": "9c6684da-af62-42bc-9481-b53c75f64b5e", "metadata": {}, "source": [ "You can even upgrade versions of a preinstalled library. Run the cell below to get the new version of Plotly." - ] + ], + "id": "cae98979" }, { "cell_type": "code", "execution_count": 15, - "id": "5a989a0f-6334-42d9-a75e-a04d09bccbec", "metadata": {}, "outputs": [], "source": [ "!pip3 install plotly --upgrade" - ] + ], + "id": "2c3430fc" }, { "cell_type": "markdown", - "id": "8d69cb4d-58ea-40ae-83db-03ff489d8676", "metadata": {}, "source": [ "## 7. Magic commands\n", @@ -432,29 +431,30 @@ "\n", "There are many other magic commands as well for everything from file system access to debugging your Python code.\n", "For information about teh full list of magic commands available, run the code cell below." 
- ] + ], + "id": "81bc2f91" }, { "cell_type": "code", "execution_count": 16, - "id": "b413bb30-0e9f-4484-8d3e-e7bc724a0c13", "metadata": {}, "outputs": [], "source": [ "%quickref" - ] + ], + "id": "eb96981b" }, { "cell_type": "markdown", - "id": "0ea02e78-b1e2-4cb4-a6d7-d813fdcb2759", "metadata": {}, "source": [ "**Learn more about SingleStoreDB notebooks [here](https://docs.singlestore.com/managed-service/en/developer-resources/notebooks.html) and get started with your first notebook!**" - ] + ], + "id": "02550a95" }, { + "id": "b71125bc", "cell_type": "markdown", - "id": "df3c9ee9-ac57-4e84-9201-df635ac7bd36", "metadata": {}, "source": [ "
    \n", diff --git a/notebooks/optimize-performance-with-tpch-100/notebook.ipynb b/notebooks/optimize-performance-with-tpch-100/notebook.ipynb index 18c8d9d7..a9a77c02 100644 --- a/notebooks/optimize-performance-with-tpch-100/notebook.ipynb +++ b/notebooks/optimize-performance-with-tpch-100/notebook.ipynb @@ -1,8 +1,8 @@ { "cells": [ { + "id": "fa1acf27", "cell_type": "markdown", - "id": "8e67bcbe-6ace-4ca9-b28c-927b4b5a85b2", "metadata": {}, "source": [ "
    \n", @@ -18,7 +18,6 @@ }, { "cell_type": "markdown", - "id": "5cf0f5ed", "metadata": {}, "source": [ "
    \n", @@ -28,12 +27,12 @@ "

    This tutorial is meant for Standard & Premium Workspaces. You can't run this with a Free Starter Workspace due to restrictions on Storage. Create a Workspace using +group in the left nav & select Standard for this notebook. Gallery notebooks tagged with \"Starter\" are suitable to run on a Free Starter Workspace

    \n", "
    \n", "
    " - ] + ], + "id": "3d9c24bf" }, { "attachments": {}, "cell_type": "markdown", - "id": "5d93af8b-eb1d-4207-a060-1a45c46d8b41", "metadata": {}, "source": [ "### Context\n", @@ -50,12 +49,12 @@ "They are highly efficient for exact-match lookups (point-reads). Because hash indexes store rows in a sparse array of buckets indexed through a hash function on the relevant columns, queries can quickly retrieve data by examining only the corresponding bucket rather than searching the entire dataset. This enables significant reduction in lookup time and hence, increased performance for specific query types.\n", "\n", "**For that tutorial, we recommend using a workspace of size S4 to ingest data faster and also see the difference and gain you can get from a distributed architecture.**" - ] + ], + "id": "daf1156b" }, { "attachments": {}, "cell_type": "markdown", - "id": "67f041ef-5605-43ef-8ca0-5db3194b4cad", "metadata": {}, "source": [ "
    \n", @@ -65,21 +64,21 @@ "

    For that tutorial, we recommend using workspace of size S4 to ingest data faster and also see the difference and gain you can get from a distributed architecture.

    \n", "
    \n", "
    " - ] + ], + "id": "ee4f6399" }, { "attachments": {}, "cell_type": "markdown", - "id": "6052728d-7828-4fb2-bb53-b960a7ad43af", "metadata": {}, "source": [ "### Let's first create the unoptimized database" - ] + ], + "id": "25446e47" }, { "cell_type": "code", "execution_count": 1, - "id": "7301a602-48cf-4f3b-9cbc-2e7184d97ae0", "metadata": {}, "outputs": [], "source": [ @@ -88,41 +87,41 @@ "\n", "# To create a database with custom partitions use the following syntax: CREATE DATABASE YourDatabaseName PARTITIONS=X;\n", "# You cannot change after creation the number of partitions" - ] + ], + "id": "4f043653" }, { "attachments": {}, "cell_type": "markdown", - "id": "94c8bb6f-658d-4434-9074-4847f1c7d721", "metadata": {}, "source": [ "If using a S00, the database will have 2 partitions, if using S1, it will have 8 partitions" - ] + ], + "id": "91210525" }, { "cell_type": "code", "execution_count": 2, - "id": "b36585b9-4d52-4301-ac68-60fa49425751", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "SELECT num_partitions FROM information_schema.DISTRIBUTED_DATABASES WHERE database_name = 's2_tpch_unoptimized';" - ] + ], + "id": "1b8b112f" }, { "attachments": {}, "cell_type": "markdown", - "id": "b576e31c-6a67-4126-86ab-480fd96805d3", "metadata": {}, "source": [ "##### Let's create all the tables in that database with no index, shard key or primary key" - ] + ], + "id": "18c35c9f" }, { "attachments": {}, "cell_type": "markdown", - "id": "4587c575-9b5a-4535-bebe-70779064e9dc", "metadata": {}, "source": [ "
    \n", @@ -133,12 +132,12 @@ " It updates the connection_url to connect to that database.

    \n", "
    \n", "
    " - ] + ], + "id": "c142577e" }, { "cell_type": "code", "execution_count": 3, - "id": "afde1362-2d38-4732-94ed-6d4ed05a6806", "metadata": {}, "outputs": [], "source": [ @@ -227,21 +226,21 @@ " `s_acctbal` decimal(15,2) NOT NULL,\n", " `s_comment` varchar(101) CHARACTER SET utf8 COLLATE utf8_bin NOT NULL\n", ");" - ] + ], + "id": "fb617b23" }, { "attachments": {}, "cell_type": "markdown", - "id": "09711e8c-fb01-4e10-862b-ee5350be6076", "metadata": {}, "source": [ "### Now let's create the pipelines and run them to ingest data" - ] + ], + "id": "27ea8e5d" }, { "cell_type": "code", "execution_count": 4, - "id": "4e8ca124-ac4b-49de-b0ee-d9441d43bedd", "metadata": {}, "outputs": [], "source": [ @@ -256,12 +255,12 @@ " INTO TABLE `customer`\n", " FIELDS TERMINATED BY '|' ENCLOSED BY '' ESCAPED BY '\\\\'\n", " LINES TERMINATED BY '|\\n' STARTING BY '';" - ] + ], + "id": "dfab1447" }, { "cell_type": "code", "execution_count": 5, - "id": "ce739af4-6839-4751-8a81-019fb26cad72", "metadata": {}, "outputs": [], "source": [ @@ -276,12 +275,12 @@ " INTO TABLE `lineitem`\n", " FIELDS TERMINATED BY '|' ENCLOSED BY '' ESCAPED BY '\\\\'\n", " LINES TERMINATED BY '|\\n' STARTING BY '';" - ] + ], + "id": "fd13762d" }, { "cell_type": "code", "execution_count": 6, - "id": "bfdd5bbc-702a-4f77-b771-cd38189040e0", "metadata": {}, "outputs": [], "source": [ @@ -296,12 +295,12 @@ " INTO TABLE `nation`\n", " FIELDS TERMINATED BY '|' ENCLOSED BY '' ESCAPED BY '\\\\'\n", " LINES TERMINATED BY '|\\n' STARTING BY '';" - ] + ], + "id": "a3304896" }, { "cell_type": "code", "execution_count": 7, - "id": "1b040083-f864-4e64-9bd2-00b1ff7d1e2b", "metadata": {}, "outputs": [], "source": [ @@ -316,12 +315,12 @@ " INTO TABLE `orders`\n", " FIELDS TERMINATED BY '|' ENCLOSED BY '' ESCAPED BY '\\\\'\n", " LINES TERMINATED BY '|\\n' STARTING BY '';" - ] + ], + "id": "c8588fdb" }, { "cell_type": "code", "execution_count": 8, - "id": "b5e06dfe-f679-4fe2-bd47-802a0b127270", "metadata": {}, "outputs": [], 
"source": [ @@ -336,12 +335,12 @@ " INTO TABLE `partsupp`\n", " FIELDS TERMINATED BY '|' ENCLOSED BY '' ESCAPED BY '\\\\'\n", " LINES TERMINATED BY '|\\n' STARTING BY '';" - ] + ], + "id": "9b99f619" }, { "cell_type": "code", "execution_count": 9, - "id": "8b2726ea-9d0c-4809-bf35-b851821cf336", "metadata": {}, "outputs": [], "source": [ @@ -356,12 +355,12 @@ " INTO TABLE `part`\n", " FIELDS TERMINATED BY '|' ENCLOSED BY '' ESCAPED BY '\\\\'\n", " LINES TERMINATED BY '|\\n' STARTING BY '';" - ] + ], + "id": "3d7854cf" }, { "cell_type": "code", "execution_count": 10, - "id": "06114a70-d7f9-4c1f-b6a6-4554b33bb5c6", "metadata": {}, "outputs": [], "source": [ @@ -376,12 +375,12 @@ " INTO TABLE `region`\n", " FIELDS TERMINATED BY '|' ENCLOSED BY '' ESCAPED BY '\\\\'\n", " LINES TERMINATED BY '|\\n' STARTING BY '';" - ] + ], + "id": "7b8e5f9f" }, { "cell_type": "code", "execution_count": 11, - "id": "bfc2e19d-f32c-4b9e-8fa7-1e68711f834e", "metadata": {}, "outputs": [], "source": [ @@ -396,12 +395,12 @@ " INTO TABLE `supplier`\n", " FIELDS TERMINATED BY '|' ENCLOSED BY '' ESCAPED BY '\\\\'\n", " LINES TERMINATED BY '|\\n' STARTING BY '';" - ] + ], + "id": "2d321f80" }, { "cell_type": "code", "execution_count": 12, - "id": "e19cb045-bebc-4aa6-92d7-ae06778d8af8", "metadata": {}, "outputs": [], "source": [ @@ -414,21 +413,21 @@ "START PIPELINE part_pipeline;\n", "START PIPELINE region_pipeline;\n", "START PIPELINE supplier_pipeline;" - ] + ], + "id": "263f100c" }, { "attachments": {}, "cell_type": "markdown", - "id": "3eacdd09-9b27-4995-a3df-9514cd733a57", "metadata": {}, "source": [ "#### [Optional Step] Check data ingestion in real-time with Perspective" - ] + ], + "id": "8bd4de36" }, { "cell_type": "code", "execution_count": 13, - "id": "b61f205f-5d1d-4af2-8369-e31057c76f66", "metadata": {}, "outputs": [], "source": [ @@ -440,12 +439,12 @@ "from datetime import datetime, date\n", "from perspective import Table, PerspectiveWidget\n", "warnings.filterwarnings('ignore')" - ] 
+ ], + "id": "3437aabc" }, { "cell_type": "code", "execution_count": 14, - "id": "ac376696-0b7e-4182-bc04-ddef400b7fca", "metadata": {}, "outputs": [], "source": [ @@ -454,12 +453,12 @@ " while mode == 'run':\n", " table.update(data_source())\n", " time.sleep(1)" - ] + ], + "id": "d475aba6" }, { "cell_type": "code", "execution_count": 15, - "id": "5e2ce253-576d-49c9-a92b-f165ffcc4ae7", "metadata": {}, "outputs": [], "source": [ @@ -471,53 +470,53 @@ "SCHEMA = {\n", " \"rows_streamed\": int\n", "}" - ] + ], + "id": "43b76b99" }, { "cell_type": "code", "execution_count": 16, - "id": "d388a00d-23ec-45d2-a94c-5e747da707c0", "metadata": {}, "outputs": [], "source": [ "mode = 'run'\n", "table = perspective.Table(SCHEMA, limit=100)\n", "threading.Thread(target=loop).start()" - ] + ], + "id": "acbae49c" }, { "cell_type": "code", "execution_count": 17, - "id": "9e88e523-63b1-48e3-bde5-a2840490199c", "metadata": {}, "outputs": [], "source": [ "perspective.PerspectiveWidget(table,title = \"Track Row Ingestion\",plugin=\"Y Line\",columns=[\"count_rows\"])" - ] + ], + "id": "5631365c" }, { "cell_type": "code", "execution_count": 18, - "id": "debbed63-11dc-43dc-9f12-a0e80a5b7703", "metadata": {}, "outputs": [], "source": [ "mode = 'stop'" - ] + ], + "id": "5408d2f6" }, { "attachments": {}, "cell_type": "markdown", - "id": "442281db-7fd2-4bba-9104-84f3b0537a9a", "metadata": {}, "source": [ "### Now, let's see the performance of a few queries" - ] + ], + "id": "b5d1dc34" }, { "cell_type": "code", "execution_count": 19, - "id": "97e9050b-3b8e-40e3-b871-2b0bb73eb5ae", "metadata": {}, "outputs": [], "source": [ @@ -537,12 +536,12 @@ "WHERE l_shipdate <= DATE('1998-12-01') - INTERVAL '90' DAY\n", "GROUP BY l_returnflag, l_linestatus\n", "ORDER BY l_returnflag, l_linestatus;" - ] + ], + "id": "f75b8c5a" }, { "cell_type": "code", "execution_count": 20, - "id": "71669c07-d06e-41f4-bd63-5046add22afb", "metadata": {}, "outputs": [], "source": [ @@ -563,12 +562,12 @@ " )\n", "GROUP BY 
o_orderpriority\n", "ORDER BY o_orderpriority;" - ] + ], + "id": "2b825665" }, { "cell_type": "code", "execution_count": 21, - "id": "654f872b-4a35-4897-86d2-c51b548919b8", "metadata": {}, "outputs": [], "source": [ @@ -614,32 +613,32 @@ " numwait DESC,\n", " s_name\n", "LIMIT 100;" - ] + ], + "id": "0117bfa6" }, { "attachments": {}, "cell_type": "markdown", - "id": "0ad2d768-cb53-4e0b-8353-6178c8f9508c", "metadata": {}, "source": [ "### Now, let's first focus on optimizing the performance" - ] + ], + "id": "b2c5b8bf" }, { "cell_type": "code", "execution_count": 22, - "id": "d1844926-ae73-4fec-b5ba-802c8173e846", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "CREATE DATABASE IF NOT EXISTS s2_tpch_optimized" - ] + ], + "id": "336cb7af" }, { "attachments": {}, "cell_type": "markdown", - "id": "a163696a-0507-4b05-9146-6cbfa1ba1e29", "metadata": {}, "source": [ "
    \n", @@ -650,23 +649,23 @@ " It updates the connection_url to connect to that database.

    \n", "
    \n", "
    " - ] + ], + "id": "b3f50fad" }, { "attachments": {}, "cell_type": "markdown", - "id": "bafd1114-f88c-409c-8281-b32ac27f1222", "metadata": {}, "source": [ "##### Now, let's create each table with optimized data structure:\n", "* We create a unique key through primary key. For example **lineitem** table needs both the orderkey and linenumber to identify rows by uniqueness\n", "* We create a shard key which will distribute data in an efficient way to perform fast join and filtering. For **lineitem** table since we perform joins and calculation based on the orderkey we create a shardkey with orderkey" - ] + ], + "id": "c4abd9bf" }, { "cell_type": "code", "execution_count": 23, - "id": "ff6aea3d-a965-475d-becd-3910965f5c8f", "metadata": {}, "outputs": [], "source": [ @@ -779,12 +778,12 @@ " SHARD KEY `__SHARDKEY` (`s_suppkey`),\n", " KEY `s_suppkey` (`s_suppkey`) USING CLUSTERED COLUMNSTORE\n", ");" - ] + ], + "id": "4e0e2718" }, { "cell_type": "code", "execution_count": 24, - "id": "6ed484a7-8ed8-479d-9ebe-5716749369bc", "metadata": {}, "outputs": [], "source": [ @@ -797,12 +796,12 @@ "INSERT INTO s2_tpch_optimized.partsupp SELECT * FROM s2_tpch_unoptimized.partsupp;\n", "INSERT INTO s2_tpch_optimized.region SELECT * FROM s2_tpch_unoptimized.region;\n", "INSERT INTO s2_tpch_optimized.supplier SELECT * FROM s2_tpch_unoptimized.supplier;" - ] + ], + "id": "aafed60f" }, { "cell_type": "code", "execution_count": 25, - "id": "9c79d8a5-c626-4f14-85e2-1f00afbceb8f", "metadata": {}, "outputs": [], "source": [ @@ -822,12 +821,12 @@ "WHERE l_shipdate <= DATE('1998-12-01') - INTERVAL '90' DAY\n", "GROUP BY l_returnflag, l_linestatus\n", "ORDER BY l_returnflag, l_linestatus;" - ] + ], + "id": "ad4ce108" }, { "cell_type": "code", "execution_count": 26, - "id": "e9b21e9b-1869-4a1e-8998-a209c3dc6ffd", "metadata": {}, "outputs": [], "source": [ @@ -848,12 +847,12 @@ " )\n", "GROUP BY o_orderpriority\n", "ORDER BY o_orderpriority;" - ] + ], + "id": "6cefe706" }, { "cell_type": 
"code", "execution_count": 27, - "id": "c8b63c2a-b1fb-4cd1-90bb-62e8b56922f5", "metadata": {}, "outputs": [], "source": [ @@ -899,21 +898,21 @@ " numwait desc,\n", " s_name\n", "LIMIT 100;" - ] + ], + "id": "66871245" }, { "attachments": {}, "cell_type": "markdown", - "id": "54a6902d-bc3b-455d-8f47-3450d5928de8", "metadata": {}, "source": [ "### Finally, let's do a side by side comparison between the optimized and unoptimized database" - ] + ], + "id": "e6116c9a" }, { "cell_type": "code", "execution_count": 28, - "id": "2f23e742-d9fe-467a-a299-a09a20d2af1e", "metadata": {}, "outputs": [], "source": [ @@ -922,21 +921,21 @@ "\n", "db_connection_unoptimized = create_engine(database='s2_tpch_unoptimized').connect()\n", "db_connection_optimized = create_engine(database='s2_tpch_optimized').connect()" - ] + ], + "id": "6d7f1df9" }, { "attachments": {}, "cell_type": "markdown", - "id": "7f84ed59-9d2c-4e3e-92eb-63603645d953", "metadata": {}, "source": [ "Here are a few queries that you can test side by side against. 
Overall you will notice an average of 4x improvement in performance" - ] + ], + "id": "8e18a35e" }, { "cell_type": "code", "execution_count": 29, - "id": "f3d86d62-ffe2-40ad-b59a-7c77cae98ca0", "metadata": {}, "outputs": [], "source": [ @@ -957,12 +956,12 @@ "GROUP BY o_orderpriority\n", "ORDER BY o_orderpriority;\n", "''')" - ] + ], + "id": "e706e8ed" }, { "cell_type": "code", "execution_count": 30, - "id": "7d947658-6dec-4efa-97f6-863977b2003b", "metadata": {}, "outputs": [], "source": [ @@ -1008,22 +1007,22 @@ " s_name\n", "LIMIT 100;\n", "''')" - ] + ], + "id": "1bc671a3" }, { "cell_type": "code", "execution_count": 31, - "id": "6e4da6ac-52ff-4506-8826-3a46bf350656", "metadata": {}, "outputs": [], "source": [ "result = db_connection_optimized.execute(sql_query21)" - ] + ], + "id": "d0e27e11" }, { "cell_type": "code", "execution_count": 32, - "id": "b699bc42-e9bd-4b8f-81c0-48311b7fd14e", "metadata": {}, "outputs": [], "source": [ @@ -1073,11 +1072,12 @@ "\n", "# Show the plot\n", "fig.show()" - ] + ], + "id": "34309401" }, { + "id": "a4d98441", "cell_type": "markdown", - "id": "4708ffe5-ea88-48a8-a4ac-9eaa5f801f79", "metadata": {}, "source": [ "
    \n", diff --git a/notebooks/performance-troubleshooting/notebook.ipynb b/notebooks/performance-troubleshooting/notebook.ipynb index b22e2e95..d6402374 100644 --- a/notebooks/performance-troubleshooting/notebook.ipynb +++ b/notebooks/performance-troubleshooting/notebook.ipynb @@ -1,8 +1,8 @@ { "cells": [ { + "id": "cefe3aab", "cell_type": "markdown", - "id": "f30fc302-c915-4c1f-94a2-0b237798086e", "metadata": {}, "source": [ "
    \n", @@ -25,7 +25,8 @@ " \n", " \n", "" - ] + ], + "id": "db5556b1" }, { "attachments": {}, @@ -66,7 +67,8 @@ "## Questions?\n", "\n", "Reach out to us through our [forum](https://www.singlestore.com/forum)." - ] + ], + "id": "82a212c2" }, { "attachments": {}, @@ -106,7 +108,8 @@ " QueryTxt\n", "\n", "" - ] + ], + "id": "91bea727" }, { "attachments": {}, @@ -114,7 +117,8 @@ "metadata": {}, "source": [ "### Install Libraries" - ] + ], + "id": "91dd88cd" }, { "cell_type": "code", @@ -123,7 +127,8 @@ "outputs": [], "source": [ "pip install openpyxl jsonpath_ng sql_metadata" - ] + ], + "id": "4bd011c8" }, { "attachments": {}, @@ -131,7 +136,8 @@ "metadata": {}, "source": [ "### Imports" - ] + ], + "id": "1dc32168" }, { "cell_type": "code", @@ -162,7 +168,8 @@ "from openpyxl import Workbook\n", "\n", "from IPython.display import display, HTML" - ] + ], + "id": "930821f1" }, { "attachments": {}, @@ -170,7 +177,8 @@ "metadata": {}, "source": [ "### Variables" - ] + ], + "id": "95b46f1a" }, { "cell_type": "code", @@ -235,7 +243,8 @@ "WHERE m2.plan_id = p2.plan_id\n", " AND m2.database_name = p2.database_name ;\n", "\"\"\"" - ] + ], + "id": "0d4f5ed2" }, { "attachments": {}, @@ -243,7 +252,8 @@ "metadata": {}, "source": [ "### Log Control" - ] + ], + "id": "ca1df734" }, { "cell_type": "code", @@ -266,7 +276,8 @@ " logging.getLogger().setLevel(logging.CRITICAL)\n", "\n", "set_logging_enabled(False)" - ] + ], + "id": "8d5dab52" }, { "attachments": {}, @@ -276,7 +287,8 @@ "**Note** To enable logs\n", "\n", " - Modify 'set_logging_enabled(False)' to 'set_logging_enabled(True)' in code below" - ] + ], + "id": "ef3f42af" }, { "attachments": {}, @@ -284,7 +296,8 @@ "metadata": {}, "source": [ "### Functions to display various alerts" - ] + ], + "id": "5cf480ad" }, { "cell_type": "code", @@ -344,7 +357,8 @@ "

    {success_msg}

    \n", "
    \n", "
    '''))" - ] + ], + "id": "53d4642c" }, { "attachments": {}, @@ -352,7 +366,8 @@ "metadata": {}, "source": [ "### Utility functions handling db connection and archiving" - ] + ], + "id": "b2fef9c1" }, { "cell_type": "code", @@ -411,7 +426,8 @@ " except Exception as e:\n", " logging.error(f'Failed to create archive: {e}')\n", " raise Exception(f'Failed to create archive: {e}')" - ] + ], + "id": "210c09c8" }, { "attachments": {}, @@ -419,7 +435,8 @@ "metadata": {}, "source": [ "### Utility functions handling HTML generation" - ] + ], + "id": "2eafb506" }, { "cell_type": "code", @@ -492,7 +509,8 @@ " {title}\n", " STAGE Link      {curr_file_path} \n", " \"\"\"" - ] + ], + "id": "d6e09de3" }, { "attachments": {}, @@ -500,7 +518,8 @@ "metadata": {}, "source": [ "### Function loading query data in CSV" - ] + ], + "id": "c7601752" }, { "cell_type": "code", @@ -552,7 +571,8 @@ " dtype={'QueryID': int, 'QueryName': str, 'QueryTxt': str, 'QueryParams': str})\n", " csv_df.sort_values(by=['QueryID'], inplace=True)\n", " return csv_df" - ] + ], + "id": "c1923b65" }, { "attachments": {}, @@ -560,7 +580,8 @@ "metadata": {}, "source": [ "### Verify Stage Path and Create if not exists" - ] + ], + "id": "2e96214a" }, { "cell_type": "code", @@ -595,7 +616,8 @@ " except Exception as stage_ex:\n", " logging.error(f'Stage Path Verification Failed. 
{stage_ex}')\n", " return False" - ] + ], + "id": "aacd2fcf" }, { "attachments": {}, @@ -603,7 +625,8 @@ "metadata": {}, "source": [ "## Functions to analyze data type mismatch" - ] + ], + "id": "eeb6ab7c" }, { "cell_type": "code", @@ -945,7 +968,8 @@ " except Exception as e:\n", " logging.error(f\"An error occurred during database record processing: {e}\")\n", " raise" - ] + ], + "id": "2e4fe2f0" }, { "attachments": {}, @@ -953,7 +977,8 @@ "metadata": {}, "source": [ "### Process dataframe and generate reports for each query in csv" - ] + ], + "id": "52f9ebbe" }, { "cell_type": "code", @@ -1021,7 +1046,8 @@ "\n", " logging.info('Result Pages are generated')\n", " logging.info(f'Excel Report perf_troubleshoot_report.xlsx is generated')" - ] + ], + "id": "dede33bd" }, { "attachments": {}, @@ -1029,7 +1055,8 @@ "metadata": {}, "source": [ "### Function to clean up generated directories" - ] + ], + "id": "acab5016" }, { "cell_type": "code", @@ -1061,7 +1088,8 @@ " error_msg = 'clean up failed'\n", " print(\"Error: %s : %s\" % (dir_path, e.strerror))\n", " raise Exception(f'Failed to clean up {str(e)}')" - ] + ], + "id": "90c91568" }, { "cell_type": "code", @@ -1146,7 +1174,8 @@ " show_error(error_msg)\n", "\n", " logging.info(f'Script execution completed sucessfully: {execution_success}')" - ] + ], + "id": "e988f5f1" }, { "attachments": {}, @@ -1157,11 +1186,12 @@ "\n", " - Actions suggested suit most of performance improvement scenarios, Still we would encourage to test and verify before applying on prod environemnts\n", " - To use notebook as scheduled one, we have to modify python code to refer configuration from table instead of user input" - ] + ], + "id": "93cf4edb" }, { + "id": "7c102a8b", "cell_type": "markdown", - "id": "37a3a54b-ea69-4827-bc8b-81cb767e6a84", "metadata": {}, "source": [ "
    \n", @@ -1189,5 +1219,5 @@ } }, "nbformat": 4, - "nbformat_minor": 4 + "nbformat_minor": 5 } diff --git a/notebooks/pipelines-query-tuning/notebook.ipynb b/notebooks/pipelines-query-tuning/notebook.ipynb index c85701d2..e309ead6 100644 --- a/notebooks/pipelines-query-tuning/notebook.ipynb +++ b/notebooks/pipelines-query-tuning/notebook.ipynb @@ -1,8 +1,8 @@ { "cells": [ { + "id": "b0bbc9ff", "cell_type": "markdown", - "id": "5278edf1-1abe-4d62-9623-445379f91ba3", "metadata": {}, "source": [ "
    \n", @@ -18,15 +18,14 @@ }, { "cell_type": "markdown", - "id": "d4c162a7-e101-475a-9e22-86d045c138ae", "metadata": {}, "source": [ "# Ingesting real time data from the International Space Station (ISS)" - ] + ], + "id": "6f9a4211" }, { "cell_type": "markdown", - "id": "1e182e1d-9395-4fb8-9e1b-62d6d4f91f58", "metadata": {}, "source": [ "### 1. Drop the database if it exists, create a new database, switch to it, and then create a table.\n", @@ -36,30 +35,30 @@ "\n", "### Example:\n", "If your email address is lorrin.smith-bates@singlestore.com you would use **lorrin_smith_bates_singlestore_com**" - ] + ], + "id": "135d167a" }, { "cell_type": "code", "execution_count": 1, - "id": "854b5ad5-f2f7-434d-9d17-ea916516327f", "metadata": {}, "outputs": [], "source": [ "email_address = \"<< enter your email address >>\"" - ] + ], + "id": "71853e30" }, { "cell_type": "markdown", - "id": "f55ddc3c-00ec-4b28-8df6-edfbf0d66aba", "metadata": {}, "source": [ "Remove characters that can't be used in a database name." - ] + ], + "id": "a8482c67" }, { "cell_type": "code", "execution_count": 2, - "id": "3e6687cf-414f-490d-89fa-5c0f13814393", "metadata": {}, "outputs": [], "source": [ @@ -67,12 +66,12 @@ "\n", "modified_email_address = re.sub(r'[^A-Za-z0-9]', '_', email_address)\n", "modified_email_address" - ] + ], + "id": "2dd6f0bd" }, { "cell_type": "code", "execution_count": 3, - "id": "b3daf7fc-c60e-4c3b-a811-3338b94015ca", "metadata": {}, "outputs": [], "source": [ @@ -95,20 +94,20 @@ " units varchar(20),\n", " url varchar(255)\n", ");" - ] + ], + "id": "cc0e3e95" }, { "cell_type": "markdown", - "id": "c4a85e23-6309-4e55-a2d7-39ab529f28a2", "metadata": {}, "source": [ "### 2. Create a SingleStore pipeline to ingest ISS data from a Kafka topic." 
- ] + ], + "id": "04e2160b" }, { "cell_type": "code", "execution_count": 4, - "id": "6af2f20a-8d80-4ff4-b56a-07db593157a5", "metadata": {}, "outputs": [], "source": [ @@ -118,80 +117,80 @@ " LOAD DATA kafka '100.25.125.23/iss'\n", " INTO TABLE iss_location\n", " FORMAT JSON;" - ] + ], + "id": "2de706be" }, { "cell_type": "markdown", - "id": "5d7745a0-6649-445d-b1cf-7068cad7d74c", "metadata": {}, "source": [ "### 3. Test the pipeline." - ] + ], + "id": "d6fa41c4" }, { "cell_type": "code", "execution_count": 5, - "id": "49316e8f-03b1-4dcc-a747-170135ccf7a0", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "\n", "TEST PIPELINE iss_pipeline;" - ] + ], + "id": "922d9ede" }, { "cell_type": "markdown", - "id": "ca166af0-6440-4869-9ce0-6e11caba41e5", "metadata": {}, "source": [ "### 4. Start the Pipeline" - ] + ], + "id": "6ae60777" }, { "cell_type": "code", "execution_count": 6, - "id": "6f36c4ff-a153-4acf-ba20-0b28a0ba75e4", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "\n", "START PIPELINE iss_pipeline;" - ] + ], + "id": "637f2ab4" }, { "cell_type": "markdown", - "id": "908c7e4c-fa9c-42b1-a164-6f410efb4f05", "metadata": {}, "source": [ "### 5. Get the count of records. Run this a few times to see the number of records ingested." - ] + ], + "id": "a8c81433" }, { "cell_type": "code", "execution_count": 7, - "id": "e97f6065-9f42-4a32-9a15-fa06c4a1dda7", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "\n", "SELECT COUNT(*) FROM iss_location;" - ] + ], + "id": "481a5a84" }, { "cell_type": "markdown", - "id": "06769299-1807-42e0-9a5e-f71ea5bf8dfe", "metadata": {}, "source": [ "### 6. Get the latest location record. Click the link to see the location of the ISS in Google Maps." 
- ] + ], + "id": "8a7d1af7" }, { "cell_type": "code", "execution_count": 8, - "id": "46a63ec2-bb8b-4887-b41a-42612e64c756", "metadata": {}, "outputs": [], "source": [ @@ -201,20 +200,20 @@ " FROM iss_location\n", " ORDER BY timestamp desc\n", " LIMIT 1;" - ] + ], + "id": "79755dda" }, { "cell_type": "markdown", - "id": "f547c895-5a4a-40c6-9284-14763ac2bea2", "metadata": {}, "source": [ "### 7. Stop the pipeline and delete the data from the iss_location table." - ] + ], + "id": "fb11a6dd" }, { "cell_type": "code", "execution_count": 9, - "id": "bc4a122a-e86e-405d-a0ed-fb82dcc21816", "metadata": {}, "outputs": [], "source": [ @@ -222,20 +221,20 @@ "\n", "STOP PIPELINE iss_pipeline;\n", "DELETE FROM iss_location;" - ] + ], + "id": "9aae9731" }, { "cell_type": "markdown", - "id": "ccaef975-6a97-42dc-8112-64ae05bfbe70", "metadata": {}, "source": [ "### 8. Change the pipeline offsets and interval." - ] + ], + "id": "b30644b4" }, { "cell_type": "code", "execution_count": 10, - "id": "7efb3adb-bc12-4ca4-8eae-28703020af18", "metadata": {}, "outputs": [], "source": [ @@ -244,90 +243,90 @@ "ALTER PIPELINE iss_pipeline\n", " SET BATCH_INTERVAL 30000\n", " SET OFFSETS latest ;" - ] + ], + "id": "02578bc5" }, { "cell_type": "markdown", - "id": "814bff09-f60e-4c04-9408-9dd65e893c96", "metadata": {}, "source": [ "### 9. Start the Pipeline again." - ] + ], + "id": "d4f4d033" }, { "cell_type": "code", "execution_count": 11, - "id": "94e46f4b-597e-4c71-b329-c4a3c3b8fff9", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "\n", "START PIPELINE iss_pipeline;" - ] + ], + "id": "80b83394" }, { "cell_type": "markdown", - "id": "a4a6f369-8ee8-479d-8014-5d528d449e13", "metadata": {}, "source": [ "### 10. Count the records, notice how the records are populated now after alterning the pipeline." 
- ] + ], + "id": "c1e18d41" }, { "cell_type": "code", "execution_count": 12, - "id": "196f0285-0c8f-4e71-8a7d-8237fcdc891e", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "\n", "SELECT COUNT(*) from iss_location;" - ] + ], + "id": "9cd65706" }, { "cell_type": "markdown", - "id": "a186addf-29fe-4311-b67d-03696ef4bc62", "metadata": {}, "source": [ "### 11. Stop the pipeline" - ] + ], + "id": "20661dec" }, { "cell_type": "code", "execution_count": 13, - "id": "ddf2a7d5-050a-4146-b965-68a2aa6ce92c", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "\n", "STOP PIPELINE iss_pipeline;" - ] + ], + "id": "77992744" }, { "cell_type": "markdown", - "id": "b2e91faf-99e8-4753-9e6d-08e3d703fce3", "metadata": {}, "source": [ "# Query Optimization" - ] + ], + "id": "8b257a40" }, { "cell_type": "markdown", - "id": "df7cd5f1-20a5-4f6e-8d44-c47def53dfb2", "metadata": {}, "source": [ "### 1. Restore the 'employees' database that has been backed up into a public S3 bucket\n", "\n", "For the database name we'll prepend employees_ to the modified email address again." - ] + ], + "id": "8b903e94" }, { "cell_type": "code", "execution_count": 14, - "id": "cb27d520-2141-450d-b0cf-925c12c32f3c", "metadata": {}, "outputs": [], "source": [ @@ -336,39 +335,39 @@ " FROM S3 'train.memsql.com/employee'\n", " CONFIG'{\"region\":\"us-east-1\"}'\n", " CREDENTIALS'{}';" - ] + ], + "id": "fca91b00" }, { "cell_type": "markdown", - "id": "ba2c8fc3-44c4-4d73-bf36-b9b1aab1b74d", "metadata": {}, "source": [ "### 2. Switch to the Employees database" - ] + ], + "id": "daa59705" }, { "cell_type": "code", "execution_count": 15, - "id": "bafa5dbc-f149-4465-ab16-83339e515abb", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "USE employees_{{ modified_email_address }};" - ] + ], + "id": "30c7c835" }, { "cell_type": "markdown", - "id": "fbbba65d-5473-4b86-8f8e-ca6835ac1b77", "metadata": {}, "source": [ "### 3. 
Run a query that joins 4 tables and orders by 4 columns in 3 tables" - ] + ], + "id": "2be1611e" }, { "cell_type": "code", "execution_count": 16, - "id": "41fb7c5b-e007-4d2c-a271-f91bc40a8405", "metadata": {}, "outputs": [], "source": [ @@ -381,20 +380,20 @@ " INNER JOIN titles t ON e.emp_no=t.emp_no\n", " ORDER BY e.first_name, e.last_name, d.dept_name, t.from_date\n", " LIMIT 10;" - ] + ], + "id": "82c5e283" }, { "cell_type": "markdown", - "id": "77c58db6-8dab-4de8-8834-a4ca4e508c1e", "metadata": {}, "source": [ "### 4. Examine the query execution profile using EXPLAIN" - ] + ], + "id": "0d818a3b" }, { "cell_type": "code", "execution_count": 17, - "id": "d58506b8-f97f-462c-a0e5-a15a93c8a7a5", "metadata": {}, "outputs": [], "source": [ @@ -407,20 +406,20 @@ " INNER JOIN titles t ON e.emp_no=t.emp_no\n", " ORDER BY e.first_name, e.last_name, d.dept_name, t.from_date\n", " LIMIT 10;" - ] + ], + "id": "09f6dac9" }, { "cell_type": "markdown", - "id": "00b95e97-87f7-456f-b3dc-c6e70c014a40", "metadata": {}, "source": [ "### 5. Profile the query by using PROFILE." - ] + ], + "id": "c2ac5b2f" }, { "cell_type": "code", "execution_count": 18, - "id": "0c35c2fd-67b2-4ddd-be8f-4e25f0e851b0", "metadata": {}, "outputs": [], "source": [ @@ -432,38 +431,38 @@ " INNER JOIN titles t ON e.emp_no=t.emp_no\n", " ORDER BY e.first_name, e.last_name, d.dept_name, t.from_date\n", " LIMIT 10;" - ] + ], + "id": "ce2f483b" }, { "cell_type": "markdown", - "id": "abbbd5b9-d9c2-4a54-b391-fb3a2e9ba0c1", "metadata": {}, "source": [ "### 6. Run SHOW PROFILE to view the statistics on an actual run of the query" - ] + ], + "id": "5c1a31ee" }, { "cell_type": "code", "execution_count": 19, - "id": "a45226d3-4a3e-442e-9800-90654ef8d045", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "SHOW PROFILE;" - ] + ], + "id": "d79b8588" }, { "cell_type": "markdown", - "id": "7c2a606c-15b3-4c67-97b6-e92ed00108fb", "metadata": {}, "source": [ "### 7. 
Run Visual Profile to see this the profile in a GUI format" - ] + ], + "id": "51f45ac9" }, { "cell_type": "markdown", - "id": "3e9ab748-6b05-40e8-9563-93775d089625", "metadata": {}, "source": [ "## Query/Schema Tuning Exercise\n", @@ -471,12 +470,12 @@ "Now that we've visualized our query execution plan, let's address some of the issues we've uncovered.\n", "\n", "### 1. Create a Reference table for departments" - ] + ], + "id": "171437e1" }, { "cell_type": "code", "execution_count": 20, - "id": "60599ab6-5c8e-4a49-a2cd-a4b201ff2f84", "metadata": {}, "outputs": [], "source": [ @@ -489,20 +488,20 @@ ");\n", "\n", "INSERT INTO departments_ref (SELECT * FROM departments);" - ] + ], + "id": "f56d36a3" }, { "cell_type": "markdown", - "id": "bf12d27f-a303-4c57-9b33-1d1a3280bfef", "metadata": {}, "source": [ "### 2. Profile the old and the new" - ] + ], + "id": "b5c9ec5a" }, { "cell_type": "code", "execution_count": 21, - "id": "fdba3687-6430-4c30-8b50-177c528e54be", "metadata": {}, "outputs": [], "source": [ @@ -526,20 +525,20 @@ " LIMIT 10;\n", "\n", "-- PROFILE them both and observe the differences." - ] + ], + "id": "14e927e8" }, { "cell_type": "markdown", - "id": "e569d4cc-80b7-4da0-9022-c9bdddac6e01", "metadata": {}, "source": [ "### 3. Create a titles table with sort and shard keys defined." - ] + ], + "id": "29b4ffb7" }, { "cell_type": "code", "execution_count": 22, - "id": "8494132a-38cd-40e7-bcff-3d6a32eb25fb", "metadata": {}, "outputs": [], "source": [ @@ -554,20 +553,20 @@ ");\n", "\n", "INSERT INTO titles_sharded (SELECT * FROM titles);" - ] + ], + "id": "bd8c4a25" }, { "cell_type": "markdown", - "id": "aca3590d-4144-4d0d-b2a6-e24ee21cb573", "metadata": {}, "source": [ "### 4. 
Add shard and sort keys to the dept_emp table" - ] + ], + "id": "72575e7e" }, { "cell_type": "code", "execution_count": 23, - "id": "40327756-cb80-444c-8745-f0646385d8da", "metadata": {}, "outputs": [], "source": [ @@ -583,12 +582,12 @@ ");\n", "\n", "INSERT INTO dept_emp_sharded (SELECT * FROM dept_emp);" - ] + ], + "id": "95810fd4" }, { "cell_type": "code", "execution_count": 24, - "id": "7066ff30-e4f1-4105-aaa4-0369af08658d", "metadata": {}, "outputs": [], "source": [ @@ -600,20 +599,20 @@ " INNER JOIN titles_sharded t ON e.emp_no=t.emp_no\n", " ORDER BY e.first_name, e.last_name, d.dept_name, t.from_date\n", " LIMIT 10;" - ] + ], + "id": "e1e7626a" }, { "cell_type": "markdown", - "id": "884c0aeb-ba6a-4e30-bd87-1a23f23ec451", "metadata": {}, "source": [ "### 5. Add shard and sort keys to the employees table" - ] + ], + "id": "8195fc4b" }, { "cell_type": "code", "execution_count": 25, - "id": "9b05ca85-78f7-4a4b-b4d9-779af58d8501", "metadata": {}, "outputs": [], "source": [ @@ -629,12 +628,12 @@ ");\n", "\n", "INSERT INTO employees_sharded (SELECT * FROM employees);" - ] + ], + "id": "5ac1403e" }, { "cell_type": "code", "execution_count": 26, - "id": "431ee5a9-9953-4891-9db5-effd0ec04320", "metadata": {}, "outputs": [], "source": [ @@ -646,11 +645,12 @@ " INNER JOIN titles_sharded t ON e.emp_no=t.emp_no\n", " ORDER BY e.first_name, e.last_name, d.dept_name, t.from_date\n", " LIMIT 10;" - ] + ], + "id": "26f28baa" }, { + "id": "4f6aa82b", "cell_type": "markdown", - "id": "0485bc25-71c5-45b0-93b3-3d47e9501be3", "metadata": {}, "source": [ "
    \n", diff --git a/notebooks/rag-with-bedrock/notebook.ipynb b/notebooks/rag-with-bedrock/notebook.ipynb index 78bca7f9..d3699e18 100644 --- a/notebooks/rag-with-bedrock/notebook.ipynb +++ b/notebooks/rag-with-bedrock/notebook.ipynb @@ -1,8 +1,8 @@ { "cells": [ { + "id": "a905c745", "cell_type": "markdown", - "id": "4b7b2c4c-98c4-45c6-b2d0-b024df167636", "metadata": {}, "source": [ "
    \n", @@ -17,8 +17,8 @@ ] }, { + "id": "93cf31c0", "cell_type": "markdown", - "id": "5e4aa3cf-f650-461a-aab0-f5b4e10af9ce", "metadata": {}, "source": [ "
    \n", @@ -71,7 +71,8 @@ "- Add those chunks as part of the context in the prompt\n", "- Send the prompt to the model under Amazon Bedrock\n", "- Get the contextual answer based on the documents retrieved" - ] + ], + "id": "4fef4efb" }, { "attachments": {}, @@ -81,7 +82,8 @@ "## Usecase\n", "#### Dataset\n", "In this example, you will use several years of Amazon's Letter to Shareholders as a text corpus to perform Q&A on." - ] + ], + "id": "e129f74b" }, { "attachments": {}, @@ -111,7 +113,8 @@ "Note: It is possible to choose other models available with Bedrock. You can replace the `model_id` as follows to change the model.\n", "\n", "`llm = Bedrock(model_id=\"...\")`" - ] + ], + "id": "e74e67af" }, { "cell_type": "code", @@ -120,7 +123,8 @@ "outputs": [], "source": [ "!pip install boto3==1.34.74 langchain==0.1.14 pypdf==4.1.0 tiktoken==0.6.0 SQLAlchemy==2.0.29 --quiet" - ] + ], + "id": "506362db" }, { "cell_type": "code", @@ -133,7 +137,8 @@ "os.environ['AWS_DEFAULT_REGION']='us-east-1'\n", "os.environ['AWS_ACCESS_KEY_ID']= getpass.getpass(\"AWS_ACCESS_KEY_ID: \")\n", "os.environ['AWS_SECRET_ACCESS_KEY']=getpass.getpass(\"AWS_SECRET_ACCESS_KEY: \")" - ] + ], + "id": "f72f65ba" }, { "cell_type": "code", @@ -144,7 +149,8 @@ "import boto3\n", "import json\n", "import sys" - ] + ], + "id": "a2aeedff" }, { "cell_type": "code", @@ -169,7 +175,8 @@ " sys.stdout = _stdout\n", " for line in output.splitlines():\n", " print(\"\\n\".join(textwrap.wrap(line, width=width)))" - ] + ], + "id": "80bd6f48" }, { "cell_type": "code", @@ -179,7 +186,8 @@ "source": [ "session = boto3.session.Session()\n", "bedrock_client = session.client('bedrock')" - ] + ], + "id": "2a5b203f" }, { "attachments": {}, @@ -189,7 +197,8 @@ "### Setup langchain\n", "\n", "We create an instance of the Bedrock classes for the LLM and the embedding models. At the time of writing, Bedrock supports one embedding model and therefore we do not need to specify any model id. 
To be able to compare token consumption across the different RAG-approaches shown in the workshop labs we use langchain callbacks to count token consumption." - ] + ], + "id": "417ea88b" }, { "cell_type": "code", @@ -214,7 +223,8 @@ "# - create the Titan Embeddings Model\n", "bedrock_embeddings = BedrockEmbeddings(model_id=\"amazon.titan-embed-text-v1\",\n", " client=bedrock_runtime_client)" - ] + ], + "id": "7d4dd00c" }, { "attachments": {}, @@ -225,7 +235,8 @@ "Let's first download some of the files to build our document store.\n", "\n", "In this example, you will use several years of Amazon's Letter to Shareholders as a text corpus to perform Q&A on." - ] + ], + "id": "932646b5" }, { "cell_type": "markdown", @@ -238,7 +249,8 @@ "

    To get the external files, please add s2.q4cdn.com to the notebook Firewall.

    \n", "
    \n", "
    " - ] + ], + "id": "780af600" }, { "cell_type": "code", @@ -274,7 +286,8 @@ "for idx, url in enumerate(urls):\n", " file_path = data_root + filenames[idx]\n", " urlretrieve(url, file_path)" - ] + ], + "id": "7a6eac28" }, { "attachments": {}, @@ -282,7 +295,8 @@ "metadata": {}, "source": [ "As part of Amazon's culture, the CEO always includes a copy of the 1997 Letter to Shareholders with every new release. This will cause repetition, take longer to generate embeddings, and may skew your results. In the next section you will take the downloaded data, trim the 1997 letter (last 3 pages) and overwrite them as processed files." - ] + ], + "id": "ea77a711" }, { "cell_type": "code", @@ -306,7 +320,8 @@ " new_file.seek(0)\n", " pdf_writer.write(new_file)\n", " new_file.truncate()" - ] + ], + "id": "2188b052" }, { "attachments": {}, @@ -316,7 +331,8 @@ "After downloading we can load the documents with the help of [DirectoryLoader from PyPDF available under LangChain](https://python.langchain.com/en/latest/reference/modules/document_loaders.html) and splitting them into smaller chunks.\n", "\n", "Note: The retrieved document/text should be large enough to contain enough information to answer a question; but small enough to fit into the LLM prompt. Also the embeddings model has a limit of the length of input tokens limited to 512 tokens, which roughly translates to ~2000 characters. For the sake of this use-case we are creating chunks of roughly 1000 characters with an overlap of 100 characters using [RecursiveCharacterTextSplitter](https://python.langchain.com/en/latest/modules/indexes/text_splitters/examples/recursive_text_splitter.html)." 
- ] + ], + "id": "8af3a057" }, { "cell_type": "code", @@ -347,7 +363,8 @@ ")\n", "\n", "docs = text_splitter.split_documents(documents)" - ] + ], + "id": "633e3121" }, { "attachments": {}, @@ -355,7 +372,8 @@ "metadata": {}, "source": [ "Before we are proceeding we are looking into some interesting statistics regarding the document preprocessing we just performed:" - ] + ], + "id": "802cb756" }, { "cell_type": "code", @@ -367,7 +385,8 @@ "print(f'Average length among {len(documents)} documents loaded is {avg_doc_length(documents)} characters.')\n", "print(f'After the split we have {len(docs)} documents as opposed to the original {len(documents)}.')\n", "print(f'Average length among {len(docs)} documents (after split) is {avg_doc_length(docs)} characters.')" - ] + ], + "id": "a0c3b9eb" }, { "attachments": {}, @@ -377,7 +396,8 @@ "We had 4 PDF documents which have been split into smaller chunks.\n", "\n", "Now we can see how a sample embedding would look like for one of those chunks." - ] + ], + "id": "13839639" }, { "cell_type": "code", @@ -388,7 +408,8 @@ "sample_embedding = np.array(bedrock_embeddings.embed_query(docs[0].page_content))\n", "print(\"Sample embedding of a document chunk: \", sample_embedding)\n", "print(\"Size of the embedding: \", sample_embedding.shape)" - ] + ], + "id": "b564f5e5" }, { "attachments": {}, @@ -400,7 +421,8 @@ "This can be easily done using SingleStoreDB implementation inside [LangChain](https://python.langchain.com/docs/integrations/vectorstores/singlestoredb) which takes input the embeddings model and the documents to create the entire vector store.\n", "\n", "**\u26a0\ufe0f\u26a0\ufe0f\u26a0\ufe0f NOTE: it might take few minutes to run the following cell \u26a0\ufe0f\u26a0\ufe0f\u26a0\ufe0f**" - ] + ], + "id": "debae495" }, { "cell_type": "code", @@ -415,7 +437,8 @@ " bedrock_embeddings,\n", " table_name = \"amazon_data\"\n", ")" - ] + ], + "id": "9d355119" }, { "attachments": {}, @@ -425,7 +448,8 @@ "### Similarity Search\n", 
"\n", "Here you will set your search query, and look for documents that match." - ] + ], + "id": "51c83c1a" }, { "cell_type": "code", @@ -434,7 +458,8 @@ "outputs": [], "source": [ "query = \"How has AWS evolved?\"" - ] + ], + "id": "54dff431" }, { "attachments": {}, @@ -442,7 +467,8 @@ "metadata": {}, "source": [ "The first step would be to create an embedding of the query such that it could be compared with the documents" - ] + ], + "id": "37ab29b7" }, { "cell_type": "code", @@ -452,7 +478,8 @@ "source": [ "query_embedding = bedrock_embeddings.embed_query(\"This is a content of the document\")\n", "np.array(query_embedding)" - ] + ], + "id": "826e4c8e" }, { "attachments": {}, @@ -462,7 +489,8 @@ "#### Basic Similarity Search\n", "\n", "The results that come back from the `similarity_search_with_score` API are sorted by score from highest to lowest. The score value is represented by Dot product. Higher scores are better, for normalized vector embeddings this would approch 1." - ] + ], + "id": "79555029" }, { "cell_type": "code", @@ -473,7 +501,8 @@ "results_with_scores = db.similarity_search_with_score(query)\n", "for doc, score in results_with_scores:\n", " print(f\"Content: {doc.page_content}\\nMetadata: {doc.metadata}\\nScore: {score}\\n\\n\")" - ] + ], + "id": "f7b44474" }, { "attachments": {}, @@ -482,7 +511,8 @@ "source": [ "#### Similarity Search with Metadata Filtering\n", "Additionally, you can provide metadata to your query to filter the scope of your results. The `filter` parameter for search queries is a dictionary of metadata key/value pairs that will be matched to results to include/exclude them from your query." 
- ] + ], + "id": "c27154e0" }, { "cell_type": "code", @@ -491,7 +521,8 @@ "outputs": [], "source": [ "filter = dict(year=2022)" - ] + ], + "id": "48a2ec88" }, { "attachments": {}, @@ -499,7 +530,8 @@ "metadata": {}, "source": [ "In the next section, you will notice that your query has returned less results than the basic search, because of your filter criteria on the resultset." - ] + ], + "id": "31536458" }, { "cell_type": "code", @@ -510,7 +542,8 @@ "results_with_scores = db.similarity_search_with_score(query, filter=filter)\n", "for doc, score in results_with_scores:\n", " print(f\"Content: {doc.page_content}\\nMetadata: {doc.metadata}, Score: {score}\\n\\n\")" - ] + ], + "id": "6267089a" }, { "attachments": {}, @@ -529,7 +562,8 @@ "\n", "* `k`, the max number of results to return at the end of our query\n", "* `fetch_k`, the max number of results to return from the similarity search before applying filters" - ] + ], + "id": "f4b50eb1" }, { "cell_type": "code", @@ -540,7 +574,8 @@ "results = db.similarity_search(query, filter=filter, k=2, fetch_k=4)\n", "for doc in results:\n", " print(f\"Content: {doc.page_content}\\nMetadata: {doc.metadata}\\n\\n\")" - ] + ], + "id": "772925b7" }, { "attachments": {}, @@ -552,7 +587,8 @@ "We will take our inital prompt, together with our relevant documents which were retreived based on the results of our similarity search. We then by combining these create a prompt that we feed back to the model to get our result. At this point our model should give us highly informed information on how we can change the tire of our specific car as it was outlined in our manual.\n", "\n", "LangChain provides an abstraction of how this can be done easily." - ] + ], + "id": "1053ef24" }, { "attachments": {}, @@ -563,7 +599,8 @@ "In the above scenario you explored the quick and easy way to get a context-aware answer to your question. 
Now let's have a look at a more customizable option with the help of [RetrievalQA](https://python.langchain.com/en/latest/modules/chains/index_examples/vector_db_qa.html) where you can customize how the documents fetched should be added to prompt using `chain_type` parameter. Also, if you want to control how many relevant documents should be retrieved then change the `k` parameter in the cell below to see different outputs. In many scenarios you might want to know which were the source documents that the LLM used to generate the answer, you can get those documents in the output using `return_source_documents` which returns the documents that are added to the context of the LLM prompt. `RetrievalQA` also allows you to provide a custom [prompt template](https://python.langchain.com/en/latest/modules/prompts/prompt_templates/getting_started.html) which can be specific to the model.\n", "\n", "Note: In this example we are using Anthropic Claude as the LLM under Amazon Bedrock, this particular model performs best if the inputs are provided under `Human:` and the model is requested to generate an output after `Assistant:`. In the cell below you see an example of how to control the prompt such that the LLM stays grounded and doesn't answer outside the context." 
- ] + ], + "id": "8f0fe150" }, { "cell_type": "code", @@ -597,7 +634,8 @@ " chain_type_kwargs={\"prompt\": PROMPT},\n", " callbacks=[StreamingStdOutCallbackHandler()]\n", ")" - ] + ], + "id": "d6f1cc9a" }, { "cell_type": "code", @@ -613,7 +651,8 @@ "print(f'\\nContext Documents: ')\n", "for srcdoc in result[\"source_documents\"]:\n", " print_ww(f'{srcdoc}\\n')" - ] + ], + "id": "866823ac" }, { "cell_type": "code", @@ -629,7 +668,8 @@ "print(f'\\nContext Documents: ')\n", "for srcdoc in result[\"source_documents\"]:\n", " print_ww(f'{srcdoc}\\n')" - ] + ], + "id": "b9097468" }, { "cell_type": "code", @@ -645,7 +685,8 @@ "print(f'\\nContext Documents: ')\n", "for srcdoc in result[\"source_documents\"]:\n", " print_ww(f'{srcdoc}\\n')" - ] + ], + "id": "857be677" }, { "cell_type": "code", @@ -661,7 +702,8 @@ "print(f'\\nContext Documents: ')\n", "for srcdoc in result[\"source_documents\"]:\n", " print_ww(f'{srcdoc}\\n')" - ] + ], + "id": "dca1da66" }, { "attachments": {}, @@ -670,7 +712,8 @@ "source": [ "## Clean up\n", "Clear the downloaded PDFs and the `amazon_data` table" - ] + ], + "id": "eb36b686" }, { "cell_type": "code", @@ -679,7 +722,8 @@ "outputs": [], "source": [ "!rm -rf ./RAG_Bedrock_data" - ] + ], + "id": "13911724" }, { "cell_type": "code", @@ -689,7 +733,8 @@ "source": [ "%%sql\n", "DROP TABLE IF EXISTS amazon_data" - ] + ], + "id": "93a0f084" }, { "attachments": {}, @@ -712,11 +757,12 @@ "- Integration with enterprise data stores\n", "\n", "# Thank You" - ] + ], + "id": "2cc16899" }, { + "id": "6cbc0ac7", "cell_type": "markdown", - "id": "b45f7528-5054-4006-9dd1-0347c5fb7bae", "metadata": {}, "source": [ "
    \n", @@ -744,5 +790,5 @@ } }, "nbformat": 4, - "nbformat_minor": 4 + "nbformat_minor": 5 } diff --git a/notebooks/real-time-anomaly-detection/notebook.ipynb b/notebooks/real-time-anomaly-detection/notebook.ipynb index c4cd9fe8..66e32329 100644 --- a/notebooks/real-time-anomaly-detection/notebook.ipynb +++ b/notebooks/real-time-anomaly-detection/notebook.ipynb @@ -1,8 +1,8 @@ { "cells": [ { + "id": "df40bfbd", "cell_type": "markdown", - "id": "951e6991-8cdd-453d-bad4-dd1472d29ff7", "metadata": {}, "source": [ "
    \n", @@ -19,28 +19,27 @@ { "attachments": {}, "cell_type": "markdown", - "id": "3238cf7d-5a34-4463-8bf7-f64f89fc3055", "metadata": {}, "source": [ "In this notebook, we embark on a cutting-edge exploration of real-time anomaly detection in IoT sensor data, harnessing the robust capabilities of SingleStoreDB and advanced analytical techniques. Our journey begins with the efficient ingestion of sensor data into SingleStoreDB, setting the stage for dynamic and insightful analysis. The heart of this notebook lies in its innovative approach to handling and interpreting sensor data. We utilize the power of vector embeddings, generated through the UMAP library, to transform high-dimensional sensor readings into a format ripe for anomaly detection. These embeddings, capturing the essence of weather parameters like wind, rain, and temperature, are then seamlessly integrated into SingleStoreDB.\n", "\n", "Our focus intensifies as we apply SingleStoreDB's dot_product function to these embeddings, unearthing anomalies in real-time. This not only provides a swift identification of irregularities but also paints a vivid picture of sensor data behavior over time. We don\u2019t just stop at detection; the notebook further enriches the data analysis with a visually engaging, real-time dashboard. This dashboard, crafted using Plotly and Rich libraries, offers an interactive and constantly updated view of the anomalies, allowing users to monitor and respond to sensor data trends as they happen. Join us in this exciting venture as we blend SQL, SingleStoreDB, and Python to unlock new possibilities in real-time anomaly detection. Whether you're a data scientist, an IoT enthusiast, or simply intrigued by the power of real-time analytics, this notebook is your gateway to understanding and leveraging the full potential of IoT sensor data." - ] + ], + "id": "9883af46" }, { "cell_type": "markdown", - "id": "f3df232a-aee8-4d16-ac9b-035336e2e1ad", "metadata": {}, "source": [ "
    \n", " \"Architecture\n", "
    " - ] + ], + "id": "af814072" }, { "attachments": {}, "cell_type": "markdown", - "id": "73457163-c407-4886-bb54-b869d22a0bea", "metadata": {}, "source": [ "## Database Setup\n", @@ -72,12 +71,12 @@ " - **`CREATE TABLE sensor_data_stage`**\n", " Serves as a staging area for raw sensor data before processing.\n", " - **Columns**: Similar to `sensor_data_with_vectors`, but used for staging raw data." - ] + ], + "id": "36547140" }, { "cell_type": "code", "execution_count": 1, - "id": "21f4b459-f1cf-4419-94be-7a32cbe5062d", "metadata": {}, "outputs": [], "source": [ @@ -110,12 +109,12 @@ " embeddings text,\n", " date DATETIME\n", ");" - ] + ], + "id": "c7c8dac2" }, { "attachments": {}, "cell_type": "markdown", - "id": "d9b24a3c-0c7b-4f3d-afbd-38fdae6c8d74", "metadata": {}, "source": [ "## Setting Up and Initiating the Sensor Data Pipeline\n", @@ -162,12 +161,12 @@ "\n", "### Usage\n", "The establishment of these pipelines is essential for the real-time and historical analysis of IoT sensor data. `sensor_data_pipeline` facilitates the ingestion of historical data for retrospective analysis, while `sensor_realtime_data_pipeline` caters to ongoing, real-time data analysis needs." 
- ] + ], + "id": "59acc775" }, { "cell_type": "code", "execution_count": 2, - "id": "261175c2-9a3f-4b42-b542-e9216f6df555", "metadata": {}, "outputs": [], "source": [ @@ -180,34 +179,34 @@ "ENCLOSED BY '\"'\n", "LINES TERMINATED BY '\\n'\n", "IGNORE 1 LINES;" - ] + ], + "id": "fe95234f" }, { "cell_type": "code", "execution_count": 3, - "id": "2af087a8-2925-4808-bbe6-4fc224f3b083", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "START PIPELINE sensor_data_pipeline FOREGROUND;" - ] + ], + "id": "1b0a2602" }, { "cell_type": "code", "execution_count": 4, - "id": "14befa3d-637a-40ea-88bc-f27e8197e82d", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "SELECT * FROM sensor_data_with_vectors limit 2;" - ] + ], + "id": "a9916c45" }, { "cell_type": "code", "execution_count": 5, - "id": "50b4cc6a-e047-41c7-8dd9-95b6c5ef9b2e", "metadata": {}, "outputs": [], "source": [ @@ -221,34 +220,34 @@ "LINES TERMINATED BY '\\r\\n'\n", "IGNORE 1 LINES\n", "SET date = NOW();" - ] + ], + "id": "89dc6da9" }, { "cell_type": "code", "execution_count": 6, - "id": "efa7eaf0-8238-4c9e-87df-861979fae434", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "START PIPELINE sensor_realtime_data_pipeline FOREGROUND;" - ] + ], + "id": "9de7f180" }, { "cell_type": "code", "execution_count": 7, - "id": "d3b777a0-7583-41d3-aaa2-f52a2cc45d95", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "SELECT * FROM sensor_data_stage limit 1;" - ] + ], + "id": "e8dca4f5" }, { "attachments": {}, "cell_type": "markdown", - "id": "495f261f-7143-492e-a08f-9daa06845db9", "metadata": {}, "source": [ "## Data Preparation for Analysis\n", @@ -277,22 +276,22 @@ " Fills null values in the DataFrame by propagating non-null values backward.\n", " - **`df = df.dropna()`**\n", " Drops any remaining rows with null values to ensure the dataset is clean for analysis." 
- ] + ], + "id": "339ddba3" }, { "cell_type": "code", "execution_count": 8, - "id": "672ea6de-3785-485c-9779-bd90f4538625", "metadata": {}, "outputs": [], "source": [ "!pip install umap-learn --quiet" - ] + ], + "id": "24f413c3" }, { "attachments": {}, "cell_type": "markdown", - "id": "08f0b338-2c66-42d2-bd08-8613c3b6ecfc", "metadata": {}, "source": [ "
    \n", @@ -302,12 +301,12 @@ "

    Restart Kernel if importing umap gives error

    \n", "
    \n", "
    " - ] + ], + "id": "6ddccf4c" }, { "cell_type": "code", "execution_count": 9, - "id": "ee9612f6-0598-4730-b01c-939f184498f7", "metadata": {}, "outputs": [], "source": [ @@ -318,12 +317,12 @@ "from sqlalchemy import create_engine\n", "import json\n", "import pandas as pd" - ] + ], + "id": "daffbe61" }, { "cell_type": "code", "execution_count": 10, - "id": "869ebb07-2294-4279-8470-f695b8d94b04", "metadata": {}, "outputs": [], "source": [ @@ -336,12 +335,12 @@ "df = df.bfill(axis=0)\n", "\n", "df = df.dropna()" - ] + ], + "id": "fc18d376" }, { "attachments": {}, "cell_type": "markdown", - "id": "ee1a3971-4006-4ef7-885f-d5bd8d67cbe5", "metadata": {}, "source": [ "## Generating Vector Embeddings using UMAP Library\n", @@ -374,12 +373,12 @@ "5. **Displaying Results**\n", " - **`new_df1.head()`**\n", " Displays the first few rows of `new_df1` to verify the embedding generation and integration process." - ] + ], + "id": "f0a68bd6" }, { "cell_type": "code", "execution_count": 11, - "id": "70bc031c-cc4a-4163-ab3c-d03b9591455a", "metadata": {}, "outputs": [], "source": [ @@ -389,33 +388,33 @@ "\n", "reducer = umap.UMAP(n_components=15)\n", "embeddings = reducer.fit_transform(features)" - ] + ], + "id": "0fba3856" }, { "cell_type": "code", "execution_count": 12, - "id": "ea92368c-16c6-41c0-ba42-0de5603f0d47", "metadata": {}, "outputs": [], "source": [ "normalized_embeddings = normalize(embeddings, norm='l2')\n", "new_df1['embeddings'] = list(normalized_embeddings)" - ] + ], + "id": "5491e571" }, { "cell_type": "code", "execution_count": 13, - "id": "5e9d01b2-1142-4a91-b384-c2e3e82ca4be", "metadata": {}, "outputs": [], "source": [ "new_df1.head()" - ] + ], + "id": "1346f84a" }, { "attachments": {}, "cell_type": "markdown", - "id": "83031520-8beb-42e5-8189-78fbef2ab67b", "metadata": {}, "source": [ "## Anomaly Detection and Data Integration\n", @@ -434,12 +433,12 @@ " - Ensures appropriate data types for columns in `new_df` (e.g., converting `date` to datetime, `city`, 
`longitude`, `latitude` to strings, etc.).\n", "- **Appending to SQL Table**:\n", " - `new_df.to_sql('sensor_data_with_vectors', con=engine, if_exists='append', index=False)` appends the processed data in `new_df` to the `sensor_data_with_vectors` table in the database." - ] + ], + "id": "aafe9b25" }, { "cell_type": "code", "execution_count": 14, - "id": "0a4d4069-5699-45fa-b0de-393e73271053", "metadata": {}, "outputs": [], "source": [ @@ -478,22 +477,22 @@ " else:\n", " # set anomaly to None or some default value\n", " new_df.loc[index, 'anomaly'] = 'none'" - ] + ], + "id": "21179b1e" }, { "cell_type": "code", "execution_count": 15, - "id": "7f9ff388-6b80-4f10-878b-48fd7e65ea66", "metadata": {}, "outputs": [], "source": [ "new_df.head()" - ] + ], + "id": "65202ca1" }, { "cell_type": "code", "execution_count": 16, - "id": "3f279837-caf1-47aa-8334-efa71a5c5e9d", "metadata": {}, "outputs": [], "source": [ @@ -510,12 +509,12 @@ "\n", "# Append data to SQL table\n", "new_df.to_sql('sensor_data_with_vectors', con=engine, if_exists='append', index=False)" - ] + ], + "id": "ec523747" }, { "attachments": {}, "cell_type": "markdown", - "id": "50236f0e-de7a-4be7-851e-cb24be8ec435", "metadata": {}, "source": [ "## Dashboard for Monitoring Anomalies over Time\n", @@ -538,12 +537,12 @@ "- **City-Specific Anomaly Trends**:\n", " - Further groups data by `city` along with `date_only` and `anomaly`.\n", " - Loops through a predefined list of cities to create separate histograms for each city, showcasing city-specific anomaly trends." 
- ] + ], + "id": "a41f2f1e" }, { "cell_type": "code", "execution_count": 17, - "id": "58abb86c-a8ae-4e4a-8c48-fc9c5fba67b5", "metadata": {}, "outputs": [], "source": [ @@ -551,23 +550,23 @@ "import plotly.express as px\n", "from sqlalchemy import create_engine\n", "engine = create_engine(connection_url)" - ] + ], + "id": "4dbdf3b3" }, { "cell_type": "code", "execution_count": 18, - "id": "52e860f3-64ff-4a0f-96e4-3cdaa5da88ff", "metadata": {}, "outputs": [], "source": [ "# df = pd.read_sql('select * from sensor_data_with_vectors limit 50000;', engine)\n", "df = pd.read_sql(\"select * from sensor_data_with_vectors where anomaly <> 'none' limit 50000;\", engine)" - ] + ], + "id": "55056545" }, { "cell_type": "code", "execution_count": 19, - "id": "ae6a5994-fae4-453d-add6-5cfc614fe6c2", "metadata": {}, "outputs": [], "source": [ @@ -582,12 +581,12 @@ "\n", "# Show plot\n", "fig.show()" - ] + ], + "id": "99b8e549" }, { "cell_type": "code", "execution_count": 20, - "id": "6f0fe862-fbb4-4f4b-847a-213e50fdb9bb", "metadata": {}, "outputs": [], "source": [ @@ -603,12 +602,12 @@ " fig = px.histogram(city_df, x='date_only', y='counts', color='anomaly',\n", " title=f'Anomalies over Time for {city}', labels={'date_only': 'Date', 'counts': 'Anomaly Count'})\n", " fig.show()" - ] + ], + "id": "2ac31b5f" }, { "attachments": {}, "cell_type": "markdown", - "id": "dd964c7c-b08a-4a68-b2ec-88267d47d78f", "metadata": {}, "source": [ "## Real-Time Anomaly Detection Dashboard\n", @@ -633,22 +632,22 @@ " - Adds the top 50 rows from the fetched data to the table.\n", " - Styles rows based on the anomaly type (e.g., different colors for different anomaly types).\n", " - Refreshes the display every 10 seconds, fetching updated data from the database." 
- ] + ], + "id": "4106f242" }, { "cell_type": "code", "execution_count": 21, - "id": "3c2137ab-acfc-4dfb-9c99-f09d5863b1d4", "metadata": {}, "outputs": [], "source": [ "!pip install tabulate pymysql Ipython rich --quiet" - ] + ], + "id": "e32e2f01" }, { "cell_type": "code", "execution_count": 22, - "id": "83fecd11-e017-44b5-9845-27404b829b9a", "metadata": {}, "outputs": [], "source": [ @@ -660,12 +659,12 @@ "from rich.table import Table\n", "from rich import box\n", "from IPython.display import clear_output" - ] + ], + "id": "12c90c07" }, { "cell_type": "code", "execution_count": 23, - "id": "33117fdf-f1d5-4ad2-b957-7ca75586b969", "metadata": {}, "outputs": [], "source": [ @@ -739,11 +738,12 @@ "\n", "# Call the function to start displaying the table contents\n", "display_table_contents()" - ] + ], + "id": "02ed29ff" }, { + "id": "bf219c72", "cell_type": "markdown", - "id": "fd0e71ca-601d-4a30-8a0c-fc016e473dd7", "metadata": {}, "source": [ "
    \n", diff --git a/notebooks/real-time-recommendation-engine/notebook.ipynb b/notebooks/real-time-recommendation-engine/notebook.ipynb index 4a9b6fec..068f1a47 100644 --- a/notebooks/real-time-recommendation-engine/notebook.ipynb +++ b/notebooks/real-time-recommendation-engine/notebook.ipynb @@ -1,8 +1,8 @@ { "cells": [ { + "id": "636e0777", "cell_type": "markdown", - "id": "6c991811-dee6-4315-b831-320573e8e06f", "metadata": {}, "source": [ "
    \n", @@ -17,8 +17,8 @@ ] }, { + "id": "2c650094", "cell_type": "markdown", - "id": "fee274a2-fc65-43ff-9f07-cfb83f1cbc40", "metadata": {}, "source": [ "
    \n", @@ -36,7 +36,8 @@ "metadata": {}, "source": [ "# How to build a real-time recommendation engine with SingleStore & Vercel" - ] + ], + "id": "f7018e9e" }, { "attachments": { @@ -55,7 +56,8 @@ "### Architecture:\n", "\n", "![Screenshot 2024-01-12 at 2.13.37 PM.png](attachment:c7f1d715-a955-408e-87f4-fdc1e1b3dc05.png)" - ] + ], + "id": "29a5f5cd" }, { "attachments": {}, @@ -72,7 +74,8 @@ "4. Combine the results of the semantic search with analytics on the public benchmarks, # likes, # downloads of these models.\n", "6. Power the app entirely on a single SingleStore Free Shared Tier Workspace.\n", "7. Ensure that all of the latest posts / scores are reflected in the App. Power this entirely with SingleStore Notebook and Job Service" - ] + ], + "id": "983e7229" }, { "attachments": {}, @@ -87,7 +90,8 @@ "- Step 5: Creating Helper Functions to load data into SingleStore\n", "- Step 6: Loading data with embeddings into SingleStore\n", "- Step 7: Building the Recommendation Engine Algorithm on Vercel" - ] + ], + "id": "c28ac257" }, { "attachments": {}, @@ -97,7 +101,8 @@ "## Step 1. Create a Starter Workspace\n", "\n", "Create a new Workpsace Group and select a Starter Workspace. If you do not have this enabled email pm@singlestore.com" - ] + ], + "id": "81914336" }, { "attachments": {}, @@ -105,7 +110,8 @@ "metadata": {}, "source": [ "## Step 2. Install and import required libraries" - ] + ], + "id": "a085e5da" }, { "cell_type": "code", @@ -130,7 +136,8 @@ "from markdown import markdown\n", "from datetime import datetime\n", "from time import time, sleep" - ] + ], + "id": "ec21352e" }, { "attachments": {}, @@ -140,7 +147,8 @@ "## Step 3. Seting Environment variables\n", "\n", "### 3.1. Set the app common variables. 
Do not change these" - ] + ], + "id": "416e8ae7" }, { "cell_type": "code", @@ -157,7 +165,8 @@ "LEADERBOARD_DATASET_URL = 'https://llm-recommender.vercel.app/datasets/leaderboard.json'\n", "TOKENS_LIMIT = 2047\n", "TOKENS_TRASHHOLD_LIMIT = TOKENS_LIMIT - 128" - ] + ], + "id": "09bca45d" }, { "attachments": {}, @@ -172,7 +181,8 @@ "1. [Open the OpenAI API keys page](https://platform.openai.com/api-keys)\n", "2. Create a new key\n", "3. Copy the key and paste it into the `OPENAI_API_KEY` variable" - ] + ], + "id": "9adbecb2" }, { "cell_type": "code", @@ -181,7 +191,8 @@ "outputs": [], "source": [ "OPENAI_API_KEY = getpass.getpass(\"enter openAI apikey here\")" - ] + ], + "id": "b1526fb8" }, { "attachments": {}, @@ -194,7 +205,8 @@ "1. [Open the HuggingFace Access Tokens page](https://huggingface.co/settings/tokens)\n", "2. Create a new token\n", "3. Copy the key and paste it into the `HF_TOKEN` variable" - ] + ], + "id": "ebcda1f7" }, { "cell_type": "code", @@ -203,7 +215,8 @@ "outputs": [], "source": [ "HF_TOKEN = getpass.getpass(\"enter HuggingFace apikey here\")" - ] + ], + "id": "4bc1c6f8" }, { "attachments": {}, @@ -216,7 +229,8 @@ "2. Add a new app\n", "3. Fill the form\n", "4. Generate a Bearer Token and paste it into the `TWITTER_BEARER_TOKEN` variable" - ] + ], + "id": "96f115fb" }, { "cell_type": "code", @@ -225,7 +239,8 @@ "outputs": [], "source": [ "TWITTER_BEARER_TOKEN = getpass.getpass(\"enter Twitter Bearer Token here\")" - ] + ], + "id": "bfc41462" }, { "attachments": {}, @@ -237,7 +252,8 @@ "1. [Open the Register new GitHub App page](https://github.com/settings/apps/new)\n", "2. Fill the form\n", "3. 
Get an access token and paste it into the `GITHUB_ACCESS_TOKEN` variable" - ] + ], + "id": "af33ae31" }, { "cell_type": "code", @@ -246,7 +262,8 @@ "outputs": [], "source": [ "GITHUB_ACCESS_TOKEN = getpass.getpass(\"enter Github Access Token here\")" - ] + ], + "id": "237b4e4a" }, { "attachments": {}, @@ -268,7 +285,8 @@ "- `readmes_table` - table with model readme texts from the HugginFace model pages (used in semantic search)\n", "- `twitter_posts` - table with tweets related to models (used in semantic search)\n", "- `github_repos` - table with GitHub readme texts related to models (used in semantic search)" - ] + ], + "id": "825c8b97" }, { "cell_type": "markdown", @@ -281,7 +299,8 @@ "

    Make sure to select a database from the drop-down menu at the top of this notebook. It updates the connection_url to connect to that database.

    \n", "
    \n", "
    " - ] + ], + "id": "da4ce011" }, { "cell_type": "code", @@ -403,7 +422,8 @@ " created_at = created_at.strftime('%Y-%m-%dT%H:%M:%SZ')\n", "\n", " return created_at" - ] + ], + "id": "e8bb7c0d" }, { "attachments": {}, @@ -413,7 +433,8 @@ "## Step 5. Creating helper functions to load data into SingleStore\n", "\n", "### 5.1. Setting up the `openai.api_key`" - ] + ], + "id": "daa576eb" }, { "cell_type": "code", @@ -422,7 +443,8 @@ "outputs": [], "source": [ "openai.api_key = OPENAI_API_KEY" - ] + ], + "id": "2a53c729" }, { "attachments": {}, @@ -431,7 +453,8 @@ "source": [ "### 5.2. Create the `create_embeddings` function\n", "This function will be used to create embeddings on data based on an input to the function. We will be doing this to all data pulled from Github, HuggingFace and Twitter. The vector embeddings created will be stored in the same SingleStore table as a separate column." - ] + ], + "id": "26a4f3cc" }, { "cell_type": "code", @@ -450,7 +473,8 @@ " except Exception as e:\n", " print(e)\n", " return [[]]" - ] + ], + "id": "4e1ffcd1" }, { "attachments": {}, @@ -459,7 +483,8 @@ "source": [ "### 5.3. Create the function/Utils to help parse the data ingested from the various sources\n", "This is a set of functions that ensure the JSON is in the right format and can be stored in SingleStore as a JSON column. In your Free Shared Tier workspace you can bring data of various formats (JSON, Geospatial, Vector) and interact with this data with SQL and MongoDB API." - ] + ], + "id": "6e7cb906" }, { "cell_type": "code", @@ -525,7 +550,8 @@ " new_string = remove_links(new_string)\n", "\n", " return new_string" - ] + ], + "id": "e238f7bf" }, { "attachments": {}, @@ -533,7 +559,8 @@ "metadata": {}, "source": [ "## Step 6. Loading Data into SingleStore" - ] + ], + "id": "2b912955" }, { "attachments": {}, @@ -543,7 +570,8 @@ "### 6.1. 
Load Data on all Open-Source LLM models from [HuggingFace Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard)\n", "This function loads a pre-generated Open LLM Leaderboard dataset. Based on this dataset, all model data is created and inserted into the database.\n", "We will also create embeddings for all of this data pulled using the OpenAI Embedding Model." - ] + ], + "id": "ba02f219" }, { "cell_type": "code", @@ -625,7 +653,8 @@ " for i, row in leaderboard_df.iterrows():\n", " if not row['repo_id'] in existed_model_repo_ids:\n", " leaderboard_insert_model(row.to_dict())" - ] + ], + "id": "3d6ca6f2" }, { "attachments": {}, @@ -638,7 +667,8 @@ "This allows us to see in which kinds of scenarios are developers using a particular LLM and incoporate it as a part of our recommendation.\n", "\n", "In the first step we search for the model using the github API" - ] + ], + "id": "2998a051" }, { "cell_type": "code", @@ -704,7 +734,8 @@ " return repos\n", "\n", " return repos" - ] + ], + "id": "d8d46df1" }, { "attachments": {}, @@ -712,7 +743,8 @@ "metadata": {}, "source": [ "After we conduct this serach, we will insert it into another table in the database. The data inserted will have embeddings associated with it." - ] + ], + "id": "66bdf649" }, { "cell_type": "code", @@ -776,7 +808,8 @@ " github_insert_model_repos(repo_id, found_repos)\n", " except Exception as e:\n", " print('Error github_process_models_repos: ', e)" - ] + ], + "id": "6c431f53" }, { "attachments": {}, @@ -784,7 +817,8 @@ "metadata": {}, "source": [ "### 6.3. Load Data from Twitter about these models." - ] + ], + "id": "5d4a67f3" }, { "attachments": {}, @@ -792,7 +826,8 @@ "metadata": {}, "source": [ "First, we will search Twitter based on the model names we have using the API." 
- ] + ], + "id": "8b11c16a" }, { "cell_type": "code", @@ -822,7 +857,8 @@ " return posts\n", "\n", " return posts" - ] + ], + "id": "a7deb38b" }, { "attachments": {}, @@ -830,7 +866,8 @@ "metadata": {}, "source": [ "Next, we will add the text from the posts per model into another table. This table will also have embeddings associated with it." - ] + ], + "id": "0a4f975f" }, { "cell_type": "code", @@ -881,7 +918,8 @@ " twitter_insert_model_posts(repo_id, found_posts)\n", " except Exception as e:\n", " print('Error twitter_process_models_posts: ', e)" - ] + ], + "id": "28ce9447" }, { "attachments": {}, @@ -894,7 +932,8 @@ "Next, it executes a query to retrieve all the models in the database. Based on these models, Twitter posts, Reddit posts, and GitHub repositories are searched, converted into embeddings and inserted into tables.\n", "\n", "Finally, we get a ready set of data for finding the most appropriate model for any use case using semantic search." - ] + ], + "id": "be8b1474" }, { "cell_type": "code", @@ -907,7 +946,8 @@ "existed_models = get_models('repo_id, name', f'ORDER BY score DESC LIMIT {MODELS_LIMIT}')\n", "twitter_process_models_posts(existed_models)\n", "github_process_models_repos(existed_models)" - ] + ], + "id": "964c2e55" }, { "attachments": {}, @@ -919,7 +959,8 @@ "This will ensure that you can capture the latest sentiment and usage from Twitter / Github about developers.\n", "\n", "SingleStore Notebook + Job Service makes it really easy to bring real-time data to your vector-based searches and AI/ML models downstream. You can ensure that the data is in the right format and apply python based transformations like creating embeddings on the most newly ingested data. 
This would've previously required a combination of several serverless technologies alongside your database as we wrote about this [previously](https://www.singlestore.com/blog/a-serverless-architecture-for-creating-openai-embeddings-with-singlestoredb/)" - ] + ], + "id": "aa29569c" }, { "attachments": {}, @@ -927,7 +968,8 @@ "metadata": {}, "source": [ "## (Optional) Step 7: Host the app with Vercel" - ] + ], + "id": "5ae8ca82" }, { "cell_type": "markdown", @@ -936,11 +978,12 @@ "Follow our github [repo](https://github.com/singlestore-labs/llm-recommender/tree/main) where we showcase how to write the front end code of the app which does the vector similarity search to provide the results.The front end is built with our [elegance SDK](https://elegancesdk.com/) and hosted with Vercel.\n", "\n", "See our [guide](https://docs.singlestore.com/cloud/integrate-with-singlestoredb-cloud/connect-with-vercel/) on our vercel integration with SingleStore. We have a public version of the app running for free [here](https://llm-recommender.vercel.app/)." - ] + ], + "id": "27ffde9a" }, { + "id": "ca0c4a75", "cell_type": "markdown", - "id": "996c0586-1c4b-4c1f-aa37-240d11f544eb", "metadata": {}, "source": [ "
    \n", @@ -968,5 +1011,5 @@ } }, "nbformat": 4, - "nbformat_minor": 4 + "nbformat_minor": 5 } diff --git a/notebooks/representing-unstructured-data-as-vectors/notebook.ipynb b/notebooks/representing-unstructured-data-as-vectors/notebook.ipynb index 737f1e64..3ce02261 100644 --- a/notebooks/representing-unstructured-data-as-vectors/notebook.ipynb +++ b/notebooks/representing-unstructured-data-as-vectors/notebook.ipynb @@ -1,8 +1,8 @@ { "cells": [ { + "id": "0ec251f1", "cell_type": "markdown", - "id": "8c9f7c64", "metadata": {}, "source": [ "
    \n", @@ -18,72 +18,71 @@ }, { "cell_type": "markdown", - "id": "e868fad2-ab36-4544-a5af-01f5f738d2ca", "metadata": {}, "source": [ "## Representing Unstructured Data as Vectors\n", "Visualizing these vectors and measuring the distance between these vectors using various methods such as Manhattan Distance, Euclidean Distance, Cosine Distance & Dot Product" - ] + ], + "id": "565b961f" }, { "cell_type": "markdown", - "id": "9a659bec-ce40-482f-8819-e6a17bc38b54", "metadata": {}, "source": [ "Let's take an example of two pets and visualize them in a 3D space. We will try to find the Manhattan Distance, Euclidean Distance, Cosine Distance & Dot Product between these two pets." - ] + ], + "id": "7ac857fe" }, { "cell_type": "markdown", - "id": "2167d436-e922-49a2-a3ba-a873d4738a9a", "metadata": {}, "source": [ "Hypothetically consider these vectors for to represent dog and cat.\n", "dog = [5, 30, 2]\n", "cat = [3, 25, 4]" - ] + ], + "id": "916063d3" }, { "cell_type": "markdown", - "id": "33a6d04e-170f-4e00-ab87-633661c1d92d", "metadata": {}, "source": [ "### Install the libraries required" - ] + ], + "id": "bbae7661" }, { "cell_type": "code", "execution_count": 1, - "id": "4266432c-04be-4cbc-b079-125b621a6527", "metadata": {}, "outputs": [], "source": [ "!pip install numpy matplotlib --quiet" - ] + ], + "id": "386e29fc" }, { "cell_type": "markdown", - "id": "d910f26d-0f08-4771-b468-7dbf5b477f1a", "metadata": {}, "source": [ "### Import the libraries" - ] + ], + "id": "7207d8e9" }, { "cell_type": "code", "execution_count": 2, - "id": "f6672c54-e90c-44ff-9005-ecd733d13805", "metadata": {}, "outputs": [], "source": [ "import matplotlib.pyplot as plt\n", "import numpy as np" - ] + ], + "id": "8d9089b4" }, { "cell_type": "code", "execution_count": 3, - "id": "f8d00505-6a2d-4139-a7bd-f3b84f121d82", "metadata": {}, "outputs": [], "source": [ @@ -120,55 +119,55 @@ "ax.set_title('3D Representation of Pets')\n", "\n", "plt.show()" - ] + ], + "id": "ea181a08" }, { "cell_type": 
"markdown", - "id": "8c7ce294-3148-4c43-b3c6-e3a879a30edc", "metadata": {}, "source": [ "### Manhattan Distance" - ] + ], + "id": "988ef57c" }, { "cell_type": "markdown", - "id": "bc4e2bc6-1fe9-4af1-9bde-4ca9b5da26ea", "metadata": {}, "source": [ "Manhattan distance is like calculating the total distance you would travel between two points (dog and cat here) if you could only move in straight lines" - ] + ], + "id": "7252eade" }, { "cell_type": "code", "execution_count": 4, - "id": "76716923-7235-43f7-9833-a9790192972f", "metadata": {}, "outputs": [], "source": [ "L1 = [abs(dog[i] - cat[i]) for i in range(len(dog))]\n", "sum(L1)" - ] + ], + "id": "5bddfc07" }, { "cell_type": "markdown", - "id": "f0217e8f-fa53-4f01-8f6d-1a8934b740e6", "metadata": {}, "source": [ "### Euclidean Distance" - ] + ], + "id": "5ab22a1e" }, { "cell_type": "markdown", - "id": "f4496860-6428-41f1-9a70-1c5c0b2b79d0", "metadata": {}, "source": [ "Euclidean distance is like the straight-line distance between two points, as if you could draw a straight line from one point to another, not limited by any paths or grids, similar to how a bird would fly directly between two locations." - ] + ], + "id": "f5bcdc90" }, { "cell_type": "code", "execution_count": 5, - "id": "3b88cae7-8702-4663-bc1e-ee11810274ae", "metadata": {}, "outputs": [], "source": [ @@ -176,64 +175,65 @@ "\n", "L2 = np.sqrt(np.array(L2).sum())\n", "L2" - ] + ], + "id": "ec15b420" }, { "cell_type": "markdown", - "id": "63c3f243-38f7-44fe-b121-b02d81580f5d", "metadata": {}, "source": [ "### Cosine Distance" - ] + ], + "id": "6591546d" }, { "cell_type": "markdown", - "id": "f260bccd-17aa-498a-8c1c-26c1fb342837", "metadata": {}, "source": [ "Cosine distance is a measure of orientation rather than actual distance; it's like comparing the directions in which two arrows are pointing, regardless of how long the arrows are or where they are located." 
- ] + ], + "id": "3d04c0e3" }, { "cell_type": "code", "execution_count": 6, - "id": "bc2dba64-efe4-489d-9f51-7fc7c21bab0a", "metadata": {}, "outputs": [], "source": [ "cosine = np.dot(dog, cat) / (np.linalg.norm(dog) * np.linalg.norm(cat))\n", "cosine" - ] + ], + "id": "5bd73484" }, { "cell_type": "markdown", - "id": "42ac6850-578e-4326-8f78-b9815ceee344", "metadata": {}, "source": [ "### Dot Product" - ] + ], + "id": "315867e3" }, { "cell_type": "markdown", - "id": "6581d30d-4f17-4439-8ab9-5c720537ae9f", "metadata": {}, "source": [ "The dot product is like measuring how much one vector goes in the same direction as another. It's like comparing two arrows and seeing how much one arrow points in the same direction as the other." - ] + ], + "id": "68b952bf" }, { "cell_type": "code", "execution_count": 7, - "id": "83f3e765-c8e5-44d2-9e8f-660bcf3a86b6", "metadata": {}, "outputs": [], "source": [ "np.dot(dog,cat)" - ] + ], + "id": "8cdea606" }, { + "id": "da390400", "cell_type": "markdown", - "id": "cb02aafc", "metadata": {}, "source": [ "
    \n", diff --git a/notebooks/restore-database-from-s3/notebook.ipynb b/notebooks/restore-database-from-s3/notebook.ipynb index 5c8b0b16..73b58c69 100644 --- a/notebooks/restore-database-from-s3/notebook.ipynb +++ b/notebooks/restore-database-from-s3/notebook.ipynb @@ -1,8 +1,8 @@ { "cells": [ { + "id": "2941235d", "cell_type": "markdown", - "id": "5eb7d07e-1d70-4fcd-9b53-b446d91c2bb6", "metadata": {}, "source": [ "
    \n", @@ -41,7 +41,8 @@ "## Questions?\n", "\n", "Reach out to us through our [forum](https://www.singlestore.com/forum)." - ] + ], + "id": "7f663e34" }, { "cell_type": "markdown", @@ -66,7 +67,8 @@ "
    \n",
             "select * from information_schema.MV_BACKUP_HISTORY where STATUS = 'Success' and DATABASE_NAME = {database_name} order by BACKUP_ID desc\n",
             "
    " - ] + ], + "id": "23b493f3" }, { "attachments": {}, @@ -77,7 +79,8 @@ "\n", " SELECT * from information_schema.MV_BACKUP_HISTORY\n", "" - ] + ], + "id": "5cf876e2" }, { "attachments": {}, @@ -85,7 +88,8 @@ "metadata": {}, "source": [ "### Imports" - ] + ], + "id": "3c0750f6" }, { "cell_type": "code", @@ -99,7 +103,8 @@ "\n", "import singlestoredb as s2\n", "from IPython.display import display, HTML" - ] + ], + "id": "69b323c6" }, { "attachments": {}, @@ -107,7 +112,8 @@ "metadata": {}, "source": [ "### Variables" - ] + ], + "id": "69c02a1e" }, { "cell_type": "code", @@ -121,7 +127,8 @@ "aws_session_token = ''\n", "target_db_name = None\n", "backup_id = None" - ] + ], + "id": "db3341f9" }, { "attachments": {}, @@ -129,7 +136,8 @@ "metadata": {}, "source": [ "### Functions to display various alerts" - ] + ], + "id": "0fb9a0d5" }, { "cell_type": "code", @@ -189,7 +197,8 @@ "

    {success_msg}

    \n", "
    \n", "
    '''))" - ] + ], + "id": "1e419bac" }, { "attachments": {}, @@ -197,7 +206,8 @@ "metadata": {}, "source": [ "### Log Control" - ] + ], + "id": "376ebfc6" }, { "cell_type": "code", @@ -210,7 +220,8 @@ " logging.getLogger().setLevel(logging.DEBUG)\n", " else:\n", " logging.getLogger().setLevel(logging.ERROR)" - ] + ], + "id": "548733a1" }, { "attachments": {}, @@ -221,7 +232,8 @@ "\n", "To enable logs\n", "Modify 'enable_debug_logs(False)' to 'enable_debug_logs(True)' in code above" - ] + ], + "id": "32ce230d" }, { "attachments": {}, @@ -229,7 +241,8 @@ "metadata": {}, "source": [ "### Function to generate restore statement" - ] + ], + "id": "83c3fcb2" }, { "cell_type": "code", @@ -263,7 +276,8 @@ " data.write('}\\' ')\n", " logging.debug(f'statement: {data.getvalue()}')\n", " return data.getvalue()" - ] + ], + "id": "752b4492" }, { "cell_type": "code", @@ -334,7 +348,8 @@ "\n", "# End of script execution\n", "print('\\n\\nScript execution completed')" - ] + ], + "id": "739c1756" }, { "attachments": {}, @@ -342,7 +357,8 @@ "metadata": {}, "source": [ "### Verify Result" - ] + ], + "id": "884241f2" }, { "cell_type": "code", @@ -351,11 +367,12 @@ "outputs": [], "source": [ "%sql select schema_name from information_schema.SCHEMATA;" - ] + ], + "id": "5fe37c9c" }, { + "id": "e10b5a37", "cell_type": "markdown", - "id": "cafa399a-cdca-47ee-b0ac-d863bb1d2fec", "metadata": {}, "source": [ "
    \n", @@ -383,5 +400,5 @@ } }, "nbformat": 4, - "nbformat_minor": 4 + "nbformat_minor": 5 } diff --git a/notebooks/resume-evaluator-ANN-index-search/notebook.ipynb b/notebooks/resume-evaluator-ANN-index-search/notebook.ipynb index b7c1bcef..2ecd24ce 100644 --- a/notebooks/resume-evaluator-ANN-index-search/notebook.ipynb +++ b/notebooks/resume-evaluator-ANN-index-search/notebook.ipynb @@ -1,8 +1,8 @@ { "cells": [ { + "id": "3b7dea3b", "cell_type": "markdown", - "id": "16a9df3f-11ef-4e5d-b0b0-60135f0f462d", "metadata": {}, "source": [ "
    \n", @@ -17,8 +17,8 @@ ] }, { + "id": "17190c32", "cell_type": "markdown", - "id": "88802017-5fcf-44a4-a6c1-bd4902551fc4", "metadata": {}, "source": [ "
    \n", @@ -33,7 +33,6 @@ { "attachments": {}, "cell_type": "markdown", - "id": "d96c8713-7f80-4f37-9f6d-ca73012205ec", "metadata": {}, "source": [ "In this notebook, we will showcase a practical use-case of evaluating resumes leveraging the combined powers of SingleStoreDB and OpenAI. Moving beyond traditional resume-matching techniques, we introduce a method that dives deeper into the nuances of resume content and the specific needs of a job description.\n", @@ -43,7 +42,8 @@ "When it's time to match a resume to a job description, we translate the latter into its vector form. Using a dot_product operation, we search against the table housing the resume summaries' embeddings. This provides us with resumes most aligned with the job description. To add another layer of precision, the matched resumes are further evaluated alongside the job description using OpenAI's LLM, offering a comprehensive assessment.\n", "\n", "Join us on this journey to redefine resume evaluations using SQL, SingleStoreDB, and OpenAI for better, more insightful matches!" - ] + ], + "id": "03fad061" }, { "attachments": { @@ -52,16 +52,15 @@ } }, "cell_type": "markdown", - "id": "2760efe7-22d0-415a-bdeb-df893b884e55", "metadata": {}, "source": [ "![resume_matcher.jpg](attachment:be902f72-2270-4a82-98c3-ace892a26458.jpg)" - ] + ], + "id": "bd0dca79" }, { "attachments": {}, "cell_type": "markdown", - "id": "3896f5d3-5f34-462c-a10e-da1d97ba904e", "metadata": {}, "source": [ "## 1. Create a workspace in your workspace group\n", @@ -69,19 +68,19 @@ "S-00 is sufficient.\n", "\n", "## 2. Create a database named `resume_evaluator`" - ] + ], + "id": "222b7901" }, { "cell_type": "markdown", - "id": "263516d2-3de3-41d5-b35c-fc4076b6f319", "metadata": {}, "source": [ "#### If you want to use free tier, follow below steps" - ] + ], + "id": "c077c872" }, { "cell_type": "markdown", - "id": "a63a004d-396d-4c86-b074-96bc6221364c", "metadata": {}, "source": [ "
    \n", @@ -91,12 +90,12 @@ "

    If you have a Free Starter Workspace deployed already, select the database from the drop-down menu at the top of this notebook. It updates the connection_url to connect to that database.

    \n", "
    \n", "
    " - ] + ], + "id": "a9fea1ed" }, { "cell_type": "code", "execution_count": 1, - "id": "50d55b12-2be8-4a80-84dc-d7ce7826eac6", "metadata": {}, "outputs": [], "source": [ @@ -104,11 +103,11 @@ "if not shared_tier_check or shared_tier_check[0][1] == 'OFF':\n", " %sql DROP DATABASE IF EXISTS new_transactions;\n", " %sql CREATE DATABASE new_transactions;" - ] + ], + "id": "5cc2be0b" }, { "cell_type": "markdown", - "id": "0516909c-2b04-4ef8-812b-709b782f5d54", "metadata": {}, "source": [ "
    \n", @@ -118,11 +117,11 @@ "

    Make sure to select a database from the drop-down menu at the top of this notebook. It updates the connection_url to connect to that database.

    \n", "
    \n", "
    " - ] + ], + "id": "4419ebeb" }, { "cell_type": "markdown", - "id": "2cf96cd8-3322-4253-b3dd-2e517e6b750c", "metadata": {}, "source": [ "####\n", @@ -133,12 +132,12 @@ "

    If you want to use a dedicated workspace, select the workspace and database directly from the drop-down menu at the top, then follow the steps below to create the database.

    \n", "
    \n", "" - ] + ], + "id": "52800df3" }, { "cell_type": "code", "execution_count": 2, - "id": "9be90062-ea39-406e-9eb2-1a830af981c6", "metadata": {}, "outputs": [], "source": [ @@ -146,12 +145,12 @@ "DROP DATABASE IF EXISTS resume_evaluator;\n", "CREATE DATABASE resume_evaluator;\n", "USE resume_evaluator;" - ] + ], + "id": "38f7aa27" }, { "cell_type": "code", "execution_count": 3, - "id": "49701a42-bfef-4072-aee9-7dc40dc2b8cb", "metadata": {}, "outputs": [], "source": [ @@ -166,12 +165,12 @@ " resume_summary text,\n", " resume_embeddings vector(1536, F32)\n", " );" - ] + ], + "id": "15a3b7d1" }, { "attachments": {}, "cell_type": "markdown", - "id": "0a944957-8ddb-4838-af3a-22a56e8f4ff9", "metadata": {}, "source": [ "## 3. Install and import required libraries\n", @@ -179,22 +178,22 @@ "In this section, we will set up the necessary environment by installing some crucial libraries. For our task of extracting text from resume PDFs, we'll be using pdfminer.six. To interact with the OpenAI's LLM and manage our data efficiently, openai will be instrumental.\n", "\n", "The install process may take a couple minutes." - ] + ], + "id": "41e91d34" }, { "cell_type": "code", "execution_count": 4, - "id": "3d537041-7ae9-4b7f-aa27-0305a0425878", "metadata": {}, "outputs": [], "source": [ "!pip install -q pdfminer.six openai boto3 sqlalchemy.singlestoredb" - ] + ], + "id": "afb993ee" }, { "cell_type": "code", "execution_count": 5, - "id": "5e0f1a4d-60b6-4c09-98ca-8c194f4b3461", "metadata": {}, "outputs": [], "source": [ @@ -209,36 +208,36 @@ "import pandas as pd\n", "import numpy as np\n", "from sqlalchemy import text, create_engine" - ] + ], + "id": "302260ae" }, { "attachments": {}, "cell_type": "markdown", - "id": "60fb5de9-12a3-42b4-b9c9-738061600dc6", "metadata": {}, "source": [ "## 4. Create a function called `get_embedding()`\n", "In our workflow, we need a consistent way to transform textual content into vector embeddings. 
To achieve this, we introduce the get_embedding() function.\n", "\n", "This function takes in a piece of text and, by default, uses the \"text-embedding-ada-002\" model to produce embeddings. We ensure that any newline characters in the text are replaced with spaces to maintain the integrity of the input. The function then leverages OpenAI's API to generate and retrieve the embedding for the given text." - ] + ], + "id": "4049c4d1" }, { "cell_type": "code", "execution_count": 6, - "id": "bf0b8325-b447-48dd-8a6b-5565dfda4cf4", "metadata": {}, "outputs": [], "source": [ "api_key = getpass.getpass('OpenAI API Key: ')\n", "openai.api_key = api_key\n", "client = OpenAI(api_key = api_key)" - ] + ], + "id": "98e79c92" }, { "cell_type": "code", "execution_count": 7, - "id": "07683cdd-988d-468b-a81e-4e99453faf90", "metadata": {}, "outputs": [], "source": [ @@ -246,12 +245,12 @@ " text = text.replace(\"\\n\", \" \")\n", " response = openai.embeddings.create(input=[text], model=model)\n", " return response.data[0].embedding" - ] + ], + "id": "fc40779b" }, { "attachments": {}, "cell_type": "markdown", - "id": "ad6c68a5-23a0-4926-a6bc-1ac44731b73b", "metadata": {}, "source": [ "## 5. Create a function called `print_pdf_text()`\n", @@ -274,12 +273,12 @@ "\n", "#### Output:\n", "Returns the cleaned and formatted text from the PDF." - ] + ], + "id": "cc12e6a6" }, { "cell_type": "code", "execution_count": 8, - "id": "2373ad58-4057-4c07-9804-796d30595d49", "metadata": {}, "outputs": [], "source": [ @@ -310,12 +309,12 @@ " os.remove(temp_file_path)\n", "\n", " return cleaned_text" - ] + ], + "id": "b221e318" }, { "attachments": {}, "cell_type": "markdown", - "id": "8cfe7778-a8d4-48d9-88a8-45a4233e6581", "metadata": {}, "source": [ "## 6. Create a function called `pinfo_extractor()`\n", @@ -337,12 +336,12 @@ "\n", "### Output:\n", "Returns a dictionary with keys such as 'name', 'email', 'phone_no', and more, containing extracted information from the resume." 
- ] + ], + "id": "f38985e3" }, { "cell_type": "code", "execution_count": 9, - "id": "9942951c-f2df-4507-856a-a79212a4998e", "metadata": {}, "outputs": [], "source": [ @@ -400,12 +399,12 @@ " }\n", " # print(data_dict, \"\\n\")\n", " return data_dict;" - ] + ], + "id": "2f794c61" }, { "attachments": {}, "cell_type": "markdown", - "id": "ec23888e-bf29-437b-a87f-794af8654294", "metadata": {}, "source": [ "## 7. Create a function called `add_data_to_db()`\n", @@ -427,12 +426,12 @@ "\n", "### Output:\n", "Prints a confirmation message upon successful data insertion." - ] + ], + "id": "b8040ec7" }, { "cell_type": "code", "execution_count": 10, - "id": "f7bdd3e0-0579-4f99-a81f-491b5c7db733", "metadata": {}, "outputs": [], "source": [ @@ -455,12 +454,12 @@ " connection.commit()\n", " #print(\"\\nData Written to resumes_profile_data_2 table\")\n", " connection.close()" - ] + ], + "id": "343b97b7" }, { "attachments": {}, "cell_type": "markdown", - "id": "8abcb853-e58f-4ff8-ac95-f13d0d587557", "metadata": {}, "source": [ "## 8. Create a function called `search_resumes()`\n", @@ -480,12 +479,12 @@ "\n", "### Output:\n", "Returns a list of the top 5 most relevant resumes based on the given query." - ] + ], + "id": "9be97c20" }, { "cell_type": "code", "execution_count": 11, - "id": "d3f7f52e-44db-4ecb-b3a7-7dd01488bad3", "metadata": {}, "outputs": [], "source": [ @@ -507,12 +506,12 @@ " connection.close()\n", " engine.dispose()\n", " return result" - ] + ], + "id": "ef13d894" }, { "attachments": {}, "cell_type": "markdown", - "id": "c884ea95-f50f-4ec3-9cb0-88baaf07f2fa", "metadata": {}, "source": [ "## 9. Create a function called `evaluate_candidates()`\n", @@ -535,12 +534,12 @@ "Returns a list of tuples, where each tuple contains:\n", "- Candidate's name.\n", "- Evaluation response from the model, describing the compatibility of the candidate with the given job description." 
- ] + ], + "id": "61cacf55" }, { "cell_type": "code", "execution_count": 12, - "id": "6b714782-8049-4378-945f-ee23a3c9dbec", "metadata": {}, "outputs": [], "source": [ @@ -570,12 +569,12 @@ " response_text = response.choices[0].message.content # response['choices'][0]['message']['content']\n", " responses.append((name, response_text)) # Append the name and response_text to the responses list\n", " return responses" - ] + ], + "id": "bc1e33bb" }, { "cell_type": "code", "execution_count": 13, - "id": "b21990f9-710e-4e08-95d4-e12f7256c3ce", "metadata": {}, "outputs": [], "source": [ @@ -602,12 +601,12 @@ "\n", "urls = list_s3_files(bucket_name, path_prefix)\n", "# urls" - ] + ], + "id": "45c59dc0" }, { "cell_type": "code", "execution_count": 14, - "id": "c2d4c146-5b15-43b2-8a48-3d4f114a4a2c", "metadata": {}, "outputs": [], "source": [ @@ -625,22 +624,22 @@ " #print(\"Information extracted\\n\")\n", " add_data_to_db(ip_data_dict)\n", " # print(\"\\n\")" - ] + ], + "id": "a8b85b4c" }, { "cell_type": "code", "execution_count": 15, - "id": "68185f51-8a2b-44bc-8a82-a8ab2f86c14b", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "SELECT COUNT(*) FROM resumes_profile_data;" - ] + ], + "id": "552244c0" }, { "cell_type": "markdown", - "id": "516ff5d0", "metadata": {}, "source": [ "## 10. Using Vector Index in SingleStore\n", @@ -669,41 +668,41 @@ "\n", "\n", "Selecting the right vector index depends on your specific application needs, the size of your dataset, and the balance you wish to strike between query speed and accuracy. Experimenting with different index types and configurations is key to finding the optimal setup for your use case." 
- ] + ], + "id": "92a1e9a1" }, { "cell_type": "code", "execution_count": 16, - "id": "6ca3e505-ead0-4c78-99d0-8d822cad8253", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "ALTER TABLE resumes_profile_data\n", "ADD VECTOR INDEX ivf_flat (new_resume_embeddings) INDEX_OPTIONS '{\"index_type\":\"IVF_FLAT\"}';" - ] + ], + "id": "c539f610" }, { "cell_type": "code", "execution_count": 17, - "id": "c7aa7fb6-9c8d-4b2a-b628-1633f74672bb", "metadata": {}, "outputs": [], "source": [ "evaluate_candidates(input(\"Enter Job description : \\n\"))" - ] + ], + "id": "ecaef735" }, { "cell_type": "markdown", - "id": "39053ae5-e87e-4679-bc1d-5d5e48370ab4", "metadata": {}, "source": [ "## Clean up" - ] + ], + "id": "9f62313a" }, { "cell_type": "markdown", - "id": "c25b6a87-6abd-4ac1-8ae6-7449cc5f3270", "metadata": {}, "source": [ "
    \n", @@ -713,23 +712,24 @@ "

    If you created a new database in your Standard or Premium Workspace, you can drop the database by running the cell below. Note: this will not drop your database for Free Starter Workspaces. To drop a Free Starter Workspace, terminate the Workspace using the UI.

    \n", "
    \n", "" - ] + ], + "id": "7e35253c" }, { "cell_type": "code", "execution_count": 18, - "id": "2545e75b-d0e1-4ba4-bc8d-f38589aa5420", "metadata": {}, "outputs": [], "source": [ "shared_tier_check = %sql show variables like 'is_shared_tier'\n", "if not shared_tier_check or shared_tier_check[0][1] == 'OFF':\n", " %sql DROP DATABASE IF EXISTS new_transactions;" - ] + ], + "id": "4449d1ec" }, { + "id": "0764cf02", "cell_type": "markdown", - "id": "62acda9e-b80e-4de3-a6bd-3fb381f3fe51", "metadata": {}, "source": [ "
    \n", diff --git a/notebooks/resume-evaluator/notebook.ipynb b/notebooks/resume-evaluator/notebook.ipynb index 7c06d1bc..4416d581 100644 --- a/notebooks/resume-evaluator/notebook.ipynb +++ b/notebooks/resume-evaluator/notebook.ipynb @@ -1,8 +1,8 @@ { "cells": [ { + "id": "255b1ca3", "cell_type": "markdown", - "id": "16a9df3f-11ef-4e5d-b0b0-60135f0f462d", "metadata": {}, "source": [ "
    \n", @@ -17,8 +17,8 @@ ] }, { + "id": "e8993583", "cell_type": "markdown", - "id": "bdb16eeb", "metadata": {}, "source": [ "
    \n", @@ -33,7 +33,6 @@ { "attachments": {}, "cell_type": "markdown", - "id": "d96c8713-7f80-4f37-9f6d-ca73012205ec", "metadata": {}, "source": [ "In this notebook, we will showcase a practical use-case of evaluating resumes leveraging the combined powers of SingleStoreDB and OpenAI. Moving beyond traditional resume-matching techniques, we introduce a method that dives deeper into the nuances of resume content and the specific needs of a job description.\n", @@ -43,12 +42,12 @@ "When it's time to match a resume to a job description, we translate the latter into its vector form. Using a dot_product operation, we search against the table housing the resume summaries' embeddings. This provides us with resumes most aligned with the job description. To add another layer of precision, the matched resumes are further evaluated alongside the job description using OpenAI's LLM, offering a comprehensive assessment.\n", "\n", "Join us on this journey to redefine resume evaluations using SQL, SingleStoreDB, and OpenAI for better, more insightful matches!" - ] + ], + "id": "8e1fef21" }, { "attachments": {}, "cell_type": "markdown", - "id": "3896f5d3-5f34-462c-a10e-da1d97ba904e", "metadata": {}, "source": [ "## 1. Create a workspace in your workspace group\n", @@ -64,12 +63,12 @@ "
    \n", "\n", "## 2. Create a database named resume_evaluator" - ] + ], + "id": "dd26de8e" }, { "cell_type": "code", "execution_count": 1, - "id": "9be90062-ea39-406e-9eb2-1a830af981c6", "metadata": {}, "outputs": [], "source": [ @@ -77,12 +76,12 @@ "if not shared_tier_check or shared_tier_check[0][1] == 'OFF':\n", " %sql DROP DATABASE IF EXISTS resume_evaluator;\n", " %sql CREATE DATABASE resume_evaluator;" - ] + ], + "id": "c7deae6d" }, { "attachments": {}, "cell_type": "markdown", - "id": "a065c93b-6086-4cf9-9f14-249cb8b23ed9", "metadata": {}, "source": [ "
    \n", @@ -93,12 +92,12 @@ " It updates the connection_url which is used by the %%sql magic command and SQLAlchemy to make connections to the selected database.

    \n", "
    \n", "
    " - ] + ], + "id": "f023a713" }, { "cell_type": "code", "execution_count": 2, - "id": "49701a42-bfef-4072-aee9-7dc40dc2b8cb", "metadata": {}, "outputs": [], "source": [ @@ -114,12 +113,12 @@ " resume_summary text,\n", " resume_embeddings blob\n", ");" - ] + ], + "id": "b8bf5006" }, { "attachments": {}, "cell_type": "markdown", - "id": "0a944957-8ddb-4838-af3a-22a56e8f4ff9", "metadata": {}, "source": [ "## 3. Install and import required libraries\n", @@ -127,22 +126,22 @@ "In this section, we will set up the necessary environment by installing some crucial libraries. For our task of extracting text from resume PDFs, we'll be using pdfminer.six. To interact with the OpenAI's LLM and manage our data efficiently, openai will be instrumental.\n", "\n", "The install process may take a couple minutes." - ] + ], + "id": "b066170a" }, { "cell_type": "code", "execution_count": 3, - "id": "3d537041-7ae9-4b7f-aa27-0305a0425878", "metadata": {}, "outputs": [], "source": [ "!pip install -q pdfminer.six openai==1.3.3" - ] + ], + "id": "0a911696" }, { "cell_type": "code", "execution_count": 4, - "id": "5e0f1a4d-60b6-4c09-98ca-8c194f4b3461", "metadata": {}, "outputs": [], "source": [ @@ -157,35 +156,35 @@ "from pdfminer.high_level import extract_text\n", "from singlestoredb import create_engine\n", "from sqlalchemy import text" - ] + ], + "id": "a08db39b" }, { "cell_type": "code", "execution_count": 5, - "id": "46e57211-722b-4056-adf3-c802f50accba", "metadata": {}, "outputs": [], "source": [ "os.environ['OPENAI_API_KEY'] = getpass.getpass('OpenAI API Key: ')\n", "client = openai.OpenAI()" - ] + ], + "id": "03b1ae45" }, { "attachments": {}, "cell_type": "markdown", - "id": "60fb5de9-12a3-42b4-b9c9-738061600dc6", "metadata": {}, "source": [ "## 4. Create a function called `get_embedding()`\n", "In our workflow, we need a consistent way to transform textual content into vector embeddings. 
To achieve this, we introduce the get_embedding() function.\n", "\n", "This function takes in a piece of text and, by default, uses the \"text-embedding-ada-002\" model to produce embeddings. We ensure that any newline characters in the text are replaced with spaces to maintain the integrity of the input. The function then leverages OpenAI's API to generate and retrieve the embedding for the given text." - ] + ], + "id": "6cfdcf1f" }, { "cell_type": "code", "execution_count": 6, - "id": "07683cdd-988d-468b-a81e-4e99453faf90", "metadata": {}, "outputs": [], "source": [ @@ -193,12 +192,12 @@ " text = text.replace(\"\\n\", \" \")\n", " response = openai.embeddings.create(input=[text], model=model)\n", " return response.data[0].embedding" - ] + ], + "id": "e1d61bed" }, { "attachments": {}, "cell_type": "markdown", - "id": "ad6c68a5-23a0-4926-a6bc-1ac44731b73b", "metadata": {}, "source": [ "## 5. Create a function called `print_pdf_text()`\n", @@ -221,12 +220,12 @@ "\n", "#### Output:\n", "Returns the cleaned and formatted text from the PDF." - ] + ], + "id": "65ba6010" }, { "cell_type": "code", "execution_count": 7, - "id": "2373ad58-4057-4c07-9804-796d30595d49", "metadata": {}, "outputs": [], "source": [ @@ -257,12 +256,12 @@ " os.remove(temp_file_path)\n", "\n", " return cleaned_text" - ] + ], + "id": "c64e8265" }, { "attachments": {}, "cell_type": "markdown", - "id": "8cfe7778-a8d4-48d9-88a8-45a4233e6581", "metadata": {}, "source": [ "## 6. Create a function called `pinfo_extractor()`\n", @@ -284,12 +283,12 @@ "\n", "### Output:\n", "Returns a dictionary with keys such as 'name', 'email', 'phone_no', and more, containing extracted information from the resume." 
- ] + ], + "id": "3f50e333" }, { "cell_type": "code", "execution_count": 8, - "id": "9942951c-f2df-4507-856a-a79212a4998e", "metadata": {}, "outputs": [], "source": [ @@ -347,12 +346,12 @@ " }\n", " print(data_dict, \"\\n\")\n", " return data_dict;" - ] + ], + "id": "fdf815ca" }, { "attachments": {}, "cell_type": "markdown", - "id": "ec23888e-bf29-437b-a87f-794af8654294", "metadata": {}, "source": [ "## 7. Create a function called `add_data_to_db()`\n", @@ -374,12 +373,12 @@ "\n", "### Output:\n", "Prints a confirmation message upon successful data insertion." - ] + ], + "id": "13436642" }, { "cell_type": "code", "execution_count": 9, - "id": "f7bdd3e0-0579-4f99-a81f-491b5c7db733", "metadata": {}, "outputs": [], "source": [ @@ -401,12 +400,12 @@ " connection.execute(text(query_sql))\n", " connection.commit()\n", " print(\"\\nData Written to resumes_profile_data_2 table\")" - ] + ], + "id": "ecf9721c" }, { "attachments": {}, "cell_type": "markdown", - "id": "8abcb853-e58f-4ff8-ac95-f13d0d587557", "metadata": {}, "source": [ "## 8. Create a function called `search_resumes()`\n", @@ -426,12 +425,12 @@ "\n", "### Output:\n", "Returns a list of the top 5 most relevant resumes based on the given query." - ] + ], + "id": "7a0c032a" }, { "cell_type": "code", "execution_count": 10, - "id": "d3f7f52e-44db-4ecb-b3a7-7dd01488bad3", "metadata": {}, "outputs": [], "source": [ @@ -454,12 +453,12 @@ " connection.close()\n", " engine.dispose()\n", " return result" - ] + ], + "id": "b071d666" }, { "attachments": {}, "cell_type": "markdown", - "id": "c884ea95-f50f-4ec3-9cb0-88baaf07f2fa", "metadata": {}, "source": [ "## 8. Create a function called `evaluate_candidates()`\n", @@ -482,12 +481,12 @@ "Returns a list of tuples, where each tuple contains:\n", "- Candidate's name.\n", "- Evaluation response from the model, describing the compatibility of the candidate with the given job description." 
- ] + ], + "id": "590c1d7e" }, { "cell_type": "code", "execution_count": 11, - "id": "6b714782-8049-4378-945f-ee23a3c9dbec", "metadata": {}, "outputs": [], "source": [ @@ -517,12 +516,12 @@ " response_text = response.choices[0].message.content # response['choices'][0]['message']['content']\n", " responses.append((name, response_text)) # Append the name and response_text to the responses list\n", " return responses" - ] + ], + "id": "0ffb2c38" }, { "cell_type": "code", "execution_count": 12, - "id": "c2d4c146-5b15-43b2-8a48-3d4f114a4a2c", "metadata": {}, "outputs": [], "source": [ @@ -537,44 +536,44 @@ " print(\"Information extracted\\n\")\n", " add_data_to_db(ip_data_dict)\n", " print(\"\\n\")" - ] + ], + "id": "4a5d926d" }, { "cell_type": "code", "execution_count": 13, - "id": "68185f51-8a2b-44bc-8a82-a8ab2f86c14b", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "\n", "SELECT * FROM resumes_profile_data;" - ] + ], + "id": "7fd89294" }, { "cell_type": "code", "execution_count": 14, - "id": "2916185e-66ef-4600-9c67-9404fa71d053", "metadata": {}, "outputs": [], "source": [ "job_description = input(\"Enter Job description : \\n\")\n", "evaluate_candidates(job_description)" - ] + ], + "id": "dc6e863a" }, { "attachments": {}, "cell_type": "markdown", - "id": "e5a56e51-8bbe-4dd3-9233-6a1cccc8465d", "metadata": {}, "source": [ "## Cleanup" - ] + ], + "id": "8ff0fcbc" }, { "attachments": {}, "cell_type": "markdown", - "id": "b697cc4c", "metadata": {}, "source": [ "
    \n", @@ -584,23 +583,24 @@ "

    If you created a new database in your Standard or Premium Workspace, you can drop the database by running the cell below. Note: this will not drop your database for Free Starter Workspaces. To drop a Free Starter Workspace, terminate the Workspace using the UI.

    \n", "
    \n", "" - ] + ], + "id": "ddc98af3" }, { "cell_type": "code", "execution_count": 15, - "id": "c7aaef3c-5b4b-4eff-a561-e0182ec71d1b", "metadata": {}, "outputs": [], "source": [ "shared_tier_check = %sql show variables like 'is_shared_tier'\n", "if not shared_tier_check or shared_tier_check[0][1] == 'OFF':\n", " %sql DROP DATABASE IF EXISTS resume_evaluator;" - ] + ], + "id": "7e473c11" }, { + "id": "522b95f0", "cell_type": "markdown", - "id": "4e391976-78ce-4146-8346-c0bba61953b6", "metadata": {}, "source": [ "
    \n", diff --git a/notebooks/searching-all-of-wikipedia/notebook.ipynb b/notebooks/searching-all-of-wikipedia/notebook.ipynb index 93e2daaa..a88be565 100644 --- a/notebooks/searching-all-of-wikipedia/notebook.ipynb +++ b/notebooks/searching-all-of-wikipedia/notebook.ipynb @@ -1,8 +1,8 @@ { "cells": [ { + "id": "7aac61ac", "cell_type": "markdown", - "id": "40094f2d", "metadata": {}, "source": [ "
    \n", @@ -18,7 +18,6 @@ }, { "cell_type": "markdown", - "id": "67c42f45", "metadata": {}, "source": [ "
    \n", @@ -28,12 +27,12 @@ "

    This tutorial is meant for Standard & Premium Workspaces. You can't run this with a Free Starter Workspace due to restrictions on Storage. Create a Workspace using +group in the left nav & select Standard for this notebook. Gallery notebooks tagged with \"Starter\" are suitable to run on a Free Starter Workspace.

    \n", "
    \n", "
    " - ] + ], + "id": "31deb491" }, { "attachments": {}, "cell_type": "markdown", - "id": "3125c1a7-bb6f-406c-8d13-b67bcd387ceb", "metadata": {}, "source": [ "In this notebook, we've embarked on the task to implement semantic search and Retrieval-Augmented Generation (RAG) across Wikipedia's extensive database, using SingleStore's indexed ANN search capabilities.\n", @@ -41,11 +40,11 @@ "We have focused on a subset of Wikipedia, scraping 1,800 video game articles to obtain real OpenAI embeddings for about 40,000 paragraphs, supplemented by 10 million mock vectors for a scaled-up simulation. We then stored these vectors in a SingleStore database, applying different indexing options to enhance search efficiency. We show remarkable improvements in query response times, dropping to sub 100 milliseconds with indexing.\n", "\n", "We also integrated this system with a RAG-based chat, where you can ask and retrieve contextual answers based on the video game information in the database. Additionally, I've shared the technical details and resources, including our Python code and data links, in [this GitHub repository](https://github.com/rohitbhamidi/singlestore-indexed-ann) and an AWS S3 bucket (S3 URI: s3://wikipedia-video-game-data/video-game-embeddings(1).csv)." - ] + ], + "id": "b80ec63d" }, { "cell_type": "markdown", - "id": "f3c8cab4", "metadata": {}, "source": [ "## Part 0: Creating your workspace and database\n", @@ -56,12 +55,12 @@ "- 20 million vectors can be handled by an S-4 Workspace\n", "\n", "You can now extrapolate the workspace size you require from the number of vectors you want to generate!" 
- ] + ], + "id": "e30504b2" }, { "cell_type": "code", "execution_count": 1, - "id": "92d20572-7039-4a49-90d6-5a2ce0604c53", "metadata": {}, "outputs": [], "source": [ @@ -80,12 +79,12 @@ " key(id) using hash,\n", " fulltext (paragraph)\n", ");" - ] + ], + "id": "59d7247b" }, { "attachments": {}, "cell_type": "markdown", - "id": "8545719a-a39f-4be8-a11a-cfa57a3f434f", "metadata": {}, "source": [ "## Part 1: Generating the Mock Vectors\n", @@ -100,12 +99,12 @@ "- `nrandv1536`: generates a normalized, random vector of dimension 1536\n", "\n", "Finally, we populate `vecs` with 10,000,000 rows of mock vectors." - ] + ], + "id": "0a0390ac" }, { "cell_type": "code", "execution_count": 2, - "id": "64cdabb5-9c5c-406e-8a10-e9d133e42065", "metadata": {}, "outputs": [], "source": [ @@ -116,12 +115,12 @@ "begin\n", " return (rand()*(b - a) + a);\n", "end ;" - ] + ], + "id": "1258b171" }, { "cell_type": "code", "execution_count": 3, - "id": "14df424b-4626-4a1b-8257-735a8b76d3ff", "metadata": {}, "outputs": [], "source": [ @@ -143,12 +142,12 @@ " s = s || randbetween(-1,1) || \"]\";\n", " return s;\n", "end;" - ] + ], + "id": "c098908b" }, { "cell_type": "code", "execution_count": 4, - "id": "5b5a9f31-10ff-4bf2-978b-939ff4f2a36b", "metadata": {}, "outputs": [], "source": [ @@ -161,12 +160,12 @@ "begin\n", " return scalar_vector_mul(1/length, v);\n", "end;" - ] + ], + "id": "0b707241" }, { "cell_type": "code", "execution_count": 5, - "id": "bfc912be-ae11-4340-8499-846b1d0ec8e2", "metadata": {}, "outputs": [], "source": [ @@ -176,12 +175,12 @@ "begin\n", " return normalize(v) :> vector(1536);\n", "end;" - ] + ], + "id": "614f85d0" }, { "cell_type": "code", "execution_count": 6, - "id": "85aa12e0-b021-4257-b2ad-599923bed7e6", "metadata": {}, "outputs": [], "source": [ @@ -191,12 +190,12 @@ "begin\n", " return norm1536(gen_vector(1536));\n", "end;" - ] + ], + "id": "f5e7a62d" }, { "cell_type": "code", "execution_count": 7, - "id": "9341099e-853d-48b5-b54a-3ddc59d353d4", "metadata": 
{}, "outputs": [], "source": [ @@ -217,12 +216,12 @@ " end if;\n", " end loop;\n", "end;" - ] + ], + "id": "90571219" }, { "cell_type": "code", "execution_count": 8, - "id": "c15e4565-85d6-4d59-b8ac-c27d19663df3", "metadata": {}, "outputs": [], "source": [ @@ -231,31 +230,31 @@ "-- this will take around 20 min\n", "insert into vecs (id, v) values (1, nrandv1536());\n", "call insert_vectors(10000000);" - ] + ], + "id": "81b39e22" }, { "cell_type": "markdown", - "id": "a29162bb", "metadata": {}, "source": [ "As a quick aside, if you want to generate the full 160 million vectors, you simply have to change the `num_rows` to 160,000,000: `call insert_vectors(160000000);`" - ] + ], + "id": "0f2f6417" }, { "attachments": {}, "cell_type": "markdown", - "id": "5cff5e2a-ca94-437a-a0ae-c68a4cabfaa1", "metadata": {}, "source": [ "## Part 2: Getting the Wikipedia video game data\n", "\n", "We will use a SingleStore pipeline named `wiki_pipeline` to import data from an S3 bucket into `vecs`. The pipeline is configured to load data from a CSV file located at `s3://wikipedia-video-game-data/video-game-embeddings(1).csv`. Since the S3 bucket is open, the credentials section is left empty." 
- ] + ], + "id": "adce04a3" }, { "cell_type": "code", "execution_count": 9, - "id": "0d6a4c99-72bf-4fc9-b726-a5ecaa886864", "metadata": {}, "outputs": [], "source": [ @@ -272,24 +271,24 @@ "fields terminated by ','\n", "enclosed by '\"'\n", "lines terminated by '\\r\\n';" - ] + ], + "id": "02e85cce" }, { "cell_type": "code", "execution_count": 10, - "id": "0320bcf6-f547-43f4-a31c-e1c01e680777", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "-- start the pipeline!\n", "start pipeline `wiki_pipeline`;" - ] + ], + "id": "46b2136c" }, { "cell_type": "code", "execution_count": 11, - "id": "1417483f-9df4-4057-b88a-8478d336e4fb", "metadata": {}, "outputs": [], "source": [ @@ -298,56 +297,56 @@ "select DATABASE_NAME, PIPELINE_NAME, BATCH_ID, BATCH_STATE, START_TIME, ROWS_STREAMED, ROWS_PER_SEC\n", "from information_schema.PIPELINES_BATCHES_SUMMARY\n", "order by BATCH_ID;" - ] + ], + "id": "4f9b0875" }, { "attachments": {}, "cell_type": "markdown", - "id": "ab1f7406-01b3-4cdb-94a1-c00f2787aaba", "metadata": {}, "source": [ "## Part 3: Building the vector indexes\n", "\n", "Now, we have all the data in our table `vecs`. Let's go ahead and build our vector index. SingleStore gives us many options for our index with many tunable parameters. We will stick with the IVF indexes with default parameters." 
- ] + ], + "id": "636ec475" }, { "cell_type": "code", "execution_count": 12, - "id": "3617142a-0fc2-406c-9b7f-4e8853a19840", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "alter table vecs add vector index auto (v) INDEX_OPTIONS '{\"index_type\":\"AUTO\"}';" - ] + ], + "id": "40f3072a" }, { "cell_type": "code", "execution_count": 13, - "id": "ddac3841-2726-4834-9db8-9eab0b7d9ea2", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "alter table vecs add vector index ivf_flat (v) INDEX_OPTIONS '{\"index_type\":\"IVF_FLAT\"}';" - ] + ], + "id": "3a626544" }, { "cell_type": "code", "execution_count": 14, - "id": "55de3a82-4e6f-4e8b-9fbd-8f5370fd4145", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "alter table vecs add vector index ivf_pq (v) INDEX_OPTIONS '{\"index_type\":\"IVF_PQ\"}';" - ] + ], + "id": "bc8a3b33" }, { "attachments": {}, "cell_type": "markdown", - "id": "fe9ba00d-8979-4bc1-994f-cf9e77cd835b", "metadata": {}, "source": [ "## Part 4: Testing our indexes\n", @@ -357,12 +356,12 @@ "We have chosen a test vector whose paragraph is about Nintendo's Rad Racer video game. We will compare the performance of an exact K-nearest neighbor search to the searches with our ANN indexes.\n", "\n", "As we will see, we get an order of magnitude improvement when using an index in comparison to the exact KNN search!" 
- ] + ], + "id": "5ade8ae2" }, { "cell_type": "code", "execution_count": 15, - "id": "146b962e-9ad0-454b-ab8e-ba79115ed4f9", "metadata": {}, "outputs": [], "source": [ @@ -374,12 +373,12 @@ "from vecs\n", "order by sim use index () desc\n", "limit 5;" - ] + ], + "id": "9f40900b" }, { "cell_type": "code", "execution_count": 16, - "id": "6e349451-2a44-4d1b-84f8-7491dcbf247f", "metadata": {}, "outputs": [], "source": [ @@ -391,12 +390,12 @@ "from vecs\n", "order by sim use index (auto) desc\n", "limit 5;" - ] + ], + "id": "51f21307" }, { "cell_type": "code", "execution_count": 17, - "id": "8534f0ac-5b1f-4543-b087-ba70a35b8121", "metadata": {}, "outputs": [], "source": [ @@ -408,12 +407,12 @@ "from vecs\n", "order by sim use index (ivf_flat) desc\n", "limit 5;" - ] + ], + "id": "9e9a8fa9" }, { "cell_type": "code", "execution_count": 18, - "id": "40ea859c-c57a-4624-965f-94a18cd05c90", "metadata": {}, "outputs": [], "source": [ @@ -425,23 +424,23 @@ "from vecs\n", "order by sim use index (ivf_pq) desc\n", "limit 5;" - ] + ], + "id": "d46484c5" }, { "attachments": {}, "cell_type": "markdown", - "id": "830e749f-2dd1-481d-8e94-5e6be9271f63", "metadata": {}, "source": [ "## Part 5: Hybrid Search in SingleStore\n", "\n", "Let us now see how we can implement a \"hybrid search\" in SingleStore! This is going to be a query that combines two powerful tools: a fulltext search and a semantic search!" 
- ] + ], + "id": "4b1113d0" }, { "cell_type": "code", "execution_count": 19, - "id": "374ef252-165b-4b65-bb3b-b352740ac6f2", "metadata": {}, "outputs": [], "source": [ @@ -476,12 +475,12 @@ " on fts.id = vs.id\n", "order by hybrid_score desc\n", "limit 5;" - ] + ], + "id": "5a402fee" }, { "attachments": {}, "cell_type": "markdown", - "id": "09c2f7d3-7d79-4aae-ad10-09815785deb5", "metadata": {}, "source": [ "## Part 6: Chatting with the Video Game data!\n", @@ -495,22 +494,22 @@ "- `ask_wiki_page` Function:\n", " - Utilizes results from `search_wiki_page` for chatbot input.\n", " - Generates a query for an OpenAI GPT model-based chatbot." - ] + ], + "id": "c6f7ede2" }, { "cell_type": "code", "execution_count": 20, - "id": "708a2b4e-ca3e-474d-bd0d-86873d4817e0", "metadata": {}, "outputs": [], "source": [ "!pip3 install openai --quiet" - ] + ], + "id": "27551786" }, { "cell_type": "code", "execution_count": 21, - "id": "48f8d45a-134e-466b-8a0e-187baca6c12f", "metadata": {}, "outputs": [], "source": [ @@ -520,12 +519,12 @@ "import os\n", "import time\n", "import json" - ] + ], + "id": "de331a08" }, { "cell_type": "code", "execution_count": 22, - "id": "3f82d490-3725-44a2-8420-3869d66f2812", "metadata": {}, "outputs": [], "source": [ @@ -538,12 +537,12 @@ "# SingleStore connection\n", "engine = sa.create_engine(connection_url)\n", "connection = engine.connect()" - ] + ], + "id": "36038718" }, { "cell_type": "code", "execution_count": 23, - "id": "690e4a78-9d33-43e9-a850-c5d0c809bba9", "metadata": {}, "outputs": [], "source": [ @@ -552,12 +551,12 @@ " if isinstance(text, str):\n", " response = client.embeddings.create(input=[text], model=model)\n", " return json.dumps(response.data[0].embedding)" - ] + ], + "id": "eb5b430d" }, { "cell_type": "code", "execution_count": 24, - "id": "321398d6-ac8a-4df5-bf20-a7983adc7639", "metadata": {}, "outputs": [], "source": [ @@ -578,12 +577,12 @@ " print(f\"Search complete in {execution_time} seconds.\")\n", " results_as_dict = 
results.fetchall()\n", " return results_as_dict" - ] + ], + "id": "abcbb35b" }, { "cell_type": "code", "execution_count": 25, - "id": "20048761-c10c-43d9-b7ac-1a8b0b40af37", "metadata": {}, "outputs": [], "source": [ @@ -610,22 +609,23 @@ " )\n", " response_message = response.choices[0].message.content\n", " return response_message" - ] + ], + "id": "852878d7" }, { "cell_type": "code", "execution_count": 26, - "id": "36861579-f184-4ee5-9f69-761b2f4dd0d8", "metadata": {}, "outputs": [], "source": [ "query = input('Ask me a question about video games!')\n", "ask_wiki_page(query)" - ] + ], + "id": "ac0fc0f3" }, { + "id": "d53fccb8", "cell_type": "markdown", - "id": "4fce3001-ebf4-4bd2-8d67-e13e42deeb9b", "metadata": {}, "source": [ "
    \n", diff --git a/notebooks/semantic-search-with-hugging-face/notebook.ipynb b/notebooks/semantic-search-with-hugging-face/notebook.ipynb index 2df1d98a..1fab6675 100644 --- a/notebooks/semantic-search-with-hugging-face/notebook.ipynb +++ b/notebooks/semantic-search-with-hugging-face/notebook.ipynb @@ -1,8 +1,8 @@ { "cells": [ { + "id": "0dcd1169", "cell_type": "markdown", - "id": "8e19358e-22e8-406c-ae17-d916db889313", "metadata": {}, "source": [ "
    \n", @@ -17,8 +17,8 @@ ] }, { + "id": "8eaf13ef", "cell_type": "markdown", - "id": "60eb6690", "metadata": {}, "source": [ "
    \n", @@ -32,7 +32,6 @@ }, { "cell_type": "markdown", - "id": "9bebf253-7913-4d7a-8ebc-f10463803baa", "metadata": {}, "source": [ "In this notebook, we will demonstrate an example of conducting semantic search on SingleStoreDB with SQL! Unlike traditional keyword-based search methods, semantic search algorithms take into account the relationships between words and their meanings, enabling them to deliver more accurate and relevant results \u2013 even when search terms are vague or ambiguous.\n", @@ -40,11 +39,11 @@ "SingleStoreDB\u2019s built-in parallelization and Intel SIMD-based vector processing takes care of the heavy lifting involved in processing vector data. This allows your to run your ML algorithms right in your database extremely efficiently with just 1 line of SQL!\n", "\n", "In this example, we use Hugging Face to create embeddings for our dataset and run semantic_search using dot_product vector matching function!" - ] + ], + "id": "8930a289" }, { "cell_type": "markdown", - "id": "358d1eb0-a0dd-423d-86ea-0d131abe4169", "metadata": {}, "source": [ "## 1. Create a workspace in your workspace group\n", @@ -60,12 +59,12 @@ "
    \n", "\n", "## 2. Create a database named `semantic_search`" - ] + ], + "id": "377e3866" }, { "cell_type": "code", "execution_count": 1, - "id": "af5e02fb-e15b-4c85-ac69-a40dd974cd88", "metadata": {}, "outputs": [], "source": [ @@ -73,11 +72,11 @@ "if not shared_tier_check or shared_tier_check[0][1] == 'OFF':\n", " %sql DROP DATABASE IF EXISTS semantic_search;\n", " %sql CREATE DATABASE semantic_search;" - ] + ], + "id": "989af2dd" }, { "cell_type": "markdown", - "id": "284f2bdc-a428-4a55-9f1f-fce623914b34", "metadata": {}, "source": [ "
    \n", @@ -88,11 +87,11 @@ " It updates the connection_url which is used by the %%sql magic command and SQLAlchemy to make connections to the selected database.

    \n", "
    \n", "
    " - ] + ], + "id": "4f648058" }, { "cell_type": "markdown", - "id": "8124ab1c-7f17-47bc-9f8a-c7bd5a33a426", "metadata": {}, "source": [ "## 3. Install and import required libraries\n", @@ -100,12 +99,12 @@ "We will use an embedding model on Hugging Face with Sentence Transfomers library. We will be analysing the sentiment of reviewers of selected movies. This dataset is available on Hugging Face and to use it, we will need the datasets library.\n", "\n", "The install process may take a couple minutes." - ] + ], + "id": "f39d4c0d" }, { "cell_type": "code", "execution_count": 2, - "id": "af6146b2-a044-4dd8-b020-e3d8c1f91aba", "metadata": {}, "outputs": [], "source": [ @@ -121,22 +120,22 @@ "from datasets import load_dataset\n", "from transformers import AutoTokenizer\n", "from transformers import AutoModel" - ] + ], + "id": "7c20b03d" }, { "cell_type": "markdown", - "id": "f80d23bc-7e98-4ac8-b2a0-7a737e4010e5", "metadata": {}, "source": [ "## 4. Load Sentence Transformer library and create a function called `get_embedding()`\n", "\n", "To vectorize and embed the reviews that watchers of the movies left, we will be using the `sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2` model. We will create a function called `get_embedding()` that will call this model and return the vectorized version of the sentence." - ] + ], + "id": "1304a356" }, { "cell_type": "code", "execution_count": 3, - "id": "a463c0fd-c747-4605-a728-c22a2fa17cfb", "metadata": {}, "outputs": [], "source": [ @@ -145,20 +144,20 @@ "\n", "model = AutoModel.from_pretrained(model_name)\n", "tokenizer = AutoTokenizer.from_pretrained(model_name)" - ] + ], + "id": "c67a6fb3" }, { "cell_type": "markdown", - "id": "bac82174-a2a3-40aa-b843-235a7547cace", "metadata": {}, "source": [ "Add a function to compute the embedding. The result will be a numpy array of 32-bit floats." 
- ] + ], + "id": "faaa17e1" }, { "cell_type": "code", "execution_count": 4, - "id": "f2e31300-1e6a-425c-bcf7-3708ce9e40d0", "metadata": {}, "outputs": [], "source": [ @@ -168,22 +167,22 @@ " with torch.no_grad():\n", " embedding = model(**inputs).last_hidden_state.mean(dim=1).squeeze().tolist()\n", " return np.array(embedding, dtype='\n", @@ -303,23 +302,24 @@ "

    If you created a new database in your Standard or Premium Workspace, you can drop the database by running the cell below. Note: this will not drop your database for Free Starter Workspaces. To drop a Free Starter Workspace, terminate the Workspace using the UI.

    \n", " \n", "" - ] + ], + "id": "fc8b28e8" }, { "cell_type": "code", "execution_count": 10, - "id": "0e91592f-4856-4cab-b15e-23585f551ab3", "metadata": {}, "outputs": [], "source": [ "shared_tier_check = %sql show variables like 'is_shared_tier'\n", "if not shared_tier_check or shared_tier_check[0][1] == 'OFF':\n", " %sql DROP DATABASE IF EXISTS semantic_search;" - ] + ], + "id": "10aae5a1" }, { + "id": "60d17a89", "cell_type": "markdown", - "id": "a6829f66-b37e-493d-9631-6da519140485", "metadata": {}, "source": [ "
    \n", diff --git a/notebooks/semantic-search-with-openai-embedding-creation/notebook.ipynb b/notebooks/semantic-search-with-openai-embedding-creation/notebook.ipynb index 0fd6c313..28a6d830 100644 --- a/notebooks/semantic-search-with-openai-embedding-creation/notebook.ipynb +++ b/notebooks/semantic-search-with-openai-embedding-creation/notebook.ipynb @@ -1,8 +1,8 @@ { "cells": [ { + "id": "abecce4f", "cell_type": "markdown", - "id": "8e19358e-22e8-406c-ae17-d916db889313", "metadata": {}, "source": [ "
    \n", @@ -17,8 +17,8 @@ ] }, { + "id": "c68352ae", "cell_type": "markdown", - "id": "78c82354", "metadata": {}, "source": [ "
    \n", @@ -32,7 +32,6 @@ }, { "cell_type": "markdown", - "id": "9bebf253-7913-4d7a-8ebc-f10463803baa", "metadata": {}, "source": [ "In this notebook, we will demonstrate an example of conducting semantic search on SingleStoreDB with SQL! Unlike traditional keyword-based search methods, semantic search algorithms take into account the relationships between words and their meanings, enabling them to deliver more accurate and relevant results \u2013 even when search terms are vague or ambiguous.\n", @@ -41,11 +40,11 @@ "\n", "\n", "In this example, we use Open AI embeddings API to create embeddings for our dataset and run semantic_search using dot_product vector matching function!" - ] + ], + "id": "23cba3ba" }, { "cell_type": "markdown", - "id": "358d1eb0-a0dd-423d-86ea-0d131abe4169", "metadata": {}, "source": [ "## 1. Create a workspace in your workspace group\n", @@ -61,12 +60,12 @@ "
    \n", "\n", "## 2. Create a Database named `semantic_search`" - ] + ], + "id": "2e30cf79" }, { "cell_type": "code", "execution_count": 1, - "id": "af5e02fb-e15b-4c85-ac69-a40dd974cd88", "metadata": {}, "outputs": [], "source": [ @@ -74,11 +73,11 @@ "if not shared_tier_check or shared_tier_check[0][1] == 'OFF':\n", " %sql DROP DATABASE IF EXISTS semantic_search;\n", " %sql CREATE DATABASE semantic_search;" - ] + ], + "id": "b9104351" }, { "cell_type": "markdown", - "id": "284f2bdc-a428-4a55-9f1f-fce623914b34", "metadata": {}, "source": [ "
    \n", @@ -89,22 +88,22 @@ " It updates the connection_url which is used by the %%sql magic command and SQLAlchemy to make connections to the selected database.

    \n", "
    \n", "
    " - ] + ], + "id": "c636f590" }, { "cell_type": "markdown", - "id": "8124ab1c-7f17-47bc-9f8a-c7bd5a33a426", "metadata": {}, "source": [ "## 3. Install and import required libraries\n", "\n", "We will use the OpenAI embeddings API and will need to import the relevant dependencies accordingly." - ] + ], + "id": "7c480439" }, { "cell_type": "code", "execution_count": 2, - "id": "af6146b2-a044-4dd8-b020-e3d8c1f91aba", "metadata": {}, "outputs": [], "source": [ @@ -115,21 +114,21 @@ "\n", "from openai import OpenAI\n", "import requests" - ] + ], + "id": "48c7b3d1" }, { "cell_type": "markdown", - "id": "f80d23bc-7e98-4ac8-b2a0-7a737e4010e5", "metadata": {}, "source": [ "## 4. Create an OpenAI account and get API connection details\n", "\n", "To vectorize and embed the employee reviews and query strings, we leverage OpenAI's embeddings API. To use this API, you will need an API key, which you can get [here](https://platform.openai.com/account/api-keys). You'll need to add a payment method to actually get vector embeddings using the API, though the charges are minimal for a small example like we present here." - ] + ], + "id": "6d2c11f1" }, { "cell_type": "markdown", - "id": "22de8d4c-6b79-4496-8812-37e0b82e980b", "metadata": {}, "source": [ "
    \n", @@ -139,12 +138,12 @@ "

    You will have to update your notebook's firewall settings to include *.*.openai.com in order to get embedddings from OpenAI APIS.

    \n", "
    \n", "" - ] + ], + "id": "cf0225f1" }, { "cell_type": "code", "execution_count": 3, - "id": "a463c0fd-c747-4605-a728-c22a2fa17cfb", "metadata": {}, "outputs": [], "source": [ @@ -153,20 +152,20 @@ "os.environ[\"OPENAI_API_KEY\"] = getpass.getpass('OpenAI API Key: ')\n", "\n", "client = OpenAI()" - ] + ], + "id": "df37a380" }, { "cell_type": "markdown", - "id": "17fb3aad-e3a8-4a2a-985c-64f0c94431b8", "metadata": {}, "source": [ "## 5. Create a new table in your database called reviews" - ] + ], + "id": "89c30e7c" }, { "cell_type": "code", "execution_count": 4, - "id": "e3af3810-0ce5-432b-a879-4eaa16524d38", "metadata": {}, "outputs": [], "source": [ @@ -178,62 +177,62 @@ " location VARCHAR(255),\n", " review TEXT\n", ");" - ] + ], + "id": "0f09871b" }, { "cell_type": "markdown", - "id": "db124797-a11c-4a97-9f58-b337c50014e3", "metadata": {}, "source": [ "## 6. Import our sample data into your table\n", "\n", "This dataset has 15 reviews left by anonymous employees of a firm." - ] + ], + "id": "33a35d4a" }, { "cell_type": "code", "execution_count": 5, - "id": "bce5a7cb-ad4f-4293-8bc3-9d09f76ae5e8", "metadata": {}, "outputs": [], "source": [ "url = 'https://raw.githubusercontent.com/singlestore-labs/singlestoredb-samples/main/Tutorials/ai-powered-semantic-search/hr_sample_data.sql'" - ] + ], + "id": "4b73ee99" }, { "cell_type": "markdown", - "id": "7ddec245-7c79-40ea-85b2-7a88e25e5321", "metadata": {}, "source": [ "Note that we are using the `%sql` magic command here to run a query against the currently\n", "selected database." - ] + ], + "id": "dec13942" }, { "cell_type": "code", "execution_count": 6, - "id": "227c2fcf-2dc8-4ed2-92f1-5a28667bf3d3", "metadata": {}, "outputs": [], "source": [ "for query in [x for x in requests.get(url).text.split('\\n') if x.strip()]:\n", " %sql {{query}}" - ] + ], + "id": "0ab0a16c" }, { "cell_type": "markdown", - "id": "8188ccb2-d5cf-48b5-8c9f-8b3858c18ae7", "metadata": {}, "source": [ "## 7. 
Add vector embeddings for each review\n", "\n", "To embed the reviews in our SingleStoreDB database, we iterate through each row in the table, make a call to OpenAI\u2019s embeddings API with the text in the reviews field and update the new column called embeddings for each entry." - ] + ], + "id": "db70fb54" }, { "cell_type": "code", "execution_count": 7, - "id": "419a690a-810c-4c80-b7ea-fd25cf1d5e80", "metadata": {}, "outputs": [], "source": [ @@ -252,22 +251,22 @@ "\n", "for embedding, review in zip(embeddings, reviews):\n", " %sql UPDATE reviews SET embeddings = JSON_ARRAY_PACK('{{json.dumps(embedding)}}') WHERE review='{{review}}';" - ] + ], + "id": "5be45303" }, { "cell_type": "markdown", - "id": "e34e62fb-7690-4a31-a874-ff7856d16cc7", "metadata": {}, "source": [ "## 8. Run the semantic search algorithm with just one line of SQL\n", "\n", "We will utilize SingleStoreDB's distributed architecture to efficiently compute the dot product of the input string (stored in searchstring) with each entry in the database and return the top 5 reviews with the highest dot product score. Each vector is normalized to length 1, hence the dot product function essentially computes the cosine similarity between two vectors \u2013 an appropriate nearness metric. SingleStoreDB makes this extremely fast because it compiles queries to machine code and runs dot_product using SIMD instructions." - ] + ], + "id": "6b72d93d" }, { "cell_type": "code", "execution_count": 8, - "id": "08bd6b1c-9731-4062-9b9a-a5e1a1d8efa3", "metadata": {}, "outputs": [], "source": [ @@ -280,19 +279,19 @@ "print()\n", "for i, res in enumerate(results):\n", " print(f'{i + 1}: {res.review} Score: {res.score}\\n')" - ] + ], + "id": "b03cc3e1" }, { "cell_type": "markdown", - "id": "0383939d-7fd3-434d-a27b-952eeed40e5f", "metadata": {}, "source": [ "## 9. Clean up" - ] + ], + "id": "e088f082" }, { "cell_type": "markdown", - "id": "40181201", "metadata": {}, "source": [ "
    \n", @@ -302,23 +301,24 @@ "

    If you created a new database in your Standard or Premium Workspace, you can drop the database by running the cell below. Note: this will not drop your database for Free Starter Workspaces. To drop a Free Starter Workspace, terminate the Workspace using the UI.

    \n", "
    \n", "" - ] + ], + "id": "52dac21e" }, { "cell_type": "code", "execution_count": 9, - "id": "0e91592f-4856-4cab-b15e-23585f551ab3", "metadata": {}, "outputs": [], "source": [ "shared_tier_check = %sql show variables like 'is_shared_tier'\n", "if not shared_tier_check or shared_tier_check[0][1] == 'OFF':\n", " %sql DROP DATABASE IF EXISTS semantic_search;" - ] + ], + "id": "bc062e9c" }, { + "id": "d4a768aa", "cell_type": "markdown", - "id": "a6829f66-b37e-493d-9631-6da519140485", "metadata": {}, "source": [ "
    \n", diff --git a/notebooks/semantic-search-with-openai-qa/notebook.ipynb b/notebooks/semantic-search-with-openai-qa/notebook.ipynb index 5b11fe65..e152328d 100644 --- a/notebooks/semantic-search-with-openai-qa/notebook.ipynb +++ b/notebooks/semantic-search-with-openai-qa/notebook.ipynb @@ -1,8 +1,8 @@ { "cells": [ { + "id": "67480459", "cell_type": "markdown", - "id": "b05a4dfa-1763-431d-9027-75c783712e85", "metadata": {}, "source": [ "
    \n", @@ -17,8 +17,8 @@ ] }, { + "id": "09632a57", "cell_type": "markdown", - "id": "7002feac", "metadata": {}, "source": [ "
    \n", @@ -33,47 +33,46 @@ { "attachments": {}, "cell_type": "markdown", - "id": "f801cd94-180c-4dea-b85c-67659aad0ea6", "metadata": {}, "source": [ "In this Notebook you will use a combination of Semantic Search and a Large Langauge Model (LLM) to build a basic Retrieval Augmented Generation (RAG) application. For a great introduction into what RAG is, please read [A Beginner's Guide to Retrieval Augmented Generation (RAG)](https://www.singlestore.com/blog/a-guide-to-retrieval-augmented-generation-rag/).\n", "## Prerequisites for interacting with ChatGPT" - ] + ], + "id": "5887cda1" }, { "attachments": {}, "cell_type": "markdown", - "id": "df04d713-f330-4335-9837-3ab79eb552d6", "metadata": {}, "source": [ "### Install OpenAI package\n", "\n", "Let's start by installing the [openai](https://platform.openai.com/docs/api-reference?lang=python) Python package." - ] + ], + "id": "d55d2aa8" }, { "cell_type": "code", "execution_count": 1, - "id": "7690dc5e-93f4-477d-87b2-db35da95fb65", "metadata": {}, "outputs": [], "source": [ "!pip install openai==1.3.3 --quiet" - ] + ], + "id": "297675eb" }, { "attachments": {}, "cell_type": "markdown", - "id": "62bd45fa-daac-4d71-ab76-be014ddd3a32", "metadata": {}, "source": [ "### Connect to ChatGPT and display the response" - ] + ], + "id": "37988f8c" }, { "cell_type": "code", "execution_count": 2, - "id": "9c4378f4-02d4-4c19-a512-f3eed0a9cb88", "metadata": {}, "outputs": [], "source": [ @@ -81,21 +80,21 @@ "\n", "EMBEDDING_MODEL = \"text-embedding-ada-002\"\n", "GPT_MODEL = \"gpt-3.5-turbo\"" - ] + ], + "id": "60318fb1" }, { "attachments": {}, "cell_type": "markdown", - "id": "c244aa25-f548-47b2-8942-991552dc0ca1", "metadata": {}, "source": [ "You will need an OpenAI API key in order to use the the `openai` Python library." 
- ] + ], + "id": "44c80931" }, { "cell_type": "code", "execution_count": 3, - "id": "c2114aea-e137-407e-8e05-bf5688594a98", "metadata": {}, "outputs": [], "source": [ @@ -105,21 +104,21 @@ "os.environ['OPENAI_API_KEY'] = getpass.getpass('OpenAI API Key: ')\n", "\n", "client = openai.OpenAI()" - ] + ], + "id": "603fbac4" }, { "attachments": {}, "cell_type": "markdown", - "id": "0663c6f2-7741-4966-aea8-d5629e4a1cd4", "metadata": {}, "source": [ "Test the connection." - ] + ], + "id": "a29c90d9" }, { "cell_type": "code", "execution_count": 4, - "id": "244f1d45-db89-489e-aad0-153a652d59f6", "metadata": {}, "outputs": [], "source": [ @@ -132,40 +131,40 @@ ")\n", "\n", "print(response.choices[0].message.content)" - ] + ], + "id": "23d11168" }, { "attachments": {}, "cell_type": "markdown", - "id": "d287b813-2885-4b22-a431-03c6b4eab058", "metadata": {}, "source": [ "# Get the data about Winter Olympics and provide the information to ChatGPT as context" - ] + ], + "id": "2363818e" }, { "attachments": {}, "cell_type": "markdown", - "id": "682326b6-a475-4d79-828d-951780a6fb96", "metadata": {}, "source": [ "## 1. Install and import libraries" - ] + ], + "id": "98ab17e2" }, { "cell_type": "code", "execution_count": 5, - "id": "b5ef7c8a-7ab0-473d-b944-8cdbfe4918d8", "metadata": {}, "outputs": [], "source": [ "!pip install tabulate tiktoken wget --quiet" - ] + ], + "id": "9a18969c" }, { "cell_type": "code", "execution_count": 6, - "id": "a01f2552-cde3-49a4-8205-72b8fa8260c1", "metadata": {}, "outputs": [], "source": [ @@ -174,30 +173,30 @@ "import os\n", "import pandas as pd\n", "import wget" - ] + ], + "id": "a4a0ba11" }, { "attachments": {}, "cell_type": "markdown", - "id": "5f7aee40-4774-4ef1-b700-a83f9fed4fbb", "metadata": {}, "source": [ "## 2. 
Fetch the CSV data and read it into a DataFrame" - ] + ], + "id": "ce88a189" }, { "attachments": {}, "cell_type": "markdown", - "id": "05fcb9a8-2290-4507-aad1-a3002cab0ba6", "metadata": {}, "source": [ "Download pre-chunked text and pre-computed embeddings. This file is ~200 MB, so may take a minute depending on your connection speed." - ] + ], + "id": "ee0738ce" }, { "cell_type": "code", "execution_count": 7, - "id": "bef09ede-40b1-40f3-9966-f68d4b025fbd", "metadata": {}, "outputs": [], "source": [ @@ -209,22 +208,22 @@ " print(\"File downloaded successfully.\")\n", "else:\n", " print(\"File already exists in the local file system.\")" - ] + ], + "id": "bdfd15f0" }, { "attachments": {}, "cell_type": "markdown", - "id": "1faf22b7-5b99-4a9b-a88a-24acb16d133e", "metadata": {}, "source": [ "Here we are using the `converters=` parameter of the `pd.read_csv` function to convert the JSON\n", "array in the CSV file to numpy arrays." - ] + ], + "id": "66a2c515" }, { "cell_type": "code", "execution_count": 8, - "id": "6e0669bf-9070-42bd-a561-f6729f9203a6", "metadata": {}, "outputs": [], "source": [ @@ -234,31 +233,31 @@ "\n", "df = pd.read_csv(embeddings_path, converters=dict(embedding=json_to_numpy_array))\n", "df" - ] + ], + "id": "1956ecc7" }, { "cell_type": "code", "execution_count": 9, - "id": "bcf14e64-982e-465b-8e2b-6239650b2f51", "metadata": {}, "outputs": [], "source": [ "df.info(show_counts=True)" - ] + ], + "id": "7b0569fd" }, { "attachments": {}, "cell_type": "markdown", - "id": "cb523f8c-78b2-4a75-be15-52d29fac0fff", "metadata": {}, "source": [ "## 3. Set up the database" - ] + ], + "id": "e5f60bd0" }, { "attachments": {}, "cell_type": "markdown", - "id": "117daa30", "metadata": {}, "source": [ "
    \n", @@ -268,21 +267,21 @@ "

    If you have a Free Starter Workspace deployed already, select the database from drop-down menu at the top of this notebook. It updates the connection_url to connect to that database.

    \n", "
    \n", "
    " - ] + ], + "id": "43947d3d" }, { "attachments": {}, "cell_type": "markdown", - "id": "ca811e5f-6dcd-471b-a4de-03eab42acf4f", "metadata": {}, "source": [ "Create the database." - ] + ], + "id": "3f1baf5b" }, { "cell_type": "code", "execution_count": 10, - "id": "ca33b770-95f9-47ae-9509-5dd98c331037", "metadata": {}, "outputs": [], "source": [ @@ -290,12 +289,12 @@ "if not shared_tier_check or shared_tier_check[0][1] == 'OFF':\n", " %sql DROP DATABASE IF EXISTS winter_wikipedia;\n", " %sql CREATE DATABASE winter_wikipedia;" - ] + ], + "id": "4f789d23" }, { "attachments": {}, "cell_type": "markdown", - "id": "393e0d4a-8020-447e-b0ae-aa4199b1a016", "metadata": {}, "source": [ "
    \n", @@ -306,12 +305,12 @@ " It updates the connection_url which is used by the %%sql magic command and SQLAlchemy to make connections to the selected database.

    \n", "
    \n", "
    " - ] + ], + "id": "a5fd7f34" }, { "cell_type": "code", "execution_count": 11, - "id": "011afdb9-cbd4-42af-8121-a8928d5c8432", "metadata": {}, "outputs": [], "source": [ @@ -321,70 +320,70 @@ " text TEXT CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci,\n", " embedding BLOB\n", ");" - ] + ], + "id": "a6cbcb7e" }, { "attachments": {}, "cell_type": "markdown", - "id": "6b7ab530-4f55-482f-8e4c-475df06fe9b3", "metadata": {}, "source": [ "## 4. Populate the table with our DataFrame" - ] + ], + "id": "4dedda5c" }, { "attachments": {}, "cell_type": "markdown", - "id": "51be94b1-9901-499c-a364-c85782239e2a", "metadata": {}, "source": [ "Create a SQLAlchemy connection." - ] + ], + "id": "cae0a395" }, { "cell_type": "code", "execution_count": 12, - "id": "b8722e7e-211e-44a6-b512-eee1719c2879", "metadata": {}, "outputs": [], "source": [ "import singlestoredb as s2\n", "\n", "conn = s2.create_engine().connect()" - ] + ], + "id": "dba9a833" }, { "attachments": {}, "cell_type": "markdown", - "id": "2ce8c4c5-f389-4d0d-b434-6cd628343688", "metadata": {}, "source": [ "Use the `to_sql` method of the DataFrame to upload the data to the requested table." - ] + ], + "id": "490ccfe3" }, { "cell_type": "code", "execution_count": 13, - "id": "8dd44f53-f6ec-4009-8d1d-95209d704eec", "metadata": {}, "outputs": [], "source": [ "df.to_sql('winter_olympics_2022', con=conn, index=True, index_label='id', if_exists='append', chunksize=1000)" - ] + ], + "id": "1360d505" }, { "attachments": {}, "cell_type": "markdown", - "id": "c4d4602c-bfec-4819-904b-4d376b920e44", "metadata": {}, "source": [ "## 5. 
Do a semantic search with the same question from above and use the response to send to OpenAI again" - ] + ], + "id": "8ea4a8eb" }, { "cell_type": "code", "execution_count": 14, - "id": "e8560501-ff5e-4727-8d07-99f2173a3d62", "metadata": {}, "outputs": [], "source": [ @@ -430,12 +429,12 @@ "\n", " # Return the results.\n", " return strings[:top_n], relatednesses[:top_n]" - ] + ], + "id": "a147f99c" }, { "cell_type": "code", "execution_count": 15, - "id": "a1b27188-409c-4e43-8065-5448f40e1986", "metadata": {}, "outputs": [], "source": [ @@ -452,12 +451,12 @@ " print(f\"{relatedness=:.3f}\")\n", " print(tabulate([[string]], headers=['Result'], tablefmt='fancy_grid'))\n", " print('\\n\\n')" - ] + ], + "id": "a8edab89" }, { "cell_type": "code", "execution_count": 16, - "id": "ce7411d2-6f3c-408b-996b-2ec34cad5d7d", "metadata": {}, "outputs": [], "source": [ @@ -515,31 +514,31 @@ " )\n", " response_message = response.choices[0].message.content\n", " return response_message" - ] + ], + "id": "231acb2b" }, { "cell_type": "code", "execution_count": 17, - "id": "46a21d10-88bb-4cfb-b68d-b4a5b12ba1f4", "metadata": {}, "outputs": [], "source": [ "print(ask('Who won the gold medal for curling in Olymics 2022?'))" - ] + ], + "id": "cda561e4" }, { "attachments": {}, "cell_type": "markdown", - "id": "0935c8cb-a397-4892-b0ef-5b5ddee2b82a", "metadata": {}, "source": [ "## Clean up" - ] + ], + "id": "77dde827" }, { "attachments": {}, "cell_type": "markdown", - "id": "1202b364", "metadata": {}, "source": [ "
    \n", @@ -549,23 +548,24 @@ "

    If you created a new database in your Standard or Premium Workspace, you can drop the database by running the cell below. Note: this will not drop your database for Free Starter Workspaces. To drop a Free Starter Workspace, terminate the Workspace using the UI.

    \n", "
    \n", "" - ] + ], + "id": "750bbd14" }, { "cell_type": "code", "execution_count": 18, - "id": "5f448b09-8855-4dc7-984e-7d5dfdb68fc5", "metadata": {}, "outputs": [], "source": [ "shared_tier_check = %sql show variables like 'is_shared_tier'\n", "if not shared_tier_check or shared_tier_check[0][1] == 'OFF':\n", " %sql DROP DATABASE IF EXISTS winter_wikipedia;" - ] + ], + "id": "dd07be8d" }, { + "id": "2a95146c", "cell_type": "markdown", - "id": "30cca5fc-9cf5-474b-820f-440255193976", "metadata": {}, "source": [ "
    \n", diff --git a/notebooks/semantic-visualization-and-vector-datatype/notebook.ipynb b/notebooks/semantic-visualization-and-vector-datatype/notebook.ipynb index 34aa1238..9e1cb56a 100644 --- a/notebooks/semantic-visualization-and-vector-datatype/notebook.ipynb +++ b/notebooks/semantic-visualization-and-vector-datatype/notebook.ipynb @@ -1,8 +1,8 @@ { "cells": [ { + "id": "26367f5e", "cell_type": "markdown", - "id": "eda1fdc1-ff12-4aee-9ec7-ee57896625cd", "metadata": {}, "source": [ "
    \n", @@ -17,8 +17,8 @@ ] }, { + "id": "8abb8215", "cell_type": "markdown", - "id": "1af5ad61-74b6-498c-884a-150013aed37f", "metadata": {}, "source": [ "
    \n", @@ -33,7 +33,6 @@ { "attachments": {}, "cell_type": "markdown", - "id": "db3fc39a-3817-433e-a4cf-e65d3253dd31", "metadata": {}, "source": [ "\n", @@ -41,12 +40,12 @@ "Vectors usually come from objects: text, images, video, audio, etc.\n", "In a vector space model, words with similar meanings, such as \"happy\" and \"joyful,\" are represented by vectors that lie in proximity, reflecting their semantic similarity.\n", "Vector database searches find data based on its content or meaning, even without exact matches." - ] + ], + "id": "66c2ea71" }, { "attachments": {}, "cell_type": "markdown", - "id": "3876c8c3-7313-46d5-822d-2ec5785376c9", "metadata": {}, "source": [ "## 1. Create a workspace in your workspace group\n", @@ -54,12 +53,12 @@ "S-00 is sufficient.\n", "\n", "## 2. Create a database named `db_vector`" - ] + ], + "id": "c1efbe85" }, { "attachments": {}, "cell_type": "markdown", - "id": "8bf18203-feb5-45f3-b41f-289236c3704c", "metadata": {}, "source": [ "
    \n", @@ -69,12 +68,12 @@ "

    If you have a Free Starter Workspace deployed already, select the database from drop-down menu at the top of this notebook. It updates the connection_url to connect to that database.

    \n", "
    \n", "
    " - ] + ], + "id": "834d7507" }, { "cell_type": "code", "execution_count": 1, - "id": "fd4285e4-070c-4f36-9335-8d07c42f33aa", "metadata": {}, "outputs": [], "source": [ @@ -82,12 +81,12 @@ "if not shared_tier_check or shared_tier_check[0][1] == 'OFF':\n", " %sql DROP DATABASE IF EXISTS db_vector;\n", " %sql CREATE DATABASE db_vector;" - ] + ], + "id": "1d6d455c" }, { "attachments": {}, "cell_type": "markdown", - "id": "ca75637e-ae94-4f37-bc94-8d84f2560790", "metadata": {}, "source": [ "
    \n", @@ -97,21 +96,21 @@ "

    Make sure to select a database from the drop-down menu at the top of this notebook. It updates the connection_url to connect to that database.

    \n", "
    \n", "
    " - ] + ], + "id": "180f3641" }, { "attachments": {}, "cell_type": "markdown", - "id": "3174a995-aa93-45da-833f-29cffa054945", "metadata": {}, "source": [ "Create table `words` and insert the words into the table." - ] + ], + "id": "aa7a4a14" }, { "cell_type": "code", "execution_count": 2, - "id": "0ea39e6a-7765-4e18-b66a-ede465491b2f", "metadata": {}, "outputs": [], "source": [ @@ -129,54 +128,54 @@ " (\"fish\"), ('king'), ('man'), ('woman'), ('queen'),\n", " ('Paris'), ('France'), ('Poland'), ('Warsaw'),\n", " ('prince'), ('throne'), ('Elizabeth'), ('ruler');" - ] + ], + "id": "e1d38b42" }, { "cell_type": "code", "execution_count": 3, - "id": "b98fff5d-72bd-4d52-8c0e-7ac1eb4040c8", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "SHOW TABLES EXTENDED;" - ] + ], + "id": "7cfb0013" }, { "attachments": {}, "cell_type": "markdown", - "id": "146d4e8d-ae81-4f08-b25e-63b55c8c6439", "metadata": {}, "source": [ "## 3. Install and import required libraries\n", "In this section, we will set up the necessary environment by installing important libraries .\n", "\n", "The install process may take a couple minutes." - ] + ], + "id": "5818d73b" }, { "cell_type": "code", "execution_count": 4, - "id": "1fd1a85f-8511-48c2-89f1-ac0573f30fd3", "metadata": {}, "outputs": [], "source": [ "!pip3 install --upgrade sentence-transformers torch tensorflow pandarallel --quiet" - ] + ], + "id": "6c5421e5" }, { "attachments": {}, "cell_type": "markdown", - "id": "73064337-681e-447d-8257-975012a6149b", "metadata": {}, "source": [ "Import several libraries for data manipulation (e.g., Pandas, NumPy), database connectivity (SQLAlchemy, SingleStoreDB), machine learning (PyTorch, Transformers), and parallel processing (pandarallel)." 
- ] + ], + "id": "b83a712d" }, { "cell_type": "code", "execution_count": 5, - "id": "1cd529be-d314-47ad-84c9-d56a09911573", "metadata": {}, "outputs": [], "source": [ @@ -193,30 +192,30 @@ "from sqlalchemy import *\n", "db_connection = create_engine(connection_url)\n", "pandarallel.initialize(nb_workers=2, progress_bar=True)" - ] + ], + "id": "1e03f1e3" }, { "attachments": {}, "cell_type": "markdown", - "id": "2170f85e-3057-4e99-b026-1cda3f54203e", "metadata": {}, "source": [ "## 4. Load Sentence Transformer model along with its tokenizer, making them ready for use in tasks like sentence embeddings or similarity calculations." - ] + ], + "id": "5edb77c5" }, { "attachments": {}, "cell_type": "markdown", - "id": "50cddc8b-f5c2-4ec9-8e1f-6f94b40568df", "metadata": {}, "source": [ "Load Sentence Transformers model" - ] + ], + "id": "b82fecbf" }, { "cell_type": "code", "execution_count": 6, - "id": "d5948f1c-df1f-4aa5-830c-59ec95dd0b7f", "metadata": {}, "outputs": [], "source": [ @@ -224,30 +223,30 @@ "\n", "model = AutoModel.from_pretrained(model_name)\n", "tokenizer = AutoTokenizer.from_pretrained(model_name)" - ] + ], + "id": "07ce4e31" }, { "attachments": {}, "cell_type": "markdown", - "id": "fbef9ff6-d4d1-41d4-8400-fe0d37a0048d", "metadata": {}, "source": [ "## 5. Load the data into dataframe from database table" - ] + ], + "id": "ddef8850" }, { "attachments": {}, "cell_type": "markdown", - "id": "c8f22e01-f80c-4b79-8ebe-857d179d4392", "metadata": {}, "source": [ "Load the data into a DataFrame" - ] + ], + "id": "8b410662" }, { "cell_type": "code", "execution_count": 7, - "id": "0b4ea02f-f214-431a-bc4b-e3b9f930bc4f", "metadata": {}, "outputs": [], "source": [ @@ -255,23 +254,23 @@ "\n", "df = pd.DataFrame(result)\n", "df" - ] + ], + "id": "cbef4bf7" }, { "attachments": {}, "cell_type": "markdown", - "id": "cb855078-6608-4a19-b329-8de84da6de07", "metadata": {}, "source": [ "## 6. 
Function to retrieve the embedding\n", "\n", "This function, named get_embedding, takes a sentence as input and returns its embedding using a pre-trained tokenizer and model. It tokenizes the sentence and returns the resulting embedding as a NumPy array with a float32 data type." - ] + ], + "id": "a4d048d4" }, { "cell_type": "code", "execution_count": 8, - "id": "737eac3b-9c5a-4ff3-8f3c-81ab281fb06a", "metadata": {}, "outputs": [], "source": [ @@ -283,43 +282,43 @@ " with torch.no_grad():\n", " embedding = model(**inputs).last_hidden_state.mean(dim=1).squeeze().tolist()\n", " return np.array(embedding, dtype=' words table and execute all the cells above to visualize the semantic patterns." - ] + ], + "id": "dc848797" }, { "attachments": {}, "cell_type": "markdown", - "id": "75a661ea-d447-4a7d-9379-c80730e6a14e", "metadata": {}, "source": [ "## 9. Introducing Vector Datatype" - ] + ], + "id": "d37f2217" }, { "attachments": {}, "cell_type": "markdown", - "id": "5917b0d2-1ffb-48cf-b15e-d00fc3b27c1b", "metadata": {}, "source": [ "We can see below `word_embeddings` column is `blob` datatype" - ] + ], + "id": "99cd0b59" }, { "cell_type": "code", "execution_count": 15, - "id": "5d299e2b-d816-47b4-9b34-4ffdf60a2242", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "DESC words_table;" - ] + ], + "id": "8f0eebf9" }, { "cell_type": "code", "execution_count": 16, - "id": "18ccbbf4-1de4-4e73-bba6-053310566e51", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "SELECT word, word_embeddings FROM words_table LIMIT 2;" - ] + ], + "id": "fa8d1ec0" }, { "attachments": {}, "cell_type": "markdown", - "id": "ffe0ce17-9508-47ad-a239-41bacf09b8da", "metadata": {}, "source": [ "This below line of code executes a SQL query on the `words_table`, selecting the `word` column and the hexadecimal representation of the `word_embeddings` column for the first row in the table using the `limit 1` clause." 
- ] + ], + "id": "be1a5e6a" }, { "cell_type": "code", "execution_count": 17, - "id": "60677964-d188-4d33-9dd4-b93f1e28a350", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "SELECT word, HEX(word_embeddings) FROM words_table LIMIT 1;" - ] + ], + "id": "60689524" }, { "attachments": {}, "cell_type": "markdown", - "id": "178899b6-9d0a-409b-aacb-9e05a2669136", "metadata": {}, "source": [ "Below query extracts the `word` column and unpacks the JSON array stored in the `word_embeddings` column for the first row in the `words_table`, providing a more readable representation of the word embeddings." - ] + ], + "id": "508faddf" }, { "cell_type": "code", "execution_count": 18, - "id": "3d54aa09-f2c5-4a9d-a3bb-72d993a84a07", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "SELECT word, JSON_ARRAY_UNPACK(word_embeddings) FROM words_table LIMIT 1;" - ] + ], + "id": "d3fa6534" }, { "attachments": {}, "cell_type": "markdown", - "id": "60db6de5-0e1a-469e-bffc-b77c53cad9ee", "metadata": {}, "source": [ "## 10. Transition from BLOB to Vector datatype\n", @@ -538,69 +537,69 @@ "

    3. Drop the blob column.

    \n", "

    4. Rename the new vector column to the old blob column name. This will ensure any previous queries will still work, or at least require fewer changes.\n", "

    " - ] + ], + "id": "ca8ad8bd" }, { "cell_type": "code", "execution_count": 19, - "id": "233bfe2c-e99c-4dd3-bf56-eafa923ba4d8", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "SELECT VECTOR_NUM_ELEMENTS(word_embeddings) FROM words_table LIMIT 1;" - ] + ], + "id": "f6e449f9" }, { "cell_type": "code", "execution_count": 20, - "id": "b467da39-b9c4-4576-828e-135520713907", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "ALTER TABLE words_table ADD COLUMN emb2 vector(384) AFTER word_embeddings;\n", "UPDATE words_table SET emb2=word_embeddings;" - ] + ], + "id": "d58eefb6" }, { "cell_type": "code", "execution_count": 21, - "id": "4288cee5-3dc0-4108-a272-287a1ffbbb01", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "SELECT word, emb2, JSON_ARRAY_UNPACK(word_embeddings) FROM words_table LIMIT 1;" - ] + ], + "id": "d40fe9ce" }, { "cell_type": "code", "execution_count": 22, - "id": "bf42a048-0a42-496d-8925-fbc6125316b4", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "ALTER TABLE words_table DROP COLUMN word_embeddings;\n", "ALTER TABLE words_table CHANGE emb2 word_embeddings;" - ] + ], + "id": "5bffb8aa" }, { "cell_type": "code", "execution_count": 23, - "id": "fef19d53-3f4e-4e75-a1ca-8328a5ae29a2", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "DESC words_table;" - ] + ], + "id": "60a3d4bd" }, { "attachments": {}, "cell_type": "markdown", - "id": "d906c5e5-bd11-4c4f-ba13-8b4a2a09c9d7", "metadata": {}, "source": [ "## 11. 
Semantic Search of the word -sunshine 🌞 using Infix Operator\n", @@ -608,12 +607,12 @@ "Performing a semantic search for the word 'sunshine' to find contextually similar or related words and phrases based on their semantic meanings rather than exact lexical matches.\n", "\n", "The infix operators `<*>` and `<->` can be used to facilitate DOT_PRODUCT and EUCLIDEAN_DISTANCE operations, respectively, providing a more concise query syntax compared to using the existing built-in functions such as DOT_PRODUCT(a, b) and EUCLIDEAN_DISTANCE(a, b)." - ] + ], + "id": "02364df2" }, { "cell_type": "code", "execution_count": 24, - "id": "77bb3be5-83c0-4413-87fa-36fc245856d7", "metadata": {}, "outputs": [], "source": [ @@ -623,21 +622,21 @@ " FROM words_table\n", " ORDER BY score desc\n", " LIMIT 3;" - ] + ], + "id": "98a39ef5" }, { "attachments": {}, "cell_type": "markdown", - "id": "9bd7668c-a84d-42ff-962b-ead151d81b9a", "metadata": {}, "source": [ "## Clean up" - ] + ], + "id": "99b34b0a" }, { "attachments": {}, "cell_type": "markdown", - "id": "72e79695-e9a0-4e09-817e-8963a9dcd340", "metadata": {}, "source": [ "
    \n", @@ -647,23 +646,24 @@ "

    If you created a new database in your Standard or Premium Workspace, you can drop the database by running the cell below. Note: this will not drop your database for Free Starter Workspaces. To drop a Free Starter Workspace, terminate the Workspace using the UI.

    \n", "
    \n", "" - ] + ], + "id": "90b0773b" }, { "cell_type": "code", "execution_count": 25, - "id": "a7ea4778-2df1-441d-9464-1255116130ff", "metadata": {}, "outputs": [], "source": [ "shared_tier_check = %sql show variables like 'is_shared_tier'\n", "if not shared_tier_check or shared_tier_check[0][1] == 'OFF':\n", " %sql DROP DATABASE IF EXISTS db_vector;" - ] + ], + "id": "b4b02c5d" }, { + "id": "cf2b8a2b", "cell_type": "markdown", - "id": "f0c4e53f-b952-4d79-b851-f4c9222a928a", "metadata": {}, "source": [ "
    \n", diff --git a/notebooks/similarity-search-on-vector-data/notebook.ipynb b/notebooks/similarity-search-on-vector-data/notebook.ipynb index 0f8d9cb0..72b9bb17 100644 --- a/notebooks/similarity-search-on-vector-data/notebook.ipynb +++ b/notebooks/similarity-search-on-vector-data/notebook.ipynb @@ -1,8 +1,8 @@ { "cells": [ { + "id": "1a39b6fe", "cell_type": "markdown", - "id": "352a16c8-a1f1-4790-89d0-1c3db2ecb08f", "metadata": {}, "source": [ "
    \n", @@ -17,8 +17,8 @@ ] }, { + "id": "8c8a73b0", "cell_type": "markdown", - "id": "38989162-aeba-4ada-91ab-23fd8bd27740", "metadata": {}, "source": [ "
    \n", @@ -33,7 +33,6 @@ { "attachments": {}, "cell_type": "markdown", - "id": "a3efcd15-5da0-4f79-85de-2ae1bf061b98", "metadata": {}, "source": [ "## What's in this notebook:\n", @@ -49,48 +48,48 @@ "## Questions?\n", "\n", "Reach out to us through our [forum](https://www.singlestore.com/forum)." - ] + ], + "id": "5a2581d8" }, { "attachments": {}, "cell_type": "markdown", - "id": "9d78727a-868a-4e23-be2e-1954119cb65f", "metadata": {}, "source": [ "## 1. Create and use a database." - ] + ], + "id": "e29287ae" }, { "attachments": {}, "cell_type": "markdown", - "id": "178a20af-5680-4190-a53d-57d81fdeecf6", "metadata": {}, "source": [ "To use this notebook, you need to have an active workspace and have selected a database to use. Please select a database using the dropdown above." - ] + ], + "id": "52fbe108" }, { "attachments": {}, "cell_type": "markdown", - "id": "d8ba2e84-d75c-4997-9a55-0e9294bda309", "metadata": {}, "source": [ "## 2. Create a table to hold vector data and load data." - ] + ], + "id": "00912cab" }, { "attachments": {}, "cell_type": "markdown", - "id": "4ff7221d-295d-4097-88c9-44e2728230b0", "metadata": {}, "source": [ "The SQL below creates a table to hold comments as one might find on a restaurant review site. The table contains the comment itself stored as a TEXT column and a vector embedding of that comment stored as a VECTOR ([Vector Type](https://docs.singlestore.com/cloud/vectors/vector-type)) column. [Working with Vector Data](https://docs.singlestore.com/cloud/vectors/working-with-vector-data/) provides more details on this example and information about similarity search over vectors." 
- ] + ], + "id": "f785b548" }, { "cell_type": "code", "execution_count": 1, - "id": "3dc4c365-1832-4525-bf6f-a53b77e6d6af", "metadata": {}, "outputs": [], "source": [ @@ -100,12 +99,12 @@ " comment TEXT,\n", " comment_embedding VECTOR(4) NOT NULL,\n", " category VARCHAR(256));" - ] + ], + "id": "99d2e315" }, { "cell_type": "code", "execution_count": 2, - "id": "41bba8dc-a558-4e32-b484-be7321d3497f", "metadata": {}, "outputs": [], "source": [ @@ -120,41 +119,41 @@ " (3, \"The B24 restaurant salad bar is quite good.\",\n", " '[0.1, 0.15, 0.37, 0.05]',\n", " \"Food\");" - ] + ], + "id": "f52bcffc" }, { "attachments": {}, "cell_type": "markdown", - "id": "50a64717-fca6-4a58-8afa-5301a65be8f2", "metadata": {}, "source": [ "### Verify the data was loaded" - ] + ], + "id": "3ac554e1" }, { "attachments": {}, "cell_type": "markdown", - "id": "4da232c3-7349-4785-99ea-042974597bf7", "metadata": {}, "source": [ "Use the following SQL to view the data in the comments table." - ] + ], + "id": "67e9630b" }, { "cell_type": "code", "execution_count": 3, - "id": "e2b7e8f2-101f-447f-887a-a86c0e963aff", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "SELECT * FROM comments;" - ] + ], + "id": "ee3acd15" }, { "attachments": {}, "cell_type": "markdown", - "id": "981b4ba7-109c-418b-ab39-1df64426a1f2", "metadata": {}, "source": [ "## 3. Search based on vector similarity.\n", @@ -162,12 +161,12 @@ "To find the most similar vectors in a query vector, use an ORDER BY\u2026 LIMIT\u2026 query. The ORDER BY command will sort the vectors by a similarity score produced by a vector similarity function, with the closest matches at the top.\n", "\n", "The SQL below sets up a query vector, then uses the DOT_PRODUCT infix operator (<\\*>) to find the two vectors that are most similar to the query vector." 
- ] + ], + "id": "faa052dd" }, { "cell_type": "code", "execution_count": 4, - "id": "9ad316ae-495d-4e7f-a508-f84d7af20432", "metadata": {}, "outputs": [], "source": [ @@ -179,12 +178,12 @@ " FROM comments\n", " ORDER BY score DESC\n", " LIMIT 2;" - ] + ], + "id": "1f2b57e4" }, { "attachments": {}, "cell_type": "markdown", - "id": "7220f9af-7a0c-4142-ace1-32102bedf869", "metadata": {}, "source": [ "## 4. Search using metadata filtering.\n", @@ -192,12 +191,12 @@ "When building vector search applications, you may wish to filter on the fields of a record, with simple filters or via joins, in addition to applying vector similarity operations.\n", "\n", "The following query combines the use of an ORDER BY ... LIMIT query and a metadata filter on category. This query will filter to find all comments in the category \"Food\" and then calculate the score for each of those and rank in descending order." - ] + ], + "id": "066fdd44" }, { "cell_type": "code", "execution_count": 5, - "id": "c846a3b0-5477-4f73-9a7e-bd935717dcf0", "metadata": {}, "outputs": [], "source": [ @@ -210,64 +209,64 @@ " WHERE category = \"Food\"\n", " ORDER BY score DESC\n", " LIMIT 3;" - ] + ], + "id": "22679a21" }, { "attachments": {}, "cell_type": "markdown", - "id": "c2010186-159d-4968-a9a3-fc285ab5a3cd", "metadata": {}, "source": [ "## 5. Create and use a vector index.\n", "\n", "The command below creates a vector index on the comment_embedding field of the comments table." - ] + ], + "id": "db0f41ad" }, { "cell_type": "code", "execution_count": 6, - "id": "87ab9e1b-d7ed-455e-b3f8-0691034436de", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "ALTER TABLE comments ADD VECTOR INDEX ivf(comment_embedding)\n", "INDEX_OPTIONS '{\"index_type\":\"IVF_FLAT\"}';" - ] + ], + "id": "de0c6f3f" }, { "attachments": {}, "cell_type": "markdown", - "id": "9b8dd26d-edc9-4bec-b7f7-b1a393d459d7", "metadata": {}, "source": [ "Optionally optimize the table for best performance." 
- ] + ], + "id": "16ae9f59" }, { "cell_type": "code", "execution_count": 7, - "id": "089d7fc3-e23b-4b33-92cd-4ccba9121336", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "OPTIMIZE TABLE comments FULL;" - ] + ], + "id": "ae4c1b16" }, { "attachments": {}, "cell_type": "markdown", - "id": "6c0bfb7f-4831-4436-bed0-ef30a5a30e00", "metadata": {}, "source": [ "The following query will use the vector index. Vector indexes can be used to improve performance of queries over large vector data sets. Refer to [Vector Indexing](https://docs.singlestore.com/cloud/vectors/vector-indexing/) for information on creating and using vector indexes." - ] + ], + "id": "cddc5974" }, { "cell_type": "code", "execution_count": 8, - "id": "5327b465-0191-455d-a800-67e8ad403df6", "metadata": {}, "outputs": [], "source": [ @@ -279,23 +278,23 @@ " FROM comments\n", " ORDER BY score DESC\n", " LIMIT 2;" - ] + ], + "id": "24fc1b33" }, { "attachments": {}, "cell_type": "markdown", - "id": "cec7c7db-fb6a-4a22-8ebe-38c077f7ae70", "metadata": {}, "source": [ "## 6. Check that your query is using a vector index.\n", "\n", "The EXPLAIN command can be used to see the query plan and verify that the vector index is being used. In the example below, you can see INTERNAL_VECTOR_SEARCH in the ColumnStoreFilter row. This tells you that the vector index is being used." - ] + ], + "id": "85a6bba3" }, { "cell_type": "code", "execution_count": 9, - "id": "e4fcced9-b650-4786-9a3d-3f8e2ac4fad1", "metadata": {}, "outputs": [], "source": [ @@ -308,33 +307,34 @@ " FROM comments\n", " ORDER BY score DESC\n", " LIMIT 2;" - ] + ], + "id": "258a9714" }, { "attachments": {}, "cell_type": "markdown", - "id": "08034846-168c-4547-abbb-72e10d9629e2", "metadata": {}, "source": [ "## 7. Clean up.\n", "\n", "The command below will drop the table created as part of this notebook. Dropping this table will allow you to rerun the notebook from the beginning." 
- ] + ], + "id": "e2a0af68" }, { "cell_type": "code", "execution_count": 10, - "id": "57290d8e-98d4-4ea8-b290-a925f5ba9bee", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "DROP TABLE comments;" - ] + ], + "id": "663219d6" }, { + "id": "aca52f19", "cell_type": "markdown", - "id": "b87169e1-aa2c-4364-bc4b-86ca97ef24fa", "metadata": {}, "source": [ "
    \n", diff --git a/notebooks/singlestore-april-challenge-haiku-ascii/notebook.ipynb b/notebooks/singlestore-april-challenge-haiku-ascii/notebook.ipynb index 5412bdc9..2783eb21 100644 --- a/notebooks/singlestore-april-challenge-haiku-ascii/notebook.ipynb +++ b/notebooks/singlestore-april-challenge-haiku-ascii/notebook.ipynb @@ -1,8 +1,8 @@ { "cells": [ { + "id": "0c39a476", "cell_type": "markdown", - "id": "574444dd-dce1-4658-b7a0-74d4ed39620c", "metadata": {}, "source": [ "
    \n", @@ -19,15 +19,14 @@ { "attachments": {}, "cell_type": "markdown", - "id": "192680e4-9009-4bb2-8053-2b8e81517eff", "metadata": {}, "source": [ "## \ud83d\udce3 SingleStore's Creative Challenge for #NationalPoetryMonth: *Win RayBan Smart sunglasses \ud83d\ude0e and a $500 AWS gift card! \ud83d\udcb8*" - ] + ], + "id": "d5d4f1ce" }, { "cell_type": "markdown", - "id": "6b38d13c-506e-4c68-8446-f99df09e1757", "metadata": {}, "source": [ "---\n", @@ -57,12 +56,12 @@ "#### For questions about this contest or SingleStore Notebooks, use our dedicated Discord channel: https://discord.gg/re56Fwyd\n", "\n", "---" - ] + ], + "id": "4f113dac" }, { "attachments": {}, "cell_type": "markdown", - "id": "54676c95-d210-42d3-9dab-2a660b0fd901", "metadata": {}, "source": [ "Feel free to make changes to this starter code to generate a Haiku and ASCII art. The below code consists of two main parts:\n", @@ -70,31 +69,31 @@ "1. Generating a Haiku: The generate_haiku function creates a simple haiku using pre-defined lists of phrases that correspond to the traditional 5-7-5 syllable structure of haikus. This function randomly selects one phrase from each list to construct the haiku.\n", "\n", "2. Visualizing the Haiku: The visualize_haiku function uses matplotlib to create a visualization of the generated haiku. It sets up a figure with a custom background color, hides the axes for a cleaner look, and displays the haiku text in the center with a styled bounding box around it." 
- ] + ], + "id": "ae02b515" }, { "cell_type": "code", "execution_count": 1, - "id": "a30655d6-9180-467c-aab9-0fc70ce50812", "metadata": {}, "outputs": [], "source": [ "!pip install matplotlib --quiet" - ] + ], + "id": "21faa4da" }, { "attachments": {}, "cell_type": "markdown", - "id": "08d1b7fb-bf75-4157-a615-3e798708cb85", "metadata": {}, "source": [ "### Generating a Haiku with Seasonal Transitions" - ] + ], + "id": "1747d6b2" }, { "cell_type": "code", "execution_count": 2, - "id": "5f4d1299-17c2-47ae-9875-1870320f6e8c", "metadata": {}, "outputs": [], "source": [ @@ -179,39 +178,39 @@ "# Generate and Visualize\n", "haiku, start_season, end_season = generate_seasonal_transition_haiku()\n", "visualize_seasonal_transition(haiku, start_season, end_season)" - ] + ], + "id": "42e7bea8" }, { "attachments": {}, "cell_type": "markdown", - "id": "0da8f807-9174-4493-a532-3e5166b092f2", "metadata": {}, "source": [ "## ASCII Art Generation" - ] + ], + "id": "fcc0889f" }, { "cell_type": "markdown", - "id": "0a340f24-f704-4bc2-bace-7b87cdf3d1b8", "metadata": {}, "source": [ "#### Note that you might have to add the URL to firewall when asked to do so, to be able to access your input image in the below code" - ] + ], + "id": "aa41c0eb" }, { "cell_type": "code", "execution_count": 3, - "id": "48335134-f5d9-42d9-bc98-68ba0e97f47d", "metadata": {}, "outputs": [], "source": [ "!pip install Pillow requests --quiet" - ] + ], + "id": "d6e52bb1" }, { "cell_type": "code", "execution_count": 4, - "id": "9f3a0969-7aa9-4807-94cd-a915d327a258", "metadata": {}, "outputs": [], "source": [ @@ -267,12 +266,12 @@ "# Example usage with a public image URL\n", "image_url = 'https://raw.githubusercontent.com/singlestore-labs/spaces-notebooks/master/common/images/singlestore-banner.png' # Replace with your image's URL\n", "process_image_from_url(image_url, 100)" - ] + ], + "id": "1b8b09c1" }, { "attachments": {}, "cell_type": "markdown", - "id": "df3b8c8d-3f85-4885-b090-13ed74e451a7", "metadata": {}, 
"source": [ "---\n", @@ -282,11 +281,12 @@ "\u2705 Share a Github link to your notebook by April 30, 2024: https://docs.google.com/forms/d/e/1FAIpQLSdXcvzSxtTtHYxRG40Pc5HVknxu6EbngDrsX6ukzkEbRu26ww/viewform\n", "\n", "\u2705 Make sure to tag @SingleStore and use #SingleStorePoetry when sharing your work on LinkedIn/X" - ] + ], + "id": "2b414d85" }, { + "id": "b1116d19", "cell_type": "markdown", - "id": "46fc1d5d-491a-4ce4-9d0d-fb8564d07fd4", "metadata": {}, "source": [ "
    \n", diff --git a/notebooks/singlestore-now-2024/notebook.ipynb b/notebooks/singlestore-now-2024/notebook.ipynb index d6a57124..6597f569 100644 --- a/notebooks/singlestore-now-2024/notebook.ipynb +++ b/notebooks/singlestore-now-2024/notebook.ipynb @@ -1,8 +1,8 @@ { "cells": [ { + "id": "fc7e7b35", "cell_type": "markdown", - "id": "1d35bf5a-7a16-4eea-9a45-797273ac5491", "metadata": {}, "source": [ "
    \n", @@ -17,8 +17,8 @@ ] }, { + "id": "50e92616", "cell_type": "markdown", - "id": "5fc3a6d9-e064-40dd-8cd0-636a567d5af0", "metadata": {}, "source": [ "
    \n", @@ -33,7 +33,6 @@ { "attachments": {}, "cell_type": "markdown", - "id": "dfc73c1e-9918-4d0a-ab22-4187a9c47678", "metadata": {}, "source": [ "\n", @@ -41,42 +40,42 @@ "The data set used in this competition/demo contains some E-commerce data revolving around customers and products that they have purchased. In this notebook, we will run a few queries using SingleStore Kai which will allow us to migrate MongoDB data and run MongoDB queries directly through SingleStore. To create your entry for the raffle, please open and complete the following form: https://forms.gle/n8KjTpJgPL29wFHV9\n", "\n", "If you have any issues while completing the form, please reach out to a SingleStore team member at the event." - ] + ], + "id": "9019c195" }, { "attachments": {}, "cell_type": "markdown", - "id": "1c7f4c37-2c1d-4507-9564-de2bea190005", "metadata": {}, "source": [ "## Install libraries and import modules\n", "\n", "First, we will need to import the necessary dependencies into our notebook environment. This includes some python libraries needed to run our queries." - ] + ], + "id": "050d60ed" }, { "cell_type": "code", "execution_count": 1, - "id": "fb64cdc7-3ff1-4809-a9f1-9f0e770874b3", "metadata": {}, "outputs": [], "source": [ "!pip install pymongo pandas ipywidgets --quiet" - ] + ], + "id": "f34495f0" }, { "attachments": {}, "cell_type": "markdown", - "id": "58c2085f-d9f4-4faa-b787-2e0cf952d0b1", "metadata": {}, "source": [ "To ensure that we have a database we can use, we will then make sure that a database exists. If it doesn't we will have the notebook create one for us." 
- ] + ], + "id": "2bdd3998" }, { "cell_type": "code", "execution_count": 2, - "id": "784ccd70-014c-429a-8325-91407fbf0e96", "metadata": {}, "outputs": [], "source": [ @@ -88,21 +87,21 @@ "else:\n", " database_to_use = \"new_transactions\"\n", " %sql CREATE DATABASE {{database_to_use}}" - ] + ], + "id": "1b1fb18e" }, { "attachments": {}, "cell_type": "markdown", - "id": "033b3e8b-d445-41de-b682-77d66f98aed8", "metadata": {}, "source": [ "Next, let's run the code that will actually import the needed dependencies, including `pymongo`, that will be used to connect to SingleStore and our Mongo instance where the initial data is stored." - ] + ], + "id": "7a940e1f" }, { "cell_type": "code", "execution_count": 3, - "id": "3f1f2731-e117-4ead-871a-5711eb1cb391", "metadata": {}, "outputs": [], "source": [ @@ -112,23 +111,23 @@ "import pandas as pd\n", "import pymongo\n", "from pymongo import MongoClient" - ] + ], + "id": "40a410f2" }, { "attachments": {}, "cell_type": "markdown", - "id": "ca323068-a897-478f-839a-244f4bbc1719", "metadata": {}, "source": [ "## Connect to Atlas and SingleStore Kai endpoints\n", "\n", "Next, we will connect to the MongoDB Atlas instance using a Mongo client. We will need to connect to this instance to get our initial data, currently stored in Mongo." - ] + ], + "id": "68eadfe5" }, { "cell_type": "code", "execution_count": 4, - "id": "5bf785b4-79c6-440f-9bb1-34a033c9f4db", "metadata": {}, "outputs": [], "source": [ @@ -138,21 +137,21 @@ "mongoitems = mydbmongodb[\"items\"]\n", "mongocusts = mydbmongodb[\"custs\"]\n", "mongotxs = mydbmongodb[\"txs\"]" - ] + ], + "id": "9259e348" }, { "attachments": {}, "cell_type": "markdown", - "id": "52c51825-19ac-4512-87b5-619fb0b48a67", "metadata": {}, "source": [ "Then, we will need to connect to the SingleStore Kai API which will allow us to import and access the Mongo data we will move over from Mongo Atlas." 
- ] + ], + "id": "c7d1953d" }, { "cell_type": "code", "execution_count": 5, - "id": "20e25f4a-a6ce-4e3a-80c5-c56002945c7e", "metadata": {}, "outputs": [], "source": [ @@ -162,23 +161,23 @@ "s2mongoitems = s2dbmongodb[\"items\"]\n", "s2mongocusts = s2dbmongodb[\"custs\"]\n", "s2mongotxs = s2dbmongodb[\"txs\"]" - ] + ], + "id": "2d4eced0" }, { "attachments": {}, "cell_type": "markdown", - "id": "36c6162c-e0a2-404b-8d9f-9af8df8b8cea", "metadata": {}, "source": [ "## Copy Atlas collections into SingleStore Kai\n", "\n", "As our next step, we need to get our MongoDB data hosted in Atlas over to SingleStore. For this, we will run the following code that will then replicate the selected Mongo collections into our SingleStore instance. This will make the MongoDB data available in SingleStore, allowing us to migrate away from MongoDB and to perform all of our data storage and queries in a single database instead of having multiple data silos." - ] + ], + "id": "84bc8dc3" }, { "cell_type": "code", "execution_count": 6, - "id": "ebbefa07-2fbf-468c-bf65-00e12dcc606f", "metadata": {}, "outputs": [], "source": [ @@ -189,23 +188,23 @@ " data_dict = df.to_dict(orient='records')\n", " s2mongo_collection = s2dbmongodb[mongo_collection.name]\n", " s2mongo_collection.insert_many(data_dict)" - ] + ], + "id": "400f97a1" }, { "attachments": {}, "cell_type": "markdown", - "id": "ca4dbc9b-f96a-46c1-a4ac-aa761e0d19ec", "metadata": {}, "source": [ "## QUERY 1: Total quantity of products sold across all products\n", "\n", "Our first query on the newly migrated data will be to retrieve the total quanitity of products across every product within our dataset. As you'll see, even though we are running in SingleStore, we can still use Mongo query syntax using SingleStore Kai." 
- ] + ], + "id": "ddd381fb" }, { "cell_type": "code", "execution_count": 7, - "id": "2d3e0782-198f-4539-92cd-91e1758db721", "metadata": {}, "outputs": [], "source": [ @@ -230,33 +229,33 @@ "\n", "# Returning the numeric values of total quantity sold\n", "print(\"Total Product Quantity Sold is\",total_quantity)" - ] + ], + "id": "a303e52f" }, { "attachments": {}, "cell_type": "markdown", - "id": "17c39c6d-5f5c-4712-86d4-57ab70f185ed", "metadata": {}, "source": [ "#### ACTION ITEM!\n", "Take the output from this query and put it into the **ANSWER NUMBER 1** field in the Google Form." - ] + ], + "id": "1ef685e6" }, { "attachments": {}, "cell_type": "markdown", - "id": "58f643e0-0205-4cf7-97de-dcd93bef0a64", "metadata": {}, "source": [ "## QUERY 2: Top selling Product\n", "\n", "Our next query will be to find the top selling product within our data. Once again, we are issuing a Mongo query against our SingleStore instance. If we had an application integrated with MongoDB but wanted to migrate to SingleStore, we could do so without having to rewrite the queries within our application!" - ] + ], + "id": "999bc317" }, { "cell_type": "code", "execution_count": 8, - "id": "a31e6d36-9eb3-43d3-a8c9-50a740d8d36c", "metadata": {}, "outputs": [], "source": [ @@ -283,31 +282,31 @@ "\n", "# Return the #1 selling product and its total quantity sold\n", "print(\"Top-Selling product : \",product_name,\"With total quantity sold \",total_quantity_sold)" - ] + ], + "id": "8169cd92" }, { "attachments": {}, "cell_type": "markdown", - "id": "7efdb5cb-502f-46e0-9464-a62ab60beace", "metadata": {}, "source": [ "### ACTION ITEM!\n", "Take the output from this query and put it into the **ANSWER NUMBER 2** field in the Google Form." 
- ] + ], + "id": "d8aae59c" }, { "attachments": {}, "cell_type": "markdown", - "id": "e45de51e-f54b-4788-8fb3-2aadc9143533", "metadata": {}, "source": [ "## QUERY 3: Top selling Location" - ] + ], + "id": "bb9b91f5" }, { "cell_type": "code", "execution_count": 9, - "id": "923bf8d1-6869-4448-9916-80e4f1b6e3f0", "metadata": {}, "outputs": [], "source": [ @@ -349,22 +348,22 @@ "# Return the top-selling location and transaction count\n", "\n", "print(\"Top-Selling Location : \",location_name,\"With transaction of Count \",transaction_count)" - ] + ], + "id": "2863eb28" }, { "attachments": {}, "cell_type": "markdown", - "id": "153c9dd5-1804-42c6-b55e-ce043ee07a84", "metadata": {}, "source": [ "### ACTION ITEM!\n", "Take the output from this query and put it into the **ANSWER NUMBER 3** field in the Google Form." - ] + ], + "id": "2e1ec36a" }, { "attachments": {}, "cell_type": "markdown", - "id": "93934fde-c22e-4bda-992f-ed01dc83283c", "metadata": {}, "source": [ "## Clean up and submit!\n", @@ -372,12 +371,12 @@ "**Make sure to click submit on your Google Form to make sure you've been entered into the SingleStore NOW 2024 raffle!**\n", "\n", "Additionally, if you'd like to clean up your instance, you can run the statement below. To learn more about SingleStore, please connect with one of our SingleStore reps here at the conference!" - ] + ], + "id": "793d7a60" }, { "attachments": {}, "cell_type": "markdown", - "id": "599ca6e3-3847-467a-8a33-8f91e52a9cd1", "metadata": {}, "source": [ "
    \n", @@ -387,23 +386,24 @@ "

    If you created a new database in your Standard or Premium Workspace, you can drop the database by running the cell below. Note: this will not drop your database for Free Starter Workspaces. To drop a Free Starter Workspace, terminate the Workspace using the UI.

    \n", "
    \n", "
    " - ] + ], + "id": "29c7c0ec" }, { "cell_type": "code", "execution_count": 10, - "id": "2f4cf3c7-5e1f-442e-8b6e-e4f106ded82b", "metadata": {}, "outputs": [], "source": [ "shared_tier_check = %sql show variables like 'is_shared_tier'\n", "if not shared_tier_check or shared_tier_check[0][1] == 'OFF':\n", " %sql DROP DATABASE IF EXISTS new_transactions;" - ] + ], + "id": "ca401f18" }, { + "id": "35f88772", "cell_type": "markdown", - "id": "760cef98-671d-4754-bab4-67dc6f38c209", "metadata": {}, "source": [ "
    \n", diff --git a/notebooks/unified-data-analysis-sql-nosql-kai/notebook.ipynb b/notebooks/unified-data-analysis-sql-nosql-kai/notebook.ipynb index acab8263..5f429703 100644 --- a/notebooks/unified-data-analysis-sql-nosql-kai/notebook.ipynb +++ b/notebooks/unified-data-analysis-sql-nosql-kai/notebook.ipynb @@ -1,8 +1,8 @@ { "cells": [ { + "id": "1a6d5359", "cell_type": "markdown", - "id": "568d434c-21e4-417c-aecb-d803c39cd3de", "metadata": {}, "source": [ "
    \n", @@ -19,25 +19,24 @@ { "attachments": {}, "cell_type": "markdown", - "id": "1744aa42-1d83-4ffe-9633-34bca98f537c", "metadata": {}, "source": [ "# Unified Data Analysis: SQL & NoSQL on a Single Database with Kai" - ] + ], + "id": "6c2d8700" }, { "attachments": {}, "cell_type": "markdown", - "id": "7f3d6b1c-9365-4113-8963-6aeaf08efbd2", "metadata": {}, "source": [ "" - ] + ], + "id": "ced3b94d" }, { "attachments": {}, "cell_type": "markdown", - "id": "cacc9529-3715-4854-8d7f-05543df12c15", "metadata": {}, "source": [ "### What you will learn in this notebook:\n", @@ -50,45 +49,45 @@ "2. Analyze data using both NoSQL and relational approaches, depending on your specific needs. Developers and data analytics who are familiar with different programming approaches like MongoDB query language and SQL can work together on the same database. Perform familiar SQL queries on your NoSQL data!\n", "\n", "Ready to unlock real-time analytics and unified data access? Let's start!" - ] + ], + "id": "3eae9c76" }, { "cell_type": "code", "execution_count": 1, - "id": "e87ac44e-ff79-466f-9c17-9f9667ad8089", "metadata": {}, "outputs": [], "source": [ "!pip install pymongo prettytable matplotlib --quiet" - ] + ], + "id": "33a2b51c" }, { "attachments": {}, "cell_type": "markdown", - "id": "10a03a4a-2cfa-4ecd-b2ef-97c6bfa6f755", "metadata": {}, "source": [ "### Create database for importing data from different sources\n", "\n", "This example gets banking data from three different sources: ATM locations from S3, transaction data from MySQL and user profile details from MongoDB databases. 
Joins data from different sources to generate rich insights about the transactional activity across user profile and locations across the globe" - ] + ], + "id": "18e3e006" }, { "cell_type": "code", "execution_count": 2, - "id": "d0463f3f-419a-4a4b-be77-ef02a027f8aa", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "DROP DATABASE IF EXISTS BankingAnalytics;\n", "CREATE DATABASE BankingAnalytics;" - ] + ], + "id": "a77b82b3" }, { "attachments": {}, "cell_type": "markdown", - "id": "a3335e3a-1fad-4857-975d-81d4a5205f08", "metadata": {}, "source": [ "
    \n", @@ -98,48 +97,48 @@ "

    Make sure to select 'BankingAnalytics' database from the drop-down menu at the top of this notebook. It updates the connection_url to connect to that database.

    \n", "
    \n", "
    " - ] + ], + "id": "08515681" }, { "attachments": {}, "cell_type": "markdown", - "id": "ff27f40c-4d92-4597-91d8-5e2c43ed32ed", "metadata": {}, "source": [ "" - ] + ], + "id": "047ed31a" }, { "attachments": {}, "cell_type": "markdown", - "id": "629241ce-8464-4834-bad3-3f88af675e59", "metadata": {}, "source": [ "## Setup CDC from MySQL" - ] + ], + "id": "aa69ee29" }, { "attachments": {}, "cell_type": "markdown", - "id": "e99f3b06-82d3-47d0-a3f9-d93d08f3e9a2", "metadata": {}, "source": [ "### SingleStore allows you to ingest the data from mysql using pipelines" - ] + ], + "id": "cab7be5d" }, { "attachments": {}, "cell_type": "markdown", - "id": "1807cb20-ddc8-4f8c-97c0-c473e8049768", "metadata": {}, "source": [ "In this step, we create a link from MySQL instance and start the pipelines for the CDC" - ] + ], + "id": "d37b87da" }, { "cell_type": "code", "execution_count": 3, - "id": "37cf747a-286b-4f37-8d8a-7b191fec1366", "metadata": {}, "outputs": [], "source": [ @@ -156,52 +155,52 @@ " \"database.password\": \"Password@123\",\n", " \"database.user\": \"repl_user\"\n", " }';" - ] + ], + "id": "fa97ca56" }, { "cell_type": "code", "execution_count": 4, - "id": "da44f747-85c0-400b-b155-be55c860f076", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "CREATE TABLES AS INFER PIPELINE AS LOAD DATA LINK mysqllink \"*\" FORMAT AVRO;" - ] + ], + "id": "d1025983" }, { "cell_type": "code", "execution_count": 5, - "id": "4b230ff8-913a-4f67-83d7-778a514f8136", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "START ALL PIPELINES;" - ] + ], + "id": "e9d91511" }, { "attachments": {}, "cell_type": "markdown", - "id": "af22186a-9989-4a73-8adc-6d8532c70cd6", "metadata": {}, "source": [ "### Migrate the data from S3 storage to SingleStore using Pipelines" - ] + ], + "id": "dd0bcb77" }, { "attachments": {}, "cell_type": "markdown", - "id": "4d51519a-9e5e-43cd-95bd-07f4b59f80c7", "metadata": {}, "source": [ "This steps loads data from S3, this requires the tables to be defined 
beforehand" - ] + ], + "id": "802e47d1" }, { "cell_type": "code", "execution_count": 6, - "id": "82260c10-9482-4997-8208-5a8d6bff48d2", "metadata": {}, "outputs": [], "source": [ @@ -215,12 +214,12 @@ " latitude DECIMAL(9, 6),\n", " longitude DECIMAL(9, 6)\n", ");" - ] + ], + "id": "dd05a31c" }, { "cell_type": "code", "execution_count": 7, - "id": "689ec731-2a2a-4035-9dc3-fead074240cc", "metadata": {}, "outputs": [], "source": [ @@ -230,43 +229,43 @@ "CONFIG '{\"region\":\"ap-southeast-1\"}'\n", "SKIP DUPLICATE KEY ERRORS\n", "INTO TABLE atm_locations;" - ] + ], + "id": "5ba219df" }, { "cell_type": "code", "execution_count": 8, - "id": "5c482f6a-c053-438e-a671-a3a7894052a7", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "START PIPELINE atmlocations" - ] + ], + "id": "94eebb51" }, { "attachments": {}, "cell_type": "markdown", - "id": "806952a2-090c-48aa-a1e4-8c71cd66143d", "metadata": {}, "source": [ "### Setup CDC from MongoDB to SingleStore" - ] + ], + "id": "56a58831" }, { "attachments": {}, "cell_type": "markdown", - "id": "1c2d83fa-6a94-4a2b-a657-0e3f8ea0bf8b", "metadata": {}, "source": [ "Now we setup CDC from MongoDB to replicate the data SingleStore\n", "\n", "The collections to be replicated are specified as a comma separated or in a wildcard format in \"collection.include.list\"" - ] + ], + "id": "87444b9b" }, { "cell_type": "code", "execution_count": 9, - "id": "fb852b9c-51a6-4d8c-a4a2-f0cb6e78dc60", "metadata": {}, "outputs": [], "source": [ @@ -283,166 +282,166 @@ " \"mongodb.user\":\"mongo_sample_reader\",\n", " \"mongodb.password\":\"SingleStoreRocks27017\"\n", " }';" - ] + ], + "id": "b84d21a8" }, { "cell_type": "code", "execution_count": 10, - "id": "ae958c77-2e84-4434-9dcd-205837bd5f02", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "CREATE TABLES AS INFER PIPELINE AS LOAD DATA LINK mongo '*' FORMAT AVRO;" - ] + ], + "id": "4e58d374" }, { "cell_type": "code", "execution_count": 11, - "id": "32b32f1e-072b-4273-952b-6772b4a8e380", 
"metadata": {}, "outputs": [], "source": [ "%%sql\n", "SHOW PIPELINES" - ] + ], + "id": "1f3b9fe9" }, { "cell_type": "code", "execution_count": 12, - "id": "3a91eefe-6d82-453b-8d3d-10344baae466", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "START ALL PIPELINES" - ] + ], + "id": "2c088673" }, { "attachments": {}, "cell_type": "markdown", - "id": "c0d54948-e440-43f7-963b-8d6b2c1b6129", "metadata": {}, "source": [ "### Check for records in tables" - ] + ], + "id": "8b6caff3" }, { "attachments": {}, "cell_type": "markdown", - "id": "e45a1cb9-d884-4ac5-91dd-c8ded628bcb0", "metadata": {}, "source": [ "Data from MySQL" - ] + ], + "id": "3e469dab" }, { "cell_type": "code", "execution_count": 13, - "id": "c288f83a-9495-4baa-940f-9fe8400ab93b", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "SELECT COUNT(*) FROM transactions" - ] + ], + "id": "44688a3d" }, { "cell_type": "code", "execution_count": 14, - "id": "f8604704-9324-4630-9eec-2c3f6be3c853", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "SELECT * FROM transactions WHERE transaction_type LIKE '%Deposit%' LIMIT 1;" - ] + ], + "id": "d3fc1c14" }, { "attachments": {}, "cell_type": "markdown", - "id": "0a39438a-a537-490e-9191-77c9b224bbb4", "metadata": {}, "source": [ "Data from S3" - ] + ], + "id": "37088621" }, { "cell_type": "code", "execution_count": 15, - "id": "5d26d20e-ef26-4d73-8a6a-060716e5f5ae", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "SELECT COUNT(*) FROM atm_locations" - ] + ], + "id": "ed1670ee" }, { "cell_type": "code", "execution_count": 16, - "id": "62eced11-86b7-4c66-a7a8-ad197460e99d", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "SELECT * FROM atm_locations LIMIT 1;" - ] + ], + "id": "367930d2" }, { "attachments": {}, "cell_type": "markdown", - "id": "a06fc97a-59ec-4134-a3cf-016b39108374", "metadata": {}, "source": [ "Data from MongoDB" - ] + ], + "id": "2f11225d" }, { "cell_type": "code", "execution_count": 17, - "id": 
"40c77598-464b-4d91-b36e-89569e1ecb12", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "SELECT _id:>JSON, _more:>JSON FROM profile LIMIT 1;" - ] + ], + "id": "0150f6bc" }, { "cell_type": "code", "execution_count": 18, - "id": "a4694186-b064-465f-9e4f-9a33b1d6ae76", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "\n", "SELECT _id:>JSON, _more:>JSON FROM history LIMIT 1;" - ] + ], + "id": "21fbf852" }, { "attachments": {}, "cell_type": "markdown", - "id": "40ec6ef2-dd95-43b7-947a-16c80efc359a", "metadata": {}, "source": [ "### Join tables from different sources using SQL queries" - ] + ], + "id": "3f3dd776" }, { "attachments": {}, "cell_type": "markdown", - "id": "3bfb62de-ae36-4824-881f-c858cb852c01", "metadata": {}, "source": [ "SQL Query 1: View Users details, their associated ATMs" - ] + ], + "id": "1e1ade40" }, { "cell_type": "code", "execution_count": 19, - "id": "9efacacd-b36e-4208-a360-3ca239d1fad3", "metadata": {}, "outputs": [], "source": [ @@ -460,21 +459,21 @@ "WHERE\n", " p._more::$account_id = a.id\n", "LIMIT 10;" - ] + ], + "id": "9eb6e86c" }, { "attachments": {}, "cell_type": "markdown", - "id": "2cce1897-681f-4136-807e-711ed78ed80b", "metadata": {}, "source": [ "SQL Query 2: View Users details, their associated ATMs and transaction details" - ] + ], + "id": "c8147f4a" }, { "cell_type": "code", "execution_count": 20, - "id": "82d52f1a-35e8-4f59-99ed-cbd65842f699", "metadata": {}, "outputs": [], "source": [ @@ -497,21 +496,21 @@ "LEFT JOIN\n", " transactions t ON p._more::$account_id = t.account_id\n", "LIMIT 10;" - ] + ], + "id": "9baa53de" }, { "attachments": {}, "cell_type": "markdown", - "id": "2fb24ac0-1579-4ddb-8009-efb36aaff2d6", "metadata": {}, "source": [ "### Run queries in Mongo Query Language using Kai" - ] + ], + "id": "ed30b745" }, { "cell_type": "code", "execution_count": 21, - "id": "d7f5fa93-473e-4e0b-96ca-e743cb75684d", "metadata": {}, "outputs": [], "source": [ @@ -527,12 +526,12 @@ "\n", "for profile in 
profile_coll.find().limit(1):\n", " pprint.pprint(profile)" - ] + ], + "id": "42e097e7" }, { "cell_type": "code", "execution_count": 22, - "id": "dd303812-2ec2-4e84-b436-617591099a06", "metadata": {}, "outputs": [], "source": [ @@ -585,12 +584,12 @@ " ])\n", "\n", "print(table)" - ] + ], + "id": "a3441786" }, { "cell_type": "code", "execution_count": 23, - "id": "00e2c386-2ae5-478d-be0a-c9c07005745e", "metadata": {}, "outputs": [], "source": [ @@ -604,12 +603,12 @@ "]\n", "\n", "pprint.pprint(list(profile_coll.aggregate(pipeline)))" - ] + ], + "id": "fe24435f" }, { "cell_type": "code", "execution_count": 24, - "id": "a0e0be36-0605-426b-a84a-1c47bab2f7cb", "metadata": {}, "outputs": [], "source": [ @@ -623,19 +622,20 @@ "\n", "plt.bar(country,count)\n", "plt.plot()" - ] + ], + "id": "1e919cbe" }, { "cell_type": "markdown", - "id": "3636037e", "metadata": {}, "source": [ "With SingleStore Kai you can power analytics on SQL and NoSQL data using the API of your choice" - ] + ], + "id": "f05705e2" }, { + "id": "2abf5b5f", "cell_type": "markdown", - "id": "32756572-0488-4ac0-bf10-1a47d84bc79f", "metadata": {}, "source": [ "
    \n", diff --git a/notebooks/vector-database-basics/notebook.ipynb b/notebooks/vector-database-basics/notebook.ipynb index 519c81f3..960852c8 100644 --- a/notebooks/vector-database-basics/notebook.ipynb +++ b/notebooks/vector-database-basics/notebook.ipynb @@ -1,8 +1,8 @@ { "cells": [ { + "id": "76ae2947", "cell_type": "markdown", - "id": "1794de89-5ae3-44d3-bfa6-3d41c4f4deba", "metadata": {}, "source": [ "
    \n", @@ -21,7 +21,8 @@ "metadata": {}, "source": [ "**Required Installations**" - ] + ], + "id": "3990f5b0" }, { "cell_type": "code", @@ -30,7 +31,8 @@ "outputs": [], "source": [ "!pip install openai numpy pandas singlestoredb langchain==0.1.8 langchain-community==0.0.21 langchain-core==0.1.25 langchain-openai==0.0.6" - ] + ], + "id": "f35b269b" }, { "cell_type": "markdown", @@ -46,7 +48,8 @@ "- Percentage of consonants in the word\n", "\n", "This is a simple implementation of a **rule** based system to demonstrate the essence of what vector embedding models do. However, they utlize neural networks that are trained on vast datasets to learn key features and self-corrects using gradient descent." - ] + ], + "id": "62525eb4" }, { "cell_type": "code", @@ -80,7 +83,8 @@ "word = \"example\"\n", "vector = word_to_vector(word)\n", "print(f\"Word: {word}\\nVector: {vector}\")" - ] + ], + "id": "e856e1c0" }, { "cell_type": "markdown", @@ -91,7 +95,8 @@ "In this example, we demonstrate a way to determine the similarity between two vectors. There are many techniques to find the similiarity between two vectors but one of the most popular ways is using **cosine similarity**. Consine similarity is the the dot product between the two vectors divided by the product of the vector's normals (magnitudes).\n", "\n", "This is just an example to show how vector databases search for similar vectors. The fundamental problem with a system like this is our rule-based embedding because it does not give us a semantic understanding of the word/sentences/paragraphs. Instead, it gives us a classification of a single word's structure." 
- ] + ], + "id": "930b3443" }, { "cell_type": "code", @@ -120,7 +125,8 @@ "# Calculate and print cosine similarity\n", "similarity_score = cosine_similarity(vector1, vector2)\n", "print(f\"Cosine similarity between '{word1}' and '{word2}': {similarity_score}\")" - ] + ], + "id": "f42ed680" }, { "cell_type": "markdown", @@ -131,7 +137,8 @@ "In order to generate semantic understanding of language within vectors, embedding models are required. Embedding models are trained on vast corpus of language data. Training embedding models starts by initializing word embeddings with random vectors. Each word in the vocabulary is assigned a vector of real numbers. They use neural networks trained on large datasets to predict a word from its context (Continuous Bag of Words model) or to predict the context given a word (Skip-Gram model). During training, the model adjusts the word vectors to minimize some loss function, often related to the likelihood of observing a word given its context (or vice versa) through gradient descent.\n", "\n", "Examples of embedding models include Word2Vec, GloVe, BERT, OpenAI text-embedding." - ] + ], + "id": "959b415b" }, { "cell_type": "code", @@ -152,14 +159,16 @@ " return response.data[0].embedding\n", "\n", "print(openAIEmbeddings(\"Golden Retreiver\"))" - ] + ], + "id": "e272c367" }, { "cell_type": "markdown", "metadata": {}, "source": [ "As you can see, this is a huge vector! Over 1000 dimensions just in this one vector. This is why it is important for us to have good dimensionality reduction techniques during the similarity searches." - ] + ], + "id": "c9caf1e8" }, { "cell_type": "markdown", @@ -170,7 +179,8 @@ "In the following code we create a vector datbase with SingleStoreDB. We utilize Langchain to chunk and split the raw text into documents and use the OpenAI embeddings model to generate the vector embeddings. 
We then take the raw documents and embeddings and create a table with the columns \"docs\" and \"embeddings\".\n", "\n", "To test this out, we perform a similarity search based on a query and it returns the most similar document in the vector database." - ] + ], + "id": "e6fc6afd" }, { "cell_type": "code", @@ -204,7 +214,8 @@ "query = \"How old was Michael Jackson when he died?\"\n", "docs = vector_database.similarity_search(query)\n", "print(docs[0].page_content)" - ] + ], + "id": "edf5b87e" }, { "cell_type": "markdown", @@ -213,7 +224,8 @@ "## Retrieval Augmented Generation System\n", "\n", "RAG combines large language models with a retrieval mechanism to search a database for relevant information before generating responses. It utilizes real-world data from retrieved documents to ground responses, enhancing factual accuracy and reducing hallucinations. Documents are vectorized using embeddings and stored in a vector database for efficient retrieval. SingleStoreDB serves as a great vector database. The user query is converted into a vector, and a vector search is performed in the database to find documents relevant to that specific query. The system returns the documents with the highest relevance scores, which are then fed to the chatbot for generating informed responses." - ] + ], + "id": "a33d3409" }, { "cell_type": "code", @@ -276,11 +288,12 @@ "\n", " else:\n", " print(\"AI: Sorry, I couldn't find relevant information.\")" - ] + ], + "id": "9700834d" }, { + "id": "983cd74d", "cell_type": "markdown", - "id": "03050224-26e0-4015-a05c-a5025fd233d0", "metadata": {}, "source": [ "
    \n", @@ -301,5 +314,5 @@ } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 5 } diff --git a/notebooks/vector-search-with-kai/notebook.ipynb b/notebooks/vector-search-with-kai/notebook.ipynb index 3123c328..34454db2 100644 --- a/notebooks/vector-search-with-kai/notebook.ipynb +++ b/notebooks/vector-search-with-kai/notebook.ipynb @@ -1,8 +1,8 @@ { "cells": [ { + "id": "d119cd5f", "cell_type": "markdown", - "id": "aa202b34-20c7-4021-a142-d959a81d2cfd", "metadata": {}, "source": [ "
    \n", @@ -18,34 +18,33 @@ }, { "cell_type": "markdown", - "id": "b1eef9c6-fcc6-4d1c-b280-102eea62a5ec", "metadata": {}, "source": [ "## Vector Search with Kai" - ] + ], + "id": "0d0e07b6" }, { "cell_type": "markdown", - "id": "078b13da-9901-4be5-b472-25c8cd8e01c4", "metadata": {}, "source": [ "In this notebook, we load a dataset into a collection, create a vector index and perform vector searches using Kai in a way that is compatible with MongoDB clients and applications" - ] + ], + "id": "866a5240" }, { "cell_type": "code", "execution_count": 1, - "id": "2b25990e-8ef4-4759-b828-98d8fbe092f0", "metadata": {}, "outputs": [], "source": [ "!pip install datasets --quiet" - ] + ], + "id": "62d44fb0" }, { "cell_type": "code", "execution_count": 2, - "id": "b305676b-f253-45f9-8b97-d9c5074eae6f", "metadata": {}, "outputs": [], "source": [ @@ -57,83 +56,83 @@ "from pymongo import MongoClient\n", "from datasets import load_dataset\n", "from bson import json_util" - ] + ], + "id": "884be378" }, { "cell_type": "markdown", - "id": "8dd06cce-e988-47b3-88f0-82833b8111a9", "metadata": {}, "source": [ "### 1. Initializing a pymongo client" - ] + ], + "id": "50e5eb0e" }, { "cell_type": "code", "execution_count": 3, - "id": "4a41b5af-0c80-4d4b-bad3-503cb8ee91a5", "metadata": {}, "outputs": [], "source": [ "current_database = %sql SELECT DATABASE() as CurrentDatabase\n", "DB = current_database[0][0]\n", "COLLECTION = 'wiki_embeddings'" - ] + ], + "id": "69eb1b7b" }, { "cell_type": "code", "execution_count": 4, - "id": "4c68f060-4f40-4d3d-939e-b0d524514245", "metadata": {}, "outputs": [], "source": [ "# Using the environment variable that holds the kai endpoint\n", "client = MongoClient(connection_url_kai)\n", "collection = client[DB][COLLECTION]" - ] + ], + "id": "2ad21ed8" }, { "cell_type": "markdown", - "id": "383015eb-9d56-40f5-9fd3-fec01934fcdf", "metadata": {}, "source": [ "### 2. 
Create a collection and load the dataset" - ] + ], + "id": "9e2264ff" }, { "cell_type": "markdown", - "id": "3d9261a0-28fe-474c-b89e-1c237b2681f9", "metadata": {}, "source": [ "It is recommended that you create a collection with the embedding field as a top level column for optimized utilization of storage. The name of the column should be the name of the field holding the embedding" - ] + ], + "id": "c2cf8565" }, { "cell_type": "code", "execution_count": 5, - "id": "c88d16fe-0746-40f2-abcc-4d3c3036165e", "metadata": {}, "outputs": [], "source": [ "client[DB].create_collection(COLLECTION,\n", " columns=[{ 'id': \"emb\", 'type': \"VECTOR(768) NOT NULL\" }],\n", ");" - ] + ], + "id": "9594856b" }, { "cell_type": "code", "execution_count": 6, - "id": "f76d2205-3e98-4969-be80-db59bd64467d", "metadata": {}, "outputs": [], "source": [ "# Using the \"wikipedia-22-12-simple-embeddings\" dataset from Hugging Face\n", "dataset = load_dataset(\"Cohere/wikipedia-22-12-simple-embeddings\", split=\"train\")" - ] + ], + "id": "e9ef7506" }, { "cell_type": "code", "execution_count": 7, - "id": "ca0f4601-ca6a-4e87-9939-2bef27c60c42", "metadata": {}, "outputs": [], "source": [ @@ -161,39 +160,39 @@ "if len(insert_data) > 0:\n", " collection.insert_many(insert_data)\n", " print(\"Data Ingested\")" - ] + ], + "id": "647d6232" }, { "cell_type": "markdown", - "id": "cefa9acd-3779-4dcc-9117-8b12a27abae2", "metadata": {}, "source": [ "A sample document from the collection" - ] + ], + "id": "0e512df2" }, { "cell_type": "code", "execution_count": 8, - "id": "7595312b-88ce-485d-9582-9c7e9ece3e11", "metadata": {}, "outputs": [], "source": [ "sample_doc = collection.find_one()\n", "pprint.pprint(sample_doc, compact=True)" - ] + ], + "id": "274582aa" }, { "cell_type": "markdown", - "id": "9876f080-e500-43df-afa5-04ca31b83524", "metadata": {}, "source": [ "### 3. 
Create a vector Index" - ] + ], + "id": "adbdd29f" }, { "cell_type": "code", "execution_count": 9, - "id": "d81aeb45-5bdb-429d-8710-10d4c25f2941", "metadata": {}, "outputs": [], "source": [ @@ -205,39 +204,39 @@ " 'kaiSearchOptions': {\"index_type\":\"AUTO\", \"metric_type\": \"EUCLIDEAN_DISTANCE\", \"dimensions\": 768}\n", " }],\n", "})" - ] + ], + "id": "aa1945d5" }, { "cell_type": "markdown", - "id": "5d44f05f-9316-4934-b856-213dbb540fa4", "metadata": {}, "source": [ "Selecting the query embedding from the sample_doc selected above" - ] + ], + "id": "3acb0065" }, { "cell_type": "code", "execution_count": 10, - "id": "007e376b-e969-479c-97fc-a85ac1c50b56", "metadata": {}, "outputs": [], "source": [ "# input vector\n", "query_vector = sample_doc['emb']" - ] + ], + "id": "808f1cbd" }, { "cell_type": "markdown", - "id": "e36426ec-8fc9-45d8-83b4-a261c3d9c8bf", "metadata": {}, "source": [ "### 4. Perform a vector search" - ] + ], + "id": "6d868667" }, { "cell_type": "code", "execution_count": 11, - "id": "0f9f9126-12a9-41cc-9f93-5309202fea98", "metadata": {}, "outputs": [], "source": [ @@ -261,30 +260,30 @@ " ]\n", " results = collection.aggregate(pipeline)\n", " return list(results)" - ] + ], + "id": "702698e1" }, { "cell_type": "code", "execution_count": 12, - "id": "69af232c-5c24-49ce-a28e-b982ab12b696", "metadata": {}, "outputs": [], "source": [ "execute_kai_search(query_vector)" - ] + ], + "id": "ccbd0433" }, { "cell_type": "markdown", - "id": "a1cbe6a2-3bef-4727-8920-e6af2801477e", "metadata": {}, "source": [ "Running concurrent vector search queries" - ] + ], + "id": "79a09da6" }, { "cell_type": "code", "execution_count": 13, - "id": "32972b4e-880d-4084-8a58-05b8787005a8", "metadata": {}, "outputs": [], "source": [ @@ -304,19 +303,20 @@ " print(f.exception())\n", "failed_count = sum(1 for f in futures if f.exception() is not None)\n", "print(f\"Failed queries: {failed_count}\")" - ] + ], + "id": "15224548" }, { "cell_type": "markdown", - "id": 
"a26070f4-7bfb-4ee7-8f48-d74e80c9966e", "metadata": {}, "source": [ "This shows the Kai can create vector indexes instantaneously and perform a large number of concurrent vector search queries surpassing MongoDB Atlas Vector Search capabilities" - ] + ], + "id": "fdb6d999" }, { + "id": "8403dbcd", "cell_type": "markdown", - "id": "8cfd1834-f238-4845-aa80-bf2bf82dd221", "metadata": {}, "source": [ "
    \n", diff --git a/notebooks/working-with-vector-data/notebook.ipynb b/notebooks/working-with-vector-data/notebook.ipynb index d51b6a9a..e02c3037 100644 --- a/notebooks/working-with-vector-data/notebook.ipynb +++ b/notebooks/working-with-vector-data/notebook.ipynb @@ -1,8 +1,8 @@ { "cells": [ { + "id": "3360077f", "cell_type": "markdown", - "id": "b80549ed-1471-4fc5-8b1a-a3050246078a", "metadata": {}, "source": [ "
    \n", @@ -17,8 +17,8 @@ ] }, { + "id": "fc325bcb", "cell_type": "markdown", - "id": "09cb34bc", "metadata": {}, "source": [ "
    \n", @@ -33,7 +33,6 @@ { "attachments": {}, "cell_type": "markdown", - "id": "782ffea9-fbc0-4942-8a1a-da8788ed2fec", "metadata": {}, "source": [ "Using vector embeddings has become popular recently, but getting vector data into your\n", @@ -43,12 +42,12 @@ "of numerics.\n", "\n", "We'll use the following function to reset the vector data table between examples." - ] + ], + "id": "be8595c6" }, { "cell_type": "code", "execution_count": 1, - "id": "f7fe2c95-9e0d-4b1b-ad24-d0536c4ef2d9", "metadata": {}, "outputs": [], "source": [ @@ -62,34 +61,34 @@ " vec_f32 BLOB\n", " );\n", " ''')" - ] + ], + "id": "d510653d" }, { "attachments": {}, "cell_type": "markdown", - "id": "d087092f-696c-4735-9c66-33b8efc885ca", "metadata": {}, "source": [ "At any time, if you want to see the actual query being sent to the database, you can set the following\n", "environment variable before making the query to the server." - ] + ], + "id": "e8e5c551" }, { "cell_type": "code", "execution_count": 2, - "id": "45628671-dee1-41fe-ae77-b8c651c8c389", "metadata": {}, "outputs": [], "source": [ "import os\n", "\n", "# os.environ['SINGLESTOREDB_DEBUG_QUERIES'] = '1'" - ] + ], + "id": "cb254279" }, { "attachments": {}, "cell_type": "markdown", - "id": "9b1cd9d3", "metadata": {}, "source": [ "
    \n", @@ -99,21 +98,21 @@ "

    If you have a Free Starter Workspace deployed already, select the database from drop-down menu at the top of this notebook. It updates the connection_url to connect to that database.

    \n", "
    \n", "
    " - ] + ], + "id": "8ebe9269" }, { "attachments": {}, "cell_type": "markdown", - "id": "e2e322f5-b81d-4249-b512-bd36f88aa168", "metadata": {}, "source": [ "Create a database for our examples." - ] + ], + "id": "a137c9ec" }, { "cell_type": "code", "execution_count": 3, - "id": "fec12f93-7ca6-4f77-bc7f-355b0bfa98f9", "metadata": {}, "outputs": [], "source": [ @@ -121,12 +120,12 @@ "if not shared_tier_check or shared_tier_check[0][1] == 'OFF':\n", " %sql DROP DATABASE IF EXISTS vector_data;\n", " %sql CREATE DATABASE vector_data;" - ] + ], + "id": "cff6e948" }, { "attachments": {}, "cell_type": "markdown", - "id": "1f2db020-7f76-44d0-9b32-cc81d35979ef", "metadata": {}, "source": [ "
    \n", @@ -136,34 +135,34 @@ "

    Make sure to select the vector_data database from the drop-down menu at the top of this notebook. It updates the connection_url which is used by the %%sql magic command and SQLAlchemy to make connections to the selected database.

    \n", "
    \n", "
    " - ] + ], + "id": "0082edbf" }, { "attachments": {}, "cell_type": "markdown", - "id": "3e65bd3b-49b4-48ca-8409-e3da89ebcce4", "metadata": {}, "source": [ "## Generate numpy arrays containing vector data\n", "\n", "The code belowe generates 1,000 rows of 10 random 32-bit float numbers in a numpy array.\n", "This data will be used in the following examples." - ] + ], + "id": "ee33fc8b" }, { "cell_type": "code", "execution_count": 4, - "id": "c9fd6e9f-5513-45b3-bc4f-395e115ccd9e", "metadata": {}, "outputs": [], "source": [ "import numpy as np" - ] + ], + "id": "9e673f32" }, { "cell_type": "code", "execution_count": 5, - "id": "aab67ef8-8bd1-4f9e-957a-ac8248984f4f", "metadata": {}, "outputs": [ { @@ -178,24 +177,24 @@ "source": [ "vec_f32 = [np.random.rand(10).astype(np.float32) for _ in range(1000)]\n", "vec_f32[:3]" - ] + ], + "id": "a4bcc69a" }, { "attachments": {}, "cell_type": "markdown", - "id": "1539013e-6ad8-49cc-aafd-e0aa5c2dbf60", "metadata": {}, "source": [ "### Create a Python list of float values from the numpy array\n", "\n", "We will show how to work with both numpy arrays and Python lists in the following examples.\n", "This cell creates a list of Python lists of floats equivalent to the numpy arrays above." - ] + ], + "id": "897a7548" }, { "cell_type": "code", "execution_count": 6, - "id": "c72202fa-3a15-42a0-83f2-2650a6d5faa6", "metadata": {}, "outputs": [ { @@ -210,33 +209,33 @@ "source": [ "vec_f32_list = [list([float(y) for y in x]) for x in vec_f32]\n", "vec_f32_list[:3]" - ] + ], + "id": "e7796c8b" }, { "attachments": {}, "cell_type": "markdown", - "id": "ebe71955-7358-4c7c-add8-162f5bca098a", "metadata": {}, "source": [ "## Upload and downloading data to SingleStoreDB\n", "\n", "In the following sections, we'll describe how to use the SingleStoreDB Python client, SQLAlchemy, the `%%sql` magic,\n", "and pandas to upload and download vector data." 
- ] + ], + "id": "61603e2f" }, { "attachments": {}, "cell_type": "markdown", - "id": "2860a4f6-bfc6-4bc0-89d8-6c9d765f1240", "metadata": {}, "source": [ "### Using SingleStoreDB Python client" - ] + ], + "id": "be41864b" }, { "cell_type": "code", "execution_count": 7, - "id": "35cd7e37-d05f-424f-98c7-ae61958c42d5", "metadata": {}, "outputs": [], "source": [ @@ -244,52 +243,52 @@ "\n", "conn = s2.connect()\n", "cursor = conn.cursor()" - ] + ], + "id": "1fb204e1" }, { "attachments": {}, "cell_type": "markdown", - "id": "66e77736-4625-481b-9991-d7e7f28401cb", "metadata": {}, "source": [ "#### Working with numpy arrays" - ] + ], + "id": "661762a8" }, { "attachments": {}, "cell_type": "markdown", - "id": "2d1453cd-21d2-4843-a41a-6aa1a33ce0a1", "metadata": {}, "source": [ "The SingleStoreDB Python client supports numpy arrays natively. If a numpy array is passed as a parameter to a query,\n", "it will be converted to a byte string containing the contents of the array. The data type of the numpy array is\n", "preserved, so you need to ensure that it is the proper numpy dtype before uploading. You can change the data type\n", "of a numpy array by using the `astype` method." - ] + ], + "id": "4a965d17" }, { "cell_type": "code", "execution_count": 8, - "id": "5fa23885-106d-4b37-ade2-d7b6e6c8b593", "metadata": {}, "outputs": [], "source": [ "reset_table()" - ] + ], + "id": "aa3823b6" }, { "attachments": {}, "cell_type": "markdown", - "id": "a752e82f-bdf9-442e-94eb-9e29459da840", "metadata": {}, "source": [ "Recall that `vec_f32` contained numpy arrays of float32 values." 
- ] + ], + "id": "1b719316" }, { "cell_type": "code", "execution_count": 9, - "id": "9fcdb1ce-254b-4420-815e-76cb2199ac05", "metadata": {}, "outputs": [ { @@ -303,21 +302,21 @@ ], "source": [ "vec_f32[:3]" - ] + ], + "id": "6803d53a" }, { "attachments": {}, "cell_type": "markdown", - "id": "df0f98b0-d916-4113-a34c-e0c13cffa242", "metadata": {}, "source": [ "The `executemany` method will insert multiple rows of data in a single SQL query." - ] + ], + "id": "ec0bbe37" }, { "cell_type": "code", "execution_count": 10, - "id": "b55d0954-9e8c-468b-b1da-019a3adf4fd2", "metadata": {}, "outputs": [ { @@ -331,22 +330,22 @@ ], "source": [ "cursor.executemany('INSERT INTO vectors(vec_f32) VALUES (%s)', vec_f32)" - ] + ], + "id": "4aa8a6e7" }, { "attachments": {}, "cell_type": "markdown", - "id": "f929f1ed-2ee1-4209-a27d-121bec2a3a79", "metadata": {}, "source": [ "To download the vector data from SingleStoreDB, you simple execute a `SELECT` statement. The data is held in\n", "blob columns, so the result will simply contain byte strings." - ] + ], + "id": "de75eb11" }, { "cell_type": "code", "execution_count": 11, - "id": "218071ef-0742-460b-b0a4-b079970ae568", "metadata": {}, "outputs": [ { @@ -360,21 +359,21 @@ ], "source": [ "cursor.execute('SELECT vec_f32 FROM vectors LIMIT 5')" - ] + ], + "id": "c40853f5" }, { "attachments": {}, "cell_type": "markdown", - "id": "22892481-3d71-48aa-abe3-ffd63b309419", "metadata": {}, "source": [ "Since we want to use the data as numpy arrays, we can \"reconstitute\" the arrays as we read the data using the `np.frombuffer` function." 
- ] + ], + "id": "819ab832" }, { "cell_type": "code", "execution_count": 12, - "id": "52bfac93-5503-4144-8700-95db21f13897", "metadata": {}, "outputs": [ { @@ -389,12 +388,12 @@ "source": [ "out_f32 = [np.frombuffer(x[0], dtype=np.float32) for x in cursor]\n", "out_f32" - ] + ], + "id": "204ac834" }, { "attachments": {}, "cell_type": "markdown", - "id": "390b149f-8039-43ee-ae43-215ea7997a4f", "metadata": {}, "source": [ "#### Working with Python lists\n", @@ -409,22 +408,22 @@ "* h - int16\n", "* l - int32\n", "* q - int64" - ] + ], + "id": "aceed0cb" }, { "cell_type": "code", "execution_count": 13, - "id": "5707a569-4361-4d69-a078-5c71bb547dce", "metadata": {}, "outputs": [], "source": [ "reset_table()" - ] + ], + "id": "5118e2c1" }, { "cell_type": "code", "execution_count": 14, - "id": "a0777da4-daba-4b06-8fb6-c7fcc30dcc25", "metadata": {}, "outputs": [ { @@ -444,21 +443,21 @@ "\n", "vec_f32_list_bytes = [struct.pack(fmt, *x) for x in vec_f32_list]\n", "vec_f32_list_bytes[:3]" - ] + ], + "id": "bd0686c1" }, { "attachments": {}, "cell_type": "markdown", - "id": "77a3b930-33cd-4436-a021-9e99ed94cd9c", "metadata": {}, "source": [ "##### The `INSERT` and `SELECT` code is the same as for numy arrays" - ] + ], + "id": "1e684f2d" }, { "cell_type": "code", "execution_count": 15, - "id": "0a1f4d5b-50f1-4987-b8f8-613b2b6f03bd", "metadata": {}, "outputs": [ { @@ -472,12 +471,12 @@ ], "source": [ "cursor.executemany('INSERT INTO vectors(vec_f32) VALUES (%s)', vec_f32_list_bytes)" - ] + ], + "id": "57b00249" }, { "cell_type": "code", "execution_count": 16, - "id": "171acbee-c663-4073-843b-a3f83fa0a99a", "metadata": {}, "outputs": [ { @@ -491,21 +490,21 @@ ], "source": [ "cursor.execute('SELECT vec_f32 FROM vectors LIMIT 5')" - ] + ], + "id": "f87d2f25" }, { "attachments": {}, "cell_type": "markdown", - "id": "b0b40daa-52a9-4bf8-aefd-4722974cb8f5", "metadata": {}, "source": [ "To unpack the rows as Python lists, we use the `struct` package again." 
- ] + ], + "id": "27af11b3" }, { "cell_type": "code", "execution_count": 17, - "id": "63490736-c68b-49d5-8db1-8ec203c7a583", "metadata": {}, "outputs": [ { @@ -520,12 +519,12 @@ "source": [ "out_f32_list = [list(struct.unpack(fmt, x[0])) for x in cursor]\n", "out_f32_list[:3]" - ] + ], + "id": "e874b93f" }, { "attachments": {}, "cell_type": "markdown", - "id": "91927fbd-f19c-448a-926a-d4ee8dc3e607", "metadata": {}, "source": [ "### Using SQLAlchemy\n", @@ -535,12 +534,12 @@ "```\n", "pip install sqlalchemy-singlestoredb\n", "```" - ] + ], + "id": "4bd95c3f" }, { "cell_type": "code", "execution_count": 18, - "id": "7d715701-eba1-4868-b223-a17a6fa4b6ce", "metadata": {}, "outputs": [], "source": [ @@ -548,51 +547,51 @@ "\n", "eng = sa.create_engine(connection_url)\n", "conn = eng.connect()" - ] + ], + "id": "b8efd4ff" }, { "attachments": {}, "cell_type": "markdown", - "id": "6ca1960a-a55e-465c-a4f2-3daeb56e2739", "metadata": {}, "source": [ "The SQLAlchemy method works much like the SingleStoreDB method. However, SQLAlchemy (v2+) requires parameters to be\n", "in a dictionary, and the substitution syntax is of the form `:var_name` where 'var_name' in the key in the dictionary." - ] + ], + "id": "3571d297" }, { "attachments": {}, "cell_type": "markdown", - "id": "473114ce-4b51-484d-90d9-eaafce4d4b58", "metadata": {}, "source": [ "#### Working with numpy arrays" - ] + ], + "id": "af7ab775" }, { "cell_type": "code", "execution_count": 19, - "id": "74707c74-2529-43e1-ba87-b693403b5e8d", "metadata": {}, "outputs": [], "source": [ "reset_table()" - ] + ], + "id": "eac86f33" }, { "attachments": {}, "cell_type": "markdown", - "id": "86eff78c-4b8f-40d1-bc9f-978fd39dada6", "metadata": {}, "source": [ "SQLAlchemy requires you to construct the query as a `sa.text` object. Parameters for inserting multple\n", "rows are in a list of dictionaries." 
- ] + ], + "id": "1d10b9b3" }, { "cell_type": "code", "execution_count": 20, - "id": "03905527-9239-4fd7-9a9b-4c35da0b7447", "metadata": {}, "outputs": [ { @@ -607,40 +606,40 @@ "source": [ "query = sa.text('INSERT INTO vectors(vec_f32) VALUES (:vec_f32)')\n", "conn.execute(query, [dict(vec_f32=x) for x in vec_f32])" - ] + ], + "id": "79903d6c" }, { "attachments": {}, "cell_type": "markdown", - "id": "f95fb2be-e513-4555-b580-118f337e0f19", "metadata": {}, "source": [ "Selecting the data works much as before as well." - ] + ], + "id": "efe51fac" }, { "cell_type": "code", "execution_count": 21, - "id": "d7b22128-188c-475e-a1cb-5c52261d8403", "metadata": {}, "outputs": [], "source": [ "result = conn.execute(sa.text('SELECT vec_f32 FROM vectors LIMIT 5'))" - ] + ], + "id": "7c938e0f" }, { "attachments": {}, "cell_type": "markdown", - "id": "f7bc12cf-6ce6-4c20-8fa4-e83c2cb49e71", "metadata": {}, "source": [ "We can use the `np.frombuffer` function again to convert the byte strings to numpy arrays." - ] + ], + "id": "71b25150" }, { "cell_type": "code", "execution_count": 22, - "id": "3391ee73-86c5-4913-b412-bf4d12fb9b68", "metadata": {}, "outputs": [ { @@ -655,12 +654,12 @@ "source": [ "out_f32 = [np.frombuffer(x[0], dtype=np.float32) for x in result]\n", "out_f32" - ] + ], + "id": "54996a5e" }, { "attachments": {}, "cell_type": "markdown", - "id": "893ff3b8-5f16-4736-b157-52eec72a6fea", "metadata": {}, "source": [ "#### Working with Python lists\n", @@ -668,32 +667,32 @@ "To upload Python lists of values, you use the `struct` package to construct the byte strings as described in the\n", "\"Uploading Python Lists\" in the previous section. The rest of the code here stays the same with the exception of\n", "replacing `vec_f32` with `vec_f32_list_bytes` as the query parameter for the `INSERT` query." 
- ] + ], + "id": "ed7d05d0" }, { "cell_type": "code", "execution_count": 23, - "id": "4a4124d1-588b-408a-9d85-dc2acd8d8f31", "metadata": {}, "outputs": [], "source": [ "reset_table()" - ] + ], + "id": "72fa16ed" }, { "attachments": {}, "cell_type": "markdown", - "id": "3a09006f-6c12-40fd-b3d7-cd3b4c33f040", "metadata": {}, "source": [ "Recall that we create a list of bytes (vector) objects in the previous example. This list of vectors\n", "can be passed to the `INSERT` as well as numpy arrays." - ] + ], + "id": "38b2a3b9" }, { "cell_type": "code", "execution_count": 24, - "id": "465ffe10-cc32-400c-adec-f4e91f25fb98", "metadata": {}, "outputs": [ { @@ -707,12 +706,12 @@ ], "source": [ "vec_f32_list_bytes[:3]" - ] + ], + "id": "a53b2b6e" }, { "cell_type": "code", "execution_count": 25, - "id": "97b2069f-2cd2-4af5-95cc-87637d1fc838", "metadata": {}, "outputs": [ { @@ -727,31 +726,31 @@ "source": [ "query = sa.text('INSERT INTO vectors(vec_f32) VALUES (:vec_f32)')\n", "conn.execute(query, [dict(vec_f32=x) for x in vec_f32_list_bytes])" - ] + ], + "id": "fbb57f59" }, { "cell_type": "code", "execution_count": 26, - "id": "ea364348-8b95-4835-9481-11a7bf67fce0", "metadata": {}, "outputs": [], "source": [ "result = conn.execute(sa.text('SELECT vec_f32 FROM vectors LIMIT 5'))" - ] + ], + "id": "b313ed55" }, { "attachments": {}, "cell_type": "markdown", - "id": "8fa7bd8e-8842-438f-a336-e93ecc321820", "metadata": {}, "source": [ "Unpacking the Python lists works as before as well." 
- ] + ], + "id": "cf84d2e3" }, { "cell_type": "code", "execution_count": 27, - "id": "78b0619f-a057-4edb-a230-1e96c5b0b2e7", "metadata": {}, "outputs": [ { @@ -766,12 +765,12 @@ "source": [ "out_f32_list = [list(struct.unpack(fmt, x[0])) for x in result]\n", "out_f32_list[:3]" - ] + ], + "id": "bc631b99" }, { "attachments": {}, "cell_type": "markdown", - "id": "2b2db64d-8e96-4f59-b91a-3731ee934287", "metadata": {}, "source": [ "### Using pandas\n", @@ -779,50 +778,50 @@ "The pandas package has utilities for working with databases. The two primary methods / functions are\n", "`DataFrame.to_sql` which uploads `DataFrame` data to a table, and `pd.read_sql` which downloads\n", "data from a table." - ] + ], + "id": "b155d866" }, { "cell_type": "code", "execution_count": 28, - "id": "a53088c8-af5e-40f6-84b5-aa83cc81303f", "metadata": {}, "outputs": [], "source": [ "reset_table()" - ] + ], + "id": "baf5fcda" }, { "cell_type": "code", "execution_count": 29, - "id": "50511825-6506-45b4-9b36-607dcee37dea", "metadata": {}, "outputs": [], "source": [ "import pandas as pd" - ] + ], + "id": "39d002fc" }, { "attachments": {}, "cell_type": "markdown", - "id": "91b876a8-da7d-48b5-89a9-8149fab91566", "metadata": {}, "source": [ "First, we'll create a pandas `DataFrame` with our numpy arrays." - ] + ], + "id": "c007ade3" }, { "attachments": {}, "cell_type": "markdown", - "id": "fdf50e43-68a2-4cfb-a6a0-215d442f27c8", "metadata": {}, "source": [ "#### Working with numpy arrays" - ] + ], + "id": "2960dc61" }, { "cell_type": "code", "execution_count": 30, - "id": "7f871623-9176-4865-97f4-5e89cf7c3a70", "metadata": {}, "outputs": [ { @@ -838,22 +837,22 @@ "source": [ "df = pd.DataFrame(dict(vec_f32=pd.Series(vec_f32)))\n", "df.head()" - ] + ], + "id": "8be6c7de" }, { "attachments": {}, "cell_type": "markdown", - "id": "c37150fa-e5f1-49d5-b13b-e26e9e88ed92", "metadata": {}, "source": [ "We can use the `to_sql` method of the `DataFrame` to upload the data. 
Notice that we are using the SQLAlchemy\n", "connection we created in the previous section as the `con` parameter." - ] + ], + "id": "ddd90e18" }, { "cell_type": "code", "execution_count": 31, - "id": "1a853637-f29e-434a-9dd4-d2fb92bc4597", "metadata": {}, "outputs": [ { @@ -867,22 +866,22 @@ ], "source": [ "df.to_sql('vectors', con=conn, if_exists='append', index=False)" - ] + ], + "id": "fb056253" }, { "attachments": {}, "cell_type": "markdown", - "id": "67fdc9d4-9d48-4af9-a4f9-b643a43992b9", "metadata": {}, "source": [ "To read the data, we use the `read_sql` function. As before, we are getting byte strings back that will need to be\n", "converted into numpy arrays." - ] + ], + "id": "3f442526" }, { "cell_type": "code", "execution_count": 32, - "id": "a75c5726-0ee7-4876-aac7-e71dc9752eae", "metadata": {}, "outputs": [ { @@ -898,31 +897,31 @@ "source": [ "out_df = pd.read_sql('vectors', con=conn)\n", "out_df.head(3)" - ] + ], + "id": "d1042b11" }, { "attachments": {}, "cell_type": "markdown", - "id": "9d774b5f-88f9-45b3-a54d-229020aa16af", "metadata": {}, "source": [ "We apply the `np.frombuffer` function to each element in the `vec_f32` column to reconstruct the numpy array." - ] + ], + "id": "3c1b00d0" }, { "cell_type": "code", "execution_count": 33, - "id": "48b56238-b251-479f-9d1f-271f46a7111e", "metadata": {}, "outputs": [], "source": [ "out_df['vec_f32'] = out_df['vec_f32'].apply(lambda x: np.frombuffer(x, dtype=np.float32))" - ] + ], + "id": "4ffcbed9" }, { "cell_type": "code", "execution_count": 34, - "id": "c4e77d6b-a93c-47d2-89ce-b1c502950c71", "metadata": {}, "outputs": [ { @@ -937,43 +936,43 @@ ], "source": [ "out_df.head(3)" - ] + ], + "id": "1c395e87" }, { "attachments": {}, "cell_type": "markdown", - "id": "71b184dd-641c-4ef0-91cf-c581143d3945", "metadata": {}, "source": [ "#### Working with Python lists\n", "\n", "Because Python lists are not typed arrays like numpy arrays, we have to convert them to bytes before\n", "uploading them." 
- ] + ], + "id": "5bdc0a63" }, { "cell_type": "code", "execution_count": 35, - "id": "43187411-efe0-465d-b6dd-a167534f6823", "metadata": {}, "outputs": [], "source": [ "reset_table()" - ] + ], + "id": "30dcb4d6" }, { "attachments": {}, "cell_type": "markdown", - "id": "6b0fa295-99e9-4846-9996-a704df463a36", "metadata": {}, "source": [ "Construct a `DataFrame` using Python lists as the data." - ] + ], + "id": "1795d15b" }, { "cell_type": "code", "execution_count": 36, - "id": "1ed1b6c2-3c79-42b9-a671-41b2828c4c31", "metadata": {}, "outputs": [ { @@ -989,22 +988,22 @@ "source": [ "df = pd.DataFrame(dict(vec_f32=vec_f32_list))\n", "df.head(3)" - ] + ], + "id": "6801590c" }, { "attachments": {}, "cell_type": "markdown", - "id": "94263962-9ec2-4e34-a08e-1e2ad41247dd", "metadata": {}, "source": [ "Note that we are using our `fmt` value from a previous section to convert the Python lists\n", "to bytes using `struct.pack`." - ] + ], + "id": "fd1dde1a" }, { "cell_type": "code", "execution_count": 37, - "id": "3cb1b6e1-a732-4a2f-a751-095d6727e6ae", "metadata": {}, "outputs": [ { @@ -1018,22 +1017,22 @@ ], "source": [ "fmt" - ] + ], + "id": "9032ca0d" }, { "cell_type": "code", "execution_count": 38, - "id": "6cdaafa5-7406-488b-a780-744f23b5c0e4", "metadata": {}, "outputs": [], "source": [ "df['vec_f32'] = df['vec_f32'].apply(lambda x: struct.pack(fmt, *x))" - ] + ], + "id": "e3a201cf" }, { "cell_type": "code", "execution_count": 39, - "id": "af739340-e5fd-482a-96c8-5eedf8202f1c", "metadata": {}, "outputs": [ { @@ -1047,21 +1046,21 @@ ], "source": [ "df['vec_f32'].head(3)" - ] + ], + "id": "39870bf8" }, { "attachments": {}, "cell_type": "markdown", - "id": "6f2d8675-c1ee-44d2-ac17-eef1c543d71c", "metadata": {}, "source": [ "Use the `to_sql` method to upload the `DataFrame`." 
- ] + ], + "id": "8cfe7184" }, { "cell_type": "code", "execution_count": 40, - "id": "49dde7bd-9823-4c55-8f34-4e16643e6b8e", "metadata": {}, "outputs": [ { @@ -1075,12 +1074,12 @@ ], "source": [ "df.to_sql('vectors', con=conn, if_exists='append', index=False)" - ] + ], + "id": "730f4fb0" }, { "cell_type": "code", "execution_count": 41, - "id": "137a7f8e-d713-4179-bcad-66f194d1f839", "metadata": {}, "outputs": [ { @@ -1096,31 +1095,31 @@ "source": [ "out_df = pd.read_sql('vectors', con=conn)\n", "out_df.head(3)" - ] + ], + "id": "6fb53006" }, { "attachments": {}, "cell_type": "markdown", - "id": "99233fdb-57b2-4290-9038-7c3e5eaf553e", "metadata": {}, "source": [ "We now have to convert the byte strings back to Python lists." - ] + ], + "id": "635e86c5" }, { "cell_type": "code", "execution_count": 42, - "id": "a60f967c-c8fe-4ad9-a11f-25f5fb35ce69", "metadata": {}, "outputs": [], "source": [ "out_df['vec_f32'] = out_df['vec_f32'].apply(lambda x: list(struct.unpack(fmt, x)))" - ] + ], + "id": "56830c9e" }, { "cell_type": "code", "execution_count": 43, - "id": "2924f8b8-f543-4a2f-90c8-8e6e5c15275d", "metadata": {}, "outputs": [ { @@ -1135,12 +1134,12 @@ ], "source": [ "out_df.head(3)" - ] + ], + "id": "5ab374a5" }, { "attachments": {}, "cell_type": "markdown", - "id": "8f070295-78e3-4137-82d6-8be8c64b3898", "metadata": {}, "source": [ "### Using the `%%sql` / `%sql` magic commands\n", @@ -1149,22 +1148,22 @@ "for complex queries that insert data. The primary issue is that you must construct the query as a string\n", "and ensure that all of your data is properly escaped. We'll demonstrate some basics here, but the\n", "methods described in the previous sections are likely to work better." 
- ] + ], + "id": "4e511cf8" }, { "cell_type": "code", "execution_count": 44, - "id": "5f13939e-2254-4956-9537-315f1dde1b63", "metadata": {}, "outputs": [], "source": [ "reset_table()" - ] + ], + "id": "934a4e48" }, { "attachments": {}, "cell_type": "markdown", - "id": "3ac2349f-d2bd-452d-9e4f-d869ef0e774f", "metadata": {}, "source": [ "#### Working with numpy arrays or Python lists\n", @@ -1173,12 +1172,12 @@ "manually before creating the query. This is done the same way whether the source is numpy arrays or Python lists.\n", "In either case, you must convert the objects to byte strings as we have in the previous sections, then convert that\n", "byte string into a hex literal that can be used in the query." - ] + ], + "id": "fbd5e5f3" }, { "cell_type": "code", "execution_count": 45, - "id": "f6781046-e636-4495-8a99-e035db8988aa", "metadata": {}, "outputs": [ { @@ -1193,12 +1192,12 @@ "source": [ "# Convert an element of the numpy array to a hex string\n", "vec_f32[0].tobytes().hex()" - ] + ], + "id": "420aa346" }, { "cell_type": "code", "execution_count": 46, - "id": "957f98e1-c3d5-4e7c-b43a-5583cdff045e", "metadata": {}, "outputs": [ { @@ -1213,24 +1212,24 @@ "source": [ "# Convert an element of the Python list to a hex string\n", "struct.pack(fmt, *vec_f32_list[0]).hex()" - ] + ], + "id": "9b7b6edd" }, { "attachments": {}, "cell_type": "markdown", - "id": "5424355e-fffb-4cc7-b0c3-eba7012d1bd1", "metadata": {}, "source": [ "To construct the query string for the `%%sql` command, we need to build the entire list of values to insert\n", "in a separate step. We'll insert the `X` at the beginning of the string to indicate a hex literal to\n", "SingleStoreDB. We'll also add the parentheses around the value for inserting multiple rows of data using\n", "the `INSERT` statement." 
- ] + ], + "id": "09fed577" }, { "cell_type": "code", "execution_count": 47, - "id": "e269be32-5b56-4e19-baed-6420d6fd4bfb", "metadata": {}, "outputs": [ { @@ -1245,12 +1244,12 @@ "source": [ "params = [\"(X'{}')\".format(x.tobytes().hex()) for x in vec_f32]\n", "params[:3]" - ] + ], + "id": "9354ce86" }, { "cell_type": "code", "execution_count": 48, - "id": "bd51d277-eec1-4787-b9b9-7a943f3eea0c", "metadata": {}, "outputs": [ { @@ -1266,32 +1265,32 @@ "source": [ "%%sql\n", "INSERT INTO vectors(vec_f32) VALUES {{ ','.join(params) }}" - ] + ], + "id": "953fe4bb" }, { "attachments": {}, "cell_type": "markdown", - "id": "5b982cc3-5e8a-460c-beff-440dbae58144", "metadata": {}, "source": [ "We can now select the data." - ] + ], + "id": "896d16f5" }, { "cell_type": "code", "execution_count": 49, - "id": "8a9ce43b-2ebc-4c9a-8898-afacaff13df9", "metadata": {}, "outputs": [], "source": [ "%%sql out <<\n", "SELECT * FROM vectors LIMIT 5" - ] + ], + "id": "e864790d" }, { "cell_type": "code", "execution_count": 50, - "id": "b10b2f71-02fd-4630-8ae0-7845a8385934", "metadata": {}, "outputs": [ { @@ -1306,42 +1305,42 @@ ], "source": [ "out" - ] + ], + "id": "884c0468" }, { "attachments": {}, "cell_type": "markdown", - "id": "3178ea48-2bfd-44d2-8211-a291dd5bf5ba", "metadata": {}, "source": [ "At this point, there is nothing we can do with SQL magic commands to convert the data back into numpy arrays or Python\n", "lists. We need to drop to Python for that." 
- ] + ], + "id": "552f374d" }, { "cell_type": "code", "execution_count": 51, - "id": "5b6d0c98-a016-423c-a460-aa617615bcdf", "metadata": {}, "outputs": [], "source": [ "out_df = pd.DataFrame(out)" - ] + ], + "id": "f2d95030" }, { "cell_type": "code", "execution_count": 52, - "id": "5a3e6552-b66c-460e-9394-04b6b1a25795", "metadata": {}, "outputs": [], "source": [ "out_df['vec_f32'] = out_df['vec_f32'].apply(lambda x: np.frombuffer(x, dtype=np.float32))" - ] + ], + "id": "c04ef18a" }, { "cell_type": "code", "execution_count": 53, - "id": "821ac65c-b8d5-47f5-8b14-945ed8e8d1fa", "metadata": {}, "outputs": [ { @@ -1356,12 +1355,12 @@ ], "source": [ "out_df.head(3)" - ] + ], + "id": "8e58a2fd" }, { "attachments": {}, "cell_type": "markdown", - "id": "ad5c2b96-0002-4948-87a4-949a68c3e0a2", "metadata": {}, "source": [ "### Using JSON\n", @@ -1370,22 +1369,22 @@ "end which isn't quite a efficient as the techniques above. It also requires using the `JSON_ARRAY_PACK` and `JSON_ARRAY_UNPACK`\n", "functions in your queries to go back and forth between the vector bytes and JSON. Here is an example of inserting the\n", "Python list of floats." 
- ] + ], + "id": "739f94cb" }, { "cell_type": "code", "execution_count": 54, - "id": "df5af6d1-15e1-4867-a02c-31634a65393b", "metadata": {}, "outputs": [], "source": [ "import json" - ] + ], + "id": "71f38575" }, { "cell_type": "code", "execution_count": 55, - "id": "79f06760-9039-408a-a4c2-6331947dd3e4", "metadata": {}, "outputs": [ { @@ -1400,12 +1399,12 @@ "source": [ "params = ['(JSON_ARRAY_PACK(\"{}\"))'.format(json.dumps(x)) for x in vec_f32_list]\n", "params[:3]" - ] + ], + "id": "71af63b7" }, { "cell_type": "code", "execution_count": 56, - "id": "92217c8d-f374-49a6-8fb8-f21666681f95", "metadata": {}, "outputs": [ { @@ -1421,32 +1420,32 @@ "source": [ "%%sql\n", "INSERT INTO vectors(vec_f32) VALUES {{ ','.join(params) }}" - ] + ], + "id": "e648cb2c" }, { "attachments": {}, "cell_type": "markdown", - "id": "bd323c0a-7de4-4c56-9fc4-f2a22f4f661c", "metadata": {}, "source": [ "If you use the `JSON_ARRAY_UNPACK` function in your `SELECT` statement, you can download the data as JSON." - ] + ], + "id": "e912c1bb" }, { "cell_type": "code", "execution_count": 57, - "id": "b9aac5ba-efea-466b-82c7-12fa02174630", "metadata": {}, "outputs": [], "source": [ "%%sql out <<\n", "SELECT JSON_ARRAY_UNPACK(vec_f32) AS 'vec_f32' FROM vectors LIMIT 5" - ] + ], + "id": "3a6ce7d0" }, { "cell_type": "code", "execution_count": 58, - "id": "9f9e9b6e-5a25-483b-9d40-aea95a302b5f", "metadata": {}, "outputs": [ { @@ -1462,12 +1461,12 @@ "source": [ "out = pd.DataFrame(out)\n", "out" - ] + ], + "id": "f573f194" }, { "cell_type": "code", "execution_count": 59, - "id": "bbfe895a-0f09-4094-a835-793329ee388e", "metadata": {}, "outputs": [ { @@ -1481,22 +1480,22 @@ ], "source": [ "out['vec_f32'][0]" - ] + ], + "id": "9061e7dc" }, { "attachments": {}, "cell_type": "markdown", - "id": "6bdb300a-a8f6-40cc-a5f9-de54508bb22b", "metadata": {}, "source": [ "Notice that since the data type of the column in the `SELECT` is JSON, it automatically gets converted to a Python list\n", "in the client." 
- ] + ], + "id": "76599526" }, { "cell_type": "code", "execution_count": 60, - "id": "b5234146-b058-4462-b5d0-516ae699efc6", "metadata": {}, "outputs": [ { @@ -1510,12 +1509,12 @@ ], "source": [ "type(out['vec_f32'][0])" - ] + ], + "id": "3a9c06c8" }, { "attachments": {}, "cell_type": "markdown", - "id": "8a8cfd32-c903-4e9b-b27b-253fcbca6ad4", "metadata": {}, "source": [ "## Conclusion\n", @@ -1523,12 +1522,12 @@ "As you can see, there are various interfaces available for uploading and downloading vector data. Depending on\n", "which Python framework you are using and what format your data is in, you can pick and choose which\n", "methods work for your use-case." - ] + ], + "id": "c0fc8b03" }, { "attachments": {}, "cell_type": "markdown", - "id": "42060943", "metadata": {}, "source": [ "
    \n", @@ -1538,23 +1537,24 @@ "

    If you created a new database in your Standard or Premium Workspace, you can drop the database by running the cell below. Note: this will not drop your database for Free Starter Workspaces. To drop a Free Starter Workspace, terminate the Workspace using the UI.

    \n", "
    \n", "
    " - ] + ], + "id": "a311f8cc" }, { "cell_type": "code", "execution_count": 61, - "id": "8f911f36-0153-4959-828a-41e637cc9887", "metadata": {}, "outputs": [], "source": [ "shared_tier_check = %sql show variables like 'is_shared_tier'\n", "if not shared_tier_check or shared_tier_check[0][1] == 'OFF':\n", " %sql DROP DATABASE IF EXISTS vector_data;" - ] + ], + "id": "f7c9d491" }, { + "id": "26f664c5", "cell_type": "markdown", - "id": "546a9cee-db0a-438b-9fcc-081223339a9f", "metadata": {}, "source": [ "
    \n", diff --git a/resources/nb-check.py b/resources/nb-check.py index 87aaa486..bccf3289 100755 --- a/resources/nb-check.py +++ b/resources/nb-check.py @@ -6,9 +6,12 @@ import sys import tomllib import uuid +import warnings from typing import Any from typing import List +import nbformat + DEFAULT_NOTEBOOK_METADATA = { 'metadata': { @@ -119,6 +122,18 @@ } +IDS = set() + + +def generate_corpus_id(): + """Generate a corpus id.""" + while True: + id = uuid.uuid4().hex[:8] + if id not in IDS: + IDS.add(id) + return id + + def error(msg: str) -> None: """Print an error message and end the program.""" print('ERROR:', msg, file=sys.stderr) @@ -132,7 +147,7 @@ def new_markdown_cell(cell_id: str, content: list[str]) -> dict[str, Any]: Parameters ---------- cell_id : str - The UUID to use for the cell ID + ID to apply content : list[str] The list of strings that make up the cell contents @@ -142,8 +157,8 @@ def new_markdown_cell(cell_id: str, content: list[str]) -> dict[str, Any]: """ return dict( - cell_type='markdown', id=cell_id, + cell_type='markdown', metadata={}, source=content, ) @@ -181,15 +196,37 @@ def new_markdown_cell(cell_id: str, content: list[str]) -> dict[str, Any]: if 'metadata' in cell: cell['metadata'] = {} + nbversion = nb.get('nbformat', 4) + nbversion_minor = max(nb.get('nbformat_minor', 0), 5) + + nb['nbformat'] = nbversion + nb['nbformat_minor'] = nbversion_minor + + for i, cell in enumerate(cells): + # Remove duplicate IDs + if 'id' in cell and cell['id'] in IDS: + del cell['id'] + # Remove invalid IDs + if 'id' in cell and len(cell['id']) != 8: + del cell['id'] + # Add ID to cache + if 'id' in cell: + IDS.add(cell['id']) + + # Generate IDs for cells that need it + for i, cell in enumerate(cells): + if 'id' not in cell: + cell['id'] = generate_corpus_id() + # Remove empty cells at the end of the notebook end = len(cells) - 1 while end > 0 and 'source' in cells[end] and not cells[end]['source']: cells.pop(end) end -= 1 - header_id = 
str(uuid.uuid4()) - starter_id = str(uuid.uuid4()) - footer_id = str(uuid.uuid4()) + header_id = generate_corpus_id() + footer_id = generate_corpus_id() + starter_id = generate_corpus_id() # Remove header cell, it will be regenerated later if cells: @@ -198,8 +235,7 @@ def new_markdown_cell(cell_id: str, content: list[str]) -> dict[str, Any]: source = ''.join(source) if 'id="singlestore-header"' in source: header_cell = cells.pop(0) - if header_cell.get('id', None): - header_id = header_cell['id'] + header_id = header_cell.get('id', header_id) # Remove Free Starter Workspace notification, it will be regenerated later if cells: @@ -209,11 +245,10 @@ def new_markdown_cell(cell_id: str, content: list[str]) -> dict[str, Any]: if not isinstance(source, str): source = ''.join(source) if 'alert-warning' in source and 'can be run on a Free Starter' in source: - if cells[i].get('id', None): - starter_id = cells[i]['id'] remove_cells.insert(0, i) for i in remove_cells: - cells.pop(i) + starter_cell = cells.pop(i) + starter_id = starter_cell.get('id', starter_id) # Remove footer cell, it will be regenerated later if cells: @@ -222,8 +257,7 @@ def new_markdown_cell(cell_id: str, content: list[str]) -> dict[str, Any]: source = ''.join(source) if 'id="singlestore-footer"' in source: footer_cell = cells.pop(-1) - if footer_cell.get('id', None): - footer_id = footer_cell['id'] + footer_id = footer_cell.get('id', footer_id) for cell in cells: @@ -241,10 +275,6 @@ def new_markdown_cell(cell_id: str, content: list[str]) -> dict[str, Any]: if 'attachments' in cell and cell['attachments'] is None: cell['attachments'] = {} - # Remove empty IDs - if 'id' in cell and not cell['id']: - del cell['id'] - # Prepare parameter substitutions for header try: icon_name = toml_info['meta']['icon'] @@ -293,6 +323,8 @@ def new_markdown_cell(cell_id: str, content: list[str]) -> dict[str, Any]: if 'execution_count' in output: output['execution_count'] = code_idx + nbformat.validate(nb) + with open(f, 
'w') as outfile: outfile.write(json.dumps(nb, indent=2)) outfile.write('\n')