update

pablomarin · Aug 14, 2023 · 6640592 · 6640592
1 parent cd9c582
commit 6640592
Show file tree

Hide file tree

Showing 9 changed files with 437 additions and 213 deletions.
diff --git a/01-Load-Data-ACogSearch.ipynb b/01-Load-Data-ACogSearch.ipynb
@@ -116,6 +116,10 @@
     "    \"credentials\": {\n",
     "        \"connectionString\": os.environ['BLOB_CONNECTION_STRING']\n",
     "    },\n",
+    "    \"dataDeletionDetectionPolicy\" : {\n",
+    "        \"@odata.type\" :\"#Microsoft.Azure.Search.NativeBlobSoftDeleteDeletionDetectionPolicy\"\n",
+    "   }\n",
+    "\n",
     "    \"container\": {\n",
     "        \"name\": BLOB_CONTAINER_NAME\n",
     "    }\n",
@@ -136,6 +140,13 @@
     "# r.text"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "For information on change and deletion detection for indexed blobs, please see [the Azure Cognitive Search documentation](https://learn.microsoft.com/en-us/azure/search/search-howto-index-changed-deleted-blobs?tabs=rest-api)."
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -305,10 +316,6 @@
     "                    \"targetName\": \"organizations\"\n",
     "                },\n",
     "                {\n",
-    "                    \"name\": \"dateTimes\", \n",
-    "                    \"targetName\": \"dateTimes\"\n",
-    "                },\n",
-    "                {\n",
     "                    \"name\": \"urls\", \n",
     "                    \"targetName\": \"urls\"\n",
     "                },\n",
@@ -379,14 +386,15 @@
     "        {\"name\": \"language\", \"type\": \"Edm.String\", \"searchable\": \"false\", \"retrievable\": \"true\", \"sortable\": \"true\", \"filterable\": \"true\", \"facetable\": \"true\"},\n",
     "        {\"name\": \"name\", \"type\": \"Edm.String\", \"searchable\": \"true\", \"retrievable\": \"true\", \"sortable\": \"false\", \"filterable\": \"false\", \"facetable\": \"false\"},\n",
     "        {\"name\": \"location\", \"type\": \"Edm.String\", \"searchable\": \"false\", \"retrievable\": \"true\", \"sortable\": \"false\", \"filterable\": \"false\", \"facetable\": \"false\"},\n",
+    "        {\"name\": \"vectorized\", \"type\": \"Edm.Boolean\", \"searchable\": \"false\", \"retrievable\": \"true\", \"sortable\": \"false\", \"filterable\": \"false\", \"facetable\": \"false\"},\n",
     "        {\"name\": \"images_text\", \"type\": \"Collection(Edm.String)\", \"searchable\": \"true\", \"retrievable\": \"true\", \"sortable\": \"false\", \"filterable\": \"false\", \"facetable\": \"false\"},\n",
     "        {\"name\": \"keyPhrases\", \"type\": \"Collection(Edm.String)\", \"searchable\": \"true\", \"retrievable\": \"true\", \"sortable\": \"false\", \"filterable\": \"true\", \"facetable\": \"true\"},\n",
     "        {\"name\": \"persons\", \"type\": \"Collection(Edm.String)\", \"searchable\": \"true\", \"retrievable\": \"true\", \"sortable\": \"false\", \"filterable\": \"false\", \"facetable\": \"false\"},\n",
     "        {\"name\": \"locations\", \"type\": \"Collection(Edm.String)\", \"searchable\": \"true\", \"retrievable\": \"true\", \"sortable\": \"false\", \"filterable\": \"true\", \"facetable\": \"true\"},\n",
     "        {\"name\": \"organizations\", \"type\": \"Collection(Edm.String)\", \"searchable\": \"true\", \"retrievable\": \"true\", \"sortable\": \"false\", \"filterable\": \"true\", \"facetable\": \"true\"},\n",
-    "        {\"name\": \"dateTimes\", \"type\": \"Collection(Edm.String)\", \"searchable\": \"true\", \"retrievable\": \"true\", \"sortable\": \"false\", \"filterable\": \"false\", \"facetable\": \"false\"},\n",
     "        {\"name\": \"urls\", \"type\": \"Collection(Edm.String)\", \"searchable\": \"false\", \"retrievable\": \"true\", \"sortable\": \"false\", \"filterable\": \"false\", \"facetable\": \"false\"},\n",
     "        {\"name\": \"emails\", \"type\": \"Collection(Edm.String)\", \"searchable\": \"true\", \"retrievable\": \"true\", \"sortable\": \"false\", \"filterable\": \"true\", \"facetable\": \"false\"}\n",
+    "        \n",
     "    ],\n",
     "    \"semantic\": {\n",
     "      \"configurations\": [\n",
@@ -533,10 +541,6 @@
     "            \"targetFieldName\": \"organizations\"\n",
     "        },\n",
     "        {\n",
-    "            \"sourceFieldName\": \"/document/pages/*/dateTimes/*\",\n",
-    "            \"targetFieldName\": \"dateTimes\"\n",
-    "        },\n",
-    "        {\n",
     "            \"sourceFieldName\": \"/document/pages/*/urls/*\",\n",
     "            \"targetFieldName\": \"urls\"\n",
     "        },\n",
@@ -652,7 +656,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.10"
+   "version": "3.10.11"
   },
   "vscode": {
    "interpreter": {

diff --git a/03-Quering-AOpenAI.ipynb b/03-Quering-AOpenAI.ipynb
@@ -76,7 +76,8 @@
    "outputs": [],
    "source": [
     "# Setup the Payloads header\n",
-    "headers = {'Content-Type': 'application/json','api-key': os.environ['AZURE_SEARCH_KEY']}"
+    "headers = {'Content-Type': 'application/json','api-key': os.environ['AZURE_SEARCH_KEY']}\n",
+    "params = {'api-version': os.environ['AZURE_SEARCH_API_VERSION']}"
    ]
   },
   {
@@ -165,26 +166,26 @@
     "agg_search_results = []\n",
     "\n",
     "for index in indexes:\n",
-    "    url = os.environ['AZURE_SEARCH_ENDPOINT'] + '/indexes/'+ index + '/docs'\n",
-    "    url += '?api-version={}'.format(os.environ['AZURE_SEARCH_API_VERSION'])\n",
-    "    url += '&search={}'.format(QUESTION)\n",
-    "    url += '&select=id,title,chunks,language,name,location'\n",
-    "    url += '&$top=10'  # You can change this to anything you need/want\n",
-    "    url += '&queryLanguage=en-us'\n",
-    "    url += '&queryType=semantic'\n",
-    "    url += '&semanticConfiguration=my-semantic-config'\n",
-    "    url += '&$count=true'\n",
-    "    url += '&speller=lexicon'\n",
-    "    url += '&answers=extractive|count-3'\n",
-    "    url += '&captions=extractive|highlight-false'\n",
+    "    search_payload = {\n",
+    "        \"search\": QUESTION,\n",
+    "        \"select\": \"id, title, chunks, language, name, location\",\n",
+    "        \"queryType\": \"semantic\",\n",
+    "        \"semanticConfiguration\": \"my-semantic-config\",\n",
+    "        \"count\": \"true\",\n",
+    "        \"speller\": \"lexicon\",\n",
+    "        \"queryLanguage\": \"en-us\",\n",
+    "        \"captions\": \"extractive\",\n",
+    "        \"answers\": \"extractive\",\n",
+    "        \"top\": \"10\"\n",
+    "    }\n",
     "\n",
-    "    resp = requests.get(url, headers=headers)\n",
-    "    print(url)\n",
-    "    print(resp.status_code)\n",
+    "    r = requests.post(os.environ['AZURE_SEARCH_ENDPOINT'] + \"/indexes/\" + index + \"/docs/search\",\n",
+    "                     data=json.dumps(search_payload), headers=headers, params=params)\n",
+    "    print(r.status_code)\n",
     "\n",
-    "    search_results = resp.json()\n",
+    "    search_results = r.json()\n",
     "    agg_search_results.append(search_results)\n",
-    "    print(\"Results Found: {}, Results Returned: {}\".format(search_results['@odata.count'], len(search_results['value'])))"
+    "    print(\"Index:\", index, \"Results Found: {}, Results Returned: {}\".format(search_results['@odata.count'], len(search_results['value'])))"
    ]
   },
   {
@@ -1304,11 +1305,18 @@
    "id": "f347373a-a5be-473d-b64e-0f6b6dbcd0e0",
    "metadata": {},
    "source": [
-    "# Summary\n",
+    "# Summary and limitations\n",
     "##### This answer is way better than taking just the result from Azure Cognitive Search. So the summary is:\n",
     "- Azure Cognitive Search gives us the top results (context)\n",
     "- Azure OpenAI takes these results, understands the content, and uses it as context to give the best answer\n",
-    "- Best of two worlds!"
+    "- Best of two worlds!\n",
+    "\n",
+    "However, although this solution looks great, it still has some limitations:\n",
+    "\n",
+    "- It takes about 30 seconds on average to get an answer.\n",
+    "- We create the same vectors over and over again each time the same documents are returned by the semantic search query, which is inefficient.\n",
+    "- Because of how keyword and semantic search work, the relevance of the results returned by the Azure Search text-based index degrades as the documents increase in size.\n",
+    "- It only works with text, not images or videos."
    ]
   },
   {
@@ -1317,11 +1325,7 @@
    "metadata": {},
    "source": [
     "# NEXT\n",
-    "We just added a smart layer on top of Azure Cognitive Search. This is the backend for a GPT Smart Search Engine.\n",
-    "\n",
-    "However, we are missing something: **How to have a conversation with this engine?**\n",
-    "\n",
-    "On the next Notebook, we are going to understand the concept of **memory**. This is necessary in order to have a chatbot that can establish a conversation with the user. Without memory, there is no real conversation."
+    "In the next notebook, we are going to use the vector search capabilities of Azure Cognitive Search to see if we can overcome the limitations above."
    ]
   }
  ],
@@ -1341,7 +1345,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.10"
+   "version": "3.10.11"
   }
  },
  "nbformat": 4,