Skip to content

Commit

Permalink
fix hit count
Browse files Browse the repository at this point in the history
  • Loading branch information
marevol committed Mar 21, 2024
1 parent f7e30b1 commit b6ce9a5
Show file tree
Hide file tree
Showing 5 changed files with 63 additions and 59 deletions.
33 changes: 18 additions & 15 deletions run-elasticsearch.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -502,13 +502,14 @@
" obj = json.loads(response.text)\n",
" if obj.get(\"timed_out\"):\n",
" print(f\"[TIMEOUT] {query}\")\n",
" return -1, -1, [], [], []\n",
" product_ids = [x.get(\"_id\") for x in obj.get(\"hits\").get(\"hits\")]\n",
" scores = [x.get(\"_score\") for x in obj.get(\"hits\").get(\"hits\")]\n",
" explanations = [x.get(\"_explanation\") for x in obj.get(\"hits\").get(\"hits\")] if explain else []\n",
" return obj.get(\"took\"), obj.get(\"hits\").get(\"total\").get(\"value\"), product_ids, scores, explanations\n",
" return -1, -1, -1, [], [], []\n",
" hits = obj.get(\"hits\").get(\"hits\")\n",
" product_ids = [x.get(\"_id\") for x in hits]\n",
" scores = [x.get(\"_score\") for x in hits]\n",
" explanations = [x.get(\"_explanation\") for x in hits] if explain else []\n",
" return obj.get(\"took\"), len(hits), obj.get(\"hits\").get(\"total\").get(\"value\"), product_ids, scores, explanations\n",
" print(f\"[FAIL][{response.status_code}] {response.text}\")\n",
" return -1, -1, [], [], []\n"
" return -1, -1, -1, [], [], []\n"
]
},
{
Expand All @@ -535,21 +536,23 @@
" if distance == \"dot_product\":\n",
" embedding = embedding.astype(np.float32)\n",
" embedding = embedding / np.linalg.norm(embedding)\n",
" num_candidates = hnsw_ef if hnsw_ef > page_size else page_size\n",
" query = {\n",
" \"knn\": {\n",
" \"field\": \"embedding\",\n",
" \"query_vector\": embedding.tolist(),\n",
" \"num_candidates\": hnsw_ef\n",
" \"num_candidates\": num_candidates\n",
" }\n",
" }\n",
" took, total_hits, ids, scores, explanations = search(query=query, size=page_size, explain=explain, track_total_hits=track_total_hits)\n",
" took, hits, total_hits, ids, scores, explanations = search(query=query, size=page_size, explain=explain, track_total_hits=track_total_hits)\n",
" # print(f\"{took}, {total_hits}, {ids}, {scores}\")\n",
" if took == -1:\n",
" print(f\"norm: {np.linalg.norm(embedding)}\")\n",
" continue\n",
" result = {\n",
" \"id\": (count + 1),\n",
" \"took\": took,\n",
" \"hits\": hits,\n",
" \"total_hits\": total_hits,\n",
" \"ids\": ids,\n",
" \"scores\": scores,\n",
Expand Down Expand Up @@ -597,17 +600,17 @@
"metadata": {},
"outputs": [],
"source": [
"def print_took_and_total_hits(filename, min_hits=0):\n",
"def print_took_and_total_hits(filename):\n",
" tooks = []\n",
" hits = []\n",
" total_hits = []\n",
" with gzip.open(filename, \"rt\", encoding=\"utf-8\") as f:\n",
" for line in f.readlines():\n",
" obj = json.loads(line)\n",
" hits = obj.get(\"total_hits\")\n",
" if hits >= min_hits:\n",
" tooks.append(obj.get(\"took\"))\n",
" total_hits.append(hits)\n",
" df = pd.DataFrame({\"took\": tooks, \"total_hits\": total_hits})\n",
" tooks.append(obj.get(\"took\"))\n",
" hits = obj.get(\"hits\")\n",
" total_hits = obj.get(\"total_hits\")\n",
" df = pd.DataFrame({\"took\": tooks, \"hits\": hits, \"total_hits\": total_hits})\n",
" print(df.describe().to_markdown())"
]
},
Expand Down Expand Up @@ -672,7 +675,7 @@
"metadata": {},
"outputs": [],
"source": [
"for page_size in [10, 100, 400, 1000]:\n",
"for page_size in [10, 100, 400]:\n",
" print(f\"page size: {page_size}\")\n",
" filename = get_output_filename(es_version, f\"knn_{page_size}\", explain=False, track_total_hits=False)\n",
" search_with_knn_queries(filename, page_size=page_size, max_size=1000) # warmup\n",
Expand Down
30 changes: 16 additions & 14 deletions run-opensearch.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -512,13 +512,14 @@
" obj = json.loads(response.text)\n",
" if obj.get(\"timed_out\"):\n",
" print(f\"[TIMEOUT] {query}\")\n",
" return -1, -1, [], [], []\n",
" product_ids = [x.get(\"_id\") for x in obj.get(\"hits\").get(\"hits\")]\n",
" scores = [x.get(\"_score\") for x in obj.get(\"hits\").get(\"hits\")]\n",
" explanations = [x.get(\"_explanation\") for x in obj.get(\"hits\").get(\"hits\")] if explain else []\n",
" return obj.get(\"took\"), obj.get(\"hits\").get(\"total\").get(\"value\"), product_ids, scores, explanations\n",
" return -1, -1, -1, [], [], []\n",
" hits = obj.get(\"hits\").get(\"hits\")\n",
" product_ids = [x.get(\"_id\") for x in hits]\n",
" scores = [x.get(\"_score\") for x in hits]\n",
" explanations = [x.get(\"_explanation\") for x in hits] if explain else []\n",
" return obj.get(\"took\"), len(hits), obj.get(\"hits\").get(\"total\").get(\"value\"), product_ids, scores, explanations\n",
" print(f\"[FAIL][{response.status_code}] {response.text}\")\n",
" return -1, -1, [], [], []\n"
" return -1, -1, -1, [], [], []\n"
]
},
{
Expand Down Expand Up @@ -552,13 +553,14 @@
" }\n",
" }\n",
" }\n",
" took, total_hits, ids, scores, explanations = search(query=query, size=page_size, explain=explain, track_total_hits=track_total_hits)\n",
" took, hits, total_hits, ids, scores, explanations = search(query=query, size=page_size, explain=explain, track_total_hits=track_total_hits)\n",
" # print(f\"{took}, {total_hits}, {ids}, {scores}\")\n",
" if took == -1:\n",
" continue\n",
" result = {\n",
" \"id\": (count + 1),\n",
" \"took\": took,\n",
" \"hits\": hits,\n",
" \"total_hits\": total_hits,\n",
" \"ids\": ids,\n",
" \"scores\": scores,\n",
Expand Down Expand Up @@ -606,17 +608,17 @@
"metadata": {},
"outputs": [],
"source": [
"def print_took_and_total_hits(filename, min_hits=0):\n",
"def print_took_and_total_hits(filename):\n",
" tooks = []\n",
" hits = []\n",
" total_hits = []\n",
" with gzip.open(filename, \"rt\", encoding=\"utf-8\") as f:\n",
" for line in f.readlines():\n",
" obj = json.loads(line)\n",
" hits = obj.get(\"total_hits\")\n",
" if hits >= min_hits:\n",
" tooks.append(obj.get(\"took\"))\n",
" total_hits.append(hits)\n",
" df = pd.DataFrame({\"took\": tooks, \"total_hits\": total_hits})\n",
" tooks.append(obj.get(\"took\"))\n",
" hits = obj.get(\"hits\")\n",
" total_hits = obj.get(\"total_hits\")\n",
" df = pd.DataFrame({\"took\": tooks, \"hits\": hits, \"total_hits\": total_hits})\n",
" print(df.describe().to_markdown())"
]
},
Expand Down Expand Up @@ -679,7 +681,7 @@
"metadata": {},
"outputs": [],
"source": [
"for page_size in [10, 100, 400, 1000]:\n",
"for page_size in [10, 100, 400]:\n",
" print(f\"page size: {page_size}\")\n",
" filename = get_output_filename(opensearch_version, f\"knn_{page_size}\", explain=False, track_total_hits=False)\n",
" search_with_knn_queries(filename, page_size=page_size, max_size=1000) # warmup\n",
Expand Down
16 changes: 7 additions & 9 deletions run-qdrant.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -442,14 +442,14 @@
" \"hnsw_ef\": hnsw_ef,\n",
" },\n",
" }\n",
" took, total_hits, ids, scores = search(query=query)\n",
" took, hits, ids, scores = search(query=query)\n",
" # print(f\"{took}, {total_hits}, {ids}, {scores}\")\n",
" if took == -1:\n",
" continue\n",
" result = {\n",
" \"id\": (count + 1),\n",
" \"took\": took,\n",
" \"total_hits\": total_hits,\n",
" \"hits\": hits,\n",
" \"ids\": ids,\n",
" \"scores\": scores,\n",
" }\n",
Expand Down Expand Up @@ -489,17 +489,15 @@
"metadata": {},
"outputs": [],
"source": [
"def print_took_and_total_hits(filename, min_hits=0):\n",
"def print_took_and_total_hits(filename):\n",
" tooks = []\n",
" total_hits = []\n",
" hits = []\n",
" with gzip.open(filename, \"rt\", encoding=\"utf-8\") as f:\n",
" for line in f.readlines():\n",
" obj = json.loads(line)\n",
" hits = obj.get(\"total_hits\")\n",
" if hits >= min_hits:\n",
" tooks.append(obj.get(\"took\"))\n",
" total_hits.append(hits)\n",
" df = pd.DataFrame({\"took\": tooks, \"total_hits\": total_hits})\n",
" tooks.append(obj.get(\"took\"))\n",
" hits = obj.get(\"hits\")\n",
" df = pd.DataFrame({\"took\": tooks, \"hits\": hits})\n",
" print(df.describe().to_markdown())"
]
},
Expand Down
27 changes: 15 additions & 12 deletions run-vespa.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -483,11 +483,12 @@
" if response.status_code == 200:\n",
" obj = json.loads(response.text)\n",
" took = obj.get(\"timing\").get(\"searchtime\") * 1000\n",
" product_ids = [x.get(\"id\") for x in obj.get(\"root\").get(\"children\")]\n",
" scores = [x.get(\"relevance\") for x in obj.get(\"root\").get(\"children\")]\n",
" return took, int(obj.get(\"root\").get(\"coverage\").get(\"documents\")), product_ids, scores\n",
" hits = obj.get(\"root\").get(\"children\")\n",
" product_ids = [x.get(\"id\") for x in hits]\n",
" scores = [x.get(\"relevance\") for x in hits]\n",
" return took, len(hits), int(obj.get(\"root\").get(\"coverage\").get(\"documents\")), product_ids, scores\n",
" print(f\"[FAIL][{response.status_code}] {response.text}\")\n",
" return -1, -1, [], []\n"
" return -1, -1, -1, [], []\n"
]
},
{
Expand All @@ -514,19 +515,21 @@
" if distance == \"dotproduct\":\n",
" embedding = embedding.astype(np.float32)\n",
" embedding = embedding / np.linalg.norm(embedding)\n",
" target_hits = hnsw_ef if hnsw_ef > page_size else page_size\n",
" query = {\n",
" \"hits\": page_size,\n",
" \"yql\": \"select * from \" + index_name + \" where {approximate:true,targetHits:\" + str(hnsw_ef) + \"}nearestNeighbor(embedding,q)\",\n",
" \"yql\": \"select * from \" + index_name + \" where {approximate:true,targetHits:\" + str(target_hits) + \"}nearestNeighbor(embedding,q)\",\n",
" \"ranking\": \"closeness\",\n",
" \"input.query(q)\": embedding.tolist(),\n",
" }\n",
" took, total_hits, ids, scores = search(query=query)\n",
" took, hits, total_hits, ids, scores = search(query=query)\n",
" # print(f\"{took}, {total_hits}, {ids}, {scores}\")\n",
" if took == -1:\n",
" continue\n",
" result = {\n",
" \"id\": (count + 1),\n",
" \"took\": took,\n",
" \"hits\": hits,\n",
" \"total_hits\": total_hits,\n",
" \"ids\": ids,\n",
" \"scores\": scores,\n",
Expand Down Expand Up @@ -567,17 +570,17 @@
"metadata": {},
"outputs": [],
"source": [
"def print_took_and_total_hits(filename, min_hits=0):\n",
"def print_took_and_total_hits(filename):\n",
" tooks = []\n",
" hits = []\n",
" total_hits = []\n",
" with gzip.open(filename, \"rt\", encoding=\"utf-8\") as f:\n",
" for line in f.readlines():\n",
" obj = json.loads(line)\n",
" hits = obj.get(\"total_hits\")\n",
" if hits >= min_hits:\n",
" tooks.append(obj.get(\"took\"))\n",
" total_hits.append(hits)\n",
" df = pd.DataFrame({\"took\": tooks, \"total_hits\": total_hits})\n",
" tooks.append(obj.get(\"took\"))\n",
" hits = obj.get(\"hits\")\n",
" total_hits = obj.get(\"total_hits\")\n",
" df = pd.DataFrame({\"took\": tooks, \"hits\": hits, \"total_hits\": total_hits})\n",
" print(df.describe().to_markdown())"
]
},
Expand Down
16 changes: 7 additions & 9 deletions run-weaviate.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -422,14 +422,14 @@
" if distance == \"dot\":\n",
" embedding = embedding / np.linalg.norm(embedding)\n",
" query = create_query(embedding.tolist(), page_size)\n",
" took, total_hits, ids, scores = search(query)\n",
" took, hits, ids, scores = search(query)\n",
" # print(f\"{took}, {total_hits}, {ids}, {scores}\")\n",
" if took == -1:\n",
" continue\n",
" result = {\n",
" \"id\": (count + 1),\n",
" \"took\": took,\n",
" \"total_hits\": total_hits,\n",
" \"hits\": hits,\n",
" \"ids\": ids,\n",
" \"scores\": scores,\n",
" }\n",
Expand Down Expand Up @@ -469,17 +469,15 @@
"metadata": {},
"outputs": [],
"source": [
"def print_took_and_total_hits(filename, min_hits=0):\n",
"def print_took_and_total_hits(filename):\n",
" tooks = []\n",
" total_hits = []\n",
" hits = []\n",
" with gzip.open(filename, \"rt\", encoding=\"utf-8\") as f:\n",
" for line in f.readlines():\n",
" obj = json.loads(line)\n",
" hits = obj.get(\"total_hits\")\n",
" if hits >= min_hits:\n",
" tooks.append(obj.get(\"took\"))\n",
" total_hits.append(hits)\n",
" df = pd.DataFrame({\"took\": tooks, \"total_hits\": total_hits})\n",
" tooks.append(obj.get(\"took\"))\n",
" hits = obj.get(\"hits\")\n",
" df = pd.DataFrame({\"took\": tooks, \"hits\": hits})\n",
" print(df.describe().to_markdown())"
]
},
Expand Down

0 comments on commit b6ce9a5

Please sign in to comment.