diff --git a/run-elasticsearch.ipynb b/run-elasticsearch.ipynb
index 3359c24..8cada5f 100644
--- a/run-elasticsearch.ipynb
+++ b/run-elasticsearch.ipynb
@@ -502,13 +502,14 @@
     "        obj = json.loads(response.text)\n",
     "        if obj.get(\"timed_out\"):\n",
     "            print(f\"[TIMEOUT] {query}\")\n",
-    "            return -1, -1, [], [], []\n",
-    "        product_ids = [x.get(\"_id\") for x in obj.get(\"hits\").get(\"hits\")]\n",
-    "        scores = [x.get(\"_score\") for x in obj.get(\"hits\").get(\"hits\")]\n",
-    "        explanations = [x.get(\"_explanation\") for x in obj.get(\"hits\").get(\"hits\")] if explain else []\n",
-    "        return obj.get(\"took\"), obj.get(\"hits\").get(\"total\").get(\"value\"), product_ids, scores, explanations\n",
+    "            return -1, -1, -1, [], [], []\n",
+    "        hits = obj.get(\"hits\").get(\"hits\")\n",
+    "        product_ids = [x.get(\"_id\") for x in hits]\n",
+    "        scores = [x.get(\"_score\") for x in hits]\n",
+    "        explanations = [x.get(\"_explanation\") for x in hits] if explain else []\n",
+    "        return obj.get(\"took\"), len(hits), obj.get(\"hits\").get(\"total\").get(\"value\"), product_ids, scores, explanations\n",
     "    print(f\"[FAIL][{response.status_code}] {response.text}\")\n",
-    "    return -1, -1, [], [], []\n"
+    "    return -1, -1, -1, [], [], []\n"
    ]
   },
   {
@@ -535,14 +536,15 @@
     "        if distance == \"dot_product\":\n",
     "            embedding = embedding.astype(np.float32)\n",
     "            embedding = embedding / np.linalg.norm(embedding)\n",
+    "        num_candidates = hnsw_ef if hnsw_ef > page_size else page_size\n",
     "        query = {\n",
     "            \"knn\": {\n",
     "                \"field\": \"embedding\",\n",
     "                \"query_vector\": embedding.tolist(),\n",
-    "                \"num_candidates\": hnsw_ef\n",
+    "                \"num_candidates\": num_candidates\n",
     "            }\n",
     "        }\n",
-    "        took, total_hits, ids, scores, explanations = search(query=query, size=page_size, explain=explain, track_total_hits=track_total_hits)\n",
+    "        took, hits, total_hits, ids, scores, explanations = search(query=query, size=page_size, explain=explain, track_total_hits=track_total_hits)\n",
     "        # print(f\"{took}, {total_hits}, {ids}, {scores}\")\n",
     "        if took == -1:\n",
     "            print(f\"norm: {np.linalg.norm(embedding)}\")\n",
@@ -550,6 +552,7 @@
     "        result = {\n",
     "            \"id\": (count + 1),\n",
     "            \"took\": took,\n",
+    "            \"hits\": hits,\n",
     "            \"total_hits\": total_hits,\n",
     "            \"ids\": ids,\n",
     "            \"scores\": scores,\n",
@@ -597,17 +600,17 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "def print_took_and_total_hits(filename, min_hits=0):\n",
+    "def print_took_and_total_hits(filename):\n",
     "    tooks = []\n",
+    "    hits = []\n",
     "    total_hits = []\n",
     "    with gzip.open(filename, \"rt\", encoding=\"utf-8\") as f:\n",
     "        for line in f.readlines():\n",
     "            obj = json.loads(line)\n",
-    "            hits = obj.get(\"total_hits\")\n",
-    "            if hits >= min_hits:\n",
-    "                tooks.append(obj.get(\"took\"))\n",
-    "                total_hits.append(hits)\n",
-    "    df = pd.DataFrame({\"took\": tooks, \"total_hits\": total_hits})\n",
+    "            tooks.append(obj.get(\"took\"))\n",
+    "            hits.append(obj.get(\"hits\"))\n",
+    "            total_hits.append(obj.get(\"total_hits\"))\n",
+    "    df = pd.DataFrame({\"took\": tooks, \"hits\": hits, \"total_hits\": total_hits})\n",
     "    print(df.describe().to_markdown())"
    ]
   },
   {
@@ -672,7 +675,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "for page_size in [10, 100, 400, 1000]:\n",
+    "for page_size in [10, 100, 400]:\n",
     "    print(f\"page size: {page_size}\")\n",
     "    filename = get_output_filename(es_version, f\"knn_{page_size}\", explain=False, track_total_hits=False)\n",
     "    search_with_knn_queries(filename, page_size=page_size, max_size=1000) # warmup\n",
diff --git a/run-opensearch.ipynb b/run-opensearch.ipynb
index 41fcd0f..5b44fd4 100644
--- a/run-opensearch.ipynb
+++ b/run-opensearch.ipynb
@@ -512,13 +512,14 @@
     "        obj = json.loads(response.text)\n",
     "        if obj.get(\"timed_out\"):\n",
     "            print(f\"[TIMEOUT] {query}\")\n",
-    "            return -1, -1, [], [], []\n",
-    "        product_ids = [x.get(\"_id\") for x in obj.get(\"hits\").get(\"hits\")]\n",
-    "        scores = [x.get(\"_score\") for x in obj.get(\"hits\").get(\"hits\")]\n",
-    "        explanations = [x.get(\"_explanation\") for x in obj.get(\"hits\").get(\"hits\")] if explain else []\n",
-    "        return obj.get(\"took\"), obj.get(\"hits\").get(\"total\").get(\"value\"), product_ids, scores, explanations\n",
+    "            return -1, -1, -1, [], [], []\n",
+    "        hits = obj.get(\"hits\").get(\"hits\")\n",
+    "        product_ids = [x.get(\"_id\") for x in hits]\n",
+    "        scores = [x.get(\"_score\") for x in hits]\n",
+    "        explanations = [x.get(\"_explanation\") for x in hits] if explain else []\n",
+    "        return obj.get(\"took\"), len(hits), obj.get(\"hits\").get(\"total\").get(\"value\"), product_ids, scores, explanations\n",
     "    print(f\"[FAIL][{response.status_code}] {response.text}\")\n",
-    "    return -1, -1, [], [], []\n"
+    "    return -1, -1, -1, [], [], []\n"
    ]
   },
   {
@@ -552,13 +553,14 @@
     "                }\n",
     "            }\n",
     "        }\n",
-    "        took, total_hits, ids, scores, explanations = search(query=query, size=page_size, explain=explain, track_total_hits=track_total_hits)\n",
+    "        took, hits, total_hits, ids, scores, explanations = search(query=query, size=page_size, explain=explain, track_total_hits=track_total_hits)\n",
     "        # print(f\"{took}, {total_hits}, {ids}, {scores}\")\n",
     "        if took == -1:\n",
     "            continue\n",
     "        result = {\n",
     "            \"id\": (count + 1),\n",
     "            \"took\": took,\n",
+    "            \"hits\": hits,\n",
     "            \"total_hits\": total_hits,\n",
     "            \"ids\": ids,\n",
     "            \"scores\": scores,\n",
@@ -606,17 +608,17 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "def print_took_and_total_hits(filename, min_hits=0):\n",
+    "def print_took_and_total_hits(filename):\n",
     "    tooks = []\n",
+    "    hits = []\n",
     "    total_hits = []\n",
     "    with gzip.open(filename, \"rt\", encoding=\"utf-8\") as f:\n",
     "        for line in f.readlines():\n",
     "            obj = json.loads(line)\n",
-    "            hits = obj.get(\"total_hits\")\n",
-    "            if hits >= min_hits:\n",
-    "                tooks.append(obj.get(\"took\"))\n",
-    "                total_hits.append(hits)\n",
-    "    df = pd.DataFrame({\"took\": tooks, \"total_hits\": total_hits})\n",
+    "            tooks.append(obj.get(\"took\"))\n",
+    "            hits.append(obj.get(\"hits\"))\n",
+    "            total_hits.append(obj.get(\"total_hits\"))\n",
+    "    df = pd.DataFrame({\"took\": tooks, \"hits\": hits, \"total_hits\": total_hits})\n",
     "    print(df.describe().to_markdown())"
    ]
   },
   {
@@ -679,7 +681,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "for page_size in [10, 100, 400, 1000]:\n",
+    "for page_size in [10, 100, 400]:\n",
     "    print(f\"page size: {page_size}\")\n",
     "    filename = get_output_filename(opensearch_version, f\"knn_{page_size}\", explain=False, track_total_hits=False)\n",
     "    search_with_knn_queries(filename, page_size=page_size, max_size=1000) # warmup\n",
diff --git a/run-qdrant.ipynb b/run-qdrant.ipynb
index 568fd0d..e3f8df3 100644
--- a/run-qdrant.ipynb
+++ b/run-qdrant.ipynb
@@ -442,14 +442,14 @@
     "                \"hnsw_ef\": hnsw_ef,\n",
     "            },\n",
     "        }\n",
-    "        took, total_hits, ids, scores = search(query=query)\n",
+    "        took, hits, ids, scores = search(query=query)\n",
     "        # print(f\"{took}, {total_hits}, {ids}, {scores}\")\n",
     "        if took == -1:\n",
     "            continue\n",
     "        result = {\n",
     "            \"id\": (count + 1),\n",
     "            \"took\": took,\n",
-    "            \"total_hits\": total_hits,\n",
+    "            \"hits\": hits,\n",
     "            \"ids\": ids,\n",
     "            \"scores\": scores,\n",
     "        }\n",
@@ -489,17 +489,15 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "def print_took_and_total_hits(filename, min_hits=0):\n",
+    "def print_took_and_total_hits(filename):\n",
     "    tooks = []\n",
-    "    total_hits = []\n",
+    "    hits = []\n",
     "    with gzip.open(filename, \"rt\", encoding=\"utf-8\") as f:\n",
     "        for line in f.readlines():\n",
     "            obj = json.loads(line)\n",
-    "            hits = obj.get(\"total_hits\")\n",
-    "            if hits >= min_hits:\n",
-    "                tooks.append(obj.get(\"took\"))\n",
-    "                total_hits.append(hits)\n",
-    "    df = pd.DataFrame({\"took\": tooks, \"total_hits\": total_hits})\n",
+    "            tooks.append(obj.get(\"took\"))\n",
+    "            hits.append(obj.get(\"hits\"))\n",
+    "    df = pd.DataFrame({\"took\": tooks, \"hits\": hits})\n",
     "    print(df.describe().to_markdown())"
    ]
   },
diff --git a/run-vespa.ipynb b/run-vespa.ipynb
index 61e86f9..58c6486 100644
--- a/run-vespa.ipynb
+++ b/run-vespa.ipynb
@@ -483,11 +483,12 @@
     "    if response.status_code == 200:\n",
     "        obj = json.loads(response.text)\n",
     "        took = obj.get(\"timing\").get(\"searchtime\") * 1000\n",
-    "        product_ids = [x.get(\"id\") for x in obj.get(\"root\").get(\"children\")]\n",
-    "        scores = [x.get(\"relevance\") for x in obj.get(\"root\").get(\"children\")]\n",
-    "        return took, int(obj.get(\"root\").get(\"coverage\").get(\"documents\")), product_ids, scores\n",
+    "        hits = obj.get(\"root\").get(\"children\")\n",
+    "        product_ids = [x.get(\"id\") for x in hits]\n",
+    "        scores = [x.get(\"relevance\") for x in hits]\n",
+    "        return took, len(hits), int(obj.get(\"root\").get(\"coverage\").get(\"documents\")), product_ids, scores\n",
     "    print(f\"[FAIL][{response.status_code}] {response.text}\")\n",
-    "    return -1, -1, [], []\n"
+    "    return -1, -1, -1, [], []\n"
    ]
   },
   {
@@ -514,19 +515,21 @@
     "        if distance == \"dotproduct\":\n",
     "            embedding = embedding.astype(np.float32)\n",
     "            embedding = embedding / np.linalg.norm(embedding)\n",
+    "        target_hits = hnsw_ef if hnsw_ef > page_size else page_size\n",
     "        query = {\n",
     "            \"hits\": page_size,\n",
-    "            \"yql\": \"select * from \" + index_name + \" where {approximate:true,targetHits:\" + str(hnsw_ef) + \"}nearestNeighbor(embedding,q)\",\n",
+    "            \"yql\": \"select * from \" + index_name + \" where {approximate:true,targetHits:\" + str(target_hits) + \"}nearestNeighbor(embedding,q)\",\n",
     "            \"ranking\": \"closeness\",\n",
     "            \"input.query(q)\": embedding.tolist(),\n",
     "        }\n",
-    "        took, total_hits, ids, scores = search(query=query)\n",
+    "        took, hits, total_hits, ids, scores = search(query=query)\n",
     "        # print(f\"{took}, {total_hits}, {ids}, {scores}\")\n",
     "        if took == -1:\n",
     "            continue\n",
     "        result = {\n",
     "            \"id\": (count + 1),\n",
     "            \"took\": took,\n",
+    "            \"hits\": hits,\n",
     "            \"total_hits\": total_hits,\n",
     "            \"ids\": ids,\n",
     "            \"scores\": scores,\n",
@@ -567,17 +570,17 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "def print_took_and_total_hits(filename, min_hits=0):\n",
+    "def print_took_and_total_hits(filename):\n",
     "    tooks = []\n",
+    "    hits = []\n",
     "    total_hits = []\n",
     "    with gzip.open(filename, \"rt\", encoding=\"utf-8\") as f:\n",
     "        for line in f.readlines():\n",
     "            obj = json.loads(line)\n",
-    "            hits = obj.get(\"total_hits\")\n",
-    "            if hits >= min_hits:\n",
-    "                tooks.append(obj.get(\"took\"))\n",
-    "                total_hits.append(hits)\n",
-    "    df = pd.DataFrame({\"took\": tooks, \"total_hits\": total_hits})\n",
+    "            tooks.append(obj.get(\"took\"))\n",
+    "            hits.append(obj.get(\"hits\"))\n",
+    "            total_hits.append(obj.get(\"total_hits\"))\n",
+    "    df = pd.DataFrame({\"took\": tooks, \"hits\": hits, \"total_hits\": total_hits})\n",
     "    print(df.describe().to_markdown())"
    ]
   },
diff --git a/run-weaviate.ipynb b/run-weaviate.ipynb
index af8c1a1..b21d5c2 100644
--- a/run-weaviate.ipynb
+++ b/run-weaviate.ipynb
@@ -422,14 +422,14 @@
     "        if distance == \"dot\":\n",
     "            embedding = embedding / np.linalg.norm(embedding)\n",
     "        query = create_query(embedding.tolist(), page_size)\n",
-    "        took, total_hits, ids, scores = search(query)\n",
+    "        took, hits, ids, scores = search(query)\n",
     "        # print(f\"{took}, {total_hits}, {ids}, {scores}\")\n",
     "        if took == -1:\n",
     "            continue\n",
     "        result = {\n",
     "            \"id\": (count + 1),\n",
     "            \"took\": took,\n",
-    "            \"total_hits\": total_hits,\n",
+    "            \"hits\": hits,\n",
     "            \"ids\": ids,\n",
     "            \"scores\": scores,\n",
     "        }\n",
@@ -469,17 +469,15 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "def print_took_and_total_hits(filename, min_hits=0):\n",
+    "def print_took_and_total_hits(filename):\n",
     "    tooks = []\n",
-    "    total_hits = []\n",
+    "    hits = []\n",
     "    with gzip.open(filename, \"rt\", encoding=\"utf-8\") as f:\n",
     "        for line in f.readlines():\n",
     "            obj = json.loads(line)\n",
-    "            hits = obj.get(\"total_hits\")\n",
-    "            if hits >= min_hits:\n",
-    "                tooks.append(obj.get(\"took\"))\n",
-    "                total_hits.append(hits)\n",
-    "    df = pd.DataFrame({\"took\": tooks, \"total_hits\": total_hits})\n",
+    "            tooks.append(obj.get(\"took\"))\n",
+    "            hits.append(obj.get(\"hits\"))\n",
+    "    df = pd.DataFrame({\"took\": tooks, \"hits\": hits})\n",
     "    print(df.describe().to_markdown())"
    ]
   },