fix hit count

codelibs · Mar 21, 2024 · b6ce9a5 · b6ce9a5
1 parent f7e30b1
commit b6ce9a5
Show file tree

Hide file tree

Showing 5 changed files with 63 additions and 59 deletions.
diff --git a/run-elasticsearch.ipynb b/run-elasticsearch.ipynb
@@ -502,13 +502,14 @@
     "        obj = json.loads(response.text)\n",
     "        if obj.get(\"timed_out\"):\n",
     "            print(f\"[TIMEOUT] {query}\")\n",
-    "            return -1, -1, [], [], []\n",
-    "        product_ids = [x.get(\"_id\") for x in obj.get(\"hits\").get(\"hits\")]\n",
-    "        scores = [x.get(\"_score\") for x in obj.get(\"hits\").get(\"hits\")]\n",
-    "        explanations = [x.get(\"_explanation\") for x in obj.get(\"hits\").get(\"hits\")] if explain else []\n",
-    "        return obj.get(\"took\"), obj.get(\"hits\").get(\"total\").get(\"value\"), product_ids, scores, explanations\n",
+    "            return -1, -1, -1, [], [], []\n",
+    "        hits = obj.get(\"hits\").get(\"hits\")\n",
+    "        product_ids = [x.get(\"_id\") for x in hits]\n",
+    "        scores = [x.get(\"_score\") for x in hits]\n",
+    "        explanations = [x.get(\"_explanation\") for x in hits] if explain else []\n",
+    "        return obj.get(\"took\"), len(hits), obj.get(\"hits\").get(\"total\").get(\"value\"), product_ids, scores, explanations\n",
     "    print(f\"[FAIL][{response.status_code}] {response.text}\")\n",
-    "    return -1, -1, [], [], []\n"
+    "    return -1, -1, -1, [], [], []\n"
    ]
   },
   {
@@ -535,21 +536,23 @@
     "                if distance == \"dot_product\":\n",
     "                    embedding = embedding.astype(np.float32)\n",
     "                    embedding = embedding / np.linalg.norm(embedding)\n",
+    "                num_candidates = hnsw_ef if hnsw_ef > page_size else page_size\n",
     "                query = {\n",
     "                    \"knn\": {\n",
     "                        \"field\": \"embedding\",\n",
     "                        \"query_vector\": embedding.tolist(),\n",
-    "                        \"num_candidates\": hnsw_ef\n",
+    "                        \"num_candidates\": num_candidates\n",
     "                    }\n",
     "                }\n",
-    "                took, total_hits, ids, scores, explanations = search(query=query, size=page_size, explain=explain, track_total_hits=track_total_hits)\n",
+    "                took, hits, total_hits, ids, scores, explanations = search(query=query, size=page_size, explain=explain, track_total_hits=track_total_hits)\n",
     "                # print(f\"{took}, {total_hits}, {ids}, {scores}\")\n",
     "                if took == -1:\n",
     "                    print(f\"norm: {np.linalg.norm(embedding)}\")\n",
     "                    continue\n",
     "                result = {\n",
     "                    \"id\": (count + 1),\n",
     "                    \"took\": took,\n",
+    "                    \"hits\": hits,\n",
     "                    \"total_hits\": total_hits,\n",
     "                    \"ids\": ids,\n",
     "                    \"scores\": scores,\n",
@@ -597,17 +600,17 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "def print_took_and_total_hits(filename, min_hits=0):\n",
+    "def print_took_and_total_hits(filename):  # print describe() stats over a gzipped JSON-lines result file\n",
     "    tooks = []\n",
+    "    hits = []\n",
     "    total_hits = []\n",
     "    with gzip.open(filename, \"rt\", encoding=\"utf-8\") as f:\n",
     "        for line in f.readlines():\n",
     "            obj = json.loads(line)\n",
-    "            hits = obj.get(\"total_hits\")\n",
-    "            if hits >= min_hits:\n",
-    "                tooks.append(obj.get(\"took\"))\n",
-    "                total_hits.append(hits)\n",
-    "    df = pd.DataFrame({\"took\": tooks, \"total_hits\": total_hits})\n",
+    "            tooks.append(obj.get(\"took\"))\n",
+    "            hits.append(obj.get(\"hits\"))  # append: rebinding would keep only the last record's value\n",
+    "            total_hits.append(obj.get(\"total_hits\"))  # append so every record contributes to the stats\n",
+    "    df = pd.DataFrame({\"took\": tooks, \"hits\": hits, \"total_hits\": total_hits})\n",
     "    print(df.describe().to_markdown())"
    ]
   },
@@ -672,7 +675,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "for page_size in [10, 100, 400, 1000]:\n",
+    "for page_size in [10, 100, 400]:\n",
     "    print(f\"page size: {page_size}\")\n",
     "    filename = get_output_filename(es_version, f\"knn_{page_size}\", explain=False, track_total_hits=False)\n",
     "    search_with_knn_queries(filename, page_size=page_size, max_size=1000) # warmup\n",

diff --git a/run-opensearch.ipynb b/run-opensearch.ipynb
@@ -512,13 +512,14 @@
     "        obj = json.loads(response.text)\n",
     "        if obj.get(\"timed_out\"):\n",
     "            print(f\"[TIMEOUT] {query}\")\n",
-    "            return -1, -1, [], [], []\n",
-    "        product_ids = [x.get(\"_id\") for x in obj.get(\"hits\").get(\"hits\")]\n",
-    "        scores = [x.get(\"_score\") for x in obj.get(\"hits\").get(\"hits\")]\n",
-    "        explanations = [x.get(\"_explanation\") for x in obj.get(\"hits\").get(\"hits\")] if explain else []\n",
-    "        return obj.get(\"took\"), obj.get(\"hits\").get(\"total\").get(\"value\"), product_ids, scores, explanations\n",
+    "            return -1, -1, -1, [], [], []\n",
+    "        hits = obj.get(\"hits\").get(\"hits\")\n",
+    "        product_ids = [x.get(\"_id\") for x in hits]\n",
+    "        scores = [x.get(\"_score\") for x in hits]\n",
+    "        explanations = [x.get(\"_explanation\") for x in hits] if explain else []\n",
+    "        return obj.get(\"took\"), len(hits), obj.get(\"hits\").get(\"total\").get(\"value\"), product_ids, scores, explanations\n",
     "    print(f\"[FAIL][{response.status_code}] {response.text}\")\n",
-    "    return -1, -1, [], [], []\n"
+    "    return -1, -1, -1, [], [], []\n"
    ]
   },
   {
@@ -552,13 +553,14 @@
     "                        }\n",
     "                    }\n",
     "                }\n",
-    "                took, total_hits, ids, scores, explanations = search(query=query, size=page_size, explain=explain, track_total_hits=track_total_hits)\n",
+    "                took, hits, total_hits, ids, scores, explanations = search(query=query, size=page_size, explain=explain, track_total_hits=track_total_hits)\n",
     "                # print(f\"{took}, {total_hits}, {ids}, {scores}\")\n",
     "                if took == -1:\n",
     "                    continue\n",
     "                result = {\n",
     "                    \"id\": (count + 1),\n",
     "                    \"took\": took,\n",
+    "                    \"hits\": hits,\n",
     "                    \"total_hits\": total_hits,\n",
     "                    \"ids\": ids,\n",
     "                    \"scores\": scores,\n",
@@ -606,17 +608,17 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "def print_took_and_total_hits(filename, min_hits=0):\n",
+    "def print_took_and_total_hits(filename):  # print describe() stats over a gzipped JSON-lines result file\n",
     "    tooks = []\n",
+    "    hits = []\n",
     "    total_hits = []\n",
     "    with gzip.open(filename, \"rt\", encoding=\"utf-8\") as f:\n",
     "        for line in f.readlines():\n",
     "            obj = json.loads(line)\n",
-    "            hits = obj.get(\"total_hits\")\n",
-    "            if hits >= min_hits:\n",
-    "                tooks.append(obj.get(\"took\"))\n",
-    "                total_hits.append(hits)\n",
-    "    df = pd.DataFrame({\"took\": tooks, \"total_hits\": total_hits})\n",
+    "            tooks.append(obj.get(\"took\"))\n",
+    "            hits.append(obj.get(\"hits\"))  # append: rebinding would keep only the last record's value\n",
+    "            total_hits.append(obj.get(\"total_hits\"))  # append so every record contributes to the stats\n",
+    "    df = pd.DataFrame({\"took\": tooks, \"hits\": hits, \"total_hits\": total_hits})\n",
     "    print(df.describe().to_markdown())"
    ]
   },
@@ -679,7 +681,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "for page_size in [10, 100, 400, 1000]:\n",
+    "for page_size in [10, 100, 400]:\n",
     "    print(f\"page size: {page_size}\")\n",
     "    filename = get_output_filename(opensearch_version, f\"knn_{page_size}\", explain=False, track_total_hits=False)\n",
     "    search_with_knn_queries(filename, page_size=page_size, max_size=1000) # warmup\n",

diff --git a/run-qdrant.ipynb b/run-qdrant.ipynb
@@ -442,14 +442,14 @@
     "                        \"hnsw_ef\": hnsw_ef,\n",
     "                    },\n",
     "                }\n",
-    "                took, total_hits, ids, scores = search(query=query)\n",
+    "                took, hits, ids, scores = search(query=query)\n",
     "                # print(f\"{took}, {total_hits}, {ids}, {scores}\")\n",
     "                if took == -1:\n",
     "                    continue\n",
     "                result = {\n",
     "                    \"id\": (count + 1),\n",
     "                    \"took\": took,\n",
-    "                    \"total_hits\": total_hits,\n",
+    "                    \"hits\": hits,\n",
     "                    \"ids\": ids,\n",
     "                    \"scores\": scores,\n",
     "                }\n",
@@ -489,17 +489,15 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "def print_took_and_total_hits(filename, min_hits=0):\n",
+    "def print_took_and_total_hits(filename):  # print describe() stats over a gzipped JSON-lines result file\n",
     "    tooks = []\n",
-    "    total_hits = []\n",
+    "    hits = []\n",
     "    with gzip.open(filename, \"rt\", encoding=\"utf-8\") as f:\n",
     "        for line in f.readlines():\n",
     "            obj = json.loads(line)\n",
-    "            hits = obj.get(\"total_hits\")\n",
-    "            if hits >= min_hits:\n",
-    "                tooks.append(obj.get(\"took\"))\n",
-    "                total_hits.append(hits)\n",
-    "    df = pd.DataFrame({\"took\": tooks, \"total_hits\": total_hits})\n",
+    "            tooks.append(obj.get(\"took\"))\n",
+    "            hits.append(obj.get(\"hits\"))  # append: rebinding would keep only the last record's value\n",
+    "    df = pd.DataFrame({\"took\": tooks, \"hits\": hits})\n",
     "    print(df.describe().to_markdown())"
    ]
   },

diff --git a/run-vespa.ipynb b/run-vespa.ipynb
@@ -483,11 +483,12 @@
     "    if response.status_code == 200:\n",
     "        obj = json.loads(response.text)\n",
     "        took = obj.get(\"timing\").get(\"searchtime\") * 1000\n",
-    "        product_ids = [x.get(\"id\") for x in obj.get(\"root\").get(\"children\")]\n",
-    "        scores = [x.get(\"relevance\") for x in obj.get(\"root\").get(\"children\")]\n",
-    "        return took, int(obj.get(\"root\").get(\"coverage\").get(\"documents\")), product_ids, scores\n",
+    "        hits = obj.get(\"root\").get(\"children\")\n",
+    "        product_ids = [x.get(\"id\") for x in hits]\n",
+    "        scores = [x.get(\"relevance\") for x in hits]\n",
+    "        return took, len(hits), int(obj.get(\"root\").get(\"coverage\").get(\"documents\")), product_ids, scores\n",
     "    print(f\"[FAIL][{response.status_code}] {response.text}\")\n",
-    "    return -1, -1, [], []\n"
+    "    return -1, -1, -1, [], []\n"
    ]
   },
   {
@@ -514,19 +515,21 @@
     "                if distance == \"dotproduct\":\n",
     "                    embedding = embedding.astype(np.float32)\n",
     "                    embedding = embedding / np.linalg.norm(embedding)\n",
+    "                target_hits = hnsw_ef if hnsw_ef > page_size else page_size\n",
     "                query = {\n",
     "                    \"hits\": page_size,\n",
-    "                    \"yql\": \"select * from \" + index_name + \" where {approximate:true,targetHits:\" + str(hnsw_ef) + \"}nearestNeighbor(embedding,q)\",\n",
+    "                    \"yql\": \"select * from \" + index_name + \" where {approximate:true,targetHits:\" + str(target_hits) + \"}nearestNeighbor(embedding,q)\",\n",
     "                    \"ranking\": \"closeness\",\n",
     "                    \"input.query(q)\": embedding.tolist(),\n",
     "                }\n",
-    "                took, total_hits, ids, scores = search(query=query)\n",
+    "                took, hits, total_hits, ids, scores = search(query=query)\n",
     "                # print(f\"{took}, {total_hits}, {ids}, {scores}\")\n",
     "                if took == -1:\n",
     "                    continue\n",
     "                result = {\n",
     "                    \"id\": (count + 1),\n",
     "                    \"took\": took,\n",
+    "                    \"hits\": hits,\n",
     "                    \"total_hits\": total_hits,\n",
     "                    \"ids\": ids,\n",
     "                    \"scores\": scores,\n",
@@ -567,17 +570,17 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "def print_took_and_total_hits(filename, min_hits=0):\n",
+    "def print_took_and_total_hits(filename):  # print describe() stats over a gzipped JSON-lines result file\n",
     "    tooks = []\n",
+    "    hits = []\n",
     "    total_hits = []\n",
     "    with gzip.open(filename, \"rt\", encoding=\"utf-8\") as f:\n",
     "        for line in f.readlines():\n",
     "            obj = json.loads(line)\n",
-    "            hits = obj.get(\"total_hits\")\n",
-    "            if hits >= min_hits:\n",
-    "                tooks.append(obj.get(\"took\"))\n",
-    "                total_hits.append(hits)\n",
-    "    df = pd.DataFrame({\"took\": tooks, \"total_hits\": total_hits})\n",
+    "            tooks.append(obj.get(\"took\"))\n",
+    "            hits.append(obj.get(\"hits\"))  # append: rebinding would keep only the last record's value\n",
+    "            total_hits.append(obj.get(\"total_hits\"))  # append so every record contributes to the stats\n",
+    "    df = pd.DataFrame({\"took\": tooks, \"hits\": hits, \"total_hits\": total_hits})\n",
     "    print(df.describe().to_markdown())"
    ]
   },

diff --git a/run-weaviate.ipynb b/run-weaviate.ipynb
@@ -422,14 +422,14 @@
     "                if distance == \"dot\":\n",
     "                    embedding = embedding / np.linalg.norm(embedding)\n",
     "                query = create_query(embedding.tolist(), page_size)\n",
-    "                took, total_hits, ids, scores = search(query)\n",
+    "                took, hits, ids, scores = search(query)\n",
     "                # print(f\"{took}, {total_hits}, {ids}, {scores}\")\n",
     "                if took == -1:\n",
     "                    continue\n",
     "                result = {\n",
     "                    \"id\": (count + 1),\n",
     "                    \"took\": took,\n",
-    "                    \"total_hits\": total_hits,\n",
+    "                    \"hits\": hits,\n",
     "                    \"ids\": ids,\n",
     "                    \"scores\": scores,\n",
     "                }\n",
@@ -469,17 +469,15 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "def print_took_and_total_hits(filename, min_hits=0):\n",
+    "def print_took_and_total_hits(filename):  # print describe() stats over a gzipped JSON-lines result file\n",
     "    tooks = []\n",
-    "    total_hits = []\n",
+    "    hits = []\n",
     "    with gzip.open(filename, \"rt\", encoding=\"utf-8\") as f:\n",
     "        for line in f.readlines():\n",
     "            obj = json.loads(line)\n",
-    "            hits = obj.get(\"total_hits\")\n",
-    "            if hits >= min_hits:\n",
-    "                tooks.append(obj.get(\"took\"))\n",
-    "                total_hits.append(hits)\n",
-    "    df = pd.DataFrame({\"took\": tooks, \"total_hits\": total_hits})\n",
+    "            tooks.append(obj.get(\"took\"))\n",
+    "            hits.append(obj.get(\"hits\"))  # append: rebinding would keep only the last record's value\n",
+    "    df = pd.DataFrame({\"took\": tooks, \"hits\": hits})\n",
     "    print(df.describe().to_markdown())"
    ]
   },