update results.json
marevol committed Apr 26, 2024
1 parent 82819d8 commit 6ca451e
Showing 7 changed files with 78 additions and 47 deletions.
17 changes: 10 additions & 7 deletions run-elasticsearch.ipynb
@@ -13,9 +13,9 @@
"import pprint\n",
"import subprocess\n",
"import time\n",
"from datetime import timedelta\n",
"from datetime import timedelta, datetime\n",
"from pathlib import Path\n",
"from dataclasses import dataclass\n",
"from dataclasses import dataclass, asdict\n",
"import multiprocessing\n",
"\n",
"import numpy as np\n",
@@ -658,6 +658,7 @@
" print(\"Sending knn queries...\")\n",
" start_time = time.time()\n",
" pos = offset\n",
" doc_id = 0\n",
" count = 0\n",
" running = True\n",
" error_count = 0\n",
@@ -666,6 +667,7 @@
" with np.load(config.embedding_path / f\"{pos}.npz\") as data:\n",
" embedding_data = data[\"embs\"]\n",
" for embedding in embedding_data:\n",
" doc_id += 1\n",
" if count >= max_size:\n",
" running = False\n",
" break\n",
@@ -691,7 +693,7 @@
" break\n",
" continue\n",
" result = {\n",
" \"id\": (count + 1),\n",
" \"id\": doc_id,\n",
" \"took\": took,\n",
" \"hits\": hits,\n",
" \"total_hits\": total_hits,\n",
@@ -785,13 +787,14 @@
"metadata": {},
"outputs": [],
"source": [
"def save_results(config):\n",
"def save_results(target_config, config, results):\n",
" with open(\"results.json\", \"wt\", encoding=\"utf-8\") as f:\n",
" json.dump({\n",
" \"target\": target_config,\n",
" \"version\": config.elasticsearch_version,\n",
" \"java_heap\": config.elasticsearch_heap,\n",
" \"settings\": config,\n",
" \"settings\": asdict(config),\n",
" \"results\": results,\n",
" \"timestamp\": datetime.now().isoformat(),\n",
" }, f, ensure_ascii=False, default=lambda x: int(x) if isinstance(x, np.int64) else None)\n"
]
},
@@ -974,7 +977,7 @@
"metadata": {},
"outputs": [],
"source": [
"save_results(dataset_config)"
"save_results(target_config, dataset_config, results)"
]
},
{
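The id change above is the behavioral fix in this file: the recorded "id" used to be (count + 1), and count only advances for queries that return a result, so any error shifted every later id. The new doc_id advances for every embedding read from the shard, keeping ids aligned with query positions. A minimal sketch of the pattern (names taken from the diff; the client call is stubbed out as a plain callable):

def run_queries(embeddings, send_query, max_size=10000):
    results = []
    doc_id = 0  # advances for every embedding read from the shard
    count = 0   # advances only for queries that produced a result
    for embedding in embeddings:
        doc_id += 1
        if count >= max_size:
            break
        response = send_query(embedding)
        if response is None:  # error path: skip, but doc_id keeps moving
            continue
        # Pre-commit this was {"id": count + 1, ...}; after an error that
        # id would drift away from the query's position in the dataset.
        results.append({"id": doc_id, **response})
        count += 1
    return results

When no query fails, both schemes produce identical ids, which is why the change only matters for runs that hit errors.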
16 changes: 10 additions & 6 deletions run-milvus.ipynb
@@ -24,9 +24,9 @@
"import pprint\n",
"import subprocess\n",
"import time\n",
"from datetime import timedelta\n",
"from datetime import timedelta, datetime\n",
"from pathlib import Path\n",
"from dataclasses import dataclass\n",
"from dataclasses import dataclass, asdict\n",
"import multiprocessing\n",
"\n",
"import numpy as np\n",
@@ -642,6 +642,7 @@
" print(\"Sending knn queries...\")\n",
" start_time = time.time()\n",
" pos = offset\n",
" doc_id = 0\n",
" count = 0\n",
" running = True\n",
" error_count = 0\n",
@@ -650,6 +651,7 @@
" with np.load(config.embedding_path / f\"{pos}.npz\") as data:\n",
" embedding_data = data[\"embs\"]\n",
" for embedding in embedding_data:\n",
" doc_id += 1\n",
" if count >= max_size:\n",
" running = False\n",
" break\n",
@@ -682,7 +684,7 @@
" break\n",
" continue\n",
" result = {\n",
" \"id\": (count + 1),\n",
" \"id\": doc_id,\n",
" \"took\": took,\n",
" \"hits\": hits,\n",
" \"ids\": ids,\n",
@@ -766,12 +768,14 @@
"metadata": {},
"outputs": [],
"source": [
"def save_results(config):\n",
"def save_results(target_config, config, results):\n",
" with open(\"results.json\", \"wt\", encoding=\"utf-8\") as f:\n",
" json.dump({\n",
" \"target\": target_config,\n",
" \"version\": config.milvus_version,\n",
" \"settings\": config,\n",
" \"settings\": asdict(config),\n",
" \"results\": results,\n",
" \"timestamp\": datetime.now().isoformat(),\n",
" }, f, ensure_ascii=False, default=lambda x: int(x) if isinstance(x, np.int64) else None)\n"
]
},
@@ -923,7 +927,7 @@
"metadata": {},
"outputs": [],
"source": [
"save_results(dataset_config)"
"save_results(target_config, dataset_config, results)"
]
},
{
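The asdict change is why this commit touches results.json at all: json.dump cannot serialize a dataclass instance, so the old "settings": config fell through to the default= hook, which returns None for anything that is not np.int64, and the settings were silently written as null. A small self-contained demonstration (the Config fields here are placeholders):

import json
from dataclasses import dataclass, asdict

import numpy as np

@dataclass
class Config:
    index_name: str = "contents"
    dimension: int = 768

def to_int(x):
    return int(x) if isinstance(x, np.int64) else None

cfg = Config()
print(json.dumps({"settings": cfg}, default=to_int))
# {"settings": null}   <- the pre-commit behavior
print(json.dumps({"settings": asdict(cfg)}, default=to_int))
# {"settings": {"index_name": "contents", "dimension": 768}}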
17 changes: 10 additions & 7 deletions run-opensearch.ipynb
@@ -13,9 +13,9 @@
"import pprint\n",
"import subprocess\n",
"import time\n",
"from datetime import timedelta\n",
"from datetime import timedelta, datetime\n",
"from pathlib import Path\n",
"from dataclasses import dataclass\n",
"from dataclasses import dataclass, asdict\n",
"import multiprocessing\n",
"\n",
"import numpy as np\n",
@@ -669,6 +669,7 @@
" print(\"Sending knn queries...\")\n",
" start_time = time.time()\n",
" pos = offset\n",
" doc_id = 0\n",
" count = 0\n",
" running = True\n",
" error_count = 0\n",
@@ -677,6 +678,7 @@
" with np.load(config.embedding_path / f\"{pos}.npz\") as data:\n",
" embedding_data = data[\"embs\"]\n",
" for embedding in embedding_data:\n",
" doc_id += 1\n",
" if count >= max_size:\n",
" running = False\n",
" break\n",
@@ -701,7 +703,7 @@
" break\n",
" continue\n",
" result = {\n",
" \"id\": (count + 1),\n",
" \"id\": doc_id,\n",
" \"took\": took,\n",
" \"hits\": hits,\n",
" \"total_hits\": total_hits,\n",
@@ -795,13 +797,14 @@
"metadata": {},
"outputs": [],
"source": [
"def save_results(config):\n",
"def save_results(target_config, config, results):\n",
" with open(\"results.json\", \"wt\", encoding=\"utf-8\") as f:\n",
" json.dump({\n",
" \"target\": target_config,\n",
" \"version\": config.opensearch_version,\n",
" \"java_heap\": config.opensearch_heap,\n",
" \"settings\": config,\n",
" \"settings\": asdict(config),\n",
" \"results\": results,\n",
" \"timestamp\": datetime.now().isoformat(),\n",
" }, f, ensure_ascii=False, default=lambda x: int(x) if isinstance(x, np.int64) else None)\n"
]
},
@@ -980,7 +983,7 @@
"metadata": {},
"outputs": [],
"source": [
"save_results(dataset_config)"
"save_results(target_config, dataset_config, results)"
]
},
{
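After this change all five notebooks emit the same top-level layout in results.json. A hypothetical example of the written file follows; every key comes from the diff, but every value is an illustrative placeholder (page sizes, engine versions, and the contents of target_config are not shown in this commit):

example_results_json = {
    "target": {"name": "opensearch", "host": "localhost"},      # target_config (contents assumed)
    "version": "2.x",                                           # config.opensearch_version
    "java_heap": "1g",                                          # Elasticsearch/OpenSearch notebooks only
    "settings": {"index_name": "contents", "dimension": 768},   # asdict(config)
    "results": {"top_10": {}, "top_100": {}},                   # keyed f"top_{page_size}" as in the qdrant notebook
    "timestamp": "2024-04-26T00:00:00",                         # datetime.now().isoformat()
}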
16 changes: 10 additions & 6 deletions run-pgvector.ipynb
@@ -25,9 +25,9 @@
"import pprint\n",
"import subprocess\n",
"import time\n",
"from datetime import timedelta\n",
"from datetime import timedelta, datetime\n",
"from pathlib import Path\n",
"from dataclasses import dataclass\n",
"from dataclasses import dataclass, asdict\n",
"import multiprocessing\n",
"\n",
"import numpy as np\n",
@@ -562,6 +562,7 @@
" print(\"Sending knn queries...\")\n",
" start_time = time.time()\n",
" pos = offset\n",
" doc_id = 0\n",
" count = 0\n",
" running = True\n",
" error_count = 0\n",
@@ -570,6 +571,7 @@
" with np.load(config.embedding_path / f\"{pos}.npz\") as data:\n",
" embedding_data = data[\"embs\"]\n",
" for embedding in embedding_data:\n",
" doc_id += 1\n",
" if count >= max_size:\n",
" running = False\n",
" break\n",
@@ -584,7 +586,7 @@
" break\n",
" continue\n",
" result = {\n",
" \"id\": (count + 1),\n",
" \"id\": doc_id,\n",
" \"took\": took,\n",
" \"hits\": hits,\n",
" \"ids\": ids,\n",
@@ -668,12 +670,14 @@
"metadata": {},
"outputs": [],
"source": [
"def save_results(config):\n",
"def save_results(target_config, config, results):\n",
" with open(\"results.json\", \"wt\", encoding=\"utf-8\") as f:\n",
" json.dump({\n",
" \"target\": target_config,\n",
" \"version\": config.pgvector_version,\n",
" \"settings\": config,\n",
" \"settings\": asdict(config),\n",
" \"results\": results,\n",
" \"timestamp\": datetime.now().isoformat(),\n",
" }, f, ensure_ascii=False, default=lambda x: int(x) if isinstance(x, np.int64) else None)\n"
]
},
@@ -820,7 +824,7 @@
"metadata": {},
"outputs": [],
"source": [
"save_results(dataset_config)"
"save_results(target_config, dataset_config, results)"
]
},
{
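One caveat with the serializer shared by all five notebooks: default=lambda x: int(x) if isinstance(x, np.int64) else None converts NumPy integers but maps every other non-serializable value to None, so a future unserializable field would be written as null instead of raising. A short demonstration:

import json
from pathlib import Path

import numpy as np

def to_int(x):
    return int(x) if isinstance(x, np.int64) else None

print(json.dumps({"hits": np.int64(42)}, default=to_int))  # {"hits": 42}
print(json.dumps({"path": Path("/tmp")}, default=to_int))  # {"path": null}, silently dropped

A stricter hook that raises on unexpected types would have surfaced the pre-commit "settings": null problem immediately; the lenient fallback trades that safety for convenience.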
27 changes: 18 additions & 9 deletions run-qdrant.ipynb
@@ -13,9 +13,9 @@
"import pprint\n",
"import subprocess\n",
"import time\n",
"from datetime import timedelta\n",
"from datetime import timedelta, datetime\n",
"from pathlib import Path\n",
"from dataclasses import dataclass\n",
"from dataclasses import dataclass, asdict\n",
"import multiprocessing\n",
"\n",
"import numpy as np\n",
@@ -40,6 +40,7 @@
" index_name: str\n",
" distance: str\n",
" dimension: int\n",
" exact: bool\n",
" hnsw_m: int\n",
" hnsw_ef_construction: int\n",
" hnsw_ef: int\n",
@@ -62,6 +63,7 @@
" \"index_name\": \"contents\",\n",
" \"distance\": \"Dot\", # \"Cosine\"\n",
" \"dimension\": 768,\n",
" \"exact\": False,\n",
" \"hnsw_m\": 32,\n",
" \"hnsw_ef_construction\": 200,\n",
" \"hnsw_ef\": 100,\n",
@@ -76,6 +78,7 @@
" \"index_name\": \"contents\",\n",
" \"distance\": \"Dot\", # \"Cosine\"\n",
" \"dimension\": 768,\n",
" \"exact\": False,\n",
" \"hnsw_m\": 48,\n",
" \"hnsw_ef_construction\": 200,\n",
" \"hnsw_ef\": 100,\n",
@@ -90,6 +93,7 @@
" \"index_name\": \"contents\",\n",
" \"distance\": \"Dot\", # \"Cosine\"\n",
" \"dimension\": 768,\n",
" \"exact\": False,\n",
" \"hnsw_m\": 48,\n",
" \"hnsw_ef_construction\": 200,\n",
" \"hnsw_ef\": 100,\n",
@@ -553,10 +557,11 @@
"metadata": {},
"outputs": [],
"source": [
"def search_with_knn_queries(config, output_path, pre_filter=None, max_size=10000, page_size=100, offset=0, max_error_count=100):\n",
"def search_with_knn_queries(config, output_path, pre_filter=None, max_size=10000, page_size=100, offset=0, max_error_count=100, exact=False):\n",
" print(\"Sending knn queries...\")\n",
" start_time = time.time()\n",
" pos = offset\n",
" doc_id = 0\n",
" count = 0\n",
" running = True\n",
" error_count = 0\n",
@@ -565,6 +570,7 @@
" with np.load(config.embedding_path / f\"{pos}.npz\") as data:\n",
" embedding_data = data[\"embs\"]\n",
" for embedding in embedding_data:\n",
" doc_id += 1\n",
" if count >= max_size:\n",
" running = False\n",
" break\n",
@@ -576,6 +582,7 @@
" # \"with_payload\": True,\n",
" \"params\": {\n",
" \"hnsw_ef\": config.hnsw_ef,\n",
" \"exact\": exact,\n",
" },\n",
" }\n",
" if pre_filter is not None:\n",
@@ -589,7 +596,7 @@
" break\n",
" continue\n",
" result = {\n",
" \"id\": (count + 1),\n",
" \"id\": doc_id,\n",
" \"took\": took,\n",
" \"hits\": hits,\n",
" \"ids\": ids,\n",
@@ -673,12 +680,14 @@
"metadata": {},
"outputs": [],
"source": [
"def save_results(config):\n",
"def save_results(target_config, config, results):\n",
" with open(\"results.json\", \"wt\", encoding=\"utf-8\") as f:\n",
" json.dump({\n",
" \"target\": target_config,\n",
" \"version\": config.qdrant_version,\n",
" \"settings\": config,\n",
" \"settings\": asdict(config),\n",
" \"results\": results,\n",
" \"timestamp\": datetime.now().isoformat(),\n",
" }, f, ensure_ascii=False, default=lambda x: int(x) if isinstance(x, np.int64) else None)\n"
]
},
@@ -782,7 +791,7 @@
" filename = get_output_filename(dataset_config.qdrant_version, f\"knn_{page_size}\")\n",
" stop_update = start_update(target_config)\n",
" search_with_knn_queries(dataset_config, filename, page_size=page_size, max_size=1000) # warmup\n",
" search_with_knn_queries(dataset_config, filename, page_size=page_size, offset=dataset_config.index_size)\n",
" search_with_knn_queries(dataset_config, filename, page_size=page_size, offset=dataset_config.index_size, exact=dataset_config.exact)\n",
" stop_update()\n",
" results[f\"top_{page_size}\"] = print_took_and_total_hits(filename)"
]
@@ -815,7 +824,7 @@
" filename = get_output_filename(dataset_config.qdrant_version, f\"knn_{page_size}_filtered\")\n",
" stop_update = start_update(target_config)\n",
" search_with_knn_queries(dataset_config, filename, page_size=page_size, max_size=1000, pre_filter=pre_filter_generator()) # warmup\n",
" search_with_knn_queries(dataset_config, filename, page_size=page_size, offset=dataset_config.index_size, pre_filter=pre_filter_generator())\n",
" search_with_knn_queries(dataset_config, filename, page_size=page_size, offset=dataset_config.index_size, pre_filter=pre_filter_generator(), exact=dataset_config.exact)\n",
" stop_update()\n",
" results[f\"top_{page_size}_filtered\"] = print_took_and_total_hits(filename)"
]
@@ -839,7 +848,7 @@
"metadata": {},
"outputs": [],
"source": [
"save_results(dataset_config)"
"save_results(target_config, dataset_config, results)"
]
},
{
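The qdrant notebook additionally gains an exact flag: it is added to the dataset config, threaded through search_with_knn_queries, and placed in the query's params next to hnsw_ef. In Qdrant's search API, params.exact=true bypasses the HNSW index and scans vectors exhaustively, which is useful as a ground truth for recall but much slower at scale; all three configs in the diff set it to False, and only the timed runs (not the warmups) pass it through, so default behavior is unchanged. A hedged sketch of the request body being built (host, endpoint, and sizes are assumptions based on Qdrant defaults and the index_name in the configs; the notebook's actual HTTP plumbing is outside the shown hunks):

import requests

def knn_query(embedding, page_size=100, hnsw_ef=100, exact=False):
    body = {
        "vector": list(embedding),
        "limit": page_size,
        "params": {
            "hnsw_ef": hnsw_ef,
            # exact=True forces a full scan instead of the HNSW graph
            "exact": exact,
        },
    }
    resp = requests.post(
        "http://localhost:6333/collections/contents/points/search",
        json=body,
        timeout=60,
    )
    return resp.json()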