From 2f4120ff27ff28041b8b0e8836fc29b89a719cba Mon Sep 17 00:00:00 2001
From: "Yu, Peng"
Date: Mon, 14 Nov 2022 15:19:56 +0800
Subject: [PATCH] reduce memory consumption of default benchmark configurations; update doc

---
 volatile/benchmark/bench.cpp |  37 +++++----
 volatile/doc/benchmark.md    | 149 +++++------------------------------
 2 files changed, 36 insertions(+), 150 deletions(-)

diff --git a/volatile/benchmark/bench.cpp b/volatile/benchmark/bench.cpp
index ee71e21a..44592181 100644
--- a/volatile/benchmark/bench.cpp
+++ b/volatile/benchmark/bench.cpp
@@ -22,9 +22,9 @@ using namespace KVDK_NAMESPACE;
 // Benchmark configs
 DEFINE_string(path, "/mnt/pmem0/kvdk", "Instance path");
 
-DEFINE_uint64(num_kv, (1 << 30), "Number of KVs to place");
+DEFINE_uint64(num_kv, (1 << 23), "Number of KVs to place");
 
-DEFINE_uint64(num_operations, (1 << 30),
+DEFINE_uint64(num_operations, (1 << 20),
               "Number of total operations. "
               "num_kv will override this when benchmarking fill/insert");
 
@@ -63,14 +63,10 @@ DEFINE_string(key_distribution, "random",
               "be ignored and only uniform distribution will be used");
 
 // Engine configs
-DEFINE_bool(
-    populate, false,
-    "Populate pmem space while creating a new instance. This can improve write "
-    "performance in runtime, but will take long time to init the instance");
-
 DEFINE_uint64(max_access_threads, 64, "Max access threads of the instance");
 
-DEFINE_uint64(space, (256ULL << 30), "Max usable PMem space of the instance");
+DEFINE_uint64(hash_bucket_num, (1 << 20),
+              "The number of initial buckets in hash table");
 
 DEFINE_bool(opt_large_sorted_collection_restore, true,
             " Optional optimization strategy which Multi-thread recovery a "
@@ -129,6 +125,7 @@ double existing_keys_ratio = 0;
 std::uint64_t batch_size = 0;
 bool scan = false;
 std::uint64_t num_operations = 0;
+std::uint64_t benchmark_threads = 0;
 
 std::uint64_t max_key = UINT64_MAX;
 extd::zipfian_distribution<std::uint64_t>* zipf = nullptr;
@@ -422,6 +419,7 @@ void InitializeBenchmark() {
   if (bench_data_type != DataType::Blackhole) {
     Configs configs;
     configs.max_access_threads = FLAGS_max_access_threads;
+    configs.hash_bucket_num = FLAGS_hash_bucket_num;
     configs.opt_large_sorted_collection_recovery =
         FLAGS_opt_large_sorted_collection_restore;
     configs.dest_memory_nodes = FLAGS_dest_memory_nodes;
@@ -483,18 +481,19 @@ void ProcessBenchmarkConfigs() {
     throw std::invalid_argument{"value size too large"};
   }
 
-  random_engines.resize(FLAGS_threads);
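+  // In fill mode every engine access thread is used as a write thread;
+  // otherwise the user-specified -threads count is used.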
+  benchmark_threads = fill ? FLAGS_max_access_threads : FLAGS_threads;
+  random_engines.resize(benchmark_threads);
   if (fill) {
     assert(read_ratio == 0);
     key_dist = KeyDistribution::Range;
-    operations_per_thread = FLAGS_num_kv / FLAGS_max_access_threads + 1;
+    operations_per_thread = FLAGS_num_kv / benchmark_threads + 1;
     ranges.clear();
-    for (size_t i = 0; i < FLAGS_max_access_threads; i++) {
+    for (size_t i = 0; i < benchmark_threads; i++) {
       ranges.emplace_back(i * operations_per_thread,
                           (i + 1) * operations_per_thread);
     }
   } else {
-    operations_per_thread = num_operations / FLAGS_threads;
+    operations_per_thread = num_operations / benchmark_threads;
     if (FLAGS_key_distribution == "random") {
       key_dist = KeyDistribution::Uniform;
     } else if (FLAGS_key_distribution == "zipf") {
@@ -527,12 +526,12 @@ void ResetBenchmarkData() {
   read_not_found = 0;
   has_timed_out = false;
   has_finished.clear();
-  has_finished.resize(FLAGS_threads, 0);
+  has_finished.resize(benchmark_threads, 0);
 
   if (FLAGS_latency) {
     printf("calculate latencies\n");
     latencies.clear();
-    latencies.resize(FLAGS_threads, std::vector<std::uint64_t>(MAX_LAT, 0));
+    latencies.resize(benchmark_threads, std::vector<std::uint64_t>(MAX_LAT, 0));
   }
 }
 
@@ -541,9 +540,9 @@ void RunBenchmark() {
   ResetBenchmarkData();
 
   size_t write_threads =
-      fill ? FLAGS_max_access_threads
-           : FLAGS_threads - read_ratio * 100 * FLAGS_threads / 100;
-  int read_threads = FLAGS_threads - write_threads;
+      fill ? benchmark_threads
+           : benchmark_threads - read_ratio * 100 * benchmark_threads / 100;
+  int read_threads = fill ? 0 : benchmark_threads - write_threads;
 
   std::vector<std::thread> ts;
   switch (bench_data_type) {
@@ -587,7 +586,7 @@ void RunBenchmark() {
   for (size_t i = 0; i < write_threads; i++) {
     ts.emplace_back(DBWrite, i);
   }
-  for (size_t i = write_threads; i < FLAGS_threads; i++) {
+  for (size_t i = write_threads; i < benchmark_threads; i++) {
     ts.emplace_back(scan ? DBScan : DBRead, i);
   }
 
@@ -628,7 +627,7 @@ void RunBenchmark() {
     if (num_finished == 0 || idx < 2) {
       last_effective_idx = idx;
     }
-    if (num_finished == FLAGS_threads) {
+    if (num_finished == benchmark_threads) {
       break;
     }
     if (!fill && (duration.count() >= FLAGS_timeout * 1000)) {

diff --git a/volatile/doc/benchmark.md b/volatile/doc/benchmark.md
index 34311427..968dcfd7 100644
--- a/volatile/doc/benchmark.md
+++ b/volatile/doc/benchmark.md
@@ -2,151 +2,38 @@
 
 To test performance of KVDK, you can run our benchmark tool "bench", the tool is auto-built along with KVDK library in the build dir.
 
-You can manually run individual benchmark follow the examples as shown bellow, or simply run our basic benchmark script "scripts/run_benchmark.py" to test all the basic read/write performance.
-
-To run the script, you shoulf first build kvdk, then run:
-
+Here is an example that runs a benchmark on the `string` type:
+```bash
+./bench -path=./kvdk_bench_dir -type=string -num_kv=8388608 -num_operations=1048576 -threads=10 -max_access_threads=64 -value_size=120 -latency=1
+```
 
-scripts/run_benchmark.py [data_type] [key distribution]
-
-data_type: Which data type to benchmark, it can be string/sorted/hash/list/blackhole/all
-key distribution: Distribution of key of the benchmark workloads, it can be random/zipf/all
-
-## Fill data to new instance
-
-To test performance, we need to first fill key-value pairs to the KVDK instance. Since KVDK did not support cross-socket access yet, we need to bind bench program to a numa node:
-
-    numactl --cpunodebind=0 --membind=0 ./bench -fill=1 -value_size=120 -threads=64 -path=/mnt/pmem0/kvdk -space=274877906944 -num=838860800 -max_write_threads=64 -type=string -populate=1
-
+To benchmark performance when KVs are stored on separate memory nodes, we can use `numactl`:
+```bash
+numactl --cpunodebind=0 --membind=0 ./bench -path=./kvdk_bench_dir -type=string -num_kv=8388608 -num_operations=1048576 -threads=10 -max_access_threads=64 -value_size=120 -latency=1 -dest_memory_nodes=1
+```
 
-This command will fill 83886088 uniform distributed string-type key-value pairs to the KVDK instance that located at /mnt/pmem0/kvdk.
+The above configurations will consume ~7 GB of memory.
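+
+To pre-fill an instance before benchmarking reads, a fill run can be used. This is a sketch based on the `-fill` flag defined in benchmark/bench.cpp; in fill mode the tool inserts `num_kv` KVs using `max_access_threads` write threads:
+```bash
+./bench -path=./kvdk_bench_dir -type=string -fill=1 -num_kv=8388608 -value_size=120 -max_access_threads=64
+```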
 
 Explanation of arguments:
 
-    -fill: Indicates filling data to a new instance.
-
-    -threads: Number of threads of benchmark.
+    -path: Path where the KVDK instance is created.
 
-    -space: PMem space that allocate to the KVDK instance.
+    -type: Type of key-value pairs to benchmark; it can be string/sorted/hash/list/blackhole.
 
-    -max_access_threads: Max concurrent access threads in the KVDK instance, set it to the number of the hyper-threads for performance consideration. You can call KVDK API with any number of threads, but if your parallel threads more than max_access_threads, the performance will be degraded due to synchronization cost
+    -num_kv: Number of KVs to insert when benchmarking fill/insert.
 
-    -type: Type of key-value pairs to benchmark, it can be "string", "hash" or "sorted".
+    -num_operations: Number of operations when running benchmarks other than fill/insert.
 
-    -populate: Populate pmem space while creating new KVDK instance for best write performance in runtime, see "include/kvdk/configs.hpp" for explanation.
+    -threads: Number of benchmark threads. `max_access_threads` will override this when benchmarking `fill`.
 
-## Test read/write performance
+    -max_access_threads: Max number of concurrent access threads in the KVDK instance; set it to the number of hyper-threads for best performance. You can call KVDK APIs from any number of threads, but if more threads access the engine in parallel than max_access_threads, performance will degrade due to synchronization cost.
 
-### Read performance
+    -value_size: Length of values in bytes.
 
-After fill the instance, we can test read performance with the command below:
+    -latency: Whether to print latencies of operations.
 
-    numactl --cpunodebind=0 --membind=0 ./bench -fill=0 -time=10 -value_size=120 -threads=64 -read_ratio=1 -existing_keys_ratio=1 -path=/mnt/pmem0/kvdk -space=274877906944 -num=838860800 -max_write_threads=64 -type=string
-
-This will read key-value pairs from the KVDK instance with 48 threads in 10 seconds.
-
-Explanation of arguments:
-
-    -read_ratio: Ratio of read threads among benchmark threads, for example, if set it to 0.5, then there will be 24 write threads and 24 read threads.
-
-    -existing_keys_ratio: Ratio of keys among key-value pairs to read that already filled in the instance. For example, if set it to 0.5, then 50% read operations will return NotFound.
-
-Benchmark tool will print performance stats to stdout, include throughput in each second and average ops:
-
-    $numactl --cpunodebind=0 --membind=0 ./bench -fill=0 -time=10 -value_size=120 -threads=64 -read_ratio=1 -existing_keys_ratio=1 -path=/mnt/pmem0/kvdk -space=274877906944 -num=838860800 -max_write_threads=64 -type=string
-
-    [LOG] time 0 ms: Initializing PMem size 274877906944 in file /mnt/pmem0/kvdk/data
-    [LOG] time 1864 ms: Map pmem space done
-    [LOG] time 9033 ms: In restoring: iterated 840882543 records
-    init 0 write threads
-    init 64 read threads
-    ------- ops in seconds -----------
-    time (ms),  read ops,   not found,  write ops,  total read,  total write
-    1000        73691000    0           0           73691000     0
-    2001        73613000    0           0           147304000    0
-    3002        73643000    0           0           220947000    0
-    4003        73656000    0           0           294603000    0
-    5004        73675000    0           0           368278000    0
-    6005        73667000    0           0           441945000    0
-    7006        73699000    0           0           515644000    0
-    8007        73647000    0           0           589291000    0
-    9008        73634000    0           0           662925000    0
-    10009       73677000    0           0           736602000    0
-    finish bench
-    ------------ statistics ------------
-    read ops 73660400, write ops 0
-    [LOG] time 19051 ms: instance closed
-
-
-### Write performance
-
-Similarily, to test write performance, we can simply modify "read_ratio":
-
-    numactl --cpunodebind=0 --membind=0 ./bench -fill=0 -time=10 -value_size=120 -threads=64 -read_ratio=0 -existing_keys_ratio=0 -path=/mnt/pmem0/kvdk -space=274877906944 -num=838860800 -max_write_threads=64 -type=string
-
-This command will insert new key-value pairs to the KVDK instance in 10 seconds. Likely wise, by modify "existing_keys_ratio", we can control how many write operations are updates.
-
-    $numactl --cpunodebind=0 --membind=0 ./bench -fill=0 -time=10 -value_size=120 -threads=64 -read_ratio=0 -existing_keys_ratio=0 -path=/mnt/pmem0/kvdk -space=274877906944 -num=838860800 -max_write_threads=64 -type=string
-
-    [LOG] time 0 ms: Initializing PMem size 274877906944 in file /mnt/pmem0/kvdk/data
-    [LOG] time 1865 ms: Map pmem space done
-    [LOG] time 9015 ms: In restoring: iterated 840882543 records
-    init 64 write threads
-    init 0 read threads
-    ------- ops in seconds -----------
-    time (ms),  read ops,  not found,  write ops,  total read,  total write
-    1000        0          0           50610000    0            50610000
-    2007        0          0           50053000    0            100663000
-    3016        0          0           49669000    0            150332000
-    4017        0          0           49048000    0            199380000
-    5018        0          0           48540000    0            247920000
-    6022        0          0           48210000    0            296130000
-    7023        0          0           47725000    0            343855000
-    8024        0          0           47354000    0            391209000
-    9027        0          0           47080000    0            438289000
-    10028       0          0           46544000    0            484833000
-    finish bench
-    ------------ statistics ------------
-    read ops 0, write ops 48483400
-    [LOG] time 19055 ms: instance closed
-
-### Stat latencies
-
-We can also stat latency information by add "-latency=1" to the benchmark command.
-
-    $ numactl --cpunodebind=0 --membind=0 ./bench -fill=0 -time=10 -value_size=120 -threads=64 -read_ratio=0.5 -existing_keys_ratio=1 -path=/mnt/pmem0/kvdk -space=274877906944 -num=838860800 -max_write_threads=64 -type=string -latency=1
-
-    [LOG] time 0 ms: Initializing PMem size 274877906944 in file /mnt/pmem0/kvdk/data
-    [LOG] time 1869 ms: Map pmem space done
-    [LOG] time 14963 ms: In restoring: iterated 1323729106 records
-    calculate latencies
-    init 6 write threads
-    init 58 read threads
-    ------- ops in seconds -----------
-    time (ms),  read ops,  not found,  write ops,  total read,  total write
-    1000        62763000   0           3933000     62763000     3933000
-    2001        62297000   0           4303000     125060000    8236000
-    3002        62190000   0           4530000     187250000    12766000
-    4003        62194000   0           4530000     249444000    17296000
-    5004        62206000   0           4531000     311650000    21827000
-    6005        62172000   0           4527000     373822000    26354000
-    7006        62194000   0           4530000     436016000    30884000
-    8007        62227000   0           4535000     498243000    35419000
-    9008        62196000   0           4529000     560439000    39948000
-    10009       62190000   0           4527000     622629000    44475000
-    finish bench
-    ------------ statistics ------------
-    read ops 62263100, write ops 4447500
-    read lantencies (us): Avg: 0.89, P50: 0.83, P99: 1.54, P99.5: 1.67, P99.9: 2.77, P99.99: 4.20
-    write lantencies (us): Avg: 0.09, P50: 1.22, P99: 2.64, P99.5: 3.25, P99.9: 4.22, P99.99: 5.35
-    [LOG] time 28382 ms: instance closed
+    -dest_memory_nodes: The memory nodes to store KV data.
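+
+Mixed read/write workloads are controlled by additional flags kept in benchmark/bench.cpp, e.g. `-read_ratio` (ratio of read threads among benchmark threads) and `-existing_keys_ratio`. A sketch, reusing the configuration above:
+```bash
+./bench -path=./kvdk_bench_dir -type=string -num_kv=8388608 -num_operations=1048576 -threads=10 -max_access_threads=64 -value_size=120 -read_ratio=0.9 -existing_keys_ratio=1
+```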
 
 ## More configurations
 
-For more configurations of the benchmark tool, please reference to "benchmark/bench.cpp" and "scripts/basic_benchmarks.py".
-
-
-
-
+For more configurations of the benchmark tool, please refer to "benchmark/bench.cpp".