From 2f4120ff27ff28041b8b0e8836fc29b89a719cba Mon Sep 17 00:00:00 2001
From: "Yu, Peng"
Date: Mon, 14 Nov 2022 15:19:56 +0800
Subject: [PATCH] reduce memory consumption of default benchmark configurations; update doc

---
 volatile/benchmark/bench.cpp |  37 +++++----
 volatile/doc/benchmark.md    | 149 +++++------------------------------
 2 files changed, 36 insertions(+), 150 deletions(-)

diff --git a/volatile/benchmark/bench.cpp b/volatile/benchmark/bench.cpp
index ee71e21a..44592181 100644
--- a/volatile/benchmark/bench.cpp
+++ b/volatile/benchmark/bench.cpp
@@ -22,9 +22,9 @@ using namespace KVDK_NAMESPACE;
 // Benchmark configs
 DEFINE_string(path, "/mnt/pmem0/kvdk", "Instance path");
 
-DEFINE_uint64(num_kv, (1 << 30), "Number of KVs to place");
+DEFINE_uint64(num_kv, (1 << 23), "Number of KVs to place");
 
-DEFINE_uint64(num_operations, (1 << 30),
+DEFINE_uint64(num_operations, (1 << 20),
               "Number of total operations. "
               "num_kv will override this when benchmarking fill/insert");
 
@@ -63,14 +63,10 @@ DEFINE_string(key_distribution, "random",
               "be ignored and only uniform distribution will be used");
 
 // Engine configs
-DEFINE_bool(
-    populate, false,
-    "Populate pmem space while creating a new instance. This can improve write "
-    "performance in runtime, but will take long time to init the instance");
-
 DEFINE_uint64(max_access_threads, 64, "Max access threads of the instance");
 
-DEFINE_uint64(space, (256ULL << 30), "Max usable PMem space of the instance");
+DEFINE_uint64(hash_bucket_num, (1 << 20),
+              "The number of initial buckets in hash table");
 
 DEFINE_bool(opt_large_sorted_collection_restore, true,
             " Optional optimization strategy which Multi-thread recovery a "
@@ -129,6 +125,7 @@ double existing_keys_ratio = 0;
 std::uint64_t batch_size = 0;
 bool scan = false;
 std::uint64_t num_operations = 0;
+std::uint64_t benchmark_threads = 0;
 
 std::uint64_t max_key = UINT64_MAX;
 extd::zipfian_distribution<std::uint64_t>* zipf = nullptr;
@@ -422,6 +419,7 @@ void InitializeBenchmark() {
   if (bench_data_type != DataType::Blackhole) {
     Configs configs;
     configs.max_access_threads = FLAGS_max_access_threads;
+    configs.hash_bucket_num = FLAGS_hash_bucket_num;
     configs.opt_large_sorted_collection_recovery =
         FLAGS_opt_large_sorted_collection_restore;
     configs.dest_memory_nodes = FLAGS_dest_memory_nodes;
@@ -483,18 +481,19 @@ void ProcessBenchmarkConfigs() {
     throw std::invalid_argument{"value size too large"};
   }
 
-  random_engines.resize(FLAGS_threads);
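+  // In fill mode every engine access thread is used as a write thread;
+  // otherwise the user-specified -threads count is used.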
+  benchmark_threads = fill ? FLAGS_max_access_threads : FLAGS_threads;
+  random_engines.resize(benchmark_threads);
   if (fill) {
     assert(read_ratio == 0);
     key_dist = KeyDistribution::Range;
-    operations_per_thread = FLAGS_num_kv / FLAGS_max_access_threads + 1;
+    operations_per_thread = FLAGS_num_kv / benchmark_threads + 1;
     ranges.clear();
-    for (size_t i = 0; i < FLAGS_max_access_threads; i++) {
+    for (size_t i = 0; i < benchmark_threads; i++) {
       ranges.emplace_back(i * operations_per_thread,
                           (i + 1) * operations_per_thread);
     }
   } else {
-    operations_per_thread = num_operations / FLAGS_threads;
+    operations_per_thread = num_operations / benchmark_threads;
     if (FLAGS_key_distribution == "random") {
       key_dist = KeyDistribution::Uniform;
     } else if (FLAGS_key_distribution == "zipf") {
@@ -527,12 +526,12 @@ void ResetBenchmarkData() {
   read_not_found = 0;
   has_timed_out = false;
   has_finished.clear();
-  has_finished.resize(FLAGS_threads, 0);
+  has_finished.resize(benchmark_threads, 0);
 
   if (FLAGS_latency) {
     printf("calculate latencies\n");
     latencies.clear();
-    latencies.resize(FLAGS_threads, std::vector<std::uint64_t>(MAX_LAT, 0));
+    latencies.resize(benchmark_threads, std::vector<std::uint64_t>(MAX_LAT, 0));
   }
 }
 
@@ -541,9 +540,9 @@ void RunBenchmark() {
   ResetBenchmarkData();
 
   size_t write_threads =
-      fill ? FLAGS_max_access_threads
-           : FLAGS_threads - read_ratio * 100 * FLAGS_threads / 100;
-  int read_threads = FLAGS_threads - write_threads;
+      fill ? benchmark_threads
+           : benchmark_threads - read_ratio * 100 * benchmark_threads / 100;
+  int read_threads = fill ? 0 : benchmark_threads - write_threads;
 
   std::vector<std::thread> ts;
   switch (bench_data_type) {
@@ -587,7 +586,7 @@ void RunBenchmark() {
   for (size_t i = 0; i < write_threads; i++) {
     ts.emplace_back(DBWrite, i);
   }
-  for (size_t i = write_threads; i < FLAGS_threads; i++) {
+  for (size_t i = write_threads; i < benchmark_threads; i++) {
     ts.emplace_back(scan ? DBScan : DBRead, i);
   }
 
@@ -628,7 +627,7 @@ void RunBenchmark() {
     if (num_finished == 0 || idx < 2) {
       last_effective_idx = idx;
     }
-    if (num_finished == FLAGS_threads) {
+    if (num_finished == benchmark_threads) {
       break;
     }
     if (!fill && (duration.count() >= FLAGS_timeout * 1000)) {

diff --git a/volatile/doc/benchmark.md b/volatile/doc/benchmark.md
index 34311427..968dcfd7 100644
--- a/volatile/doc/benchmark.md
+++ b/volatile/doc/benchmark.md
@@ -2,151 +2,38 @@
 
 To test performance of KVDK, you can run our benchmark tool "bench", the tool is auto-built along with KVDK library in the build dir.
 
-You can manually run individual benchmark follow the examples as shown bellow, or simply run our basic benchmark script "scripts/run_benchmark.py" to test all the basic read/write performance.
-
-To run the script, you shoulf first build kvdk, then run:
-
+Here is an example that runs a benchmark on the `string` type:
+```bash
+./bench -path=./kvdk_bench_dir -type=string -num_kv=8388608 -num_operations=1048576 -threads=10 -max_access_threads=64 -value_size=120 -latency=1
+```
 
-scripts/run_benchmark.py [data_type] [key distribution]
-
-data_type: Which data type to benchmark, it can be string/sorted/hash/list/blackhole/all
-key distribution: Distribution of key of the benchmark workloads, it can be random/zipf/all
-
-## Fill data to new instance
-
-To test performance, we need to first fill key-value pairs to the KVDK instance. Since KVDK did not support cross-socket access yet, we need to bind bench program to a numa node:
-
-    numactl --cpunodebind=0 --membind=0 ./bench -fill=1 -value_size=120 -threads=64 -path=/mnt/pmem0/kvdk -space=274877906944 -num=838860800 -max_write_threads=64 -type=string -populate=1
-
+To benchmark performance when KVs are stored on separate memory nodes, we can use `numactl`:
+```bash
+numactl --cpunodebind=0 --membind=0 ./bench -path=./kvdk_bench_dir -type=string -num_kv=8388608 -num_operations=1048576 -threads=10 -max_access_threads=64 -value_size=120 -latency=1 -dest_memory_nodes=1
+```
 
-This command will fill 83886088 uniform distributed string-type key-value pairs to the KVDK instance that located at /mnt/pmem0/kvdk.
+The above configurations will consume ~7 GB of memory.
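+
+To pre-fill an instance before benchmarking reads, a fill run can be used. This is a sketch based on the `-fill` flag defined in benchmark/bench.cpp; in fill mode the tool inserts `num_kv` KVs using `max_access_threads` write threads:
+```bash
+./bench -path=./kvdk_bench_dir -type=string -fill=1 -num_kv=8388608 -value_size=120 -max_access_threads=64
+```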
 
 Explanation of arguments:
 
-    -fill: Indicates filling data to a new instance.
-
-    -threads: Number of threads of benchmark.
+    -path: Path where the KVDK instance is created.
 
-    -space: PMem space that allocate to the KVDK instance.
+    -type: Type of key-value pairs to benchmark; it can be string/sorted/hash/list/blackhole.
 
-    -max_access_threads: Max concurrent access threads in the KVDK instance, set it to the number of the hyper-threads for performance consideration. You can call KVDK API with any number of threads, but if your parallel threads more than max_access_threads, the performance will be degraded due to synchronization cost
+    -num_kv: Number of KVs to insert when benchmarking fill/insert.
 
-    -type: Type of key-value pairs to benchmark, it can be "string", "hash" or "sorted".
+    -num_operations: Number of operations when running benchmarks other than fill/insert.
 
-    -populate: Populate pmem space while creating new KVDK instance for best write performance in runtime, see "include/kvdk/configs.hpp" for explanation.
+    -threads: Number of benchmark threads. `max_access_threads` will override this when benchmarking `fill`.
 
-## Test read/write performance
+    -max_access_threads: Max number of concurrent access threads in the KVDK instance; set it to the number of hyper-threads for best performance. You can call KVDK APIs from any number of threads, but if more threads access the engine in parallel than max_access_threads, performance will degrade due to synchronization cost.
 
-### Read performance
+    -value_size: Length of values in bytes.
 
-After fill the instance, we can test read performance with the command below:
+    -latency: Whether to print latencies of operations.
 
-    numactl --cpunodebind=0 --membind=0 ./bench -fill=0 -time=10 -value_size=120 -threads=64 -read_ratio=1 -existing_keys_ratio=1 -path=/mnt/pmem0/kvdk -space=274877906944 -num=838860800 -max_write_threads=64 -type=string
-
-This will read key-value pairs from the KVDK instance with 48 threads in 10 seconds.
-
-Explanation of arguments:
-
-    -read_ratio: Ratio of read threads among benchmark threads, for example, if set it to 0.5, then there will be 24 write threads and 24 read threads.
-
-    -existing_keys_ratio: Ratio of keys among key-value pairs to read that already filled in the instance. For example, if set it to 0.5, then 50% read operations will return NotFound.
-
-Benchmark tool will print performance stats to stdout, include throughput in each second and average ops:
-
-    $numactl --cpunodebind=0 --membind=0 ./bench -fill=0 -time=10 -value_size=120 -threads=64 -read_ratio=1 -existing_keys_ratio=1 -path=/mnt/pmem0/kvdk -space=274877906944 -num=838860800 -max_write_threads=64 -type=string
-
-    [LOG] time 0 ms: Initializing PMem size 274877906944 in file /mnt/pmem0/kvdk/data
-    [LOG] time 1864 ms: Map pmem space done
-    [LOG] time 9033 ms: In restoring: iterated 840882543 records
-    init 0 write threads
-    init 64 read threads
-    ------- ops in seconds -----------
-    time (ms),  read ops,   not found,  write ops,  total read,  total write
-    1000        73691000    0           0           73691000     0
-    2001        73613000    0           0           147304000    0
-    3002        73643000    0           0           220947000    0
-    4003        73656000    0           0           294603000    0
-    5004        73675000    0           0           368278000    0
-    6005        73667000    0           0           441945000    0
-    7006        73699000    0           0           515644000    0
-    8007        73647000    0           0           589291000    0
-    9008        73634000    0           0           662925000    0
-    10009       73677000    0           0           736602000    0
-    finish bench
-    ------------ statistics ------------
-    read ops 73660400, write ops 0
-    [LOG] time 19051 ms: instance closed
-
-
-### Write performance
-
-Similarily, to test write performance, we can simply modify "read_ratio":
-
-    numactl --cpunodebind=0 --membind=0 ./bench -fill=0 -time=10 -value_size=120 -threads=64 -read_ratio=0 -existing_keys_ratio=0 -path=/mnt/pmem0/kvdk -space=274877906944 -num=838860800 -max_write_threads=64 -type=string
-
-This command will insert new key-value pairs to the KVDK instance in 10 seconds. Likely wise, by modify "existing_keys_ratio", we can control how many write operations are updates.
-
-    $numactl --cpunodebind=0 --membind=0 ./bench -fill=0 -time=10 -value_size=120 -threads=64 -read_ratio=0 -existing_keys_ratio=0 -path=/mnt/pmem0/kvdk -space=274877906944 -num=838860800 -max_write_threads=64 -type=string
-
-    [LOG] time 0 ms: Initializing PMem size 274877906944 in file /mnt/pmem0/kvdk/data
-    [LOG] time 1865 ms: Map pmem space done
-    [LOG] time 9015 ms: In restoring: iterated 840882543 records
-    init 64 write threads
-    init 0 read threads
-    ------- ops in seconds -----------
-    time (ms),  read ops,  not found,  write ops,  total read,  total write
-    1000        0          0           50610000    0            50610000
-    2007        0          0           50053000    0            100663000
-    3016        0          0           49669000    0            150332000
-    4017        0          0           49048000    0            199380000
-    5018        0          0           48540000    0            247920000
-    6022        0          0           48210000    0            296130000
-    7023        0          0           47725000    0            343855000
-    8024        0          0           47354000    0            391209000
-    9027        0          0           47080000    0            438289000
-    10028       0          0           46544000    0            484833000
-    finish bench
-    ------------ statistics ------------
-    read ops 0, write ops 48483400
-    [LOG] time 19055 ms: instance closed
-
-### Stat latencies
-
-We can also stat latency information by add "-latency=1" to the benchmark command.
-
-    $ numactl --cpunodebind=0 --membind=0 ./bench -fill=0 -time=10 -value_size=120 -threads=64 -read_ratio=0.5 -existing_keys_ratio=1 -path=/mnt/pmem0/kvdk -space=274877906944 -num=838860800 -max_write_threads=64 -type=string -latency=1
-
-    [LOG] time 0 ms: Initializing PMem size 274877906944 in file /mnt/pmem0/kvdk/data
-    [LOG] time 1869 ms: Map pmem space done
-    [LOG] time 14963 ms: In restoring: iterated 1323729106 records
-    calculate latencies
-    init 6 write threads
-    init 58 read threads
-    ------- ops in seconds -----------
-    time (ms),  read ops,  not found,  write ops,  total read,  total write
-    1000        62763000   0           3933000     62763000     3933000
-    2001        62297000   0           4303000     125060000    8236000
-    3002        62190000   0           4530000     187250000    12766000
-    4003        62194000   0           4530000     249444000    17296000
-    5004        62206000   0           4531000     311650000    21827000
-    6005        62172000   0           4527000     373822000    26354000
-    7006        62194000   0           4530000     436016000    30884000
-    8007        62227000   0           4535000     498243000    35419000
-    9008        62196000   0           4529000     560439000    39948000
-    10009       62190000   0           4527000     622629000    44475000
-    finish bench
-    ------------ statistics ------------
-    read ops 62263100, write ops 4447500
-    read lantencies (us): Avg: 0.89, P50: 0.83, P99: 1.54, P99.5: 1.67, P99.9: 2.77, P99.99: 4.20
-    write lantencies (us): Avg: 0.09, P50: 1.22, P99: 2.64, P99.5: 3.25, P99.9: 4.22, P99.99: 5.35
-    [LOG] time 28382 ms: instance closed
+    -dest_memory_nodes: The memory nodes to store KV data.
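+
+Mixed read/write workloads are controlled by additional flags kept in benchmark/bench.cpp, e.g. `-read_ratio` (ratio of read threads among benchmark threads) and `-existing_keys_ratio`. A sketch, reusing the configuration above:
+```bash
+./bench -path=./kvdk_bench_dir -type=string -num_kv=8388608 -num_operations=1048576 -threads=10 -max_access_threads=64 -value_size=120 -read_ratio=0.9 -existing_keys_ratio=1
+```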
 
 ## More configurations
 
-For more configurations of the benchmark tool, please reference to "benchmark/bench.cpp" and "scripts/basic_benchmarks.py".
-
-
-
-
+For more configurations of the benchmark tool, please refer to "benchmark/bench.cpp".