This repository has been archived by the owner on Nov 4, 2024. It is now read-only.

Commit 2f4120f

reduce memory consumption of default benchmark configurations; update doc

iyupeng committed Nov 14, 2022
1 parent 7522e98

Showing 2 changed files with 36 additions and 150 deletions.
37 changes: 18 additions & 19 deletions volatile/benchmark/bench.cpp
@@ -22,9 +22,9 @@ using namespace KVDK_NAMESPACE;
 // Benchmark configs
 DEFINE_string(path, "/mnt/pmem0/kvdk", "Instance path");
 
-DEFINE_uint64(num_kv, (1 << 30), "Number of KVs to place");
+DEFINE_uint64(num_kv, (1 << 23), "Number of KVs to place");
 
-DEFINE_uint64(num_operations, (1 << 30),
+DEFINE_uint64(num_operations, (1 << 20),
               "Number of total operations. "
               "num_kv will override this when benchmarking fill/insert");

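For reference, the old scale is still reachable by overriding the new defaults from the command line. A minimal sketch (hypothetical invocation; flags as defined above):

```bash
# New defaults: num_kv = 1 << 23, num_operations = 1 << 20.
# Pass the old 1 << 30 values explicitly to reproduce the previous footprint.
./bench -type=string \
        -num_kv=$((1 << 30)) \
        -num_operations=$((1 << 30)) \
        -path=./kvdk_bench_dir
```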
@@ -63,14 +63,10 @@ DEFINE_string(key_distribution, "random",
               "be ignored and only uniform distribution will be used");
 
 // Engine configs
-DEFINE_bool(
-    populate, false,
-    "Populate pmem space while creating a new instance. This can improve write "
-    "performance in runtime, but will take long time to init the instance");
-
 DEFINE_uint64(max_access_threads, 64, "Max access threads of the instance");
 
-DEFINE_uint64(space, (256ULL << 30), "Max usable PMem space of the instance");
+DEFINE_uint64(hash_bucket_num, (1 << 20),
+              "The number of initial buckets in hash table");
 
 DEFINE_bool(opt_large_sorted_collection_restore, true,
             " Optional optimization strategy which Multi-thread recovery a "
@@ -129,6 +125,7 @@ double existing_keys_ratio = 0;
 std::uint64_t batch_size = 0;
 bool scan = false;
 std::uint64_t num_operations = 0;
+std::uint64_t benchmark_threads = 0;
 
 std::uint64_t max_key = UINT64_MAX;
 extd::zipfian_distribution<std::uint64_t>* zipf = nullptr;
@@ -422,6 +419,7 @@ void InitializeBenchmark() {
   if (bench_data_type != DataType::Blackhole) {
     Configs configs;
     configs.max_access_threads = FLAGS_max_access_threads;
+    configs.hash_bucket_num = FLAGS_hash_bucket_num;
     configs.opt_large_sorted_collection_recovery =
         FLAGS_opt_large_sorted_collection_restore;
     configs.dest_memory_nodes = FLAGS_dest_memory_nodes;
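With the flag now wired into `Configs`, the hash table can be sized explicitly for larger fills. A hypothetical invocation (the 1:1 bucket-to-key ratio here is illustrative, not a recommendation from the source):

```bash
# hash_bucket_num defaults to 1 << 20 initial buckets; scaling it with
# num_kv keeps hash chains short on fills much larger than the default.
./bench -type=string \
        -num_kv=$((1 << 27)) \
        -hash_bucket_num=$((1 << 27)) \
        -path=./kvdk_bench_dir
```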
@@ -483,18 +481,19 @@ void ProcessBenchmarkConfigs() {
     throw std::invalid_argument{"value size too large"};
   }
 
-  random_engines.resize(FLAGS_threads);
+  benchmark_threads = fill ? FLAGS_max_access_threads : FLAGS_threads;
+  random_engines.resize(benchmark_threads);
   if (fill) {
     assert(read_ratio == 0);
     key_dist = KeyDistribution::Range;
-    operations_per_thread = FLAGS_num_kv / FLAGS_max_access_threads + 1;
+    operations_per_thread = FLAGS_num_kv / benchmark_threads + 1;
     ranges.clear();
-    for (size_t i = 0; i < FLAGS_max_access_threads; i++) {
+    for (size_t i = 0; i < benchmark_threads; i++) {
       ranges.emplace_back(i * operations_per_thread,
                           (i + 1) * operations_per_thread);
     }
   } else {
-    operations_per_thread = num_operations / FLAGS_threads;
+    operations_per_thread = num_operations / benchmark_threads;
     if (FLAGS_key_distribution == "random") {
       key_dist = KeyDistribution::Uniform;
     } else if (FLAGS_key_distribution == "zipf") {
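The effect of `benchmark_threads` is easiest to see from the command line: for fill workloads the thread count now comes from `max_access_threads`, not `-threads`. A hypothetical fill run:

```bash
# With -fill=1, benchmark_threads = max_access_threads, so this run
# fills with 64 threads and the -threads=10 setting has no effect.
./bench -fill=1 -type=string -threads=10 -max_access_threads=64 \
        -value_size=120 -path=./kvdk_bench_dir
```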
@@ -527,12 +526,12 @@ void ResetBenchmarkData() {
   read_not_found = 0;
   has_timed_out = false;
   has_finished.clear();
-  has_finished.resize(FLAGS_threads, 0);
+  has_finished.resize(benchmark_threads, 0);
 
   if (FLAGS_latency) {
     printf("calculate latencies\n");
     latencies.clear();
-    latencies.resize(FLAGS_threads, std::vector<std::uint64_t>(MAX_LAT, 0));
+    latencies.resize(benchmark_threads, std::vector<std::uint64_t>(MAX_LAT, 0));
   }
 }

@@ -541,9 +540,9 @@ void RunBenchmark() {
   ResetBenchmarkData();
 
   size_t write_threads =
-      fill ? FLAGS_max_access_threads
-           : FLAGS_threads - read_ratio * 100 * FLAGS_threads / 100;
-  int read_threads = FLAGS_threads - write_threads;
+      fill ? benchmark_threads
+           : benchmark_threads - read_ratio * 100 * benchmark_threads / 100;
+  int read_threads = fill ? 0 : benchmark_threads - write_threads;
   std::vector<std::thread> ts;
 
   switch (bench_data_type) {
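For non-fill runs, `read_ratio` still partitions `benchmark_threads` between writers and readers. A hypothetical mixed workload illustrating the split:

```bash
# read_ratio=0.5 over -threads=10 yields 5 write and 5 read threads;
# existing_keys_ratio=1 keeps reads from returning NotFound.
./bench -type=string -threads=10 -read_ratio=0.5 -existing_keys_ratio=1 \
        -path=./kvdk_bench_dir
```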
@@ -587,7 +586,7 @@
   for (size_t i = 0; i < write_threads; i++) {
     ts.emplace_back(DBWrite, i);
   }
-  for (size_t i = write_threads; i < FLAGS_threads; i++) {
+  for (size_t i = write_threads; i < benchmark_threads; i++) {
     ts.emplace_back(scan ? DBScan : DBRead, i);
   }
 
@@ -628,7 +627,7 @@
       if (num_finished == 0 || idx < 2) {
         last_effective_idx = idx;
       }
-      if (num_finished == FLAGS_threads) {
+      if (num_finished == benchmark_threads) {
        break;
      }
      if (!fill && (duration.count() >= FLAGS_timeout * 1000)) {

149 changes: 18 additions & 131 deletions volatile/doc/benchmark.md
@@ -2,151 +2,38 @@
 
 To test performance of KVDK, you can run our benchmark tool "bench"; the tool is auto-built along with the KVDK library in the build dir.
 
-You can manually run individual benchmarks following the examples shown below, or simply run our basic benchmark script "scripts/run_benchmark.py" to test all the basic read/write performance.
-
-To run the script, you should first build kvdk, then run:
-
-```
-scripts/run_benchmark.py [data_type] [key distribution]
-```
-
-data_type: Which data type to benchmark; it can be string/sorted/hash/list/blackhole/all.
-
-key distribution: Distribution of keys for the benchmark workloads; it can be random/zipf/all.
-
-## Fill data to new instance
-
-To test performance, we first need to fill key-value pairs into the KVDK instance. Since KVDK does not support cross-socket access yet, we need to bind the bench program to a NUMA node:
-
-numactl --cpunodebind=0 --membind=0 ./bench -fill=1 -value_size=120 -threads=64 -path=/mnt/pmem0/kvdk -space=274877906944 -num=838860800 -max_write_threads=64 -type=string -populate=1
-
-This command will fill 838860800 uniformly distributed string-type key-value pairs into the KVDK instance located at /mnt/pmem0/kvdk.
+Here is an example to run benchmarks on `string` type:
+```bash
+./bench -path=./kvdk_bench_dir -type=string -num_kv=8388608 -num_operations=1048576 -threads=10 -max_access_threads=64 -value_size=120 -latency=1
+```
+
+To benchmark performance when KVs are stored on separate memory nodes, we can use `numactl`:
+```bash
+numactl --cpunodebind=0 --membind=0 ./bench -path=./kvdk_bench_dir -type=string -num_kv=8388608 -num_operations=1048576 -threads=10 -max_access_threads=64 -value_size=120 -latency=1 -dest_memory_nodes=1
+```
+
+The above configuration will consume ~7 GB of memory.
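To measure the harness overhead alone, the `blackhole` type from the `-type` choices can be used; since no engine is created for it (see bench.cpp above), memory use is minimal. A hypothetical run:

```bash
# blackhole discards all operations, so this benchmarks the tool itself
# rather than KVDK.
./bench -type=blackhole -num_kv=8388608 -num_operations=1048576 \
        -threads=10 -path=./kvdk_bench_dir
```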

 Explanation of arguments:
 
--fill: Indicates filling data to a new instance.
-
--threads: Number of threads of the benchmark.
+-path: Path where the KVDK instance is initialized.
 
--space: PMem space allocated to the KVDK instance.
+-type: Type of key-value pairs to benchmark; it can be string/sorted/hash/list/blackhole.
 
--max_access_threads: Max concurrent access threads in the KVDK instance; set it to the number of hyper-threads for performance. You can call the KVDK API with any number of threads, but if you run more parallel threads than max_access_threads, performance will degrade due to synchronization cost.
+-num_kv: Number of KVs when benchmarking fill/insert.
 
--type: Type of key-value pairs to benchmark, it can be "string", "hash" or "sorted".
+-num_operations: Number of operations when running benchmarks other than fill/insert.
 
--populate: Populate pmem space while creating a new KVDK instance for best write performance at runtime; see "include/kvdk/configs.hpp" for an explanation.
+-threads: Number of threads of the benchmark. `max_access_threads` will override this when benchmarking `fill`.
 
-## Test read/write performance
+-max_access_threads: Max concurrent access threads in the KVDK instance; set it to the number of hyper-threads for performance. You can call the KVDK API with any number of threads, but if you run more parallel threads than max_access_threads, performance will degrade due to synchronization cost.
 
-### Read performance
+-value_size: Length of values in bytes.
 
-After filling the instance, we can test read performance with the command below:
+-latency: Whether to print latencies of operations.
 
-numactl --cpunodebind=0 --membind=0 ./bench -fill=0 -time=10 -value_size=120 -threads=64 -read_ratio=1 -existing_keys_ratio=1 -path=/mnt/pmem0/kvdk -space=274877906944 -num=838860800 -max_write_threads=64 -type=string

-This will read key-value pairs from the KVDK instance with 64 threads for 10 seconds.
-
-Explanation of arguments:
-
--read_ratio: Ratio of read threads among benchmark threads. For example, if set to 0.5, there will be 32 write threads and 32 read threads.
-
--existing_keys_ratio: Ratio of keys to read that have already been filled in the instance. For example, if set to 0.5, then 50% of read operations will return NotFound.
-
-The benchmark tool will print performance stats to stdout, including throughput in each second and average ops:
-
-$ numactl --cpunodebind=0 --membind=0 ./bench -fill=0 -time=10 -value_size=120 -threads=64 -read_ratio=1 -existing_keys_ratio=1 -path=/mnt/pmem0/kvdk -space=274877906944 -num=838860800 -max_write_threads=64 -type=string
-
-[LOG] time 0 ms: Initializing PMem size 274877906944 in file /mnt/pmem0/kvdk/data
-[LOG] time 1864 ms: Map pmem space done
-[LOG] time 9033 ms: In restoring: iterated 840882543 records
-init 0 write threads
-init 64 read threads
-------- ops in seconds -----------
-time (ms), read ops, not found, write ops, total read, total write
-1000 73691000 0 0 73691000 0
-2001 73613000 0 0 147304000 0
-3002 73643000 0 0 220947000 0
-4003 73656000 0 0 294603000 0
-5004 73675000 0 0 368278000 0
-6005 73667000 0 0 441945000 0
-7006 73699000 0 0 515644000 0
-8007 73647000 0 0 589291000 0
-9008 73634000 0 0 662925000 0
-10009 73677000 0 0 736602000 0
-finish bench
-------------- statistics ------------
-read ops 73660400, write ops 0
-[LOG] time 19051 ms: instance closed
-
-### Write performance
-
-Similarly, to test write performance, we can simply modify "read_ratio":
-
-numactl --cpunodebind=0 --membind=0 ./bench -fill=0 -time=10 -value_size=120 -threads=64 -read_ratio=0 -existing_keys_ratio=0 -path=/mnt/pmem0/kvdk -space=274877906944 -num=838860800 -max_write_threads=64 -type=string
-
-This command will insert new key-value pairs into the KVDK instance for 10 seconds. Likewise, by modifying "existing_keys_ratio", we can control how many write operations are updates.
-
-$ numactl --cpunodebind=0 --membind=0 ./bench -fill=0 -time=10 -value_size=120 -threads=64 -read_ratio=0 -existing_keys_ratio=0 -path=/mnt/pmem0/kvdk -space=274877906944 -num=838860800 -max_write_threads=64 -type=string
-
-[LOG] time 0 ms: Initializing PMem size 274877906944 in file /mnt/pmem0/kvdk/data
-[LOG] time 1865 ms: Map pmem space done
-[LOG] time 9015 ms: In restoring: iterated 840882543 records
-init 64 write threads
-init 0 read threads
-------- ops in seconds -----------
-time (ms), read ops, not found, write ops, total read, total write
-1000 0 0 50610000 0 50610000
-2007 0 0 50053000 0 100663000
-3016 0 0 49669000 0 150332000
-4017 0 0 49048000 0 199380000
-5018 0 0 48540000 0 247920000
-6022 0 0 48210000 0 296130000
-7023 0 0 47725000 0 343855000
-8024 0 0 47354000 0 391209000
-9027 0 0 47080000 0 438289000
-10028 0 0 46544000 0 484833000
-finish bench
-------------- statistics ------------
-read ops 0, write ops 48483400
-[LOG] time 19055 ms: instance closed
-
-### Stat latencies
-
-We can also stat latency information by adding "-latency=1" to the benchmark command.
-
-$ numactl --cpunodebind=0 --membind=0 ./bench -fill=0 -time=10 -value_size=120 -threads=64 -read_ratio=0.5 -existing_keys_ratio=1 -path=/mnt/pmem0/kvdk -space=274877906944 -num=838860800 -max_write_threads=64 -type=string -latency=1
-
-[LOG] time 0 ms: Initializing PMem size 274877906944 in file /mnt/pmem0/kvdk/data
-[LOG] time 1869 ms: Map pmem space done
-[LOG] time 14963 ms: In restoring: iterated 1323729106 records
-calculate latencies
-init 6 write threads
-init 58 read threads
-------- ops in seconds -----------
-time (ms), read ops, not found, write ops, total read, total write
-1000 62763000 0 3933000 62763000 3933000
-2001 62297000 0 4303000 125060000 8236000
-3002 62190000 0 4530000 187250000 12766000
-4003 62194000 0 4530000 249444000 17296000
-5004 62206000 0 4531000 311650000 21827000
-6005 62172000 0 4527000 373822000 26354000
-7006 62194000 0 4530000 436016000 30884000
-8007 62227000 0 4535000 498243000 35419000
-9008 62196000 0 4529000 560439000 39948000
-10009 62190000 0 4527000 622629000 44475000
-finish bench
-------------- statistics ------------
-read ops 62263100, write ops 4447500
-read latencies (us): Avg: 0.89, P50: 0.83, P99: 1.54, P99.5: 1.67, P99.9: 2.77, P99.99: 4.20
-write latencies (us): Avg: 0.09, P50: 1.22, P99: 2.64, P99.5: 3.25, P99.9: 4.22, P99.99: 5.35
-[LOG] time 28382 ms: instance closed
+-dest_memory_nodes: The memory nodes on which to store KV data.
 
 ## More configurations
 
-For more configurations of the benchmark tool, please refer to "benchmark/bench.cpp" and "scripts/basic_benchmarks.py".
+For more configurations of the benchmark tool, please refer to "benchmark/bench.cpp".
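Beyond the flags documented above, benchmark/bench.cpp also defines `-key_distribution` (see the first file in this commit). A hypothetical skewed-read run:

```bash
# key_distribution accepts "random" (uniform) or "zipf"; for fill/insert
# workloads it is ignored and uniform distribution is always used.
./bench -type=string -threads=10 -read_ratio=1 -existing_keys_ratio=1 \
        -key_distribution=zipf -path=./kvdk_bench_dir
```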
