Skip to content

Commit

Permalink
chore(gpu): reduce throughput integer bench time
Browse files (browse the repository at this point in the history)
  • Loading branch information
agnesLeroy committed Dec 4, 2024
1 parent 38a7e4f commit 38e7f79
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 5 deletions.
33 changes: 30 additions & 3 deletions tfhe/benches/integer/bench.rs
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,9 @@ fn bench_server_key_binary_function_clean_inputs<F>(
}
BenchmarkType::Throughput => {
bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits");
bench_group
.sample_size(10)
.measurement_time(std::time::Duration::from_secs(30));
let elements = throughput_num_threads(num_block);
bench_group.throughput(Throughput::Elements(elements));
bench_group.bench_function(&bench_id, |b| {
Expand Down Expand Up @@ -292,6 +295,9 @@ fn bench_server_key_unary_function_clean_inputs<F>(
}
BenchmarkType::Throughput => {
bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits");
bench_group
.sample_size(10)
.measurement_time(std::time::Duration::from_secs(30));
let elements = throughput_num_threads(num_block);
bench_group.throughput(Throughput::Elements(elements));
bench_group.bench_function(&bench_id, |b| {
Expand Down Expand Up @@ -446,6 +452,9 @@ fn bench_server_key_binary_scalar_function_clean_inputs<F, G>(
}
BenchmarkType::Throughput => {
bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits");
bench_group
.sample_size(10)
.measurement_time(std::time::Duration::from_secs(30));
let elements = throughput_num_threads(num_block);
bench_group.throughput(Throughput::Elements(elements));
bench_group.bench_function(&bench_id, |b| {
Expand Down Expand Up @@ -559,6 +568,9 @@ fn if_then_else_parallelized(c: &mut Criterion) {
}
BenchmarkType::Throughput => {
bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits");
bench_group
.sample_size(10)
.measurement_time(std::time::Duration::from_secs(30));
let elements = throughput_num_threads(num_block);
bench_group.throughput(Throughput::Elements(elements));
bench_group.bench_function(&bench_id, |b| {
Expand Down Expand Up @@ -654,6 +666,9 @@ fn ciphertexts_sum_parallelized(c: &mut Criterion) {
bench_id = format!(
"{bench_name}_{len}_ctxts::throughput::{param_name}::{bit_size}_bits"
);
bench_group
.sample_size(10)
.measurement_time(std::time::Duration::from_secs(30));
let elements = throughput_num_threads(num_block);
bench_group.throughput(Throughput::Elements(elements));
bench_group.bench_function(&bench_id, |b| {
Expand Down Expand Up @@ -1344,6 +1359,9 @@ mod cuda {
}
BenchmarkType::Throughput => {
bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits");
bench_group
.sample_size(10)
.measurement_time(std::time::Duration::from_secs(30));
let elements = throughput_num_threads(num_block);
bench_group.throughput(Throughput::Elements(elements));
bench_group.bench_function(&bench_id, |b| {
Expand Down Expand Up @@ -1440,6 +1458,9 @@ mod cuda {
}
BenchmarkType::Throughput => {
bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits");
bench_group
.sample_size(10)
.measurement_time(std::time::Duration::from_secs(30));
let elements = throughput_num_threads(num_block);
bench_group.throughput(Throughput::Elements(elements));
bench_group.bench_function(&bench_id, |b| {
Expand Down Expand Up @@ -1496,9 +1517,6 @@ mod cuda {
G: Fn(&mut ThreadRng, usize) -> ScalarType,
{
let mut bench_group = c.benchmark_group(bench_name);
bench_group
.sample_size(15)
.measurement_time(std::time::Duration::from_secs(30));
let mut rng = rand::thread_rng();

let streams = CudaStreams::new_multi_gpu();
Expand All @@ -1516,6 +1534,9 @@ mod cuda {

match BENCH_TYPE.get().unwrap() {
BenchmarkType::Latency => {
bench_group
.sample_size(15)
.measurement_time(std::time::Duration::from_secs(30));
bench_id =
format!("{bench_name}::{param_name}::{bit_size}_bits_scalar_{bit_size}"); // FIXME it makes no sense to duplicate `bit_size`
bench_group.bench_function(&bench_id, |b| {
Expand Down Expand Up @@ -1543,6 +1564,9 @@ mod cuda {
});
}
BenchmarkType::Throughput => {
bench_group
.sample_size(10)
.measurement_time(std::time::Duration::from_secs(30));
bench_id = format!(
"{bench_name}::throughput::{param_name}::{bit_size}_bits_scalar_{bit_size}"
);
Expand Down Expand Up @@ -1644,6 +1668,9 @@ mod cuda {
}
BenchmarkType::Throughput => {
bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits");
bench_group
.sample_size(10)
.measurement_time(std::time::Duration::from_secs(30));
let elements = throughput_num_threads(num_block);
bench_group.throughput(Throughput::Elements(elements));
bench_group.bench_function(&bench_id, |b| {
Expand Down
4 changes: 2 additions & 2 deletions tfhe/benches/utilities.rs
Original file line number Diff line number Diff line change
Expand Up @@ -399,9 +399,9 @@ pub mod integer_utils {
#[cfg(feature = "gpu")]
{
// This value is for the Nvidia H100 GPU
let streaming_multiprocessors = 144;
let streaming_multiprocessors = 132;
let num_gpus = unsafe { cuda_get_number_of_gpus() };
((streaming_multiprocessors * 16 * num_gpus) as f64 * block_multiplicator) as u64
((streaming_multiprocessors * num_gpus) as f64 * block_multiplicator) as u64
}
#[cfg(not(feature = "gpu"))]
{
Expand Down

0 comments on commit 38e7f79

Please sign in to comment.