chore(gpu): reduce throughput integer bench time

zama-ai committed on Dec 4, 2024
commit 38e7f79
1 parent 38a7e4f
Show file tree

Hide file tree

Showing 2 changed files with 32 additions and 5 deletions.
diff --git a/tfhe/benches/integer/bench.rs b/tfhe/benches/integer/bench.rs
@@ -144,6 +144,9 @@ fn bench_server_key_binary_function_clean_inputs<F>(
             }
             BenchmarkType::Throughput => {
                 bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits");
+                bench_group
+                    .sample_size(10)
+                    .measurement_time(std::time::Duration::from_secs(30));
                 let elements = throughput_num_threads(num_block);
                 bench_group.throughput(Throughput::Elements(elements));
                 bench_group.bench_function(&bench_id, |b| {
@@ -292,6 +295,9 @@ fn bench_server_key_unary_function_clean_inputs<F>(
             }
             BenchmarkType::Throughput => {
                 bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits");
+                bench_group
+                    .sample_size(10)
+                    .measurement_time(std::time::Duration::from_secs(30));
                 let elements = throughput_num_threads(num_block);
                 bench_group.throughput(Throughput::Elements(elements));
                 bench_group.bench_function(&bench_id, |b| {
@@ -446,6 +452,9 @@ fn bench_server_key_binary_scalar_function_clean_inputs<F, G>(
             }
             BenchmarkType::Throughput => {
                 bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits");
+                bench_group
+                    .sample_size(10)
+                    .measurement_time(std::time::Duration::from_secs(30));
                 let elements = throughput_num_threads(num_block);
                 bench_group.throughput(Throughput::Elements(elements));
                 bench_group.bench_function(&bench_id, |b| {
@@ -559,6 +568,9 @@ fn if_then_else_parallelized(c: &mut Criterion) {
             }
             BenchmarkType::Throughput => {
                 bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits");
+                bench_group
+                    .sample_size(10)
+                    .measurement_time(std::time::Duration::from_secs(30));
                 let elements = throughput_num_threads(num_block);
                 bench_group.throughput(Throughput::Elements(elements));
                 bench_group.bench_function(&bench_id, |b| {
@@ -654,6 +666,9 @@ fn ciphertexts_sum_parallelized(c: &mut Criterion) {
                     bench_id = format!(
                         "{bench_name}_{len}_ctxts::throughput::{param_name}::{bit_size}_bits"
                     );
+                    bench_group
+                        .sample_size(10)
+                        .measurement_time(std::time::Duration::from_secs(30));
                     let elements = throughput_num_threads(num_block);
                     bench_group.throughput(Throughput::Elements(elements));
                     bench_group.bench_function(&bench_id, |b| {
@@ -1344,6 +1359,9 @@ mod cuda {
                 }
                 BenchmarkType::Throughput => {
                     bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits");
+                    bench_group
+                        .sample_size(10)
+                        .measurement_time(std::time::Duration::from_secs(30));
                     let elements = throughput_num_threads(num_block);
                     bench_group.throughput(Throughput::Elements(elements));
                     bench_group.bench_function(&bench_id, |b| {
@@ -1440,6 +1458,9 @@ mod cuda {
                 }
                 BenchmarkType::Throughput => {
                     bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits");
+                    bench_group
+                        .sample_size(10)
+                        .measurement_time(std::time::Duration::from_secs(30));
                     let elements = throughput_num_threads(num_block);
                     bench_group.throughput(Throughput::Elements(elements));
                     bench_group.bench_function(&bench_id, |b| {
@@ -1496,9 +1517,6 @@ mod cuda {
         G: Fn(&mut ThreadRng, usize) -> ScalarType,
     {
         let mut bench_group = c.benchmark_group(bench_name);
-        bench_group
-            .sample_size(15)
-            .measurement_time(std::time::Duration::from_secs(30));
         let mut rng = rand::thread_rng();
 
         let streams = CudaStreams::new_multi_gpu();
@@ -1516,6 +1534,9 @@ mod cuda {
 
             match BENCH_TYPE.get().unwrap() {
                 BenchmarkType::Latency => {
+                    bench_group
+                        .sample_size(15)
+                        .measurement_time(std::time::Duration::from_secs(30));
                     bench_id =
                         format!("{bench_name}::{param_name}::{bit_size}_bits_scalar_{bit_size}"); // FIXME it makes no sense to duplicate `bit_size`
                     bench_group.bench_function(&bench_id, |b| {
@@ -1543,6 +1564,9 @@ mod cuda {
                     });
                 }
                 BenchmarkType::Throughput => {
+                    bench_group
+                        .sample_size(10)
+                        .measurement_time(std::time::Duration::from_secs(30));
                     bench_id = format!(
                         "{bench_name}::throughput::{param_name}::{bit_size}_bits_scalar_{bit_size}"
                     );
@@ -1644,6 +1668,9 @@ mod cuda {
                 }
                 BenchmarkType::Throughput => {
                     bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits");
+                    bench_group
+                        .sample_size(10)
+                        .measurement_time(std::time::Duration::from_secs(30));
                     let elements = throughput_num_threads(num_block);
                     bench_group.throughput(Throughput::Elements(elements));
                     bench_group.bench_function(&bench_id, |b| {

diff --git a/tfhe/benches/utilities.rs b/tfhe/benches/utilities.rs
@@ -399,9 +399,9 @@ pub mod integer_utils {
         #[cfg(feature = "gpu")]
         {
             // This value is for Nvidia H100 GPU
-            let streaming_multiprocessors = 144;
+            let streaming_multiprocessors = 132;
             let num_gpus = unsafe { cuda_get_number_of_gpus() };
-            ((streaming_multiprocessors * 16 * num_gpus) as f64 * block_multiplicator) as u64
+            ((streaming_multiprocessors * num_gpus) as f64 * block_multiplicator) as u64
         }
         #[cfg(not(feature = "gpu"))]
         {