From 0aec08023ae068abd10018cc6b25dcbc27cd9376 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?David=20Test=C3=A9?= Date: Tue, 17 Dec 2024 12:39:02 +0100 Subject: [PATCH] chore(bench): new heuristic to define elements for throughput This is done to fill up the backend with enough elements and avoid having long execution times for heavy operations like division. --- Makefile | 28 +-- tfhe/benches/integer/bench.rs | 169 ++++++++++++--- .../integer/glwe_packing_compression.rs | 83 ++++---- tfhe/benches/integer/oprf.rs | 16 +- tfhe/benches/integer/signed_bench.rs | 197 ++++++++++++++---- tfhe/benches/integer/zk_pke.rs | 35 +++- tfhe/benches/utilities.rs | 15 +- 7 files changed, 409 insertions(+), 134 deletions(-) diff --git a/Makefile b/Makefile index 4354809dc2..dd9f17765e 100644 --- a/Makefile +++ b/Makefile @@ -281,14 +281,14 @@ check_typos: install_typos_checker .PHONY: clippy_gpu # Run clippy lints on tfhe with "gpu" enabled clippy_gpu: install_rs_check_toolchain RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \ - --features=boolean,shortint,integer,internal-keycache,gpu \ + --features=boolean,shortint,integer,internal-keycache,gpu,pbs-stats \ --all-targets \ -p $(TFHE_SPEC) -- --no-deps -D warnings .PHONY: check_gpu # Run check on tfhe with "gpu" enabled check_gpu: install_rs_check_toolchain RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" check \ - --features=boolean,shortint,integer,internal-keycache,gpu \ + --features=boolean,shortint,integer,internal-keycache,gpu,pbs-stats \ --all-targets \ -p $(TFHE_SPEC) @@ -393,10 +393,10 @@ clippy_trivium: install_rs_check_toolchain .PHONY: clippy_all_targets # Run clippy lints on all targets (benches, examples, etc.) 
clippy_all_targets: install_rs_check_toolchain RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \ - --features=boolean,shortint,integer,internal-keycache,zk-pok,strings \ + --features=boolean,shortint,integer,internal-keycache,zk-pok,strings,pbs-stats \ -p $(TFHE_SPEC) -- --no-deps -D warnings RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \ - --features=boolean,shortint,integer,internal-keycache,zk-pok,strings,experimental \ + --features=boolean,shortint,integer,internal-keycache,zk-pok,strings,pbs-stats,experimental \ -p $(TFHE_SPEC) -- --no-deps -D warnings .PHONY: clippy_tfhe_csprng # Run clippy lints on tfhe-csprng @@ -1040,35 +1040,35 @@ bench_integer: install_rs_check_toolchain RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \ cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \ --bench integer-bench \ - --features=integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) -- + --features=integer,internal-keycache,nightly-avx512,pbs-stats -p $(TFHE_SPEC) -- .PHONY: bench_signed_integer # Run benchmarks for signed integer bench_signed_integer: install_rs_check_toolchain RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \ cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \ --bench integer-signed-bench \ - --features=integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) -- + --features=integer,internal-keycache,nightly-avx512,pbs-stats -p $(TFHE_SPEC) -- .PHONY: bench_integer_gpu # Run benchmarks for integer on GPU backend bench_integer_gpu: install_rs_check_toolchain RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \ cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \ --bench integer-bench \ - --features=integer,gpu,internal-keycache,nightly-avx512 -p 
$(TFHE_SPEC) -- + --features=integer,gpu,internal-keycache,nightly-avx512,pbs-stats -p $(TFHE_SPEC) -- .PHONY: bench_integer_compression # Run benchmarks for unsigned integer compression bench_integer_compression: install_rs_check_toolchain RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \ cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \ --bench glwe_packing_compression-integer-bench \ - --features=integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) -- + --features=integer,internal-keycache,nightly-avx512,pbs-stats -p $(TFHE_SPEC) -- .PHONY: bench_integer_compression_gpu bench_integer_compression_gpu: install_rs_check_toolchain RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \ cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \ --bench glwe_packing_compression-integer-bench \ - --features=integer,internal-keycache,gpu -p $(TFHE_SPEC) -- + --features=integer,internal-keycache,gpu,pbs-stats -p $(TFHE_SPEC) -- .PHONY: bench_integer_multi_bit # Run benchmarks for unsigned integer using multi-bit parameters bench_integer_multi_bit: install_rs_check_toolchain @@ -1076,7 +1076,7 @@ bench_integer_multi_bit: install_rs_check_toolchain __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) \ cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \ --bench integer-bench \ - --features=integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) -- + --features=integer,internal-keycache,nightly-avx512,pbs-stats -p $(TFHE_SPEC) -- .PHONY: bench_signed_integer_multi_bit # Run benchmarks for signed integer using multi-bit parameters bench_signed_integer_multi_bit: install_rs_check_toolchain @@ -1084,7 +1084,7 @@ bench_signed_integer_multi_bit: install_rs_check_toolchain __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) \ cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \ --bench integer-signed-bench \ - --features=integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) -- + --features=integer,internal-keycache,nightly-avx512,pbs-stats -p 
$(TFHE_SPEC) -- .PHONY: bench_integer_multi_bit_gpu # Run benchmarks for integer on GPU backend using multi-bit parameters bench_integer_multi_bit_gpu: install_rs_check_toolchain @@ -1092,7 +1092,7 @@ bench_integer_multi_bit_gpu: install_rs_check_toolchain __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \ cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \ --bench integer-bench \ - --features=integer,gpu,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) -- + --features=integer,gpu,internal-keycache,nightly-avx512,pbs-stats -p $(TFHE_SPEC) -- .PHONY: bench_unsigned_integer_multi_bit_gpu # Run benchmarks for unsigned integer on GPU backend using multi-bit parameters bench_unsigned_integer_multi_bit_gpu: install_rs_check_toolchain @@ -1100,14 +1100,14 @@ bench_unsigned_integer_multi_bit_gpu: install_rs_check_toolchain __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \ cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \ --bench integer-bench \ - --features=integer,gpu,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) -- ::unsigned + --features=integer,gpu,internal-keycache,nightly-avx512,pbs-stats -p $(TFHE_SPEC) -- ::unsigned .PHONY: bench_integer_zk # Run benchmarks for integer encryption with ZK proofs bench_integer_zk: install_rs_check_toolchain RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \ cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \ --bench zk-pke-bench \ - --features=integer,internal-keycache,zk-pok,nightly-avx512 \ + --features=integer,internal-keycache,zk-pok,nightly-avx512,pbs-stats \ -p $(TFHE_SPEC) -- .PHONY: bench_shortint # Run benchmarks for shortint diff --git a/tfhe/benches/integer/bench.rs b/tfhe/benches/integer/bench.rs index d6ead085b7..4d4af899cc 100644 --- a/tfhe/benches/integer/bench.rs +++ b/tfhe/benches/integer/bench.rs @@ -143,15 +143,25 @@ fn bench_server_key_binary_function_clean_inputs( }); } BenchmarkType::Throughput 
=> { + let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); + + // Execute the operation once to know its cost. + let clear_0 = gen_random_u256(&mut rng); + let mut ct_0 = cks.encrypt_radix(clear_0, num_block); + let clear_1 = gen_random_u256(&mut rng); + let mut ct_1 = cks.encrypt_radix(clear_1, num_block); + + reset_pbs_count(); + binary_op(&sks, &mut ct_0, &mut ct_1); + let pbs_count = get_pbs_count(); + bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits"); bench_group .sample_size(10) .measurement_time(std::time::Duration::from_secs(30)); - let elements = throughput_num_threads(num_block); + let elements = throughput_num_threads(num_block, pbs_count); bench_group.throughput(Throughput::Elements(elements)); bench_group.bench_function(&bench_id, |b| { - let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); - let mut cts_0 = (0..elements) .map(|_| cks.encrypt_radix(gen_random_u256(&mut rng), num_block)) .collect::>(); @@ -294,15 +304,23 @@ fn bench_server_key_unary_function_clean_inputs( }); } BenchmarkType::Throughput => { + let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); + + // Execute the operation once to know its cost. 
+ let clear_0 = gen_random_u256(&mut rng); + let mut ct_0 = cks.encrypt_radix(clear_0, num_block); + + reset_pbs_count(); + unary_fn(&sks, &mut ct_0); + let pbs_count = get_pbs_count(); + bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits"); bench_group .sample_size(10) .measurement_time(std::time::Duration::from_secs(30)); - let elements = throughput_num_threads(num_block); + let elements = throughput_num_threads(num_block, pbs_count); bench_group.throughput(Throughput::Elements(elements)); bench_group.bench_function(&bench_id, |b| { - let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); - let mut cts_0 = (0..elements) .map(|_| cks.encrypt_radix(gen_random_u256(&mut rng), num_block)) .collect::>(); @@ -451,15 +469,24 @@ fn bench_server_key_binary_scalar_function_clean_inputs( }); } BenchmarkType::Throughput => { + let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); + + // Execute the operation once to know its cost. + let clear_0 = gen_random_u256(&mut rng); + let mut ct_0 = cks.encrypt_radix(clear_0, num_block); + let clear_1 = rng_func(&mut rng, bit_size) & max_value_for_bit_size; + + reset_pbs_count(); + binary_op(&sks, &mut ct_0, clear_1); + let pbs_count = get_pbs_count(); + bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits"); bench_group .sample_size(10) .measurement_time(std::time::Duration::from_secs(30)); - let elements = throughput_num_threads(num_block); + let elements = throughput_num_threads(num_block, pbs_count); bench_group.throughput(Throughput::Elements(elements)); bench_group.bench_function(&bench_id, |b| { - let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); - let mut cts_0 = (0..elements) .map(|_| cks.encrypt_radix(gen_random_u256(&mut rng), num_block)) .collect::>(); @@ -567,15 +594,28 @@ fn if_then_else_parallelized(c: &mut Criterion) { }); } BenchmarkType::Throughput => { + let (cks, sks) = KEY_CACHE.get_from_params(param, 
IntegerKeyKind::Radix); + + // Execute the operation once to know its cost. + let clear_0 = gen_random_u256(&mut rng); + let true_ct = cks.encrypt_radix(clear_0, num_block); + + let clear_1 = gen_random_u256(&mut rng); + let false_ct = cks.encrypt_radix(clear_1, num_block); + + let condition = sks.create_trivial_boolean_block(rng.gen_bool(0.5)); + + reset_pbs_count(); + sks.if_then_else_parallelized(&condition, &true_ct, &false_ct); + let pbs_count = get_pbs_count(); + bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits"); bench_group .sample_size(10) .measurement_time(std::time::Duration::from_secs(30)); - let elements = throughput_num_threads(num_block); + let elements = throughput_num_threads(num_block, pbs_count); bench_group.throughput(Throughput::Elements(elements)); bench_group.bench_function(&bench_id, |b| { - let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); - let cts_cond = (0..elements) .map(|_| sks.create_trivial_boolean_block(rng.gen_bool(0.5))) .collect::>(); @@ -663,20 +703,34 @@ fn ciphertexts_sum_parallelized(c: &mut Criterion) { }); } BenchmarkType::Throughput => { + let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); + + // Execute the operation once to know its cost. 
+ let nb_ctxt = bit_size.div_ceil(param.message_modulus().0.ilog2() as usize); + let cks = RadixClientKey::from((cks, nb_ctxt)); + + let clears = (0..len) + .map(|_| gen_random_u256(&mut rng) & max_for_bit_size) + .collect::>(); + let ctxts = clears + .iter() + .copied() + .map(|clear| cks.encrypt(clear)) + .collect::>(); + + reset_pbs_count(); + sks.sum_ciphertexts_parallelized(&ctxts); + let pbs_count = get_pbs_count(); + bench_id = format!( "{bench_name}_{len}_ctxts::throughput::{param_name}::{bit_size}_bits" ); bench_group .sample_size(10) .measurement_time(std::time::Duration::from_secs(30)); - let elements = throughput_num_threads(num_block); + let elements = throughput_num_threads(num_block, pbs_count); bench_group.throughput(Throughput::Elements(elements)); bench_group.bench_function(&bench_id, |b| { - let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); - - let nb_ctxt = bit_size.div_ceil(param.message_modulus().0.ilog2() as usize); - let cks = RadixClientKey::from((cks, nb_ctxt)); - let cts = (0..elements) .map(|_| { let clears = (0..len) @@ -1358,17 +1412,24 @@ mod cuda { }); } BenchmarkType::Throughput => { + let (cks, _cpu_sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); + let gpu_sks = CudaServerKey::new(&cks, &streams); + + let ct = cks.encrypt_radix(gen_random_u256(&mut rng), num_block); + let mut d_ctxt = + CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct, &streams); + + reset_pbs_count(); + unary_op(&gpu_sks, &mut d_ctxt, &streams); + let pbs_count = get_pbs_count(); + bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits"); bench_group .sample_size(10) .measurement_time(std::time::Duration::from_secs(30)); - let elements = throughput_num_threads(num_block); + let elements = throughput_num_threads(num_block, pbs_count); bench_group.throughput(Throughput::Elements(elements)); bench_group.bench_function(&bench_id, |b| { - let (cks, _cpu_sks) = - KEY_CACHE.get_from_params(param, 
IntegerKeyKind::Radix); - let gpu_sks = CudaServerKey::new(&cks, &streams); - let mut cts_0 = (0..elements) .map(|_| { let ct_0 = cks.encrypt_radix(gen_random_u256(&mut rng), num_block); @@ -1457,17 +1518,28 @@ mod cuda { }); } BenchmarkType::Throughput => { + let (cks, _cpu_sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); + let gpu_sks = CudaServerKey::new(&cks, &streams); + + // Execute the operation once to know its cost. + let ct_0 = cks.encrypt_radix(gen_random_u256(&mut rng), num_block); + let ct_1 = cks.encrypt_radix(gen_random_u256(&mut rng), num_block); + let mut d_ctxt_1 = + CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct_0, &streams); + let mut d_ctxt_2 = + CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct_1, &streams); + + reset_pbs_count(); + binary_op(&gpu_sks, &mut d_ctxt_1, &mut d_ctxt_2, &streams); + let pbs_count = get_pbs_count(); + bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits"); bench_group .sample_size(10) .measurement_time(std::time::Duration::from_secs(30)); - let elements = throughput_num_threads(num_block); + let elements = throughput_num_threads(num_block, pbs_count); bench_group.throughput(Throughput::Elements(elements)); bench_group.bench_function(&bench_id, |b| { - let (cks, _cpu_sks) = - KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); - let gpu_sks = CudaServerKey::new(&cks, &streams); - let mut cts_0 = (0..elements) .map(|_| { let ct_0 = cks.encrypt_radix(gen_random_u256(&mut rng), num_block); @@ -1564,19 +1636,28 @@ mod cuda { }); } BenchmarkType::Throughput => { + let (cks, _cpu_sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); + let gpu_sks = CudaServerKey::new(&cks, &streams); + + // Execute the operation once to know its cost. 
+ let ct_1 = cks.encrypt_radix(gen_random_u256(&mut rng), num_block); + let mut d_ctxt_1 = + CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct_1, &streams); + let clear_1 = rng_func(&mut rng, bit_size) & max_value_for_bit_size; + + reset_pbs_count(); + binary_op(&gpu_sks, &mut d_ctxt_1, clear_1, &streams); + let pbs_count = get_pbs_count(); + bench_group .sample_size(10) .measurement_time(std::time::Duration::from_secs(30)); bench_id = format!( "{bench_name}::throughput::{param_name}::{bit_size}_bits_scalar_{bit_size}" ); - let elements = throughput_num_threads(num_block); + let elements = throughput_num_threads(num_block, pbs_count); bench_group.throughput(Throughput::Elements(elements)); bench_group.bench_function(&bench_id, |b| { - let (cks, _cpu_sks) = - KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); - let gpu_sks = CudaServerKey::new(&cks, &streams); - let mut cts_0 = (0..elements) .map(|_| { let ct_0 = cks.encrypt_radix(gen_random_u256(&mut rng), num_block); @@ -1667,11 +1748,29 @@ mod cuda { }); } BenchmarkType::Throughput => { + let (cks, _cpu_sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); + let gpu_sks = CudaServerKey::new(&cks, &stream); + + let clear_cond = rng.gen::(); + let ct_then = cks.encrypt_radix(gen_random_u256(&mut rng), num_block); + let ct_else = cks.encrypt_radix(gen_random_u256(&mut rng), num_block); + let ct_cond = cks.encrypt_bool(clear_cond); + + let d_ct_cond = CudaBooleanBlock::from_boolean_block(&ct_cond, &stream); + let d_ct_then = + CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct_then, &stream); + let d_ct_else = + CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct_else, &stream); + + reset_pbs_count(); + gpu_sks.if_then_else(&d_ct_cond, &d_ct_then, &d_ct_else, &stream); + let pbs_count = get_pbs_count(); + bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits"); bench_group .sample_size(10) .measurement_time(std::time::Duration::from_secs(30)); - let elements = 
throughput_num_threads(num_block); + let elements = throughput_num_threads(num_block, pbs_count); bench_group.throughput(Throughput::Elements(elements)); bench_group.bench_function(&bench_id, |b| { let (cks, _cpu_sks) = @@ -2516,6 +2615,7 @@ use cuda::{ cuda_cast_ops, default_cuda_dedup_ops, default_cuda_ops, default_scalar_cuda_ops, unchecked_cuda_ops, unchecked_scalar_cuda_ops, }; +use tfhe::{get_pbs_count, reset_pbs_count}; criterion_group!( smart_ops, @@ -2617,6 +2717,7 @@ criterion_group!( criterion_group!( default_dedup_ops, + bitand_parallelized, add_parallelized, mul_parallelized, div_rem_parallelized, diff --git a/tfhe/benches/integer/glwe_packing_compression.rs b/tfhe/benches/integer/glwe_packing_compression.rs index cb30a63570..1e89a7e2fe 100644 --- a/tfhe/benches/integer/glwe_packing_compression.rs +++ b/tfhe/benches/integer/glwe_packing_compression.rs @@ -77,9 +77,19 @@ fn cpu_glwe_packing(c: &mut Criterion) { }); } BenchmarkType::Throughput => { + // Execute the operation once to know its cost. + let ct = cks.encrypt_radix(0_u32, num_blocks); + let mut builder = CompressedCiphertextListBuilder::new(); + builder.push(ct); + let compressed = builder.build(&compression_key); + + reset_pbs_count(); + let _: RadixCiphertext = compressed.get(0, &decompression_key).unwrap().unwrap(); + let pbs_count = get_pbs_count(); + let num_block = (bit_size as f64 / (param.message_modulus.0 as f64).log(2.0)).ceil() as usize; - let elements = throughput_num_threads(num_block); + let elements = throughput_num_threads(num_block, pbs_count); // FIXME thread usage seemed to be somewhat more "efficient". 
// For example, with bit_size = 2, my laptop is only using around 2/3 of the // available threads Thread usage increases with bit_size = 8 but @@ -185,27 +195,26 @@ mod cuda { let bench_id_pack; let bench_id_unpack; + // Generate private compression key + let cks = ClientKey::new(param); + let private_compression_key = cks.new_compression_private_key(comp_param); + + // Generate and convert compression keys + let (radix_cks, _) = gen_keys_radix_gpu(param, num_blocks, &stream); + let (compressed_compression_key, compressed_decompression_key) = + radix_cks.new_compressed_compression_decompression_keys(&private_compression_key); + let cuda_compression_key = compressed_compression_key.decompress_to_cuda(&stream); + let cuda_decompression_key = compressed_decompression_key.decompress_to_cuda( + radix_cks.parameters().glwe_dimension(), + radix_cks.parameters().polynomial_size(), + radix_cks.parameters().message_modulus(), + radix_cks.parameters().carry_modulus(), + radix_cks.parameters().ciphertext_modulus(), + &stream, + ); + match BENCH_TYPE.get().unwrap() { BenchmarkType::Latency => { - // Generate private compression key - let cks = ClientKey::new(param); - let private_compression_key = cks.new_compression_private_key(comp_param); - - // Generate and convert compression keys - let (radix_cks, _) = gen_keys_radix_gpu(param, num_blocks, &stream); - let (compressed_compression_key, compressed_decompression_key) = radix_cks - .new_compressed_compression_decompression_keys(&private_compression_key); - let cuda_compression_key = - compressed_compression_key.decompress_to_cuda(&stream); - let cuda_decompression_key = compressed_decompression_key.decompress_to_cuda( - radix_cks.parameters().glwe_dimension(), - radix_cks.parameters().polynomial_size(), - radix_cks.parameters().message_modulus(), - radix_cks.parameters().carry_modulus(), - radix_cks.parameters().ciphertext_modulus(), - &stream, - ); - // Encrypt let ct = cks.encrypt_radix(0_u32, num_blocks); let d_ct = 
CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct, &stream); @@ -239,28 +248,25 @@ mod cuda { }); } BenchmarkType::Throughput => { + // Execute the operation once to know its cost. + let ct = cks.encrypt_radix(0_u32, num_blocks); + let d_ct = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct, &stream); + let mut builder = CudaCompressedCiphertextListBuilder::new(); + builder.push(d_ct, &stream); + let compressed = builder.build(&cuda_compression_key, &stream); + + reset_pbs_count(); + let _: CudaUnsignedRadixCiphertext = compressed + .get(0, &cuda_decompression_key, &stream) + .unwrap() + .unwrap(); + let pbs_count = get_pbs_count(); + let num_block = (bit_size as f64 / (param.message_modulus.0 as f64).log(2.0)) .ceil() as usize; - let elements = throughput_num_threads(num_block); + let elements = throughput_num_threads(num_block, pbs_count); bench_group.throughput(Throughput::Elements(elements)); - let cks = ClientKey::new(param); - let private_compression_key = cks.new_compression_private_key(comp_param); - - let (radix_cks, _) = gen_keys_radix_gpu(param, num_blocks, &stream); - let (compressed_compression_key, compressed_decompression_key) = radix_cks - .new_compressed_compression_decompression_keys(&private_compression_key); - let cuda_compression_key = - compressed_compression_key.decompress_to_cuda(&stream); - let cuda_decompression_key = compressed_decompression_key.decompress_to_cuda( - radix_cks.parameters().glwe_dimension(), - radix_cks.parameters().polynomial_size(), - radix_cks.parameters().message_modulus(), - radix_cks.parameters().carry_modulus(), - radix_cks.parameters().ciphertext_modulus(), - &stream, - ); - // Encrypt let ct = cks.encrypt_radix(0_u32, num_blocks); let d_ct = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct, &stream); @@ -344,6 +350,7 @@ criterion_group!(cpu_glwe_packing2, cpu_glwe_packing); #[cfg(feature = "gpu")] use cuda::gpu_glwe_packing2; +use tfhe::{get_pbs_count, reset_pbs_count}; fn main() { 
BENCH_TYPE.get_or_init(|| BenchmarkType::from_env().unwrap()); diff --git a/tfhe/benches/integer/oprf.rs b/tfhe/benches/integer/oprf.rs index 8bdc1e9407..f664ed8406 100644 --- a/tfhe/benches/integer/oprf.rs +++ b/tfhe/benches/integer/oprf.rs @@ -7,6 +7,7 @@ use rayon::prelude::*; use tfhe::integer::keycache::KEY_CACHE; use tfhe::integer::IntegerKeyKind; use tfhe::keycache::NamedParam; +use tfhe::{get_pbs_count, reset_pbs_count}; use tfhe_csprng::seeders::Seed; pub fn unsigned_oprf(c: &mut Criterion) { @@ -40,12 +41,21 @@ pub fn unsigned_oprf(c: &mut Criterion) { }); } BenchmarkType::Throughput => { + let (_, sk) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); + + // Execute the operation once to know its cost. + reset_pbs_count(); + sk.par_generate_oblivious_pseudo_random_unsigned_integer_bounded( + Seed(0), + bit_size as u64, + num_block as u64, + ); + let pbs_count = get_pbs_count(); + bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits"); - let elements = throughput_num_threads(num_block); + let elements = throughput_num_threads(num_block, pbs_count); bench_group.throughput(Throughput::Elements(elements)); bench_group.bench_function(&bench_id, |b| { - let (_, sk) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); - b.iter(|| { (0..elements).into_par_iter().for_each(|_| { sk.par_generate_oblivious_pseudo_random_unsigned_integer_bounded( diff --git a/tfhe/benches/integer/signed_bench.rs b/tfhe/benches/integer/signed_bench.rs index 9c1cf0ed1b..6d9701f462 100644 --- a/tfhe/benches/integer/signed_bench.rs +++ b/tfhe/benches/integer/signed_bench.rs @@ -66,12 +66,20 @@ fn bench_server_key_signed_binary_function_clean_inputs( }); } BenchmarkType::Throughput => { + let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); + + // Execute the operation once to know its cost. 
+ let ct_0 = cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block); + let ct_1 = cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block); + + reset_pbs_count(); + binary_op(&sks, &ct_0, &ct_1); + let pbs_count = get_pbs_count(); + bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits"); - let elements = throughput_num_threads(num_block); + let elements = throughput_num_threads(num_block, pbs_count); bench_group.throughput(Throughput::Elements(elements)); bench_group.bench_function(&bench_id, |b| { - let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); - let mut cts_0 = (0..elements) .map(|_| cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block)) .collect::>(); @@ -151,12 +159,21 @@ fn bench_server_key_signed_shift_function_clean_inputs( }); } BenchmarkType::Throughput => { + let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); + + // Execute the operation once to know its cost. + let clear_1 = rng.gen_range(0u128..bit_size as u128); + let ct_0 = cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block); + let ct_1 = cks.encrypt_radix(clear_1, num_block); + + reset_pbs_count(); + binary_op(&sks, &ct_0, &ct_1); + let pbs_count = get_pbs_count(); + bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits"); - let elements = throughput_num_threads(num_block); + let elements = throughput_num_threads(num_block, pbs_count); bench_group.throughput(Throughput::Elements(elements)); bench_group.bench_function(&bench_id, |b| { - let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); - let mut cts_0 = (0..elements) .map(|_| cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block)) .collect::>(); @@ -233,12 +250,19 @@ fn bench_server_key_unary_function_clean_inputs( }); } BenchmarkType::Throughput => { + let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); + + // Execute the operation once to know its cost. 
+ let ct_0 = cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block); + + reset_pbs_count(); + unary_fn(&sks, &ct_0); + let pbs_count = get_pbs_count(); + bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits"); - let elements = throughput_num_threads(num_block); + let elements = throughput_num_threads(num_block, pbs_count); bench_group.throughput(Throughput::Elements(elements)); bench_group.bench_function(&bench_id, |b| { - let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); - let mut cts_0 = (0..elements) .map(|_| cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block)) .collect::>(); @@ -307,12 +331,21 @@ fn signed_if_then_else_parallelized(c: &mut Criterion) { }); } BenchmarkType::Throughput => { + let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); + + // Execute the operation once to know its cost. + let cond = sks.create_trivial_boolean_block(rng.gen_bool(0.5)); + let ct_then = cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block); + let ct_else = cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block); + + reset_pbs_count(); + sks.if_then_else_parallelized(&cond, &ct_then, &ct_else); + let pbs_count = get_pbs_count(); + bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits"); - let elements = throughput_num_threads(num_block); + let elements = throughput_num_threads(num_block, pbs_count); bench_group.throughput(Throughput::Elements(elements)); bench_group.bench_function(&bench_id, |b| { - let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); - let cts_cond = (0..elements) .map(|_| sks.create_trivial_boolean_block(rng.gen_bool(0.5))) .collect::>(); @@ -830,12 +863,20 @@ fn bench_server_key_binary_scalar_function_clean_inputs( }); } BenchmarkType::Throughput => { + let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); + + // Execute the operation once to know its cost. 
+ let mut ct_0 = cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block); + let clear_1 = rng_func(&mut rng, bit_size); + + reset_pbs_count(); + binary_op(&sks, &mut ct_0, clear_1); + let pbs_count = get_pbs_count(); + bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits"); - let elements = throughput_num_threads(num_block); + let elements = throughput_num_threads(num_block, pbs_count); bench_group.throughput(Throughput::Elements(elements)); bench_group.bench_function(&bench_id, |b| { - let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); - let mut cts_0 = (0..elements) .map(|_| cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block)) .collect::>(); @@ -1401,14 +1442,32 @@ mod cuda { }); } BenchmarkType::Throughput => { + let (cks, _cpu_sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); + let gpu_sks = CudaServerKey::new(&cks, &stream); + + // Execute the operation once to know its cost. + let clearlow = rng.gen::(); + let clearhigh = rng.gen::(); + let clear_0 = tfhe::integer::I256::from((clearlow, clearhigh)); + let ct_0 = cks.encrypt_signed_radix(clear_0, num_block); + let d_ctxt_0 = + CudaSignedRadixCiphertext::from_signed_radix_ciphertext(&ct_0, &stream); + + let clearlow = rng.gen::(); + let clearhigh = rng.gen::(); + let clear_1 = tfhe::integer::I256::from((clearlow, clearhigh)); + let ct_1 = cks.encrypt_signed_radix(clear_1, num_block); + let d_ctxt_1 = + CudaSignedRadixCiphertext::from_signed_radix_ciphertext(&ct_1, &stream); + + reset_pbs_count(); + binary_op(&gpu_sks, &mut d_ctxt_0, &mut d_ctxt_1, &stream); + let pbs_count = get_pbs_count(); + bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits"); - let elements = throughput_num_threads(num_block); + let elements = throughput_num_threads(num_block, pbs_count); bench_group.throughput(Throughput::Elements(elements)); bench_group.bench_function(&bench_id, |b| { - let (cks, _cpu_sks) = - 
KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); - let gpu_sks = CudaServerKey::new(&cks, &stream); - let mut cts_0 = (0..elements) .map(|_| { let clearlow = rng.gen::(); @@ -1527,14 +1586,25 @@ mod cuda { }); } BenchmarkType::Throughput => { + let (cks, _cpu_sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); + let gpu_sks = CudaServerKey::new(&cks, &stream); + + // Execute the operation once to know its cost. + let clearlow = rng.gen::(); + let clearhigh = rng.gen::(); + let clear = tfhe::integer::I256::from((clearlow, clearhigh)); + let ct = cks.encrypt_signed_radix(clear, num_block); + let d_ctxt = + CudaSignedRadixCiphertext::from_signed_radix_ciphertext(&ct, &stream); + + reset_pbs_count(); + unary_op(&gpu_sks, &mut d_ctxt, &stream); + let pbs_count = get_pbs_count(); + bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits"); - let elements = throughput_num_threads(num_block); + let elements = throughput_num_threads(num_block, pbs_count); bench_group.throughput(Throughput::Elements(elements)); bench_group.bench_function(&bench_id, |b| { - let (cks, _cpu_sks) = - KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); - let gpu_sks = CudaServerKey::new(&cks, &stream); - let mut cts_0 = (0..elements) .map(|_| { let clearlow = rng.gen::(); @@ -1650,16 +1720,29 @@ mod cuda { }); } BenchmarkType::Throughput => { + let (cks, _cpu_sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); + let gpu_sks = CudaServerKey::new(&cks, &stream); + + // Execute the operation once to know its cost. 
+ let clearlow = rng.gen::(); + let clearhigh = rng.gen::(); + let clear_0 = tfhe::integer::I256::from((clearlow, clearhigh)); + let ct_0 = cks.encrypt_signed_radix(clear_0, num_block); + let d_ctxt_0 = + CudaSignedRadixCiphertext::from_signed_radix_ciphertext(&ct_0, &stream); + + let clear_0 = rng_func(&mut rng, bit_size) & max_value_for_bit_size; + + reset_pbs_count(); + binary_op(&gpu_sks, &mut d_ctxt_0, clear_0, &stream); + let pbs_count = get_pbs_count(); + bench_id = format!( "{bench_name}::throughput::{param_name}::{bit_size}_bits_scalar_{bit_size}" ); - let elements = throughput_num_threads(num_block); + let elements = throughput_num_threads(num_block, pbs_count); bench_group.throughput(Throughput::Elements(elements)); bench_group.bench_function(&bench_id, |b| { - let (cks, _cpu_sks) = - KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); - let gpu_sks = CudaServerKey::new(&cks, &stream); - let mut cts_0 = (0..elements) .map(|_| { let clearlow = rng.gen::(); @@ -1786,14 +1869,32 @@ mod cuda { }); } BenchmarkType::Throughput => { + let (cks, _cpu_sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); + let gpu_sks = CudaServerKey::new(&cks, &stream); + + // Execute the operation once to know its cost. 
+ let clearlow = rng.gen::(); + let clearhigh = rng.gen::(); + let clear_0 = tfhe::integer::I256::from((clearlow, clearhigh)); + let ct_0 = cks.encrypt_signed_radix(clear_0, num_block); + let d_ctxt_0 = + CudaSignedRadixCiphertext::from_signed_radix_ciphertext(&ct_0, &stream); + + let clearlow = rng.gen::(); + let clearhigh = rng.gen::(); + let clear_1 = tfhe::integer::U256::from((clearlow, clearhigh)); + let ct_1 = cks.encrypt_radix(clear_1, num_block); + let d_ctxt_1 = + CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct_1, &stream); + + reset_pbs_count(); + binary_op(&gpu_sks, &mut d_ctxt_0, &mut d_ctxt_1, &stream); + let pbs_count = get_pbs_count(); + bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits"); - let elements = throughput_num_threads(num_block); + let elements = throughput_num_threads(num_block, pbs_count); bench_group.throughput(Throughput::Elements(elements)); bench_group.bench_function(&bench_id, |b| { - let (cks, _cpu_sks) = - KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); - let gpu_sks = CudaServerKey::new(&cks, &stream); - let mut cts_0 = (0..elements) .map(|_| { let clearlow = rng.gen::(); @@ -1916,14 +2017,29 @@ mod cuda { }); } BenchmarkType::Throughput => { + let (cks, _cpu_sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); + let gpu_sks = CudaServerKey::new(&cks, &stream); + + // Execute the operation once to know its cost. 
+ let clear_cond = rng.gen::(); + let ct_then = cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block); + let ct_else = cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block); + let ct_cond = cks.encrypt_bool(clear_cond); + + let d_ct_cond = CudaBooleanBlock::from_boolean_block(&ct_cond, &stream); + let d_ct_then = + CudaSignedRadixCiphertext::from_signed_radix_ciphertext(&ct_then, &stream); + let d_ct_else = + CudaSignedRadixCiphertext::from_signed_radix_ciphertext(&ct_else, &stream); + + reset_pbs_count(); + gpu_sks.if_then_else(&d_ct_cond, &d_ct_then, &d_ct_else, &stream); + let pbs_count = get_pbs_count(); + bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits"); - let elements = throughput_num_threads(num_block); + let elements = throughput_num_threads(num_block, pbs_count); bench_group.throughput(Throughput::Elements(elements)); bench_group.bench_function(&bench_id, |b| { - let (cks, _cpu_sks) = - KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); - let gpu_sks = CudaServerKey::new(&cks, &stream); - let cts_cond = (0..elements) .map(|_| { let ct_cond = cks.encrypt_bool(rng.gen::()); @@ -2697,6 +2813,7 @@ use cuda::{ cuda_cast_ops, default_cuda_dedup_ops, default_cuda_ops, default_scalar_cuda_ops, unchecked_cuda_ops, unchecked_scalar_cuda_ops, }; +use tfhe::{get_pbs_count, reset_pbs_count}; #[cfg(feature = "gpu")] fn go_through_gpu_bench_groups(val: &str) { diff --git a/tfhe/benches/integer/zk_pke.rs b/tfhe/benches/integer/zk_pke.rs index 8d789caf1f..6c8e89d344 100644 --- a/tfhe/benches/integer/zk_pke.rs +++ b/tfhe/benches/integer/zk_pke.rs @@ -18,6 +18,7 @@ use tfhe::shortint::parameters::compact_public_key_only::p_fail_2_minus_64::ks_p use tfhe::shortint::parameters::key_switching::p_fail_2_minus_64::ks_pbs::V0_11_PARAM_KEYSWITCH_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64; use tfhe::shortint::parameters::PBSParameters; use tfhe::zk::{CompactPkeCrs, ZkComputeLoad}; +use tfhe::{get_pbs_count, reset_pbs_count}; use 
utilities::{write_to_json, OperatorType}; fn write_result(file: &mut File, name: &str, value: usize) { @@ -96,7 +97,17 @@ fn pke_zk_proof(c: &mut Criterion) { }); } BenchmarkType::Throughput => { - let elements = throughput_num_threads(num_block); + // Execute the operation once to know its cost. + let input_msg = rng.gen::(); + let messages = vec![input_msg; fhe_uint_count]; + + reset_pbs_count(); + let _ = tfhe::integer::ProvenCompactCiphertextList::builder(&pk) + .extend(messages.iter().copied()) + .build_with_proof_packed(&crs, &metadata, compute_load); + let pbs_count = get_pbs_count(); + + let elements = throughput_num_threads(num_block, pbs_count); bench_group.throughput(Throughput::Elements(elements)); bench_id = format!( @@ -304,7 +315,27 @@ fn pke_zk_verify(c: &mut Criterion, results_file: &Path) { } BenchmarkType::Throughput => { // In throughput mode object sizes are not recorded. - let elements = throughput_num_threads(num_block); + + // Execute the operation once to know its cost. + let input_msg = rng.gen::(); + let messages = vec![input_msg; fhe_uint_count]; + let ct1 = tfhe::integer::ProvenCompactCiphertextList::builder(&pk) + .extend(messages.iter().copied()) + .build_with_proof_packed(&crs, &metadata, compute_load) + .unwrap(); + + reset_pbs_count(); + let _ = ct1.verify_and_expand( + &crs, + &pk, + &metadata, + IntegerCompactCiphertextListExpansionMode::CastAndUnpackIfNecessary( + casting_key.as_view(), + ), + ); + let pbs_count = get_pbs_count(); + + let elements = throughput_num_threads(num_block, pbs_count); bench_group.throughput(Throughput::Elements(elements)); bench_id_verify = format!( diff --git a/tfhe/benches/utilities.rs b/tfhe/benches/utilities.rs index 5348010941..a343369256 100644 --- a/tfhe/benches/utilities.rs +++ b/tfhe/benches/utilities.rs @@ -392,7 +392,7 @@ pub mod integer_utils { /// Generate a number of threads to use to saturate current machine for throughput measurements. 
#[allow(dead_code)] - pub fn throughput_num_threads(num_block: usize) -> u64 { + pub fn throughput_num_threads(num_block: usize, op_pbs_count: u64) -> u64 { let ref_block_count = 32; // Represent a ciphertext of 64 bits for 2_2 parameters set let block_multiplicator = (ref_block_count as f64 / num_block as f64).ceil(); @@ -401,13 +401,22 @@ pub mod integer_utils { // This value is for Nvidia H100 GPU let streaming_multiprocessors = 132; let num_gpus = unsafe { cuda_get_number_of_gpus() }; - ((streaming_multiprocessors * num_gpus) as f64 * block_multiplicator) as u64 + let total_num_sm = streaming_multiprocessors * num_gpus; + // Some operations with a high count of PBS (e.g. division) would yield an operation + // loading value so low that the number of elements in the end wouldn't be meaningful. + let minimum_loading = 0.2; + let operation_loading = ((total_num_sm as u64 / op_pbs_count) as f64).max(minimum_loading); + (total_num_sm as f64 * block_multiplicator * operation_loading) as u64 } #[cfg(not(feature = "gpu"))] { let num_threads = rayon::current_num_threads() as f64; + // Some operations with a high count of PBS (e.g. division) would yield an operation + // loading value so low that the number of elements in the end wouldn't be meaningful. + let minimum_loading = 0.2; + let operation_loading = (num_threads / (op_pbs_count as f64)).max(minimum_loading); // Add 20% more to maximum threads available. - ((num_threads + (num_threads * 0.2)) * block_multiplicator) as u64 + ((num_threads + (num_threads * 0.2)) * block_multiplicator * operation_loading) as u64 } }