From dc59088748d21386e6cdb1a78c83c3b4e229bbfe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?David=20Test=C3=A9?= Date: Mon, 6 Jan 2025 15:51:34 +0100 Subject: [PATCH] chore(bench): new heuristic to define elements for throughput This is done to provide enough elements to fill up the backend while avoiding long execution times for heavy operations like multiplication or division. --- Makefile | 28 +- tfhe/benches/integer/bench.rs | 295 +++++++++++++++--- .../integer/glwe_packing_compression.rs | 85 ++--- tfhe/benches/integer/oprf.rs | 17 +- tfhe/benches/integer/signed_bench.rs | 293 ++++++++++++++--- tfhe/benches/integer/zk_pke.rs | 36 ++- tfhe/benches/utilities.rs | 16 +- 7 files changed, 614 insertions(+), 156 deletions(-) diff --git a/Makefile b/Makefile index 1e82fb8ffd..f287ad0cc8 100644 --- a/Makefile +++ b/Makefile @@ -281,14 +281,14 @@ check_typos: install_typos_checker .PHONY: clippy_gpu # Run clippy lints on tfhe with "gpu" enabled clippy_gpu: install_rs_check_toolchain RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \ - --features=boolean,shortint,integer,internal-keycache,gpu \ + --features=boolean,shortint,integer,internal-keycache,gpu,pbs-stats \ --all-targets \ -p $(TFHE_SPEC) -- --no-deps -D warnings .PHONY: check_gpu # Run check on tfhe with "gpu" enabled check_gpu: install_rs_check_toolchain RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" check \ - --features=boolean,shortint,integer,internal-keycache,gpu \ + --features=boolean,shortint,integer,internal-keycache,gpu,pbs-stats \ --all-targets \ -p $(TFHE_SPEC) @@ -393,10 +393,10 @@ clippy_trivium: install_rs_check_toolchain .PHONY: clippy_all_targets # Run clippy lints on all targets (benches, examples, etc.) 
clippy_all_targets: install_rs_check_toolchain RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \ - --features=boolean,shortint,integer,internal-keycache,zk-pok,strings \ + --features=boolean,shortint,integer,internal-keycache,zk-pok,strings,pbs-stats \ -p $(TFHE_SPEC) -- --no-deps -D warnings RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \ - --features=boolean,shortint,integer,internal-keycache,zk-pok,strings,experimental \ + --features=boolean,shortint,integer,internal-keycache,zk-pok,strings,pbs-stats,experimental \ -p $(TFHE_SPEC) -- --no-deps -D warnings .PHONY: clippy_tfhe_csprng # Run clippy lints on tfhe-csprng @@ -1045,35 +1045,35 @@ bench_integer: install_rs_check_toolchain RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \ cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \ --bench integer-bench \ - --features=integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) -- + --features=integer,internal-keycache,nightly-avx512,pbs-stats -p $(TFHE_SPEC) -- .PHONY: bench_signed_integer # Run benchmarks for signed integer bench_signed_integer: install_rs_check_toolchain RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \ cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \ --bench integer-signed-bench \ - --features=integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) -- + --features=integer,internal-keycache,nightly-avx512,pbs-stats -p $(TFHE_SPEC) -- .PHONY: bench_integer_gpu # Run benchmarks for integer on GPU backend bench_integer_gpu: install_rs_check_toolchain RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \ cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \ --bench integer-bench \ - --features=integer,gpu,internal-keycache,nightly-avx512 -p 
$(TFHE_SPEC) -- + --features=integer,gpu,internal-keycache,nightly-avx512,pbs-stats -p $(TFHE_SPEC) -- .PHONY: bench_integer_compression # Run benchmarks for unsigned integer compression bench_integer_compression: install_rs_check_toolchain RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \ cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \ --bench glwe_packing_compression-integer-bench \ - --features=integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) -- + --features=integer,internal-keycache,nightly-avx512,pbs-stats -p $(TFHE_SPEC) -- .PHONY: bench_integer_compression_gpu bench_integer_compression_gpu: install_rs_check_toolchain RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \ cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \ --bench glwe_packing_compression-integer-bench \ - --features=integer,internal-keycache,gpu -p $(TFHE_SPEC) -- + --features=integer,internal-keycache,gpu,pbs-stats -p $(TFHE_SPEC) -- .PHONY: bench_integer_multi_bit # Run benchmarks for unsigned integer using multi-bit parameters bench_integer_multi_bit: install_rs_check_toolchain @@ -1081,7 +1081,7 @@ bench_integer_multi_bit: install_rs_check_toolchain __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) \ cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \ --bench integer-bench \ - --features=integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) -- + --features=integer,internal-keycache,nightly-avx512,pbs-stats -p $(TFHE_SPEC) -- .PHONY: bench_signed_integer_multi_bit # Run benchmarks for signed integer using multi-bit parameters bench_signed_integer_multi_bit: install_rs_check_toolchain @@ -1089,7 +1089,7 @@ bench_signed_integer_multi_bit: install_rs_check_toolchain __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) \ cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \ --bench integer-signed-bench \ - --features=integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) -- + --features=integer,internal-keycache,nightly-avx512,pbs-stats -p 
$(TFHE_SPEC) -- .PHONY: bench_integer_multi_bit_gpu # Run benchmarks for integer on GPU backend using multi-bit parameters bench_integer_multi_bit_gpu: install_rs_check_toolchain @@ -1097,7 +1097,7 @@ bench_integer_multi_bit_gpu: install_rs_check_toolchain __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \ cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \ --bench integer-bench \ - --features=integer,gpu,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) -- + --features=integer,gpu,internal-keycache,nightly-avx512,pbs-stats -p $(TFHE_SPEC) -- .PHONY: bench_unsigned_integer_multi_bit_gpu # Run benchmarks for unsigned integer on GPU backend using multi-bit parameters bench_unsigned_integer_multi_bit_gpu: install_rs_check_toolchain @@ -1105,14 +1105,14 @@ bench_unsigned_integer_multi_bit_gpu: install_rs_check_toolchain __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \ cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \ --bench integer-bench \ - --features=integer,gpu,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) -- ::unsigned + --features=integer,gpu,internal-keycache,nightly-avx512,pbs-stats -p $(TFHE_SPEC) -- ::unsigned .PHONY: bench_integer_zk # Run benchmarks for integer encryption with ZK proofs bench_integer_zk: install_rs_check_toolchain RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \ cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \ --bench zk-pke-bench \ - --features=integer,internal-keycache,zk-pok,nightly-avx512 \ + --features=integer,internal-keycache,zk-pok,nightly-avx512,pbs-stats \ -p $(TFHE_SPEC) -- .PHONY: bench_shortint # Run benchmarks for shortint diff --git a/tfhe/benches/integer/bench.rs b/tfhe/benches/integer/bench.rs index d6ead085b7..4bdec50f86 100644 --- a/tfhe/benches/integer/bench.rs +++ b/tfhe/benches/integer/bench.rs @@ -11,6 +11,7 @@ use crate::utilities::{ use criterion::{criterion_group, Criterion, Throughput}; use 
rand::prelude::*; use rayon::prelude::*; +use std::cmp::max; use std::env; use tfhe::integer::keycache::KEY_CACHE; use tfhe::integer::prelude::*; @@ -143,15 +144,25 @@ fn bench_server_key_binary_function_clean_inputs( }); } BenchmarkType::Throughput => { + let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); + + // Execute the operation once to know its cost. + let clear_0 = gen_random_u256(&mut rng); + let mut ct_0 = cks.encrypt_radix(clear_0, num_block); + let clear_1 = gen_random_u256(&mut rng); + let mut ct_1 = cks.encrypt_radix(clear_1, num_block); + + reset_pbs_count(); + binary_op(&sks, &mut ct_0, &mut ct_1); + let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default + bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits"); bench_group .sample_size(10) .measurement_time(std::time::Duration::from_secs(30)); - let elements = throughput_num_threads(num_block); + let elements = throughput_num_threads(num_block, pbs_count); bench_group.throughput(Throughput::Elements(elements)); bench_group.bench_function(&bench_id, |b| { - let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); - let mut cts_0 = (0..elements) .map(|_| cks.encrypt_radix(gen_random_u256(&mut rng), num_block)) .collect::>(); @@ -294,15 +305,23 @@ fn bench_server_key_unary_function_clean_inputs( }); } BenchmarkType::Throughput => { + let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); + + // Execute the operation once to know its cost. 
+ let clear_0 = gen_random_u256(&mut rng); + let mut ct_0 = cks.encrypt_radix(clear_0, num_block); + + reset_pbs_count(); + unary_fn(&sks, &mut ct_0); + let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default + bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits"); bench_group .sample_size(10) .measurement_time(std::time::Duration::from_secs(30)); - let elements = throughput_num_threads(num_block); + let elements = throughput_num_threads(num_block, pbs_count); bench_group.throughput(Throughput::Elements(elements)); bench_group.bench_function(&bench_id, |b| { - let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); - let mut cts_0 = (0..elements) .map(|_| cks.encrypt_radix(gen_random_u256(&mut rng), num_block)) .collect::>(); @@ -451,15 +470,24 @@ fn bench_server_key_binary_scalar_function_clean_inputs( }); } BenchmarkType::Throughput => { + let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); + + // Execute the operation once to know its cost. 
+ let clear_0 = gen_random_u256(&mut rng); + let mut ct_0 = cks.encrypt_radix(clear_0, num_block); + let clear_1 = rng_func(&mut rng, bit_size) & max_value_for_bit_size; + + reset_pbs_count(); + binary_op(&sks, &mut ct_0, clear_1); + let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default + bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits"); bench_group .sample_size(10) .measurement_time(std::time::Duration::from_secs(30)); - let elements = throughput_num_threads(num_block); + let elements = throughput_num_threads(num_block, pbs_count); bench_group.throughput(Throughput::Elements(elements)); bench_group.bench_function(&bench_id, |b| { - let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); - let mut cts_0 = (0..elements) .map(|_| cks.encrypt_radix(gen_random_u256(&mut rng), num_block)) .collect::>(); @@ -567,15 +595,28 @@ fn if_then_else_parallelized(c: &mut Criterion) { }); } BenchmarkType::Throughput => { + let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); + + // Execute the operation once to know its cost. 
+ let clear_0 = gen_random_u256(&mut rng); + let true_ct = cks.encrypt_radix(clear_0, num_block); + + let clear_1 = gen_random_u256(&mut rng); + let false_ct = cks.encrypt_radix(clear_1, num_block); + + let condition = sks.create_trivial_boolean_block(rng.gen_bool(0.5)); + + reset_pbs_count(); + sks.if_then_else_parallelized(&condition, &true_ct, &false_ct); + let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default + bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits"); bench_group .sample_size(10) .measurement_time(std::time::Duration::from_secs(30)); - let elements = throughput_num_threads(num_block); + let elements = throughput_num_threads(num_block, pbs_count); bench_group.throughput(Throughput::Elements(elements)); bench_group.bench_function(&bench_id, |b| { - let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); - let cts_cond = (0..elements) .map(|_| sks.create_trivial_boolean_block(rng.gen_bool(0.5))) .collect::>(); @@ -663,20 +704,34 @@ fn ciphertexts_sum_parallelized(c: &mut Criterion) { }); } BenchmarkType::Throughput => { + let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); + + // Execute the operation once to know its cost. 
+ let nb_ctxt = bit_size.div_ceil(param.message_modulus().0.ilog2() as usize); + let cks = RadixClientKey::from((cks, nb_ctxt)); + + let clears = (0..len) + .map(|_| gen_random_u256(&mut rng) & max_for_bit_size) + .collect::>(); + let ctxts = clears + .iter() + .copied() + .map(|clear| cks.encrypt(clear)) + .collect::>(); + + reset_pbs_count(); + sks.sum_ciphertexts_parallelized(&ctxts); + let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default + bench_id = format!( "{bench_name}_{len}_ctxts::throughput::{param_name}::{bit_size}_bits" ); bench_group .sample_size(10) .measurement_time(std::time::Duration::from_secs(30)); - let elements = throughput_num_threads(num_block); + let elements = throughput_num_threads(num_block, pbs_count); bench_group.throughput(Throughput::Elements(elements)); bench_group.bench_function(&bench_id, |b| { - let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); - - let nb_ctxt = bit_size.div_ceil(param.message_modulus().0.ilog2() as usize); - let cks = RadixClientKey::from((cks, nb_ctxt)); - let cts = (0..elements) .map(|_| { let clears = (0..len) @@ -1308,18 +1363,22 @@ define_server_key_bench_default_fn!( mod cuda { use super::*; use criterion::criterion_group; + use std::cmp::max; use tfhe::core_crypto::gpu::CudaStreams; use tfhe::integer::gpu::ciphertext::boolean_value::CudaBooleanBlock; use tfhe::integer::gpu::ciphertext::CudaUnsignedRadixCiphertext; use tfhe::integer::gpu::server_key::CudaServerKey; + use tfhe::integer::{RadixCiphertext, ServerKey}; - fn bench_cuda_server_key_unary_function_clean_inputs( + fn bench_cuda_server_key_unary_function_clean_inputs( c: &mut Criterion, bench_name: &str, display_name: &str, unary_op: F, + unary_op_cpu: G, ) where F: Fn(&CudaServerKey, &mut CudaUnsignedRadixCiphertext, &CudaStreams) + Sync, + G: Fn(&ServerKey, &mut RadixCiphertext) + Sync, { let mut bench_group = c.benchmark_group(bench_name); bench_group @@ -1358,17 +1417,24 
@@ mod cuda { }); } BenchmarkType::Throughput => { + let (cks, cpu_sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); + let gpu_sks = CudaServerKey::new(&cks, &streams); + + let clear_0 = gen_random_u256(&mut rng); + let mut ct_0 = cks.encrypt_radix(clear_0, num_block); + + reset_pbs_count(); + // Use CPU operation as pbs_count do not count PBS on GPU backend. + unary_op_cpu(&cpu_sks, &mut ct_0); + let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default + bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits"); bench_group .sample_size(10) .measurement_time(std::time::Duration::from_secs(30)); - let elements = throughput_num_threads(num_block); + let elements = throughput_num_threads(num_block, pbs_count); bench_group.throughput(Throughput::Elements(elements)); bench_group.bench_function(&bench_id, |b| { - let (cks, _cpu_sks) = - KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); - let gpu_sks = CudaServerKey::new(&cks, &streams); - let mut cts_0 = (0..elements) .map(|_| { let ct_0 = cks.encrypt_radix(gen_random_u256(&mut rng), num_block); @@ -1401,11 +1467,12 @@ mod cuda { /// Base function to bench a server key function that is a binary operation, input ciphertext /// will contain only zero carries - fn bench_cuda_server_key_binary_function_clean_inputs( + fn bench_cuda_server_key_binary_function_clean_inputs( c: &mut Criterion, bench_name: &str, display_name: &str, binary_op: F, + binary_op_cpu: G, ) where F: Fn( &CudaServerKey, @@ -1413,6 +1480,7 @@ mod cuda { &mut CudaUnsignedRadixCiphertext, &CudaStreams, ) + Sync, + G: Fn(&ServerKey, &mut RadixCiphertext, &mut RadixCiphertext) + Sync, { let mut bench_group = c.benchmark_group(bench_name); bench_group @@ -1457,17 +1525,27 @@ mod cuda { }); } BenchmarkType::Throughput => { + let (cks, cpu_sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); + let gpu_sks = CudaServerKey::new(&cks, &streams); + + // Execute the 
operation once to know its cost. + let clear_0 = gen_random_u256(&mut rng); + let mut ct_0 = cks.encrypt_radix(clear_0, num_block); + let clear_1 = gen_random_u256(&mut rng); + let mut ct_1 = cks.encrypt_radix(clear_1, num_block); + + reset_pbs_count(); + // Use CPU operation as pbs_count do not count PBS on GPU backend. + binary_op_cpu(&cpu_sks, &mut ct_0, &mut ct_1); + let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default + bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits"); bench_group .sample_size(10) .measurement_time(std::time::Duration::from_secs(30)); - let elements = throughput_num_threads(num_block); + let elements = throughput_num_threads(num_block, pbs_count); bench_group.throughput(Throughput::Elements(elements)); bench_group.bench_function(&bench_id, |b| { - let (cks, _cpu_sks) = - KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); - let gpu_sks = CudaServerKey::new(&cks, &streams); - let mut cts_0 = (0..elements) .map(|_| { let ct_0 = cks.encrypt_radix(gen_random_u256(&mut rng), num_block); @@ -1506,15 +1584,17 @@ mod cuda { bench_group.finish() } - fn bench_cuda_server_key_binary_scalar_function_clean_inputs( + fn bench_cuda_server_key_binary_scalar_function_clean_inputs( c: &mut Criterion, bench_name: &str, display_name: &str, binary_op: F, - rng_func: G, + binary_op_cpu: G, + rng_func: H, ) where F: Fn(&CudaServerKey, &mut CudaUnsignedRadixCiphertext, ScalarType, &CudaStreams) + Sync, - G: Fn(&mut ThreadRng, usize) -> ScalarType, + G: Fn(&ServerKey, &mut RadixCiphertext, ScalarType) + Sync, + H: Fn(&mut ThreadRng, usize) -> ScalarType, { let mut bench_group = c.benchmark_group(bench_name); let mut rng = rand::thread_rng(); @@ -1564,19 +1644,28 @@ mod cuda { }); } BenchmarkType::Throughput => { + let (cks, cpu_sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); + let gpu_sks = CudaServerKey::new(&cks, &streams); + + // Execute the operation once to know 
its cost. + let clear_0 = gen_random_u256(&mut rng); + let mut ct_0 = cks.encrypt_radix(clear_0, num_block); + let clear_1 = rng_func(&mut rng, bit_size) & max_value_for_bit_size; + + reset_pbs_count(); + // Use CPU operation as pbs_count do not count PBS on GPU backend. + binary_op_cpu(&cpu_sks, &mut ct_0, clear_1); + let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default + bench_group .sample_size(10) .measurement_time(std::time::Duration::from_secs(30)); bench_id = format!( "{bench_name}::throughput::{param_name}::{bit_size}_bits_scalar_{bit_size}" ); - let elements = throughput_num_threads(num_block); + let elements = throughput_num_threads(num_block, pbs_count); bench_group.throughput(Throughput::Elements(elements)); bench_group.bench_function(&bench_id, |b| { - let (cks, _cpu_sks) = - KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); - let gpu_sks = CudaServerKey::new(&cks, &streams); - let mut cts_0 = (0..elements) .map(|_| { let ct_0 = cks.encrypt_radix(gen_random_u256(&mut rng), num_block); @@ -1667,17 +1756,28 @@ mod cuda { }); } BenchmarkType::Throughput => { + let (cks, cpu_sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); + let gpu_sks = CudaServerKey::new(&cks, &stream); + + // Execute the operation once to know its cost. + let clear_0 = gen_random_u256(&mut rng); + let ct_then = cks.encrypt_radix(clear_0, num_block); + let clear_1 = gen_random_u256(&mut rng); + let ct_else = cks.encrypt_radix(clear_1, num_block); + let ct_cond = cpu_sks.create_trivial_boolean_block(rng.gen_bool(0.5)); + + reset_pbs_count(); + // Use CPU operation as pbs_count do not count PBS on GPU backend. 
+ cpu_sks.if_then_else_parallelized(&ct_cond, &ct_then, &ct_else); + let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default + bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits"); bench_group .sample_size(10) .measurement_time(std::time::Duration::from_secs(30)); - let elements = throughput_num_threads(num_block); + let elements = throughput_num_threads(num_block, pbs_count); bench_group.throughput(Throughput::Elements(elements)); bench_group.bench_function(&bench_id, |b| { - let (cks, _cpu_sks) = - KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); - let gpu_sks = CudaServerKey::new(&cks, &stream); - let cts_cond = (0..elements) .map(|_| { let ct_cond = cks.encrypt_bool(rng.gen::()); @@ -1732,7 +1832,7 @@ mod cuda { } macro_rules! define_cuda_server_key_bench_clean_input_unary_fn ( - (method_name: $server_key_method:ident, display_name:$name:ident) => { + (method_name: $server_key_method:ident, method_name_cpu: $server_key_method_cpu:ident, display_name: $name:ident) => { ::paste::paste!{ fn [](c: &mut Criterion) { bench_cuda_server_key_unary_function_clean_inputs( @@ -1741,6 +1841,9 @@ mod cuda { stringify!($name), |server_key, lhs, stream| { server_key.$server_key_method(lhs, stream); + }, + |server_key_cpu, lhs| { + server_key_cpu.$server_key_method_cpu(lhs); } ) } @@ -1748,7 +1851,7 @@ mod cuda { }); macro_rules! 
define_cuda_server_key_bench_clean_input_fn ( - (method_name: $server_key_method:ident, display_name:$name:ident) => { + (method_name: $server_key_method:ident, method_name_cpu: $server_key_method_cpu:ident, display_name:$name:ident) => { ::paste::paste!{ fn [](c: &mut Criterion) { bench_cuda_server_key_binary_function_clean_inputs( @@ -1757,6 +1860,9 @@ mod cuda { stringify!($name), |server_key, lhs, rhs, stream| { server_key.$server_key_method(lhs, rhs, stream); + }, + |server_key_cpu, lhs, rhs| { + server_key_cpu.$server_key_method_cpu(lhs, rhs); } ) } @@ -1765,7 +1871,7 @@ mod cuda { ); macro_rules! define_cuda_server_key_bench_clean_input_scalar_fn ( - (method_name: $server_key_method:ident, display_name:$name:ident, rng_func:$($rng_fn:tt)*) => { + (method_name: $server_key_method:ident, method_name_cpu: $server_key_method_cpu:ident, display_name:$name:ident, rng_func:$($rng_fn:tt)*) => { ::paste::paste!{ fn [](c: &mut Criterion) { bench_cuda_server_key_binary_scalar_function_clean_inputs( @@ -1775,6 +1881,9 @@ mod cuda { |server_key, lhs, rhs, stream| { server_key.$server_key_method(lhs, rhs, stream); }, + |server_key_cpu, lhs, rhs| { + server_key_cpu.$server_key_method_cpu(lhs, rhs); + }, $($rng_fn)* ) } @@ -1787,222 +1896,262 @@ mod cuda { //=========================================== define_cuda_server_key_bench_clean_input_unary_fn!( method_name: unchecked_neg, + method_name_cpu: neg_parallelized, display_name: negation ); define_cuda_server_key_bench_clean_input_fn!( method_name: unchecked_bitand, + method_name_cpu: unchecked_bitand, display_name: bitand ); define_cuda_server_key_bench_clean_input_fn!( method_name: unchecked_bitor, + method_name_cpu: unchecked_bitor, display_name: bitor ); define_cuda_server_key_bench_clean_input_fn!( method_name: unchecked_bitxor, + method_name_cpu: unchecked_bitxor, display_name: bitxor ); define_cuda_server_key_bench_clean_input_unary_fn!( method_name: unchecked_bitnot, + method_name_cpu: bitnot, display_name: bitnot 
); define_cuda_server_key_bench_clean_input_fn!( method_name: unchecked_mul, + method_name_cpu: unchecked_mul_parallelized, display_name: mul ); define_cuda_server_key_bench_clean_input_fn!( method_name: unchecked_div_rem, + method_name_cpu: unchecked_div_rem_parallelized, display_name: div_mod ); define_cuda_server_key_bench_clean_input_fn!( method_name: unchecked_add, + method_name_cpu: unchecked_add_parallelized, display_name: add ); define_cuda_server_key_bench_clean_input_fn!( method_name: unchecked_sub, + method_name_cpu: unchecked_sub, display_name: sub ); define_cuda_server_key_bench_clean_input_fn!( method_name: unchecked_unsigned_overflowing_sub, + method_name_cpu: unsigned_overflowing_sub_parallelized, display_name: overflowing_sub ); define_cuda_server_key_bench_clean_input_fn!( method_name: unchecked_unsigned_overflowing_add, + method_name_cpu: unsigned_overflowing_add_parallelized, display_name: overflowing_add ); define_cuda_server_key_bench_clean_input_fn!( method_name: unchecked_eq, + method_name_cpu: unchecked_eq, display_name: equal ); define_cuda_server_key_bench_clean_input_fn!( method_name: unchecked_ne, + method_name_cpu: unchecked_ne, display_name: not_equal ); define_cuda_server_key_bench_clean_input_fn!( method_name: unchecked_left_shift, + method_name_cpu: unchecked_left_shift_parallelized, display_name: left_shift ); define_cuda_server_key_bench_clean_input_fn!( method_name: unchecked_right_shift, + method_name_cpu: unchecked_right_shift_parallelized, display_name: right_shift ); define_cuda_server_key_bench_clean_input_fn!( method_name: unchecked_rotate_left, + method_name_cpu: unchecked_rotate_left_parallelized, display_name: rotate_left ); define_cuda_server_key_bench_clean_input_fn!( method_name: unchecked_rotate_right, + method_name_cpu: unchecked_rotate_right_parallelized, display_name: rotate_right ); define_cuda_server_key_bench_clean_input_unary_fn!( method_name: unchecked_ilog2, + method_name_cpu: ilog2_parallelized, 
display_name: ilog2 ); define_cuda_server_key_bench_clean_input_scalar_fn!( method_name: unchecked_scalar_bitand, + method_name_cpu: unchecked_scalar_bitand_parallelized, display_name: bitand, rng_func: default_scalar ); define_cuda_server_key_bench_clean_input_scalar_fn!( method_name: unchecked_scalar_bitor, + method_name_cpu: unchecked_scalar_bitor_parallelized, display_name: bitand, rng_func: default_scalar ); define_cuda_server_key_bench_clean_input_scalar_fn!( method_name: unchecked_scalar_bitxor, + method_name_cpu: unchecked_scalar_bitxor_parallelized, display_name: bitand, rng_func: default_scalar ); define_cuda_server_key_bench_clean_input_scalar_fn!( method_name: unchecked_scalar_add, + method_name_cpu: unchecked_scalar_add, display_name: add, rng_func: default_scalar ); define_cuda_server_key_bench_clean_input_scalar_fn!( method_name: unchecked_scalar_mul, + method_name_cpu: unchecked_scalar_mul_parallelized, display_name: mul, rng_func: mul_scalar ); define_cuda_server_key_bench_clean_input_scalar_fn!( method_name: unchecked_scalar_sub, + method_name_cpu: unchecked_scalar_sub, display_name: sub, rng_func: default_scalar ); define_cuda_server_key_bench_clean_input_scalar_fn!( method_name: unchecked_scalar_left_shift, + method_name_cpu: unchecked_scalar_left_shift_parallelized, display_name: left_shift, rng_func: shift_scalar ); define_cuda_server_key_bench_clean_input_scalar_fn!( method_name: unchecked_scalar_right_shift, + method_name_cpu: unchecked_scalar_right_shift_parallelized, display_name: right_shift, rng_func: shift_scalar ); define_cuda_server_key_bench_clean_input_scalar_fn!( method_name: unchecked_scalar_rotate_left, + method_name_cpu: unchecked_scalar_rotate_left_parallelized, display_name: rotate_left, rng_func: shift_scalar ); define_cuda_server_key_bench_clean_input_scalar_fn!( method_name: unchecked_scalar_rotate_right, + method_name_cpu: unchecked_scalar_rotate_right_parallelized, display_name: rotate_right, rng_func: shift_scalar ); 
define_cuda_server_key_bench_clean_input_scalar_fn!( method_name: unchecked_scalar_eq, + method_name_cpu: unchecked_scalar_eq_parallelized, display_name: equal, rng_func: default_scalar ); define_cuda_server_key_bench_clean_input_scalar_fn!( method_name: unchecked_scalar_ne, + method_name_cpu: unchecked_scalar_ne_parallelized, display_name: not_equal, rng_func: default_scalar ); define_cuda_server_key_bench_clean_input_scalar_fn!( method_name: unchecked_scalar_gt, + method_name_cpu: unchecked_scalar_gt_parallelized, display_name: greater_than, rng_func: default_scalar ); define_cuda_server_key_bench_clean_input_scalar_fn!( method_name: unchecked_scalar_ge, + method_name_cpu: unchecked_scalar_ge_parallelized, display_name: greater_or_equal, rng_func: default_scalar ); define_cuda_server_key_bench_clean_input_scalar_fn!( method_name: unchecked_scalar_lt, + method_name_cpu: unchecked_scalar_lt_parallelized, display_name: less_than, rng_func: default_scalar ); define_cuda_server_key_bench_clean_input_scalar_fn!( method_name: unchecked_scalar_le, + method_name_cpu: unchecked_scalar_le_parallelized, display_name: less_or_equal, rng_func: default_scalar ); define_cuda_server_key_bench_clean_input_scalar_fn!( method_name: unchecked_scalar_max, + method_name_cpu: unchecked_scalar_max_parallelized, display_name: max, rng_func: default_scalar ); define_cuda_server_key_bench_clean_input_scalar_fn!( method_name: unchecked_scalar_min, + method_name_cpu: unchecked_scalar_min_parallelized, display_name: min, rng_func: default_scalar ); define_cuda_server_key_bench_clean_input_scalar_fn!( method_name: unchecked_scalar_div_rem, + method_name_cpu: unchecked_scalar_div_rem_parallelized, display_name: div_mod, rng_func: div_scalar ); define_cuda_server_key_bench_clean_input_scalar_fn!( method_name: unchecked_scalar_div, + method_name_cpu: unchecked_scalar_div_parallelized, display_name: div, rng_func: div_scalar ); define_cuda_server_key_bench_clean_input_scalar_fn!( method_name: 
unchecked_scalar_rem, + method_name_cpu: unchecked_scalar_rem_parallelized, display_name: modulo, rng_func: div_scalar ); define_cuda_server_key_bench_clean_input_scalar_fn!( method_name: unchecked_unsigned_overflowing_scalar_add, + method_name_cpu: unsigned_overflowing_scalar_add_parallelized, display_name: overflowing_add, rng_func: default_scalar ); @@ -2013,282 +2162,334 @@ mod cuda { define_cuda_server_key_bench_clean_input_unary_fn!( method_name: neg, + method_name_cpu: neg_parallelized, display_name: negation ); define_cuda_server_key_bench_clean_input_fn!( method_name: add, + method_name_cpu: add_parallelized, display_name: add ); define_cuda_server_key_bench_clean_input_fn!( method_name: sub, + method_name_cpu: sub_parallelized, display_name: sub ); define_cuda_server_key_bench_clean_input_fn!( method_name: unsigned_overflowing_sub, + method_name_cpu: unsigned_overflowing_sub_parallelized, display_name: overflowing_sub ); define_cuda_server_key_bench_clean_input_fn!( method_name: unsigned_overflowing_add, + method_name_cpu: unsigned_overflowing_add_parallelized, display_name: overflowing_add ); define_cuda_server_key_bench_clean_input_fn!( method_name: mul, + method_name_cpu: mul_parallelized, display_name: mul ); define_cuda_server_key_bench_clean_input_fn!( method_name: div_rem, + method_name_cpu: div_rem_parallelized, display_name: div_mod ); define_cuda_server_key_bench_clean_input_fn!( method_name: div, + method_name_cpu: div_parallelized, display_name: div ); define_cuda_server_key_bench_clean_input_fn!( method_name: rem, + method_name_cpu: rem_parallelized, display_name: modulo ); define_cuda_server_key_bench_clean_input_fn!( method_name: ne, + method_name_cpu: ne_parallelized, display_name: not_equal ); define_cuda_server_key_bench_clean_input_fn!( method_name: eq, + method_name_cpu: eq_parallelized, display_name: equal ); define_cuda_server_key_bench_clean_input_fn!( method_name: bitand, + method_name_cpu: bitand_parallelized, display_name: bitand 
); define_cuda_server_key_bench_clean_input_fn!( method_name: bitor, + method_name_cpu: bitor_parallelized, display_name: bitor ); define_cuda_server_key_bench_clean_input_fn!( method_name: bitxor, + method_name_cpu: bitxor_parallelized, display_name: bitxor ); define_cuda_server_key_bench_clean_input_unary_fn!( method_name: bitnot, + method_name_cpu: bitnot, display_name: bitnot ); define_cuda_server_key_bench_clean_input_fn!( method_name: gt, + method_name_cpu: gt_parallelized, display_name: greater_than ); define_cuda_server_key_bench_clean_input_fn!( method_name: ge, + method_name_cpu: ge_parallelized, display_name: greater_or_equal ); define_cuda_server_key_bench_clean_input_fn!( method_name: lt, + method_name_cpu: lt_parallelized, display_name: less_than ); define_cuda_server_key_bench_clean_input_fn!( method_name: le, + method_name_cpu: le_parallelized, display_name: less_or_equal ); define_cuda_server_key_bench_clean_input_fn!( method_name: max, + method_name_cpu: max_parallelized, display_name: max ); define_cuda_server_key_bench_clean_input_fn!( method_name: min, + method_name_cpu: min_parallelized, display_name: min ); define_cuda_server_key_bench_clean_input_fn!( method_name: left_shift, + method_name_cpu: left_shift_parallelized, display_name: left_shift ); define_cuda_server_key_bench_clean_input_fn!( method_name: right_shift, + method_name_cpu: right_shift_parallelized, display_name: right_shift ); define_cuda_server_key_bench_clean_input_fn!( method_name: rotate_left, + method_name_cpu: rotate_left_parallelized, display_name: rotate_left ); define_cuda_server_key_bench_clean_input_fn!( method_name: rotate_right, + method_name_cpu: rotate_right_parallelized, display_name: rotate_right ); define_cuda_server_key_bench_clean_input_unary_fn!( method_name: leading_zeros, + method_name_cpu: leading_zeros_parallelized, display_name: leading_zeros ); define_cuda_server_key_bench_clean_input_unary_fn!( method_name: leading_ones, + method_name_cpu: 
leading_ones_parallelized, display_name: leading_ones ); define_cuda_server_key_bench_clean_input_unary_fn!( method_name: trailing_zeros, + method_name_cpu: trailing_zeros_parallelized, display_name: trailing_zeros ); define_cuda_server_key_bench_clean_input_unary_fn!( method_name: trailing_ones, + method_name_cpu: trailing_ones_parallelized, display_name: trailing_ones ); define_cuda_server_key_bench_clean_input_unary_fn!( method_name: ilog2, + method_name_cpu: ilog2_parallelized, display_name: ilog2 ); define_cuda_server_key_bench_clean_input_scalar_fn!( method_name: scalar_sub, + method_name_cpu: scalar_sub_parallelized, display_name: sub, rng_func: default_scalar ); define_cuda_server_key_bench_clean_input_scalar_fn!( method_name: scalar_add, + method_name_cpu: scalar_add_parallelized, display_name: add, rng_func: default_scalar ); define_cuda_server_key_bench_clean_input_scalar_fn!( method_name: scalar_mul, + method_name_cpu: scalar_mul_parallelized, display_name: mul, rng_func: mul_scalar ); define_cuda_server_key_bench_clean_input_scalar_fn!( method_name: scalar_left_shift, + method_name_cpu: scalar_left_shift_parallelized, display_name: left_shift, rng_func: shift_scalar ); define_cuda_server_key_bench_clean_input_scalar_fn!( method_name: scalar_right_shift, + method_name_cpu: scalar_right_shift_parallelized, display_name: right_shift, rng_func: shift_scalar ); define_cuda_server_key_bench_clean_input_scalar_fn!( method_name: scalar_rotate_left, + method_name_cpu: scalar_rotate_left_parallelized, display_name: rotate_left, rng_func: shift_scalar ); define_cuda_server_key_bench_clean_input_scalar_fn!( method_name: scalar_rotate_right, + method_name_cpu: scalar_rotate_right_parallelized, display_name: rotate_right, rng_func: shift_scalar ); define_cuda_server_key_bench_clean_input_scalar_fn!( method_name: scalar_bitand, + method_name_cpu: scalar_bitand_parallelized, display_name: bitand, rng_func: default_scalar ); 
define_cuda_server_key_bench_clean_input_scalar_fn!( method_name: scalar_bitor, + method_name_cpu: scalar_bitor_parallelized, display_name: bitor, rng_func: default_scalar ); define_cuda_server_key_bench_clean_input_scalar_fn!( method_name: scalar_bitxor, + method_name_cpu: scalar_bitxor_parallelized, display_name: bitxor, rng_func: default_scalar ); define_cuda_server_key_bench_clean_input_scalar_fn!( method_name: scalar_eq, + method_name_cpu: scalar_eq_parallelized, display_name: equal, rng_func: default_scalar ); define_cuda_server_key_bench_clean_input_scalar_fn!( method_name: scalar_ne, + method_name_cpu: scalar_ne_parallelized, display_name: not_equal, rng_func: default_scalar ); define_cuda_server_key_bench_clean_input_scalar_fn!( method_name: scalar_gt, + method_name_cpu: scalar_gt_parallelized, display_name: greater_than, rng_func: default_scalar ); define_cuda_server_key_bench_clean_input_scalar_fn!( method_name: scalar_ge, + method_name_cpu: scalar_ge_parallelized, display_name: greater_or_equal, rng_func: default_scalar ); define_cuda_server_key_bench_clean_input_scalar_fn!( method_name: scalar_lt, + method_name_cpu: scalar_lt_parallelized, display_name: less_than, rng_func: default_scalar ); define_cuda_server_key_bench_clean_input_scalar_fn!( method_name: scalar_le, + method_name_cpu: scalar_le_parallelized, display_name: less_or_equal, rng_func: default_scalar ); define_cuda_server_key_bench_clean_input_scalar_fn!( method_name: scalar_max, + method_name_cpu: scalar_max_parallelized, display_name: max, rng_func: default_scalar ); define_cuda_server_key_bench_clean_input_scalar_fn!( method_name: scalar_min, + method_name_cpu: scalar_min_parallelized, display_name: min, rng_func: default_scalar ); define_cuda_server_key_bench_clean_input_scalar_fn!( method_name: scalar_div_rem, + method_name_cpu: scalar_div_rem_parallelized, display_name: div_mod, rng_func: div_scalar ); define_cuda_server_key_bench_clean_input_scalar_fn!( method_name: scalar_div, + 
method_name_cpu: scalar_div_parallelized, display_name: div, rng_func: div_scalar ); define_cuda_server_key_bench_clean_input_scalar_fn!( method_name: scalar_rem, + method_name_cpu: scalar_rem_parallelized, display_name: modulo, rng_func: div_scalar ); define_cuda_server_key_bench_clean_input_scalar_fn!( method_name: unsigned_overflowing_scalar_add, + method_name_cpu: unsigned_overflowing_scalar_add_parallelized, display_name: overflowing_add, rng_func: default_scalar ); @@ -2516,6 +2717,7 @@ use cuda::{ cuda_cast_ops, default_cuda_dedup_ops, default_cuda_ops, default_scalar_cuda_ops, unchecked_cuda_ops, unchecked_scalar_cuda_ops, }; +use tfhe::{get_pbs_count, reset_pbs_count}; criterion_group!( smart_ops, @@ -2617,6 +2819,7 @@ criterion_group!( criterion_group!( default_dedup_ops, + bitand_parallelized, add_parallelized, mul_parallelized, div_rem_parallelized, diff --git a/tfhe/benches/integer/glwe_packing_compression.rs b/tfhe/benches/integer/glwe_packing_compression.rs index cb30a63570..de030a8522 100644 --- a/tfhe/benches/integer/glwe_packing_compression.rs +++ b/tfhe/benches/integer/glwe_packing_compression.rs @@ -6,6 +6,7 @@ use crate::utilities::{ }; use criterion::{black_box, criterion_group, Criterion, Throughput}; use rayon::prelude::*; +use std::cmp::max; use tfhe::integer::ciphertext::CompressedCiphertextListBuilder; use tfhe::integer::{ClientKey, RadixCiphertext}; use tfhe::keycache::NamedParam; @@ -77,9 +78,19 @@ fn cpu_glwe_packing(c: &mut Criterion) { }); } BenchmarkType::Throughput => { + // Execute the operation once to know its cost. 
+ let ct = cks.encrypt_radix(0_u32, num_blocks); + let mut builder = CompressedCiphertextListBuilder::new(); + builder.push(ct); + let compressed = builder.build(&compression_key); + + reset_pbs_count(); + let _: RadixCiphertext = compressed.get(0, &decompression_key).unwrap().unwrap(); + let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default + let num_block = (bit_size as f64 / (param.message_modulus.0 as f64).log(2.0)).ceil() as usize; - let elements = throughput_num_threads(num_block); + let elements = throughput_num_threads(num_block, pbs_count); // FIXME thread usage seemed to be somewhat more "efficient". // For example, with bit_size = 2, my laptop is only using around 2/3 of the // available threads Thread usage increases with bit_size = 8 but @@ -150,6 +161,7 @@ fn cpu_glwe_packing(c: &mut Criterion) { #[cfg(feature = "gpu")] mod cuda { use super::*; + use std::cmp::max; use tfhe::core_crypto::gpu::CudaStreams; use tfhe::integer::gpu::ciphertext::compressed_ciphertext_list::CudaCompressedCiphertextListBuilder; use tfhe::integer::gpu::ciphertext::CudaUnsignedRadixCiphertext; @@ -185,27 +197,26 @@ mod cuda { let bench_id_pack; let bench_id_unpack; + // Generate private compression key + let cks = ClientKey::new(param); + let private_compression_key = cks.new_compression_private_key(comp_param); + + // Generate and convert compression keys + let (radix_cks, _) = gen_keys_radix_gpu(param, num_blocks, &stream); + let (compressed_compression_key, compressed_decompression_key) = + radix_cks.new_compressed_compression_decompression_keys(&private_compression_key); + let cuda_compression_key = compressed_compression_key.decompress_to_cuda(&stream); + let cuda_decompression_key = compressed_decompression_key.decompress_to_cuda( + radix_cks.parameters().glwe_dimension(), + radix_cks.parameters().polynomial_size(), + radix_cks.parameters().message_modulus(), + radix_cks.parameters().carry_modulus(), + 
radix_cks.parameters().ciphertext_modulus(), + &stream, + ); + match BENCH_TYPE.get().unwrap() { BenchmarkType::Latency => { - // Generate private compression key - let cks = ClientKey::new(param); - let private_compression_key = cks.new_compression_private_key(comp_param); - - // Generate and convert compression keys - let (radix_cks, _) = gen_keys_radix_gpu(param, num_blocks, &stream); - let (compressed_compression_key, compressed_decompression_key) = radix_cks - .new_compressed_compression_decompression_keys(&private_compression_key); - let cuda_compression_key = - compressed_compression_key.decompress_to_cuda(&stream); - let cuda_decompression_key = compressed_decompression_key.decompress_to_cuda( - radix_cks.parameters().glwe_dimension(), - radix_cks.parameters().polynomial_size(), - radix_cks.parameters().message_modulus(), - radix_cks.parameters().carry_modulus(), - radix_cks.parameters().ciphertext_modulus(), - &stream, - ); - // Encrypt let ct = cks.encrypt_radix(0_u32, num_blocks); let d_ct = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct, &stream); @@ -239,28 +250,25 @@ mod cuda { }); } BenchmarkType::Throughput => { + // Execute the operation once to know its cost. + let (cpu_compression_key, cpu_decompression_key) = + cks.new_compression_decompression_keys(&private_compression_key); + let ct = cks.encrypt_radix(0_u32, num_blocks); + let mut builder = CompressedCiphertextListBuilder::new(); + builder.push(ct); + let compressed = builder.build(&cpu_compression_key); + + reset_pbs_count(); + // Use CPU operation as pbs_count do not count PBS on GPU backend. 
+ let _: RadixCiphertext = + compressed.get(0, &cpu_decompression_key).unwrap().unwrap(); + let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default + let num_block = (bit_size as f64 / (param.message_modulus.0 as f64).log(2.0)) .ceil() as usize; - let elements = throughput_num_threads(num_block); + let elements = throughput_num_threads(num_block, pbs_count); bench_group.throughput(Throughput::Elements(elements)); - let cks = ClientKey::new(param); - let private_compression_key = cks.new_compression_private_key(comp_param); - - let (radix_cks, _) = gen_keys_radix_gpu(param, num_blocks, &stream); - let (compressed_compression_key, compressed_decompression_key) = radix_cks - .new_compressed_compression_decompression_keys(&private_compression_key); - let cuda_compression_key = - compressed_compression_key.decompress_to_cuda(&stream); - let cuda_decompression_key = compressed_decompression_key.decompress_to_cuda( - radix_cks.parameters().glwe_dimension(), - radix_cks.parameters().polynomial_size(), - radix_cks.parameters().message_modulus(), - radix_cks.parameters().carry_modulus(), - radix_cks.parameters().ciphertext_modulus(), - &stream, - ); - // Encrypt let ct = cks.encrypt_radix(0_u32, num_blocks); let d_ct = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct, &stream); @@ -344,6 +352,7 @@ criterion_group!(cpu_glwe_packing2, cpu_glwe_packing); #[cfg(feature = "gpu")] use cuda::gpu_glwe_packing2; +use tfhe::{get_pbs_count, reset_pbs_count}; fn main() { BENCH_TYPE.get_or_init(|| BenchmarkType::from_env().unwrap()); diff --git a/tfhe/benches/integer/oprf.rs b/tfhe/benches/integer/oprf.rs index 8bdc1e9407..07279f4a53 100644 --- a/tfhe/benches/integer/oprf.rs +++ b/tfhe/benches/integer/oprf.rs @@ -4,9 +4,11 @@ use crate::utilities::{ }; use criterion::{black_box, Criterion, Throughput}; use rayon::prelude::*; +use std::cmp::max; use tfhe::integer::keycache::KEY_CACHE; use tfhe::integer::IntegerKeyKind; use 
tfhe::keycache::NamedParam; +use tfhe::{get_pbs_count, reset_pbs_count}; use tfhe_csprng::seeders::Seed; pub fn unsigned_oprf(c: &mut Criterion) { @@ -40,12 +42,21 @@ pub fn unsigned_oprf(c: &mut Criterion) { }); } BenchmarkType::Throughput => { + let (_, sk) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); + + // Execute the operation once to know its cost. + reset_pbs_count(); + sk.par_generate_oblivious_pseudo_random_unsigned_integer_bounded( + Seed(0), + bit_size as u64, + num_block as u64, + ); + let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default + bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits"); - let elements = throughput_num_threads(num_block); + let elements = throughput_num_threads(num_block, pbs_count); bench_group.throughput(Throughput::Elements(elements)); bench_group.bench_function(&bench_id, |b| { - let (_, sk) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); - b.iter(|| { (0..elements).into_par_iter().for_each(|_| { sk.par_generate_oblivious_pseudo_random_unsigned_integer_bounded( diff --git a/tfhe/benches/integer/signed_bench.rs b/tfhe/benches/integer/signed_bench.rs index 9c1cf0ed1b..cfaffc0bf8 100644 --- a/tfhe/benches/integer/signed_bench.rs +++ b/tfhe/benches/integer/signed_bench.rs @@ -8,6 +8,7 @@ use crate::utilities::{ use criterion::{criterion_group, Criterion, Throughput}; use rand::prelude::*; use rayon::prelude::*; +use std::cmp::max; use std::env; use tfhe::integer::keycache::KEY_CACHE; use tfhe::integer::prelude::*; @@ -66,12 +67,20 @@ fn bench_server_key_signed_binary_function_clean_inputs( }); } BenchmarkType::Throughput => { + let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); + + // Execute the operation once to know its cost. 
+ let ct_0 = cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block); + let ct_1 = cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block); + + reset_pbs_count(); + binary_op(&sks, &ct_0, &ct_1); + let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default + bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits"); - let elements = throughput_num_threads(num_block); + let elements = throughput_num_threads(num_block, pbs_count); bench_group.throughput(Throughput::Elements(elements)); bench_group.bench_function(&bench_id, |b| { - let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); - let mut cts_0 = (0..elements) .map(|_| cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block)) .collect::>(); @@ -151,12 +160,21 @@ fn bench_server_key_signed_shift_function_clean_inputs( }); } BenchmarkType::Throughput => { + let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); + + // Execute the operation once to know its cost. 
+ let clear_1 = rng.gen_range(0u128..bit_size as u128); + let ct_0 = cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block); + let ct_1 = cks.encrypt_radix(clear_1, num_block); + + reset_pbs_count(); + binary_op(&sks, &ct_0, &ct_1); + let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default + bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits"); - let elements = throughput_num_threads(num_block); + let elements = throughput_num_threads(num_block, pbs_count); bench_group.throughput(Throughput::Elements(elements)); bench_group.bench_function(&bench_id, |b| { - let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); - let mut cts_0 = (0..elements) .map(|_| cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block)) .collect::>(); @@ -233,12 +251,19 @@ fn bench_server_key_unary_function_clean_inputs( }); } BenchmarkType::Throughput => { + let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); + + // Execute the operation once to know its cost. 
+ let ct_0 = cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block); + + reset_pbs_count(); + unary_fn(&sks, &ct_0); + let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default + bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits"); - let elements = throughput_num_threads(num_block); + let elements = throughput_num_threads(num_block, pbs_count); bench_group.throughput(Throughput::Elements(elements)); bench_group.bench_function(&bench_id, |b| { - let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); - let mut cts_0 = (0..elements) .map(|_| cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block)) .collect::>(); @@ -307,12 +332,21 @@ fn signed_if_then_else_parallelized(c: &mut Criterion) { }); } BenchmarkType::Throughput => { + let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); + + // Execute the operation once to know its cost. + let cond = sks.create_trivial_boolean_block(rng.gen_bool(0.5)); + let ct_then = cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block); + let ct_else = cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block); + + reset_pbs_count(); + sks.if_then_else_parallelized(&cond, &ct_then, &ct_else); + let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default + bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits"); - let elements = throughput_num_threads(num_block); + let elements = throughput_num_threads(num_block, pbs_count); bench_group.throughput(Throughput::Elements(elements)); bench_group.bench_function(&bench_id, |b| { - let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); - let cts_cond = (0..elements) .map(|_| sks.create_trivial_boolean_block(rng.gen_bool(0.5))) .collect::>(); @@ -830,12 +864,20 @@ fn bench_server_key_binary_scalar_function_clean_inputs( }); } BenchmarkType::Throughput => { + let (cks, sks) = 
KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); + + // Execute the operation once to know its cost. + let mut ct_0 = cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block); + let clear_1 = rng_func(&mut rng, bit_size); + + reset_pbs_count(); + binary_op(&sks, &mut ct_0, clear_1); + let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default + bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits"); - let elements = throughput_num_threads(num_block); + let elements = throughput_num_threads(num_block, pbs_count); bench_group.throughput(Throughput::Elements(elements)); bench_group.bench_function(&bench_id, |b| { - let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); - let mut cts_0 = (0..elements) .map(|_| cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block)) .collect::>(); @@ -1328,6 +1370,7 @@ mod cuda { use super::*; use criterion::criterion_group; use rayon::iter::IntoParallelRefIterator; + use std::cmp::max; use tfhe::core_crypto::gpu::CudaStreams; use tfhe::integer::gpu::ciphertext::boolean_value::CudaBooleanBlock; use tfhe::integer::gpu::ciphertext::{CudaSignedRadixCiphertext, CudaUnsignedRadixCiphertext}; @@ -1335,11 +1378,12 @@ mod cuda { /// Base function to bench a server key function that is a binary operation, input ciphertext /// will contain only zero carries - fn bench_cuda_server_key_binary_signed_function_clean_inputs( + fn bench_cuda_server_key_binary_signed_function_clean_inputs( c: &mut Criterion, bench_name: &str, display_name: &str, binary_op: F, + binary_op_cpu: G, ) where F: Fn( &CudaServerKey, @@ -1347,6 +1391,7 @@ mod cuda { &mut CudaSignedRadixCiphertext, &CudaStreams, ) + Sync, + G: Fn(&ServerKey, &SignedRadixCiphertext, &SignedRadixCiphertext) + Sync, { let mut bench_group = c.benchmark_group(bench_name); bench_group @@ -1401,14 +1446,22 @@ mod cuda { }); } BenchmarkType::Throughput => { + let (cks, cpu_sks) = 
KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); + let gpu_sks = CudaServerKey::new(&cks, &stream); + + // Execute the operation once to know its cost. + let mut ct_0 = cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block); + let mut ct_1 = cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block); + + reset_pbs_count(); + // Use CPU operation as pbs_count do not count PBS on GPU backend. + binary_op_cpu(&cpu_sks, &mut ct_0, &mut ct_1); + let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default + bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits"); - let elements = throughput_num_threads(num_block); + let elements = throughput_num_threads(num_block, pbs_count); bench_group.throughput(Throughput::Elements(elements)); bench_group.bench_function(&bench_id, |b| { - let (cks, _cpu_sks) = - KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); - let gpu_sks = CudaServerKey::new(&cks, &stream); - let mut cts_0 = (0..elements) .map(|_| { let clearlow = rng.gen::(); @@ -1460,7 +1513,7 @@ mod cuda { } macro_rules! 
define_cuda_server_key_bench_clean_input_signed_fn ( - (method_name: $server_key_method:ident, display_name:$name:ident) => { + (method_name: $server_key_method:ident, method_name_cpu: $server_key_method_cpu:ident, display_name:$name:ident) => { ::paste::paste!{ fn [](c: &mut Criterion) { bench_cuda_server_key_binary_signed_function_clean_inputs( @@ -1469,6 +1522,9 @@ mod cuda { stringify!($name), |server_key, lhs, rhs, stream| { server_key.$server_key_method(lhs, rhs, stream); + }, + |server_key_cpu, lhs, rhs| { + server_key_cpu.$server_key_method_cpu(lhs, rhs); } ) } @@ -1478,13 +1534,15 @@ mod cuda { /// Base function to bench a server key function that is a unary operation, input ciphertext /// will contain only zero carries - fn bench_cuda_server_key_unary_signed_function_clean_inputs( + fn bench_cuda_server_key_unary_signed_function_clean_inputs( c: &mut Criterion, bench_name: &str, display_name: &str, unary_op: F, + unary_op_cpu: G, ) where F: Fn(&CudaServerKey, &mut CudaSignedRadixCiphertext, &CudaStreams) + Sync, + G: Fn(&ServerKey, &SignedRadixCiphertext) + Sync, { let mut bench_group = c.benchmark_group(bench_name); bench_group @@ -1527,14 +1585,21 @@ mod cuda { }); } BenchmarkType::Throughput => { + let (cks, cpu_sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); + let gpu_sks = CudaServerKey::new(&cks, &stream); + + // Execute the operation once to know its cost. + let ct_0 = cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block); + + reset_pbs_count(); + // Use CPU operation as pbs_count do not count PBS on GPU backend. 
+ unary_op_cpu(&cpu_sks, &ct_0); + let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default + bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits"); - let elements = throughput_num_threads(num_block); + let elements = throughput_num_threads(num_block, pbs_count); bench_group.throughput(Throughput::Elements(elements)); bench_group.bench_function(&bench_id, |b| { - let (cks, _cpu_sks) = - KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); - let gpu_sks = CudaServerKey::new(&cks, &stream); - let mut cts_0 = (0..elements) .map(|_| { let clearlow = rng.gen::(); @@ -1572,7 +1637,7 @@ mod cuda { } macro_rules! define_cuda_server_key_bench_clean_input_signed_unary_fn ( - (method_name: $server_key_method:ident, display_name:$name:ident) => { + (method_name: $server_key_method:ident, method_name_cpu: $server_key_method_cpu:ident, display_name:$name:ident) => { ::paste::paste!{ fn [](c: &mut Criterion) { bench_cuda_server_key_unary_signed_function_clean_inputs( @@ -1581,6 +1646,9 @@ mod cuda { stringify!($name), |server_key, input, stream| { server_key.$server_key_method(input, stream); + }, + |server_key_cpu, lhs| { + server_key_cpu.$server_key_method_cpu(lhs); } ) } @@ -1588,15 +1656,17 @@ mod cuda { } ); - fn bench_cuda_server_key_binary_scalar_signed_function_clean_inputs( + fn bench_cuda_server_key_binary_scalar_signed_function_clean_inputs( c: &mut Criterion, bench_name: &str, display_name: &str, binary_op: F, - rng_func: G, + binary_op_cpu: G, + rng_func: H, ) where F: Fn(&CudaServerKey, &mut CudaSignedRadixCiphertext, ScalarType, &CudaStreams) + Sync, - G: Fn(&mut ThreadRng, usize) -> ScalarType, + G: Fn(&ServerKey, &mut SignedRadixCiphertext, ScalarType) + Sync, + H: Fn(&mut ThreadRng, usize) -> ScalarType, { let mut bench_group = c.benchmark_group(bench_name); bench_group @@ -1650,16 +1720,24 @@ mod cuda { }); } BenchmarkType::Throughput => { + let (cks, cpu_sks) = 
KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); + let gpu_sks = CudaServerKey::new(&cks, &stream); + + // Execute the operation once to know its cost. + let mut ct_0 = cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block); + let clear_0 = rng_func(&mut rng, bit_size); + + reset_pbs_count(); + // Use CPU operation as pbs_count do not count PBS on GPU backend. + binary_op_cpu(&cpu_sks, &mut ct_0, clear_0); + let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default + bench_id = format!( "{bench_name}::throughput::{param_name}::{bit_size}_bits_scalar_{bit_size}" ); - let elements = throughput_num_threads(num_block); + let elements = throughput_num_threads(num_block, pbs_count); bench_group.throughput(Throughput::Elements(elements)); bench_group.bench_function(&bench_id, |b| { - let (cks, _cpu_sks) = - KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); - let gpu_sks = CudaServerKey::new(&cks, &stream); - let mut cts_0 = (0..elements) .map(|_| { let clearlow = rng.gen::(); @@ -1702,7 +1780,7 @@ mod cuda { } macro_rules! 
define_cuda_server_key_bench_clean_input_scalar_signed_fn ( - (method_name: $server_key_method:ident, display_name:$name:ident, rng_func:$($rng_fn:tt)*) => { + (method_name: $server_key_method:ident, method_name_cpu: $server_key_method_cpu:ident, display_name:$name:ident, rng_func:$($rng_fn:tt)*) => { ::paste::paste!{ fn [](c: &mut Criterion) { bench_cuda_server_key_binary_scalar_signed_function_clean_inputs( @@ -1712,6 +1790,9 @@ mod cuda { |server_key, lhs, rhs, stream| { server_key.$server_key_method(lhs, rhs, stream); }, + |server_key_cpu, lhs, rhs| { + server_key_cpu.$server_key_method_cpu(lhs, rhs); + }, $($rng_fn)* ) } @@ -1721,11 +1802,12 @@ mod cuda { /// Base function to bench a server key function that is a binary operation for shift/rotate, /// input ciphertext will contain only zero carries - fn bench_cuda_server_key_shift_rotate_signed_function_clean_inputs( + fn bench_cuda_server_key_shift_rotate_signed_function_clean_inputs( c: &mut Criterion, bench_name: &str, display_name: &str, binary_op: F, + binary_op_cpu: G, ) where F: Fn( &CudaServerKey, @@ -1733,6 +1815,7 @@ mod cuda { &mut CudaUnsignedRadixCiphertext, &CudaStreams, ) + Sync, + G: Fn(&ServerKey, &SignedRadixCiphertext, &RadixCiphertext) + Sync, { let mut bench_group = c.benchmark_group(bench_name); bench_group @@ -1786,14 +1869,23 @@ mod cuda { }); } BenchmarkType::Throughput => { + let (cks, cpu_sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); + let gpu_sks = CudaServerKey::new(&cks, &stream); + + // Execute the operation once to know its cost. + let clear_1 = rng.gen_range(0u128..bit_size as u128); + let ct_0 = cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block); + let ct_1 = cks.encrypt_radix(clear_1, num_block); + + reset_pbs_count(); + // Use CPU operation as pbs_count do not count PBS on GPU backend. 
+ binary_op_cpu(&cpu_sks, &ct_0, &ct_1); + let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default + bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits"); - let elements = throughput_num_threads(num_block); + let elements = throughput_num_threads(num_block, pbs_count); bench_group.throughput(Throughput::Elements(elements)); bench_group.bench_function(&bench_id, |b| { - let (cks, _cpu_sks) = - KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); - let gpu_sks = CudaServerKey::new(&cks, &stream); - let mut cts_0 = (0..elements) .map(|_| { let clearlow = rng.gen::(); @@ -1843,7 +1935,7 @@ mod cuda { } macro_rules! define_cuda_server_key_bench_clean_input_signed_shift_rotate ( - (method_name: $server_key_method:ident, display_name:$name:ident) => { + (method_name: $server_key_method:ident, method_name_cpu: $server_key_method_cpu:ident, display_name:$name:ident) => { ::paste::paste!{ fn [](c: &mut Criterion) { bench_cuda_server_key_shift_rotate_signed_function_clean_inputs( @@ -1852,6 +1944,9 @@ mod cuda { stringify!($name), |server_key, lhs, rhs, stream| { server_key.$server_key_method(lhs, rhs, stream); + }, + |server_key_cpu, lhs, rhs| { + server_key_cpu.$server_key_method_cpu(lhs, rhs); } ) } @@ -1916,14 +2011,23 @@ mod cuda { }); } BenchmarkType::Throughput => { + let (cks, cpu_sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); + let gpu_sks = CudaServerKey::new(&cks, &stream); + + // Execute the operation once to know its cost. + let cond = cpu_sks.create_trivial_boolean_block(rng.gen_bool(0.5)); + let ct_then = cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block); + let ct_else = cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block); + + reset_pbs_count(); + // Use CPU operation as pbs_count do not count PBS on GPU backend. 
+ cpu_sks.if_then_else_parallelized(&cond, &ct_then, &ct_else); + let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default + bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits"); - let elements = throughput_num_threads(num_block); + let elements = throughput_num_threads(num_block, pbs_count); bench_group.throughput(Throughput::Elements(elements)); bench_group.bench_function(&bench_id, |b| { - let (cks, _cpu_sks) = - KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); - let gpu_sks = CudaServerKey::new(&cks, &stream); - let cts_cond = (0..elements) .map(|_| { let ct_cond = cks.encrypt_bool(rng.gen::()); @@ -1997,246 +2101,291 @@ mod cuda { define_cuda_server_key_bench_clean_input_signed_fn!( method_name: unchecked_add, + method_name_cpu: add_parallelized, display_name: add ); define_cuda_server_key_bench_clean_input_signed_fn!( method_name: unchecked_sub, + method_name_cpu: sub_parallelized, display_name: sub ); define_cuda_server_key_bench_clean_input_signed_unary_fn!( method_name: unchecked_neg, + method_name_cpu: neg_parallelized, display_name: neg ); define_cuda_server_key_bench_clean_input_signed_unary_fn!( method_name: unchecked_abs, + method_name_cpu: unchecked_abs_parallelized, display_name: abs ); define_cuda_server_key_bench_clean_input_signed_fn!( method_name: unchecked_mul, + method_name_cpu: unchecked_mul_parallelized, display_name: mul ); define_cuda_server_key_bench_clean_input_signed_fn!( method_name: unchecked_div_rem, + method_name_cpu: unchecked_div_rem_parallelized, display_name: div_mod ); define_cuda_server_key_bench_clean_input_signed_fn!( method_name: unchecked_bitand, + method_name_cpu: unchecked_bitand_parallelized, display_name: bitand ); define_cuda_server_key_bench_clean_input_signed_fn!( method_name: unchecked_bitor, + method_name_cpu: unchecked_bitor_parallelized, display_name: bitor ); define_cuda_server_key_bench_clean_input_signed_fn!( method_name: 
unchecked_bitxor, + method_name_cpu: unchecked_bitxor_parallelized, display_name: bitxor ); define_cuda_server_key_bench_clean_input_signed_unary_fn!( method_name: unchecked_bitnot, + method_name_cpu: bitnot, display_name: bitnot ); define_cuda_server_key_bench_clean_input_signed_shift_rotate!( method_name: unchecked_rotate_left, + method_name_cpu: unchecked_rotate_left_parallelized, display_name: rotate_left ); define_cuda_server_key_bench_clean_input_signed_shift_rotate!( method_name: unchecked_rotate_right, + method_name_cpu: unchecked_rotate_right_parallelized, display_name: rotate_right ); define_cuda_server_key_bench_clean_input_signed_shift_rotate!( method_name: unchecked_left_shift, + method_name_cpu: unchecked_left_shift_parallelized, display_name: left_shift ); define_cuda_server_key_bench_clean_input_signed_shift_rotate!( method_name: unchecked_right_shift, + method_name_cpu: unchecked_right_shift_parallelized, display_name: right_shift ); define_cuda_server_key_bench_clean_input_signed_fn!( method_name: unchecked_eq, + method_name_cpu: unchecked_eq_parallelized, display_name: eq ); define_cuda_server_key_bench_clean_input_signed_fn!( method_name: unchecked_ne, + method_name_cpu: unchecked_ne_parallelized, display_name: ne ); define_cuda_server_key_bench_clean_input_signed_fn!( method_name: unchecked_gt, + method_name_cpu: unchecked_gt_parallelized, display_name: gt ); define_cuda_server_key_bench_clean_input_signed_fn!( method_name: unchecked_ge, + method_name_cpu: unchecked_ge_parallelized, display_name: ge ); define_cuda_server_key_bench_clean_input_signed_fn!( method_name: unchecked_lt, + method_name_cpu: unchecked_lt_parallelized, display_name: lt ); define_cuda_server_key_bench_clean_input_signed_fn!( method_name: unchecked_le, + method_name_cpu: unchecked_le_parallelized, display_name: le ); define_cuda_server_key_bench_clean_input_signed_fn!( method_name: unchecked_min, + method_name_cpu: unchecked_min_parallelized, display_name: min ); 
define_cuda_server_key_bench_clean_input_signed_fn!( method_name: unchecked_max, + method_name_cpu: unchecked_max_parallelized, display_name: max ); define_cuda_server_key_bench_clean_input_signed_fn!( method_name: unchecked_signed_overflowing_add, + method_name_cpu: unchecked_signed_overflowing_add_parallelized, display_name: overflowing_add ); define_cuda_server_key_bench_clean_input_signed_fn!( method_name: unchecked_signed_overflowing_sub, + method_name_cpu: unchecked_signed_overflowing_sub_parallelized, display_name: overflowing_sub ); define_cuda_server_key_bench_clean_input_scalar_signed_fn!( method_name: unchecked_scalar_add, + method_name_cpu: scalar_add_parallelized, display_name: add, rng_func: default_signed_scalar ); define_cuda_server_key_bench_clean_input_scalar_signed_fn!( method_name: unchecked_scalar_mul, + method_name_cpu: unchecked_scalar_mul_parallelized, display_name: mul, rng_func: mul_signed_scalar ); define_cuda_server_key_bench_clean_input_scalar_signed_fn!( method_name: unchecked_scalar_sub, + method_name_cpu: scalar_sub_parallelized, display_name: sub, rng_func: default_signed_scalar ); define_cuda_server_key_bench_clean_input_scalar_signed_fn!( method_name: unchecked_scalar_bitand, + method_name_cpu: unchecked_scalar_bitand_parallelized, display_name: bitand, rng_func: default_signed_scalar ); define_cuda_server_key_bench_clean_input_scalar_signed_fn!( method_name: unchecked_scalar_bitor, + method_name_cpu: unchecked_scalar_bitor_parallelized, display_name: bitor, rng_func: default_signed_scalar ); define_cuda_server_key_bench_clean_input_scalar_signed_fn!( method_name: unchecked_scalar_bitxor, + method_name_cpu: unchecked_scalar_bitxor_parallelized, display_name: bitxor, rng_func: default_signed_scalar ); define_cuda_server_key_bench_clean_input_scalar_signed_fn!( method_name: unchecked_scalar_right_shift, + method_name_cpu: unchecked_scalar_right_shift_parallelized, display_name: right_shift, rng_func: default_signed_scalar ); 
define_cuda_server_key_bench_clean_input_scalar_signed_fn!( method_name: unchecked_scalar_left_shift, + method_name_cpu: unchecked_scalar_left_shift_parallelized, display_name: left_shift, rng_func: shift_scalar ); define_cuda_server_key_bench_clean_input_scalar_signed_fn!( method_name: unchecked_scalar_rotate_right, + method_name_cpu: unchecked_scalar_rotate_right_parallelized, display_name: rotate_right, rng_func: shift_scalar ); define_cuda_server_key_bench_clean_input_scalar_signed_fn!( method_name: unchecked_scalar_rotate_left, + method_name_cpu: unchecked_scalar_rotate_left_parallelized, display_name: rotate_left, rng_func: shift_scalar ); define_cuda_server_key_bench_clean_input_scalar_signed_fn!( method_name: unchecked_scalar_eq, + method_name_cpu: unchecked_scalar_eq_parallelized, display_name: eq, rng_func: default_signed_scalar ); define_cuda_server_key_bench_clean_input_scalar_signed_fn!( method_name: unchecked_scalar_ne, + method_name_cpu: unchecked_scalar_ne_parallelized, display_name: ne, rng_func: default_signed_scalar ); define_cuda_server_key_bench_clean_input_scalar_signed_fn!( method_name: unchecked_scalar_gt, + method_name_cpu: unchecked_scalar_gt_parallelized, display_name: gt, rng_func: default_signed_scalar ); define_cuda_server_key_bench_clean_input_scalar_signed_fn!( method_name: unchecked_scalar_ge, + method_name_cpu: unchecked_scalar_ge_parallelized, display_name: ge, rng_func: default_signed_scalar ); define_cuda_server_key_bench_clean_input_scalar_signed_fn!( method_name: unchecked_scalar_lt, + method_name_cpu: unchecked_scalar_lt_parallelized, display_name: lt, rng_func: default_signed_scalar ); define_cuda_server_key_bench_clean_input_scalar_signed_fn!( method_name: unchecked_scalar_le, + method_name_cpu: unchecked_scalar_le_parallelized, display_name: le, rng_func: default_signed_scalar ); define_cuda_server_key_bench_clean_input_scalar_signed_fn!( method_name: unchecked_scalar_min, + method_name_cpu: 
unchecked_scalar_min_parallelized, display_name: min, rng_func: default_signed_scalar ); define_cuda_server_key_bench_clean_input_scalar_signed_fn!( method_name: unchecked_scalar_max, + method_name_cpu: unchecked_scalar_max_parallelized, display_name: max, rng_func: default_signed_scalar ); define_cuda_server_key_bench_clean_input_scalar_signed_fn!( method_name: signed_overflowing_scalar_add, + method_name_cpu: signed_overflowing_scalar_add_parallelized, display_name: overflowing_add, rng_func: default_signed_scalar ); define_cuda_server_key_bench_clean_input_scalar_signed_fn!( method_name: signed_overflowing_scalar_sub, + method_name_cpu: signed_overflowing_scalar_sub_parallelized, display_name: overflowing_sub, rng_func: default_signed_scalar ); define_cuda_server_key_bench_clean_input_scalar_signed_fn!( method_name: unchecked_signed_scalar_div_rem, + method_name_cpu: unchecked_signed_scalar_div_rem_parallelized, display_name: div_rem, rng_func: div_scalar ); @@ -2247,234 +2396,277 @@ mod cuda { define_cuda_server_key_bench_clean_input_signed_fn!( method_name: add, + method_name_cpu: add_parallelized, display_name: add ); define_cuda_server_key_bench_clean_input_signed_fn!( method_name: sub, + method_name_cpu: sub_parallelized, display_name: sub ); define_cuda_server_key_bench_clean_input_signed_unary_fn!( method_name: neg, + method_name_cpu: neg_parallelized, display_name: neg ); define_cuda_server_key_bench_clean_input_signed_unary_fn!( method_name: abs, + method_name_cpu: abs_parallelized, display_name: abs ); define_cuda_server_key_bench_clean_input_signed_fn!( method_name: mul, + method_name_cpu: mul_parallelized, display_name: mul ); define_cuda_server_key_bench_clean_input_signed_fn!( method_name: div_rem, + method_name_cpu: div_rem_parallelized, display_name: div_mod ); define_cuda_server_key_bench_clean_input_signed_fn!( method_name: bitand, + method_name_cpu: bitand_parallelized, display_name: bitand ); 
define_cuda_server_key_bench_clean_input_signed_unary_fn!( method_name: bitnot, + method_name_cpu: bitnot, display_name: bitnot ); define_cuda_server_key_bench_clean_input_signed_fn!( method_name: bitor, + method_name_cpu: bitor_parallelized, display_name: bitor ); define_cuda_server_key_bench_clean_input_signed_fn!( method_name: bitxor, + method_name_cpu: bitxor_parallelized, display_name: bitxor ); define_cuda_server_key_bench_clean_input_signed_shift_rotate!( method_name: rotate_left, + method_name_cpu: rotate_left_parallelized, display_name: rotate_left ); define_cuda_server_key_bench_clean_input_signed_shift_rotate!( method_name: rotate_right, + method_name_cpu: rotate_right_parallelized, display_name: rotate_right ); define_cuda_server_key_bench_clean_input_signed_shift_rotate!( method_name: left_shift, + method_name_cpu: left_shift_parallelized, display_name: left_shift ); define_cuda_server_key_bench_clean_input_signed_shift_rotate!( method_name: right_shift, + method_name_cpu: right_shift_parallelized, display_name: right_shift ); define_cuda_server_key_bench_clean_input_signed_fn!( method_name: eq, + method_name_cpu: eq_parallelized, display_name: eq ); define_cuda_server_key_bench_clean_input_signed_fn!( method_name: ne, + method_name_cpu: ne_parallelized, display_name: ne ); define_cuda_server_key_bench_clean_input_signed_fn!( method_name: gt, + method_name_cpu: gt_parallelized, display_name: gt ); define_cuda_server_key_bench_clean_input_signed_fn!( method_name: ge, + method_name_cpu: ge_parallelized, display_name: ge ); define_cuda_server_key_bench_clean_input_signed_fn!( method_name: lt, + method_name_cpu: lt_parallelized, display_name: lt ); define_cuda_server_key_bench_clean_input_signed_fn!( method_name: le, + method_name_cpu: le_parallelized, display_name: le ); define_cuda_server_key_bench_clean_input_signed_fn!( method_name: min, + method_name_cpu: min_parallelized, display_name: min ); define_cuda_server_key_bench_clean_input_signed_fn!( 
method_name: max, + method_name_cpu: max_parallelized, display_name: max ); define_cuda_server_key_bench_clean_input_signed_fn!( method_name: signed_overflowing_add, + method_name_cpu: signed_overflowing_add_parallelized, display_name: overflowing_add ); define_cuda_server_key_bench_clean_input_signed_fn!( method_name: signed_overflowing_sub, + method_name_cpu: signed_overflowing_sub_parallelized, display_name: overflowing_sub ); define_cuda_server_key_bench_clean_input_scalar_signed_fn!( method_name: scalar_add, + method_name_cpu: scalar_add_parallelized, display_name: add, rng_func: default_signed_scalar ); define_cuda_server_key_bench_clean_input_scalar_signed_fn!( method_name: scalar_mul, + method_name_cpu: scalar_mul_parallelized, display_name: mul, rng_func: mul_signed_scalar ); define_cuda_server_key_bench_clean_input_scalar_signed_fn!( method_name: scalar_sub, + method_name_cpu: scalar_sub_parallelized, display_name: sub, rng_func: default_signed_scalar ); define_cuda_server_key_bench_clean_input_scalar_signed_fn!( method_name: scalar_bitand, + method_name_cpu: scalar_bitand_parallelized, display_name: bitand, rng_func: default_signed_scalar ); define_cuda_server_key_bench_clean_input_scalar_signed_fn!( method_name: scalar_bitor, + method_name_cpu: scalar_bitor_parallelized, display_name: bitor, rng_func: default_signed_scalar ); define_cuda_server_key_bench_clean_input_scalar_signed_fn!( method_name: scalar_bitxor, + method_name_cpu: scalar_bitxor_parallelized, display_name: bitxor, rng_func: default_signed_scalar ); define_cuda_server_key_bench_clean_input_scalar_signed_fn!( method_name: scalar_left_shift, + method_name_cpu: scalar_left_shift_parallelized, display_name: left_shift, rng_func: shift_scalar ); define_cuda_server_key_bench_clean_input_scalar_signed_fn!( method_name: scalar_right_shift, + method_name_cpu: scalar_right_shift_parallelized, display_name: right_shift, rng_func: shift_scalar ); 
define_cuda_server_key_bench_clean_input_scalar_signed_fn!( method_name: scalar_rotate_left, + method_name_cpu: scalar_rotate_left_parallelized, display_name: rotate_left, rng_func: shift_scalar ); define_cuda_server_key_bench_clean_input_scalar_signed_fn!( method_name: scalar_rotate_right, + method_name_cpu: scalar_rotate_right_parallelized, display_name: rotate_right, rng_func: shift_scalar ); define_cuda_server_key_bench_clean_input_scalar_signed_fn!( method_name: scalar_eq, + method_name_cpu: scalar_eq_parallelized, display_name: eq, rng_func: default_signed_scalar ); define_cuda_server_key_bench_clean_input_scalar_signed_fn!( method_name: scalar_ne, + method_name_cpu: scalar_ne_parallelized, display_name: ne, rng_func: default_signed_scalar ); define_cuda_server_key_bench_clean_input_scalar_signed_fn!( method_name: scalar_gt, + method_name_cpu: scalar_gt_parallelized, display_name: gt, rng_func: default_signed_scalar ); define_cuda_server_key_bench_clean_input_scalar_signed_fn!( method_name: scalar_ge, + method_name_cpu: scalar_ge_parallelized, display_name: ge, rng_func: default_signed_scalar ); define_cuda_server_key_bench_clean_input_scalar_signed_fn!( method_name: scalar_lt, + method_name_cpu: scalar_lt_parallelized, display_name: lt, rng_func: default_signed_scalar ); define_cuda_server_key_bench_clean_input_scalar_signed_fn!( method_name: scalar_le, + method_name_cpu: scalar_le_parallelized, display_name: le, rng_func: default_signed_scalar ); define_cuda_server_key_bench_clean_input_scalar_signed_fn!( method_name: scalar_min, + method_name_cpu: scalar_min_parallelized, display_name: min, rng_func: default_signed_scalar ); define_cuda_server_key_bench_clean_input_scalar_signed_fn!( method_name: scalar_max, + method_name_cpu: scalar_max_parallelized, display_name: max, rng_func: default_signed_scalar ); define_cuda_server_key_bench_clean_input_scalar_signed_fn!( method_name: signed_scalar_div_rem, + method_name_cpu: signed_scalar_div_rem_parallelized, 
display_name: div_rem, rng_func: div_scalar ); @@ -2697,6 +2889,7 @@ use cuda::{ cuda_cast_ops, default_cuda_dedup_ops, default_cuda_ops, default_scalar_cuda_ops, unchecked_cuda_ops, unchecked_scalar_cuda_ops, }; +use tfhe::{get_pbs_count, reset_pbs_count}; #[cfg(feature = "gpu")] fn go_through_gpu_bench_groups(val: &str) { diff --git a/tfhe/benches/integer/zk_pke.rs b/tfhe/benches/integer/zk_pke.rs index 8d789caf1f..79615b0b2c 100644 --- a/tfhe/benches/integer/zk_pke.rs +++ b/tfhe/benches/integer/zk_pke.rs @@ -5,6 +5,7 @@ use crate::utilities::{throughput_num_threads, BenchmarkType, BENCH_TYPE}; use criterion::{criterion_group, Criterion, Throughput}; use rand::prelude::*; use rayon::prelude::*; +use std::cmp::max; use std::fs::{File, OpenOptions}; use std::io::Write; use std::path::Path; @@ -18,6 +19,7 @@ use tfhe::shortint::parameters::compact_public_key_only::p_fail_2_minus_64::ks_p use tfhe::shortint::parameters::key_switching::p_fail_2_minus_64::ks_pbs::V0_11_PARAM_KEYSWITCH_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64; use tfhe::shortint::parameters::PBSParameters; use tfhe::zk::{CompactPkeCrs, ZkComputeLoad}; +use tfhe::{get_pbs_count, reset_pbs_count}; use utilities::{write_to_json, OperatorType}; fn write_result(file: &mut File, name: &str, value: usize) { @@ -96,7 +98,17 @@ fn pke_zk_proof(c: &mut Criterion) { }); } BenchmarkType::Throughput => { - let elements = throughput_num_threads(num_block); + // Execute the operation once to know its cost. 
+ let input_msg = rng.gen::<u64>(); + let messages = vec![input_msg; fhe_uint_count]; + + reset_pbs_count(); + let _ = tfhe::integer::ProvenCompactCiphertextList::builder(&pk) + .extend(messages.iter().copied()) + .build_with_proof_packed(&crs, &metadata, compute_load); + let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default + + let elements = throughput_num_threads(num_block, pbs_count); bench_group.throughput(Throughput::Elements(elements)); bench_id = format!( @@ -304,7 +316,27 @@ fn pke_zk_verify(c: &mut Criterion, results_file: &Path) { } BenchmarkType::Throughput => { // In throughput mode object sizes are not recorded. - let elements = throughput_num_threads(num_block); + + // Execute the operation once to know its cost. + let input_msg = rng.gen::<u64>(); + let messages = vec![input_msg; fhe_uint_count]; + let ct1 = tfhe::integer::ProvenCompactCiphertextList::builder(&pk) + .extend(messages.iter().copied()) + .build_with_proof_packed(&crs, &metadata, compute_load) + .unwrap(); + + reset_pbs_count(); + let _ = ct1.verify_and_expand( + &crs, + &pk, + &metadata, + IntegerCompactCiphertextListExpansionMode::CastAndUnpackIfNecessary( + casting_key.as_view(), + ), + ); + let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default + + let elements = throughput_num_threads(num_block, pbs_count); bench_group.throughput(Throughput::Elements(elements)); bench_id_verify = format!( diff --git a/tfhe/benches/utilities.rs b/tfhe/benches/utilities.rs index 5348010941..e30fefff17 100644 --- a/tfhe/benches/utilities.rs +++ b/tfhe/benches/utilities.rs @@ -392,7 +392,7 @@ pub mod integer_utils { /// Generate a number of threads to use to saturate current machine for throughput measurements.
#[allow(dead_code)] - pub fn throughput_num_threads(num_block: usize) -> u64 { + pub fn throughput_num_threads(num_block: usize, op_pbs_count: u64) -> u64 { let ref_block_count = 32; // Represent a ciphertext of 64 bits for 2_2 parameters set let block_multiplicator = (ref_block_count as f64 / num_block as f64).ceil(); @@ -401,13 +401,23 @@ pub mod integer_utils { // This value is for Nvidia H100 GPU let streaming_multiprocessors = 132; let num_gpus = unsafe { cuda_get_number_of_gpus() }; - ((streaming_multiprocessors * num_gpus) as f64 * block_multiplicator) as u64 + let total_num_sm = streaming_multiprocessors * num_gpus; + // Some operations with a high count of PBS (e.g. division) would yield an operation + // loading value so low that the number of elements in the end wouldn't be meaningful. + let minimum_loading = 0.2; + let operation_loading = + (total_num_sm as f64 / op_pbs_count as f64).max(minimum_loading); + (total_num_sm as f64 * block_multiplicator * operation_loading) as u64 } #[cfg(not(feature = "gpu"))] { let num_threads = rayon::current_num_threads() as f64; + // Some operations with a high count of PBS (e.g. division) would yield an operation + // loading value so low that the number of elements in the end wouldn't be meaningful. + let minimum_loading = 0.2; + let operation_loading = (num_threads / (op_pbs_count as f64)).max(minimum_loading); // Add 20% more to maximum threads available. - ((num_threads + (num_threads * 0.2)) * block_multiplicator) as u64 + ((num_threads + (num_threads * 0.2)) * block_multiplicator * operation_loading) as u64 } }