feat(gpu): implement fhe rand on gpu

zama-ai · Jan 14, 2025 · 63eb9b8 · 63eb9b8
1 parent f7189ed
commit 63eb9b8
Show file tree

Hide file tree

Showing 15 changed files with 1,105 additions and 73 deletions.
diff --git a/backends/tfhe-cuda-backend/cuda/include/linear_algebra.h b/backends/tfhe-cuda-backend/cuda/include/linear_algebra.h
@@ -44,6 +44,11 @@ void cuda_mult_lwe_ciphertext_vector_cleartext_vector_64(
     void *stream, uint32_t gpu_index, void *lwe_array_out,
     void const *lwe_array_in, void const *cleartext_array_in,
     uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count);
+void cuda_add_lwe_ciphertext_vector_plaintext_64(
+    void *stream, uint32_t gpu_index, void *lwe_array_out,
+    void const *lwe_array_in, const uint64_t plaintext_in,
+    const uint32_t input_lwe_dimension,
+    const uint32_t input_lwe_ciphertext_count);
 }
 
 #endif // CUDA_LINALG_H_
diff --git a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cu b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cu
@@ -114,3 +114,41 @@ void cuda_add_lwe_ciphertext_vector_plaintext_vector_64(
       static_cast<const uint64_t *>(plaintext_array_in), input_lwe_dimension,
       input_lwe_ciphertext_count);
 }
+
+/*
+ * Perform the addition of a u64 input LWE ciphertext vector with a u64 input
+ * plaintext scalar.
+ * - `stream` is a void pointer to the Cuda stream to be used in the kernel
+ * launch
+ * - `gpu_index` is the index of the GPU to be used in the kernel launch
+ * - `lwe_array_out` is an array of size
+ * `(input_lwe_dimension + 1) * input_lwe_ciphertext_count` that should have
+ * been allocated on the GPU before calling this function, and that will hold
+ * the result of the computation.
+ * - `lwe_array_in` is the LWE ciphertext vector used as input, it should have
+ * been allocated and initialized before calling this function. It has the same
+ * size as the output array.
+ * - `plaintext_in` is the plaintext used as input.
+ * - `input_lwe_dimension` is the number of mask elements in the input and
+ * output LWE ciphertext vectors
+ * - `input_lwe_ciphertext_count` is the number of ciphertexts contained in the
+ * input LWE ciphertext vector, as well as in the output.
+ *
+ * The same input plaintext is added to the body of the
+ * LWE ciphertexts in the LWE ciphertext vector. The result of the
+ * operation is stored in the output LWE ciphertext vector. The two input
+ * vectors are unchanged. This function is a wrapper to a device function that
+ * performs the operation on the GPU.
+ */
+void cuda_add_lwe_ciphertext_vector_plaintext_64(
+    void *stream, uint32_t gpu_index, void *lwe_array_out,
+    void const *lwe_array_in, const uint64_t plaintext_in,
+    const uint32_t input_lwe_dimension,
+    const uint32_t input_lwe_ciphertext_count) {
+
+  host_addition_plaintext_scalar<uint64_t>(
+      static_cast<cudaStream_t>(stream), gpu_index,
+      static_cast<uint64_t *>(lwe_array_out),
+      static_cast<const uint64_t *>(lwe_array_in), plaintext_in,
+      input_lwe_dimension, input_lwe_ciphertext_count);
+}
diff --git a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cuh b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cuh
@@ -27,6 +27,21 @@ plaintext_addition(T *output, T const *lwe_input, T const *plaintext_input,
   }
 }
 
+template <typename T>
+__global__ void plaintext_addition_scalar(T *output, T const *lwe_input,
+                                          const T plaintext_input,
+                                          const uint32_t input_lwe_dimension,
+                                          const uint32_t num_entries) {
+
+  int tid = threadIdx.x;
+  int lwe_index = blockIdx.x * blockDim.x + tid;
+  if (lwe_index < num_entries) {
+    int index = lwe_index * (input_lwe_dimension + 1) + input_lwe_dimension;
+    // Here we take advantage of the wrapping behaviour of uint
+    output[index] = lwe_input[index] + plaintext_input;
+  }
+}
+
 template <typename T>
 __host__ void
 host_addition_plaintext(cudaStream_t stream, uint32_t gpu_index, T *output,
@@ -48,6 +63,27 @@ host_addition_plaintext(cudaStream_t stream, uint32_t gpu_index, T *output,
   check_cuda_error(cudaGetLastError());
 }
 
+template <typename T>
+__host__ void host_addition_plaintext_scalar(
+    cudaStream_t stream, uint32_t gpu_index, T *output, T const *lwe_input,
+    const T plaintext_input, const uint32_t lwe_dimension,
+    const uint32_t lwe_ciphertext_count) {
+
+  cudaSetDevice(gpu_index);
+  int num_blocks = 0, num_threads = 0;
+  int num_entries = lwe_ciphertext_count;
+  getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
+  dim3 grid(num_blocks, 1, 1);
+  dim3 thds(num_threads, 1, 1);
+
+  cuda_memcpy_async_gpu_to_gpu(
+      output, lwe_input, (lwe_dimension + 1) * lwe_ciphertext_count * sizeof(T),
+      stream, gpu_index);
+  plaintext_addition_scalar<T><<<grid, thds, 0, stream>>>(
+      output, lwe_input, plaintext_input, lwe_dimension, num_entries);
+  check_cuda_error(cudaGetLastError());
+}
+
 template <typename T>
 __global__ void addition(T *output, T const *input_1, T const *input_2,
                          uint32_t num_entries) {

diff --git a/backends/tfhe-cuda-backend/src/bindings.rs b/backends/tfhe-cuda-backend/src/bindings.rs
@@ -1345,6 +1345,17 @@ extern "C" {
         input_lwe_ciphertext_count: u32,
     );
 }
+extern "C" {
+    pub fn cuda_add_lwe_ciphertext_vector_plaintext_64(
+        stream: *mut ffi::c_void,
+        gpu_index: u32,
+        lwe_array_out: *mut ffi::c_void,
+        lwe_array_in: *const ffi::c_void,
+        plaintext_in: u64,
+        input_lwe_dimension: u32,
+        input_lwe_ciphertext_count: u32,
+    );
+}
 extern "C" {
     pub fn cuda_fourier_polynomial_mul(
         stream: *mut ffi::c_void,

diff --git a/tfhe/benches/integer/bench.rs b/tfhe/benches/integer/bench.rs
@@ -1307,11 +1307,12 @@ define_server_key_bench_default_fn!(
 #[cfg(feature = "gpu")]
 mod cuda {
     use super::*;
-    use criterion::criterion_group;
+    use criterion::{black_box, criterion_group};
     use tfhe::core_crypto::gpu::CudaStreams;
     use tfhe::integer::gpu::ciphertext::boolean_value::CudaBooleanBlock;
     use tfhe::integer::gpu::ciphertext::CudaUnsignedRadixCiphertext;
     use tfhe::integer::gpu::server_key::CudaServerKey;
+    use tfhe_csprng::seeders::Seed;
 
     fn bench_cuda_server_key_unary_function_clean_inputs<F>(
         c: &mut Criterion,
@@ -1731,6 +1732,81 @@ mod cuda {
         bench_group.finish()
     }
 
+    pub fn cuda_unsigned_oprf(c: &mut Criterion) {
+        let bench_name = "integer::cuda::unsigned_oprf";
+
+        let mut bench_group = c.benchmark_group(bench_name);
+        bench_group
+            .sample_size(15)
+            .measurement_time(std::time::Duration::from_secs(30));
+
+        let stream = CudaStreams::new_multi_gpu();
+
+        for (param, num_block, bit_size) in ParamsAndNumBlocksIter::default() {
+            let param_name = param.name();
+
+            let bench_id;
+
+            match BENCH_TYPE.get().unwrap() {
+                BenchmarkType::Latency => {
+                    bench_id = format!("{bench_name}::{param_name}::{bit_size}_bits");
+                    bench_group.bench_function(&bench_id, |b| {
+                        let (cks, _cpu_sks) =
+                            KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
+                        let gpu_sks = CudaServerKey::new(&cks, &stream);
+
+                        b.iter(|| {
+                            _ = black_box(
+                                gpu_sks
+                                    .par_generate_oblivious_pseudo_random_unsigned_integer_bounded(
+                                        Seed(0),
+                                        bit_size as u64,
+                                        num_block as u64,
+                                        &stream,
+                                    ),
+                            );
+                        })
+                    });
+                }
+                BenchmarkType::Throughput => {
+                    bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits");
+                    let elements = throughput_num_threads(num_block);
+                    bench_group.throughput(Throughput::Elements(elements));
+
+                    bench_group.bench_function(&bench_id, |b| {
+                        let (cks, _cpu_sks) =
+                            KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
+                        let gpu_sks = CudaServerKey::new(&cks, &stream);
+
+                        b.iter(|| {
+                            (0..elements).into_par_iter().for_each(|_| {
+                                gpu_sks
+                                    .par_generate_oblivious_pseudo_random_unsigned_integer_bounded(
+                                        Seed(0),
+                                        bit_size as u64,
+                                        num_block as u64,
+                                        &stream,
+                                    );
+                            })
+                        })
+                    });
+                }
+            }
+
+            write_to_json::<u64, _>(
+                &bench_id,
+                param,
+                param.name(),
+                "oprf",
+                &OperatorType::Atomic,
+                bit_size as u32,
+                vec![param.message_modulus().0.ilog2(); num_block],
+            );
+        }
+
+        bench_group.finish()
+    }
+
     macro_rules! define_cuda_server_key_bench_clean_input_unary_fn (
         (method_name: $server_key_method:ident, display_name:$name:ident) => {
             ::paste::paste!{
@@ -2376,6 +2452,7 @@ mod cuda {
         cuda_trailing_zeros,
         cuda_trailing_ones,
         cuda_ilog2,
+        cuda_unsigned_oprf,
     );
 
     criterion_group!(
@@ -2395,6 +2472,7 @@ mod cuda {
         cuda_scalar_mul,
         cuda_scalar_div,
         cuda_scalar_rem,
+        cuda_unsigned_oprf,
     );
 
     criterion_group!(

diff --git a/tfhe/src/core_crypto/gpu/mod.rs b/tfhe/src/core_crypto/gpu/mod.rs
@@ -481,6 +481,31 @@ pub unsafe fn add_lwe_ciphertext_vector_plaintext_vector_async<T: UnsignedIntege
     );
 }
 
+/// Addition of a vector of LWE ciphertexts with a plaintext scalar
+///
+/// # Safety
+///
+/// [CudaStreams::synchronize] __must__ be called as soon as synchronization is
+/// required
+pub unsafe fn add_lwe_ciphertext_vector_plaintext_scalar_async<T: UnsignedInteger>(
+    streams: &CudaStreams,
+    lwe_array_out: &mut CudaVec<T>,
+    lwe_array_in: &CudaVec<T>,
+    plaintext_in: u64,
+    lwe_dimension: LweDimension,
+    num_samples: u32,
+) {
+    cuda_add_lwe_ciphertext_vector_plaintext_64(
+        streams.ptr[0],
+        streams.gpu_indexes[0].0,
+        lwe_array_out.as_mut_c_ptr(0),
+        lwe_array_in.as_c_ptr(0),
+        plaintext_in,
+        lwe_dimension.0 as u32,
+        num_samples,
+    );
+}
+
 /// Assigned addition of a vector of LWE ciphertexts with a vector of plaintexts
 ///
 /// # Safety

diff --git a/tfhe/src/high_level_api/booleans/oprf.rs b/tfhe/src/high_level_api/booleans/oprf.rs
@@ -1,6 +1,12 @@
-use super::FheBool;
+use super::{FheBool, InnerBoolean};
 use crate::high_level_api::global_state;
+#[cfg(feature = "gpu")]
+use crate::high_level_api::global_state::with_thread_local_cuda_streams;
 use crate::high_level_api::keys::InternalServerKey;
+#[cfg(feature = "gpu")]
+use crate::integer::gpu::ciphertext::boolean_value::CudaBooleanBlock;
+#[cfg(feature = "gpu")]
+use crate::integer::gpu::ciphertext::CudaUnsignedRadixCiphertext;
 use crate::integer::BooleanBlock;
 use tfhe_csprng::seeders::Seed;
 
@@ -24,16 +30,28 @@ impl FheBool {
     /// let dec_result: bool = ct_res.decrypt(&client_key);
     /// ```
     pub fn generate_oblivious_pseudo_random(seed: Seed) -> Self {
-        global_state::with_internal_keys(|key| match key {
+        let (ciphertext, tag) = global_state::with_internal_keys(|key| match key {
             InternalServerKey::Cpu(key) => {
                 let ct = key.pbs_key().key.generate_oblivious_pseudo_random(seed, 1);
-
-                Self::new(BooleanBlock(ct), key.tag.clone())
+                (
+                    InnerBoolean::Cpu(BooleanBlock::new_unchecked(ct)),
+                    key.tag.clone(),
+                )
             }
             #[cfg(feature = "gpu")]
-            InternalServerKey::Cuda(_) => {
-                todo!("Cuda devices do not yet support oblivious pseudo random generation")
-            }
-        })
+            InternalServerKey::Cuda(cuda_key) => with_thread_local_cuda_streams(|streams| {
+                let d_ct: CudaUnsignedRadixCiphertext = cuda_key
+                    .key
+                    .key
+                    .generate_oblivious_pseudo_random(seed, 1, streams);
+                (
+                    InnerBoolean::Cuda(CudaBooleanBlock::from_cuda_radix_ciphertext(
+                        d_ct.ciphertext,
+                    )),
+                    cuda_key.tag.clone(),
+                )
+            }),
+        });
+        Self::new(ciphertext, tag)
     }
 }