From 86505a146748ac59355e04ed6165d33ef21c8a41 Mon Sep 17 00:00:00 2001
From: Agnes Leroy
Date: Tue, 26 Nov 2024 09:55:01 +0100
Subject: [PATCH] feat(gpu): add gpu array type in hl api

---
 tfhe/src/high_level_api/array/gpu/booleans.rs |  300 ++++
 tfhe/src/high_level_api/array/gpu/integers.rs |  648 ++++++++++++++++++
 tfhe/src/high_level_api/array/gpu/mod.rs      |    9 +
 tfhe/src/high_level_api/array/mod.rs          |   23 +
 .../high_level_api/array/tests/booleans.rs    |   34 +
 tfhe/src/high_level_api/array/tests/mod.rs    |   19 +
 tfhe/src/high_level_api/array/tests/signed.rs |   32 +
 .../high_level_api/array/tests/unsigned.rs    |   32 +
 tfhe/src/high_level_api/global_state.rs       |   25 +
 tfhe/src/lib.rs                               |    1 +
 10 files changed, 1123 insertions(+)
 create mode 100644 tfhe/src/high_level_api/array/gpu/booleans.rs
 create mode 100644 tfhe/src/high_level_api/array/gpu/integers.rs
 create mode 100644 tfhe/src/high_level_api/array/gpu/mod.rs

diff --git a/tfhe/src/high_level_api/array/gpu/booleans.rs b/tfhe/src/high_level_api/array/gpu/booleans.rs
new file mode 100644
index 0000000000..f538f0c163
--- /dev/null
+++ b/tfhe/src/high_level_api/array/gpu/booleans.rs
@@ -0,0 +1,300 @@
+//! This module contains the implementation of the FheBool array backend
+//! where the values and computations are always done on GPU
+
+use super::super::helpers::{create_sub_mut_slice_with_bound, create_sub_slice_with_bound};
+use super::super::traits::{BitwiseArrayBackend, ClearBitwiseArrayBackend};
+use crate::array::stride::{ParStridedIter, ParStridedIterMut, StridedIter};
+use crate::array::traits::TensorSlice;
+use crate::high_level_api::array::{ArrayBackend, BackendDataContainer, BackendDataContainerMut};
+use crate::high_level_api::global_state;
+use crate::high_level_api::global_state::with_thread_local_cuda_streams;
+use crate::integer::gpu::ciphertext::boolean_value::CudaBooleanBlock;
+use crate::prelude::{FheDecrypt, FheTryEncrypt};
+use crate::{ClientKey, FheBoolId};
+use rayon::prelude::*;
+use std::ops::RangeBounds;
+
+pub struct GpuFheBoolArrayBackend;
+
+pub type GpuFheBoolArray = super::super::FheBackendArray<GpuFheBoolArrayBackend, FheBoolId>;
+pub type GpuFheBoolSlice<'a> =
+    super::super::FheBackendArraySlice<'a, GpuFheBoolArrayBackend, FheBoolId>;
+pub type GpuFheBoolSliceMut<'a> =
+    super::super::FheBackendArraySliceMut<'a, GpuFheBoolArrayBackend, FheBoolId>;
+
+pub struct GpuBooleanSlice<'a>(pub(crate) &'a [CudaBooleanBlock]);
+pub struct GpuBooleanSliceMut<'a>(pub(crate) &'a mut [CudaBooleanBlock]);
+pub struct GpuBooleanOwned(pub(crate) Vec<CudaBooleanBlock>);
+
+impl Clone for GpuBooleanOwned {
+    fn clone(&self) -> Self {
+        with_thread_local_cuda_streams(|streams| {
+            Self(self.0.iter().map(|elem| elem.duplicate(streams)).collect())
+        })
+    }
+}
+
+impl ArrayBackend for GpuFheBoolArrayBackend {
+    type Slice<'a>
+        = GpuBooleanSlice<'a>
+    where
+        Self: 'a;
+    type SliceMut<'a>
+        = GpuBooleanSliceMut<'a>
+    where
+        Self: 'a;
+    type Owned = GpuBooleanOwned;
+}
+
+impl<'a> TensorSlice<'a, GpuBooleanSlice<'a>> {
+    pub fn iter(self) -> StridedIter<'a, CudaBooleanBlock> {
+        StridedIter::new(self.slice.0, self.dims.clone())
+    }
+
+    pub fn par_iter(self) -> ParStridedIter<'a, CudaBooleanBlock> {
+        ParStridedIter::new(self.slice.0, self.dims.clone())
+    }
+}
+
+impl<'a> TensorSlice<'a, GpuBooleanSliceMut<'a>> {
+    pub fn par_iter_mut(self) -> ParStridedIterMut<'a, CudaBooleanBlock> {
+        ParStridedIterMut::new(self.slice.0, self.dims.clone())
+    }
+}
+
+impl From<Vec<CudaBooleanBlock>> for GpuBooleanOwned {
+    fn from(value: Vec<CudaBooleanBlock>) -> Self {
+        Self(value)
+    }
+}
+
+impl<'a> BackendDataContainer for GpuBooleanSlice<'a> {
+    type Backend = GpuFheBoolArrayBackend;
+
+    fn len(&self) -> usize {
+        <[CudaBooleanBlock]>::len(self.0)
+    }
+
+    fn as_sub_slice(
+        &self,
+        range: impl RangeBounds<usize>,
+    ) -> <Self::Backend as ArrayBackend>::Slice<'_> {
+        GpuBooleanSlice(create_sub_slice_with_bound(self.0, range))
+    }
+
+    fn into_owned(self) -> <Self::Backend as ArrayBackend>::Owned {
+        with_thread_local_cuda_streams(|streams| {
+            GpuBooleanOwned(self.0.iter().map(|elem| elem.duplicate(streams)).collect())
+        })
+    }
+}
+
+impl<'a> BackendDataContainer for GpuBooleanSliceMut<'a> {
+    type Backend = GpuFheBoolArrayBackend;
+
+    fn len(&self) -> usize {
+        <[CudaBooleanBlock]>::len(self.0)
+    }
+
+    fn as_sub_slice(
+        &self,
+        range: impl RangeBounds<usize>,
+    ) -> <Self::Backend as ArrayBackend>::Slice<'_> {
+        GpuBooleanSlice(create_sub_slice_with_bound(self.0, range))
+    }
+
+    fn into_owned(self) -> <Self::Backend as ArrayBackend>::Owned {
+        with_thread_local_cuda_streams(|streams| {
+            GpuBooleanOwned(self.0.iter().map(|elem| elem.duplicate(streams)).collect())
+        })
+    }
+}
+
+impl<'a> BackendDataContainerMut for GpuBooleanSliceMut<'a> {
+    fn as_sub_slice_mut(
+        &mut self,
+        range: impl RangeBounds<usize>,
+    ) -> <Self::Backend as ArrayBackend>::SliceMut<'_> {
+        GpuBooleanSliceMut(create_sub_mut_slice_with_bound(self.0, range))
+    }
+}
+
+impl BackendDataContainer for GpuBooleanOwned {
+    type Backend = GpuFheBoolArrayBackend;
+
+    fn len(&self) -> usize {
+        self.0.len()
+    }
+
+    fn as_sub_slice(
+        &self,
+        range: impl RangeBounds<usize>,
+    ) -> <Self::Backend as ArrayBackend>::Slice<'_> {
+        GpuBooleanSlice(create_sub_slice_with_bound(self.0.as_slice(), range))
+    }
+
+    fn into_owned(self) -> <Self::Backend as ArrayBackend>::Owned {
+        self
+    }
+}
+
+impl BackendDataContainerMut for GpuBooleanOwned {
+    fn as_sub_slice_mut(
+        &mut self,
+        range: impl RangeBounds<usize>,
+    ) -> <Self::Backend as ArrayBackend>::SliceMut<'_> {
+        GpuBooleanSliceMut(create_sub_mut_slice_with_bound(
+            self.0.as_mut_slice(),
+            range,
+        ))
+    }
+}
+
+impl BitwiseArrayBackend for GpuFheBoolArrayBackend {
+    fn bitand<'a>(
+        lhs: TensorSlice<'_, Self::Slice<'a>>,
+        rhs: TensorSlice<'_, Self::Slice<'a>>,
+    ) -> Self::Owned {
+        GpuBooleanOwned(global_state::with_cuda_internal_keys(|cuda_key| {
+            with_thread_local_cuda_streams(|streams| {
+                lhs.par_iter()
+                    .zip(rhs.par_iter())
+                    .map(|(lhs, rhs)| CudaBooleanBlock(cuda_key.bitand(&lhs.0, &rhs.0, streams)))
+                    .collect::<Vec<_>>()
+            })
+        }))
+    }
+
+    fn bitor<'a>(
+        lhs: TensorSlice<'_, Self::Slice<'a>>,
+        rhs: TensorSlice<'_, Self::Slice<'a>>,
+    ) -> Self::Owned {
+        GpuBooleanOwned(global_state::with_cuda_internal_keys(|cuda_key| {
+            with_thread_local_cuda_streams(|streams| {
+                lhs.par_iter()
+                    .zip(rhs.par_iter())
+                    .map(|(lhs, rhs)| CudaBooleanBlock(cuda_key.bitor(&lhs.0, &rhs.0, streams)))
+                    .collect::<Vec<_>>()
+            })
+        }))
+    }
+
+    fn bitxor<'a>(
+        lhs: TensorSlice<'_, Self::Slice<'a>>,
+        rhs: TensorSlice<'_, Self::Slice<'a>>,
+    ) -> Self::Owned {
+        GpuBooleanOwned(global_state::with_cuda_internal_keys(|cuda_key| {
+            with_thread_local_cuda_streams(|streams| {
+                lhs.par_iter()
+                    .zip(rhs.par_iter())
+                    .map(|(lhs, rhs)| CudaBooleanBlock(cuda_key.bitxor(&lhs.0, &rhs.0, streams)))
+                    .collect::<Vec<_>>()
+            })
+        }))
+    }
+
+    fn bitnot(lhs: TensorSlice<'_, Self::Slice<'_>>) -> Self::Owned {
+        GpuBooleanOwned(global_state::with_cuda_internal_keys(|cuda_key| {
+            with_thread_local_cuda_streams(|streams| {
+                lhs.par_iter()
+                    .map(|lhs| CudaBooleanBlock(cuda_key.bitnot(&lhs.0, streams)))
+                    .collect::<Vec<_>>()
+            })
+        }))
+    }
+}
+
+impl ClearBitwiseArrayBackend<bool> for GpuFheBoolArrayBackend {
+    fn bitand_slice(
+        lhs: TensorSlice<'_, Self::Slice<'_>>,
+        rhs: TensorSlice<'_, &'_ [bool]>,
+    ) -> Self::Owned {
+        GpuBooleanOwned(global_state::with_cuda_internal_keys(|cuda_key| {
+            with_thread_local_cuda_streams(|streams| {
+                lhs.par_iter()
+                    .zip(rhs.par_iter().copied())
+                    .map(|(lhs, rhs)| {
+                        CudaBooleanBlock(cuda_key.scalar_bitand(&lhs.0, rhs as u8, streams))
+                    })
+                    .collect::<Vec<_>>()
+            })
+        }))
+    }
+
+    fn bitor_slice(
+        lhs: TensorSlice<'_, Self::Slice<'_>>,
+        rhs: TensorSlice<'_, &'_ [bool]>,
+    ) -> Self::Owned {
+        GpuBooleanOwned(global_state::with_cuda_internal_keys(|cuda_key| {
+            with_thread_local_cuda_streams(|streams| {
+                lhs.par_iter()
+                    .zip(rhs.par_iter().copied())
+                    .map(|(lhs, rhs)| {
+                        CudaBooleanBlock(cuda_key.scalar_bitor(&lhs.0, rhs as u8, streams))
+                    })
+                    .collect::<Vec<_>>()
+            })
+        }))
+    }
+
+    fn bitxor_slice(
+        lhs: TensorSlice<'_, Self::Slice<'_>>,
+        rhs: TensorSlice<'_, &'_ [bool]>,
+    ) -> Self::Owned {
+        GpuBooleanOwned(global_state::with_cuda_internal_keys(|cuda_key| {
+            with_thread_local_cuda_streams(|streams| {
+                lhs.par_iter()
+                    .zip(rhs.par_iter().copied())
+                    .map(|(lhs, rhs)| {
+                        CudaBooleanBlock(cuda_key.scalar_bitxor(&lhs.0, rhs as u8, streams))
+                    })
+                    .collect::<Vec<_>>()
+            })
+        }))
+    }
+}
+
+impl FheTryEncrypt<&[bool], ClientKey> for GpuFheBoolArray {
+    type Error = crate::Error;
+
+    fn try_encrypt(values: &[bool], cks: &ClientKey) -> Result<Self, Self::Error> {
+        let encrypted = with_thread_local_cuda_streams(|streams| {
+            values
+                .iter()
+                .copied()
+                .map(|value| {
+                    CudaBooleanBlock::from_boolean_block(&cks.key.key.encrypt_bool(value), streams)
+                })
+                .collect::<Vec<_>>()
+        });
+        Ok(Self::new(encrypted, vec![values.len()]))
+    }
+}
+
+impl<'a> FheDecrypt<Vec<bool>> for GpuFheBoolSlice<'a> {
+    fn decrypt(&self, key: &ClientKey) -> Vec<bool> {
+        with_thread_local_cuda_streams(|streams| {
+            self.elems
+                .0
+                .iter()
+                .map(|encrypted_value| {
+                    key.key
+                        .key
+                        .decrypt_bool(&encrypted_value.to_boolean_block(streams))
+                })
+                .collect()
+        })
+    }
+}
+
+impl<'a> FheDecrypt<Vec<bool>> for GpuFheBoolSliceMut<'a> {
+    fn decrypt(&self, key: &ClientKey) -> Vec<bool> {
+        self.as_slice().decrypt(key)
+    }
+}
+
+impl FheDecrypt<Vec<bool>> for GpuFheBoolArray {
+    fn decrypt(&self, key: &ClientKey) -> Vec<bool> {
+        self.as_slice().decrypt(key)
+    }
+}
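As a usage sketch (illustrative only, not part of the patch): assuming the `GpuFheBoolArray` re-export added in array/mod.rs below, the generic `&`/`|`/`^` operator impls that the existing array ops module provides on top of `BitwiseArrayBackend`, and the CUDA key setup used by tests/mod.rs, encrypted boolean arrays can be combined element-wise on the GPU like this:

    use tfhe::prelude::{FheDecrypt, FheTryEncrypt};
    use tfhe::{set_server_key, ClientKey, ConfigBuilder};

    // CUDA key setup, mirroring tests/mod.rs::generate_cuda_keys
    let config = ConfigBuilder::default().build();
    let client_key = ClientKey::generate(config);
    let compressed_server_key = client_key.generate_compressed_server_key();
    set_server_key(compressed_server_key.decompress_to_gpu());

    // Ciphertexts live on the GPU as CudaBooleanBlock values
    let lhs =
        tfhe::array::GpuFheBoolArray::try_encrypt(&[true, false, true][..], &client_key).unwrap();
    let rhs =
        tfhe::array::GpuFheBoolArray::try_encrypt(&[true, true, false][..], &client_key).unwrap();

    // Element-wise AND: one CudaServerKey::bitand per element, in parallel via rayon
    let result = &lhs & &rhs;

    let clear: Vec<bool> = result.decrypt(&client_key);
    assert_eq!(clear, vec![true, false, false]);

Each element-wise operation fans out over rayon and issues one scalar GPU operation per `CudaBooleanBlock`, all on the thread-local CUDA streams; the `tfhe::array::GpuFheBoolArray` path is an assumption based on the re-exports this patch adds.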
diff --git a/tfhe/src/high_level_api/array/gpu/integers.rs b/tfhe/src/high_level_api/array/gpu/integers.rs
new file mode 100644
index 0000000000..a7bc8be2d7
--- /dev/null
+++ b/tfhe/src/high_level_api/array/gpu/integers.rs
@@ -0,0 +1,648 @@
+//! This module contains the implementations of the FheUint array and FheInt array backend
+//! where the values and computations are always done on GPU
+use super::super::helpers::{create_sub_mut_slice_with_bound, create_sub_slice_with_bound};
+use super::super::traits::{ArithmeticArrayBackend, BitwiseArrayBackend, ClearBitwiseArrayBackend};
+use crate::core_crypto::prelude::{SignedNumeric, UnsignedNumeric};
+use crate::high_level_api::array::{
+    ArrayBackend, FheArrayBase, FheBackendArray, FheBackendArraySlice, FheBackendArraySliceMut,
+};
+
+use crate::array::stride::{ParStridedIter, ParStridedIterMut, StridedIter};
+use crate::array::traits::{
+    BackendDataContainer, BackendDataContainerMut, ClearArithmeticArrayBackend, TensorSlice,
+};
+use crate::core_crypto::gpu::CudaStreams;
+use crate::high_level_api::global_state;
+use crate::high_level_api::global_state::with_thread_local_cuda_streams;
+use crate::high_level_api::integers::{FheIntId, FheUintId};
+use crate::integer::block_decomposition::{DecomposableInto, RecomposableFrom};
+use crate::integer::client_key::RecomposableSignedInteger;
+use crate::integer::gpu::ciphertext::{
+    CudaIntegerRadixCiphertext, CudaSignedRadixCiphertext, CudaUnsignedRadixCiphertext,
+};
+use crate::integer::server_key::radix_parallel::scalar_div_mod::SignedReciprocable;
+use crate::integer::server_key::{Reciprocable, ScalarMultiplier};
+use crate::prelude::{CastInto, FheDecrypt, FheTryEncrypt};
+use crate::{ClientKey, Error};
+use rayon::prelude::*;
+use std::marker::PhantomData;
+use std::ops::RangeBounds;
+
+pub struct GpuIntegerArrayBackend<T>(PhantomData<T>);
+
+pub type GpuUintArrayBackend = GpuIntegerArrayBackend<CudaUnsignedRadixCiphertext>;
+pub type GpuIntArrayBackend = GpuIntegerArrayBackend<CudaSignedRadixCiphertext>;
+
+// Base alias for array of unsigned integers on the GPU only backend
+pub type GpuFheUintArray<Id> = FheBackendArray<GpuUintArrayBackend, Id>;
+pub type GpuFheUintSlice<'a, Id> = FheBackendArraySlice<'a, GpuUintArrayBackend, Id>;
+pub type GpuFheUintSliceMut<'a, Id> = FheBackendArraySliceMut<'a, GpuUintArrayBackend, Id>;
+
+// Base alias for array of signed integers on the GPU only backend
+pub type GpuFheIntArray<Id> = FheBackendArray<GpuIntArrayBackend, Id>;
+pub type GpuFheIntSlice<'a, Id> = FheBackendArraySlice<'a, GpuIntArrayBackend, Id>;
+pub type GpuFheIntSliceMut<'a, Id> = FheBackendArraySliceMut<'a, GpuIntArrayBackend, Id>;
+
+pub struct GpuSlice<'a, T>(&'a [T]);
+pub struct GpuSliceMut<'a, T>(&'a mut [T]);
+pub struct GpuOwned<T>(Vec<T>);
+
+impl<T> Clone for GpuOwned<T>
+where
+    T: CudaIntegerRadixCiphertext,
+{
+    fn clone(&self) -> Self {
+        with_thread_local_cuda_streams(|streams| {
+            Self(self.0.iter().map(|elem| elem.duplicate(streams)).collect())
+        })
+    }
+}
+
+impl<T> ArrayBackend for GpuIntegerArrayBackend<T>
+where
+    T: CudaIntegerRadixCiphertext,
+{
+    type Slice<'a>
+        = GpuSlice<'a, T>
+    where
+        Self: 'a;
+    type SliceMut<'a>
+        = GpuSliceMut<'a, T>
+    where
+        Self: 'a;
+    type Owned = GpuOwned<T>;
+}
+
+impl<'a, T> TensorSlice<'a, GpuSlice<'a, T>> {
+    pub fn iter(self) -> StridedIter<'a, T> {
+        StridedIter::new(self.slice.0, self.dims.clone())
+    }
+
+    pub fn par_iter(self) -> ParStridedIter<'a, T> {
+        ParStridedIter::new(self.slice.0, self.dims.clone())
+    }
+}
+
+impl<'a, T> TensorSlice<'a, GpuSliceMut<'a, T>> {
+    pub fn par_iter_mut(self) -> ParStridedIterMut<'a, T> {
+        ParStridedIterMut::new(self.slice.0, self.dims.clone())
+    }
+}
+
+impl<T> From<Vec<T>> for GpuOwned<T> {
+    fn from(value: Vec<T>) -> Self {
+        Self(value)
+    }
+}
+
+#[inline]
+#[track_caller]
+fn par_map_sks_op_on_pair_of_elements<'a, T, F>(
+    lhs: TensorSlice<'a, GpuSlice<'a, T>>,
+    rhs: TensorSlice<'a, GpuSlice<'a, T>>,
+    op: F,
+) -> GpuOwned<T>
+where
+    T: CudaIntegerRadixCiphertext + Send + Sync,
+    F: Send + Sync + Fn(&crate::integer::gpu::CudaServerKey, &T, &T, &CudaStreams) -> T,
+{
+    GpuOwned(global_state::with_cuda_internal_keys(|cuda_key| {
+        with_thread_local_cuda_streams(|streams| {
+            lhs.par_iter()
+                .zip(rhs.par_iter())
+                .map(|(lhs, rhs)| op(cuda_key, lhs, rhs, streams))
+                .collect::<Vec<_>>()
+        })
+    }))
+}
+
+impl<T> ArithmeticArrayBackend for GpuIntegerArrayBackend<T>
+where
+    T: CudaIntegerRadixCiphertext + Send + Sync,
+{
+    fn add_slices<'a>(
+        lhs: TensorSlice<'_, Self::Slice<'a>>,
+        rhs: TensorSlice<'_, Self::Slice<'a>>,
+    ) -> Self::Owned {
+        par_map_sks_op_on_pair_of_elements(lhs, rhs, crate::integer::gpu::CudaServerKey::add)
+    }
+
+    fn sub_slices<'a>(
+        lhs: TensorSlice<'_, Self::Slice<'a>>,
+        rhs: TensorSlice<'_, Self::Slice<'a>>,
+    ) -> Self::Owned {
+        par_map_sks_op_on_pair_of_elements(lhs, rhs, crate::integer::gpu::CudaServerKey::sub)
+    }
+
+    fn mul_slices<'a>(
+        lhs: TensorSlice<'_, Self::Slice<'a>>,
+        rhs: TensorSlice<'_, Self::Slice<'a>>,
+    ) -> Self::Owned {
+        par_map_sks_op_on_pair_of_elements(lhs, rhs, crate::integer::gpu::CudaServerKey::mul)
+    }
+
+    fn div_slices<'a>(
+        lhs: TensorSlice<'_, Self::Slice<'a>>,
+        rhs: TensorSlice<'_, Self::Slice<'a>>,
+    ) -> Self::Owned {
+        par_map_sks_op_on_pair_of_elements(lhs, rhs, crate::integer::gpu::CudaServerKey::div)
+    }
+
+    fn rem_slices<'a>(
+        lhs: TensorSlice<'_, Self::Slice<'a>>,
+        rhs: TensorSlice<'_, Self::Slice<'a>>,
+    ) -> Self::Owned {
+        par_map_sks_op_on_pair_of_elements(lhs, rhs, crate::integer::gpu::CudaServerKey::rem)
+    }
+}
+
+#[inline]
+#[track_caller]
+fn par_map_sks_scalar_op_on_pair_of_elements<'a, T, Clear, F>(
+    lhs: TensorSlice<'a, GpuSlice<'a, T>>,
+    rhs: TensorSlice<'a, &'a [Clear]>,
+    op: F,
+) -> GpuOwned<T>
+where
+    T: CudaIntegerRadixCiphertext + Send + Sync,
+    Clear: Copy + Send + Sync,
+    F: Send + Sync + Fn(&crate::integer::gpu::CudaServerKey, &T, Clear, &CudaStreams) -> T,
+{
+    GpuOwned(global_state::with_cuda_internal_keys(|cuda_key| {
+        with_thread_local_cuda_streams(|streams| {
+            lhs.par_iter()
+                .zip(rhs.par_iter())
+                .map(|(lhs, rhs)| op(cuda_key, lhs, *rhs, streams))
+                .collect::<Vec<_>>()
+        })
+    }))
+}
+
+impl<Clear> ClearArithmeticArrayBackend<Clear>
+    for GpuIntegerArrayBackend<CudaUnsignedRadixCiphertext>
+where
+    Clear: DecomposableInto<u8>
+        + std::ops::Not<Output = Clear>
+        + std::ops::Add<Clear, Output = Clear>
+        + ScalarMultiplier
+        + Reciprocable
+        + CastInto<u64>,
+{
+    fn add_slices(
+        lhs: TensorSlice<'_, Self::Slice<'_>>,
+        rhs: TensorSlice<'_, &'_ [Clear]>,
+    ) -> Self::Owned {
+        par_map_sks_scalar_op_on_pair_of_elements(
+            lhs,
+            rhs,
+            crate::integer::gpu::CudaServerKey::scalar_add,
+        )
+    }
+
+    fn sub_slices(
+        lhs: TensorSlice<'_, Self::Slice<'_>>,
+        rhs: TensorSlice<'_, &'_ [Clear]>,
+    ) -> Self::Owned {
+        par_map_sks_scalar_op_on_pair_of_elements(
+            lhs,
+            rhs,
+            crate::integer::gpu::CudaServerKey::scalar_sub,
+        )
+    }
+
+    fn mul_slices(
+        lhs: TensorSlice<'_, Self::Slice<'_>>,
+        rhs: TensorSlice<'_, &'_ [Clear]>,
+    ) -> Self::Owned {
+        par_map_sks_scalar_op_on_pair_of_elements(
+            lhs,
+            rhs,
+            crate::integer::gpu::CudaServerKey::scalar_mul,
+        )
+    }
+
+    fn div_slices(
+        lhs: TensorSlice<'_, Self::Slice<'_>>,
+        rhs: TensorSlice<'_, &'_ [Clear]>,
+    ) -> Self::Owned {
+        par_map_sks_scalar_op_on_pair_of_elements(
+            lhs,
+            rhs,
+            crate::integer::gpu::CudaServerKey::scalar_div,
+        )
+    }
+
+    fn rem_slices(
+        lhs: TensorSlice<'_, Self::Slice<'_>>,
+        rhs: TensorSlice<'_, &'_ [Clear]>,
+    ) -> Self::Owned {
+        par_map_sks_scalar_op_on_pair_of_elements(
+            lhs,
+            rhs,
+            crate::integer::gpu::CudaServerKey::scalar_rem,
+        )
+    }
+}
+
+impl<Clear> ClearArithmeticArrayBackend<Clear>
+    for GpuIntegerArrayBackend<CudaSignedRadixCiphertext>
+where
+    Clear: DecomposableInto<u8>
+        + std::ops::Not<Output = Clear>
+        + std::ops::Add<Clear, Output = Clear>
+        + ScalarMultiplier
+        + SignedReciprocable,
+{
+    fn add_slices(
+        lhs: TensorSlice<'_, Self::Slice<'_>>,
+        rhs: TensorSlice<'_, &'_ [Clear]>,
+    ) -> Self::Owned {
+        par_map_sks_scalar_op_on_pair_of_elements(
+            lhs,
+            rhs,
+            crate::integer::gpu::CudaServerKey::scalar_add,
+        )
+    }
+
+    fn sub_slices(
+        lhs: TensorSlice<'_, Self::Slice<'_>>,
+        rhs: TensorSlice<'_, &'_ [Clear]>,
+    ) -> Self::Owned {
+        par_map_sks_scalar_op_on_pair_of_elements(
+            lhs,
+            rhs,
+            crate::integer::gpu::CudaServerKey::scalar_sub,
+        )
+    }
+
+    fn mul_slices(
+        lhs: TensorSlice<'_, Self::Slice<'_>>,
+        rhs: TensorSlice<'_, &'_ [Clear]>,
+    ) -> Self::Owned {
+        par_map_sks_scalar_op_on_pair_of_elements(
+            lhs,
+            rhs,
+            crate::integer::gpu::CudaServerKey::scalar_mul,
+        )
+    }
+
+    fn div_slices(
+        lhs: TensorSlice<'_, Self::Slice<'_>>,
+        rhs: TensorSlice<'_, &'_ [Clear]>,
+    ) -> Self::Owned {
+        par_map_sks_scalar_op_on_pair_of_elements(
+            lhs,
+            rhs,
+            crate::integer::gpu::CudaServerKey::signed_scalar_div,
+        )
+    }
+
+    fn rem_slices(
+        lhs: TensorSlice<'_, Self::Slice<'_>>,
+        rhs: TensorSlice<'_, &'_ [Clear]>,
+    ) -> Self::Owned {
+        par_map_sks_scalar_op_on_pair_of_elements(
+            lhs,
+            rhs,
+            crate::integer::gpu::CudaServerKey::signed_scalar_rem,
+        )
+    }
+}
+
+impl<T> BitwiseArrayBackend for GpuIntegerArrayBackend<T>
+where
+    T: CudaIntegerRadixCiphertext + Send + Sync,
+{
+    fn bitand<'a>(
+        lhs: TensorSlice<'_, Self::Slice<'a>>,
+        rhs: TensorSlice<'_, Self::Slice<'a>>,
+    ) -> Self::Owned {
+        par_map_sks_op_on_pair_of_elements(lhs, rhs, crate::integer::gpu::CudaServerKey::bitand)
+    }
+
+    fn bitor<'a>(
+        lhs: TensorSlice<'_, Self::Slice<'a>>,
+        rhs: TensorSlice<'_, Self::Slice<'a>>,
+    ) -> Self::Owned {
+        par_map_sks_op_on_pair_of_elements(lhs, rhs, crate::integer::gpu::CudaServerKey::bitor)
+    }
+
+    fn bitxor<'a>(
+        lhs: TensorSlice<'_, Self::Slice<'a>>,
+        rhs: TensorSlice<'_, Self::Slice<'a>>,
+    ) -> Self::Owned {
+        par_map_sks_op_on_pair_of_elements(lhs, rhs, crate::integer::gpu::CudaServerKey::bitxor)
+    }
+
+    fn bitnot(lhs: TensorSlice<'_, Self::Slice<'_>>) -> Self::Owned {
+        GpuOwned(global_state::with_cuda_internal_keys(|cuda_key| {
+            with_thread_local_cuda_streams(|streams| {
+                lhs.par_iter()
+                    .map(|lhs| cuda_key.bitnot(lhs, streams))
+                    .collect::<Vec<_>>()
+            })
+        }))
+    }
+}
+
+impl<T, Clear> ClearBitwiseArrayBackend<Clear> for GpuIntegerArrayBackend<T>
+where
+    T: CudaIntegerRadixCiphertext + Send + Sync,
+    Clear: DecomposableInto<u8>,
+{
+    fn bitand_slice(
+        lhs: TensorSlice<'_, Self::Slice<'_>>,
+        rhs: TensorSlice<'_, &'_ [Clear]>,
+    ) -> Self::Owned {
+        par_map_sks_scalar_op_on_pair_of_elements(
+            lhs,
+            rhs,
+            crate::integer::gpu::CudaServerKey::scalar_bitand,
+        )
+    }
+
+    fn bitor_slice(
+        lhs: TensorSlice<'_, Self::Slice<'_>>,
+        rhs: TensorSlice<'_, &'_ [Clear]>,
+    ) -> Self::Owned {
+        par_map_sks_scalar_op_on_pair_of_elements(
+            lhs,
+            rhs,
+            crate::integer::gpu::CudaServerKey::scalar_bitor,
+        )
+    }
+
+    fn bitxor_slice(
+        lhs: TensorSlice<'_, Self::Slice<'_>>,
+        rhs: TensorSlice<'_, &'_ [Clear]>,
+    ) -> Self::Owned {
+        par_map_sks_scalar_op_on_pair_of_elements(
+            lhs,
+            rhs,
+            crate::integer::gpu::CudaServerKey::scalar_bitxor,
+        )
+    }
+}
+
+impl<T> BackendDataContainer for GpuOwned<T>
+where
+    T: CudaIntegerRadixCiphertext,
+{
+    type Backend = GpuIntegerArrayBackend<T>;
+
+    fn len(&self) -> usize {
+        self.0.len()
+    }
+
+    fn as_sub_slice(
+        &self,
+        range: impl RangeBounds<usize>,
+    ) -> <Self::Backend as ArrayBackend>::Slice<'_> {
+        GpuSlice(create_sub_slice_with_bound(self.0.as_slice(), range))
+    }
+
+    fn into_owned(self) -> <Self::Backend as ArrayBackend>::Owned {
+        self
+    }
+}
+
+impl<T> BackendDataContainerMut for GpuOwned<T>
+where
+    T: CudaIntegerRadixCiphertext,
+{
+    fn as_sub_slice_mut(
+        &mut self,
+        range: impl RangeBounds<usize>,
+    ) -> <Self::Backend as ArrayBackend>::SliceMut<'_> {
+        GpuSliceMut(create_sub_mut_slice_with_bound(
+            self.0.as_mut_slice(),
+            range,
+        ))
+    }
+}
+
+impl<'a, T> BackendDataContainer for GpuSlice<'a, T>
+where
+    T: CudaIntegerRadixCiphertext,
+{
+    type Backend = GpuIntegerArrayBackend<T>;
+
+    fn len(&self) -> usize {
+        <[T]>::len(self.0)
+    }
+
+    fn as_sub_slice(
+        &self,
+        range: impl RangeBounds<usize>,
+    ) -> <Self::Backend as ArrayBackend>::Slice<'_> {
+        GpuSlice(create_sub_slice_with_bound(self.0, range))
+    }
+
+    fn into_owned(self) -> <Self::Backend as ArrayBackend>::Owned {
+        with_thread_local_cuda_streams(|streams| {
+            GpuOwned(self.0.iter().map(|elem| elem.duplicate(streams)).collect())
+        })
+    }
+}
+
+impl<'a, T> BackendDataContainer for GpuSliceMut<'a, T>
+where
+    T: CudaIntegerRadixCiphertext,
+{
+    type Backend = GpuIntegerArrayBackend<T>;
+
+    fn len(&self) -> usize {
+        <[T]>::len(self.0)
+    }
+
+    fn as_sub_slice(
+        &self,
+        range: impl RangeBounds<usize>,
+    ) -> <Self::Backend as ArrayBackend>::Slice<'_> {
+        GpuSlice(create_sub_slice_with_bound(self.0, range))
+    }
+
+    fn into_owned(self) -> <Self::Backend as ArrayBackend>::Owned {
+        with_thread_local_cuda_streams(|streams| {
+            GpuOwned(self.0.iter().map(|elem| elem.duplicate(streams)).collect())
+        })
+    }
+}
+
+impl<'a, T> BackendDataContainerMut for GpuSliceMut<'a, T>
+where
+    T: CudaIntegerRadixCiphertext,
+{
+    fn as_sub_slice_mut(
+        &mut self,
+        range: impl RangeBounds<usize>,
+    ) -> <Self::Backend as ArrayBackend>::SliceMut<'_> {
+        GpuSliceMut(create_sub_mut_slice_with_bound(self.0, range))
+    }
+}
+
+impl<'a, Clear, Id> FheTryEncrypt<&'a [Clear], ClientKey>
+    for FheArrayBase<GpuOwned<CudaUnsignedRadixCiphertext>, Id>
+where
+    Id: FheUintId,
+    Clear: DecomposableInto<u64> + UnsignedNumeric,
+{
+    type Error = Error;
+
+    fn try_encrypt(clears: &'a [Clear], key: &ClientKey) -> Result<Self, Self::Error> {
+        let num_blocks = Id::num_blocks(key.message_modulus());
+        Ok(Self::new(
+            with_thread_local_cuda_streams(|streams| {
+                clears
+                    .iter()
+                    .copied()
+                    .map(|clear| {
+                        CudaUnsignedRadixCiphertext::from_radix_ciphertext(
+                            &key.key.key.encrypt_radix(clear, num_blocks),
+                            streams,
+                        )
+                    })
+                    .collect::<Vec<_>>()
+            }),
+            vec![clears.len()],
+        ))
+    }
+}
+
+impl<'a, Clear, Id> FheTryEncrypt<(&'a [Clear], Vec<usize>), ClientKey>
+    for FheArrayBase<GpuOwned<CudaUnsignedRadixCiphertext>, Id>
+where
+    Id: FheUintId,
+    Clear: DecomposableInto<u64> + UnsignedNumeric,
+{
+    type Error = Error;
+
+    fn try_encrypt(
+        (clears, shape): (&'a [Clear], Vec<usize>),
+        key: &ClientKey,
+    ) -> Result<Self, Self::Error> {
+        if clears.len() != shape.iter().copied().product::<usize>() {
+            return Err(crate::Error::new(
+                "Shape does not match the number of elements given".to_string(),
+            ));
+        }
+        let num_blocks = Id::num_blocks(key.message_modulus());
+        let elems = with_thread_local_cuda_streams(|streams| {
+            clears
+                .iter()
+                .copied()
+                .map(|clear| {
+                    CudaUnsignedRadixCiphertext::from_radix_ciphertext(
+                        &key.key.key.encrypt_radix(clear, num_blocks),
+                        streams,
+                    )
+                })
+                .collect::<Vec<_>>()
+        });
+        let data = Self::new(elems, shape);
+        Ok(data)
+    }
+}
+
+impl<Clear, Id> FheDecrypt<Vec<Clear>> for GpuFheUintArray<Id>
+where
+    Id: FheUintId,
+    Clear: RecomposableFrom<u64> + UnsignedNumeric,
+{
+    fn decrypt(&self, key: &ClientKey) -> Vec<Clear> {
+        self.as_slice().decrypt(key)
+    }
+}
+
+impl<'a, Clear, Id> FheDecrypt<Vec<Clear>> for GpuFheUintSliceMut<'a, Id>
+where
+    Id: FheUintId,
+    Clear: RecomposableFrom<u64> + UnsignedNumeric,
+{
+    fn decrypt(&self, key: &ClientKey) -> Vec<Clear> {
+        self.as_slice().decrypt(key)
+    }
+}
+
+impl<'a, Clear, Id> FheDecrypt<Vec<Clear>> for GpuFheUintSlice<'a, Id>
+where
+    Id: FheUintId,
+    Clear: RecomposableFrom<u64> + UnsignedNumeric,
+{
+    fn decrypt(&self, key: &ClientKey) -> Vec<Clear> {
+        with_thread_local_cuda_streams(|streams| {
+            self.as_tensor_slice()
+                .iter()
+                .map(|ct: &CudaUnsignedRadixCiphertext| {
+                    key.key.key.decrypt_radix(&ct.to_radix_ciphertext(streams))
+                })
+                .collect()
+        })
+    }
+}
+
+impl<'a, Clear, Id> FheTryEncrypt<&'a [Clear], ClientKey> for GpuFheIntArray<Id>
+where
+    Id: FheIntId,
+    Clear: DecomposableInto<u64> + SignedNumeric,
+{
+    type Error = Error;
+
+    fn try_encrypt(clears: &'a [Clear], key: &ClientKey) -> Result<Self, Self::Error> {
+        let num_blocks = Id::num_blocks(key.message_modulus());
+        Ok(Self::new(
+            with_thread_local_cuda_streams(|streams| {
+                clears
+                    .iter()
+                    .copied()
+                    .map(|clear| {
+                        CudaSignedRadixCiphertext::from_signed_radix_ciphertext(
+                            &key.key.key.encrypt_signed_radix(clear, num_blocks),
+                            streams,
+                        )
+                    })
+                    .collect::<Vec<_>>()
+            }),
+            vec![clears.len()],
+        ))
+    }
+}
+
+impl<Clear, Id> FheDecrypt<Vec<Clear>> for GpuFheIntArray<Id>
+where
+    Id: FheIntId,
+    Clear: RecomposableSignedInteger,
+{
+    fn decrypt(&self, key: &ClientKey) -> Vec<Clear> {
+        self.as_slice().decrypt(key)
+    }
+}
+
+impl<'a, Clear, Id> FheDecrypt<Vec<Clear>> for GpuFheIntSliceMut<'a, Id>
+where
+    Id: FheIntId,
+    Clear: RecomposableSignedInteger,
+{
+    fn decrypt(&self, key: &ClientKey) -> Vec<Clear> {
+        self.as_slice().decrypt(key)
+    }
+}
+
+impl<'a, Clear, Id> FheDecrypt<Vec<Clear>> for GpuFheIntSlice<'a, Id>
+where
+    Id: FheIntId,
+    Clear: RecomposableSignedInteger,
+{
+    fn decrypt(&self, key: &ClientKey) -> Vec<Clear> {
+        with_thread_local_cuda_streams(|streams| {
+            self.elems
+                .0
+                .iter()
+                .map(|ct| {
+                    key.key
+                        .key
+                        .decrypt_signed_radix(&ct.to_signed_radix_ciphertext(streams))
+                })
+                .collect()
+        })
+    }
+}
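A similar sketch for the integer backends (again illustrative, not part of the patch): it assumes `client_key` is set up as in the previous sketch, that `GpuFheUint32Array` is one of the concrete aliases declared by the array/mod.rs macro below, and that the `+` operator comes from the existing generic ops over `ArithmeticArrayBackend`:

    use tfhe::array::GpuFheUint32Array;
    use tfhe::prelude::{FheDecrypt, FheTryEncrypt};

    // Flat 1D arrays; try_encrypt records the shape as vec![len]
    let a = GpuFheUint32Array::try_encrypt(&[2u32, 4, 6][..], &client_key).unwrap();
    let b = GpuFheUint32Array::try_encrypt(&[1u32, 2, 3][..], &client_key).unwrap();

    // Element-wise add: each pair goes through CudaServerKey::add, in parallel via rayon
    let sum = &a + &b;
    let clear: Vec<u32> = sum.decrypt(&client_key);
    assert_eq!(clear, vec![3, 6, 9]);

    // Shaped (2x2) encryption: the shape must match the number of elements,
    // otherwise try_encrypt returns an error
    let shaped =
        GpuFheUint32Array::try_encrypt((&[1u32, 2, 3, 4][..], vec![2, 2]), &client_key).unwrap();
    let flat: Vec<u32> = shaped.decrypt(&client_key);
    assert_eq!(flat, vec![1, 2, 3, 4]);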
diff --git a/tfhe/src/high_level_api/array/gpu/mod.rs b/tfhe/src/high_level_api/array/gpu/mod.rs
new file mode 100644
index 0000000000..62efcd1695
--- /dev/null
+++ b/tfhe/src/high_level_api/array/gpu/mod.rs
@@ -0,0 +1,9 @@
+pub(crate) mod booleans;
+pub(crate) mod integers;
+#[cfg(test)]
+pub use booleans::GpuFheBoolArrayBackend;
+pub use booleans::{GpuFheBoolArray, GpuFheBoolSlice, GpuFheBoolSliceMut};
+pub use integers::{
+    GpuFheIntArray, GpuFheIntSlice, GpuFheIntSliceMut, GpuFheUintArray, GpuFheUintSlice,
+    GpuFheUintSliceMut,
+};
diff --git a/tfhe/src/high_level_api/array/mod.rs b/tfhe/src/high_level_api/array/mod.rs
index 16cbf8b5d4..5ee745a489 100644
--- a/tfhe/src/high_level_api/array/mod.rs
+++ b/tfhe/src/high_level_api/array/mod.rs
@@ -1,6 +1,8 @@
 mod clear_ops;
 mod cpu;
 mod dynamic;
+#[cfg(feature = "gpu")]
+mod gpu;
 mod helpers;
 mod ops;
 pub mod stride;
@@ -26,6 +28,11 @@ pub use dynamic::{
     FheBoolArray, FheBoolSlice, FheBoolSliceMut, FheIntArray, FheIntSlice, FheIntSliceMut,
     FheUintArray, FheUintSlice, FheUintSliceMut,
 };
+#[cfg(feature = "gpu")]
+pub use gpu::{
+    GpuFheBoolArray, GpuFheBoolSlice, GpuFheBoolSliceMut, GpuFheIntArray, GpuFheIntSlice,
+    GpuFheIntSliceMut, GpuFheUintArray, GpuFheUintSlice, GpuFheUintSliceMut,
+};
 
 /// The base struct for Fhe array types.
 ///
@@ -289,6 +296,14 @@ macro_rules! declare_concrete_array_types {
             pub type [<CpuFheUint $num_bits Slice>]<'a> = CpuFheUintSlice<'a, crate::[<FheUint $num_bits Id>]>;
             pub type [<CpuFheUint $num_bits SliceMut>]<'a> = CpuFheUintSliceMut<'a, crate::[<FheUint $num_bits Id>]>;
 
+            // Instantiate Array Types for Gpu backend
+            #[cfg(feature="gpu")]
+            pub type [<GpuFheUint $num_bits Array>] = GpuFheUintArray<crate::[<FheUint $num_bits Id>]>;
+            #[cfg(feature="gpu")]
+            pub type [<GpuFheUint $num_bits Slice>]<'a> = GpuFheUintSlice<'a, crate::[<FheUint $num_bits Id>]>;
+            #[cfg(feature="gpu")]
+            pub type [<GpuFheUint $num_bits SliceMut>]<'a> = GpuFheUintSliceMut<'a, crate::[<FheUint $num_bits Id>]>;
+
         )*
     }
 }
@@ -308,6 +323,14 @@ macro_rules! declare_concrete_array_types {
             pub type [<CpuFheInt $num_bits Slice>]<'a> = CpuFheIntSlice<'a, crate::[<FheInt $num_bits Id>]>;
             pub type [<CpuFheInt $num_bits SliceMut>]<'a> = CpuFheIntSliceMut<'a, crate::[<FheInt $num_bits Id>]>;
 
+            // Instantiate Array Types for Gpu backend
+            #[cfg(feature="gpu")]
+            pub type [<GpuFheInt $num_bits Array>] = GpuFheIntArray<crate::[<FheInt $num_bits Id>]>;
+            #[cfg(feature="gpu")]
+            pub type [<GpuFheInt $num_bits Slice>]<'a> = GpuFheIntSlice<'a, crate::[<FheInt $num_bits Id>]>;
+            #[cfg(feature="gpu")]
+            pub type [<GpuFheInt $num_bits SliceMut>]<'a> = GpuFheIntSliceMut<'a, crate::[<FheInt $num_bits Id>]>;
+
         )*
     }
 }
diff --git a/tfhe/src/high_level_api/array/tests/booleans.rs b/tfhe/src/high_level_api/array/tests/booleans.rs
index 870fa35be9..a17f297dbf 100644
--- a/tfhe/src/high_level_api/array/tests/booleans.rs
+++ b/tfhe/src/high_level_api/array/tests/booleans.rs
@@ -8,6 +8,17 @@ fn test_cpu_only_bitand() {
     >(&ck);
 }
 
+#[test]
+#[cfg(feature = "gpu")]
+fn test_gpu_only_bitand() {
+    let ck = super::setup_default_gpu();
+    super::bitand_test_case::<
+        crate::FheBoolId,
+        crate::high_level_api::array::gpu::GpuFheBoolArrayBackend,
+        bool,
+    >(&ck);
+}
+
 #[test]
 fn test_cpu_dyn_bitand() {
     let ck = super::setup_default_cpu();
@@ -24,6 +35,13 @@ fn test_cpu_only_bitor() {
     super::bitor_test_case::<crate::FheBoolId, CpuFheBoolArrayBackend, bool>(&ck);
 }
 
+#[test]
+#[cfg(feature = "gpu")]
+fn test_gpu_only_bitor() {
+    let ck = super::setup_default_gpu();
+    super::bitor_test_case::<crate::FheBoolId, GpuFheBoolArrayBackend, bool>(&ck);
+}
+
 #[test]
 fn test_cpu_dyn_bitor() {
     let ck = super::setup_default_cpu();
@@ -36,6 +54,13 @@ fn test_cpu_only_bitxor() {
     super::bitxor_test_case::<crate::FheBoolId, CpuFheBoolArrayBackend, bool>(&ck);
 }
 
+#[test]
+#[cfg(feature = "gpu")]
+fn test_gpu_only_bitxor() {
+    let ck = super::setup_default_gpu();
+    super::bitxor_test_case::<crate::FheBoolId, GpuFheBoolArrayBackend, bool>(&ck);
+}
+
 #[test]
 fn test_cpu_dyn_bitxor() {
     let ck = super::setup_default_cpu();
@@ -48,6 +73,15 @@ fn test_cpu_only_bitand_scalar_slice() {
     super::bitand_scalar_slice_test_case::<crate::FheBoolId, CpuFheBoolArrayBackend, bool>(&ck);
 }
 
+#[test]
+#[cfg(feature = "gpu")]
+fn test_gpu_only_bitand_scalar_slice() {
+    let ck = super::setup_default_gpu();
+    super::bitand_scalar_slice_test_case::<crate::FheBoolId, GpuFheBoolArrayBackend, bool>(
+        &ck,
+    );
+}
+
 #[test]
 fn test_cpu_dyn_bitand_scalar_slice() {
     let ck = super::setup_default_cpu();
diff --git a/tfhe/src/high_level_api/array/tests/mod.rs b/tfhe/src/high_level_api/array/tests/mod.rs
index 6e1a9b57b3..bc0d6ca562 100644
--- a/tfhe/src/high_level_api/array/tests/mod.rs
+++ b/tfhe/src/high_level_api/array/tests/mod.rs
@@ -3,6 +3,8 @@ mod signed;
 mod unsigned;
 
 use crate::{generate_keys, set_server_key, ClientKey, ConfigBuilder, FheId};
+#[cfg(feature = "gpu")]
+use crate::{Config, CudaServerKey};
 use rand::distributions::{Distribution, Standard};
 use rand::random;
 use std::fmt::Debug;
@@ -13,6 +15,14 @@ use crate::high_level_api::array::{FheBackendArray, FheBackendArraySlice};
 use crate::prelude::{FheDecrypt, FheTryEncrypt};
 use std::ops::{BitAnd, BitOr, BitXor};
 
+#[cfg(feature = "gpu")]
+pub(crate) fn generate_cuda_keys<C: Into<Config>>(config: C) -> (ClientKey, CudaServerKey) {
+    let client_kc = ClientKey::generate(config);
+    let server_kc = client_kc.generate_compressed_server_key();
+    let cuda_server_kc = server_kc.decompress_to_gpu();
+
+    (client_kc, cuda_server_kc)
+}
 fn draw_random_values<T>(num_values: usize) -> Vec<T>
 where
     Standard: Distribution<T>,
@@ -28,6 +38,15 @@ fn setup_default_cpu() -> ClientKey {
     ck
 }
 
+#[cfg(feature = "gpu")]
+fn setup_default_gpu() -> ClientKey {
+    let config = ConfigBuilder::default().build();
+    let (ck, sk) = generate_cuda_keys(config);
+    set_server_key(sk);
+
+    ck
+}
+
 fn bitand_test_case<Id, B, Clear>(ck: &ClientKey)
 where
     Id: FheId,
diff --git a/tfhe/src/high_level_api/array/tests/signed.rs b/tfhe/src/high_level_api/array/tests/signed.rs
index ad6da41ce4..81a9cdd82e 100644
--- a/tfhe/src/high_level_api/array/tests/signed.rs
+++ b/tfhe/src/high_level_api/array/tests/signed.rs
@@ -8,6 +8,17 @@ fn test_cpu_only_bitand() {
     >(&ck);
 }
 
+#[test]
+#[cfg(feature = "gpu")]
+fn test_gpu_only_bitand() {
+    let ck = super::setup_default_gpu();
+    super::bitand_test_case::<
+        crate::FheInt32Id,
+        crate::high_level_api::array::gpu::integers::GpuIntArrayBackend,
+        i32,
+    >(&ck);
+}
+
 #[test]
 fn test_cpu_dyn_bitand() {
     let ck = super::setup_default_cpu();
@@ -24,6 +35,13 @@ fn test_cpu_only_bitor() {
     super::bitor_test_case::<crate::FheInt32Id, CpuIntArrayBackend, i32>(&ck);
 }
 
+#[test]
+#[cfg(feature = "gpu")]
+fn test_gpu_only_bitor() {
+    let ck = super::setup_default_gpu();
+    super::bitor_test_case::<crate::FheInt32Id, GpuIntArrayBackend, i32>(&ck);
+}
+
 #[test]
 fn test_cpu_dyn_bitor() {
     let ck = super::setup_default_cpu();
@@ -36,6 +54,13 @@ fn test_cpu_only_bitxor() {
     super::bitxor_test_case::<crate::FheInt32Id, CpuIntArrayBackend, i32>(&ck);
 }
 
+#[test]
+#[cfg(feature = "gpu")]
+fn test_gpu_only_bitxor() {
+    let ck = super::setup_default_gpu();
+    super::bitxor_test_case::<crate::FheInt32Id, GpuIntArrayBackend, i32>(&ck);
+}
+
 #[test]
 fn test_cpu_dyn_bitxor() {
     let ck = super::setup_default_cpu();
@@ -48,6 +73,13 @@ fn test_cpu_only_bitand_scalar_slice() {
     super::bitand_scalar_slice_test_case::<crate::FheInt32Id, CpuIntArrayBackend, i32>(&ck);
 }
 
+#[test]
+#[cfg(feature = "gpu")]
+fn test_gpu_only_bitand_scalar_slice() {
+    let ck = super::setup_default_gpu();
+    super::bitand_scalar_slice_test_case::<crate::FheInt32Id, GpuIntArrayBackend, i32>(&ck);
+}
+
 #[test]
 fn test_cpu_dyn_bitand_scalar_slice() {
     let ck = super::setup_default_cpu();
diff --git a/tfhe/src/high_level_api/array/tests/unsigned.rs b/tfhe/src/high_level_api/array/tests/unsigned.rs
index 15ca5009ab..17cda31464 100644
--- a/tfhe/src/high_level_api/array/tests/unsigned.rs
+++ b/tfhe/src/high_level_api/array/tests/unsigned.rs
@@ -14,6 +14,17 @@ fn test_cpu_only_bitand() {
     >(&ck);
 }
 
+#[test]
+#[cfg(feature = "gpu")]
+fn test_gpu_only_bitand() {
+    let ck = super::setup_default_gpu();
+    super::bitand_test_case::<
+        crate::FheUint32Id,
+        crate::high_level_api::array::gpu::integers::GpuUintArrayBackend,
+        u32,
+    >(&ck);
+}
+
 #[test]
 fn test_cpu_dyn_bitand() {
     let ck = super::setup_default_cpu();
@@ -30,6 +41,13 @@ fn test_cpu_only_bitor() {
     super::bitor_test_case::<crate::FheUint32Id, CpuUintArrayBackend, u32>(&ck);
 }
 
+#[test]
+#[cfg(feature = "gpu")]
+fn test_gpu_only_bitor() {
+    let ck = super::setup_default_gpu();
+    super::bitor_test_case::<crate::FheUint32Id, GpuUintArrayBackend, u32>(&ck);
+}
+
 #[test]
 fn test_cpu_dyn_bitor() {
     let ck = super::setup_default_cpu();
@@ -42,6 +60,13 @@ fn test_cpu_only_bitxor() {
     super::bitxor_test_case::<crate::FheUint32Id, CpuUintArrayBackend, u32>(&ck);
 }
 
+#[test]
+#[cfg(feature = "gpu")]
+fn test_gpu_only_bitxor() {
+    let ck = super::setup_default_gpu();
+    super::bitxor_test_case::<crate::FheUint32Id, GpuUintArrayBackend, u32>(&ck);
+}
+
 #[test]
 fn test_cpu_dyn_bitxor() {
     let ck = super::setup_default_cpu();
@@ -53,6 +78,13 @@ fn test_cpu_only_bitand_scalar_slice() {
     super::bitand_scalar_slice_test_case::<crate::FheUint32Id, CpuUintArrayBackend, u32>(&ck);
 }
 
+#[test]
+#[cfg(feature = "gpu")]
+fn test_gpu_only_bitand_scalar_slice() {
+    let ck = super::setup_default_gpu();
+    super::bitand_scalar_slice_test_case::<crate::FheUint32Id, GpuUintArrayBackend, u32>(&ck);
+}
+
 #[test]
 fn test_cpu_dyn_bitand_scalar_slice() {
     let ck = super::setup_default_cpu();
diff --git a/tfhe/src/high_level_api/global_state.rs b/tfhe/src/high_level_api/global_state.rs
index 1593ede673..c134eda58c 100644
--- a/tfhe/src/high_level_api/global_state.rs
+++ b/tfhe/src/high_level_api/global_state.rs
@@ -4,7 +4,10 @@ use crate::core_crypto::gpu::CudaStreams;
 use crate::high_level_api::errors::{UninitializedServerKey, UnwrapResultExt};
 use crate::high_level_api::keys::{InternalServerKey, ServerKey};
+#[cfg(feature = "gpu")]
+use crate::integer::gpu::CudaServerKey;
 use std::cell::RefCell;
+
 /// We store the internal keys as thread local, meaning each thread has its own set of keys.
 ///
 /// This means that the user can do computations in multiple threads
@@ -156,6 +159,28 @@ where
     })
 }
 
+#[inline]
+#[cfg(feature = "gpu")]
+pub(crate) fn with_cuda_internal_keys<T, F>(func: F) -> T
+where
+    F: FnOnce(&CudaServerKey) -> T,
+{
+    // Should use `with_borrow` when it's stabilized
+    INTERNAL_KEYS.with(|keys| {
+        let maybe_key = &*keys.borrow();
+        let key = maybe_key
+            .as_ref()
+            .ok_or(UninitializedServerKey)
+            .unwrap_display();
+        match key {
+            InternalServerKey::Cuda(key) => func(&key.key.key),
+            InternalServerKey::Cpu(_) => {
+                panic!("Cuda key requested but only cpu key is available")
+            }
+        }
+    })
+}
+
 #[cfg(feature = "gpu")]
 thread_local! {
     static CUDA_STREAMS: std::cell::OnceCell<CudaStreams> = std::cell::OnceCell::from(CudaStreams::new_multi_gpu());
diff --git a/tfhe/src/lib.rs b/tfhe/src/lib.rs
index 93b220b05e..f57c9f047c 100644
--- a/tfhe/src/lib.rs
+++ b/tfhe/src/lib.rs
@@ -72,6 +72,7 @@
 #![cfg_attr(all(doc, not(doctest)), feature(doc_auto_cfg))]
 #![cfg_attr(all(doc, not(doctest)), feature(doc_cfg))]
 #![warn(rustdoc::broken_intra_doc_links)]
+extern crate core;
 
 #[cfg(feature = "__c_api")]
 pub mod c_api;
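A final behavioral note on key dispatch: `with_cuda_internal_keys` panics when only a CPU key has been set, so the GPU array types fail loudly instead of silently falling back to the CPU backend. A hypothetical sketch, reusing the assumed `tfhe::array` paths from the earlier examples:

    use tfhe::array::GpuFheBoolArray;
    use tfhe::prelude::FheTryEncrypt;
    use tfhe::{generate_keys, set_server_key, ConfigBuilder};

    // Install a CPU-only server key
    let (client_key, cpu_server_key) = generate_keys(ConfigBuilder::default().build());
    set_server_key(cpu_server_key);

    // Encryption still works: it needs only the client key and the CUDA streams
    let a = GpuFheBoolArray::try_encrypt(&[true][..], &client_key).unwrap();
    let b = GpuFheBoolArray::try_encrypt(&[false][..], &client_key).unwrap();

    // Any GPU operation, however, would panic with
    // "Cuda key requested but only cpu key is available":
    // let _ = &a & &b;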