From 86505a146748ac59355e04ed6165d33ef21c8a41 Mon Sep 17 00:00:00 2001
From: Agnes Leroy
Date: Tue, 26 Nov 2024 09:55:01 +0100
Subject: [PATCH] feat(gpu): add gpu array type in hl api

---
 tfhe/src/high_level_api/array/gpu/booleans.rs |  300 ++++
 tfhe/src/high_level_api/array/gpu/integers.rs |  648 ++++++++++++++++++
 tfhe/src/high_level_api/array/gpu/mod.rs      |    9 +
 tfhe/src/high_level_api/array/mod.rs          |   23 +
 .../high_level_api/array/tests/booleans.rs    |   34 +
 tfhe/src/high_level_api/array/tests/mod.rs    |   19 +
 tfhe/src/high_level_api/array/tests/signed.rs |   32 +
 .../high_level_api/array/tests/unsigned.rs    |   32 +
 tfhe/src/high_level_api/global_state.rs       |   25 +
 tfhe/src/lib.rs                               |    1 +
 10 files changed, 1123 insertions(+)
 create mode 100644 tfhe/src/high_level_api/array/gpu/booleans.rs
 create mode 100644 tfhe/src/high_level_api/array/gpu/integers.rs
 create mode 100644 tfhe/src/high_level_api/array/gpu/mod.rs

diff --git a/tfhe/src/high_level_api/array/gpu/booleans.rs b/tfhe/src/high_level_api/array/gpu/booleans.rs
new file mode 100644
index 0000000000..f538f0c163
--- /dev/null
+++ b/tfhe/src/high_level_api/array/gpu/booleans.rs
@@ -0,0 +1,300 @@
+//! This module contains the implementation of the FheBool array backend
+//! where the values and computations are always done on GPU
+
+use super::super::helpers::{create_sub_mut_slice_with_bound, create_sub_slice_with_bound};
+use super::super::traits::{BitwiseArrayBackend, ClearBitwiseArrayBackend};
+use crate::array::stride::{ParStridedIter, ParStridedIterMut, StridedIter};
+use crate::array::traits::TensorSlice;
+use crate::high_level_api::array::{ArrayBackend, BackendDataContainer, BackendDataContainerMut};
+use crate::high_level_api::global_state;
+use crate::high_level_api::global_state::with_thread_local_cuda_streams;
+use crate::integer::gpu::ciphertext::boolean_value::CudaBooleanBlock;
+use crate::prelude::{FheDecrypt, FheTryEncrypt};
+use crate::{ClientKey, FheBoolId};
+use rayon::prelude::*;
+use std::ops::RangeBounds;
+
+pub struct GpuFheBoolArrayBackend;
+
+pub type GpuFheBoolArray = super::super::FheBackendArray<GpuFheBoolArrayBackend, FheBoolId>;
+pub type GpuFheBoolSlice<'a> =
+    super::super::FheBackendArraySlice<'a, GpuFheBoolArrayBackend, FheBoolId>;
+pub type GpuFheBoolSliceMut<'a> =
+    super::super::FheBackendArraySliceMut<'a, GpuFheBoolArrayBackend, FheBoolId>;
+
+pub struct GpuBooleanSlice<'a>(pub(crate) &'a [CudaBooleanBlock]);
+pub struct GpuBooleanSliceMut<'a>(pub(crate) &'a mut [CudaBooleanBlock]);
+pub struct GpuBooleanOwned(pub(crate) Vec<CudaBooleanBlock>);
+
+impl Clone for GpuBooleanOwned {
+    fn clone(&self) -> Self {
+        with_thread_local_cuda_streams(|streams| {
+            Self(self.0.iter().map(|elem| elem.duplicate(streams)).collect())
+        })
+    }
+}
+
+impl ArrayBackend for GpuFheBoolArrayBackend {
+    type Slice<'a>
+        = GpuBooleanSlice<'a>
+    where
+        Self: 'a;
+    type SliceMut<'a>
+        = GpuBooleanSliceMut<'a>
+    where
+        Self: 'a;
+    type Owned = GpuBooleanOwned;
+}
+
+impl<'a> TensorSlice<'a, GpuBooleanSlice<'a>> {
+    pub fn iter(self) -> StridedIter<'a, CudaBooleanBlock> {
+        StridedIter::new(self.slice.0, self.dims.clone())
+    }
+
+    pub fn par_iter(self) -> ParStridedIter<'a, CudaBooleanBlock> {
+        ParStridedIter::new(self.slice.0, self.dims.clone())
+    }
+}
+
+impl<'a> TensorSlice<'a, GpuBooleanSliceMut<'a>> {
+    pub fn par_iter_mut(self) -> ParStridedIterMut<'a, CudaBooleanBlock> {
+        ParStridedIterMut::new(self.slice.0, self.dims.clone())
+    }
+}
+
+impl From<Vec<CudaBooleanBlock>> for GpuBooleanOwned {
+    fn from(value: Vec<CudaBooleanBlock>) -> Self {
+        Self(value)
+    }
+}
+
+impl<'a> BackendDataContainer for GpuBooleanSlice<'a> {
+    type Backend = GpuFheBoolArrayBackend;
+
+    fn len(&self) -> usize {
+        <[CudaBooleanBlock]>::len(self.0)
+    }
+
+    fn as_sub_slice(
+        &self,
+        range: impl RangeBounds<usize>,
+    ) -> <Self::Backend as ArrayBackend>::Slice<'_> {
+        GpuBooleanSlice(create_sub_slice_with_bound(self.0, range))
+    }
+
+    fn into_owned(self) -> <Self::Backend as ArrayBackend>::Owned {
+        with_thread_local_cuda_streams(|streams| {
+            GpuBooleanOwned(self.0.iter().map(|elem| elem.duplicate(streams)).collect())
+        })
+    }
+}
+
+impl<'a> BackendDataContainer for GpuBooleanSliceMut<'a> {
+    type Backend = GpuFheBoolArrayBackend;
+
+    fn len(&self) -> usize {
+        <[CudaBooleanBlock]>::len(self.0)
+    }
+
+    fn as_sub_slice(
+        &self,
+        range: impl RangeBounds<usize>,
+    ) -> <Self::Backend as ArrayBackend>::Slice<'_> {
+        GpuBooleanSlice(create_sub_slice_with_bound(self.0, range))
+    }
+
+    fn into_owned(self) -> <Self::Backend as ArrayBackend>::Owned {
+        with_thread_local_cuda_streams(|streams| {
+            GpuBooleanOwned(self.0.iter().map(|elem| elem.duplicate(streams)).collect())
+        })
+    }
+}
+
+impl<'a> BackendDataContainerMut for GpuBooleanSliceMut<'a> {
+    fn as_sub_slice_mut(
+        &mut self,
+        range: impl RangeBounds<usize>,
+    ) -> <Self::Backend as ArrayBackend>::SliceMut<'_> {
+        GpuBooleanSliceMut(create_sub_mut_slice_with_bound(self.0, range))
+    }
+}
+
+impl BackendDataContainer for GpuBooleanOwned {
+    type Backend = GpuFheBoolArrayBackend;
+
+    fn len(&self) -> usize {
+        self.0.len()
+    }
+
+    fn as_sub_slice(
+        &self,
+        range: impl RangeBounds<usize>,
+    ) -> <Self::Backend as ArrayBackend>::Slice<'_> {
+        GpuBooleanSlice(create_sub_slice_with_bound(self.0.as_slice(), range))
+    }
+
+    fn into_owned(self) -> <Self::Backend as ArrayBackend>::Owned {
+        self
+    }
+}
+
+impl BackendDataContainerMut for GpuBooleanOwned {
+    fn as_sub_slice_mut(
+        &mut self,
+        range: impl RangeBounds<usize>,
+    ) -> <Self::Backend as ArrayBackend>::SliceMut<'_> {
+        GpuBooleanSliceMut(create_sub_mut_slice_with_bound(
+            self.0.as_mut_slice(),
+            range,
+        ))
+    }
+}
+
+impl BitwiseArrayBackend for GpuFheBoolArrayBackend {
+    fn bitand<'a>(
+        lhs: TensorSlice<'_, Self::Slice<'a>>,
+        rhs: TensorSlice<'_, Self::Slice<'a>>,
+    ) -> Self::Owned {
+        GpuBooleanOwned(global_state::with_cuda_internal_keys(|cuda_key| {
+            with_thread_local_cuda_streams(|streams| {
+                lhs.par_iter()
+                    .zip(rhs.par_iter())
+                    .map(|(lhs, rhs)| CudaBooleanBlock(cuda_key.bitand(&lhs.0, &rhs.0, streams)))
+                    .collect::<Vec<_>>()
+            })
+        }))
+    }
+
+    fn bitor<'a>(
+        lhs: TensorSlice<'_, Self::Slice<'a>>,
+        rhs: TensorSlice<'_, Self::Slice<'a>>,
+    ) -> Self::Owned {
+        GpuBooleanOwned(global_state::with_cuda_internal_keys(|cuda_key| {
+            with_thread_local_cuda_streams(|streams| {
+                lhs.par_iter()
+                    .zip(rhs.par_iter())
+                    .map(|(lhs, rhs)| CudaBooleanBlock(cuda_key.bitor(&lhs.0, &rhs.0, streams)))
+                    .collect::<Vec<_>>()
+            })
+        }))
+    }
+
+    fn bitxor<'a>(
+        lhs: TensorSlice<'_, Self::Slice<'a>>,
+        rhs: TensorSlice<'_, Self::Slice<'a>>,
+    ) -> Self::Owned {
+        GpuBooleanOwned(global_state::with_cuda_internal_keys(|cuda_key| {
+            with_thread_local_cuda_streams(|streams| {
+                lhs.par_iter()
+                    .zip(rhs.par_iter())
+                    .map(|(lhs, rhs)| CudaBooleanBlock(cuda_key.bitxor(&lhs.0, &rhs.0, streams)))
+                    .collect::<Vec<_>>()
+            })
+        }))
+    }
+
+    fn bitnot(lhs: TensorSlice<'_, Self::Slice<'_>>) -> Self::Owned {
+        GpuBooleanOwned(global_state::with_cuda_internal_keys(|cuda_key| {
+            with_thread_local_cuda_streams(|streams| {
+                lhs.par_iter()
+                    .map(|lhs| CudaBooleanBlock(cuda_key.bitnot(&lhs.0, streams)))
+                    .collect::<Vec<_>>()
+            })
+        }))
+    }
+}
+
+impl ClearBitwiseArrayBackend<bool> for GpuFheBoolArrayBackend {
+    fn bitand_slice(
+        lhs: TensorSlice<'_, Self::Slice<'_>>,
+        rhs: TensorSlice<'_, &'_ [bool]>,
+    ) -> Self::Owned {
+        GpuBooleanOwned(global_state::with_cuda_internal_keys(|cuda_key| {
+            with_thread_local_cuda_streams(|streams| {
+                lhs.par_iter()
+                    .zip(rhs.par_iter().copied())
+                    .map(|(lhs, rhs)| {
+                        CudaBooleanBlock(cuda_key.scalar_bitand(&lhs.0, rhs as u8, streams))
+                    })
+                    .collect::<Vec<_>>()
+            })
+        }))
+    }
+
+    fn bitor_slice(
+        lhs: TensorSlice<'_, Self::Slice<'_>>,
+        rhs: TensorSlice<'_, &'_ [bool]>,
+    ) -> Self::Owned {
+        GpuBooleanOwned(global_state::with_cuda_internal_keys(|cuda_key| {
+            with_thread_local_cuda_streams(|streams| {
+                lhs.par_iter()
+                    .zip(rhs.par_iter().copied())
+                    .map(|(lhs, rhs)| {
+                        CudaBooleanBlock(cuda_key.scalar_bitor(&lhs.0, rhs as u8, streams))
+                    })
+                    .collect::<Vec<_>>()
+            })
+        }))
+    }
+
+    fn bitxor_slice(
+        lhs: TensorSlice<'_, Self::Slice<'_>>,
+        rhs: TensorSlice<'_, &'_ [bool]>,
+    ) -> Self::Owned {
+        GpuBooleanOwned(global_state::with_cuda_internal_keys(|cuda_key| {
+            with_thread_local_cuda_streams(|streams| {
+                lhs.par_iter()
+                    .zip(rhs.par_iter().copied())
+                    .map(|(lhs, rhs)| {
+                        CudaBooleanBlock(cuda_key.scalar_bitxor(&lhs.0, rhs as u8, streams))
+                    })
+                    .collect::<Vec<_>>()
+            })
+        }))
+    }
+}
+
+impl FheTryEncrypt<&[bool], ClientKey> for GpuFheBoolArray {
+    type Error = crate::Error;
+
+    fn try_encrypt(values: &[bool], cks: &ClientKey) -> Result<Self, Self::Error> {
+        let encrypted = with_thread_local_cuda_streams(|streams| {
+            values
+                .iter()
+                .copied()
+                .map(|value| {
+                    CudaBooleanBlock::from_boolean_block(&cks.key.key.encrypt_bool(value), streams)
+                })
+                .collect::<Vec<_>>()
+        });
+        Ok(Self::new(encrypted, vec![values.len()]))
+    }
+}
+
+impl<'a> FheDecrypt<Vec<bool>> for GpuFheBoolSlice<'a> {
+    fn decrypt(&self, key: &ClientKey) -> Vec<bool> {
+        with_thread_local_cuda_streams(|streams| {
+            self.elems
+                .0
+                .iter()
+                .map(|encrypted_value| {
+                    key.key
+                        .key
+                        .decrypt_bool(&encrypted_value.to_boolean_block(streams))
+                })
+                .collect()
+        })
+    }
+}
+
+impl<'a> FheDecrypt<Vec<bool>> for GpuFheBoolSliceMut<'a> {
+    fn decrypt(&self, key: &ClientKey) -> Vec<bool> {
+        self.as_slice().decrypt(key)
+    }
+}
+
+impl FheDecrypt<Vec<bool>> for GpuFheBoolArray {
+    fn decrypt(&self, key: &ClientKey) -> Vec<bool> {
+        self.as_slice().decrypt(key)
+    }
+}
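As a usage sketch (illustrative only, not part of the patch): assuming the `GpuFheBoolArray` re-export added in array/mod.rs below, the generic `&`/`|`/`^` operator impls that the existing array ops module provides on top of `BitwiseArrayBackend`, and the CUDA key setup used by tests/mod.rs, encrypted boolean arrays can be combined element-wise on the GPU like this:

    use tfhe::prelude::{FheDecrypt, FheTryEncrypt};
    use tfhe::{set_server_key, ClientKey, ConfigBuilder};

    // CUDA key setup, mirroring tests/mod.rs::generate_cuda_keys
    let config = ConfigBuilder::default().build();
    let client_key = ClientKey::generate(config);
    let compressed_server_key = client_key.generate_compressed_server_key();
    set_server_key(compressed_server_key.decompress_to_gpu());

    // Ciphertexts live on the GPU as CudaBooleanBlock values
    let lhs =
        tfhe::array::GpuFheBoolArray::try_encrypt(&[true, false, true][..], &client_key).unwrap();
    let rhs =
        tfhe::array::GpuFheBoolArray::try_encrypt(&[true, true, false][..], &client_key).unwrap();

    // Element-wise AND: one CudaServerKey::bitand per element, in parallel via rayon
    let result = &lhs & &rhs;

    let clear: Vec<bool> = result.decrypt(&client_key);
    assert_eq!(clear, vec![true, false, false]);

Each element-wise operation fans out over rayon and issues one scalar GPU operation per `CudaBooleanBlock`, all on the thread-local CUDA streams; the `tfhe::array::GpuFheBoolArray` path is an assumption based on the re-exports this patch adds.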
diff --git a/tfhe/src/high_level_api/array/gpu/integers.rs b/tfhe/src/high_level_api/array/gpu/integers.rs
new file mode 100644
index 0000000000..a7bc8be2d7
--- /dev/null
+++ b/tfhe/src/high_level_api/array/gpu/integers.rs
@@ -0,0 +1,648 @@
+//! This module contains the implementations of the FheUint array and FheInt array backend
+//! where the values and computations are always done on GPU
+use super::super::helpers::{create_sub_mut_slice_with_bound, create_sub_slice_with_bound};
+use super::super::traits::{ArithmeticArrayBackend, BitwiseArrayBackend, ClearBitwiseArrayBackend};
+use crate::core_crypto::prelude::{SignedNumeric, UnsignedNumeric};
+use crate::high_level_api::array::{
+    ArrayBackend, FheArrayBase, FheBackendArray, FheBackendArraySlice, FheBackendArraySliceMut,
+};
+
+use crate::array::stride::{ParStridedIter, ParStridedIterMut, StridedIter};
+use crate::array::traits::{
+    BackendDataContainer, BackendDataContainerMut, ClearArithmeticArrayBackend, TensorSlice,
+};
+use crate::core_crypto::gpu::CudaStreams;
+use crate::high_level_api::global_state;
+use crate::high_level_api::global_state::with_thread_local_cuda_streams;
+use crate::high_level_api::integers::{FheIntId, FheUintId};
+use crate::integer::block_decomposition::{DecomposableInto, RecomposableFrom};
+use crate::integer::client_key::RecomposableSignedInteger;
+use crate::integer::gpu::ciphertext::{
+    CudaIntegerRadixCiphertext, CudaSignedRadixCiphertext, CudaUnsignedRadixCiphertext,
+};
+use crate::integer::server_key::radix_parallel::scalar_div_mod::SignedReciprocable;
+use crate::integer::server_key::{Reciprocable, ScalarMultiplier};
+use crate::prelude::{CastInto, FheDecrypt, FheTryEncrypt};
+use crate::{ClientKey, Error};
+use rayon::prelude::*;
+use std::marker::PhantomData;
+use std::ops::RangeBounds;
+
+pub struct GpuIntegerArrayBackend<T>(PhantomData<T>);
+
+pub type GpuUintArrayBackend = GpuIntegerArrayBackend<CudaUnsignedRadixCiphertext>;
+pub type GpuIntArrayBackend = GpuIntegerArrayBackend<CudaSignedRadixCiphertext>;
+
+// Base alias for array of unsigned integers on the GPU only backend
+pub type GpuFheUintArray<Id> = FheBackendArray<GpuUintArrayBackend, Id>;
+pub type GpuFheUintSlice<'a, Id> = FheBackendArraySlice<'a, GpuUintArrayBackend, Id>;
+pub type GpuFheUintSliceMut<'a, Id> = FheBackendArraySliceMut<'a, GpuUintArrayBackend, Id>;
+
+// Base alias for array of signed integers on the GPU only backend
+pub type GpuFheIntArray<Id> = FheBackendArray<GpuIntArrayBackend, Id>;
+pub type GpuFheIntSlice<'a, Id> = FheBackendArraySlice<'a, GpuIntArrayBackend, Id>;
+pub type GpuFheIntSliceMut<'a, Id> = FheBackendArraySliceMut<'a, GpuIntArrayBackend, Id>;
+
+pub struct GpuSlice<'a, T>(&'a [T]);
+pub struct GpuSliceMut<'a, T>(&'a mut [T]);
+pub struct GpuOwned<T>(Vec<T>);
+
+impl<T> Clone for GpuOwned<T>
+where
+    T: CudaIntegerRadixCiphertext,
+{
+    fn clone(&self) -> Self {
+        with_thread_local_cuda_streams(|streams| {
+            Self(self.0.iter().map(|elem| elem.duplicate(streams)).collect())
+        })
+    }
+}
+
+impl<T> ArrayBackend for GpuIntegerArrayBackend<T>
+where
+    T: CudaIntegerRadixCiphertext,
+{
+    type Slice<'a>
+        = GpuSlice<'a, T>
+    where
+        Self: 'a;
+    type SliceMut<'a>
+        = GpuSliceMut<'a, T>
+    where
+        Self: 'a;
+    type Owned = GpuOwned<T>;
+}
+
+impl<'a, T> TensorSlice<'a, GpuSlice<'a, T>> {
+    pub fn iter(self) -> StridedIter<'a, T> {
+        StridedIter::new(self.slice.0, self.dims.clone())
+    }
+
+    pub fn par_iter(self) -> ParStridedIter<'a, T> {
+        ParStridedIter::new(self.slice.0, self.dims.clone())
+    }
+}
+
+impl<'a, T> TensorSlice<'a, GpuSliceMut<'a, T>> {
+    pub fn par_iter_mut(self) -> ParStridedIterMut<'a, T> {
+        ParStridedIterMut::new(self.slice.0, self.dims.clone())
+    }
+}
+
+impl<T> From<Vec<T>> for GpuOwned<T> {
+    fn from(value: Vec<T>) -> Self {
+        Self(value)
+    }
+}
+
+#[inline]
+#[track_caller]
+fn par_map_sks_op_on_pair_of_elements<'a, T, F>(
+    lhs: TensorSlice<'a, GpuSlice<'a, T>>,
+    rhs: TensorSlice<'a, GpuSlice<'a, T>>,
+    op: F,
+) -> GpuOwned<T>
+where
+    T: CudaIntegerRadixCiphertext + Send + Sync,
+    F: Send + Sync + Fn(&crate::integer::gpu::CudaServerKey, &T, &T, &CudaStreams) -> T,
+{
+    GpuOwned(global_state::with_cuda_internal_keys(|cuda_key| {
+        with_thread_local_cuda_streams(|streams| {
+            lhs.par_iter()
+                .zip(rhs.par_iter())
+                .map(|(lhs, rhs)| op(cuda_key, lhs, rhs, streams))
+                .collect::<Vec<_>>()
+        })
+    }))
+}
+
+impl<T> ArithmeticArrayBackend for GpuIntegerArrayBackend<T>
+where
+    T: CudaIntegerRadixCiphertext + Send + Sync,
+{
+    fn add_slices<'a>(
+        lhs: TensorSlice<'_, Self::Slice<'a>>,
+        rhs: TensorSlice<'_, Self::Slice<'a>>,
+    ) -> Self::Owned {
+        par_map_sks_op_on_pair_of_elements(lhs, rhs, crate::integer::gpu::CudaServerKey::add)
+    }
+
+    fn sub_slices<'a>(
+        lhs: TensorSlice<'_, Self::Slice<'a>>,
+        rhs: TensorSlice<'_, Self::Slice<'a>>,
+    ) -> Self::Owned {
+        par_map_sks_op_on_pair_of_elements(lhs, rhs, crate::integer::gpu::CudaServerKey::sub)
+    }
+
+    fn mul_slices<'a>(
+        lhs: TensorSlice<'_, Self::Slice<'a>>,
+        rhs: TensorSlice<'_, Self::Slice<'a>>,
+    ) -> Self::Owned {
+        par_map_sks_op_on_pair_of_elements(lhs, rhs, crate::integer::gpu::CudaServerKey::mul)
+    }
+
+    fn div_slices<'a>(
+        lhs: TensorSlice<'_, Self::Slice<'a>>,
+        rhs: TensorSlice<'_, Self::Slice<'a>>,
+    ) -> Self::Owned {
+        par_map_sks_op_on_pair_of_elements(lhs, rhs, crate::integer::gpu::CudaServerKey::div)
+    }
+
+    fn rem_slices<'a>(
+        lhs: TensorSlice<'_, Self::Slice<'a>>,
+        rhs: TensorSlice<'_, Self::Slice<'a>>,
+    ) -> Self::Owned {
+        par_map_sks_op_on_pair_of_elements(lhs, rhs, crate::integer::gpu::CudaServerKey::rem)
+    }
+}
+
+#[inline]
+#[track_caller]
+fn par_map_sks_scalar_op_on_pair_of_elements<'a, T, Clear, F>(
+    lhs: TensorSlice<'a, GpuSlice<'a, T>>,
+    rhs: TensorSlice<'a, &'a [Clear]>,
+    op: F,
+) -> GpuOwned<T>
+where
+    T: CudaIntegerRadixCiphertext + Send + Sync,
+    Clear: Copy + Send + Sync,
+    F: Send + Sync + Fn(&crate::integer::gpu::CudaServerKey, &T, Clear, &CudaStreams) -> T,
+{
+    GpuOwned(global_state::with_cuda_internal_keys(|cuda_key| {
+        with_thread_local_cuda_streams(|streams| {
+            lhs.par_iter()
+                .zip(rhs.par_iter())
+                .map(|(lhs, rhs)| op(cuda_key, lhs, *rhs, streams))
+                .collect::<Vec<_>>()
+        })
+    }))
+}
+
+impl<Clear> ClearArithmeticArrayBackend<Clear>
+    for GpuIntegerArrayBackend<CudaUnsignedRadixCiphertext>
+where
+    Clear: DecomposableInto<u8>
+        + std::ops::Not<Output = Clear>
+        + std::ops::Add<Clear, Output = Clear>
+        + ScalarMultiplier
+        + Reciprocable
+        + CastInto<u64>,
+{
+    fn add_slices(
+        lhs: TensorSlice<'_, Self::Slice<'_>>,
+        rhs: TensorSlice<'_, &'_ [Clear]>,
+    ) -> Self::Owned {
+        par_map_sks_scalar_op_on_pair_of_elements(
+            lhs,
+            rhs,
+            crate::integer::gpu::CudaServerKey::scalar_add,
+        )
+    }
+
+    fn sub_slices(
+        lhs: TensorSlice<'_, Self::Slice<'_>>,
+        rhs: TensorSlice<'_, &'_ [Clear]>,
+    ) -> Self::Owned {
+        par_map_sks_scalar_op_on_pair_of_elements(
+            lhs,
+            rhs,
+            crate::integer::gpu::CudaServerKey::scalar_sub,
+        )
+    }
+
+    fn mul_slices(
+        lhs: TensorSlice<'_, Self::Slice<'_>>,
+        rhs: TensorSlice<'_, &'_ [Clear]>,
+    ) -> Self::Owned {
+        par_map_sks_scalar_op_on_pair_of_elements(
+            lhs,
+            rhs,
+            crate::integer::gpu::CudaServerKey::scalar_mul,
+        )
+    }
+
+    fn div_slices(
+        lhs: TensorSlice<'_, Self::Slice<'_>>,
+        rhs: TensorSlice<'_, &'_ [Clear]>,
+    ) -> Self::Owned {
+        par_map_sks_scalar_op_on_pair_of_elements(
+            lhs,
+            rhs,
+            crate::integer::gpu::CudaServerKey::scalar_div,
+        )
+    }
+
+    fn rem_slices(
+        lhs: TensorSlice<'_, Self::Slice<'_>>,
+        rhs: TensorSlice<'_, &'_ [Clear]>,
+    ) -> Self::Owned {
+        par_map_sks_scalar_op_on_pair_of_elements(
+            lhs,
+            rhs,
+            crate::integer::gpu::CudaServerKey::scalar_rem,
+        )
+    }
+}
+
+impl<Clear> ClearArithmeticArrayBackend<Clear>
+    for GpuIntegerArrayBackend<CudaSignedRadixCiphertext>
+where
+    Clear: DecomposableInto<u8>
+        + std::ops::Not<Output = Clear>
+        + std::ops::Add<Clear, Output = Clear>
+        + ScalarMultiplier
+        + SignedReciprocable,
+{
+    fn add_slices(
+        lhs: TensorSlice<'_, Self::Slice<'_>>,
+        rhs: TensorSlice<'_, &'_ [Clear]>,
+    ) -> Self::Owned {
+        par_map_sks_scalar_op_on_pair_of_elements(
+            lhs,
+            rhs,
+            crate::integer::gpu::CudaServerKey::scalar_add,
+        )
+    }
+
+    fn sub_slices(
+        lhs: TensorSlice<'_, Self::Slice<'_>>,
+        rhs: TensorSlice<'_, &'_ [Clear]>,
+    ) -> Self::Owned {
+        par_map_sks_scalar_op_on_pair_of_elements(
+            lhs,
+            rhs,
+            crate::integer::gpu::CudaServerKey::scalar_sub,
+        )
+    }
+
+    fn mul_slices(
+        lhs: TensorSlice<'_, Self::Slice<'_>>,
+        rhs: TensorSlice<'_, &'_ [Clear]>,
+    ) -> Self::Owned {
+        par_map_sks_scalar_op_on_pair_of_elements(
+            lhs,
+            rhs,
+            crate::integer::gpu::CudaServerKey::scalar_mul,
+        )
+    }
+
+    fn div_slices(
+        lhs: TensorSlice<'_, Self::Slice<'_>>,
+        rhs: TensorSlice<'_, &'_ [Clear]>,
+    ) -> Self::Owned {
+        par_map_sks_scalar_op_on_pair_of_elements(
+            lhs,
+            rhs,
+            crate::integer::gpu::CudaServerKey::signed_scalar_div,
+        )
+    }
+
+    fn rem_slices(
+        lhs: TensorSlice<'_, Self::Slice<'_>>,
+        rhs: TensorSlice<'_, &'_ [Clear]>,
+    ) -> Self::Owned {
+        par_map_sks_scalar_op_on_pair_of_elements(
+            lhs,
+            rhs,
+            crate::integer::gpu::CudaServerKey::signed_scalar_rem,
+        )
+    }
+}
+
+impl<T> BitwiseArrayBackend for GpuIntegerArrayBackend<T>
+where
+    T: CudaIntegerRadixCiphertext + Send + Sync,
+{
+    fn bitand<'a>(
+        lhs: TensorSlice<'_, Self::Slice<'a>>,
+        rhs: TensorSlice<'_, Self::Slice<'a>>,
+    ) -> Self::Owned {
+        par_map_sks_op_on_pair_of_elements(lhs, rhs, crate::integer::gpu::CudaServerKey::bitand)
+    }
+
+    fn bitor<'a>(
+        lhs: TensorSlice<'_, Self::Slice<'a>>,
+        rhs: TensorSlice<'_, Self::Slice<'a>>,
+    ) -> Self::Owned {
+        par_map_sks_op_on_pair_of_elements(lhs, rhs, crate::integer::gpu::CudaServerKey::bitor)
+    }
+
+    fn bitxor<'a>(
+        lhs: TensorSlice<'_, Self::Slice<'a>>,
+        rhs: TensorSlice<'_, Self::Slice<'a>>,
+    ) -> Self::Owned {
+        par_map_sks_op_on_pair_of_elements(lhs, rhs, crate::integer::gpu::CudaServerKey::bitxor)
+    }
+
+    fn bitnot(lhs: TensorSlice<'_, Self::Slice<'_>>) -> Self::Owned {
+        GpuOwned(global_state::with_cuda_internal_keys(|cuda_key| {
+            with_thread_local_cuda_streams(|streams| {
+                lhs.par_iter()
+                    .map(|lhs| cuda_key.bitnot(lhs, streams))
+                    .collect::<Vec<_>>()
+            })
+        }))
+    }
+}
+
+impl<T, Clear> ClearBitwiseArrayBackend<Clear> for GpuIntegerArrayBackend<T>
+where
+    T: CudaIntegerRadixCiphertext + Send + Sync,
+    Clear: DecomposableInto<u8>,
+{
+    fn bitand_slice(
+        lhs: TensorSlice<'_, Self::Slice<'_>>,
+        rhs: TensorSlice<'_, &'_ [Clear]>,
+    ) -> Self::Owned {
+        par_map_sks_scalar_op_on_pair_of_elements(
+            lhs,
+            rhs,
+            crate::integer::gpu::CudaServerKey::scalar_bitand,
+        )
+    }
+
+    fn bitor_slice(
+        lhs: TensorSlice<'_, Self::Slice<'_>>,
+        rhs: TensorSlice<'_, &'_ [Clear]>,
+    ) -> Self::Owned {
+        par_map_sks_scalar_op_on_pair_of_elements(
+            lhs,
+            rhs,
+            crate::integer::gpu::CudaServerKey::scalar_bitor,
+        )
+    }
+
+    fn bitxor_slice(
+        lhs: TensorSlice<'_, Self::Slice<'_>>,
+        rhs: TensorSlice<'_, &'_ [Clear]>,
+    ) -> Self::Owned {
+        par_map_sks_scalar_op_on_pair_of_elements(
+            lhs,
+            rhs,
+            crate::integer::gpu::CudaServerKey::scalar_bitxor,
+        )
+    }
+}
+
+impl<T> BackendDataContainer for GpuOwned<T>
+where
+    T: CudaIntegerRadixCiphertext,
+{
+    type Backend = GpuIntegerArrayBackend<T>;
+
+    fn len(&self) -> usize {
+        self.0.len()
+    }
+
+    fn as_sub_slice(
+        &self,
+        range: impl RangeBounds<usize>,
+    ) -> <Self::Backend as ArrayBackend>::Slice<'_> {
+        GpuSlice(create_sub_slice_with_bound(self.0.as_slice(), range))
+    }
+
+    fn into_owned(self) -> <Self::Backend as ArrayBackend>::Owned {
+        self
+    }
+}
+
+impl<T> BackendDataContainerMut for GpuOwned<T>
+where
+    T: CudaIntegerRadixCiphertext,
+{
+    fn as_sub_slice_mut(
+        &mut self,
+        range: impl RangeBounds<usize>,
+    ) -> <Self::Backend as ArrayBackend>::SliceMut<'_> {
+        GpuSliceMut(create_sub_mut_slice_with_bound(
+            self.0.as_mut_slice(),
+            range,
+        ))
+    }
+}
+
+impl<'a, T> BackendDataContainer for GpuSlice<'a, T>
+where
+    T: CudaIntegerRadixCiphertext,
+{
+    type Backend = GpuIntegerArrayBackend<T>;
+
+    fn len(&self) -> usize {
+        <[T]>::len(self.0)
+    }
+
+    fn as_sub_slice(
+        &self,
+        range: impl RangeBounds<usize>,
+    ) -> <Self::Backend as ArrayBackend>::Slice<'_> {
+        GpuSlice(create_sub_slice_with_bound(self.0, range))
+    }
+
+    fn into_owned(self) -> <Self::Backend as ArrayBackend>::Owned {
+        with_thread_local_cuda_streams(|streams| {
+            GpuOwned(self.0.iter().map(|elem| elem.duplicate(streams)).collect())
+        })
+    }
+}
+
+impl<'a, T> BackendDataContainer for GpuSliceMut<'a, T>
+where
+    T: CudaIntegerRadixCiphertext,
+{
+    type Backend = GpuIntegerArrayBackend<T>;
+
+    fn len(&self) -> usize {
+        <[T]>::len(self.0)
+    }
+
+    fn as_sub_slice(
+        &self,
+        range: impl RangeBounds<usize>,
+    ) -> <Self::Backend as ArrayBackend>::Slice<'_> {
+        GpuSlice(create_sub_slice_with_bound(self.0, range))
+    }
+
+    fn into_owned(self) -> <Self::Backend as ArrayBackend>::Owned {
+        with_thread_local_cuda_streams(|streams| {
+            GpuOwned(self.0.iter().map(|elem| elem.duplicate(streams)).collect())
+        })
+    }
+}
+
+impl<'a, T> BackendDataContainerMut for GpuSliceMut<'a, T>
+where
+    T: CudaIntegerRadixCiphertext,
+{
+    fn as_sub_slice_mut(
+        &mut self,
+        range: impl RangeBounds<usize>,
+    ) -> <Self::Backend as ArrayBackend>::SliceMut<'_> {
+        GpuSliceMut(create_sub_mut_slice_with_bound(self.0, range))
+    }
+}
+
+impl<'a, Clear, Id> FheTryEncrypt<&'a [Clear], ClientKey>
+    for FheArrayBase<GpuOwned<CudaUnsignedRadixCiphertext>, Id>
+where
+    Id: FheUintId,
+    Clear: DecomposableInto<u64> + UnsignedNumeric,
+{
+    type Error = Error;
+
+    fn try_encrypt(clears: &'a [Clear], key: &ClientKey) -> Result<Self, Self::Error> {
+        let num_blocks = Id::num_blocks(key.message_modulus());
+        Ok(Self::new(
+            with_thread_local_cuda_streams(|streams| {
+                clears
+                    .iter()
+                    .copied()
+                    .map(|clear| {
+                        CudaUnsignedRadixCiphertext::from_radix_ciphertext(
+                            &key.key.key.encrypt_radix(clear, num_blocks),
+                            streams,
+                        )
+                    })
+                    .collect::<Vec<_>>()
+            }),
+            vec![clears.len()],
+        ))
+    }
+}
+
+impl<'a, Clear, Id> FheTryEncrypt<(&'a [Clear], Vec<usize>), ClientKey>
+    for FheArrayBase<GpuOwned<CudaUnsignedRadixCiphertext>, Id>
+where
+    Id: FheUintId,
+    Clear: DecomposableInto<u64> + UnsignedNumeric,
+{
+    type Error = Error;
+
+    fn try_encrypt(
+        (clears, shape): (&'a [Clear], Vec<usize>),
+        key: &ClientKey,
+    ) -> Result<Self, Self::Error> {
+        if clears.len() != shape.iter().copied().product::<usize>() {
+            return Err(crate::Error::new(
+                "Shape does not match the number of elements given".to_string(),
+            ));
+        }
+        let num_blocks = Id::num_blocks(key.message_modulus());
+        let elems = with_thread_local_cuda_streams(|streams| {
+            clears
+                .iter()
+                .copied()
+                .map(|clear| {
+                    CudaUnsignedRadixCiphertext::from_radix_ciphertext(
+                        &key.key.key.encrypt_radix(clear, num_blocks),
+                        streams,
+                    )
+                })
+                .collect::<Vec<_>>()
+        });
+        let data = Self::new(elems, shape);
+        Ok(data)
+    }
+}
+
+impl<Clear, Id> FheDecrypt<Vec<Clear>> for GpuFheUintArray<Id>
+where
+    Id: FheUintId,
+    Clear: RecomposableFrom<u64> + UnsignedNumeric,
+{
+    fn decrypt(&self, key: &ClientKey) -> Vec<Clear> {
+        self.as_slice().decrypt(key)
+    }
+}
+
+impl<'a, Clear, Id> FheDecrypt<Vec<Clear>> for GpuFheUintSliceMut<'a, Id>
+where
+    Id: FheUintId,
+    Clear: RecomposableFrom<u64> + UnsignedNumeric,
+{
+    fn decrypt(&self, key: &ClientKey) -> Vec<Clear> {
+        self.as_slice().decrypt(key)
+    }
+}
+
+impl<'a, Clear, Id> FheDecrypt<Vec<Clear>> for GpuFheUintSlice<'a, Id>
+where
+    Id: FheUintId,
+    Clear: RecomposableFrom<u64> + UnsignedNumeric,
+{
+    fn decrypt(&self, key: &ClientKey) -> Vec<Clear> {
+        with_thread_local_cuda_streams(|streams| {
+            self.as_tensor_slice()
+                .iter()
+                .map(|ct: &CudaUnsignedRadixCiphertext| {
+                    key.key.key.decrypt_radix(&ct.to_radix_ciphertext(streams))
+                })
+                .collect()
+        })
+    }
+}
+
+impl<'a, Clear, Id> FheTryEncrypt<&'a [Clear], ClientKey> for GpuFheIntArray<Id>
+where
+    Id: FheIntId,
+    Clear: DecomposableInto<u64> + SignedNumeric,
+{
+    type Error = Error;
+
+    fn try_encrypt(clears: &'a [Clear], key: &ClientKey) -> Result<Self, Self::Error> {
+        let num_blocks = Id::num_blocks(key.message_modulus());
+        Ok(Self::new(
+            with_thread_local_cuda_streams(|streams| {
+                clears
+                    .iter()
+                    .copied()
+                    .map(|clear| {
+                        CudaSignedRadixCiphertext::from_signed_radix_ciphertext(
+                            &key.key.key.encrypt_signed_radix(clear, num_blocks),
+                            streams,
+                        )
+                    })
+                    .collect::<Vec<_>>()
+            }),
+            vec![clears.len()],
+        ))
+    }
+}
+
+impl<Clear, Id> FheDecrypt<Vec<Clear>> for GpuFheIntArray<Id>
+where
+    Id: FheIntId,
+    Clear: RecomposableSignedInteger,
+{
+    fn decrypt(&self, key: &ClientKey) -> Vec<Clear> {
+        self.as_slice().decrypt(key)
+    }
+}
+
+impl<'a, Clear, Id> FheDecrypt<Vec<Clear>> for GpuFheIntSliceMut<'a, Id>
+where
+    Id: FheIntId,
+    Clear: RecomposableSignedInteger,
+{
+    fn decrypt(&self, key: &ClientKey) -> Vec<Clear> {
+        self.as_slice().decrypt(key)
+    }
+}
+
+impl<'a, Clear, Id> FheDecrypt<Vec<Clear>> for GpuFheIntSlice<'a, Id>
+where
+    Id: FheIntId,
+    Clear: RecomposableSignedInteger,
+{
+    fn decrypt(&self, key: &ClientKey) -> Vec<Clear> {
+        with_thread_local_cuda_streams(|streams| {
+            self.elems
+                .0
+                .iter()
+                .map(|ct| {
+                    key.key
+                        .key
+                        .decrypt_signed_radix(&ct.to_signed_radix_ciphertext(streams))
+                })
+                .collect()
+        })
+    }
+}
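A similar sketch for the integer backends (again illustrative, not part of the patch): it assumes `client_key` is set up as in the previous sketch, that `GpuFheUint32Array` is one of the concrete aliases declared by the array/mod.rs macro below, and that the `+` operator comes from the existing generic ops over `ArithmeticArrayBackend`:

    use tfhe::array::GpuFheUint32Array;
    use tfhe::prelude::{FheDecrypt, FheTryEncrypt};

    // Flat 1D arrays; try_encrypt records the shape as vec![len]
    let a = GpuFheUint32Array::try_encrypt(&[2u32, 4, 6][..], &client_key).unwrap();
    let b = GpuFheUint32Array::try_encrypt(&[1u32, 2, 3][..], &client_key).unwrap();

    // Element-wise add: each pair goes through CudaServerKey::add, in parallel via rayon
    let sum = &a + &b;
    let clear: Vec<u32> = sum.decrypt(&client_key);
    assert_eq!(clear, vec![3, 6, 9]);

    // Shaped (2x2) encryption: the shape must match the number of elements,
    // otherwise try_encrypt returns an error
    let shaped =
        GpuFheUint32Array::try_encrypt((&[1u32, 2, 3, 4][..], vec![2, 2]), &client_key).unwrap();
    let flat: Vec<u32> = shaped.decrypt(&client_key);
    assert_eq!(flat, vec![1, 2, 3, 4]);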
diff --git a/tfhe/src/high_level_api/array/gpu/mod.rs b/tfhe/src/high_level_api/array/gpu/mod.rs
new file mode 100644
index 0000000000..62efcd1695
--- /dev/null
+++ b/tfhe/src/high_level_api/array/gpu/mod.rs
@@ -0,0 +1,9 @@
+pub(crate) mod booleans;
+pub(crate) mod integers;
+#[cfg(test)]
+pub use booleans::GpuFheBoolArrayBackend;
+pub use booleans::{GpuFheBoolArray, GpuFheBoolSlice, GpuFheBoolSliceMut};
+pub use integers::{
+    GpuFheIntArray, GpuFheIntSlice, GpuFheIntSliceMut, GpuFheUintArray, GpuFheUintSlice,
+    GpuFheUintSliceMut,
+};
diff --git a/tfhe/src/high_level_api/array/mod.rs b/tfhe/src/high_level_api/array/mod.rs
index 16cbf8b5d4..5ee745a489 100644
--- a/tfhe/src/high_level_api/array/mod.rs
+++ b/tfhe/src/high_level_api/array/mod.rs
@@ -1,6 +1,8 @@
 mod clear_ops;
 mod cpu;
 mod dynamic;
+#[cfg(feature = "gpu")]
+mod gpu;
 mod helpers;
 mod ops;
 pub mod stride;
@@ -26,6 +28,11 @@ pub use dynamic::{
     FheBoolArray, FheBoolSlice, FheBoolSliceMut, FheIntArray, FheIntSlice, FheIntSliceMut,
     FheUintArray, FheUintSlice, FheUintSliceMut,
 };
+#[cfg(feature = "gpu")]
+pub use gpu::{
+    GpuFheBoolArray, GpuFheBoolSlice, GpuFheBoolSliceMut, GpuFheIntArray, GpuFheIntSlice,
+    GpuFheIntSliceMut, GpuFheUintArray, GpuFheUintSlice, GpuFheUintSliceMut,
+};
 
 /// The base struct for Fhe array types.
 ///
@@ -289,6 +296,14 @@ macro_rules! declare_concrete_array_types {
             pub type [<CpuFheUint $num_bits Slice>]<'a> = CpuFheUintSlice<'a, crate::[<FheUint $num_bits Id>]>;
             pub type [<CpuFheUint $num_bits SliceMut>]<'a> = CpuFheUintSliceMut<'a, crate::[<FheUint $num_bits Id>]>;
 
+            // Instantiate Array Types for Gpu backend
+            #[cfg(feature="gpu")]
+            pub type [<GpuFheUint $num_bits Array>] = GpuFheUintArray<crate::[<FheUint $num_bits Id>]>;
+            #[cfg(feature="gpu")]
+            pub type [<GpuFheUint $num_bits Slice>]<'a> = GpuFheUintSlice<'a, crate::[<FheUint $num_bits Id>]>;
+            #[cfg(feature="gpu")]
+            pub type [<GpuFheUint $num_bits SliceMut>]<'a> = GpuFheUintSliceMut<'a, crate::[<FheUint $num_bits Id>]>;
+
         )*
     }
 }
@@ -308,6 +323,14 @@ macro_rules! declare_concrete_array_types {
             pub type [<CpuFheInt $num_bits Slice>]<'a> = CpuFheIntSlice<'a, crate::[<FheInt $num_bits Id>]>;
             pub type [<CpuFheInt $num_bits SliceMut>]<'a> = CpuFheIntSliceMut<'a, crate::[<FheInt $num_bits Id>]>;
 
+            // Instantiate Array Types for Gpu backend
+            #[cfg(feature="gpu")]
+            pub type [<GpuFheInt $num_bits Array>] = GpuFheIntArray<crate::[<FheInt $num_bits Id>]>;
+            #[cfg(feature="gpu")]
+            pub type [<GpuFheInt $num_bits Slice>]<'a> = GpuFheIntSlice<'a, crate::[<FheInt $num_bits Id>]>;
+            #[cfg(feature="gpu")]
+            pub type [<GpuFheInt $num_bits SliceMut>]<'a> = GpuFheIntSliceMut<'a, crate::[<FheInt $num_bits Id>]>;
+
         )*
     }
 }
diff --git a/tfhe/src/high_level_api/array/tests/booleans.rs b/tfhe/src/high_level_api/array/tests/booleans.rs
index 870fa35be9..a17f297dbf 100644
--- a/tfhe/src/high_level_api/array/tests/booleans.rs
+++ b/tfhe/src/high_level_api/array/tests/booleans.rs
@@ -8,6 +8,17 @@ fn test_cpu_only_bitand() {
     >(&ck);
 }
 
+#[test]
+#[cfg(feature = "gpu")]
+fn test_gpu_only_bitand() {
+    let ck = super::setup_default_gpu();
+    super::bitand_test_case::<
+        crate::FheBoolId,
+        crate::high_level_api::array::gpu::GpuFheBoolArrayBackend,
+        bool,
+    >(&ck);
+}
+
 #[test]
 fn test_cpu_dyn_bitand() {
     let ck = super::setup_default_cpu();
@@ -24,6 +35,13 @@ fn test_cpu_only_bitor() {
     super::bitor_test_case::<crate::FheBoolId, CpuFheBoolArrayBackend, bool>(&ck);
 }
 
+#[test]
+#[cfg(feature = "gpu")]
+fn test_gpu_only_bitor() {
+    let ck = super::setup_default_gpu();
+    super::bitor_test_case::<crate::FheBoolId, GpuFheBoolArrayBackend, bool>(&ck);
+}
+
 #[test]
 fn test_cpu_dyn_bitor() {
     let ck = super::setup_default_cpu();
@@ -36,6 +54,13 @@ fn test_cpu_only_bitxor() {
     super::bitxor_test_case::<crate::FheBoolId, CpuFheBoolArrayBackend, bool>(&ck);
 }
 
+#[test]
+#[cfg(feature = "gpu")]
+fn test_gpu_only_bitxor() {
+    let ck = super::setup_default_gpu();
+    super::bitxor_test_case::<crate::FheBoolId, GpuFheBoolArrayBackend, bool>(&ck);
+}
+
 #[test]
 fn test_cpu_dyn_bitxor() {
     let ck = super::setup_default_cpu();
@@ -48,6 +73,15 @@ fn test_cpu_only_bitand_scalar_slice() {
     super::bitand_scalar_slice_test_case::<crate::FheBoolId, CpuFheBoolArrayBackend, bool>(&ck);
 }
 
+#[test]
+#[cfg(feature = "gpu")]
+fn test_gpu_only_bitand_scalar_slice() {
+    let ck = super::setup_default_gpu();
+    super::bitand_scalar_slice_test_case::<crate::FheBoolId, GpuFheBoolArrayBackend, bool>(
+        &ck,
+    );
+}
+
 #[test]
 fn test_cpu_dyn_bitand_scalar_slice() {
     let ck = super::setup_default_cpu();
diff --git a/tfhe/src/high_level_api/array/tests/mod.rs b/tfhe/src/high_level_api/array/tests/mod.rs
index 6e1a9b57b3..bc0d6ca562 100644
--- a/tfhe/src/high_level_api/array/tests/mod.rs
+++ b/tfhe/src/high_level_api/array/tests/mod.rs
@@ -3,6 +3,8 @@ mod signed;
 mod unsigned;
 
 use crate::{generate_keys, set_server_key, ClientKey, ConfigBuilder, FheId};
+#[cfg(feature = "gpu")]
+use crate::{Config, CudaServerKey};
 use rand::distributions::{Distribution, Standard};
 use rand::random;
 use std::fmt::Debug;
@@ -13,6 +15,14 @@ use crate::high_level_api::array::{FheBackendArray, FheBackendArraySlice};
 use crate::prelude::{FheDecrypt, FheTryEncrypt};
 use std::ops::{BitAnd, BitOr, BitXor};
 
+#[cfg(feature = "gpu")]
+pub(crate) fn generate_cuda_keys<C: Into<Config>>(config: C) -> (ClientKey, CudaServerKey) {
+    let client_kc = ClientKey::generate(config);
+    let server_kc = client_kc.generate_compressed_server_key();
+    let cuda_server_kc = server_kc.decompress_to_gpu();
+
+    (client_kc, cuda_server_kc)
+}
 fn draw_random_values<T>(num_values: usize) -> Vec<T>
 where
     Standard: Distribution<T>,
@@ -28,6 +38,15 @@ fn setup_default_cpu() -> ClientKey {
     ck
 }
 
+#[cfg(feature = "gpu")]
+fn setup_default_gpu() -> ClientKey {
+    let config = ConfigBuilder::default().build();
+    let (ck, sk) = generate_cuda_keys(config);
+    set_server_key(sk);
+
+    ck
+}
+
 fn bitand_test_case<Id, B, Clear>(ck: &ClientKey)
 where
     Id: FheId,
diff --git a/tfhe/src/high_level_api/array/tests/signed.rs b/tfhe/src/high_level_api/array/tests/signed.rs
index ad6da41ce4..81a9cdd82e 100644
--- a/tfhe/src/high_level_api/array/tests/signed.rs
+++ b/tfhe/src/high_level_api/array/tests/signed.rs
@@ -8,6 +8,17 @@ fn test_cpu_only_bitand() {
     >(&ck);
 }
 
+#[test]
+#[cfg(feature = "gpu")]
+fn test_gpu_only_bitand() {
+    let ck = super::setup_default_gpu();
+    super::bitand_test_case::<
+        crate::FheInt32Id,
+        crate::high_level_api::array::gpu::integers::GpuIntArrayBackend,
+        i32,
+    >(&ck);
+}
+
 #[test]
 fn test_cpu_dyn_bitand() {
     let ck = super::setup_default_cpu();
@@ -24,6 +35,13 @@ fn test_cpu_only_bitor() {
     super::bitor_test_case::<crate::FheInt32Id, CpuIntArrayBackend, i32>(&ck);
 }
 
+#[test]
+#[cfg(feature = "gpu")]
+fn test_gpu_only_bitor() {
+    let ck = super::setup_default_gpu();
+    super::bitor_test_case::<crate::FheInt32Id, GpuIntArrayBackend, i32>(&ck);
+}
+
 #[test]
 fn test_cpu_dyn_bitor() {
     let ck = super::setup_default_cpu();
@@ -36,6 +54,13 @@ fn test_cpu_only_bitxor() {
     super::bitxor_test_case::<crate::FheInt32Id, CpuIntArrayBackend, i32>(&ck);
 }
 
+#[test]
+#[cfg(feature = "gpu")]
+fn test_gpu_only_bitxor() {
+    let ck = super::setup_default_gpu();
+    super::bitxor_test_case::<crate::FheInt32Id, GpuIntArrayBackend, i32>(&ck);
+}
+
 #[test]
 fn test_cpu_dyn_bitxor() {
     let ck = super::setup_default_cpu();
@@ -48,6 +73,13 @@ fn test_cpu_only_bitand_scalar_slice() {
     super::bitand_scalar_slice_test_case::<crate::FheInt32Id, CpuIntArrayBackend, i32>(&ck);
 }
 
+#[test]
+#[cfg(feature = "gpu")]
+fn test_gpu_only_bitand_scalar_slice() {
+    let ck = super::setup_default_gpu();
+    super::bitand_scalar_slice_test_case::<crate::FheInt32Id, GpuIntArrayBackend, i32>(&ck);
+}
+
 #[test]
 fn test_cpu_dyn_bitand_scalar_slice() {
     let ck = super::setup_default_cpu();
diff --git a/tfhe/src/high_level_api/array/tests/unsigned.rs b/tfhe/src/high_level_api/array/tests/unsigned.rs
index 15ca5009ab..17cda31464 100644
--- a/tfhe/src/high_level_api/array/tests/unsigned.rs
+++ b/tfhe/src/high_level_api/array/tests/unsigned.rs
@@ -14,6 +14,17 @@ fn test_cpu_only_bitand() {
     >(&ck);
 }
 
+#[test]
+#[cfg(feature = "gpu")]
+fn test_gpu_only_bitand() {
+    let ck = super::setup_default_gpu();
+    super::bitand_test_case::<
+        crate::FheUint32Id,
+        crate::high_level_api::array::gpu::integers::GpuUintArrayBackend,
+        u32,
+    >(&ck);
+}
+
 #[test]
 fn test_cpu_dyn_bitand() {
     let ck = super::setup_default_cpu();
@@ -30,6 +41,13 @@ fn test_cpu_only_bitor() {
     super::bitor_test_case::<crate::FheUint32Id, CpuUintArrayBackend, u32>(&ck);
 }
 
+#[test]
+#[cfg(feature = "gpu")]
+fn test_gpu_only_bitor() {
+    let ck = super::setup_default_gpu();
+    super::bitor_test_case::<crate::FheUint32Id, GpuUintArrayBackend, u32>(&ck);
+}
+
 #[test]
 fn test_cpu_dyn_bitor() {
     let ck = super::setup_default_cpu();
@@ -42,6 +60,13 @@ fn test_cpu_only_bitxor() {
     super::bitxor_test_case::<crate::FheUint32Id, CpuUintArrayBackend, u32>(&ck);
 }
 
+#[test]
+#[cfg(feature = "gpu")]
+fn test_gpu_only_bitxor() {
+    let ck = super::setup_default_gpu();
+    super::bitxor_test_case::<crate::FheUint32Id, GpuUintArrayBackend, u32>(&ck);
+}
+
 #[test]
 fn test_cpu_dyn_bitxor() {
     let ck = super::setup_default_cpu();
@@ -53,6 +78,13 @@ fn test_cpu_only_bitand_scalar_slice() {
     super::bitand_scalar_slice_test_case::<crate::FheUint32Id, CpuUintArrayBackend, u32>(&ck);
 }
 
+#[test]
+#[cfg(feature = "gpu")]
+fn test_gpu_only_bitand_scalar_slice() {
+    let ck = super::setup_default_gpu();
+    super::bitand_scalar_slice_test_case::<crate::FheUint32Id, GpuUintArrayBackend, u32>(&ck);
+}
+
 #[test]
 fn test_cpu_dyn_bitand_scalar_slice() {
     let ck = super::setup_default_cpu();
diff --git a/tfhe/src/high_level_api/global_state.rs b/tfhe/src/high_level_api/global_state.rs
index 1593ede673..c134eda58c 100644
--- a/tfhe/src/high_level_api/global_state.rs
+++ b/tfhe/src/high_level_api/global_state.rs
@@ -4,7 +4,10 @@ use crate::core_crypto::gpu::CudaStreams;
 use crate::high_level_api::errors::{UninitializedServerKey, UnwrapResultExt};
 use crate::high_level_api::keys::{InternalServerKey, ServerKey};
+#[cfg(feature = "gpu")]
+use crate::integer::gpu::CudaServerKey;
 use std::cell::RefCell;
+
 /// We store the internal keys as thread local, meaning each thread has its own set of keys.
 ///
 /// This means that the user can do computations in multiple threads
@@ -156,6 +159,28 @@ where
     })
 }
 
+#[inline]
+#[cfg(feature = "gpu")]
+pub(crate) fn with_cuda_internal_keys<T, F>(func: F) -> T
+where
+    F: FnOnce(&CudaServerKey) -> T,
+{
+    // Should use `with_borrow` when it's stabilized
+    INTERNAL_KEYS.with(|keys| {
+        let maybe_key = &*keys.borrow();
+        let key = maybe_key
+            .as_ref()
+            .ok_or(UninitializedServerKey)
+            .unwrap_display();
+        match key {
+            InternalServerKey::Cuda(key) => func(&key.key.key),
+            InternalServerKey::Cpu(_) => {
+                panic!("Cuda key requested but only cpu key is available")
+            }
+        }
+    })
+}
+
 #[cfg(feature = "gpu")]
 thread_local! {
     static CUDA_STREAMS: std::cell::OnceCell<CudaStreams> = std::cell::OnceCell::from(CudaStreams::new_multi_gpu());
diff --git a/tfhe/src/lib.rs b/tfhe/src/lib.rs
index 93b220b05e..f57c9f047c 100644
--- a/tfhe/src/lib.rs
+++ b/tfhe/src/lib.rs
@@ -72,6 +72,7 @@
 #![cfg_attr(all(doc, not(doctest)), feature(doc_auto_cfg))]
 #![cfg_attr(all(doc, not(doctest)), feature(doc_cfg))]
 #![warn(rustdoc::broken_intra_doc_links)]
+extern crate core;
 
 #[cfg(feature = "__c_api")]
 pub mod c_api;
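A final behavioral note on key dispatch: `with_cuda_internal_keys` panics when only a CPU key has been set, so the GPU array types fail loudly instead of silently falling back to the CPU backend. A hypothetical sketch, reusing the assumed `tfhe::array` paths from the earlier examples:

    use tfhe::array::GpuFheBoolArray;
    use tfhe::prelude::FheTryEncrypt;
    use tfhe::{generate_keys, set_server_key, ConfigBuilder};

    // Install a CPU-only server key
    let (client_key, cpu_server_key) = generate_keys(ConfigBuilder::default().build());
    set_server_key(cpu_server_key);

    // Encryption still works: it needs only the client key and the CUDA streams
    let a = GpuFheBoolArray::try_encrypt(&[true][..], &client_key).unwrap();
    let b = GpuFheBoolArray::try_encrypt(&[false][..], &client_key).unwrap();

    // Any GPU operation, however, would panic with
    // "Cuda key requested but only cpu key is available":
    // let _ = &a & &b;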